Skip to content

Commit

Permalink
automatically get tokenizer from HF if possible
Browse files Browse the repository at this point in the history
  • Loading branch information
aarora79 committed Oct 1, 2024
1 parent 268c2d7 commit d912923
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 18 deletions.
2 changes: 1 addition & 1 deletion debug.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# 3. Runs fmbench as usual

CONDA_ENV_PATH=$CONDA_PREFIX/lib/python3.11/site-packages
CONFIG_FILE_PATH=src/fmbench/configs/llama3/8b/config-llama3-8b-trn1-32xlarge-triton-djl.yml
CONFIG_FILE_PATH=src/fmbench/configs/llama3/8b/config-ec2-llama3-8b.yml
#src/fmbench/configs/llama3.1/8b/config-llama3.1-8b-trn1-32xl-deploy-tp-8-ec2.yml
#src/fmbench/configs/llama3/8b/config-llama3-8b-trn1-32xlarge-triton-vllm.yml
#src/fmbench/configs/llama3/8b/config-llama3-8b-trn1-32xlarge-triton-vllm.yml
Expand Down
5 changes: 4 additions & 1 deletion src/fmbench/5_model_metric_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1216,13 +1216,16 @@
"# but the facets row col counter starts at bottom left so the col counter is\n",
"# correct but the row counter needs to be inverted\n",
"best_price_perf = df_summary_metrics_dataset_overall.to_dict(orient='records')[0]\n",
"logger.info(f\"best_price_perf={best_price_perf}\")\n",
"best_price_perf_x = best_price_perf['instance_type']\n",
"best_price_perf_y = best_price_perf['price_per_txn']*10000\n",
"best_price_perf_avg_latency = best_price_perf['latency_p95']\n",
"best_price_perf_cost_per_10k = round(best_price_perf['price_per_txn']*10000, 2)\n",
"best_price_perf_instance_type = best_price_perf['instance_type']\n",
"best_price_perf_instance_count = best_price_perf['instance_count']\n",
"best_price_perf_tpm = best_price_perf['transactions_per_minute']\n",
"best_completion_token_count_mean = int(best_price_perf['completion_token_count_mean'])\n",
"best_prompt_token_count_mean = int(best_price_perf['prompt_token_count_mean'])\n",
"concurrencies = np.sort(df_summary_metrics_dataset.concurrency.unique())\n",
"\n",
"concurrencies = list(df_summary_metrics_dataset.concurrency.unique())\n",
Expand Down Expand Up @@ -1267,7 +1270,7 @@
" x=0, \n",
" yref='paper',\n",
" y=-0.15,\n",
" text=f\"*<b>best price|performance</b>: {instance_count_str}{best_price_perf_instance_type}, p95 latency {best_price_perf_avg_latency}s, 10k txn cost ${best_price_perf_cost_per_10k}, transactions/minute {best_price_perf_tpm}.\",\n",
" text=f\"*<b>best price|performance</b>: {instance_count_str}{best_price_perf_instance_type}, p95 latency {best_price_perf_avg_latency}s, 10k txn cost ${best_price_perf_cost_per_10k}, transactions/minute {best_price_perf_tpm}, completion tokens {best_completion_token_count_mean}, prompt tokens {best_prompt_token_count_mean}.\",\n",
")\n",
"fig.show()"
]
Expand Down
9 changes: 1 addition & 8 deletions src/fmbench/configs/llama3/8b/config-ec2-llama3-8b.yml
Original file line number Diff line number Diff line change
Expand Up @@ -112,18 +112,11 @@ datasets:
min_length_in_tokens: 2000
max_length_in_tokens: 3000
payload_file: payload_en_2000-3000.jsonl
- language: en
min_length_in_tokens: 3000
max_length_in_tokens: 4000
payload_file: payload_en_3000-4000.jsonl
- language: en
min_length_in_tokens: 3000
max_length_in_tokens: 3840
payload_file: payload_en_3000-3840.jsonl
- language: en
min_length_in_tokens: 1
max_length_in_tokens: 924
payload_file: payload_en_1-924.jsonl


# While the tests would run on all the datasets
# configured in the experiment entries below but
Expand Down
6 changes: 6 additions & 0 deletions src/fmbench/globals.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,12 @@
# this is for custom tokenizers
TOKENIZER_DIR_S3 = config['s3_read_data']['tokenizer_prefix']
TOKENIZER = 'tokenizer'
# we take the first experiment's model id as the model whose tokenizer is used for
# counting tokens on the dataset. This is done purely for ease of coding; it only
# affects the token-counting logic on the client side (it does not impact the tokenizer the model uses).
# NOTE: if tokenizer files are provided in the tokenizer directory then they take precedence;
# if those files are not present then we load the tokenizer for this model id from Hugging Face
TOKENIZER_MODEL_ID = config['experiments'][0]['model_id']

DEPLOYMENT_SCRIPT_S3 = config['s3_read_data']['scripts_prefix']

Expand Down
2 changes: 1 addition & 1 deletion src/fmbench/prompt_template/prompt_template_llama3.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

You are an assistant for question-answering tasks. Use the following pieces of retrieved context in the section demarcated by "```" to answer the question.
The context may contain multiple question answer pairs as an example. Only answer the final question provided in the question section below.
If you dont know the answer just say that you dont know.
If you don't know the answer, just say that you don't know. Be concise with your answer.

```
{context}
Expand Down
28 changes: 21 additions & 7 deletions src/fmbench/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,10 +412,17 @@ class CustomTokenizer:
"""A custom tokenizer class"""
TOKENS: int = 1000
WORDS: int = 750
HF_TOKEN_FNAME: str = os.path.join(os.path.dirname(os.path.realpath(__file__)), "scripts", "hf_token.txt")
if Path(HF_TOKEN_FNAME).is_file() is True:
print(f"{HF_TOKEN_FNAME} file found, going to set HF_TOKEN env var")
HF_TOKEN: str = Path(HF_TOKEN_FNAME).read_text().strip()
os.environ["HF_TOKEN"] = HF_TOKEN
else:
print(f"{HF_TOKEN_FNAME} file not found")

def __init__(self, bucket, prefix, local_dir):
def __init__(self, bucket, prefix, local_dir, model_id):
print(f"CustomTokenizer, based on HF transformers, {bucket} "
f"prefix: {prefix} local_dir: {local_dir}")
f"prefix: {prefix} local_dir: {local_dir}, model_id: {model_id}")
# Check if the tokenizer files exist in s3 and if not, use the autotokenizer
download_multiple_files_from_s3(bucket, prefix, local_dir)
# Load the tokenizer from the local directory
Expand All @@ -426,16 +433,23 @@ def __init__(self, bucket, prefix, local_dir):
if dir_not_empty > 0:
print(f"loading the provided tokenizer from local_dir={local_dir}, abs_path={abs_path}")
self.tokenizer = AutoTokenizer.from_pretrained(local_dir)
print("successfully loaded the tokenizer using AutoTokenizer.from_pretrained")
print(f"successfully loaded the tokenizer using AutoTokenizer.from_pretrained from {local_dir}")
else:
print(f"no tokenizer provided, the {local_dir}, abs_path={abs_path} is empty, "
f"using default tokenizer i.e. {self.WORDS} words = {self.TOKENS} tokens")
self.tokenizer = None
print(f"{local_dir} directory is empty")
try:
print(f"going to download tokenizer from HF for \"{model_id}\"")
self.tokenizer = AutoTokenizer.from_pretrained(model_id)
print(f"successfully loaded the tokenizer using AutoTokenizer.from_pretrained from HF for \"{model_id}\"")
except Exception as e:
print("exception while loading tokenizer from HuggingFace")
print(f"no tokenizer provided, the {local_dir}, abs_path={abs_path} is empty, "
f"using default tokenizer i.e. {self.WORDS} words = {self.TOKENS} tokens")
self.tokenizer = None

def count_tokens(self, text):
    """Return the number of tokens in ``text``.

    When an HF tokenizer was successfully loaded, the exact encoded length is
    returned. Otherwise falls back to a heuristic that scales the whitespace
    word count by the TOKENS-per-WORDS ratio (1000 tokens ~ 750 words).
    """
    if self.tokenizer is None:
        # no tokenizer available: approximate tokens from the word count
        word_count = len(text.split())
        return int(math.ceil((self.TOKENS / self.WORDS) * word_count))
    return len(self.tokenizer.encode(text))

_tokenizer = CustomTokenizer(globals.READ_BUCKET_NAME, globals.TOKENIZER_DIR_S3, globals.TOKENIZER)
_tokenizer = CustomTokenizer(globals.READ_BUCKET_NAME, globals.TOKENIZER_DIR_S3, globals.TOKENIZER, globals.TOKENIZER_MODEL_ID)

0 comments on commit d912923

Please sign in to comment.