HF tokenizers: initial base tokenizer support #2350

Merged · 11 commits · Feb 13, 2025
1 change: 1 addition & 0 deletions docs/source/api_ref_modules.rst
@@ -50,6 +50,7 @@ model specific tokenizers.

transforms.tokenizers.SentencePieceBaseTokenizer
transforms.tokenizers.TikTokenBaseTokenizer
transforms.tokenizers.HuggingFaceBaseTokenizer
transforms.tokenizers.ModelTokenizer
transforms.tokenizers.BaseTokenizer

24 changes: 24 additions & 0 deletions docs/source/basics/tokenizers.rst
@@ -222,6 +222,30 @@ to do the actual encoding and decoding.
print(sp_tokenizer.encode(text))
# [1, 6312, 28709, 1526, 2]

.. _hf_tokenizers:

Using Hugging Face tokenizers
-----------------------------

Sometimes tokenizers hosted on Hugging Face do not contain files compatible with one of torchtune's
existing tokenizer classes. In this case, we provide :class:`~torchtune.modules.transforms.tokenizers.HuggingFaceBaseTokenizer`
to parse the Hugging Face ``tokenizer.json`` file and define the correct ``encode`` and ``decode`` methods to
match torchtune's other :class:`~torchtune.modules.transforms.tokenizers.BaseTokenizer` classes. You should also pass the path to
either ``tokenizer_config.json`` or ``generation_config.json``, which will allow torchtune to infer BOS and EOS tokens.
Continuing with the Mistral example:

.. code-block:: python

hf_tokenizer = HuggingFaceBaseTokenizer(
tokenizer_json_path="/tmp/Mistral-7B-v0.1/tokenizer.json",
tokenizer_config_json_path="/tmp/Mistral-7B-v0.1/tokenizer_config.json",
)

text = "hello world"

print(hf_tokenizer.encode(text))
# [1, 6312, 28709, 1526, 2]
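To illustrate the BOS/EOS inference the docs mention, here is a hedged sketch of how special tokens can be read out of a Hugging Face ``tokenizer_config.json``. The helper ``infer_special_tokens`` is hypothetical (not torchtune's actual implementation); config entries may be plain strings or dicts with a ``content`` field, and both forms are handled below.

```python
# Hypothetical helper showing what a tokenizer_config.json's special-token
# entries look like; torchtune's actual parsing logic may differ.
def infer_special_tokens(config):
    def _token_str(entry):
        # Entries may be plain strings or dicts like {"content": "<s>", ...}
        if isinstance(entry, dict):
            return entry.get("content")
        return entry

    return _token_str(config.get("bos_token")), _token_str(config.get("eos_token"))


# Example config resembling a Mistral-style tokenizer_config.json
config = {
    "bos_token": "<s>",
    "eos_token": {"content": "</s>", "single_word": False},
}
print(infer_special_tokens(config))  # ('<s>', '</s>')
```

Once BOS and EOS are known, the base tokenizer can prepend and append them during ``encode``, which is why the token IDs above match the SentencePiece example earlier in the doc.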

.. _model_tokenizers:

Model tokenizers
1 change: 1 addition & 0 deletions pyproject.toml
@@ -23,6 +23,7 @@ dependencies = [
"sentencepiece",
"tiktoken",
"blobfile>=2",
"tokenizers",
Contributor:

I had thought we would make this a dev dependency and conditionally import this in hf_tokenizer

Contributor (Author):

Yeah, I was on the fence about this. But actually I think the upstream dependencies of tokenizers are already a subset of our upstream dependencies, so it's not like we are pulling in a bunch of new transitive deps; this is really just tokenizers and nothing else. Because of that I feel OK about having it as a core dep, but happy to take the approach you're suggesting if you (or others) strongly disagree.

Contributor:

Just documenting: the tokenizers package is roughly 3 MB (https://pypi.org/project/tokenizers/#files).

Contributor (Author):

Isn't torch close to 1 GB? Anyway, if everyone would prefer to make this an optional dep, I am fine with it.

Contributor:

Yeah, 3 MB is nothing. I'm fine with it as a core dep.


# Miscellaneous
"numpy",
1 change: 1 addition & 0 deletions tests/assets/generation_config.json
@@ -0,0 +1 @@
{"bos_token_id": 0, "eos_token_id": -1}
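This fixture supplies BOS/EOS token IDs via ``generation_config.json``, the fallback path described in the docs. A hedged sketch of reading such a file follows; ``infer_ids`` is hypothetical, and the treatment of the fixture's ``-1`` as "no EOS" is an assumption about what the test is exercising, not torchtune's confirmed behavior.

```python
import json

# Hypothetical reader for a generation_config.json like the fixture above;
# torchtune's real inference logic may differ.
def infer_ids(generation_config):
    bos = generation_config.get("bos_token_id")
    eos = generation_config.get("eos_token_id")
    # Assumption: a negative id (like the fixture's -1) marks the token
    # as unset, so we map it to None here.
    if isinstance(eos, int) and eos < 0:
        eos = None
    return bos, eos


fixture = json.loads('{"bos_token_id": 0, "eos_token_id": -1}')
print(infer_ids(fixture))  # (0, None)
```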