diff --git a/.gitignore b/.gitignore
index 387eacb6..26569346 100644
--- a/.gitignore
+++ b/.gitignore
@@ -101,6 +101,13 @@ ch07/02_dataset-utilities/instruction-examples-modified.json
 ch07/04_preference-tuning-with-dpo/gpt2-medium355M-sft.pth
 ch07/04_preference-tuning-with-dpo/loss-plot.pdf
 
+# Tokenizer files
+ch02/05_bpe-from-scratch/bpe_merges.txt
+ch02/05_bpe-from-scratch/encoder.json
+ch02/05_bpe-from-scratch/vocab.bpe
+ch02/05_bpe-from-scratch/vocab.json
+
+
 # Other
 ch0?/0?_user_interface/.chainlit/
 ch0?/0?_user_interface/chainlit.md
diff --git a/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb b/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb
index 2d245472..d14472f6 100644
--- a/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb
+++ b/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb
@@ -722,14 +722,14 @@
     "import os\n",
     "import urllib.request\n",
     "\n",
-    "if not os.path.exists(\"the-verdict.txt\"):\n",
+    "if not os.path.exists(\"../01_main-chapter-code/the-verdict.txt\"):\n",
     "    url = (\"https://raw.githubusercontent.com/rasbt/\"\n",
     "           \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
     "           \"the-verdict.txt\")\n",
-    "    file_path = \"the-verdict.txt\"\n",
+    "    file_path = \"../01_main-chapter-code/the-verdict.txt\"\n",
     "    urllib.request.urlretrieve(url, file_path)\n",
     "\n",
-    "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
+    "with open(\"../01_main-chapter-code/the-verdict.txt\", \"r\", encoding=\"utf-8\") as f: # added ../01_main-chapter-code/\n",
     "    text = f.read()"
    ]
   },
@@ -876,7 +876,7 @@
    "id": "252693ee-e806-4dac-ab76-2c69086360f4",
    "metadata": {},
    "source": [
-    "- Note that the vocabulary itself is used in the `decoder()` method, which allows us to map the token IDs back into text:"
+    "- Note that the vocabulary itself is used in the `decode()` method, which allows us to map the token IDs back into text:"
    ]
   },
   {
@@ -1099,24 +1099,34 @@
     "import os\n",
     "import urllib.request\n",
     "\n",
-    "def download_file_if_absent(url, filename):\n",
-    "    if not os.path.exists(filename):\n",
-    "        try:\n",
-    "            with urllib.request.urlopen(url) as response, open(filename, 'wb') as out_file:\n",
-    "                out_file.write(response.read())\n",
-    "            print(f\"Downloaded {filename}\")\n",
-    "        except Exception as e:\n",
-    "            print(f\"Failed to download {filename}. Error: {e}\")\n",
-    "    else:\n",
-    "        print(f\"{filename} already exists\")\n",
+    "def download_file_if_absent(url, filename, search_dirs):\n",
+    "    for directory in search_dirs:\n",
+    "        file_path = os.path.join(directory, filename)\n",
+    "        if os.path.exists(file_path):\n",
+    "            print(f\"{filename} already exists in {file_path}\")\n",
+    "            return file_path\n",
+    "\n",
+    "    target_path = os.path.join(search_dirs[0], filename)\n",
+    "    try:\n",
+    "        with urllib.request.urlopen(url) as response, open(target_path, \"wb\") as out_file:\n",
+    "            out_file.write(response.read())\n",
+    "        print(f\"Downloaded {filename} to {target_path}\")\n",
+    "    except Exception as e:\n",
+    "        print(f\"Failed to download {filename}. Error: {e}\")\n",
+    "    return target_path\n",
+    "\n",
+    "# Define the directories to search and the files to download\n",
+    "search_directories = [\".\", \"../02_bonus_bytepair-encoder/gpt2_model/\"]\n",
     "\n",
     "files_to_download = {\n",
     "    \"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe\": \"vocab.bpe\",\n",
     "    \"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json\": \"encoder.json\"\n",
     "}\n",
     "\n",
+    "# Ensure directories exist and download files if needed\n",
+    "paths = {}\n",
     "for url, filename in files_to_download.items():\n",
-    "    download_file_if_absent(url, filename)"
+    "    paths[filename] = download_file_if_absent(url, filename, search_directories)"
    ]
   },
   {
@@ -1136,7 +1146,7 @@
    "source": [
     "tokenizer_gpt2 = BPETokenizerSimple()\n",
     "tokenizer_gpt2.load_vocab_and_merges_from_openai(\n",
-    "    vocab_path=\"encoder.json\", bpe_merges_path=\"vocab.bpe\"\n",
+    "    vocab_path=paths[\"encoder.json\"], bpe_merges_path=paths[\"vocab.bpe\"]\n",
     ")"
    ]
   },
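
Note: since escaped notebook-JSON source is hard to read in diff form, here is the updated download cell rendered as plain Python (unescaped from the cell source above; same logic as the diff). The helper now checks each directory in `search_dirs` for an existing copy before downloading, downloads into the first directory only when nothing is found, and returns the resolved path so that `paths` can later be passed to `load_vocab_and_merges_from_openai`:

    import os
    import urllib.request

    def download_file_if_absent(url, filename, search_dirs):
        # Reuse an existing copy if the file is found in any search directory
        for directory in search_dirs:
            file_path = os.path.join(directory, filename)
            if os.path.exists(file_path):
                print(f"{filename} already exists in {file_path}")
                return file_path

        # Otherwise, download into the first search directory
        target_path = os.path.join(search_dirs[0], filename)
        try:
            with urllib.request.urlopen(url) as response, open(target_path, "wb") as out_file:
                out_file.write(response.read())
            print(f"Downloaded {filename} to {target_path}")
        except Exception as e:
            print(f"Failed to download {filename}. Error: {e}")
        return target_path

    # Check the local directory first, then the bonus chapter's GPT-2 model directory
    search_directories = [".", "../02_bonus_bytepair-encoder/gpt2_model/"]

    files_to_download = {
        "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe": "vocab.bpe",
        "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json": "encoder.json"
    }

    # Map each filename to the path where it was found or downloaded
    paths = {}
    for url, filename in files_to_download.items():
        paths[filename] = download_file_if_absent(url, filename, search_directories)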