From 60bbd6f4b3b2255f0410905f58b89f7c70037c83 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Mon, 3 Feb 2025 17:40:51 -0800 Subject: [PATCH 1/9] wip --- pyproject.toml | 1 + .../tokenizers/test_hf_tokenizer.py | 20 ++++++++ .../modules/transforms/tokenizers/__init__.py | 2 + .../transforms/tokenizers/_hf_tokenizer.py | 49 +++++++++++++++++++ 4 files changed, 72 insertions(+) create mode 100644 tests/torchtune/modules/transforms/tokenizers/test_hf_tokenizer.py create mode 100644 torchtune/modules/transforms/tokenizers/_hf_tokenizer.py diff --git a/pyproject.toml b/pyproject.toml index f94732b58a..0e8dd7868a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ dependencies = [ "sentencepiece", "tiktoken", "blobfile>=2", + "tokenizers", # Miscellaneous "numpy", diff --git a/tests/torchtune/modules/transforms/tokenizers/test_hf_tokenizer.py b/tests/torchtune/modules/transforms/tokenizers/test_hf_tokenizer.py new file mode 100644 index 0000000000..ecdc11a9c0 --- /dev/null +++ b/tests/torchtune/modules/transforms/tokenizers/test_hf_tokenizer.py @@ -0,0 +1,20 @@ +import pytest +from torchtune.modules.transforms.tokenizers import HFTokenizer + +# TODO: change this (just for testing) +TOKENIZER_DIR = "/data/users/ebs/phi4/" + + +class TestHFTokenizer: + @pytest.fixture + def tokenizer(self): + return HFTokenizer( + path=TOKENIZER_DIR + "tokenizer.json", + config_path=TOKENIZER_DIR + "tokenizer_config.json", + ) + + def test_tokenizer(self, tokenizer): + import pdb + + pdb.set_trace() + raise ValueError("done") diff --git a/torchtune/modules/transforms/tokenizers/__init__.py b/torchtune/modules/transforms/tokenizers/__init__.py index 2fecc279ee..dab7565ba5 100644 --- a/torchtune/modules/transforms/tokenizers/__init__.py +++ b/torchtune/modules/transforms/tokenizers/__init__.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from ._hf_tokenizer import HFTokenizer from ._sentencepiece import SentencePieceBaseTokenizer from ._tiktoken import TikTokenBaseTokenizer from ._utils import ( @@ -20,4 +21,5 @@ "BaseTokenizer", "tokenize_messages_no_special_tokens", "parse_hf_tokenizer_json", + "HFTokenizer", ] diff --git a/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py b/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py new file mode 100644 index 0000000000..5ed9c533cf --- /dev/null +++ b/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py @@ -0,0 +1,49 @@ +import json +from typing import List + +from tokenizers import Tokenizer +from torchtune.modules.transforms.tokenizers._utils import BaseTokenizer + + +class HFTokenizer(BaseTokenizer): + """ + A wrapper around HuggingFace tokenizers. BLAH BLAH BLAH + + Args: + path (str): Path to tokenizer.json file + config_path (str): Path to tokenizer_config.json file + """ + + def __init__(self, path: str, config_path: str): + self.hf_tokenizer = Tokenizer.from_file(path) + with open(config_path, "rb") as f: + config = json.load(f) + + def _infer_tokenizer_class_from_config(self): + pass + + def encode( + self, text: str, add_bos: bool = False, add_eos: bool = False + ) -> List[int]: + """ + Encodes a string into a list of token ids. + + Args: + text (str): The text to encode. + add_bos (bool): Whether to add a beginning-of-sequence token to the beginning of the + """ + token_ids = self.hf_tokenizer.encode(text).ids + if add_bos: + token_ids.insert(0, self.hf_tokenizer.bos_id) + if add_eos: + token_ids.append(self.hf_tokenizer.eos_id) + return token_ids + + def decode(self, token_ids: List[int]) -> str: + """ + Decodes a list of token ids into a string. + + Args: + token_ids (List[int]): The list of token ids to decode. + """ + return self.hf_tokenizer.decode(token_ids) From 0b1133327305050782f376c8e96aee513e5e769f Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Wed, 5 Feb 2025 15:18:15 -0800 Subject: [PATCH 2/9] HF tokenizers: initial base tokenizer support --- tests/assets/generation_config.json | 1 + tests/assets/tokenizer.json | 12487 ++++++++++++++++ tests/assets/tokenizer_config.json | 1 + .../tokenizers/test_hf_tokenizer.py | 128 +- .../transforms/tokenizers/_hf_tokenizer.py | 98 +- 5 files changed, 12690 insertions(+), 25 deletions(-) create mode 100644 tests/assets/generation_config.json create mode 100644 tests/assets/tokenizer.json create mode 100644 tests/assets/tokenizer_config.json diff --git a/tests/assets/generation_config.json b/tests/assets/generation_config.json new file mode 100644 index 0000000000..2967d7f6a1 --- /dev/null +++ b/tests/assets/generation_config.json @@ -0,0 +1 @@ +{"bos_token_id": 0, "eos_token_id": -1} diff --git a/tests/assets/tokenizer.json b/tests/assets/tokenizer.json new file mode 100644 index 0000000000..df6d8fe46f --- /dev/null +++ b/tests/assets/tokenizer.json @@ -0,0 +1,12487 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [], + "normalizer": null, + "pre_tokenizer": { + "type": "Sequence", + "pretokenizers": [ + { + "type": "Split", + "pattern": { + "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + }, + "behavior": "Isolated", + "invert": false + }, + { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": false + } + ] + }, + "post_processor": null, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": true, + "vocab": { + "Ā": 0, + "ā": 1, + "Ă": 2, + "ă": 3, + "Ą": 4, + "ą": 5, + "Ć": 6, + "ć": 7, + "Ĉ": 8, + "ĉ": 9, + "Ċ": 10, + "ċ": 11, + "Č": 12, + "č": 13, + "Ď": 14, + "ď": 15, + "Đ": 16, + "đ": 17, + "Ē": 18, + "ē": 19, + "Ĕ": 20, + "ĕ": 21, + "Ė": 22, + "ė": 23, + "Ę": 24, + "ę": 25, + "Ě": 26, + "ě": 27, + "Ĝ": 28, + "ĝ": 29, + "Ğ": 30, + "ğ": 31, + "Ġ": 32, + "!": 33, + "\"": 34, + "#": 35, + "$": 36, + "%": 37, + "&": 38, + "'": 39, + "(": 40, + ")": 41, + "*": 42, + "+": 43, + ",": 44, + "-": 45, + ".": 46, + "/": 47, + "0": 48, + "1": 49, + "2": 50, + "3": 51, + "4": 52, + "5": 53, + "6": 54, + "7": 55, + "8": 56, + "9": 57, + ":": 58, + ";": 59, + "<": 60, + "=": 61, + ">": 62, + "?": 63, + "@": 64, + "A": 65, + "B": 66, + "C": 67, + "D": 68, + "E": 69, + "F": 70, + "G": 71, + "H": 72, + "I": 73, + "J": 74, + "K": 75, + "L": 76, + "M": 77, + "N": 78, + "O": 79, + "P": 80, + "Q": 81, + "R": 82, + "S": 83, + "T": 84, + "U": 85, + "V": 86, + "W": 87, + "X": 88, + "Y": 89, + "Z": 90, + "[": 91, + "\\": 92, + "]": 93, + "^": 94, + "_": 95, + "`": 96, + "a": 97, + "b": 98, + "c": 99, + "d": 100, + "e": 101, + "f": 102, + "g": 103, + "h": 104, + "i": 105, + "j": 106, + "k": 107, + "l": 108, + "m": 109, + "n": 110, + "o": 111, + "p": 112, + "q": 113, + "r": 114, + "s": 115, + "t": 116, + "u": 117, + "v": 118, + "w": 119, + "x": 120, + "y": 121, + "z": 122, + "{": 123, + "|": 124, + "}": 125, + "~": 126, + "ġ": 127, + "Ģ": 128, + "ģ": 129, + "Ĥ": 130, + "ĥ": 131, + "Ħ": 132, + "ħ": 133, + "Ĩ": 134, + "ĩ": 135, + "Ī": 136, + "ī": 137, + "Ĭ": 138, + "ĭ": 139, + "Į": 140, + "į": 141, + "İ": 142, + "ı": 143, + "IJ": 144, + "ij": 145, + "Ĵ": 146, + "ĵ": 147, + "Ķ": 148, + "ķ": 149, + "ĸ": 150, + "Ĺ": 151, + "ĺ": 152, + "Ļ": 153, + "ļ": 154, + "Ľ": 155, + "ľ": 156, + "Ŀ": 157, + "ŀ": 158, + "Ł": 159, + "ł": 160, + "¡": 161, + "¢": 162, + "£": 163, + "¤": 164, + "¥": 165, + "¦": 166, + "§": 167, + "¨": 168, + "©": 169, + "ª": 170, + "«": 171, + "¬": 172, + "Ń": 173, + "®": 174, + "¯": 175, + "°": 176, + "±": 177, + "²": 178, + "³": 179, + "´": 180, + "µ": 181, + "¶": 182, + "·": 183, + "¸": 184, + "¹": 185, + "º": 186, + "»": 187, + "¼": 188, + "½": 189, + "¾": 190, + "¿": 191, + "À": 192, + "Á": 193, + "Â": 194, + "Ã": 195, + "Ä": 196, + "Å": 197, + "Æ": 198, + "Ç": 199, + "È": 200, + "É": 201, + "Ê": 202, + "Ë": 203, + "Ì": 204, + "Í": 205, + "Î": 206, + "Ï": 207, + "Ð": 208, + "Ñ": 209, + "Ò": 210, + "Ó": 211, + "Ô": 212, + "Õ": 213, + "Ö": 214, + "×": 215, + "Ø": 216, + "Ù": 217, + "Ú": 218, + "Û": 219, + "Ü": 220, + "Ý": 221, + "Þ": 222, + "ß": 223, + "à": 224, + "á": 225, + "â": 226, + "ã": 227, + "ä": 228, + "å": 229, + "æ": 230, + "ç": 231, + "è": 232, + "é": 233, + "ê": 234, + "ë": 235, + "ì": 236, + "í": 237, + "î": 238, + "ï": 239, + "ð": 240, + "ñ": 241, + "ò": 242, + "ó": 243, + "ô": 244, + "õ": 245, + "ö": 246, + "÷": 247, + "ø": 248, + "ù": 249, + "ú": 250, + "û": 251, + "ü": 252, + "ý": 253, + "þ": 254, + "ÿ": 255, + "Ġt": 256, + "he": 257, + "Ġa": 258, + "in": 259, + "Ġs": 260, + "Ġw": 261, + "Ġthe": 262, + "Ġo": 263, + "re": 264, + "Ġb": 265, + "ou": 266, + "ed": 267, + "Ġm": 268, + "nd": 269, + "ĠI": 270, + "ha": 271, + "it": 272, + "er": 273, + "ing": 274, + "Ġf": 275, + "is": 276, + "Ġto": 277, + "en": 278, + "on": 279, + "or": 280, + "as": 281, + "Ġc": 282, + "Ġof": 283, + "Ġand": 284, + "Ġd": 285, + "ll": 286, + "at": 287, + "an": 288, + "ar": 289, + "Ġp": 290, + "Ġn": 291, + "Ġin": 292, + "le": 293, + "om": 294, + "ot": 295, + "Ġbe": 296, + "Ġh": 297, + "ut": 298, + "ow": 299, + "es": 300, + "hat": 301, + "Ġg": 302, + "Ġhe": 303, + "Ġha": 304, + "Ġl": 305, + "Ġwas": 306, + "ld": 307, + "gh": 308, + "id": 309, + "ch": 310, + "Ġth": 311, + "Ġit": 312, + "ay": 313, + "Ġon": 314, + "ce": 315, + "se": 316, + "ent": 317, + "Ġst": 318, + "ly": 319, + "ve": 320, + "et": 321, + "st": 322, + "ĠT": 323, + "Ġe": 324, + "Ġy": 325, + "ght": 326, + "ir": 327, + "Ġme": 328, + "oo": 329, + "al": 330, + "ith": 331, + "Ġre": 332, + "im": 333, + "Ġthat": 334, + "Ġas": 335, + "ould": 336, + "ro": 337, + "ad": 338, + "ion": 339, + ".Ċ": 340, + "her": 341, + "Ġmy": 342, + "ct": 343, + "Ġnot": 344, + "Ġwith": 345, + "Ġfor": 346, + "Ġu": 347, + "ke": 348, + "Ġyou": 349, + "ĠS": 350, + "Ġis": 351, + "ight": 352, + "\"Ċ": 353, + "am": 354, + "ic": 355, + "ur": 356, + "Ġat": 357, + "..": 358, + "ac": 359, + "ter": 360, + "Ġwh": 361, + "Ġan": 362, + "Ġwe": 363, + "ĠThe": 364, + "if": 365, + "Ġor": 366, + "Ġbut": 367, + "ver": 368, + "Ġ\"": 369, + "Ġr": 370, + "out": 371, + "ome": 372, + "Ġhad": 373, + "pp": 374, + "qu": 375, + "Ġsu": 376, + "Ġthis": 377, + "red": 378, + "ard": 379, + "Ġso": 380, + "ell": 381, + "Ġwould": 382, + "Ġhis": 383, + "Ġsh": 384, + "ine": 385, + "ra": 386, + "Ġse": 387, + "Ġby": 388, + ".\"Ċ": 389, + "ĠP": 390, + "hen": 391, + "ĠA": 392, + "Ġhave": 393, + "Ġfr": 394, + "Ġsa": 395, + "ĠH": 396, + "Ġone": 397, + "em": 398, + "ked": 399, + "irt": 400, + "ect": 401, + "Ġhim": 402, + "Ġli": 403, + "Ġab": 404, + "ation": 405, + "hing": 406, + "the": 407, + "ĠR": 408, + "Ġle": 409, + "ss": 410, + "ĠW": 411, + "cu": 412, + "ill": 413, + "'t": 414, + "art": 415, + "all": 416, + ",Ċ": 417, + "own": 418, + "ore": 419, + "Ġall": 420, + "Ġk": 421, + "Ġgo": 422, + "hirt": 423, + "and": 424, + "Ġout": 425, + "ame": 426, + "ain": 427, + "Ġif": 428, + "Ġno": 429, + "Ġdo": 430, + "Ġthey": 431, + "ool": 432, + "un": 433, + "to": 434, + "Ġup": 435, + "ĠRed": 436, + "Ġne": 437, + "ĠK": 438, + "Ġfrom": 439, + "ĠShirt": 440, + "Ġwor": 441, + "ong": 442, + "Ġthere": 443, + "Ġsaid": 444, + "ri": 445, + "ant": 446, + "ĠB": 447, + "Ġany": 448, + "ud": 449, + "ind": 450, + "Ġwhi": 451, + "ab": 452, + "ound": 453, + "Ġabout": 454, + "Ġthem": 455, + "cup": 456, + "ak": 457, + "Ġde": 458, + "Ġte": 459, + "ĠM": 460, + "ake": 461, + "cupine": 462, + "ig": 463, + "Ġwere": 464, + "orcupine": 465, + "il": 466, + "chool": 467, + "Ġro": 468, + "ood": 469, + "Ġare": 470, + "ive": 471, + "Ġlike": 472, + "yo": 473, + "Ġhou": 474, + "'s": 475, + "one": 476, + "us": 477, + "el": 478, + "ul": 479, + "ack": 480, + "op": 481, + ",\"": 482, + "th": 483, + "acher": 484, + "um": 485, + "ang": 486, + "Ġfa": 487, + "ag": 488, + "Ġschool": 489, + "Ġj": 490, + "te": 491, + "ok": 492, + "ess": 493, + "ust": 494, + "ers": 495, + "....": 496, + "ĠC": 497, + "ther": 498, + "han": 499, + "Ġwhen": 500, + "Ġsp": 501, + "Ġman": 502, + "Ġcan": 503, + "ough": 504, + "Ġwho": 505, + "Ġget": 506, + "Ġdid": 507, + "Ġpo": 508, + "ci": 509, + "Ġal": 510, + "ist": 511, + "Ġcom": 512, + "lf": 513, + "au": 514, + "ĠPorcupine": 515, + "Ġwhich": 516, + "ven": 517, + "Ġaf": 518, + "wn": 519, + "ass": 520, + "ber": 521, + "Ġex": 522, + "ous": 523, + "est": 524, + "lo": 525, + "Ġtr": 526, + "ellow": 527, + "Ġsay": 528, + "ought": 529, + "Ġroom": 530, + "Ġsome": 531, + "--": 532, + "ĠO": 533, + "ate": 534, + "Ġv": 535, + "hed": 536, + "ap": 537, + "Ġtw": 538, + "Ġbec": 539, + "ree": 540, + "ject": 541, + "ks": 542, + "Ġcon": 543, + "Ġbeen": 544, + "ents": 545, + "ide": 546, + "Ġcould": 547, + "ĠG": 548, + "ep": 549, + "Ġpro": 550, + "nt": 551, + "Ġhouse": 552, + "Ġag": 553, + "ĠIf": 554, + "Ġkn": 555, + "Ġfellow": 556, + "Ġwhat": 557, + "way": 558, + "ish": 559, + "Ġam": 560, + "ite": 561, + "nder": 562, + "ime": 563, + "Ġpr": 564, + "Ġteacher": 565, + "are": 566, + "Ġbo": 567, + "Ġshe": 568, + "ĠN": 569, + "ice": 570, + "ast": 571, + "ure": 572, + "ie": 573, + "Ġsuch": 574, + "uten": 575, + "utenber": 576, + "utenberg": 577, + "Ġqu": 578, + "lown": 579, + "Ġwr": 580, + "pt": 581, + "ĠHe": 582, + "Ġstud": 583, + "here": 584, + "Ġmore": 585, + "ry": 586, + "tter": 587, + "ĠY": 588, + "Ġmay": 589, + "ity": 590, + "Ġloo": 591, + "Ġother": 592, + "his": 593, + "ĠPro": 594, + "Ġwill": 595, + "ĠIt": 596, + "ort": 597, + "Ġshould": 598, + "very": 599, + "we": 600, + "Ġpl": 601, + "ash": 602, + ".\"": 603, + "Ġapp": 604, + "Ġday": 605, + "urn": 606, + "po": 607, + "Ġher": 608, + "ĠĠ": 609, + "not": 610, + "ck": 611, + "Ġun": 612, + "hi": 613, + "ving": 614, + "Ġold": 615, + "Ġtime": 616, + "\"T": 617, + "Ġway": 618, + "able": 619, + "?\"Ċ": 620, + "ĠClown": 621, + "Ġonly": 622, + "ub": 623, + "ach": 624, + "Ġoff": 625, + "Ġthan": 626, + "ally": 627, + "Ġtheir": 628, + "be": 629, + "king": 630, + "other": 631, + "ary": 632, + "ans": 633, + "ated": 634, + "self": 635, + "Ġgoing": 636, + "uch": 637, + "oll": 638, + "Ġback": 639, + "iyo": 640, + "-t": 641, + "ance": 642, + "ade": 643, + "ĠProject": 644, + "sp": 645, + "Ġtwo": 646, + "Ġthought": 647, + "so": 648, + "Ġright": 649, + "Ġhead": 650, + "ved": 651, + "ĠD": 652, + "Ġpre": 653, + "Ġsee": 654, + "Ġus": 655, + "Ġstudents": 656, + "cip": 657, + "Ġdon": 658, + "Ġnight": 659, + "incip": 660, + "ĠKiyo": 661, + "pl": 662, + "ared": 663, + "ĠGutenberg": 664, + "Ġco": 665, + "Ġhow": 666, + "omet": 667, + "ff": 668, + "\"I": 669, + ",--": 670, + "Ġasked": 671, + "incipal": 672, + "ever": 673, + "Ġac": 674, + "ĠF": 675, + "Ġmake": 676, + "itt": 677, + "Ġmight": 678, + "ge": 679, + "led": 680, + "Ġafter": 681, + "ign": 682, + "Ġgr": 683, + "Ġmade": 684, + "dd": 685, + "Ġknow": 686, + "Ġcome": 687, + "Ġbr": 688, + "thing": 689, + "ĠBut": 690, + "Ġmat": 691, + "ĠOn": 692, + "ory": 693, + "cl": 694, + "ĠE": 695, + "ble": 696, + "og": 697, + "Ġyour": 698, + "ull": 699, + "Ġwork": 700, + "ear": 701, + "Ġthree": 702, + "ied": 703, + "but": 704, + "The": 705, + "pe": 706, + "ace": 707, + "Ġstart": 708, + "ick": 709, + "Ġover": 710, + "our": 711, + "Ġmuch": 712, + "Ġwant": 713, + "imp": 714, + "Ġpart": 715, + "ho": 716, + "ink": 717, + "ence": 718, + "Ġdown": 719, + "Ġeven": 720, + "Ġprincipal": 721, + "ling": 722, + "ount": 723, + "ause": 724, + "Ġcl": 725, + "Ġbl": 726, + "-tm": 727, + "omething": 728, + "Ġinto": 729, + "orm": 730, + "okyo": 731, + "Ġdis": 732, + "Ġfe": 733, + "Ġface": 734, + "......": 735, + "ress": 736, + "ment": 737, + "ire": 738, + "Ġar": 739, + "ty": 740, + "Ġmo": 741, + "reat": 742, + "Ġfir": 743, + "per": 744, + "Ġour": 745, + "co": 746, + "Ġthen": 747, + "Ġtold": 748, + "ings": 749, + "Ġtake": 750, + "Ġbeg": 751, + "ner": 752, + "ition": 753, + "ose": 754, + "Ġown": 755, + "Ġagain": 756, + "Ġseem": 757, + "ise": 758, + "Ġwat": 759, + "\"W": 760, + "Ġfar": 761, + "aking": 762, + "fore": 763, + "ady": 764, + "-s": 765, + "less": 766, + "Ġret": 767, + "Ġsha": 768, + "Ġcame": 769, + "ger": 770, + "Ġgood": 771, + "ather": 772, + "ark": 773, + "row": 774, + "Ġke": 775, + "'m": 776, + "Ġhas": 777, + "ath": 778, + "pped": 779, + "Ġwent": 780, + "Ġtell": 781, + "quash": 782, + "Ġen": 783, + "Ġfirst": 784, + "Ġhot": 785, + "iz": 786, + "Ġaway": 787, + "Ġsomething": 788, + "Ġrem": 789, + "Ġtown": 790, + "Ġsm": 791, + "ĠThis": 792, + "Ġbetter": 793, + "ĠThen": 794, + "was": 795, + "of": 796, + "bard": 797, + "ĠL": 798, + "li": 799, + "fe": 800, + "ĠTokyo": 801, + "Ġlong": 802, + "ily": 803, + "Ġsure": 804, + "Ġlooked": 805, + "ubbard": 806, + "ction": 807, + "ord": 808, + "Ġmany": 809, + "ious": 810, + "Ġtoo": 811, + "Ġhere": 812, + "os": 813, + "Ġunder": 814, + "ase": 815, + "ng": 816, + "ped": 817, + "od": 818, + "me": 819, + "Ġjust": 820, + "Ġnow": 821, + "ince": 822, + "Ġheard": 823, + "Ġkind": 824, + "ĠThey": 825, + "Ġbefore": 826, + "hy": 827, + "ĠIn": 828, + "Ġent": 829, + "Ġboard": 830, + "!\"": 831, + "ward": 832, + "Ġbeing": 833, + "Ġwell": 834, + "erm": 835, + "ried": 836, + "Ġwrong": 837, + "aid": 838, + "xt": 839, + "Ġreturn": 840, + "ited": 841, + "Ġyen": 842, + "Ġmatter": 843, + "Ġcall": 844, + "Ġtal": 845, + "ĠYou": 846, + "ced": 847, + "ised": 848, + "Ġcha": 849, + "ons": 850, + "Ġsame": 851, + "Ġonce": 852, + "day": 853, + "ft": 854, + "Ġsw": 855, + "Ġbecause": 856, + "Ġthink": 857, + "Ġwhere": 858, + "ĠNo": 859, + "ĠHubbard": 860, + "ĠSquash": 861, + "Ġcop": 862, + "with": 863, + "ered": 864, + "ollow": 865, + "Ġplace": 866, + "idd": 867, + "cess": 868, + "Ġshow": 869, + "isha": 870, + "Ġra": 871, + "Ġletter": 872, + "ne": 873, + "ves": 874, + "ating": 875, + "rang": 876, + "Ġaff": 877, + "Ġhand": 878, + "Ġsc": 879, + "Ġpers": 880, + "int": 881, + "pr": 882, + "side": 883, + "fter": 884, + "Ġsaying": 885, + "Ġlau": 886, + "that": 887, + "Ġwithout": 888, + "ron": 889, + "air": 890, + "lect": 891, + "ĠWhat": 892, + "elt": 893, + "Ġwhile": 894, + "oga": 895, + "aper": 896, + "Ġpe": 897, + "oy": 898, + "Ġsat": 899, + "ies": 900, + "Ġadd": 901, + "Ġdays": 902, + "Ġspe": 903, + "Ġho": 904, + "Ġans": 905, + "Ġhar": 906, + "ĠWhen": 907, + "Ġanything": 908, + "pen": 909, + "]Ċ": 910, + "tain": 911, + "Ġmust": 912, + "Ġnew": 913, + "lic": 914, + "Ġvo": 915, + "hile": 916, + "get": 917, + "ĠAs": 918, + "Ġvery": 919, + "'re": 920, + "Ġevery": 921, + "ave": 922, + "?\"": 923, + "adger": 924, + "ĠKoga": 925, + "ĠMr": 926, + "rough": 927, + "ult": 928, + "Ġfollow": 929, + "ting": 930, + "ife": 931, + "iddle": 932, + "ful": 933, + "ank": 934, + "ĠSo": 935, + "Ġseemed": 936, + "ĠAnd": 937, + "ix": 938, + "Ġset": 939, + "Ġcare": 940, + "Ġres": 941, + "Ġnever": 942, + "Ġfound": 943, + "Ġlo": 944, + "cid": 945, + "ined": 946, + "Ġclass": 947, + "Ġmyself": 948, + "aw": 949, + "Ġwom": 950, + "ations": 951, + "Ġleft": 952, + "ĠWe": 953, + "Ġteachers": 954, + "\"Y": 955, + "na": 956, + "ont": 957, + "Ġdes": 958, + "Ġthose": 959, + "ired": 960, + "Ġsen": 961, + "ying": 962, + "Ġthese": 963, + "az": 964, + "ĠThere": 965, + "cept": 966, + "Ġdang": 967, + "ĠU": 968, + "\"H": 969, + "bod": 970, + "body": 971, + "Ġhaving": 972, + "alary": 973, + "Ġwatch": 974, + "Ġgive": 975, + "age": 976, + "Ġits": 977, + "Ġappe": 978, + "ue": 979, + "Ġcount": 980, + "Ġhard": 981, + "Ġbel": 982, + "ott": 983, + "Ġdist": 984, + "\"S": 985, + "ĠMad": 986, + "-n": 987, + "ribut": 988, + "ged": 989, + "Ġatt": 990, + "fere": 991, + "ither": 992, + "Ġupon": 993, + "Ġtem": 994, + "Ġperson": 995, + "ning": 996, + "Ġche": 997, + "arly": 998, + "oney": 999, + "Ġsoon": 1000, + "ement": 1001, + "Ġ(": 1002, + "Ġtrans": 1003, + "Ġexp": 1004, + "Ġser": 1005, + "Ġreg": 1006, + "ason": 1007, + "Ġsaw": 1008, + "Ġnext": 1009, + "oot": 1010, + "Ġhalf": 1011, + "Ġtook": 1012, + "Ġbad": 1013, + "Ġhour": 1014, + "Ġsalary": 1015, + "Ġbegan": 1016, + "right": 1017, + "onna": 1018, + "-san": 1019, + "Ġworks": 1020, + "ĠJ": 1021, + "form": 1022, + "ical": 1023, + "Ġtra": 1024, + "man": 1025, + "Ġnothing": 1026, + "Ġstill": 1027, + "ears": 1028, + "Ġsupp": 1029, + "Ġturn": 1030, + "Ġfelt": 1031, + "Ġwoman": 1032, + "Ġstarted": 1033, + "ouble": 1034, + "ura": 1035, + "ishing": 1036, + ":Ċ": 1037, + "lectron": 1038, + "lectronic": 1039, + "ook": 1040, + "Ġcopy": 1041, + "Ġfull": 1042, + "cond": 1043, + "mat": 1044, + "Ġmiddle": 1045, + "Ġlook": 1046, + "Ġcomm": 1047, + "wered": 1048, + "Ġbecame": 1049, + "Ġfellows": 1050, + "would": 1051, + "Ġgot": 1052, + "Ġgl": 1053, + "Ġgu": 1054, + "Ġkeep": 1055, + "Ġge": 1056, + "ĠMadonna": 1057, + "iter": 1058, + "ished": 1059, + "Ġunderst": 1060, + "Ġstra": 1061, + "sid": 1062, + "Ġcountry": 1063, + "ople": 1064, + "Ġprov": 1065, + "Ġput": 1066, + "no": 1067, + "'ll": 1068, + "Ġsle": 1069, + "range": 1070, + "ĠShe": 1071, + "pos": 1072, + "Ġmind": 1073, + "Ġpass": 1074, + "Ġthrough": 1075, + "Ġquite": 1076, + "Ġind": 1077, + "Ġboarding": 1078, + "teacher": 1079, + "ple": 1080, + "Porcupine": 1081, + "Ġple": 1082, + "Ġgeisha": 1083, + "ĠĠĠĠ": 1084, + "ost": 1085, + "ense": 1086, + "No": 1087, + "ible": 1088, + "Ġread": 1089, + "Ġred": 1090, + "ention": 1091, + "ened": 1092, + "!\"Ċ": 1093, + "Ġref": 1094, + "Ġad": 1095, + "Ġfl": 1096, + "Ġstay": 1097, + "up": 1098, + "Ġround": 1099, + "Ġcle": 1100, + "Ġopen": 1101, + "Ġob": 1102, + "tend": 1103, + "Ġfind": 1104, + "Ġper": 1105, + "Ġcalled": 1106, + "Ġsur": 1107, + "rew": 1108, + "Ġpaper": 1109, + "ĠBadger": 1110, + "Ġmeet": 1111, + "iss": 1112, + "\"That": 1113, + "erms": 1114, + "TE": 1115, + "itten": 1116, + "ably": 1117, + "ness": 1118, + "Ġcannot": 1119, + "Ġsimp": 1120, + "con": 1121, + "Ġreason": 1122, + "you": 1123, + "Ġhome": 1124, + "by": 1125, + "Ġfight": 1126, + "ittle": 1127, + "Ġthings": 1128, + "Ġeas": 1129, + "Ġimp": 1130, + "ressed": 1131, + "Ġmean": 1132, + "Ġappeared": 1133, + "Ġnat": 1134, + "Ġhel": 1135, + "ret": 1136, + "aken": 1137, + "Ġstraight": 1138, + "Ġaffair": 1139, + "iting": 1140, + "Ġed": 1141, + "Ġsince": 1142, + "log": 1143, + "Ġpay": 1144, + "Ġfront": 1145, + "my": 1146, + "Ġvoice": 1147, + "ready": 1148, + "Ġfool": 1149, + "oundation": 1150, + "Ġelectronic": 1151, + "Ġterms": 1152, + "Ġmar": 1153, + "apan": 1154, + "any": 1155, + "Ġresp": 1156, + "Ġend": 1157, + "app": 1158, + "what": 1159, + "str": 1160, + "rap": 1161, + "ial": 1162, + "icul": 1163, + "Ġacc": 1164, + "oth": 1165, + "Ġsecond": 1166, + "Ġflo": 1167, + "Ġsix": 1168, + "Ġfeet": 1169, + "br": 1170, + "iet": 1171, + "Ġlittle": 1172, + "les": 1173, + "Ġmoney": 1174, + "Ġdecl": 1175, + "Ġey": 1176, + "Ġcomp": 1177, + "aring": 1178, + "Ġagre": 1179, + "where": 1180, + "ĠSt": 1181, + "Ġstre": 1182, + "ex": 1183, + "ract": 1184, + "Ġint": 1185, + "Ġdire": 1186, + "Ġbecome": 1187, + "Ġhon": 1188, + "Ġconsid": 1189, + "ertain": 1190, + "now": 1191, + "Ġsl": 1192, + "itor": 1193, + "gg": 1194, + "Ġjum": 1195, + "Ġbu": 1196, + "Ġthing": 1197, + "Ġanswered": 1198, + "oes": 1199, + "ya": 1200, + "ĠThat": 1201, + "ize": 1202, + "ond": 1203, + "act": 1204, + "Ġeff": 1205, + "Ġbang": 1206, + "about": 1207, + "Ġbed": 1208, + "orrow": 1209, + "ung": 1210, + "ĠTo": 1211, + "Ġkept": 1212, + "Ġwal": 1213, + "Ġbath": 1214, + "Ġdra": 1215, + "\"A": 1216, + "rings": 1217, + "hopp": 1218, + "Ġresign": 1219, + "Ġdin": 1220, + "Ġlady": 1221, + ".E": 1222, + "Ġuse": 1223, + "lish": 1224, + "ors": 1225, + "Ġwritten": 1226, + "ene": 1227, + "iv": 1228, + "Ġdif": 1229, + "Ġste": 1230, + "Ġstory": 1231, + "com": 1232, + "res": 1233, + "ently": 1234, + "Ġfact": 1235, + "hes": 1236, + "ways": 1237, + "Ġwhy": 1238, + "Ġthough": 1239, + "Ġstr": 1240, + "onder": 1241, + "head": 1242, + "Ġcour": 1243, + "Ġmon": 1244, + "Ġsk": 1245, + "Ġbelie": 1246, + "Ġlet": 1247, + "fer": 1248, + "Ġrequ": 1249, + "Ġline": 1250, + "room": 1251, + "-day": 1252, + "Ġdone": 1253, + "Ġdoes": 1254, + "ĠOne": 1255, + "Ġdango": 1256, + "asshopp": 1257, + "Ġconsider": 1258, + "Ġdinner": 1259, + "ĠFoundation": 1260, + "**": 1261, + "empt": 1262, + "ese": 1263, + "Ġword": 1264, + "rest": 1265, + "Ġenough": 1266, + "Ġgreat": 1267, + "Ġname": 1268, + "Ġpub": 1269, + "Ġmanner": 1270, + "wer": 1271, + "ict": 1272, + "iness": 1273, + "Ġhimself": 1274, + "Ġpeople": 1275, + "ew": 1276, + "Ġcor": 1277, + "estion": 1278, + "Ġbig": 1279, + "ee": 1280, + "Ġri": 1281, + "ides": 1282, + "Ġbrother": 1283, + "Ġheart": 1284, + "ected": 1285, + "eed": 1286, + "Ġothers": 1287, + "sol": 1288, + "ted": 1289, + "Ġeyes": 1290, + "Ġtrouble": 1291, + "Ġteach": 1292, + "Ġboat": 1293, + "Ġfour": 1294, + "Ġalready": 1295, + "rom": 1296, + "ghed": 1297, + "Ġsqu": 1298, + "Ġpol": 1299, + "ces": 1300, + "ĠHott": 1301, + "Ġleave": 1302, + "Ġdistribut": 1303, + "aster": 1304, + "CH": 1305, + "uc": 1306, + "Ġim": 1307, + "Ġhowever": 1308, + "there": 1309, + "apanese": 1310, + "Ġlast": 1311, + "Ġcr": 1312, + "ility": 1313, + "Ġsimple": 1314, + "Ġlife": 1315, + "-c": 1316, + "Ġregard": 1317, + "Ġfin": 1318, + "ual": 1319, + "Ġmeans": 1320, + "Ġstand": 1321, + "atch": 1322, + "Ġshort": 1323, + "ned": 1324, + "Ġseen": 1325, + "Ġhapp": 1326, + "-k": 1327, + "Ġagainst": 1328, + "him": 1329, + "amed": 1330, + "Ġstood": 1331, + "Ġgra": 1332, + "Ġmother": 1333, + "Ġfish": 1334, + "Ġwater": 1335, + "ail": 1336, + "cei": 1337, + "Ġrather": 1338, + "Ġins": 1339, + "Ġfeel": 1340, + "Ġalso": 1341, + "Ġord": 1342, + "Ġcoming": 1343, + "ics": 1344, + "Ġeither": 1345, + "nce": 1346, + "Ġ'": 1347, + "Ġkid": 1348, + "Ġlaughed": 1349, + "like": 1350, + "ĠAr": 1351, + "gr": 1352, + "ĠHotta": 1353, + "Ġtalk": 1354, + "gether": 1355, + "ĠSir": 1356, + "Ġpun": 1357, + "Pro": 1358, + "ats": 1359, + "most": 1360, + "Ġrep": 1361, + "Ġgi": 1362, + "isf": 1363, + "bably": 1364, + "akes": 1365, + "ĠNot": 1366, + "ny": 1367, + "Ġappear": 1368, + "mp": 1369, + "cha": 1370, + "Ġact": 1371, + "bed": 1372, + "ief": 1373, + "uff": 1374, + "Ġapo": 1375, + "Ġmet": 1376, + "Ġreturned": 1377, + "Ġsound": 1378, + "usiness": 1379, + "Ġlaugh": 1380, + "Ġclear": 1381, + "Ġneed": 1382, + "fess": 1383, + "ested": 1384, + "Ġinv": 1385, + "Ġaccept": 1386, + "under": 1387, + ";Ċ": 1388, + "Ġsurpr": 1389, + "de": 1390, + "Ġtrain": 1391, + "Ġhotel": 1392, + "Ġsleep": 1393, + "Ġdr": 1394, + "Ġhold": 1395, + "lock": 1396, + "pura": 1397, + "Ġsprings": 1398, + "Ġ......": 1399, + "Ġagreement": 1400, + "ĠDar": 1401, + "Ġrest": 1402, + "clud": 1403, + "ator": 1404, + "av": 1405, + "Ġorig": 1406, + "Ġorigin": 1407, + "Ġel": 1408, + "Ġnor": 1409, + "Ġpres": 1410, + "Ġunderstand": 1411, + "Ġtaken": 1412, + "Ġlight": 1413, + "ener": 1414, + "some": 1415, + "Ġbrought": 1416, + "raph": 1417, + "Ġmost": 1418, + "oke": 1419, + "-w": 1420, + "Ġunt": 1421, + "Ġfather": 1422, + "Ġused": 1423, + "Ġeat": 1424, + "Ġyears": 1425, + "ĠWhile": 1426, + "Ġchan": 1427, + "Ġsudd": 1428, + "Ġsudden": 1429, + "Ġapolog": 1430, + "Ġsett": 1431, + "Ġthin": 1432, + "ĠMy": 1433, + "Ġten": 1434, + "imes": 1435, + "for": 1436, + "oud": 1437, + "When": 1438, + "Ġdet": 1439, + "Ġlive": 1440, + "Ġoc": 1441, + "Ġfive": 1442, + "Ġcont": 1443, + "Ġhelp": 1444, + "Ġwa": 1445, + "Ġpassed": 1446, + "Ġrun": 1447, + "Ġmaking": 1448, + "Ġstrange": 1449, + "Ġtaking": 1450, + "Ġeach": 1451, + "\"You": 1452, + "Ġanother": 1453, + "\"Say": 1454, + "\"The": 1455, + "ates": 1456, + "Ġpleas": 1457, + "asshoppers": 1458, + "Ġmom": 1459, + "Ġmoment": 1460, + "entle": 1461, + "nglish": 1462, + "CHA": 1463, + "Ġoriginal": 1464, + "ions": 1465, + "uring": 1466, + "Ġpublic": 1467, + "uct": 1468, + "uck": 1469, + "Ġquestion": 1470, + "ai": 1471, + "cy": 1472, + "ek": 1473, + "Ġfloor": 1474, + "Ġcar": 1475, + "ouse": 1476, + "Ġside": 1477, + "-ya": 1478, + "Ġcertain": 1479, + "hys": 1480, + "-d": 1481, + "igh": 1482, + "agin": 1483, + "weet": 1484, + "Ġpoor": 1485, + "Ġdecid": 1486, + "ually": 1487, + "Ġbusiness": 1488, + "pro": 1489, + "plain": 1490, + "Ġstop": 1491, + "!Ċ": 1492, + "ĠHow": 1493, + "\"What": 1494, + "can": 1495, + "ĠUn": 1496, + "ps": 1497, + "und": 1498, + "-night": 1499, + "Ġmeeting": 1500, + "edo": 1501, + "Ġraise": 1502, + "Gutenberg": 1503, + "ĠDarling": 1504, + "ume": 1505, + "ĠEnglish": 1506, + "TER": 1507, + "ading": 1508, + "Ġtransl": 1509, + "Ġable": 1510, + "ssible": 1511, + "Ġsatisf": 1512, + "Ġwanted": 1513, + "Ġsub": 1514, + "Ġcase": 1515, + "ific": 1516, + "iterary": 1517, + "Ġmaid": 1518, + "Ġinc": 1519, + "Ġpos": 1520, + "Ġposition": 1521, + "Ġpat": 1522, + "ured": 1523, + "orry": 1524, + "Ġaccount": 1525, + "Ġboth": 1526, + "Ġfrie": 1527, + "Ġfriend": 1528, + "this": 1529, + "Ġalways": 1530, + "Ġparticul": 1531, + "What": 1532, + "Ġsmall": 1533, + "enty": 1534, + "ushed": 1535, + "Ġmis": 1536, + "ully": 1537, + "Ġrecei": 1538, + "You": 1539, + "Ġyet": 1540, + "Ġgave": 1541, + "But": 1542, + "had": 1543, + "Ġanswer": 1544, + "Ġabs": 1545, + "ile": 1546, + "cket": 1547, + "Ġnood": 1548, + "Ġcourse": 1549, + "Ġform": 1550, + "Ġeverything": 1551, + "ection": 1552, + "If": 1553, + "part": 1554, + "Ġsing": 1555, + "Ġsit": 1556, + "Ġpur": 1557, + "ip": 1558, + "Ġfishing": 1559, + "Ġeh": 1560, + "Ġpar": 1561, + "Ġtogether": 1562, + "He": 1563, + "Ġwhe": 1564, + "Ġwhether": 1565, + "Ġbra": 1566, + "\"Yes": 1567, + "Ġpunish": 1568, + "Shirt": 1569, + "ĠYedo": 1570, + "Ġfarew": 1571, + "Ġfarewell": 1572, + "Ġdance": 1573, + "Ġless": 1574, + "ural": 1575, + "Ġdef": 1576, + "Ġattempt": 1577, + "ween": 1578, + "Ġsign": 1579, + "Ġsy": 1580, + "ferent": 1581, + "Ġleast": 1582, + "ser": 1583, + "ob": 1584, + "nding": 1585, + "Ġsorry": 1586, + "Ġjumped": 1587, + "Ġjan": 1588, + "Ġjanitor": 1589, + "ized": 1590, + "Ġtoward": 1591, + "Ġmor": 1592, + "aving": 1593, + "Ġbit": 1594, + "\"This": 1595, + "Ġremark": 1596, + "Ġfut": 1597, + "Ġwonder": 1598, + "Ġfun": 1599, + "Then": 1600, + "Ġdec": 1601, + "Ġwhom": 1602, + "Ġdidn": 1603, + "Ġrec": 1604, + "bec": 1605, + "\"If": 1606, + "Ġknew": 1607, + "after": 1608, + "Ġthus": 1609, + "Ġisn": 1610, + "Ġsight": 1611, + "med": 1612, + "[F": 1613, + "uss": 1614, + "cident": 1615, + "them": 1616, + "Ġfif": 1617, + "Ġdraw": 1618, + "Ġhear": 1619, + "Ġwriting": 1620, + "Ġgetting": 1621, + "sh": 1622, + "ference": 1623, + "Ġraised": 1624, + "they": 1625, + "ax": 1626, + "Ġfine": 1627, + "sel": 1628, + "ĠNobe": 1629, + "ĠNobeok": 1630, + "ĠNobeoka": 1631, + "ormal": 1632, + "ĠeB": 1633, + "icense": 1634, + "00": 1635, + "Ġbest": 1636, + "wor": 1637, + "fic": 1638, + "terest": 1639, + "Ġremar": 1640, + "bl": 1641, + "arted": 1642, + "Ġdark": 1643, + "Ġyoung": 1644, + "ush": 1645, + "Ġbet": 1646, + "outh": 1647, + "house": 1648, + "aught": 1649, + "Ġphys": 1650, + "Ġstrong": 1651, + "Ġfur": 1652, + "Ġroll": 1653, + "cove": 1654, + "chief": 1655, + "awa": 1656, + "Ġfollowed": 1657, + "Ġfond": 1658, + "Ġfuture": 1659, + "ird": 1660, + "fully": 1661, + "Ġeffort": 1662, + "After": 1663, + "oward": 1664, + "Ġreally": 1665, + "Ġamong": 1666, + "Ġaround": 1667, + "Ġcompl": 1668, + "Ġgaz": 1669, + "Ġbow": 1670, + "ater": 1671, + "Ġinsist": 1672, + "Ġturned": 1673, + "hel": 1674, + "rem": 1675, + "Ġhours": 1676, + "Ġdecided": 1677, + "ys": 1678, + "Ġmonth": 1679, + "-a": 1680, + "Ġadv": 1681, + "Ġbelieve": 1682, + "Ġteaching": 1683, + "Ġeasy": 1684, + "Ġdirection": 1685, + "ooked": 1686, + "Ġwar": 1687, + "Ġunless": 1688, + "have": 1689, + "Ġsquare": 1690, + "vil": 1691, + "Ġquiet": 1692, + "Ġhung": 1693, + "Ġgoes": 1694, + "Ġpaid": 1695, + "Ġshall": 1696, + "\"No": 1697, + "Ġpunishment": 1698, + "pose": 1699, + "Ġsweet": 1700, + "'ve": 1701, + "\"Well": 1702, + "Ġgentle": 1703, + "Ġnormal": 1704, + "agraph": 1705, + "chive": 1706, + "chan": 1707, + "Ġinclud": 1708, + "ww": 1709, + "org": 1710, + "tem": 1711, + "AR": 1712, + "ĠTH": 1713, + "Ġequ": 1714, + "Ġtone": 1715, + "Ġpossible": 1716, + "Ġbecom": 1717, + "ĠJapanese": 1718, + "vers": 1719, + "Ġfollowing": 1720, + "Ġpain": 1721, + "Ġwhole": 1722, + "wr": 1723, + "Ġserious": 1724, + "Ġnar": 1725, + "Ġtired": 1726, + "In": 1727, + "Ġplay": 1728, + "Ġprom": 1729, + "Ġgame": 1730, + "ĠSome": 1731, + "Ġhappened": 1732, + "Ġcut": 1733, + "Ġtwenty": 1734, + "Ġdoor": 1735, + "Ġmorning": 1736, + "hind": 1737, + "Ġbre": 1738, + "Ġinside": 1739, + "ove": 1740, + "alth": 1741, + "uk": 1742, + "arge": 1743, + "amb": 1744, + "Ġdam": 1745, + "Ġworry": 1746, + "ative": 1747, + "Ġexpected": 1748, + "Ġfam": 1749, + "Ġpra": 1750, + "Ġpocket": 1751, + "ooks": 1752, + "ched": 1753, + "Ġsil": 1754, + "ol": 1755, + "Ġfav": 1756, + "Ġelse": 1757, + "Ġhigh": 1758, + "Ġreal": 1759, + "Ġalong": 1760, + "Ġmed": 1761, + "hik": 1762, + "hemat": 1763, + "hematics": 1764, + "Ġlist": 1765, + "Ġsick": 1766, + "oint": 1767, + "[Foot": 1768, + "[Footnot": 1769, + "[Footnote": 1770, + ".]Ċ": 1771, + "night": 1772, + "ses": 1773, + "ior": 1774, + "Ġsays": 1775, + "Ġmouth": 1776, + "how": 1777, + "ming": 1778, + "Ġclo": 1779, + "Ġcur": 1780, + "ging": 1781, + "Ġsuddenly": 1782, + "-ah": 1783, + "amp": 1784, + "Ġblack": 1785, + "ross": 1786, + "Ġfac": 1787, + "selves": 1788, + "iew": 1789, + "ission": 1790, + "Ġcopyright": 1791, + "Ġparagraph": 1792, + "ĠArchive": 1793, + "Ġdonations": 1794, + "Project": 1795, + "Ġcost": 1796, + ".org": 1797, + "LI": 1798, + "uced": 1799, + "Ġsuc": 1800, + "yle": 1801, + "Ġforce": 1802, + "joy": 1803, + "ouch": 1804, + "tr": 1805, + "It": 1806, + "Ġtrad": 1807, + "Ġpresent": 1808, + "Ġext": 1809, + "ased": 1810, + "redit": 1811, + "Ġfault": 1812, + "ib": 1813, + "-m": 1814, + "urd": 1815, + "Ġtried": 1816, + "time": 1817, + "Ġpret": 1818, + "Ġspee": 1819, + "ower": 1820, + "Ġwords": 1821, + "CHAP": 1822, + "CHAPTER": 1823, + "school": 1824, + "Ġask": 1825, + "Ġdoing": 1826, + "ately": 1827, + "Ġuntil": 1828, + "bout": 1829, + "Ġtree": 1830, + "call": 1831, + "amash": 1832, + "amashir": 1833, + "amashiro": 1834, + "ste": 1835, + "Ġbehind": 1836, + "old": 1837, + "Ġwall": 1838, + "itory": 1839, + "Ġrolled": 1840, + "Ġmove": 1841, + "Ġapologize": 1842, + "Ġlarge": 1843, + "amboo": 1844, + "su": 1845, + "Ġsettled": 1846, + "\"He": 1847, + "wo": 1848, + "Ġthinking": 1849, + "used": 1850, + "ified": 1851, + "Ġalmost": 1852, + "Ġtre": 1853, + "Ġtreat": 1854, + "Ġnoodle": 1855, + "Ġnote": 1856, + "ĠAll": 1857, + "Ġbeat": 1858, + "Ġobject": 1859, + "Ġseems": 1860, + "Ġide": 1861, + "Yes": 1862, + "ows": 1863, + "Ġremain": 1864, + "Ġbegin": 1865, + "ught": 1866, + "ments": 1867, + "Ġalone": 1868, + "spect": 1869, + "Ġmathematics": 1870, + "Ġrough": 1871, + "Ġoutside": 1872, + "Ġcomes": 1873, + "back": 1874, + "Ġwind": 1875, + "sed": 1876, + "Ġwouldn": 1877, + "eer": 1878, + "inut": 1879, + "from": 1880, + "Ġrepl": 1881, + "Ġnarrow": 1882, + "Ġincident": 1883, + "Ġair": 1884, + "Ġsea": 1885, + "ts": 1886, + "Ġsurprised": 1887, + "Ġtea": 1888, + "Red": 1889, + "Ġtalking": 1890, + "Ġboss": 1891, + "que": 1892, + "Ġpict": 1893, + "irty": 1894, + "Ġce": 1895, + "Ġlim": 1896, + "ĠWhy": 1897, + "Ġpoint": 1898, + "Ġlaw": 1899, + "ciated": 1900, + "Ġmoon": 1901, + "ircu": 1902, + "got": 1903, + "ĠIs": 1904, + "Ġhands": 1905, + "Ġhonor": 1906, + "aut": 1907, + "rge": 1908, + "Ġstate": 1909, + "ĠLiterary": 1910, + ".F": 1911, + "This": 1912, + "line": 1913, + ".g": 1914, + ".gutenberg": 1915, + "ĠOF": 1916, + "EN": 1917, + "racter": 1918, + "Ġbene": 1919, + "ĠEven": 1920, + "oub": 1921, + "Ġmakes": 1922, + "Ġinterest": 1923, + "ope": 1924, + "ms": 1925, + "Ġrespons": 1926, + "Ġfore": 1927, + "Ġsomewhat": 1928, + "Ġhonest": 1929, + "ock": 1930, + "irit": 1931, + "Ġheld": 1932, + "Ġadded": 1933, + "fu": 1934, + "aded": 1935, + "als": 1936, + "att": 1937, + "tern": 1938, + "Ġpersonal": 1939, + "Ġass": 1940, + "ĠWith": 1941, + "tic": 1942, + "Tokyo": 1943, + "Ġshout": 1944, + "Ġpretty": 1945, + "umb": 1946, + "Ġearly": 1947, + "opped": 1948, + "Ġfurther": 1949, + "Ġfre": 1950, + "esides": 1951, + "Ġbamboo": 1952, + "Ġir": 1953, + "more": 1954, + "Ġliving": 1955, + "Ġreceived": 1956, + "Ġlived": 1957, + "Ġmeant": 1958, + "Ġcoward": 1959, + "position": 1960, + "Ġloc": 1961, + "iled": 1962, + "Ġtender": 1963, + "Ġch": 1964, + "ĠAfter": 1965, + "cer": 1966, + "Ġfavor": 1967, + "who": 1968, + "Ġliked": 1969, + "rance": 1970, + "Ġpri": 1971, + "kisha": 1972, + "Ġstudy": 1973, + "Ġorder": 1974, + "Ġafterward": 1975, + "Ġgreatly": 1976, + "Ġunable": 1977, + "go": 1978, + "Ġwait": 1979, + "eping": 1980, + "iding": 1981, + "Ġforty": 1982, + "Ġsky": 1983, + "Ġoffice": 1984, + "will": 1985, + "\"D": 1986, + "wel": 1987, + "Ġstation": 1988, + "bo": 1989, + "hot": 1990, + "such": 1991, + "Ġloud": 1992, + "Ġaw": 1993, + "land": 1994, + "?Ċ": 1995, + "Ġrespect": 1996, + "ances": 1997, + "ient": 1998, + "Ġought": 1999 + }, + "merges": [ + [ + "Ġ", + "t" + ], + [ + "h", + "e" + ], + [ + "Ġ", + "a" + ], + [ + "i", + "n" + ], + [ + "Ġ", + "s" + ], + [ + "Ġ", + "w" + ], + [ + "Ġ", + "the" + ], + [ + "Ġt", + "he" + ], + [ + "Ġth", + "e" + ], + [ + "Ġ", + "o" + ], + [ + "r", + "e" + ], + [ + "Ġ", + "b" + ], + [ + "o", + "u" + ], + [ + "e", + "d" + ], + [ + "Ġ", + "m" + ], + [ + "n", + "d" + ], + [ + "Ġ", + "I" + ], + [ + "h", + "a" + ], + [ + "i", + "t" + ], + [ + "e", + "r" + ], + [ + "i", + "ng" + ], + [ + "in", + "g" + ], + [ + "Ġ", + "f" + ], + [ + "i", + "s" + ], + [ + "Ġ", + "to" + ], + [ + "Ġt", + "o" + ], + [ + "e", + "n" + ], + [ + "o", + "n" + ], + [ + "o", + "r" + ], + [ + "a", + "s" + ], + [ + "Ġ", + "c" + ], + [ + "Ġ", + "of" + ], + [ + "Ġo", + "f" + ], + [ + "Ġ", + "and" + ], + [ + "Ġa", + "nd" + ], + [ + "Ġan", + "d" + ], + [ + "Ġ", + "d" + ], + [ + "l", + "l" + ], + [ + "a", + "t" + ], + [ + "a", + "n" + ], + [ + "a", + "r" + ], + [ + "Ġ", + "p" + ], + [ + "Ġ", + "n" + ], + [ + "Ġ", + "in" + ], + [ + "l", + "e" + ], + [ + "o", + "m" + ], + [ + "o", + "t" + ], + [ + "Ġ", + "be" + ], + [ + "Ġb", + "e" + ], + [ + "Ġ", + "h" + ], + [ + "u", + "t" + ], + [ + "o", + "w" + ], + [ + "e", + "s" + ], + [ + "h", + "at" + ], + [ + "ha", + "t" + ], + [ + "Ġ", + "g" + ], + [ + "Ġ", + "he" + ], + [ + "Ġh", + "e" + ], + [ + "Ġ", + "ha" + ], + [ + "Ġh", + "a" + ], + [ + "Ġ", + "l" + ], + [ + "Ġ", + "was" + ], + [ + "Ġw", + "as" + ], + [ + "Ġwa", + "s" + ], + [ + "l", + "d" + ], + [ + "g", + "h" + ], + [ + "i", + "d" + ], + [ + "c", + "h" + ], + [ + "Ġ", + "th" + ], + [ + "Ġt", + "h" + ], + [ + "Ġ", + "it" + ], + [ + "a", + "y" + ], + [ + "Ġ", + "on" + ], + [ + "Ġo", + "n" + ], + [ + "c", + "e" + ], + [ + "s", + "e" + ], + [ + "e", + "nt" + ], + [ + "en", + "t" + ], + [ + "Ġ", + "st" + ], + [ + "Ġs", + "t" + ], + [ + "l", + "y" + ], + [ + "v", + "e" + ], + [ + "e", + "t" + ], + [ + "s", + "t" + ], + [ + "Ġ", + "T" + ], + [ + "Ġ", + "e" + ], + [ + "Ġ", + "y" + ], + [ + "gh", + "t" + ], + [ + "i", + "r" + ], + [ + "Ġ", + "me" + ], + [ + "Ġm", + "e" + ], + [ + "o", + "o" + ], + [ + "a", + "l" + ], + [ + "i", + "th" + ], + [ + "it", + "h" + ], + [ + "Ġ", + "re" + ], + [ + "Ġr", + "e" + ], + [ + "i", + "m" + ], + [ + "Ġ", + "that" + ], + [ + "Ġt", + "hat" + ], + [ + "Ġth", + "at" + ], + [ + "Ġ", + "as" + ], + [ + "Ġa", + "s" + ], + [ + "ou", + "ld" + ], + [ + "r", + "o" + ], + [ + "a", + "d" + ], + [ + "i", + "on" + ], + [ + ".", + "Ċ" + ], + [ + "h", + "er" + ], + [ + "he", + "r" + ], + [ + "Ġ", + "my" + ], + [ + "Ġm", + "y" + ], + [ + "c", + "t" + ], + [ + "Ġ", + "not" + ], + [ + "Ġn", + "ot" + ], + [ + "Ġno", + "t" + ], + [ + "Ġ", + "with" + ], + [ + "Ġw", + "ith" + ], + [ + "Ġ", + "for" + ], + [ + "Ġf", + "or" + ], + [ + "Ġ", + "u" + ], + [ + "k", + "e" + ], + [ + "Ġ", + "you" + ], + [ + "Ġy", + "ou" + ], + [ + "Ġ", + "S" + ], + [ + "Ġ", + "is" + ], + [ + "i", + "ght" + ], + [ + "igh", + "t" + ], + [ + "\"", + "Ċ" + ], + [ + "a", + "m" + ], + [ + "i", + "c" + ], + [ + "u", + "r" + ], + [ + "Ġ", + "at" + ], + [ + "Ġa", + "t" + ], + [ + ".", + "." + ], + [ + "a", + "c" + ], + [ + "t", + "er" + ], + [ + "te", + "r" + ], + [ + "Ġw", + "h" + ], + [ + "Ġ", + "an" + ], + [ + "Ġa", + "n" + ], + [ + "Ġ", + "we" + ], + [ + "Ġw", + "e" + ], + [ + "Ġ", + "The" + ], + [ + "ĠT", + "he" + ], + [ + "i", + "f" + ], + [ + "Ġ", + "or" + ], + [ + "Ġo", + "r" + ], + [ + "Ġ", + "but" + ], + [ + "Ġb", + "ut" + ], + [ + "Ġbu", + "t" + ], + [ + "v", + "er" + ], + [ + "ve", + "r" + ], + [ + "Ġ", + "\"" + ], + [ + "Ġ", + "r" + ], + [ + "o", + "ut" + ], + [ + "ou", + "t" + ], + [ + "o", + "me" + ], + [ + "om", + "e" + ], + [ + "Ġ", + "had" + ], + [ + "Ġh", + "ad" + ], + [ + "Ġha", + "d" + ], + [ + "p", + "p" + ], + [ + "q", + "u" + ], + [ + "Ġ", + "su" + ], + [ + "Ġs", + "u" + ], + [ + "Ġ", + "this" + ], + [ + "Ġt", + "his" + ], + [ + "Ġth", + "is" + ], + [ + "r", + "ed" + ], + [ + "re", + "d" + ], + [ + "ar", + "d" + ], + [ + "Ġ", + "so" + ], + [ + "Ġs", + "o" + ], + [ + "e", + "ll" + ], + [ + "el", + "l" + ], + [ + "Ġ", + "would" + ], + [ + "Ġw", + "ould" + ], + [ + "Ġ", + "his" + ], + [ + "Ġh", + "is" + ], + [ + "Ġ", + "sh" + ], + [ + "Ġs", + "h" + ], + [ + "i", + "ne" + ], + [ + "in", + "e" + ], + [ + "r", + "a" + ], + [ + "Ġ", + "se" + ], + [ + "Ġs", + "e" + ], + [ + "Ġ", + "by" + ], + [ + "Ġb", + "y" + ], + [ + ".", + "\"Ċ" + ], + [ + ".\"", + "Ċ" + ], + [ + "Ġ", + "P" + ], + [ + "h", + "en" + ], + [ + "he", + "n" + ], + [ + "Ġ", + "A" + ], + [ + "Ġ", + "have" + ], + [ + "Ġh", + "ave" + ], + [ + "Ġha", + "ve" + ], + [ + "Ġf", + "r" + ], + [ + "Ġs", + "a" + ], + [ + "Ġ", + "H" + ], + [ + "Ġ", + "one" + ], + [ + "Ġo", + "ne" + ], + [ + "Ġon", + "e" + ], + [ + "e", + "m" + ], + [ + "k", + "ed" + ], + [ + "ke", + "d" + ], + [ + "ir", + "t" + ], + [ + "e", + "ct" + ], + [ + "Ġ", + "him" + ], + [ + "Ġh", + "im" + ], + [ + "Ġ", + "li" + ], + [ + "Ġl", + "i" + ], + [ + "Ġ", + "ab" + ], + [ + "Ġa", + "b" + ], + [ + "at", + "ion" + ], + [ + "h", + "ing" + ], + [ + "hi", + "ng" + ], + [ + "t", + "he" + ], + [ + "th", + "e" + ], + [ + "Ġ", + "R" + ], + [ + "Ġ", + "le" + ], + [ + "Ġl", + "e" + ], + [ + "s", + "s" + ], + [ + "Ġ", + "W" + ], + [ + "c", + "u" + ], + [ + "i", + "ll" + ], + [ + "il", + "l" + ], + [ + "'", + "t" + ], + [ + "ar", + "t" + ], + [ + "a", + "ll" + ], + [ + "al", + "l" + ], + [ + ",", + "Ċ" + ], + [ + "o", + "wn" + ], + [ + "ow", + "n" + ], + [ + "o", + "re" + ], + [ + "or", + "e" + ], + [ + "Ġ", + "all" + ], + [ + "Ġa", + "ll" + ], + [ + "Ġal", + "l" + ], + [ + "Ġ", + "k" + ], + [ + "Ġ", + "go" + ], + [ + "Ġg", + "o" + ], + [ + "h", + "irt" + ], + [ + "a", + "nd" + ], + [ + "an", + "d" + ], + [ + "Ġ", + "out" + ], + [ + "Ġo", + "ut" + ], + [ + "a", + "me" + ], + [ + "am", + "e" + ], + [ + "a", + "in" + ], + [ + "ai", + "n" + ], + [ + "Ġ", + "if" + ], + [ + "Ġ", + "no" + ], + [ + "Ġn", + "o" + ], + [ + "Ġd", + "o" + ], + [ + "Ġ", + "they" + ], + [ + "Ġthe", + "y" + ], + [ + "o", + "ol" + ], + [ + "oo", + "l" + ], + [ + "u", + "n" + ], + [ + "t", + "o" + ], + [ + "Ġ", + "up" + ], + [ + "Ġu", + "p" + ], + [ + "Ġ", + "Red" + ], + [ + "ĠR", + "ed" + ], + [ + "Ġ", + "ne" + ], + [ + "Ġn", + "e" + ], + [ + "Ġ", + "K" + ], + [ + "Ġ", + "from" + ], + [ + "Ġf", + "rom" + ], + [ + "Ġfr", + "om" + ], + [ + "Ġ", + "Shirt" + ], + [ + "ĠS", + "hirt" + ], + [ + "Ġ", + "wor" + ], + [ + "Ġw", + "or" + ], + [ + "o", + "ng" + ], + [ + "on", + "g" + ], + [ + "Ġ", + "there" + ], + [ + "Ġt", + "here" + ], + [ + "Ġthe", + "re" + ], + [ + "Ġs", + "aid" + ], + [ + "Ġsa", + "id" + ], + [ + "r", + "i" + ], + [ + "a", + "nt" + ], + [ + "an", + "t" + ], + [ + "Ġ", + "B" + ], + [ + "Ġ", + "any" + ], + [ + "Ġa", + "ny" + ], + [ + "Ġan", + "y" + ], + [ + "u", + "d" + ], + [ + "i", + "nd" + ], + [ + "in", + "d" + ], + [ + "Ġw", + "hi" + ], + [ + "Ġwh", + "i" + ], + [ + "a", + "b" + ], + [ + "o", + "und" + ], + [ + "ou", + "nd" + ], + [ + "Ġ", + "about" + ], + [ + "Ġa", + "bout" + ], + [ + "Ġab", + "out" + ], + [ + "Ġ", + "them" + ], + [ + "Ġthe", + "m" + ], + [ + "Ġth", + "em" + ], + [ + "c", + "up" + ], + [ + "cu", + "p" + ], + [ + "a", + "k" + ], + [ + "Ġ", + "de" + ], + [ + "Ġd", + "e" + ], + [ + "Ġ", + "te" + ], + [ + "Ġt", + "e" + ], + [ + "Ġ", + "M" + ], + [ + "a", + "ke" + ], + [ + "ak", + "e" + ], + [ + "cup", + "ine" + ], + [ + "i", + "g" + ], + [ + "Ġwe", + "re" + ], + [ + "or", + "cupine" + ], + [ + "i", + "l" + ], + [ + "ch", + "ool" + ], + [ + "Ġ", + "ro" + ], + [ + "Ġr", + "o" + ], + [ + "o", + "od" + ], + [ + "oo", + "d" + ], + [ + "Ġ", + "are" + ], + [ + "Ġa", + "re" + ], + [ + "Ġar", + "e" + ], + [ + "i", + "ve" + ], + [ + "iv", + "e" + ], + [ + "Ġ", + "like" + ], + [ + "Ġli", + "ke" + ], + [ + "y", + "o" + ], + [ + "Ġh", + "ou" + ], + [ + "Ġho", + "u" + ], + [ + "'", + "s" + ], + [ + "o", + "ne" + ], + [ + "on", + "e" + ], + [ + "u", + "s" + ], + [ + "e", + "l" + ], + [ + "u", + "l" + ], + [ + "a", + "ck" + ], + [ + "ac", + "k" + ], + [ + "o", + "p" + ], + [ + ",", + "\"" + ], + [ + "t", + "h" + ], + [ + "ac", + "her" + ], + [ + "ach", + "er" + ], + [ + "u", + "m" + ], + [ + "a", + "ng" + ], + [ + "an", + "g" + ], + [ + "Ġf", + "a" + ], + [ + "a", + "g" + ], + [ + "Ġ", + "school" + ], + [ + "Ġs", + "chool" + ], + [ + "Ġ", + "j" + ], + [ + "t", + "e" + ], + [ + "o", + "k" + ], + [ + "e", + "ss" + ], + [ + "es", + "s" + ], + [ + "u", + "st" + ], + [ + "us", + "t" + ], + [ + "er", + "s" + ], + [ + "..", + ".." + ], + [ + "Ġ", + "C" + ], + [ + "t", + "her" + ], + [ + "the", + "r" + ], + [ + "th", + "er" + ], + [ + "h", + "an" + ], + [ + "ha", + "n" + ], + [ + "Ġw", + "hen" + ], + [ + "Ġwh", + "en" + ], + [ + "Ġwhe", + "n" + ], + [ + "Ġ", + "sp" + ], + [ + "Ġs", + "p" + ], + [ + "Ġ", + "man" + ], + [ + "Ġm", + "an" + ], + [ + "Ġ", + "can" + ], + [ + "Ġc", + "an" + ], + [ + "ou", + "gh" + ], + [ + "Ġ", + "who" + ], + [ + "Ġw", + "ho" + ], + [ + "Ġwh", + "o" + ], + [ + "Ġ", + "get" + ], + [ + "Ġg", + "et" + ], + [ + "Ġge", + "t" + ], + [ + "Ġd", + "id" + ], + [ + "Ġ", + "po" + ], + [ + "Ġp", + "o" + ], + [ + "c", + "i" + ], + [ + "Ġ", + "al" + ], + [ + "Ġa", + "l" + ], + [ + "i", + "st" + ], + [ + "is", + "t" + ], + [ + "Ġ", + "com" + ], + [ + "Ġc", + "om" + ], + [ + "Ġco", + "m" + ], + [ + "l", + "f" + ], + [ + "a", + "u" + ], + [ + "Ġ", + "Porcupine" + ], + [ + "ĠP", + "orcupine" + ], + [ + "Ġwhi", + "ch" + ], + [ + "v", + "en" + ], + [ + "ve", + "n" + ], + [ + "Ġa", + "f" + ], + [ + "w", + "n" + ], + [ + "a", + "ss" + ], + [ + "as", + "s" + ], + [ + "b", + "er" + ], + [ + "be", + "r" + ], + [ + "Ġ", + "ex" + ], + [ + "Ġe", + "x" + ], + [ + "o", + "us" + ], + [ + "ou", + "s" + ], + [ + "e", + "st" + ], + [ + "es", + "t" + ], + [ + "l", + "o" + ], + [ + "Ġ", + "tr" + ], + [ + "Ġt", + "r" + ], + [ + "ell", + "ow" + ], + [ + "Ġs", + "ay" + ], + [ + "Ġsa", + "y" + ], + [ + "o", + "ught" + ], + [ + "ou", + "ght" + ], + [ + "ough", + "t" + ], + [ + "Ġ", + "room" + ], + [ + "Ġro", + "om" + ], + [ + "Ġ", + "some" + ], + [ + "Ġs", + "ome" + ], + [ + "Ġso", + "me" + ], + [ + "-", + "-" + ], + [ + "Ġ", + "O" + ], + [ + "a", + "te" + ], + [ + "at", + "e" + ], + [ + "Ġ", + "v" + ], + [ + "h", + "ed" + ], + [ + "he", + "d" + ], + [ + "a", + "p" + ], + [ + "Ġt", + "w" + ], + [ + "Ġ", + "bec" + ], + [ + "Ġbe", + "c" + ], + [ + "r", + "ee" + ], + [ + "re", + "e" + ], + [ + "j", + "ect" + ], + [ + "k", + "s" + ], + [ + "Ġ", + "con" + ], + [ + "Ġc", + "on" + ], + [ + "Ġco", + "n" + ], + [ + "Ġbe", + "en" + ], + [ + "en", + "ts" + ], + [ + "ent", + "s" + ], + [ + "i", + "de" + ], + [ + "id", + "e" + ], + [ + "Ġc", + "ould" + ], + [ + "Ġ", + "G" + ], + [ + "e", + "p" + ], + [ + "Ġ", + "pro" + ], + [ + "Ġp", + "ro" + ], + [ + "Ġpr", + "o" + ], + [ + "n", + "t" + ], + [ + "Ġ", + "house" + ], + [ + "Ġh", + "ouse" + ], + [ + "Ġhou", + "se" + ], + [ + "Ġ", + "ag" + ], + [ + "Ġa", + "g" + ], + [ + "Ġ", + "If" + ], + [ + "ĠI", + "f" + ], + [ + "Ġk", + "n" + ], + [ + "Ġf", + "ellow" + ], + [ + "Ġ", + "what" + ], + [ + "Ġw", + "hat" + ], + [ + "Ġwh", + "at" + ], + [ + "w", + "ay" + ], + [ + "i", + "sh" + ], + [ + "is", + "h" + ], + [ + "Ġ", + "am" + ], + [ + "Ġa", + "m" + ], + [ + "i", + "te" + ], + [ + "it", + "e" + ], + [ + "nd", + "er" + ], + [ + "i", + "me" + ], + [ + "im", + "e" + ], + [ + "Ġ", + "pr" + ], + [ + "Ġp", + "r" + ], + [ + "Ġ", + "teacher" + ], + [ + "Ġte", + "acher" + ], + [ + "Ġteach", + "er" + ], + [ + "a", + "re" + ], + [ + "ar", + "e" + ], + [ + "Ġ", + "bo" + ], + [ + "Ġb", + "o" + ], + [ + "Ġs", + "he" + ], + [ + "Ġsh", + "e" + ], + [ + "Ġ", + "N" + ], + [ + "i", + "ce" + ], + [ + "ic", + "e" + ], + [ + "a", + "st" + ], + [ + "as", + "t" + ], + [ + "u", + "re" + ], + [ + "ur", + "e" + ], + [ + "i", + "e" + ], + [ + "Ġ", + "such" + ], + [ + "Ġs", + "uch" + ], + [ + "Ġsu", + "ch" + ], + [ + "Ġsuc", + "h" + ], + [ + "ut", + "en" + ], + [ + "uten", + "ber" + ], + [ + "utenber", + "g" + ], + [ + "Ġ", + "qu" + ], + [ + "l", + "own" + ], + [ + "lo", + "wn" + ], + [ + "Ġ", + "wr" + ], + [ + "Ġw", + "r" + ], + [ + "p", + "t" + ], + [ + "Ġ", + "He" + ], + [ + "ĠH", + "e" + ], + [ + "Ġst", + "ud" + ], + [ + "he", + "re" + ], + [ + "her", + "e" + ], + [ + "Ġ", + "more" + ], + [ + "Ġm", + "ore" + ], + [ + "Ġmo", + "re" + ], + [ + "Ġmor", + "e" + ], + [ + "r", + "y" + ], + [ + "t", + "ter" + ], + [ + "Ġ", + "Y" + ], + [ + "Ġm", + "ay" + ], + [ + "i", + "ty" + ], + [ + "it", + "y" + ], + [ + "Ġl", + "oo" + ], + [ + "Ġlo", + "o" + ], + [ + "Ġ", + "other" + ], + [ + "Ġo", + "ther" + ], + [ + "h", + "is" + ], + [ + "hi", + "s" + ], + [ + "Ġ", + "Pro" + ], + [ + "ĠP", + "ro" + ], + [ + "Ġ", + "will" + ], + [ + "Ġw", + "ill" + ], + [ + "Ġ", + "It" + ], + [ + "ĠI", + "t" + ], + [ + "or", + "t" + ], + [ + "Ġsh", + "ould" + ], + [ + "ve", + "ry" + ], + [ + "ver", + "y" + ], + [ + "w", + "e" + ], + [ + "Ġ", + "pl" + ], + [ + "Ġp", + "l" + ], + [ + "a", + "sh" + ], + [ + "as", + "h" + ], + [ + ".", + "\"" + ], + [ + "Ġ", + "app" + ], + [ + "Ġa", + "pp" + ], + [ + "Ġ", + "day" + ], + [ + "Ġd", + "ay" + ], + [ + "ur", + "n" + ], + [ + "p", + "o" + ], + [ + "Ġ", + "her" + ], + [ + "Ġh", + "er" + ], + [ + "Ġhe", + "r" + ], + [ + "Ġ", + "Ġ" + ], + [ + "n", + "ot" + ], + [ + "no", + "t" + ], + [ + "c", + "k" + ], + [ + "Ġ", + "un" + ], + [ + "Ġu", + "n" + ], + [ + "h", + "i" + ], + [ + "v", + "ing" + ], + [ + "Ġ", + "old" + ], + [ + "Ġo", + "ld" + ], + [ + "Ġ", + "time" + ], + [ + "Ġt", + "ime" + ], + [ + "\"", + "T" + ], + [ + "Ġ", + "way" + ], + [ + "Ġw", + "ay" + ], + [ + "Ġwa", + "y" + ], + [ + "a", + "ble" + ], + [ + "ab", + "le" + ], + [ + "?", + "\"Ċ" + ], + [ + "?\"", + "Ċ" + ], + [ + "ĠC", + "lown" + ], + [ + "Ġon", + "ly" + ], + [ + "u", + "b" + ], + [ + "a", + "ch" + ], + [ + "ac", + "h" + ], + [ + "Ġo", + "ff" + ], + [ + "Ġof", + "f" + ], + [ + "Ġt", + "han" + ], + [ + "Ġth", + "an" + ], + [ + "al", + "ly" + ], + [ + "all", + "y" + ], + [ + "Ġthe", + "ir" + ], + [ + "b", + "e" + ], + [ + "k", + "ing" + ], + [ + "o", + "ther" + ], + [ + "ot", + "her" + ], + [ + "oth", + "er" + ], + [ + "a", + "ry" + ], + [ + "ar", + "y" + ], + [ + "an", + "s" + ], + [ + "a", + "ted" + ], + [ + "at", + "ed" + ], + [ + "ate", + "d" + ], + [ + "se", + "lf" + ], + [ + "sel", + "f" + ], + [ + "Ġgo", + "ing" + ], + [ + "u", + "ch" + ], + [ + "uc", + "h" + ], + [ + "o", + "ll" + ], + [ + "ol", + "l" + ], + [ + "Ġ", + "back" + ], + [ + "Ġb", + "ack" + ], + [ + "i", + "yo" + ], + [ + "-", + "t" + ], + [ + "a", + "nce" + ], + [ + "an", + "ce" + ], + [ + "a", + "de" + ], + [ + "ad", + "e" + ], + [ + "Ġ", + "Project" + ], + [ + "ĠPro", + "ject" + ], + [ + "s", + "p" + ], + [ + "Ġt", + "wo" + ], + [ + "Ġtw", + "o" + ], + [ + "Ġth", + "ought" + ], + [ + "Ġthough", + "t" + ], + [ + "s", + "o" + ], + [ + "Ġ", + "right" + ], + [ + "Ġr", + "ight" + ], + [ + "Ġri", + "ght" + ], + [ + "Ġ", + "head" + ], + [ + "Ġhe", + "ad" + ], + [ + "v", + "ed" + ], + [ + "ve", + "d" + ], + [ + "Ġ", + "D" + ], + [ + "Ġp", + "re" + ], + [ + "Ġpr", + "e" + ], + [ + "Ġs", + "ee" + ], + [ + "Ġse", + "e" + ], + [ + "Ġ", + "us" + ], + [ + "Ġu", + "s" + ], + [ + "Ġstud", + "ents" + ], + [ + "c", + "ip" + ], + [ + "ci", + "p" + ], + [ + "Ġd", + "on" + ], + [ + "Ġdo", + "n" + ], + [ + "Ġ", + "night" + ], + [ + "Ġn", + "ight" + ], + [ + "in", + "cip" + ], + [ + "ĠK", + "iyo" + ], + [ + "p", + "l" + ], + [ + "a", + "red" + ], + [ + "ar", + "ed" + ], + [ + "are", + "d" + ], + [ + "Ġ", + "Gutenberg" + ], + [ + "ĠG", + "utenberg" + ], + [ + "Ġ", + "co" + ], + [ + "Ġc", + "o" + ], + [ + "Ġ", + "how" + ], + [ + "Ġh", + "ow" + ], + [ + "Ġho", + "w" + ], + [ + "om", + "et" + ], + [ + "ome", + "t" + ], + [ + "f", + "f" + ], + [ + "\"", + "I" + ], + [ + ",", + "--" + ], + [ + "Ġas", + "ked" + ], + [ + "Ġask", + "ed" + ], + [ + "incip", + "al" + ], + [ + "e", + "ver" + ], + [ + "Ġ", + "ac" + ], + [ + "Ġa", + "c" + ], + [ + "Ġ", + "F" + ], + [ + "Ġm", + "ake" + ], + [ + "it", + "t" + ], + [ + "Ġm", + "ight" + ], + [ + "g", + "e" + ], + [ + "l", + "ed" + ], + [ + "le", + "d" + ], + [ + "Ġ", + "after" + ], + [ + "Ġa", + "fter" + ], + [ + "Ġaf", + "ter" + ], + [ + "ig", + "n" + ], + [ + "Ġ", + "gr" + ], + [ + "Ġg", + "r" + ], + [ + "Ġm", + "ade" + ], + [ + "d", + "d" + ], + [ + "Ġk", + "now" + ], + [ + "Ġkn", + "ow" + ], + [ + "Ġc", + "ome" + ], + [ + "Ġcom", + "e" + ], + [ + "Ġco", + "me" + ], + [ + "Ġ", + "br" + ], + [ + "Ġb", + "r" + ], + [ + "t", + "hing" + ], + [ + "th", + "ing" + ], + [ + "Ġ", + "But" + ], + [ + "ĠB", + "ut" + ], + [ + "Ġ", + "mat" + ], + [ + "Ġm", + "at" + ], + [ + "ĠO", + "n" + ], + [ + "o", + "ry" + ], + [ + "or", + "y" + ], + [ + "c", + "l" + ], + [ + "Ġ", + "E" + ], + [ + "b", + "le" + ], + [ + "bl", + "e" + ], + [ + "o", + "g" + ], + [ + "Ġy", + "our" + ], + [ + "Ġyou", + "r" + ], + [ + "u", + "ll" + ], + [ + "ul", + "l" + ], + [ + "Ġwor", + "k" + ], + [ + "e", + "ar" + ], + [ + "Ġth", + "ree" + ], + [ + "i", + "ed" + ], + [ + "ie", + "d" + ], + [ + "b", + "ut" + ], + [ + "T", + "he" + ], + [ + "p", + "e" + ], + [ + "a", + "ce" + ], + [ + "ac", + "e" + ], + [ + "Ġst", + "art" + ], + [ + "i", + "ck" + ], + [ + "ic", + "k" + ], + [ + "Ġo", + "ver" + ], + [ + "o", + "ur" + ], + [ + "ou", + "r" + ], + [ + "Ġm", + "uch" + ], + [ + "Ġw", + "ant" + ], + [ + "Ġwa", + "nt" + ], + [ + "i", + "mp" + ], + [ + "im", + "p" + ], + [ + "Ġ", + "part" + ], + [ + "Ġp", + "art" + ], + [ + "Ġpar", + "t" + ], + [ + "h", + "o" + ], + [ + "in", + "k" + ], + [ + "e", + "nce" + ], + [ + "en", + "ce" + ], + [ + "Ġd", + "own" + ], + [ + "Ġdo", + "wn" + ], + [ + "Ġe", + "ven" + ], + [ + "Ġpr", + "incipal" + ], + [ + "l", + "ing" + ], + [ + "li", + "ng" + ], + [ + "ou", + "nt" + ], + [ + "au", + "se" + ], + [ + "Ġ", + "cl" + ], + [ + "Ġc", + "l" + ], + [ + "Ġ", + "bl" + ], + [ + "Ġb", + "l" + ], + [ + "-t", + "m" + ], + [ + "ome", + "thing" + ], + [ + "omet", + "hing" + ], + [ + "Ġin", + "to" + ], + [ + "Ġint", + "o" + ], + [ + "or", + "m" + ], + [ + "ok", + "yo" + ], + [ + "Ġd", + "is" + ], + [ + "Ġ", + "fe" + ], + [ + "Ġf", + "e" + ], + [ + "Ġf", + "ace" + ], + [ + "Ġfa", + "ce" + ], + [ + "Ġfac", + "e" + ], + [ + "..", + "...." + ], + [ + "....", + ".." + ], + [ + "r", + "ess" + ], + [ + "re", + "ss" + ], + [ + "res", + "s" + ], + [ + "m", + "ent" + ], + [ + "me", + "nt" + ], + [ + "i", + "re" + ], + [ + "ir", + "e" + ], + [ + "Ġ", + "ar" + ], + [ + "Ġa", + "r" + ], + [ + "t", + "y" + ], + [ + "Ġm", + "o" + ], + [ + "re", + "at" + ], + [ + "Ġf", + "ir" + ], + [ + "p", + "er" + ], + [ + "pe", + "r" + ], + [ + "Ġ", + "our" + ], + [ + "Ġo", + "ur" + ], + [ + "c", + "o" + ], + [ + "Ġt", + "hen" + ], + [ + "Ġthe", + "n" + ], + [ + "Ġth", + "en" + ], + [ + "Ġt", + "old" + ], + [ + "Ġto", + "ld" + ], + [ + "ing", + "s" + ], + [ + "Ġt", + "ake" + ], + [ + "Ġbe", + "g" + ], + [ + "n", + "er" + ], + [ + "ne", + "r" + ], + [ + "it", + "ion" + ], + [ + "o", + "se" + ], + [ + "os", + "e" + ], + [ + "Ġ", + "own" + ], + [ + "Ġo", + "wn" + ], + [ + "Ġag", + "ain" + ], + [ + "Ġse", + "em" + ], + [ + "Ġsee", + "m" + ], + [ + "i", + "se" + ], + [ + "is", + "e" + ], + [ + "Ġw", + "at" + ], + [ + "Ġwa", + "t" + ], + [ + "\"", + "W" + ], + [ + "Ġf", + "ar" + ], + [ + "Ġfa", + "r" + ], + [ + "a", + "king" + ], + [ + "ak", + "ing" + ], + [ + "f", + "ore" + ], + [ + "for", + "e" + ], + [ + "ad", + "y" + ], + [ + "-", + "s" + ], + [ + "l", + "ess" + ], + [ + "le", + "ss" + ], + [ + "les", + "s" + ], + [ + "Ġ", + "ret" + ], + [ + "Ġre", + "t" + ], + [ + "Ġr", + "et" + ], + [ + "Ġs", + "ha" + ], + [ + "Ġsh", + "a" + ], + [ + "Ġc", + "ame" + ], + [ + "g", + "er" + ], + [ + "ge", + "r" + ], + [ + "Ġg", + "ood" + ], + [ + "Ġgo", + "od" + ], + [ + "a", + "ther" + ], + [ + "at", + "her" + ], + [ + "ath", + "er" + ], + [ + "ar", + "k" + ], + [ + "r", + "ow" + ], + [ + "ro", + "w" + ], + [ + "Ġ", + "ke" + ], + [ + "Ġk", + "e" + ], + [ + "'", + "m" + ], + [ + "Ġh", + "as" + ], + [ + "Ġha", + "s" + ], + [ + "a", + "th" + ], + [ + "at", + "h" + ], + [ + "p", + "ped" + ], + [ + "pp", + "ed" + ], + [ + "Ġw", + "ent" + ], + [ + "Ġwe", + "nt" + ], + [ + "Ġt", + "ell" + ], + [ + "Ġte", + "ll" + ], + [ + "qu", + "ash" + ], + [ + "Ġ", + "en" + ], + [ + "Ġe", + "n" + ], + [ + "Ġfir", + "st" + ], + [ + "Ġ", + "hot" + ], + [ + "Ġh", + "ot" + ], + [ + "Ġho", + "t" + ], + [ + "i", + "z" + ], + [ + "Ġa", + "way" + ], + [ + "Ġaw", + "ay" + ], + [ + "Ġs", + "omething" + ], + [ + "Ġsome", + "thing" + ], + [ + "Ġ", + "rem" + ], + [ + "Ġre", + "m" + ], + [ + "Ġr", + "em" + ], + [ + "Ġt", + "own" + ], + [ + "Ġto", + "wn" + ], + [ + "Ġs", + "m" + ], + [ + "Ġ", + "This" + ], + [ + "ĠT", + "his" + ], + [ + "Ġbe", + "tter" + ], + [ + "Ġbet", + "ter" + ], + [ + "Ġ", + "Then" + ], + [ + "ĠT", + "hen" + ], + [ + "ĠThe", + "n" + ], + [ + "w", + "as" + ], + [ + "o", + "f" + ], + [ + "b", + "ard" + ], + [ + "Ġ", + "L" + ], + [ + "l", + "i" + ], + [ + "f", + "e" + ], + [ + "Ġ", + "Tokyo" + ], + [ + "ĠT", + "okyo" + ], + [ + "Ġl", + "ong" + ], + [ + "Ġlo", + "ng" + ], + [ + "i", + "ly" + ], + [ + "il", + "y" + ], + [ + "Ġs", + "ure" + ], + [ + "Ġsu", + "re" + ], + [ + "Ġsur", + "e" + ], + [ + "Ġl", + "ooked" + ], + [ + "Ġloo", + "ked" + ], + [ + "Ġlook", + "ed" + ], + [ + "ub", + "bard" + ], + [ + "ct", + "ion" + ], + [ + "or", + "d" + ], + [ + "Ġm", + "any" + ], + [ + "Ġman", + "y" + ], + [ + "i", + "ous" + ], + [ + "Ġt", + "oo" + ], + [ + "Ġto", + "o" + ], + [ + "Ġ", + "here" + ], + [ + "Ġhe", + "re" + ], + [ + "Ġher", + "e" + ], + [ + "o", + "s" + ], + [ + "Ġ", + "under" + ], + [ + "Ġu", + "nder" + ], + [ + "a", + "se" + ], + [ + "as", + "e" + ], + [ + "n", + "g" + ], + [ + "p", + "ed" + ], + [ + "pe", + "d" + ], + [ + "o", + "d" + ], + [ + "m", + "e" + ], + [ + "Ġj", + "ust" + ], + [ + "Ġ", + "now" + ], + [ + "Ġn", + "ow" + ], + [ + "Ġno", + "w" + ], + [ + "i", + "nce" + ], + [ + "in", + "ce" + ], + [ + "Ġhe", + "ard" + ], + [ + "Ġhear", + "d" + ], + [ + "Ġk", + "ind" + ], + [ + "ĠThe", + "y" + ], + [ + "Ġbe", + "fore" + ], + [ + "h", + "y" + ], + [ + "Ġ", + "In" + ], + [ + "ĠI", + "n" + ], + [ + "Ġ", + "ent" + ], + [ + "Ġe", + "nt" + ], + [ + "Ġen", + "t" + ], + [ + "Ġbo", + "ard" + ], + [ + "!", + "\"" + ], + [ + "w", + "ard" + ], + [ + "Ġbe", + "ing" + ], + [ + "Ġw", + "ell" + ], + [ + "Ġwe", + "ll" + ], + [ + "er", + "m" + ], + [ + "r", + "ied" + ], + [ + "ri", + "ed" + ], + [ + "Ġwr", + "ong" + ], + [ + "a", + "id" + ], + [ + "ai", + "d" + ], + [ + "x", + "t" + ], + [ + "Ġret", + "urn" + ], + [ + "i", + "ted" + ], + [ + "it", + "ed" + ], + [ + "ite", + "d" + ], + [ + "Ġy", + "en" + ], + [ + "Ġmat", + "ter" + ], + [ + "Ġ", + "call" + ], + [ + "Ġc", + "all" + ], + [ + "Ġt", + "al" + ], + [ + "Ġ", + "You" + ], + [ + "ĠY", + "ou" + ], + [ + "c", + "ed" + ], + [ + "ce", + "d" + ], + [ + "i", + "sed" + ], + [ + "is", + "ed" + ], + [ + "ise", + "d" + ], + [ + "Ġ", + "cha" + ], + [ + "Ġc", + "ha" + ], + [ + "Ġch", + "a" + ], + [ + "on", + "s" + ], + [ + "Ġs", + "ame" + ], + [ + "Ġsa", + "me" + ], + [ + "Ġo", + "nce" + ], + [ + "Ġon", + "ce" + ], + [ + "d", + "ay" + ], + [ + "f", + "t" + ], + [ + "Ġs", + "w" + ], + [ + "Ġbec", + "ause" + ], + [ + "Ġth", + "ink" + ], + [ + "Ġthin", + "k" + ], + [ + "Ġ", + "where" + ], + [ + "Ġw", + "here" + ], + [ + "Ġwhe", + "re" + ], + [ + "Ġ", + "No" + ], + [ + "ĠN", + "o" + ], + [ + "ĠH", + "ubbard" + ], + [ + "ĠS", + "quash" + ], + [ + "Ġc", + "op" + ], + [ + "Ġco", + "p" + ], + [ + "w", + "ith" + ], + [ + "e", + "red" + ], + [ + "er", + "ed" + ], + [ + "oll", + "ow" + ], + [ + "Ġpl", + "ace" + ], + [ + "i", + "dd" + ], + [ + "id", + "d" + ], + [ + "c", + "ess" + ], + [ + "ce", + "ss" + ], + [ + "ces", + "s" + ], + [ + "Ġs", + "how" + ], + [ + "Ġsh", + "ow" + ], + [ + "is", + "ha" + ], + [ + "ish", + "a" + ], + [ + "Ġ", + "ra" + ], + [ + "Ġr", + "a" + ], + [ + "Ġle", + "tter" + ], + [ + "Ġlet", + "ter" + ], + [ + "n", + "e" + ], + [ + "v", + "es" + ], + [ + "ve", + "s" + ], + [ + "a", + "ting" + ], + [ + "at", + "ing" + ], + [ + "r", + "ang" + ], + [ + "ra", + "ng" + ], + [ + "Ġa", + "ff" + ], + [ + "Ġaf", + "f" + ], + [ + "Ġh", + "and" + ], + [ + "Ġha", + "nd" + ], + [ + "Ġs", + "c" + ], + [ + "Ġp", + "ers" + ], + [ + "Ġper", + "s" + ], + [ + "i", + "nt" + ], + [ + "in", + "t" + ], + [ + "p", + "r" + ], + [ + "s", + "ide" + ], + [ + "sid", + "e" + ], + [ + "f", + "ter" + ], + [ + "ft", + "er" + ], + [ + "Ġsa", + "ying" + ], + [ + "Ġsay", + "ing" + ], + [ + "Ġl", + "au" + ], + [ + "t", + "hat" + ], + [ + "th", + "at" + ], + [ + "Ġwith", + "out" + ], + [ + "r", + "on" + ], + [ + "ro", + "n" + ], + [ + "a", + "ir" + ], + [ + "ai", + "r" + ], + [ + "l", + "ect" + ], + [ + "le", + "ct" + ], + [ + "Ġ", + "What" + ], + [ + "ĠW", + "hat" + ], + [ + "el", + "t" + ], + [ + "Ġw", + "hile" + ], + [ + "Ġwh", + "ile" + ], + [ + "Ġwhi", + "le" + ], + [ + "og", + "a" + ], + [ + "a", + "per" + ], + [ + "ap", + "er" + ], + [ + "Ġ", + "pe" + ], + [ + "Ġp", + "e" + ], + [ + "o", + "y" + ], + [ + "Ġs", + "at" + ], + [ + "Ġsa", + "t" + ], + [ + "i", + "es" + ], + [ + "ie", + "s" + ], + [ + "Ġa", + "dd" + ], + [ + "Ġad", + "d" + ], + [ + "Ġday", + "s" + ], + [ + "Ġs", + "pe" + ], + [ + "Ġsp", + "e" + ], + [ + "Ġ", + "ho" + ], + [ + "Ġh", + "o" + ], + [ + "Ġ", + "ans" + ], + [ + "Ġan", + "s" + ], + [ + "Ġh", + "ar" + ], + [ + "Ġha", + "r" + ], + [ + "Ġ", + "When" + ], + [ + "ĠW", + "hen" + ], + [ + "Ġany", + "thing" + ], + [ + "p", + "en" + ], + [ + "pe", + "n" + ], + [ + "]", + "Ċ" + ], + [ + "t", + "ain" + ], + [ + "Ġm", + "ust" + ], + [ + "Ġn", + "ew" + ], + [ + "Ġne", + "w" + ], + [ + "l", + "ic" + ], + [ + "li", + "c" + ], + [ + "Ġv", + "o" + ], + [ + "h", + "ile" + ], + [ + "hi", + "le" + ], + [ + "g", + "et" + ], + [ + "ge", + "t" + ], + [ + "ĠA", + "s" + ], + [ + "Ġ", + "very" + ], + [ + "'", + "re" + ], + [ + "Ġe", + "very" + ], + [ + "a", + "ve" + ], + [ + "av", + "e" + ], + [ + "?", + "\"" + ], + [ + "ad", + "ger" + ], + [ + "ĠK", + "oga" + ], + [ + "ĠM", + "r" + ], + [ + "r", + "ough" + ], + [ + "ul", + "t" + ], + [ + "Ġf", + "ollow" + ], + [ + "t", + "ing" + ], + [ + "i", + "fe" + ], + [ + "if", + "e" + ], + [ + "idd", + "le" + ], + [ + "f", + "ul" + ], + [ + "fu", + "l" + ], + [ + "an", + "k" + ], + [ + "ĠS", + "o" + ], + [ + "Ġsee", + "med" + ], + [ + "Ġseem", + "ed" + ], + [ + "ĠA", + "nd" + ], + [ + "i", + "x" + ], + [ + "Ġs", + "et" + ], + [ + "Ġse", + "t" + ], + [ + "Ġc", + "are" + ], + [ + "Ġcar", + "e" + ], + [ + "Ġ", + "res" + ], + [ + "Ġre", + "s" + ], + [ + "Ġr", + "es" + ], + [ + "Ġn", + "ever" + ], + [ + "Ġne", + "ver" + ], + [ + "Ġf", + "ound" + ], + [ + "Ġ", + "lo" + ], + [ + "Ġl", + "o" + ], + [ + "c", + "id" + ], + [ + "ci", + "d" + ], + [ + "i", + "ned" + ], + [ + "in", + "ed" + ], + [ + "ine", + "d" + ], + [ + "Ġcl", + "ass" + ], + [ + "Ġmy", + "self" + ], + [ + "a", + "w" + ], + [ + "Ġw", + "om" + ], + [ + "at", + "ions" + ], + [ + "ation", + "s" + ], + [ + "Ġle", + "ft" + ], + [ + "ĠW", + "e" + ], + [ + "Ġteacher", + "s" + ], + [ + "Ġteach", + "ers" + ], + [ + "\"", + "Y" + ], + [ + "n", + "a" + ], + [ + "o", + "nt" + ], + [ + "on", + "t" + ], + [ + "Ġd", + "es" + ], + [ + "Ġde", + "s" + ], + [ + "Ġth", + "ose" + ], + [ + "i", + "red" + ], + [ + "ir", + "ed" + ], + [ + "ire", + "d" + ], + [ + "Ġs", + "en" + ], + [ + "Ġse", + "n" + ], + [ + "y", + "ing" + ], + [ + "Ġthe", + "se" + ], + [ + "Ġth", + "ese" + ], + [ + "a", + "z" + ], + [ + "ĠT", + "here" + ], + [ + "ĠThe", + "re" + ], + [ + "ce", + "pt" + ], + [ + "Ġd", + "ang" + ], + [ + "Ġ", + "U" + ], + [ + "\"", + "H" + ], + [ + "b", + "od" + ], + [ + "bo", + "d" + ], + [ + "bod", + "y" + ], + [ + "Ġh", + "aving" + ], + [ + "Ġha", + "ving" + ], + [ + "al", + "ary" + ], + [ + "Ġw", + "atch" + ], + [ + "Ġwat", + "ch" + ], + [ + "Ġg", + "ive" + ], + [ + "Ġgi", + "ve" + ], + [ + "a", + "ge" + ], + [ + "ag", + "e" + ], + [ + "Ġit", + "s" + ], + [ + "Ġapp", + "e" + ], + [ + "u", + "e" + ], + [ + "Ġc", + "ount" + ], + [ + "Ġh", + "ard" + ], + [ + "Ġhar", + "d" + ], + [ + "Ġb", + "el" + ], + [ + "Ġbe", + "l" + ], + [ + "ot", + "t" + ], + [ + "Ġd", + "ist" + ], + [ + "Ġdis", + "t" + ], + [ + "\"", + "S" + ], + [ + "ĠM", + "ad" + ], + [ + "-", + "n" + ], + [ + "ri", + "but" + ], + [ + "g", + "ed" + ], + [ + "ge", + "d" + ], + [ + "Ġ", + "att" + ], + [ + "Ġat", + "t" + ], + [ + "fe", + "re" + ], + [ + "fer", + "e" + ], + [ + "i", + "ther" + ], + [ + "it", + "her" + ], + [ + "ith", + "er" + ], + [ + "Ġup", + "on" + ], + [ + "Ġ", + "tem" + ], + [ + "Ġt", + "em" + ], + [ + "Ġte", + "m" + ], + [ + "Ġpers", + "on" + ], + [ + "n", + "ing" + ], + [ + "Ġc", + "he" + ], + [ + "Ġch", + "e" + ], + [ + "ar", + "ly" + ], + [ + "one", + "y" + ], + [ + "Ġso", + "on" + ], + [ + "e", + "ment" + ], + [ + "em", + "ent" + ], + [ + "Ġ", + "(" + ], + [ + "Ġtr", + "ans" + ], + [ + "Ġex", + "p" + ], + [ + "Ġ", + "ser" + ], + [ + "Ġs", + "er" + ], + [ + "Ġse", + "r" + ], + [ + "Ġre", + "g" + ], + [ + "as", + "on" + ], + [ + "Ġs", + "aw" + ], + [ + "Ġsa", + "w" + ], + [ + "Ġne", + "xt" + ], + [ + "o", + "ot" + ], + [ + "oo", + "t" + ], + [ + "Ġha", + "lf" + ], + [ + "Ġt", + "ook" + ], + [ + "Ġto", + "ok" + ], + [ + "Ġtoo", + "k" + ], + [ + "Ġb", + "ad" + ], + [ + "Ġh", + "our" + ], + [ + "Ġhou", + "r" + ], + [ + "Ġho", + "ur" + ], + [ + "Ġs", + "alary" + ], + [ + "Ġbeg", + "an" + ], + [ + "r", + "ight" + ], + [ + "ri", + "ght" + ], + [ + "on", + "na" + ], + [ + "-s", + "an" + ], + [ + "Ġwor", + "ks" + ], + [ + "Ġwork", + "s" + ], + [ + "Ġ", + "J" + ], + [ + "f", + "orm" + ], + [ + "for", + "m" + ], + [ + "ic", + "al" + ], + [ + "Ġt", + "ra" + ], + [ + "Ġtr", + "a" + ], + [ + "m", + "an" + ], + [ + "Ġnot", + "hing" + ], + [ + "Ġno", + "thing" + ], + [ + "Ġst", + "ill" + ], + [ + "ear", + "s" + ], + [ + "Ġsu", + "pp" + ], + [ + "Ġt", + "urn" + ], + [ + "Ġf", + "elt" + ], + [ + "Ġwom", + "an" + ], + [ + "Ġst", + "arted" + ], + [ + "Ġstart", + "ed" + ], + [ + "ou", + "ble" + ], + [ + "oub", + "le" + ], + [ + "u", + "ra" + ], + [ + "ur", + "a" + ], + [ + "is", + "hing" + ], + [ + "ish", + "ing" + ], + [ + ":", + "Ċ" + ], + [ + "lect", + "ron" + ], + [ + "lectron", + "ic" + ], + [ + "o", + "ok" + ], + [ + "oo", + "k" + ], + [ + "Ġcop", + "y" + ], + [ + "Ġf", + "ull" + ], + [ + "c", + "ond" + ], + [ + "co", + "nd" + ], + [ + "con", + "d" + ], + [ + "m", + "at" + ], + [ + "Ġm", + "iddle" + ], + [ + "Ġl", + "ook" + ], + [ + "Ġloo", + "k" + ], + [ + "Ġlo", + "ok" + ], + [ + "Ġcom", + "m" + ], + [ + "w", + "ered" + ], + [ + "we", + "red" + ], + [ + "wer", + "ed" + ], + [ + "Ġbec", + "ame" + ], + [ + "Ġfellow", + "s" + ], + [ + "w", + "ould" + ], + [ + "Ġ", + "got" + ], + [ + "Ġg", + "ot" + ], + [ + "Ġgo", + "t" + ], + [ + "Ġg", + "l" + ], + [ + "Ġg", + "u" + ], + [ + "Ġke", + "ep" + ], + [ + "Ġ", + "ge" + ], + [ + "Ġg", + "e" + ], + [ + "ĠMad", + "onna" + ], + [ + "i", + "ter" + ], + [ + "it", + "er" + ], + [ + "ite", + "r" + ], + [ + "is", + "hed" + ], + [ + "ish", + "ed" + ], + [ + "Ġunder", + "st" + ], + [ + "Ġst", + "ra" + ], + [ + "Ġstr", + "a" + ], + [ + "s", + "id" + ], + [ + "Ġcount", + "ry" + ], + [ + "o", + "ple" + ], + [ + "op", + "le" + ], + [ + "Ġpro", + "v" + ], + [ + "Ġp", + "ut" + ], + [ + "n", + "o" + ], + [ + "'", + "ll" + ], + [ + "Ġs", + "le" + ], + [ + "Ġsl", + "e" + ], + [ + "rang", + "e" + ], + [ + "ĠS", + "he" + ], + [ + "p", + "os" + ], + [ + "po", + "s" + ], + [ + "Ġm", + "ind" + ], + [ + "Ġp", + "ass" + ], + [ + "Ġth", + "rough" + ], + [ + "Ġqu", + "ite" + ], + [ + "Ġ", + "ind" + ], + [ + "Ġin", + "d" + ], + [ + "Ġboard", + "ing" + ], + [ + "te", + "acher" + ], + [ + "p", + "le" + ], + [ + "pl", + "e" + ], + [ + "P", + "orcupine" + ], + [ + "Ġ", + "ple" + ], + [ + "Ġp", + "le" + ], + [ + "Ġpl", + "e" + ], + [ + "Ġge", + "isha" + ], + [ + "ĠĠ", + "ĠĠ" + ], + [ + "o", + "st" + ], + [ + "os", + "t" + ], + [ + "en", + "se" + ], + [ + "N", + "o" + ], + [ + "i", + "ble" + ], + [ + "ib", + "le" + ], + [ + "Ġre", + "ad" + ], + [ + "Ġ", + "red" + ], + [ + "Ġre", + "d" + ], + [ + "Ġr", + "ed" + ], + [ + "ent", + "ion" + ], + [ + "e", + "ned" + ], + [ + "en", + "ed" + ], + [ + "ene", + "d" + ], + [ + "!", + "\"Ċ" + ], + [ + "!\"", + "Ċ" + ], + [ + "Ġre", + "f" + ], + [ + "Ġ", + "ad" + ], + [ + "Ġa", + "d" + ], + [ + "Ġf", + "l" + ], + [ + "Ġst", + "ay" + ], + [ + "u", + "p" + ], + [ + "Ġr", + "ound" + ], + [ + "Ġro", + "und" + ], + [ + "Ġc", + "le" + ], + [ + "Ġcl", + "e" + ], + [ + "Ġo", + "pen" + ], + [ + "Ġ", + "ob" + ], + [ + "Ġo", + "b" + ], + [ + "te", + "nd" + ], + [ + "Ġf", + "ind" + ], + [ + "Ġfin", + "d" + ], + [ + "Ġ", + "per" + ], + [ + "Ġp", + "er" + ], + [ + "Ġpe", + "r" + ], + [ + "Ġcall", + "ed" + ], + [ + "Ġs", + "ur" + ], + [ + "Ġsu", + "r" + ], + [ + "r", + "ew" + ], + [ + "re", + "w" + ], + [ + "Ġp", + "aper" + ], + [ + "ĠB", + "adger" + ], + [ + "Ġme", + "et" + ], + [ + "i", + "ss" + ], + [ + "is", + "s" + ], + [ + "\"T", + "hat" + ], + [ + "er", + "ms" + ], + [ + "erm", + "s" + ], + [ + "T", + "E" + ], + [ + "itt", + "en" + ], + [ + "ab", + "ly" + ], + [ + "n", + "ess" + ], + [ + "ne", + "ss" + ], + [ + "Ġcan", + "not" + ], + [ + "Ġs", + "imp" + ], + [ + "c", + "on" + ], + [ + "co", + "n" + ], + [ + "Ġre", + "ason" + ], + [ + "y", + "ou" + ], + [ + "yo", + "u" + ], + [ + "Ġh", + "ome" + ], + [ + "Ġho", + "me" + ], + [ + "b", + "y" + ], + [ + "Ġf", + "ight" + ], + [ + "itt", + "le" + ], + [ + "Ġth", + "ings" + ], + [ + "Ġthing", + "s" + ], + [ + "Ġe", + "as" + ], + [ + "Ġ", + "imp" + ], + [ + "Ġim", + "p" + ], + [ + "ress", + "ed" + ], + [ + "res", + "sed" + ], + [ + "Ġme", + "an" + ], + [ + "Ġappe", + "ared" + ], + [ + "Ġappear", + "ed" + ], + [ + "Ġn", + "at" + ], + [ + "Ġ", + "hel" + ], + [ + "Ġh", + "el" + ], + [ + "Ġhe", + "l" + ], + [ + "r", + "et" + ], + [ + "re", + "t" + ], + [ + "ak", + "en" + ], + [ + "ake", + "n" + ], + [ + "Ġstra", + "ight" + ], + [ + "Ġaff", + "air" + ], + [ + "i", + "ting" + ], + [ + "it", + "ing" + ], + [ + "Ġ", + "ed" + ], + [ + "Ġe", + "d" + ], + [ + "Ġs", + "ince" + ], + [ + "l", + "og" + ], + [ + "lo", + "g" + ], + [ + "Ġp", + "ay" + ], + [ + "Ġfr", + "ont" + ], + [ + "m", + "y" + ], + [ + "Ġvo", + "ice" + ], + [ + "re", + "ady" + ], + [ + "Ġf", + "ool" + ], + [ + "ound", + "ation" + ], + [ + "Ġe", + "lectronic" + ], + [ + "Ġt", + "erms" + ], + [ + "Ġm", + "ar" + ], + [ + "ap", + "an" + ], + [ + "a", + "ny" + ], + [ + "an", + "y" + ], + [ + "Ġre", + "sp" + ], + [ + "Ġres", + "p" + ], + [ + "Ġe", + "nd" + ], + [ + "Ġen", + "d" + ], + [ + "a", + "pp" + ], + [ + "ap", + "p" + ], + [ + "w", + "hat" + ], + [ + "s", + "tr" + ], + [ + "st", + "r" + ], + [ + "r", + "ap" + ], + [ + "ra", + "p" + ], + [ + "i", + "al" + ], + [ + "ic", + "ul" + ], + [ + "Ġac", + "c" + ], + [ + "o", + "th" + ], + [ + "ot", + "h" + ], + [ + "Ġse", + "cond" + ], + [ + "Ġf", + "lo" + ], + [ + "Ġfl", + "o" + ], + [ + "Ġs", + "ix" + ], + [ + "Ġfe", + "et" + ], + [ + "b", + "r" + ], + [ + "i", + "et" + ], + [ + "ie", + "t" + ], + [ + "Ġl", + "ittle" + ], + [ + "l", + "es" + ], + [ + "le", + "s" + ], + [ + "Ġm", + "oney" + ], + [ + "Ġde", + "cl" + ], + [ + "Ġdec", + "l" + ], + [ + "Ġe", + "y" + ], + [ + "Ġcom", + "p" + ], + [ + "Ġco", + "mp" + ], + [ + "ar", + "ing" + ], + [ + "Ġag", + "re" + ], + [ + "w", + "here" + ], + [ + "ĠS", + "t" + ], + [ + "Ġst", + "re" + ], + [ + "Ġstr", + "e" + ], + [ + "e", + "x" + ], + [ + "r", + "act" + ], + [ + "ra", + "ct" + ], + [ + "Ġ", + "int" + ], + [ + "Ġin", + "t" + ], + [ + "Ġd", + "ire" + ], + [ + "Ġbec", + "ome" + ], + [ + "Ġbecom", + "e" + ], + [ + "Ġh", + "on" + ], + [ + "Ġho", + "n" + ], + [ + "Ġcon", + "sid" + ], + [ + "er", + "tain" + ], + [ + "n", + "ow" + ], + [ + "no", + "w" + ], + [ + "Ġs", + "l" + ], + [ + "it", + "or" + ], + [ + "g", + "g" + ], + [ + "Ġj", + "um" + ], + [ + "Ġb", + "u" + ], + [ + "Ġ", + "thing" + ], + [ + "Ġt", + "hing" + ], + [ + "Ġth", + "ing" + ], + [ + "Ġthin", + "g" + ], + [ + "Ġans", + "wered" + ], + [ + "Ġanswer", + "ed" + ], + [ + "o", + "es" + ], + [ + "y", + "a" + ], + [ + "ĠT", + "hat" + ], + [ + "iz", + "e" + ], + [ + "o", + "nd" + ], + [ + "on", + "d" + ], + [ + "a", + "ct" + ], + [ + "ac", + "t" + ], + [ + "Ġe", + "ff" + ], + [ + "Ġb", + "ang" + ], + [ + "a", + "bout" + ], + [ + "ab", + "out" + ], + [ + "Ġ", + "bed" + ], + [ + "Ġb", + "ed" + ], + [ + "Ġbe", + "d" + ], + [ + "or", + "row" + ], + [ + "u", + "ng" + ], + [ + "un", + "g" + ], + [ + "ĠT", + "o" + ], + [ + "Ġke", + "pt" + ], + [ + "Ġw", + "al" + ], + [ + "Ġwa", + "l" + ], + [ + "Ġb", + "ath" + ], + [ + "Ġd", + "ra" + ], + [ + "Ġdr", + "a" + ], + [ + "\"", + "A" + ], + [ + "r", + "ings" + ], + [ + "ho", + "pp" + ], + [ + "Ġres", + "ign" + ], + [ + "Ġd", + "in" + ], + [ + "Ġl", + "ady" + ], + [ + ".", + "E" + ], + [ + "Ġu", + "se" + ], + [ + "Ġus", + "e" + ], + [ + "l", + "ish" + ], + [ + "li", + "sh" + ], + [ + "or", + "s" + ], + [ + "Ġwr", + "itten" + ], + [ + "e", + "ne" + ], + [ + "en", + "e" + ], + [ + "i", + "v" + ], + [ + "Ġd", + "if" + ], + [ + "Ġ", + "ste" + ], + [ + "Ġs", + "te" + ], + [ + "Ġst", + "e" + ], + [ + "Ġst", + "ory" + ], + [ + "c", + "om" + ], + [ + "co", + "m" + ], + [ + "r", + "es" + ], + [ + "re", + "s" + ], + [ + "ent", + "ly" + ], + [ + "Ġf", + "act" + ], + [ + "Ġfa", + "ct" + ], + [ + "Ġfac", + "t" + ], + [ + "h", + "es" + ], + [ + "he", + "s" + ], + [ + "way", + "s" + ], + [ + "Ġw", + "hy" + ], + [ + "Ġwh", + "y" + ], + [ + "Ġth", + "ough" + ], + [ + "Ġ", + "str" + ], + [ + "Ġs", + "tr" + ], + [ + "Ġst", + "r" + ], + [ + "o", + "nder" + ], + [ + "ond", + "er" + ], + [ + "he", + "ad" + ], + [ + "Ġc", + "our" + ], + [ + "Ġco", + "ur" + ], + [ + "Ġm", + "on" + ], + [ + "Ġmo", + "n" + ], + [ + "Ġs", + "k" + ], + [ + "Ġbel", + "ie" + ], + [ + "Ġl", + "et" + ], + [ + "Ġle", + "t" + ], + [ + "f", + "er" + ], + [ + "fe", + "r" + ], + [ + "Ġre", + "qu" + ], + [ + "Ġ", + "line" + ], + [ + "Ġl", + "ine" + ], + [ + "Ġli", + "ne" + ], + [ + "ro", + "om" + ], + [ + "-", + "day" + ], + [ + "-d", + "ay" + ], + [ + "Ġd", + "one" + ], + [ + "Ġdo", + "ne" + ], + [ + "Ġdon", + "e" + ], + [ + "Ġd", + "oes" + ], + [ + "Ġdo", + "es" + ], + [ + "ĠO", + "ne" + ], + [ + "ĠOn", + "e" + ], + [ + "Ġdang", + "o" + ], + [ + "ass", + "hopp" + ], + [ + "Ġconsid", + "er" + ], + [ + "Ġdin", + "ner" + ], + [ + "ĠF", + "oundation" + ], + [ + "*", + "*" + ], + [ + "em", + "pt" + ], + [ + "e", + "se" + ], + [ + "es", + "e" + ], + [ + "Ġw", + "ord" + ], + [ + "Ġwor", + "d" + ], + [ + "r", + "est" + ], + [ + "re", + "st" + ], + [ + "res", + "t" + ], + [ + "Ġen", + "ough" + ], + [ + "Ġg", + "reat" + ], + [ + "Ġn", + "ame" + ], + [ + "Ġp", + "ub" + ], + [ + "Ġman", + "ner" + ], + [ + "w", + "er" + ], + [ + "we", + "r" + ], + [ + "i", + "ct" + ], + [ + "ic", + "t" + ], + [ + "i", + "ness" + ], + [ + "in", + "ess" + ], + [ + "ine", + "ss" + ], + [ + "Ġhim", + "self" + ], + [ + "Ġpe", + "ople" + ], + [ + "e", + "w" + ], + [ + "Ġc", + "or" + ], + [ + "Ġco", + "r" + ], + [ + "est", + "ion" + ], + [ + "Ġb", + "ig" + ], + [ + "e", + "e" + ], + [ + "Ġ", + "ri" + ], + [ + "Ġr", + "i" + ], + [ + "id", + "es" + ], + [ + "ide", + "s" + ], + [ + "Ġbr", + "other" + ], + [ + "Ġhe", + "art" + ], + [ + "Ġhear", + "t" + ], + [ + "ect", + "ed" + ], + [ + "e", + "ed" + ], + [ + "ee", + "d" + ], + [ + "Ġother", + "s" + ], + [ + "s", + "ol" + ], + [ + "so", + "l" + ], + [ + "t", + "ed" + ], + [ + "te", + "d" + ], + [ + "Ġey", + "es" + ], + [ + "Ġtr", + "ouble" + ], + [ + "Ġte", + "ach" + ], + [ + "Ġtea", + "ch" + ], + [ + "Ġbo", + "at" + ], + [ + "Ġf", + "our" + ], + [ + "Ġal", + "ready" + ], + [ + "r", + "om" + ], + [ + "ro", + "m" + ], + [ + "g", + "hed" + ], + [ + "gh", + "ed" + ], + [ + "Ġs", + "qu" + ], + [ + "Ġp", + "ol" + ], + [ + "Ġpo", + "l" + ], + [ + "c", + "es" + ], + [ + "ce", + "s" + ], + [ + "ĠH", + "ott" + ], + [ + "Ġle", + "ave" + ], + [ + "Ġdist", + "ribut" + ], + [ + "as", + "ter" + ], + [ + "ast", + "er" + ], + [ + "C", + "H" + ], + [ + "u", + "c" + ], + [ + "Ġ", + "im" + ], + [ + "Ġhow", + "ever" + ], + [ + "t", + "here" + ], + [ + "the", + "re" + ], + [ + "ther", + "e" + ], + [ + "apan", + "ese" + ], + [ + "Ġl", + "ast" + ], + [ + "Ġc", + "r" + ], + [ + "il", + "ity" + ], + [ + "Ġsimp", + "le" + ], + [ + "Ġl", + "ife" + ], + [ + "Ġli", + "fe" + ], + [ + "-", + "c" + ], + [ + "Ġreg", + "ard" + ], + [ + "Ġf", + "in" + ], + [ + "u", + "al" + ], + [ + "Ġme", + "ans" + ], + [ + "Ġmean", + "s" + ], + [ + "Ġst", + "and" + ], + [ + "at", + "ch" + ], + [ + "Ġsh", + "ort" + ], + [ + "n", + "ed" + ], + [ + "ne", + "d" + ], + [ + "Ġse", + "en" + ], + [ + "Ġsee", + "n" + ], + [ + "Ġh", + "app" + ], + [ + "Ġha", + "pp" + ], + [ + "-", + "k" + ], + [ + "Ġagain", + "st" + ], + [ + "h", + "im" + ], + [ + "hi", + "m" + ], + [ + "a", + "med" + ], + [ + "am", + "ed" + ], + [ + "ame", + "d" + ], + [ + "Ġst", + "ood" + ], + [ + "Ġg", + "ra" + ], + [ + "Ġgr", + "a" + ], + [ + "Ġm", + "other" + ], + [ + "Ġmo", + "ther" + ], + [ + "Ġf", + "ish" + ], + [ + "Ġw", + "ater" + ], + [ + "Ġwat", + "er" + ], + [ + "Ġwa", + "ter" + ], + [ + "a", + "il" + ], + [ + "ai", + "l" + ], + [ + "ce", + "i" + ], + [ + "Ġr", + "ather" + ], + [ + "Ġra", + "ther" + ], + [ + "Ġin", + "s" + ], + [ + "Ġfe", + "el" + ], + [ + "Ġal", + "so" + ], + [ + "Ġ", + "ord" + ], + [ + "Ġor", + "d" + ], + [ + "Ġcom", + "ing" + ], + [ + "Ġco", + "ming" + ], + [ + "ic", + "s" + ], + [ + "Ġe", + "ither" + ], + [ + "n", + "ce" + ], + [ + "Ġ", + "'" + ], + [ + "Ġk", + "id" + ], + [ + "Ġlau", + "ghed" + ], + [ + "Ġlaugh", + "ed" + ], + [ + "li", + "ke" + ], + [ + "ĠA", + "r" + ], + [ + "g", + "r" + ], + [ + "ĠHott", + "a" + ], + [ + "Ġtal", + "k" + ], + [ + "ge", + "ther" + ], + [ + "get", + "her" + ], + [ + "ĠS", + "ir" + ], + [ + "Ġp", + "un" + ], + [ + "P", + "ro" + ], + [ + "a", + "ts" + ], + [ + "at", + "s" + ], + [ + "m", + "ost" + ], + [ + "Ġre", + "p" + ], + [ + "Ġr", + "ep" + ], + [ + "Ġg", + "i" + ], + [ + "is", + "f" + ], + [ + "b", + "ably" + ], + [ + "ak", + "es" + ], + [ + "ake", + "s" + ], + [ + "ĠN", + "ot" + ], + [ + "ĠNo", + "t" + ], + [ + "n", + "y" + ], + [ + "Ġapp", + "ear" + ], + [ + "Ġappe", + "ar" + ], + [ + "m", + "p" + ], + [ + "c", + "ha" + ], + [ + "ch", + "a" + ], + [ + "Ġ", + "act" + ], + [ + "Ġa", + "ct" + ], + [ + "Ġac", + "t" + ], + [ + "b", + "ed" + ], + [ + "be", + "d" + ], + [ + "ie", + "f" + ], + [ + "u", + "ff" + ], + [ + "Ġa", + "po" + ], + [ + "Ġm", + "et" + ], + [ + "Ġme", + "t" + ], + [ + "Ġreturn", + "ed" + ], + [ + "Ġs", + "ound" + ], + [ + "Ġso", + "und" + ], + [ + "us", + "iness" + ], + [ + "Ġlau", + "gh" + ], + [ + "Ġcl", + "ear" + ], + [ + "Ġcle", + "ar" + ], + [ + "Ġn", + "eed" + ], + [ + "Ġne", + "ed" + ], + [ + "f", + "ess" + ], + [ + "fe", + "ss" + ], + [ + "es", + "ted" + ], + [ + "est", + "ed" + ], + [ + "Ġin", + "v" + ], + [ + "Ġac", + "cept" + ], + [ + "u", + "nder" + ], + [ + "und", + "er" + ], + [ + ";", + "Ċ" + ], + [ + "Ġsur", + "pr" + ], + [ + "d", + "e" + ], + [ + "Ġtr", + "ain" + ], + [ + "Ġtra", + "in" + ], + [ + "Ġhot", + "el" + ], + [ + "Ġsle", + "ep" + ], + [ + "Ġd", + "r" + ], + [ + "Ġh", + "old" + ], + [ + "Ġho", + "ld" + ], + [ + "l", + "ock" + ], + [ + "lo", + "ck" + ], + [ + "p", + "ura" + ], + [ + "Ġsp", + "rings" + ], + [ + "Ġ", + "......" + ], + [ + "Ġagre", + "ement" + ], + [ + "ĠD", + "ar" + ], + [ + "Ġ", + "rest" + ], + [ + "Ġre", + "st" + ], + [ + "Ġr", + "est" + ], + [ + "Ġres", + "t" + ], + [ + "cl", + "ud" + ], + [ + "at", + "or" + ], + [ + "a", + "v" + ], + [ + "Ġor", + "ig" + ], + [ + "Ġorig", + "in" + ], + [ + "Ġ", + "el" + ], + [ + "Ġe", + "l" + ], + [ + "Ġn", + "or" + ], + [ + "Ġno", + "r" + ], + [ + "Ġp", + "res" + ], + [ + "Ġpr", + "es" + ], + [ + "Ġpre", + "s" + ], + [ + "Ġunderst", + "and" + ], + [ + "Ġt", + "aken" + ], + [ + "Ġtake", + "n" + ], + [ + "Ġl", + "ight" + ], + [ + "Ġli", + "ght" + ], + [ + "e", + "ner" + ], + [ + "en", + "er" + ], + [ + "ene", + "r" + ], + [ + "s", + "ome" + ], + [ + "so", + "me" + ], + [ + "Ġbr", + "ought" + ], + [ + "rap", + "h" + ], + [ + "Ġ", + "most" + ], + [ + "Ġm", + "ost" + ], + [ + "Ġmo", + "st" + ], + [ + "o", + "ke" + ], + [ + "ok", + "e" + ], + [ + "-", + "w" + ], + [ + "Ġu", + "nt" + ], + [ + "Ġun", + "t" + ], + [ + "Ġf", + "ather" + ], + [ + "Ġfa", + "ther" + ], + [ + "Ġ", + "used" + ], + [ + "Ġu", + "sed" + ], + [ + "Ġus", + "ed" + ], + [ + "Ġuse", + "d" + ], + [ + "Ġe", + "at" + ], + [ + "Ġy", + "ears" + ], + [ + "ĠW", + "hile" + ], + [ + "Ġ", + "chan" + ], + [ + "Ġc", + "han" + ], + [ + "Ġcha", + "n" + ], + [ + "Ġch", + "an" + ], + [ + "Ġsu", + "dd" + ], + [ + "Ġsudd", + "en" + ], + [ + "Ġapo", + "log" + ], + [ + "Ġset", + "t" + ], + [ + "Ġth", + "in" + ], + [ + "ĠM", + "y" + ], + [ + "Ġt", + "en" + ], + [ + "Ġte", + "n" + ], + [ + "im", + "es" + ], + [ + "ime", + "s" + ], + [ + "f", + "or" + ], + [ + "o", + "ud" + ], + [ + "ou", + "d" + ], + [ + "W", + "hen" + ], + [ + "Ġd", + "et" + ], + [ + "Ġde", + "t" + ], + [ + "Ġl", + "ive" + ], + [ + "Ġli", + "ve" + ], + [ + "Ġo", + "c" + ], + [ + "Ġf", + "ive" + ], + [ + "Ġc", + "ont" + ], + [ + "Ġcon", + "t" + ], + [ + "Ġco", + "nt" + ], + [ + "Ġhel", + "p" + ], + [ + "Ġw", + "a" + ], + [ + "Ġpass", + "ed" + ], + [ + "Ġr", + "un" + ], + [ + "Ġm", + "aking" + ], + [ + "Ġst", + "range" + ], + [ + "Ġt", + "aking" + ], + [ + "Ġe", + "ach" + ], + [ + "\"", + "You" + ], + [ + "\"Y", + "ou" + ], + [ + "Ġan", + "other" + ], + [ + "\"S", + "ay" + ], + [ + "\"", + "The" + ], + [ + "\"T", + "he" + ], + [ + "at", + "es" + ], + [ + "ate", + "s" + ], + [ + "Ġple", + "as" + ], + [ + "asshopp", + "ers" + ], + [ + "Ġm", + "om" + ], + [ + "Ġmo", + "m" + ], + [ + "Ġmo", + "ment" + ], + [ + "Ġmom", + "ent" + ], + [ + "ent", + "le" + ], + [ + "ng", + "lish" + ], + [ + "CH", + "A" + ], + [ + "Ġorigin", + "al" + ], + [ + "i", + "ons" + ], + [ + "ion", + "s" + ], + [ + "ur", + "ing" + ], + [ + "Ġpub", + "lic" + ], + [ + "u", + "ct" + ], + [ + "uc", + "t" + ], + [ + "u", + "ck" + ], + [ + "uc", + "k" + ], + [ + "Ġqu", + "estion" + ], + [ + "a", + "i" + ], + [ + "c", + "y" + ], + [ + "e", + "k" + ], + [ + "Ġflo", + "or" + ], + [ + "Ġc", + "ar" + ], + [ + "ou", + "se" + ], + [ + "ous", + "e" + ], + [ + "Ġ", + "side" + ], + [ + "Ġs", + "ide" + ], + [ + "-", + "ya" + ], + [ + "Ġc", + "ertain" + ], + [ + "h", + "ys" + ], + [ + "hy", + "s" + ], + [ + "-", + "d" + ], + [ + "i", + "gh" + ], + [ + "ig", + "h" + ], + [ + "ag", + "in" + ], + [ + "we", + "et" + ], + [ + "Ġpo", + "or" + ], + [ + "Ġde", + "cid" + ], + [ + "Ġdec", + "id" + ], + [ + "u", + "ally" + ], + [ + "ual", + "ly" + ], + [ + "Ġb", + "usiness" + ], + [ + "p", + "ro" + ], + [ + "pr", + "o" + ], + [ + "pl", + "ain" + ], + [ + "Ġst", + "op" + ], + [ + "!", + "Ċ" + ], + [ + "ĠH", + "ow" + ], + [ + "\"", + "What" + ], + [ + "\"W", + "hat" + ], + [ + "c", + "an" + ], + [ + "ĠU", + "n" + ], + [ + "p", + "s" + ], + [ + "u", + "nd" + ], + [ + "un", + "d" + ], + [ + "-", + "night" + ], + [ + "-n", + "ight" + ], + [ + "Ġmeet", + "ing" + ], + [ + "ed", + "o" + ], + [ + "Ġra", + "ise" + ], + [ + "G", + "utenberg" + ], + [ + "ĠDar", + "ling" + ], + [ + "u", + "me" + ], + [ + "um", + "e" + ], + [ + "ĠE", + "nglish" + ], + [ + "TE", + "R" + ], + [ + "ad", + "ing" + ], + [ + "Ġtrans", + "l" + ], + [ + "Ġ", + "able" + ], + [ + "Ġa", + "ble" + ], + [ + "Ġab", + "le" + ], + [ + "ss", + "ible" + ], + [ + "Ġsat", + "isf" + ], + [ + "Ġwant", + "ed" + ], + [ + "Ġs", + "ub" + ], + [ + "Ġsu", + "b" + ], + [ + "Ġc", + "ase" + ], + [ + "i", + "fic" + ], + [ + "if", + "ic" + ], + [ + "iter", + "ary" + ], + [ + "Ġm", + "aid" + ], + [ + "Ġin", + "c" + ], + [ + "Ġ", + "pos" + ], + [ + "Ġp", + "os" + ], + [ + "Ġpo", + "s" + ], + [ + "Ġ", + "position" + ], + [ + "Ġpos", + "ition" + ], + [ + "Ġp", + "at" + ], + [ + "u", + "red" + ], + [ + "ur", + "ed" + ], + [ + "ure", + "d" + ], + [ + "or", + "ry" + ], + [ + "Ġacc", + "ount" + ], + [ + "Ġb", + "oth" + ], + [ + "Ġbo", + "th" + ], + [ + "Ġfr", + "ie" + ], + [ + "Ġfrie", + "nd" + ], + [ + "t", + "his" + ], + [ + "th", + "is" + ], + [ + "Ġal", + "ways" + ], + [ + "Ġpart", + "icul" + ], + [ + "W", + "hat" + ], + [ + "Ġsm", + "all" + ], + [ + "en", + "ty" + ], + [ + "ent", + "y" + ], + [ + "us", + "hed" + ], + [ + "ush", + "ed" + ], + [ + "Ġm", + "is" + ], + [ + "ul", + "ly" + ], + [ + "ull", + "y" + ], + [ + "Ġre", + "cei" + ], + [ + "Y", + "ou" + ], + [ + "Ġy", + "et" + ], + [ + "Ġg", + "ave" + ], + [ + "B", + "ut" + ], + [ + "h", + "ad" + ], + [ + "ha", + "d" + ], + [ + "Ġans", + "wer" + ], + [ + "Ġab", + "s" + ], + [ + "i", + "le" + ], + [ + "il", + "e" + ], + [ + "ck", + "et" + ], + [ + "Ġn", + "ood" + ], + [ + "Ġno", + "od" + ], + [ + "Ġcour", + "se" + ], + [ + "Ġ", + "form" + ], + [ + "Ġf", + "orm" + ], + [ + "Ġfor", + "m" + ], + [ + "Ġevery", + "thing" + ], + [ + "e", + "ction" + ], + [ + "ect", + "ion" + ], + [ + "I", + "f" + ], + [ + "p", + "art" + ], + [ + "Ġs", + "ing" + ], + [ + "Ġs", + "it" + ], + [ + "Ġp", + "ur" + ], + [ + "i", + "p" + ], + [ + "Ġf", + "ishing" + ], + [ + "Ġfish", + "ing" + ], + [ + "Ġe", + "h" + ], + [ + "Ġp", + "ar" + ], + [ + "Ġto", + "gether" + ], + [ + "H", + "e" + ], + [ + "Ġw", + "he" + ], + [ + "Ġwh", + "e" + ], + [ + "Ġwhe", + "ther" + ], + [ + "Ġb", + "ra" + ], + [ + "Ġbr", + "a" + ], + [ + "\"", + "Yes" + ], + [ + "\"Y", + "es" + ], + [ + "Ġpun", + "ish" + ], + [ + "S", + "hirt" + ], + [ + "ĠY", + "edo" + ], + [ + "Ġfa", + "rew" + ], + [ + "Ġfar", + "ew" + ], + [ + "Ġfarew", + "ell" + ], + [ + "Ġd", + "ance" + ], + [ + "Ġ", + "less" + ], + [ + "Ġl", + "ess" + ], + [ + "Ġle", + "ss" + ], + [ + "ur", + "al" + ], + [ + "ura", + "l" + ], + [ + "Ġde", + "f" + ], + [ + "Ġatt", + "empt" + ], + [ + "we", + "en" + ], + [ + "Ġs", + "ign" + ], + [ + "Ġs", + "y" + ], + [ + "fere", + "nt" + ], + [ + "fer", + "ent" + ], + [ + "Ġle", + "ast" + ], + [ + "s", + "er" + ], + [ + "se", + "r" + ], + [ + "o", + "b" + ], + [ + "nd", + "ing" + ], + [ + "Ġs", + "orry" + ], + [ + "Ġjum", + "ped" + ], + [ + "Ġj", + "an" + ], + [ + "Ġjan", + "itor" + ], + [ + "iz", + "ed" + ], + [ + "ize", + "d" + ], + [ + "Ġt", + "oward" + ], + [ + "Ġto", + "ward" + ], + [ + "Ġm", + "or" + ], + [ + "Ġmo", + "r" + ], + [ + "a", + "ving" + ], + [ + "av", + "ing" + ], + [ + "Ġb", + "it" + ], + [ + "\"", + "This" + ], + [ + "\"T", + "his" + ], + [ + "Ġrem", + "ark" + ], + [ + "Ġremar", + "k" + ], + [ + "Ġf", + "ut" + ], + [ + "Ġw", + "onder" + ], + [ + "Ġf", + "un" + ], + [ + "T", + "hen" + ], + [ + "The", + "n" + ], + [ + "Ġde", + "c" + ], + [ + "Ġwh", + "om" + ], + [ + "Ġwho", + "m" + ], + [ + "Ġdid", + "n" + ], + [ + "Ġre", + "c" + ], + [ + "be", + "c" + ], + [ + "\"", + "If" + ], + [ + "\"I", + "f" + ], + [ + "Ġkn", + "ew" + ], + [ + "a", + "fter" + ], + [ + "Ġth", + "us" + ], + [ + "Ġis", + "n" + ], + [ + "Ġs", + "ight" + ], + [ + "m", + "ed" + ], + [ + "me", + "d" + ], + [ + "[", + "F" + ], + [ + "u", + "ss" + ], + [ + "us", + "s" + ], + [ + "cid", + "ent" + ], + [ + "the", + "m" + ], + [ + "th", + "em" + ], + [ + "Ġf", + "if" + ], + [ + "Ġdra", + "w" + ], + [ + "Ġdr", + "aw" + ], + [ + "Ġh", + "ear" + ], + [ + "Ġhe", + "ar" + ], + [ + "Ġwr", + "iting" + ], + [ + "Ġget", + "ting" + ], + [ + "s", + "h" + ], + [ + "fere", + "nce" + ], + [ + "fer", + "ence" + ], + [ + "Ġra", + "ised" + ], + [ + "Ġraise", + "d" + ], + [ + "the", + "y" + ], + [ + "a", + "x" + ], + [ + "Ġf", + "ine" + ], + [ + "Ġfin", + "e" + ], + [ + "s", + "el" + ], + [ + "se", + "l" + ], + [ + "ĠNo", + "be" + ], + [ + "ĠNobe", + "ok" + ], + [ + "ĠNobeok", + "a" + ], + [ + "orm", + "al" + ], + [ + "Ġe", + "B" + ], + [ + "ic", + "ense" + ], + [ + "0", + "0" + ], + [ + "Ġb", + "est" + ], + [ + "Ġbe", + "st" + ], + [ + "w", + "or" + ], + [ + "wo", + "r" + ], + [ + "f", + "ic" + ], + [ + "ter", + "est" + ], + [ + "te", + "rest" + ], + [ + "Ġrem", + "ar" + ], + [ + "b", + "l" + ], + [ + "ar", + "ted" + ], + [ + "art", + "ed" + ], + [ + "Ġd", + "ark" + ], + [ + "Ġyou", + "ng" + ], + [ + "u", + "sh" + ], + [ + "us", + "h" + ], + [ + "Ġb", + "et" + ], + [ + "Ġbe", + "t" + ], + [ + "ou", + "th" + ], + [ + "out", + "h" + ], + [ + "h", + "ouse" + ], + [ + "a", + "ught" + ], + [ + "au", + "ght" + ], + [ + "Ġp", + "hys" + ], + [ + "Ġstr", + "ong" + ], + [ + "Ġf", + "ur" + ], + [ + "Ġr", + "oll" + ], + [ + "Ġro", + "ll" + ], + [ + "c", + "ove" + ], + [ + "co", + "ve" + ], + [ + "ch", + "ief" + ], + [ + "aw", + "a" + ], + [ + "Ġfollow", + "ed" + ], + [ + "Ġf", + "ond" + ], + [ + "Ġfut", + "ure" + ], + [ + "ir", + "d" + ], + [ + "f", + "ully" + ], + [ + "ful", + "ly" + ], + [ + "Ġeff", + "ort" + ], + [ + "A", + "fter" + ], + [ + "o", + "ward" + ], + [ + "ow", + "ard" + ], + [ + "Ġre", + "ally" + ], + [ + "Ġreal", + "ly" + ], + [ + "Ġam", + "ong" + ], + [ + "Ġar", + "ound" + ], + [ + "Ġcom", + "pl" + ], + [ + "Ġcomp", + "l" + ], + [ + "Ġg", + "az" + ], + [ + "Ġb", + "ow" + ], + [ + "Ġbo", + "w" + ], + [ + "a", + "ter" + ], + [ + "at", + "er" + ], + [ + "ate", + "r" + ], + [ + "Ġins", + "ist" + ], + [ + "Ġturn", + "ed" + ], + [ + "h", + "el" + ], + [ + "he", + "l" + ], + [ + "r", + "em" + ], + [ + "re", + "m" + ], + [ + "Ġhour", + "s" + ], + [ + "Ġdecid", + "ed" + ], + [ + "y", + "s" + ], + [ + "Ġmon", + "th" + ], + [ + "-", + "a" + ], + [ + "Ġad", + "v" + ], + [ + "Ġbelie", + "ve" + ], + [ + "Ġteach", + "ing" + ], + [ + "Ġeas", + "y" + ], + [ + "Ġdire", + "ction" + ], + [ + "oo", + "ked" + ], + [ + "ook", + "ed" + ], + [ + "Ġw", + "ar" + ], + [ + "Ġwa", + "r" + ], + [ + "Ġun", + "less" + ], + [ + "h", + "ave" + ], + [ + "ha", + "ve" + ], + [ + "Ġsqu", + "are" + ], + [ + "v", + "il" + ], + [ + "Ġqu", + "iet" + ], + [ + "Ġh", + "ung" + ], + [ + "Ġg", + "oes" + ], + [ + "Ġgo", + "es" + ], + [ + "Ġp", + "aid" + ], + [ + "Ġsh", + "all" + ], + [ + "Ġsha", + "ll" + ], + [ + "\"", + "No" + ], + [ + "Ġpunish", + "ment" + ], + [ + "p", + "ose" + ], + [ + "po", + "se" + ], + [ + "pos", + "e" + ], + [ + "Ġs", + "weet" + ], + [ + "'", + "ve" + ], + [ + "\"W", + "ell" + ], + [ + "Ġg", + "entle" + ], + [ + "Ġn", + "ormal" + ], + [ + "ag", + "raph" + ], + [ + "ch", + "ive" + ], + [ + "c", + "han" + ], + [ + "ch", + "an" + ], + [ + "cha", + "n" + ], + [ + "Ġin", + "clud" + ], + [ + "w", + "w" + ], + [ + "or", + "g" + ], + [ + "t", + "em" + ], + [ + "te", + "m" + ], + [ + "A", + "R" + ], + [ + "ĠT", + "H" + ], + [ + "Ġe", + "qu" + ], + [ + "Ġt", + "one" + ], + [ + "Ġto", + "ne" + ], + [ + "Ġpo", + "ssible" + ], + [ + "Ġbe", + "com" + ], + [ + "Ġbec", + "om" + ], + [ + "ĠJ", + "apanese" + ], + [ + "v", + "ers" + ], + [ + "ver", + "s" + ], + [ + "Ġfollow", + "ing" + ], + [ + "Ġp", + "ain" + ], + [ + "Ġwho", + "le" + ], + [ + "w", + "r" + ], + [ + "Ġser", + "ious" + ], + [ + "Ġn", + "ar" + ], + [ + "Ġt", + "ired" + ], + [ + "I", + "n" + ], + [ + "Ġpl", + "ay" + ], + [ + "Ġp", + "rom" + ], + [ + "Ġpro", + "m" + ], + [ + "Ġpr", + "om" + ], + [ + "Ġg", + "ame" + ], + [ + "ĠS", + "ome" + ], + [ + "ĠSo", + "me" + ], + [ + "Ġhapp", + "ened" + ], + [ + "Ġc", + "ut" + ], + [ + "Ġtw", + "enty" + ], + [ + "Ġdo", + "or" + ], + [ + "Ġmor", + "ning" + ], + [ + "h", + "ind" + ], + [ + "hi", + "nd" + ], + [ + "Ġb", + "re" + ], + [ + "Ġbr", + "e" + ], + [ + "Ġin", + "side" + ], + [ + "Ġins", + "ide" + ], + [ + "o", + "ve" + ], + [ + "al", + "th" + ], + [ + "u", + "k" + ], + [ + "a", + "rge" + ], + [ + "ar", + "ge" + ], + [ + "am", + "b" + ], + [ + "Ġd", + "am" + ], + [ + "Ġw", + "orry" + ], + [ + "Ġwor", + "ry" + ], + [ + "at", + "ive" + ], + [ + "Ġexp", + "ected" + ], + [ + "Ġf", + "am" + ], + [ + "Ġfa", + "m" + ], + [ + "Ġp", + "ra" + ], + [ + "Ġpr", + "a" + ], + [ + "Ġpo", + "cket" + ], + [ + "oo", + "ks" + ], + [ + "ook", + "s" + ], + [ + "c", + "hed" + ], + [ + "ch", + "ed" + ], + [ + "Ġs", + "il" + ], + [ + "o", + "l" + ], + [ + "Ġf", + "av" + ], + [ + "Ġfa", + "v" + ], + [ + "Ġel", + "se" + ], + [ + "Ġh", + "igh" + ], + [ + "Ġre", + "al" + ], + [ + "Ġal", + "ong" + ], + [ + "Ġ", + "med" + ], + [ + "Ġm", + "ed" + ], + [ + "Ġme", + "d" + ], + [ + "hi", + "k" + ], + [ + "he", + "mat" + ], + [ + "hemat", + "ics" + ], + [ + "Ġl", + "ist" + ], + [ + "Ġli", + "st" + ], + [ + "Ġs", + "ick" + ], + [ + "o", + "int" + ], + [ + "[F", + "oot" + ], + [ + "[Foot", + "not" + ], + [ + "[Footnot", + "e" + ], + [ + ".", + "]Ċ" + ], + [ + "n", + "ight" + ], + [ + "s", + "es" + ], + [ + "se", + "s" + ], + [ + "i", + "or" + ], + [ + "Ġsa", + "ys" + ], + [ + "Ġsay", + "s" + ], + [ + "Ġm", + "outh" + ], + [ + "h", + "ow" + ], + [ + "ho", + "w" + ], + [ + "m", + "ing" + ], + [ + "Ġc", + "lo" + ], + [ + "Ġcl", + "o" + ], + [ + "Ġc", + "ur" + ], + [ + "g", + "ing" + ], + [ + "Ġsudden", + "ly" + ], + [ + "-a", + "h" + ], + [ + "a", + "mp" + ], + [ + "am", + "p" + ], + [ + "Ġbl", + "ack" + ], + [ + "ro", + "ss" + ], + [ + "Ġf", + "ac" + ], + [ + "Ġfa", + "c" + ], + [ + "sel", + "ves" + ], + [ + "i", + "ew" + ], + [ + "ie", + "w" + ], + [ + "iss", + "ion" + ], + [ + "Ġcopy", + "right" + ], + [ + "Ġpar", + "agraph" + ], + [ + "ĠAr", + "chive" + ], + [ + "Ġdon", + "ations" + ], + [ + "Pro", + "ject" + ], + [ + "Ġc", + "ost" + ], + [ + "Ġco", + "st" + ], + [ + ".", + "org" + ], + [ + "L", + "I" + ], + [ + "u", + "ced" + ], + [ + "uc", + "ed" + ], + [ + "Ġs", + "uc" + ], + [ + "Ġsu", + "c" + ], + [ + "y", + "le" + ], + [ + "Ġfor", + "ce" + ], + [ + "j", + "oy" + ], + [ + "o", + "uch" + ], + [ + "ou", + "ch" + ], + [ + "t", + "r" + ], + [ + "I", + "t" + ], + [ + "Ġtr", + "ad" + ], + [ + "Ġtra", + "d" + ], + [ + "Ġpres", + "ent" + ], + [ + "Ġe", + "xt" + ], + [ + "Ġex", + "t" + ], + [ + "a", + "sed" + ], + [ + "as", + "ed" + ], + [ + "ase", + "d" + ], + [ + "red", + "it" + ], + [ + "Ġfa", + "ult" + ], + [ + "i", + "b" + ], + [ + "-", + "m" + ], + [ + "ur", + "d" + ], + [ + "Ġt", + "ried" + ], + [ + "Ġtr", + "ied" + ], + [ + "t", + "ime" + ], + [ + "Ġp", + "ret" + ], + [ + "Ġpr", + "et" + ], + [ + "Ġpre", + "t" + ], + [ + "Ġsp", + "ee" + ], + [ + "Ġspe", + "e" + ], + [ + "o", + "wer" + ], + [ + "ow", + "er" + ], + [ + "Ġword", + "s" + ], + [ + "CHA", + "P" + ], + [ + "CHAP", + "TER" + ], + [ + "s", + "chool" + ], + [ + "Ġas", + "k" + ], + [ + "Ġdo", + "ing" + ], + [ + "ate", + "ly" + ], + [ + "Ġunt", + "il" + ], + [ + "b", + "out" + ], + [ + "bo", + "ut" + ], + [ + "Ġt", + "ree" + ], + [ + "Ġtr", + "ee" + ], + [ + "Ġtre", + "e" + ], + [ + "c", + "all" + ], + [ + "am", + "ash" + ], + [ + "amash", + "ir" + ], + [ + "amashir", + "o" + ], + [ + "s", + "te" + ], + [ + "st", + "e" + ], + [ + "Ġbe", + "hind" + ], + [ + "o", + "ld" + ], + [ + "ol", + "d" + ], + [ + "Ġw", + "all" + ], + [ + "Ġwal", + "l" + ], + [ + "Ġwa", + "ll" + ], + [ + "it", + "ory" + ], + [ + "itor", + "y" + ], + [ + "Ġroll", + "ed" + ], + [ + "Ġm", + "ove" + ], + [ + "Ġmo", + "ve" + ], + [ + "Ġapolog", + "ize" + ], + [ + "Ġl", + "arge" + ], + [ + "amb", + "oo" + ], + [ + "s", + "u" + ], + [ + "Ġsett", + "led" + ], + [ + "\"", + "He" + ], + [ + "\"H", + "e" + ], + [ + "w", + "o" + ], + [ + "Ġthink", + "ing" + ], + [ + "Ġthin", + "king" + ], + [ + "u", + "sed" + ], + [ + "us", + "ed" + ], + [ + "if", + "ied" + ], + [ + "Ġal", + "most" + ], + [ + "Ġt", + "re" + ], + [ + "Ġtr", + "e" + ], + [ + "Ġt", + "reat" + ], + [ + "Ġtre", + "at" + ], + [ + "Ġnood", + "le" + ], + [ + "Ġnot", + "e" + ], + [ + "Ġno", + "te" + ], + [ + "ĠA", + "ll" + ], + [ + "Ġbe", + "at" + ], + [ + "Ġob", + "ject" + ], + [ + "Ġsee", + "ms" + ], + [ + "Ġseem", + "s" + ], + [ + "Ġ", + "ide" + ], + [ + "Y", + "es" + ], + [ + "ow", + "s" + ], + [ + "Ġrem", + "ain" + ], + [ + "Ġbeg", + "in" + ], + [ + "u", + "ght" + ], + [ + "m", + "ents" + ], + [ + "ment", + "s" + ], + [ + "Ġal", + "one" + ], + [ + "sp", + "ect" + ], + [ + "Ġmat", + "hematics" + ], + [ + "Ġ", + "rough" + ], + [ + "Ġr", + "ough" + ], + [ + "Ġout", + "side" + ], + [ + "Ġcom", + "es" + ], + [ + "Ġcome", + "s" + ], + [ + "b", + "ack" + ], + [ + "Ġw", + "ind" + ], + [ + "s", + "ed" + ], + [ + "se", + "d" + ], + [ + "Ġwould", + "n" + ], + [ + "e", + "er" + ], + [ + "ee", + "r" + ], + [ + "in", + "ut" + ], + [ + "f", + "rom" + ], + [ + "Ġre", + "pl" + ], + [ + "Ġrep", + "l" + ], + [ + "Ġnar", + "row" + ], + [ + "Ġin", + "cident" + ], + [ + "Ġ", + "air" + ], + [ + "Ġa", + "ir" + ], + [ + "Ġse", + "a" + ], + [ + "t", + "s" + ], + [ + "Ġsurpr", + "ised" + ], + [ + "Ġte", + "a" + ], + [ + "R", + "ed" + ], + [ + "Ġtal", + "king" + ], + [ + "Ġtalk", + "ing" + ], + [ + "Ġbo", + "ss" + ], + [ + "q", + "ue" + ], + [ + "qu", + "e" + ], + [ + "Ġp", + "ict" + ], + [ + "ir", + "ty" + ], + [ + "irt", + "y" + ], + [ + "Ġ", + "ce" + ], + [ + "Ġc", + "e" + ], + [ + "Ġl", + "im" + ], + [ + "Ġli", + "m" + ], + [ + "ĠW", + "hy" + ], + [ + "Ġp", + "oint" + ], + [ + "Ġpo", + "int" + ], + [ + "Ġl", + "aw" + ], + [ + "ci", + "ated" + ], + [ + "Ġmo", + "on" + ], + [ + "ir", + "cu" + ], + [ + "g", + "ot" + ], + [ + "go", + "t" + ], + [ + "ĠI", + "s" + ], + [ + "Ġhand", + "s" + ], + [ + "Ġhon", + "or" + ], + [ + "a", + "ut" + ], + [ + "au", + "t" + ], + [ + "r", + "ge" + ], + [ + "Ġst", + "ate" + ], + [ + "ĠL", + "iterary" + ], + [ + ".", + "F" + ], + [ + "T", + "his" + ], + [ + "l", + "ine" + ], + [ + "li", + "ne" + ], + [ + ".", + "g" + ], + [ + ".g", + "utenberg" + ], + [ + "ĠO", + "F" + ], + [ + "E", + "N" + ], + [ + "ract", + "er" + ], + [ + "Ġb", + "ene" + ], + [ + "Ġbe", + "ne" + ], + [ + "ĠE", + "ven" + ], + [ + "o", + "ub" + ], + [ + "ou", + "b" + ], + [ + "Ġm", + "akes" + ], + [ + "Ġmake", + "s" + ], + [ + "Ġin", + "terest" + ], + [ + "o", + "pe" + ], + [ + "op", + "e" + ], + [ + "m", + "s" + ], + [ + "Ġresp", + "ons" + ], + [ + "Ġ", + "fore" + ], + [ + "Ġf", + "ore" + ], + [ + "Ġfor", + "e" + ], + [ + "Ġsome", + "what" + ], + [ + "Ġhon", + "est" + ], + [ + "o", + "ck" + ], + [ + "ir", + "it" + ], + [ + "Ġhe", + "ld" + ], + [ + "Ġhel", + "d" + ], + [ + "Ġadd", + "ed" + ], + [ + "f", + "u" + ], + [ + "ad", + "ed" + ], + [ + "ade", + "d" + ], + [ + "al", + "s" + ], + [ + "at", + "t" + ], + [ + "ter", + "n" + ], + [ + "Ġperson", + "al" + ], + [ + "Ġ", + "ass" + ], + [ + "Ġa", + "ss" + ], + [ + "Ġas", + "s" + ], + [ + "ĠW", + "ith" + ], + [ + "t", + "ic" + ], + [ + "T", + "okyo" + ], + [ + "Ġsh", + "out" + ], + [ + "Ġpret", + "ty" + ], + [ + "um", + "b" + ], + [ + "Ġe", + "arly" + ], + [ + "o", + "pped" + ], + [ + "op", + "ped" + ], + [ + "Ġfur", + "ther" + ], + [ + "Ġf", + "re" + ], + [ + "Ġfr", + "e" + ], + [ + "es", + "ides" + ], + [ + "Ġb", + "amboo" + ], + [ + "Ġ", + "ir" + ], + [ + "m", + "ore" + ], + [ + "Ġli", + "ving" + ], + [ + "Ġrecei", + "ved" + ], + [ + "Ġli", + "ved" + ], + [ + "Ġlive", + "d" + ], + [ + "Ġme", + "ant" + ], + [ + "Ġmean", + "t" + ], + [ + "Ġc", + "oward" + ], + [ + "Ġco", + "ward" + ], + [ + "pos", + "ition" + ], + [ + "Ġlo", + "c" + ], + [ + "i", + "led" + ], + [ + "il", + "ed" + ], + [ + "ile", + "d" + ], + [ + "Ġte", + "nder" + ], + [ + "Ġ", + "ch" + ], + [ + "Ġc", + "h" + ], + [ + "Ġ", + "After" + ], + [ + "ĠA", + "fter" + ], + [ + "c", + "er" + ], + [ + "ce", + "r" + ], + [ + "Ġfav", + "or" + ], + [ + "w", + "ho" + ], + [ + "Ġli", + "ked" + ], + [ + "Ġlike", + "d" + ], + [ + "r", + "ance" + ], + [ + "ra", + "nce" + ], + [ + "Ġp", + "ri" + ], + [ + "Ġpr", + "i" + ], + [ + "k", + "isha" + ], + [ + "Ġstud", + "y" + ], + [ + "Ġord", + "er" + ], + [ + "Ġafter", + "ward" + ], + [ + "Ġgreat", + "ly" + ], + [ + "Ġun", + "able" + ], + [ + "g", + "o" + ], + [ + "Ġwa", + "it" + ], + [ + "ep", + "ing" + ], + [ + "id", + "ing" + ], + [ + "Ġfor", + "ty" + ], + [ + "Ġsk", + "y" + ], + [ + "Ġoff", + "ice" + ], + [ + "w", + "ill" + ], + [ + "\"", + "D" + ], + [ + "w", + "el" + ], + [ + "we", + "l" + ], + [ + "Ġst", + "ation" + ], + [ + "b", + "o" + ], + [ + "h", + "ot" + ], + [ + "ho", + "t" + ], + [ + "s", + "uch" + ], + [ + "su", + "ch" + ], + [ + "Ġl", + "oud" + ], + [ + "Ġlo", + "ud" + ], + [ + "Ġ", + "aw" + ], + [ + "Ġa", + "w" + ], + [ + "l", + "and" + ], + [ + "?", + "Ċ" + ], + [ + "Ġre", + "spect" + ], + [ + "Ġresp", + "ect" + ], + [ + "an", + "ces" + ], + [ + "ance", + "s" + ], + [ + "i", + "ent" + ], + [ + "ie", + "nt" + ], + [ + "Ġ", + "ought" + ], + [ + "Ġo", + "ught" + ] + ] + } +} diff --git a/tests/assets/tokenizer_config.json b/tests/assets/tokenizer_config.json new file mode 100644 index 0000000000..4db797a6af --- /dev/null +++ b/tests/assets/tokenizer_config.json @@ -0,0 +1 @@ +{"bos_token": {"__type": "AddedToken", "content": "<|begin_of_sentence|>", "lstrip": false, "normalized": true, "rstrip": false, "single_word": false}, "eos_token": {"__type": "AddedToken", "content": "<|end_of_sentence|>", "lstrip": false, "normalized": true, "rstrip": false, "single_word": false}} diff --git a/tests/torchtune/modules/transforms/tokenizers/test_hf_tokenizer.py b/tests/torchtune/modules/transforms/tokenizers/test_hf_tokenizer.py index ecdc11a9c0..0d0da63051 100644 --- a/tests/torchtune/modules/transforms/tokenizers/test_hf_tokenizer.py +++ b/tests/torchtune/modules/transforms/tokenizers/test_hf_tokenizer.py @@ -1,20 +1,126 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + import pytest -from torchtune.modules.transforms.tokenizers import HFTokenizer +from tests.common import ASSETS +from tokenizers import Tokenizer +from torchtune.models.llama3._tokenizer import CL100K_PATTERN +from torchtune.modules.transforms.tokenizers import HFTokenizer, TikTokenBaseTokenizer + -# TODO: change this (just for testing) -TOKENIZER_DIR = "/data/users/ebs/phi4/" +TOKENIZER_CONFIG_PATH = ASSETS / "tokenizer_config.json" +GENERATION_CONFIG_PATH = ASSETS / "generation_config.json" class TestHFTokenizer: @pytest.fixture - def tokenizer(self): - return HFTokenizer( - path=TOKENIZER_DIR + "tokenizer.json", - config_path=TOKENIZER_DIR + "tokenizer_config.json", + def tt_tokenizer(self): + # Pretrained tiktoken model generated via the script in + # https://gist.github.com/ebsmothers/54b133dd87db6679b14318545aaa2de4 + return TikTokenBaseTokenizer( + path=str(ASSETS / "tiktoken_small.model"), + name="test_tiktoken", + pattern=CL100K_PATTERN, + bos_id=0, + eos_id=-1, + special_tokens={ + "<|test_token_0|>": 2000, + "<|test_token_1|>": 2001, + }, ) - def test_tokenizer(self, tokenizer): - import pdb + @pytest.fixture + def texts(self): + return [ + "I can see the sun. But even if I cannot see the sun, I know that it exists.", + "And to know that the sun is there - that is living.", + ] + + @pytest.fixture + def token_ids(self): + return [ + 73, + 503, + 654, + 262, + 376, + 110, + 46, + 690, + 720, + 428, + 270, + 1119, + 654, + 262, + 376, + 110, + 44, + 270, + 686, + 334, + 312, + 522, + 511, + 115, + 46, + ] + + def test_invalid_hf_tokenizer(self): + with pytest.raises(ValueError, match="Could not infer"): + _ = HFTokenizer( + path=str(ASSETS / "tokenizer.json"), + ) + + @pytest.mark.parametrize( + "config_path, generation_config_path", + [ + (TOKENIZER_CONFIG_PATH, GENERATION_CONFIG_PATH), + (TOKENIZER_CONFIG_PATH, None), + (None, GENERATION_CONFIG_PATH), + ], + ) + def test_tokenizer_encode_and_decode_parity( + self, + tt_tokenizer, + texts, + token_ids, + config_path, + generation_config_path, + mocker, + ): + + # Patch tokenizer's token_to_id method for BOS and EOS + # since they are not present in the original tokenizer model + def patch_token_to_id_for_dummy_tokenizer(*args, **kwargs): + if args[0] == "<|begin_of_sentence|>": + return 0 + elif args[0] == "<|end_of_sentence|>": + return -1 + else: + raise RuntimeError("Unexpected input") + + mocker.patch.object( + Tokenizer, "token_to_id", side_effect=patch_token_to_id_for_dummy_tokenizer + ) + # Tokenizer artifacts for this test were created from tiktoken_small.model + # using the script in https://gist.github.com/ebsmothers/55b2f177f5ed15a3b81508f8f8b91159 + hf_tokenizer = HFTokenizer( + path=str(ASSETS / "tokenizer.json"), + config_path=config_path, + generation_config_path=generation_config_path, + ) + + tt_tokens = tt_tokenizer.encode(texts[0], add_bos=True, add_eos=True) + hf_tokens = hf_tokenizer.encode(texts[0], add_bos=True, add_eos=True) + + assert tt_tokens == hf_tokens + assert hf_tokens == [0] + token_ids + [-1] - pdb.set_trace() - raise ValueError("done") + tt_text = tt_tokenizer.decode(token_ids) + hf_text = hf_tokenizer.decode(token_ids) + assert tt_text == hf_text + assert hf_text == texts[0] diff --git a/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py b/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py index 5ed9c533cf..b19879408c 100644 --- a/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py +++ b/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py @@ -1,5 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + import json -from typing import List +from typing import Any, Dict, List, Optional from tokenizers import Tokenizer from torchtune.modules.transforms.tokenizers._utils import BaseTokenizer @@ -7,43 +13,107 @@ class HFTokenizer(BaseTokenizer): """ - A wrapper around HuggingFace tokenizers. BLAH BLAH BLAH + A wrapper around Hugging Face tokenizers. This class can be used to load from a + Hugging Face tokenizer.json file into a torchtune BaseTokenizer. Args: path (str): Path to tokenizer.json file - config_path (str): Path to tokenizer_config.json file + config_path (Optional[str]): Path to tokenizer_config.json file. Default: None + generation_config_path (Optional[str]): Path to generation_config.json file. + Default: None """ - def __init__(self, path: str, config_path: str): + def __init__( + self, + path: str, + config_path: Optional[str] = None, + generation_config_path: Optional[str] = None, + ): self.hf_tokenizer = Tokenizer.from_file(path) - with open(config_path, "rb") as f: - config = json.load(f) + if config_path: + with open(config_path, "rb") as f: + self.config = json.load(f) + else: + self.config = None + if generation_config_path: + with open(generation_config_path, "rb") as f: + self.generation_config = json.load(f) + else: + self.generation_config = None + self._infer_bos_eos_tokens() + + def _get_token_from_config(self, config: Dict[str, Any], key: str) -> str: + """ + HF BOS/EOS tokens are either stored as e.g. {'bos_token': 5} + or {'bos_token': {'content': 5, ...}}. This utility handles both. + """ + token = config.get(key) + if isinstance(token, Dict): + if "content" not in token: + raise ValueError(f"Could not parse {key} from config") + token = token["content"] + else: + if not isinstance(token, str): + raise ValueError(f"Could not parse {key} from config") + return token + + def _infer_bos_eos_tokens(self): + """ + Infer BOS and EOS token IDs from config and/or generation_config. - def _infer_tokenizer_class_from_config(self): - pass + Will first try to infer ID directly from generation_config. + If that's not available, will infer token from config then map to ID. + Otherwise, raise a ValueError. + """ + self.bos_id = None + self.eos_id = None + + if self.generation_config: + self.bos_id = self.generation_config.get("bos_token_id") + self.eos_id = self.generation_config.get("eos_token_id") + + if self.config: + bos_token = self._get_token_from_config(self.config, "bos_token") + eos_token = self._get_token_from_config(self.config, "eos_token") + if bos_token is not None and self.bos_id is None: + self.bos_id = self.hf_tokenizer.token_to_id(bos_token) + if eos_token is not None and self.eos_id is None: + self.eos_id = self.eos_id or self.hf_tokenizer.token_to_id(eos_token) + + if self.bos_id is None or self.eos_id is None: + raise ValueError("Could not infer BOS and EOS token IDs from config") def encode( - self, text: str, add_bos: bool = False, add_eos: bool = False + self, text: str, add_bos: bool = True, add_eos: bool = True ) -> List[int]: """ Encodes a string into a list of token ids. Args: text (str): The text to encode. - add_bos (bool): Whether to add a beginning-of-sequence token to the beginning of the + add_bos (bool): Whether to add the tokenizer's bos_id to the encoded string. + Default True. + add_eos (bool): Whether to add the tokenizer's eos_id to the encoded string. + Default True. + + Returns: + List[int]: The list of token ids. """ token_ids = self.hf_tokenizer.encode(text).ids if add_bos: - token_ids.insert(0, self.hf_tokenizer.bos_id) + token_ids.insert(0, self.bos_id) if add_eos: - token_ids.append(self.hf_tokenizer.eos_id) + token_ids.append(self.eos_id) return token_ids def decode(self, token_ids: List[int]) -> str: """ - Decodes a list of token ids into a string. + Decode a list of token ids into a string. Args: - token_ids (List[int]): The list of token ids to decode. + token_ids (List[int]): The list of token ids. + + Returns: + str: The decoded string. """ return self.hf_tokenizer.decode(token_ids) From 5f5fa8bc5241bc31cb881046969790f46062fc12 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Mon, 10 Feb 2025 13:59:41 -0800 Subject: [PATCH 3/9] prefer config over generation_config, address comments --- .../tokenizers/test_hf_tokenizer.py | 6 ++-- .../transforms/tokenizers/_hf_tokenizer.py | 29 ++++++++++++------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/tests/torchtune/modules/transforms/tokenizers/test_hf_tokenizer.py b/tests/torchtune/modules/transforms/tokenizers/test_hf_tokenizer.py index 0d0da63051..b90757989a 100644 --- a/tests/torchtune/modules/transforms/tokenizers/test_hf_tokenizer.py +++ b/tests/torchtune/modules/transforms/tokenizers/test_hf_tokenizer.py @@ -70,9 +70,9 @@ def token_ids(self): ] def test_invalid_hf_tokenizer(self): - with pytest.raises(ValueError, match="Could not infer"): + with pytest.raises(ValueError, match="At least one of"): _ = HFTokenizer( - path=str(ASSETS / "tokenizer.json"), + tokenizer_json_path=str(ASSETS / "tokenizer.json"), ) @pytest.mark.parametrize( @@ -109,7 +109,7 @@ def patch_token_to_id_for_dummy_tokenizer(*args, **kwargs): # Tokenizer artifacts for this test were created from tiktoken_small.model # using the script in https://gist.github.com/ebsmothers/55b2f177f5ed15a3b81508f8f8b91159 hf_tokenizer = HFTokenizer( - path=str(ASSETS / "tokenizer.json"), + tokenizer_json_path=str(ASSETS / "tokenizer.json"), config_path=config_path, generation_config_path=generation_config_path, ) diff --git a/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py b/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py index b19879408c..60ca92dfee 100644 --- a/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py +++ b/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py @@ -17,19 +17,26 @@ class HFTokenizer(BaseTokenizer): Hugging Face tokenizer.json file into a torchtune BaseTokenizer. Args: - path (str): Path to tokenizer.json file + tokenizer_json_path (str): Path to tokenizer.json file config_path (Optional[str]): Path to tokenizer_config.json file. Default: None generation_config_path (Optional[str]): Path to generation_config.json file. Default: None + + Raises: + ValueError: If neither config_path or generation_config_path are specified. """ def __init__( self, - path: str, + tokenizer_json_path: str, config_path: Optional[str] = None, generation_config_path: Optional[str] = None, ): - self.hf_tokenizer = Tokenizer.from_file(path) + self.hf_tokenizer = Tokenizer.from_file(tokenizer_json_path) + if not (config_path or generation_config_path): + raise ValueError( + "At least one of config_path or generation_config_path must be specified." + ) if config_path: with open(config_path, "rb") as f: self.config = json.load(f) @@ -68,17 +75,19 @@ def _infer_bos_eos_tokens(self): self.bos_id = None self.eos_id = None - if self.generation_config: - self.bos_id = self.generation_config.get("bos_token_id") - self.eos_id = self.generation_config.get("eos_token_id") - if self.config: bos_token = self._get_token_from_config(self.config, "bos_token") eos_token = self._get_token_from_config(self.config, "eos_token") - if bos_token is not None and self.bos_id is None: + if bos_token is not None: self.bos_id = self.hf_tokenizer.token_to_id(bos_token) - if eos_token is not None and self.eos_id is None: - self.eos_id = self.eos_id or self.hf_tokenizer.token_to_id(eos_token) + if eos_token is not None: + self.eos_id = self.hf_tokenizer.token_to_id(eos_token) + + if self.generation_config: + if self.bos_id is None: + self.bos_id = self.generation_config.get("bos_token_id") + if self.eos_id is None: + self.eos_id = self.generation_config.get("eos_token_id") if self.bos_id is None or self.eos_id is None: raise ValueError("Could not infer BOS and EOS token IDs from config") From 0bb68a3605b90ab5c7cd2e04db6f9a4b86975451 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Mon, 10 Feb 2025 17:07:35 -0800 Subject: [PATCH 4/9] docstring updates --- torchtune/modules/transforms/tokenizers/_hf_tokenizer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py b/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py index 60ca92dfee..7e600b1908 100644 --- a/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py +++ b/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py @@ -16,6 +16,10 @@ class HFTokenizer(BaseTokenizer): A wrapper around Hugging Face tokenizers. This class can be used to load from a Hugging Face tokenizer.json file into a torchtune BaseTokenizer. + This class will load the tokenizer.json file from tokenizer_json_path. It will + attempt to infer BOS and EOS token IDs from config.json if possible, and if not + will fallback to inferring them from generation_config.json. + Args: tokenizer_json_path (str): Path to tokenizer.json file config_path (Optional[str]): Path to tokenizer_config.json file. Default: None @@ -68,8 +72,8 @@ def _infer_bos_eos_tokens(self): """ Infer BOS and EOS token IDs from config and/or generation_config. - Will first try to infer ID directly from generation_config. - If that's not available, will infer token from config then map to ID. + Will first try to infer token from config then map to ID. + If that's not available, will infer ID directly from generation_config. Otherwise, raise a ValueError. """ self.bos_id = None From 0a00c801d14ab69334c43a37095528af9808325d Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Mon, 10 Feb 2025 18:21:04 -0800 Subject: [PATCH 5/9] more comments --- .../tokenizers/test_hf_tokenizer.py | 13 +++++---- .../modules/transforms/tokenizers/__init__.py | 4 +-- .../transforms/tokenizers/_hf_tokenizer.py | 27 ++++++++++--------- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/tests/torchtune/modules/transforms/tokenizers/test_hf_tokenizer.py b/tests/torchtune/modules/transforms/tokenizers/test_hf_tokenizer.py index b90757989a..3dd0c68822 100644 --- a/tests/torchtune/modules/transforms/tokenizers/test_hf_tokenizer.py +++ b/tests/torchtune/modules/transforms/tokenizers/test_hf_tokenizer.py @@ -8,14 +8,17 @@ from tests.common import ASSETS from tokenizers import Tokenizer from torchtune.models.llama3._tokenizer import CL100K_PATTERN -from torchtune.modules.transforms.tokenizers import HFTokenizer, TikTokenBaseTokenizer +from torchtune.modules.transforms.tokenizers import ( + HuggingFaceTokenizer, + TikTokenBaseTokenizer, +) TOKENIZER_CONFIG_PATH = ASSETS / "tokenizer_config.json" GENERATION_CONFIG_PATH = ASSETS / "generation_config.json" -class TestHFTokenizer: +class TestHuggingFaceTokenizer: @pytest.fixture def tt_tokenizer(self): # Pretrained tiktoken model generated via the script in @@ -71,7 +74,7 @@ def token_ids(self): def test_invalid_hf_tokenizer(self): with pytest.raises(ValueError, match="At least one of"): - _ = HFTokenizer( + _ = HuggingFaceTokenizer( tokenizer_json_path=str(ASSETS / "tokenizer.json"), ) @@ -108,9 +111,9 @@ def patch_token_to_id_for_dummy_tokenizer(*args, **kwargs): ) # Tokenizer artifacts for this test were created from tiktoken_small.model # using the script in https://gist.github.com/ebsmothers/55b2f177f5ed15a3b81508f8f8b91159 - hf_tokenizer = HFTokenizer( + hf_tokenizer = HuggingFaceTokenizer( tokenizer_json_path=str(ASSETS / "tokenizer.json"), - config_path=config_path, + tokenizer_config_json_path=config_path, generation_config_path=generation_config_path, ) diff --git a/torchtune/modules/transforms/tokenizers/__init__.py b/torchtune/modules/transforms/tokenizers/__init__.py index dab7565ba5..4b61c95a70 100644 --- a/torchtune/modules/transforms/tokenizers/__init__.py +++ b/torchtune/modules/transforms/tokenizers/__init__.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from ._hf_tokenizer import HFTokenizer +from ._hf_tokenizer import HuggingFaceTokenizer from ._sentencepiece import SentencePieceBaseTokenizer from ._tiktoken import TikTokenBaseTokenizer from ._utils import ( @@ -21,5 +21,5 @@ "BaseTokenizer", "tokenize_messages_no_special_tokens", "parse_hf_tokenizer_json", - "HFTokenizer", + "HuggingFaceTokenizer", ] diff --git a/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py b/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py index 7e600b1908..b4492f5ff5 100644 --- a/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py +++ b/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py @@ -11,7 +11,7 @@ from torchtune.modules.transforms.tokenizers._utils import BaseTokenizer -class HFTokenizer(BaseTokenizer): +class HuggingFaceTokenizer(BaseTokenizer): """ A wrapper around Hugging Face tokenizers. This class can be used to load from a Hugging Face tokenizer.json file into a torchtune BaseTokenizer. @@ -22,27 +22,28 @@ class HFTokenizer(BaseTokenizer): Args: tokenizer_json_path (str): Path to tokenizer.json file - config_path (Optional[str]): Path to tokenizer_config.json file. Default: None + tokenizer_config_json_path (Optional[str]): Path to tokenizer_config.json file. Default: None generation_config_path (Optional[str]): Path to generation_config.json file. Default: None Raises: - ValueError: If neither config_path or generation_config_path are specified. + ValueError: If neither tokenizer_config_json_path or generation_config_path are specified. """ def __init__( self, + *, tokenizer_json_path: str, - config_path: Optional[str] = None, + tokenizer_config_json_path: Optional[str] = None, generation_config_path: Optional[str] = None, ): - self.hf_tokenizer = Tokenizer.from_file(tokenizer_json_path) - if not (config_path or generation_config_path): + self.tokenizer = Tokenizer.from_file(tokenizer_json_path) + if not (tokenizer_config_json_path or generation_config_path): raise ValueError( - "At least one of config_path or generation_config_path must be specified." + "At least one of tokenizer_config_json_path or generation_config_path must be specified." ) - if config_path: - with open(config_path, "rb") as f: + if tokenizer_config_json_path: + with open(tokenizer_config_json_path, "rb") as f: self.config = json.load(f) else: self.config = None @@ -83,9 +84,9 @@ def _infer_bos_eos_tokens(self): bos_token = self._get_token_from_config(self.config, "bos_token") eos_token = self._get_token_from_config(self.config, "eos_token") if bos_token is not None: - self.bos_id = self.hf_tokenizer.token_to_id(bos_token) + self.bos_id = self.tokenizer.token_to_id(bos_token) if eos_token is not None: - self.eos_id = self.hf_tokenizer.token_to_id(eos_token) + self.eos_id = self.tokenizer.token_to_id(eos_token) if self.generation_config: if self.bos_id is None: @@ -112,7 +113,7 @@ def encode( Returns: List[int]: The list of token ids. """ - token_ids = self.hf_tokenizer.encode(text).ids + token_ids = self.tokenizer.encode(text).ids if add_bos: token_ids.insert(0, self.bos_id) if add_eos: @@ -129,4 +130,4 @@ def decode(self, token_ids: List[int]) -> str: Returns: str: The decoded string. """ - return self.hf_tokenizer.decode(token_ids) + return self.tokenizer.decode(token_ids) From 7e70de72f72aeb0dc0d8d528171e4144ff2c0ac4 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Mon, 10 Feb 2025 21:32:22 -0800 Subject: [PATCH 6/9] add link --- torchtune/modules/transforms/tokenizers/_hf_tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py b/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py index b4492f5ff5..5ee7f3f0fe 100644 --- a/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py +++ b/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py @@ -13,8 +13,8 @@ class HuggingFaceTokenizer(BaseTokenizer): """ - A wrapper around Hugging Face tokenizers. This class can be used to load from a - Hugging Face tokenizer.json file into a torchtune BaseTokenizer. + A wrapper around Hugging Face tokenizers. See https://github.com/huggingface/tokenizers + This can be used to load from a Hugging Face tokenizer.json file into a torchtune BaseTokenizer. This class will load the tokenizer.json file from tokenizer_json_path. It will attempt to infer BOS and EOS token IDs from config.json if possible, and if not From 9120af589ec982bce6fc4b27171217f9527d7af9 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Wed, 12 Feb 2025 17:43:06 -0800 Subject: [PATCH 7/9] more robust BOS/EOS handling, update docs and test --- docs/source/basics/tokenizers.rst | 21 ++++++++++++ .../tokenizers/test_hf_tokenizer.py | 32 +++++++++++++------ .../modules/transforms/tokenizers/__init__.py | 4 +-- .../transforms/tokenizers/_hf_tokenizer.py | 23 ++++++++++--- 4 files changed, 65 insertions(+), 15 deletions(-) diff --git a/docs/source/basics/tokenizers.rst b/docs/source/basics/tokenizers.rst index 47be88fe0c..2bf14eb7e0 100644 --- a/docs/source/basics/tokenizers.rst +++ b/docs/source/basics/tokenizers.rst @@ -222,6 +222,27 @@ to do the actual encoding and decoding. print(sp_tokenizer.encode(text)) # [1, 6312, 28709, 1526, 2] +.. _hf_tokenizers: + +Using Hugging Face tokenizers +----------------------------- + +Sometimes tokenizers hosted on Hugging Face do not contain files compatible with one of torchtune's +existing tokenizer classes. In this case, we provide :class:`~torchtune.modules.transforms.tokenizers.HuggingFaceBaseTokenizer` +to parse the Hugging Face ``tokenizer.json`` file and define the correct ``encode`` and ``decode`` methods to +match torchtune's other :class:`~torchtune.modules.transforms.tokenizers.BaseTokenizer` classes. You should also pass the path to +either ``tokenizer_config.json`` or ``generation_config.json``, which will allow torchtune to infer BOS and EOS tokens. +Continuing with the Mistral example: + +.. code-block:: python + hf_tokenizer = HuggingFaceBaseTokenizer( + tokenizer_json_path="/tmp/Mistral-7B-v0.1/tokenizer.json", + tokenizer_config_json_path="/tmp/Mistral-7B-v0.1/tokenizer_config.json", + ) + text = "hello world" + print(hf_tokenizer.encode(text)) + # [1, 6312, 28709, 1526, 2] + .. _model_tokenizers: Model tokenizers diff --git a/tests/torchtune/modules/transforms/tokenizers/test_hf_tokenizer.py b/tests/torchtune/modules/transforms/tokenizers/test_hf_tokenizer.py index 3dd0c68822..d6e33e76d3 100644 --- a/tests/torchtune/modules/transforms/tokenizers/test_hf_tokenizer.py +++ b/tests/torchtune/modules/transforms/tokenizers/test_hf_tokenizer.py @@ -4,21 +4,22 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + import pytest from tests.common import ASSETS from tokenizers import Tokenizer +from tokenizers.processors import TemplateProcessing from torchtune.models.llama3._tokenizer import CL100K_PATTERN from torchtune.modules.transforms.tokenizers import ( - HuggingFaceTokenizer, + HuggingFaceBaseTokenizer, TikTokenBaseTokenizer, ) - TOKENIZER_CONFIG_PATH = ASSETS / "tokenizer_config.json" GENERATION_CONFIG_PATH = ASSETS / "generation_config.json" -class TestHuggingFaceTokenizer: +class TestHuggingFaceBaseTokenizer: @pytest.fixture def tt_tokenizer(self): # Pretrained tiktoken model generated via the script in @@ -74,16 +75,16 @@ def token_ids(self): def test_invalid_hf_tokenizer(self): with pytest.raises(ValueError, match="At least one of"): - _ = HuggingFaceTokenizer( + _ = HuggingFaceBaseTokenizer( tokenizer_json_path=str(ASSETS / "tokenizer.json"), ) @pytest.mark.parametrize( - "config_path, generation_config_path", + "config_path, generation_config_path, hf_tokenizer_adds_bos", [ - (TOKENIZER_CONFIG_PATH, GENERATION_CONFIG_PATH), - (TOKENIZER_CONFIG_PATH, None), - (None, GENERATION_CONFIG_PATH), + (TOKENIZER_CONFIG_PATH, GENERATION_CONFIG_PATH, True), + (TOKENIZER_CONFIG_PATH, None, False), + (None, GENERATION_CONFIG_PATH, True), ], ) def test_tokenizer_encode_and_decode_parity( @@ -93,6 +94,7 @@ def test_tokenizer_encode_and_decode_parity( token_ids, config_path, generation_config_path, + hf_tokenizer_adds_bos, mocker, ): @@ -111,12 +113,24 @@ def patch_token_to_id_for_dummy_tokenizer(*args, **kwargs): ) # Tokenizer artifacts for this test were created from tiktoken_small.model # using the script in https://gist.github.com/ebsmothers/55b2f177f5ed15a3b81508f8f8b91159 - hf_tokenizer = HuggingFaceTokenizer( + hf_tokenizer = HuggingFaceBaseTokenizer( tokenizer_json_path=str(ASSETS / "tokenizer.json"), tokenizer_config_json_path=config_path, generation_config_path=generation_config_path, ) + if hf_tokenizer_adds_bos: + # This is a hacky way to patch the post-processor to prepend BOS + # (Patching with mocker doesn't work) + post_processor = TemplateProcessing( + single=" $0", pair=" $A $B", special_tokens=[("", 0)] + ) + hf_tokenizer.tokenizer.post_processor = post_processor + # Validate that the patch worked + assert hf_tokenizer.tokenizer.encode("").ids == [0] + # Re-call the method with the new post-processor + hf_tokenizer._infer_should_add_bos_eos() + tt_tokens = tt_tokenizer.encode(texts[0], add_bos=True, add_eos=True) hf_tokens = hf_tokenizer.encode(texts[0], add_bos=True, add_eos=True) diff --git a/torchtune/modules/transforms/tokenizers/__init__.py b/torchtune/modules/transforms/tokenizers/__init__.py index 5dd968fdcb..a0d4f1aa46 100644 --- a/torchtune/modules/transforms/tokenizers/__init__.py +++ b/torchtune/modules/transforms/tokenizers/__init__.py @@ -5,7 +5,7 @@ # LICENSE file in the root directory of this source tree. from ._gpt2 import GPT2BaseTokenizer -from ._hf_tokenizer import HuggingFaceTokenizer +from ._hf_tokenizer import HuggingFaceBaseTokenizer from ._sentencepiece import SentencePieceBaseTokenizer from ._tiktoken import TikTokenBaseTokenizer from ._utils import ( @@ -23,5 +23,5 @@ "BaseTokenizer", "tokenize_messages_no_special_tokens", "parse_hf_tokenizer_json", - "HuggingFaceTokenizer", + "HuggingFaceBaseTokenizer", ] diff --git a/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py b/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py index 5ee7f3f0fe..e95c64edd2 100644 --- a/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py +++ b/torchtune/modules/transforms/tokenizers/_hf_tokenizer.py @@ -11,7 +11,7 @@ from torchtune.modules.transforms.tokenizers._utils import BaseTokenizer -class HuggingFaceTokenizer(BaseTokenizer): +class HuggingFaceBaseTokenizer(BaseTokenizer): """ A wrapper around Hugging Face tokenizers. See https://github.com/huggingface/tokenizers This can be used to load from a Hugging Face tokenizer.json file into a torchtune BaseTokenizer. @@ -32,8 +32,8 @@ class HuggingFaceTokenizer(BaseTokenizer): def __init__( self, - *, tokenizer_json_path: str, + *, tokenizer_config_json_path: Optional[str] = None, generation_config_path: Optional[str] = None, ): @@ -53,6 +53,7 @@ def __init__( else: self.generation_config = None self._infer_bos_eos_tokens() + self._infer_should_add_bos_eos() def _get_token_from_config(self, config: Dict[str, Any], key: str) -> str: """ @@ -97,6 +98,20 @@ def _infer_bos_eos_tokens(self): if self.bos_id is None or self.eos_id is None: raise ValueError("Could not infer BOS and EOS token IDs from config") + def _infer_should_add_bos_eos(self): + """ + Hugging Face tokenizers sometimes add BOS by default. We should infer this to determine + whether to add it ourselves in encode. Otherwise we will get duplicate BOS tokens. + """ + + self.hf_adds_bos, self.hf_adds_eos = False, False + encoded_empty_str = self.tokenizer.encode("").ids + + if self.bos_id in encoded_empty_str: + self.hf_adds_bos = True + if self.eos_id in encoded_empty_str: + self.hf_adds_eos = True + def encode( self, text: str, add_bos: bool = True, add_eos: bool = True ) -> List[int]: @@ -114,9 +129,9 @@ def encode( List[int]: The list of token ids. """ token_ids = self.tokenizer.encode(text).ids - if add_bos: + if add_bos and not self.hf_adds_bos: token_ids.insert(0, self.bos_id) - if add_eos: + if add_eos and not self.hf_adds_eos: token_ids.append(self.eos_id) return token_ids From b1aba76e746ea99cb18bb406188ccc2cea74a78f Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Wed, 12 Feb 2025 17:47:56 -0800 Subject: [PATCH 8/9] docs formatting --- docs/source/basics/tokenizers.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/basics/tokenizers.rst b/docs/source/basics/tokenizers.rst index 2bf14eb7e0..855be27f41 100644 --- a/docs/source/basics/tokenizers.rst +++ b/docs/source/basics/tokenizers.rst @@ -235,11 +235,14 @@ either ``tokenizer_config.json`` or ``generation_config.json``, which will allow Continuing with the Mistral example: .. code-block:: python + hf_tokenizer = HuggingFaceBaseTokenizer( tokenizer_json_path="/tmp/Mistral-7B-v0.1/tokenizer.json", tokenizer_config_json_path="/tmp/Mistral-7B-v0.1/tokenizer_config.json", ) + text = "hello world" + print(hf_tokenizer.encode(text)) # [1, 6312, 28709, 1526, 2] From 64072776d897b9e26caea1f44fa2ad67c1b6a161 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Wed, 12 Feb 2025 17:54:08 -0800 Subject: [PATCH 9/9] Add to api_ref_modules.rst --- docs/source/api_ref_modules.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/api_ref_modules.rst b/docs/source/api_ref_modules.rst index 979e57347f..78298cc3e6 100644 --- a/docs/source/api_ref_modules.rst +++ b/docs/source/api_ref_modules.rst @@ -50,6 +50,7 @@ model specific tokenizers. transforms.tokenizers.SentencePieceBaseTokenizer transforms.tokenizers.TikTokenBaseTokenizer + transforms.tokenizers.HuggingFaceBaseTokenizer transforms.tokenizers.ModelTokenizer transforms.tokenizers.BaseTokenizer