From a06eb6baff4ea8434999cb2bce5e6553f0d58bf2 Mon Sep 17 00:00:00 2001
From: Omer Jakobinsky
Date: Sun, 2 Apr 2017 15:24:56 -0700
Subject: [PATCH] Do not use test and validation datasets whilst building the vocabulary

---
 scripts/preprocess.py | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/scripts/preprocess.py b/scripts/preprocess.py
index 90b834b6..290a0326 100644
--- a/scripts/preprocess.py
+++ b/scripts/preprocess.py
@@ -20,21 +20,33 @@
 if __name__ == '__main__':
   if args.encoding == 'bytes': args.encoding = None
 
-  # First go the file once to see how big it is and to build the vocab
-  token_to_idx = {}
+  # First go through the file once to see how big it is
   total_size = 0
   with codecs.open(args.input_txt, 'r', args.encoding) as f:
     for line in f:
       total_size += len(line)
-      for char in line:
-        if char not in token_to_idx:
-          token_to_idx[char] = len(token_to_idx) + 1
 
   # Now we can figure out the split sizes
   val_size = int(args.val_frac * total_size)
   test_size = int(args.test_frac * total_size)
   train_size = total_size - val_size - test_size
-
+
+  # Scan only the first train_size characters of the file to build the vocab
+  token_to_idx = {}
+  with codecs.open(args.input_txt, 'r', args.encoding) as f:
+    cur_idx = 0
+    for line in f:
+      for char in line:
+        cur_idx += 1
+        if cur_idx <= train_size:
+          if char not in token_to_idx:
+            token_to_idx[char] = len(token_to_idx) + 1
+        else:
+          break # break out of the nested loop
+      else:
+        continue # executed if the inner loop ended normally
+      break # executed if 'continue' was skipped
+
   if not args.quiet:
     print 'Total vocabulary size: %d' % len(token_to_idx)
     print 'Total tokens in file: %d' % total_size
@@ -61,7 +73,8 @@
   with codecs.open(args.input_txt, 'r', args.encoding) as f:
     for line in f:
       for char in line:
-        splits[split_idx][cur_idx] = token_to_idx[char]
+        if char in token_to_idx:
+          splits[split_idx][cur_idx] = token_to_idx[char]
         cur_idx += 1
         if cur_idx == splits[split_idx].size:
           split_idx += 1
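
Note (not part of the patch itself): the sketch below is a minimal, self-contained illustration of what the change does, using assumed toy values; the names text and encoded and the literal train_size = 6 are only for illustration and do not appear in preprocess.py. The vocabulary is built from the first train_size characters only, so a character that occurs solely in the validation or test portion never gets an index; the guard added in the second hunk keeps such characters from raising a KeyError and leaves them encoded as 0.

  import numpy as np

  text = 'abcabcxyz'   # toy input; 'x', 'y', 'z' occur only in the held-out tail
  train_size = 6       # pretend the first 6 characters are the training split

  # Pass 1: build the vocab from the training prefix only.
  token_to_idx = {}
  for char in text[:train_size]:
    if char not in token_to_idx:
      token_to_idx[char] = len(token_to_idx) + 1   # indices start at 1, as in preprocess.py

  # Pass 2: encode the whole text; characters unseen in pass 1 stay at 0.
  encoded = np.zeros(len(text), dtype=np.uint8)
  for i, char in enumerate(text):
    if char in token_to_idx:
      encoded[i] = token_to_idx[char]

  print encoded   # prints [1 2 3 1 2 3 0 0 0]

Because cur_idx is still incremented for out-of-vocabulary characters in the real second pass, the split sizes are unchanged; such characters simply occupy a slot encoded as 0 instead of being dropped.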