From a06eb6baff4ea8434999cb2bce5e6553f0d58bf2 Mon Sep 17 00:00:00 2001
From: Omer Jakobinsky
Date: Sun, 2 Apr 2017 15:24:56 -0700
Subject: [PATCH] Do not use test and validation datasets whilst building the vocabulary

---
 scripts/preprocess.py | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/scripts/preprocess.py b/scripts/preprocess.py
index 90b834b6..290a0326 100644
--- a/scripts/preprocess.py
+++ b/scripts/preprocess.py
@@ -20,21 +20,33 @@
 if __name__ == '__main__':
   if args.encoding == 'bytes': args.encoding = None
 
-  # First go the file once to see how big it is and to build the vocab
-  token_to_idx = {}
+  # First go through the file once to see how big it is
   total_size = 0
   with codecs.open(args.input_txt, 'r', args.encoding) as f:
     for line in f:
       total_size += len(line)
-      for char in line:
-        if char not in token_to_idx:
-          token_to_idx[char] = len(token_to_idx) + 1
 
   # Now we can figure out the split sizes
   val_size = int(args.val_frac * total_size)
   test_size = int(args.test_frac * total_size)
   train_size = total_size - val_size - test_size
-
+
+  # Scan only the first train_size characters of the file to build the vocab
+  token_to_idx = {}
+  with codecs.open(args.input_txt, 'r', args.encoding) as f:
+    cur_idx = 0
+    for line in f:
+      for char in line:
+        cur_idx += 1
+        if cur_idx <= train_size:
+          if char not in token_to_idx:
+            token_to_idx[char] = len(token_to_idx) + 1
+        else:
+          break # break out of the nested loop
+      else:
+        continue # executed if the inner loop ended normally
+      break # executed if 'continue' was skipped
+
   if not args.quiet:
     print 'Total vocabulary size: %d' % len(token_to_idx)
     print 'Total tokens in file: %d' % total_size
@@ -61,7 +73,8 @@
   with codecs.open(args.input_txt, 'r', args.encoding) as f:
     for line in f:
       for char in line:
-        splits[split_idx][cur_idx] = token_to_idx[char]
+        if char in token_to_idx:
+          splits[split_idx][cur_idx] = token_to_idx[char]
         cur_idx += 1
         if cur_idx == splits[split_idx].size:
           split_idx += 1
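
Note (not part of the patch itself): the sketch below is a minimal, self-contained illustration of what the change does, using assumed toy values; the names text and encoded and the literal train_size = 6 are only for illustration and do not appear in preprocess.py. The vocabulary is built from the first train_size characters only, so a character that occurs solely in the validation or test portion never gets an index; the guard added in the second hunk keeps such characters from raising a KeyError and leaves them encoded as 0.

  import numpy as np

  text = 'abcabcxyz'   # toy input; 'x', 'y', 'z' occur only in the held-out tail
  train_size = 6       # pretend the first 6 characters are the training split

  # Pass 1: build the vocab from the training prefix only.
  token_to_idx = {}
  for char in text[:train_size]:
    if char not in token_to_idx:
      token_to_idx[char] = len(token_to_idx) + 1   # indices start at 1, as in preprocess.py

  # Pass 2: encode the whole text; characters unseen in pass 1 stay at 0.
  encoded = np.zeros(len(text), dtype=np.uint8)
  for i, char in enumerate(text):
    if char in token_to_idx:
      encoded[i] = token_to_idx[char]

  print encoded   # prints [1 2 3 1 2 3 0 0 0]

Because cur_idx is still incremented for out-of-vocabulary characters in the real second pass, the split sizes are unchanged; such characters simply occupy a slot encoded as 0 instead of being dropped.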