forked from shlurbee/dmrs-text-generation-naacl2019
-
Notifications
You must be signed in to change notification settings - Fork 1
/
replace_rare.py
44 lines (37 loc) · 2.3 KB
/
replace_rare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
"""
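Command-line interface for anon replacements: builds vocab counts from a file
("vocab" mode) and replaces tokens that appear fewer than --min_freq times
("replace" mode). Helpful, e.g., for scripting different preprocessing steps.
For example, to get vocab counts from all training data and then do replacements
in both the training and dev data: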
# anonymize all unknowns
> python scripts/replace_rare.py vocab --vocabfile data_v2_all_anon/vocab.txt --infile data_v2_all_anon/train-tgt.txt
> python scripts/replace_rare.py replace --vocabfile data_v2_all_anon/vocab.txt --infile data_v2_all_anon/train --min_freq 10000
> python scripts/replace_rare.py replace --vocabfile data_v2_all_anon/vocab.txt --infile data_v2_all_anon/dev --min_freq 10000
# anonymize rare unknowns
> python scripts/replace_rare.py vocab --vocabfile data_v2_rare_anon/vocab.txt --infile data_v2_rare_anon/train-tgt.txt
> python scripts/replace_rare.py replace --vocabfile data_v2_rare_anon/vocab.txt --infile data_v2_rare_anon/train --min_freq 2
> python scripts/replace_rare.py replace --vocabfile data_v2_rare_anon/vocab.txt --infile data_v2_rare_anon/dev --min_freq 2
"""
import argparse
import preprocessing
import sys
def main(argv=None):
    """Run the replace_rare command-line interface.

    In "vocab" mode, calls preprocessing.build_vocab(infile, vocabfile).
    In "replace" mode, calls
    preprocessing.replace_rare_tokens(infile, vocabfile, min_freq).

    Args:
        argv: Optional list of argument strings. When None, argparse reads
            the arguments from sys.argv.

    Exits with status 1, after writing a message to stderr, when
    --vocabfile or --infile is missing, or when --min_freq is not given
    in "replace" mode.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('mode', choices=['vocab', 'replace'],
                        help='One of "vocab" or "replace"')
    parser.add_argument('--vocabfile',
                        help='filename where vocab is or will be stored')
    parser.add_argument('--infile',
                        help='name or prefix file where tokens will be counted or replaced (usually a -tgt file)')
    parser.add_argument('--min_freq', type=int,
                        help='vocab tokens that appear fewer than this many times will be replaced')
    args = parser.parse_args(argv)

    if not (args.vocabfile and args.infile):
        sys.stderr.write('--vocabfile and --infile are required arguments\n')
        sys.exit(1)
    # Compare against None rather than relying on truthiness, so that an
    # explicit "--min_freq 0" is accepted instead of being reported as missing.
    if args.mode == 'replace' and args.min_freq is None:
        sys.stderr.write('--min_freq is required for replace mode\n')
        sys.exit(1)

    if args.mode == 'vocab':
        sys.stderr.write('Building vocab from anon file {}\n'.format(args.infile))
        preprocessing.build_vocab(args.infile, args.vocabfile)
    elif args.mode == 'replace':
        sys.stderr.write('Using vocab counts in {} to replace tokens in {} with '
                         'freq less than {}\n'.format(args.vocabfile, args.infile, args.min_freq))
        preprocessing.replace_rare_tokens(args.infile, args.vocabfile, args.min_freq)


if __name__ == '__main__':
    main()