-
Notifications
You must be signed in to change notification settings - Fork 0
/
count_cfg_freq.py
executable file
·73 lines (58 loc) · 1.7 KB
/
count_cfg_freq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/Users/husnusensoy/Downloads/pypy-2.0-beta2/bin/pypy
__author__="Alexander Rush <[email protected]>"
__date__ ="$Sep 12, 2012"
import sys, json
"""
Count rule frequencies in a binarized CFG.
"""
class Counts(object):
    """Accumulate non-terminal and rule frequencies from binarized parse trees.

    A tree is either a bare string (a word) or a list whose first element is
    the node's symbol: ``[symbol, left, right]`` for a binary rule and
    ``[symbol, word]`` for a unary rule.

    Attributes:
        unary: dict mapping ``(symbol, word)`` to the number of times seen.
        binary: dict mapping ``(symbol, left_symbol, right_symbol)`` to the
            number of times seen.
        nonterm: dict mapping ``symbol`` to the number of times seen.
    """

    # ``basestring`` only exists on Python 2.  This tuple matches the same
    # objects there (str and unicode) and collapses to (str, str) on Python 3.
    _STRING_TYPES = (str, type(u""))

    def __init__(self):
        self.unary = {}
        self.binary = {}
        self.nonterm = {}

    def show(self):
        """Print every collected count to standard output, one per line.

        Non-terminals are printed first, then unary rules, then binary rules,
        each line starting with the count followed by the record type.
        """
        # A single parenthesized argument makes ``print`` produce the same
        # output whether it is parsed as the Python 2 statement or called as
        # the Python 3 function.
        for symbol, count in self.nonterm.items():
            print("%s NONTERMINAL %s" % (count, symbol))

        for (sym, word), count in self.unary.items():
            print("%s UNARYRULE %s %s" % (count, sym, word))

        for (sym, y1, y2), count in self.binary.items():
            print("%s BINARYRULE %s %s %s" % (count, sym, y1, y2))

    def count(self, tree):
        """Count the frequencies of non-terminals and rules in the tree.

        Args:
            tree: a string (ignored) or a list whose first element is the
                node symbol.  A three-element list is treated as a binary
                rule and its two children are counted recursively; a
                two-element list is treated as a unary rule.  Lists of any
                other length contribute only their non-terminal count.
        """
        # A bare string is a word, not a node: nothing to count.
        if isinstance(tree, self._STRING_TYPES):
            return

        # Count the non-terminal symbol.
        symbol = tree[0]
        self.nonterm[symbol] = self.nonterm.get(symbol, 0) + 1

        if len(tree) == 3:
            # It is a binary rule: key on the symbols of both children.
            left, right = tree[1], tree[2]
            key = (symbol, left[0], right[0])
            self.binary[key] = self.binary.get(key, 0) + 1

            # Recursively count the children.
            self.count(left)
            self.count(right)
        elif len(tree) == 2:
            # It is a unary rule: key on the symbol and the emitted word.
            key = (symbol, tree[1])
            self.unary[key] = self.unary.get(key, 0) + 1
def main(parse_file):
    """Count rule frequencies over a file of trees and print the totals.

    Args:
        parse_file: path to a text file holding one JSON-encoded tree per
            line.  Blank lines are skipped.

    Raises:
        IOError/OSError: if the file cannot be opened.
        ValueError: if a non-blank line is not valid JSON.
    """
    counter = Counts()
    # ``with`` guarantees the file is closed even if a line fails to parse.
    with open(parse_file) as tree_file:
        for line in tree_file:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of letting json.loads fail on them.
            if not line.strip():
                continue
            counter.count(json.loads(line))
    counter.show()
def usage():
    """Write the command-line usage message to standard error."""
    message = (
        "\n"
        "Usage: python count_cfg_freq.py [tree_file]\n"
        "Print the counts of a corpus of trees.\n"
    )
    sys.stderr.write(message)
if __name__ == "__main__":
    # Exactly one command-line argument (the tree file) is expected;
    # anything else prints the usage message and exits with status 1.
    if len(sys.argv) == 2:
        main(sys.argv[1])
    else:
        usage()
        sys.exit(1)