forked from mjpost/dptsg
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtsg.pl
executable file
·261 lines (218 loc) · 7.68 KB
/
tsg.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
#!/usr/bin/perl
# Matt Post <[email protected]>
# Gibbs sampler for learning a tree substitution grammar with a
# Dirichlet process prior. See
#
# "Bayesian learning of a tree substitution grammar". Matt Post and
# Daniel Gildea. ACL (short paper). Singapore.
#
# for more information.
#
# Sample usage:
# tsg.pl -alpha 100 -stop 0.9 -lexicon lex -thresh 0 -corpus corpus -pcfg pcfg
#
# where
# - {alpha} and {stop} are hyperparameters to the DP prior
# - {lexicon} determines the lexicon used to convert leaves to UNK tokens
# - all words occuring fewer than {thresh} times (from the lexicon)
# are converted to an unknown word category
# - {corpus} is the WSJ corpus, one parse tree per line, in parenthetical
# form, with the root node being TOP
# - {pcfg} is the MLE depth-one PCFG grammar used in the base measure
#
# Arguments can be passed as environment variables or command-line
# arguments, with the latter overriding the former (and both
# overriding code-supplied defaults).
use strict;
use warnings;
use POSIX qw|strftime|;
use List::Util qw|reduce min max|;
use Memoize;
use Sampler qw(compress_files);
use Sampler::TSG;
use TSG;
#use Clone qw/clone/;
my $basedir = $ENV{DPTSG};
# parameters (via environment variables and command-line params)
my %PARAMS = (
alpha => 10, # DP parameter
iters => 500, # number of iterations
stop => 0.9, # stop prob for base geometric distribution
log => 0,
lexicon => "$basedir/data/lex.02-21",
pcfg => "$basedir/data/pcfg_rules.prb",
'*unordered' => 0, # whether RHS of PCFG should be considered unordered
thresh => 1, # threshold for converting words to UNKs
corpus => "$basedir/data/wsj.trees.02-21.clean",
rundir => $ENV{PWD},
'*two' => 0, # sample two nodes every {two}th iter (empty arg = 1)
dump => 1, # frequency with which to dump corpus and counts
'*startover' => 0, # start over even if there are existing iters completed
srand => undef,
verbosity => 1 );
# process command-line parameters
process_params(\%PARAMS,\@ARGV,\%ENV);
my $lexicon = read_lexicon($PARAMS{lexicon},$PARAMS{thresh});
# initialize the random number generator if requested
srand($PARAMS{srand}) if defined $PARAMS{srand};
my @corpus;
my %counts;
my %rules; # PCFG probs for regular rules
my %samples; # samples gathered
my $size = 0; # number of tree fragments in the corpus
my $debug = 0;
my $loghandle;
chdir $PARAMS{rundir} or die "couldn't chdir to '$PARAMS{rundir}'";
# We need the PCFG rules in order to score fragments according to the
# base model, so read them in to pass them into the sampler
my $pcfg_file = $PARAMS{pcfg};
print STDERR "Reading PCFG rules...";
open RULES, $pcfg_file or die "can't read PCFG rules file '$pcfg_file'";
while (my $line = <RULES>) {
chomp($line);
my ($prb, @tokens) = split ' ', $line;
my $rep = join " ", @tokens;
$rules{$rep} = $prb;
}
close RULES;
print STDERR "done (read " . (scalar keys %rules) . " rules).\n";
$PARAMS{rules} = \%rules;
# find the highest directory from a previous run, and pick up from
# there unless -startover was specified on the command line
my $PICKING_UP = 0;
my $bzip = "/usr/bin/bzip2";
$bzip = "$ENV{HOME}/bin/bzip2" if ! -e $bzip;
opendir DIR, $PARAMS{rundir} or die "can't read files in rundir '$PARAMS{rundir}'";
my $iter = max(1, grep(/^\d+$/, readdir(DIR)));
closedir DIR;
if ($iter == 1 || $PARAMS{startover}) {
$iter = 1;
print STDERR "Initializing counts with first pass over $PARAMS{corpus}.\...";
my $sentno = 0;
open CORPUS, $PARAMS{corpus} or die "can't open corpus '$PARAMS{corpus}'";
while (my $line = <CORPUS>) {
chomp $line;
next if $line =~ /^$/;
$sentno++;
my $tree = build_subtree($line,$lexicon);
if ( (scalar @{$tree->{children}}) > 1 ) {
my $found = ruleof($tree,1);
print "\n* FATAL: tree $.: top-level rule must be unary rule labeled 'TOP'\n (found '$found')\n";
exit(1);
}
push(@corpus, $tree)
if defined $tree;
}
close CORPUS;
print STDERR "done.\n";
} else {
if ($iter == $PARAMS{iters}) {
print "* Already have $iter iterations of output, quitting\n";
exit;
}
$PICKING_UP = 1;
print STDERR "Picking up where we left off (iteration $iter)...";
my $corpus = "$iter/corpus";
if (! -e $corpus and -e "${corpus}.bz2") {
print "decompressing compressed corpus file ${corpus}.bz2\n";
system("$bzip -d ${corpus}.bz2");
}
open CORPUS, $corpus or die "can't open corpus '$corpus'";
while (my $line = <CORPUS>) {
chomp $line;
my $tree = build_subtree($line,$lexicon);
push(@corpus, $tree)
if defined $tree;
}
close CORPUS;
if (-e $corpus and ! -e "${corpus}.bz2") {
compress_files($corpus);
}
print STDERR "done.\n";
$iter++;
}
# create the sampler, passing it the command line parameters (some of
# which it might use), and set the corpus, which has been read in.
my $sampler = new Sampler::TSG(%PARAMS);
$sampler->corpus(\@corpus);
$sampler->count();
# print "saw $size events\n";
# iterate until completion ($iter was set earlier, in case we picked
# up from an existing run)
for ( ; $iter <= $PARAMS{iters}; $iter++) {
print "ITERATION $iter TIMESTAMP ", , $/;
# allows for easy kills when you don't know which process is running
# in the current dir and only want to kill that one
my $stop_file = ".stop";
if (-e $stop_file) {
unlink($stop_file);
print "QUITTING ON PRESENCE OF STOP FILE\n";
exit;
}
# log
open $sampler->{logfh}, ">log.$iter" if ($PARAMS{log});
my $start_time = time;
# sample_all visits all nodes of all sentences in the corpus, and
# applies the function pointer passed to it to each of those nodes.
# Here, we pass it a function that considers either one or two nodes
# at a time, depending on the program arguments
if ($PARAMS{two} && !($iter % $PARAMS{two})) {
$sampler->sample_all($iter,$sampler->can('sample_two'));
} else {
$sampler->sample_all($iter,$sampler->can('sample_each_TSG'));
}
# sanity check -- put in place after a bug was discovered (we cache
# the LHS counts, and were not incrementing/decrementing them
# correctly)
if ($sampler->check_counts()) {
print "ITERATION $iter passed sanity check.\n";
} else {
print "* FATAL: failed sanity check.\n";
exit;
}
# map { print "$counts{$_} '$_'\n" } (keys %counts);
my $dur = time() - $start_time;
my $nicedur = mytime($dur);
mylog("ITERATION $iter took $dur seconds ($nicedur)",1);
my $types = $sampler->types();
my $tokens = $sampler->tokens();
mylog("ITERATION $iter splits:$sampler->{splits} merges:$sampler->{merges} types:$types tokens:$tokens",1);
my $likelihood = $sampler->likelihood();
mylog("ITERATION $iter log likelihood $likelihood");
# print "ITERATION stats ", (scalar keys %counts), " keys\n";
# my @newcorpus = map { build_tree_oneline($_) } @corpus;
# print "ITERATION size corpus ", total_size(\@newcorpus), " counts ", total_size(\%counts), $/;
close $sampler->{logfh} if ($PARAMS{log});
if ($PARAMS{dump} and ! ($iter % $PARAMS{dump})) {
$sampler->dump_corpus($iter);
$sampler->dump_counts($iter);
}
}
# obtains the appropriate filehandle for logging to, and logs the
# message
sub mylog {
my ($msg,$stdout) = @_;
my $fh = loghandle();
print $fh "$msg\n";
print $msg, $/
if $stdout;
}
sub loghandle() {
if (! defined $loghandle) {
if ($PICKING_UP) {
open $loghandle, ">>out.log" or die "can't open logfile";
} else {
open $loghandle, ">out.log" or die "can't open logfile";
}
}
return $loghandle;
}
sub mytime {
my $secs = shift;
my $hours = int($secs / 3600);
$secs %= 3600;
my $mins = int($secs / 60);
$secs %= 60;
my $time = "${hours}h ${mins}m ${secs}s";
return $time;
}