forked from neubig/multi-extract
-
Notifications
You must be signed in to change notification settings - Fork 0
/
train-multi.pl
executable file
·144 lines (125 loc) · 5.4 KB
/
train-multi.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/perl
use strict;
use warnings;
use utf8;
use Getopt::Long;
use List::Util qw(sum min max shuffle);
# Treat all three standard streams as UTF-8.
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

# Defaults; each can be overridden on the command line.
my $THREADS=2;          # values above 1 make run_two() fork a second process
my $SRC="en";           # source-language suffix used in file/directory names
my $LMSIZE="0100000";   # size tag spliced into LM/model names (string, so the leading zero survives)
my $TMSIZE="0100000";   # size tag spliced into training-data/model names

# GetOptions returns false on an unknown option or a malformed value. Treat
# that as a usage error rather than silently training with the defaults.
# --threads is compared numerically later, so parse it as an integer.
my $options_ok = GetOptions(
    "lmsize=s" => \$LMSIZE,
    "tmsize=s" => \$TMSIZE,
    "threads=i" => \$THREADS,
    "src=s" => \$SRC,
);
# The remaining arguments are the target-language suffixes; at least one is required.
if(!$options_ok or @ARGV == 0) {
    print STDERR "Usage: $0 ar zh ...\n";
    exit 1;
}

# Tool locations are all derived from the user's home directory.
my $HOME = $ENV{"HOME"};
defined $HOME or die "HOME environment variable is not set\n";
my $MULTDIR="$HOME/work/multi-extract";
my $TRAVDIR="$HOME/work/travatar";
my $GIZADIR="$HOME/usr/local/giza-pp";
# Absolute working directory, used later to write absolute paths into the config.
my $WD=`pwd`; chomp $WD;
my @trgs = @ARGV;
#################### Rule extraction ##############################
# Collect every required input: the source-side training text, then for each
# target language its training text and the alignment file of its standard model.
my @files = ("tok/train-$TMSIZE.$SRC");
my @standmod;
for my $lang (@trgs) {
    my $model_dir = "standard-model/$SRC$lang-lm$LMSIZE-tm$TMSIZE";
    push @standmod, $model_dir;
    push @files, "tok/train-$TMSIZE.$lang", "$model_dir/align/align.txt";
}
# Abort on the first missing input.
for my $path (@files) {
    die "Could not find file $path\n" unless -e $path;
}

# Model identifier: source + all targets, one LM size tag per target joined by "x".
my $lm_tag = join("x", ($LMSIZE) x @trgs);
my $ID = $SRC . join("", @trgs) . "-lm${lm_tag}-tm${TMSIZE}-fstd";

# Refuse to overwrite an existing model directory.
die "multi-model/$ID already exists" if -e "multi-model/$ID";

# Perform rule extraction
my $model_out = "multi-model/$ID/model";
safesystem("mkdir -p $model_out") or die;
# NOTE(review): safesystem sees only the status the shell reports for the whole
# pipeline, which is normally that of the last stage (gzip) -- confirm whether
# failures of earlier stages need to be detected.
safesystem("$MULTDIR/multi-extract.py @files | gzip > $model_out/extract.gz") or die;

# Score the table as a whole with no lexical weighting.
# The two directions are independent, so they are handed to run_two() together.
my $sort = "env LC_ALL=C sort";
my $cmd1 = join(" | ",
    "zcat $model_out/extract.gz",
    $sort,
    "$TRAVDIR/script/train/score-t2s.pl --cond-prefix=egf --joint",
    $sort,
    "gzip > $model_out/rule-table.src-trg.all.gz",
);
my $cmd2 = join(" | ",
    "zcat $model_out/extract.gz",
    "$TRAVDIR/script/train/reverse-rt.pl",
    $sort,
    "$TRAVDIR/script/train/score-t2s.pl --cond-prefix=fge",
    "$TRAVDIR/script/train/reverse-rt.pl",
    $sort,
    "gzip > $model_out/rule-table.trg-src.all.gz",
);
run_two($cmd1, $cmd2);
# Score each factor of the table with conditional probabilities and lexical
# weights, using the lexicon files of the matching standard model.
for my $factnum (0 .. $#trgs) {
    my $lex_dir   = "$standmod[$factnum]/lex";
    my $model_out = "multi-model/$ID/model";
    my $sort      = "env LC_ALL=C sort";
    # Both directions start by pulling this factor out of the shared extract file.
    my $extract   = "zcat $model_out/extract.gz | $MULTDIR/extract-factor.pl $factnum";

    # Forward direction: score directly.
    my $fwd_cmd = join(" | ",
        $extract,
        $sort,
        "$TRAVDIR/script/train/score-t2s.pl --lex-prob-file=$lex_dir/trg_given_src.lex --prefix=$factnum --cond-prefix=egf --joint",
        $sort,
        "gzip > $model_out/rule-table.src-trg.$factnum.gz",
    );
    # Reverse direction: reverse the table, score, then reverse back.
    my $rev_cmd = join(" | ",
        $extract,
        "$TRAVDIR/script/train/reverse-rt.pl",
        $sort,
        "$TRAVDIR/script/train/score-t2s.pl --lex-prob-file=$lex_dir/src_given_trg.lex --prefix=$factnum --cond-prefix=fge",
        "$TRAVDIR/script/train/reverse-rt.pl",
        $sort,
        "gzip > $model_out/rule-table.trg-src.$factnum.gz",
    );
    run_two($fwd_cmd, $rev_cmd);
}
# Create the multi-output phrase table
# Tables are passed in a fixed order: the whole-table scores ("all") first,
# then one pair per target factor; within each pair src-trg precedes trg-src.
my @tables;
for my $part ("all", 0 .. $#trgs) {
    for my $dir (qw(src-trg trg-src)) {
        push @tables, "multi-model/$ID/model/rule-table.$dir.$part.gz";
    }
}
# Stop on failure like every other step; otherwise the script would go on to
# write a config that points at a missing or partial rule table.
safesystem("$MULTDIR/combine-multi-rt.pl @tables | gzip > multi-model/$ID/model/rule-table.gz") or die;
# Create the glue rules
# Two rules are written; the target side of each repeats the same pattern once
# per target language, separated by " |COL| ".
my $gfile = "$WD/multi-model/$ID/model/glue-rules";
open my $glue_fh, ">:utf8", $gfile or die "Couldn't open $gfile: $!\n";
my $unary_rule  = 'x0:X @ S';
my $binary_rule = 'x0:S x1:X @ S';
# The first rule has an empty feature field (hence the trailing space);
# the second carries glue=1.
print {$glue_fh} "$unary_rule ||| ", join(" |COL| ", ($unary_rule) x @trgs), " ||| \n";
print {$glue_fh} "$binary_rule ||| ", join(" |COL| ", ($binary_rule) x @trgs), " ||| glue=1\n";
# Buffered write errors only surface at close, so check it.
close $glue_fh or die "Couldn't write $gfile: $!\n";
# Create the config file
# All paths written into the file are absolute (prefixed with the working directory).
my $TINI_FILE = "$WD/multi-model/$ID/model/travatar.ini";
my $TM_FILES = "$WD/multi-model/$ID/model/rule-table.gz\n$WD/multi-model/$ID/model/glue-rules";
# One language-model line per target; the factor index also prefixes its feature names.
my $LM_FILES = join("\n", map { "$WD/lm/$trgs[$_]-lm$LMSIZE.blm|factor=$_,lm_feat=${_}lm,lm_unk_feat=${_}lmunk" } (0 .. $#trgs));
open my $tini_fh, ">:utf8", $TINI_FILE or die "Couldn't open $TINI_FILE: $!\n";
print {$tini_fh} "[tm_file]\n$TM_FILES\n\n";
print {$tini_fh} "[lm_file]\n$LM_FILES\n\n";
print {$tini_fh} "[in_format]\nword\n\n";
print {$tini_fh} "[tm_storage]\nfsm\n\n";
print {$tini_fh} "[search]\ncp\n\n";
# Number of target factors = number of target languages.
print {$tini_fh} "[trg_factors]\n".scalar(@trgs)."\n\n";
print {$tini_fh} "[hiero_span_limit]\n20\n1000\n\n";
# Default values for the weights
# NOTE(review): only factor-0 features (0egfp, 0lm, ...) get explicit defaults
# here -- confirm whether additional target factors need their own entries.
my $weights = "0egfp=0.05\n0egfl=0.05\n0fgep=0.05\n0fgel=0.05\n0lm=0.3\n0w=0.3\np=-0.15\nunk=-1\nlfreq=0.05\n";
print {$tini_fh} "[weight_vals]\n$weights\n";
# Buffered write errors only surface at close, so check it before reporting success.
close $tini_fh or die "Couldn't write $TINI_FILE: $!\n";
print "Finished training! You can find the configuation file in:\n$TINI_FILE\n";
#################### Utility functions ############################
# Adapted from Moses's train-model.perl
# Run a command through system(), logging it to STDERR first.
#
# Arguments: passed straight to system(), so either a single shell command
#            string or a program followed by its argument list.
# Returns:   true when the command exited with status 0, false otherwise
#            (the non-zero exit code is logged to STDERR).
# Exits the whole script with status 1 if the command could not be started
# or was killed by a signal.
sub safesystem {
    print STDERR "Executing: @_\n";
    system(@_);
    if ($? == -1) {
        # system() could not start the command at all; $! holds the reason.
        print STDERR "ERROR: Failed to execute: @_\n $!\n";
        exit(1);
    }
    elsif ($? & 127) {
        # Low 7 bits of $? are the terminating signal, bit 128 flags a core dump.
        # The command text is passed as an argument rather than interpolated
        # into the format, so a literal '%' in it cannot corrupt the message.
        printf STDERR "ERROR: Execution of: %s\n died with signal %d, %s coredump\n",
            "@_", ($? & 127), ($? & 128) ? 'with' : 'without';
        exit(1);
    }
    else {
        # Normal termination: the exit code lives in the high byte of $?.
        my $exitcode = $? >> 8;
        print STDERR "Exit code: $exitcode\n" if $exitcode;
        return ! $exitcode;
    }
}
# Run two shell commands, concurrently when more than one thread is allowed.
#
# Arguments: exactly two command strings.
# With $THREADS > 1 the first command runs in a forked child while the second
# runs in the parent; otherwise they run one after the other.
# Dies if either command fails.
sub run_two {
    @_ == 2 or die "run_two handles two commands, got @_\n";
    my ($CMD1, $CMD2) = @_;
    if($THREADS > 1) {
        my $pid = fork();
        die "ERROR: couldn't fork" unless defined $pid;
        if(!$pid) {
            # Child: a failure makes die() end the child with a non-zero status,
            # which the parent picks up below.
            safesystem("$CMD1") or die;
            exit 0;
        }
        # Parent: run the second command, but always reap the child before
        # reporting anything so it is never left behind.
        my $second_ok = safesystem("$CMD2");
        my $reaped = waitpid($pid, 0);
        my $child_status = $?;
        $second_ok or die;
        # Previously the child's status was discarded, so a failed first
        # command went unnoticed and training continued on bad output.
        ($reaped == $pid and $child_status == 0)
            or die "ERROR: background command failed (wait status $child_status): $CMD1\n";
    } else {
        safesystem("$CMD1") or die;
        safesystem("$CMD2") or die;
    }
}