-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtopicFormatting.pl
143 lines (124 loc) · 3.65 KB
/
topicFormatting.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#TREC topics (tested on Robust04 topics) converted to Indri's file format.
#TODO: instead of manual XML parsing, use a library.
use strict;
#Normalises free text into lower-case, single-blank-separated alphanumeric words.
#  argument: the string to normalise
#  returns:  the normalised string (empty when nothing alphanumeric remains)
sub clean {
    my ($text) = @_;
    #every character outside a-z, A-Z and 0-9 (tabs and newlines included) becomes a blank
    $text =~ s/[^a-zA-Z0-9]/ /g;
    #splitting on ' ' skips leading blanks and treats each run of blanks as one separator,
    #so re-joining with single blanks trims and squeezes in one step
    my @words = split(' ', $text);
    return lc(join(' ', @words));
}
#Command line handling: five positional arguments are required.
my $numArgs = scalar(@ARGV);
if ($numArgs < 5) {
    #usage errors go to STDERR with a non-zero exit status so that calling scripts can detect them
    print STDERR "Five command line arguments expected: [input: TREC formatted topic file] [output: Indri formatted file] [part (title/desc/narr)] [retrieval rule] [sequential dependence (1 or anything else)]\n";
    exit 1;
}
my $infile = $ARGV[0]; #topic file in TREC format
my $outfile = $ARGV[1]; #topic file in Indri format
my $topicType = $ARGV[2]; #parts of the topic to use; sections are selected by substring match, so name every wanted part (e.g. title+desc)
my $retrievalRule = $ARGV[3]; #retrieval rule
my $seqDependence = $ARGV[4]; #sequential dependence ("1" switches it on)
#the topic part has to mention at least one known section name
unless ($topicType =~ m/title|desc|narr/) {
    print STDERR "The third argument (now: $topicType) must be one of title|desc|narr or a combination of them, e.g. title+desc.\n";
    exit 1;
}
#Load the stopword lookup table from "stoplist.dft" (resolved against the current working directory).
#Only <word>...</word> entries contribute a key; keys are lower-cased.
my %stopwords = ();
open(my $stopFh, '<', "stoplist.dft") or die "stoplist.dft: $!";
while (my $entry = <$stopFh>) {
    #lines carrying the <parameters>/<stopper> wrapper tags hold no stopword
    next if $entry =~ m/<.*parameters>/ || $entry =~ m/<.*stopper>/;
    #capture the text between the tags instead of stripping around it: this also works for a
    #final line without a newline and for padded entries, and it ignores blank lines.
    #The greedy prefix means the last <word> on a line wins if several share one line.
    next unless $entry =~ m/.*<word>\s*(.*?)\s*<\/word>/;
    $stopwords{lc($1)} = 1;
}
close($stopFh);
#Builds the Indri query string for one topic.
#  $query         text collected for the topic
#  $rule          retrieval rule from the command line
#  $dependence    "1" requests the sequential dependence formulation
#  $stopwordsRef  reference to the stopword lookup hash
#  returns:       the text to place inside <text>...</text>
sub buildIndriQuery {
    my ($query, $rule, $dependence, $stopwordsRef) = @_;

    #okapi and tfidf receive the plain cleaned term list, without Indri operators
    return clean($query) if $rule =~ m/(okapi|tfidf)/;

    #we need to remove stopwords to get valid sd elements
    my @stoppedTokens = grep { !exists $stopwordsRef->{$_} } split(/\s+/, clean($query));

    #dependence switched off, or fewer than three content terms: plain bag of words
    return "#combine($query)" if $dependence ne "1" || @stoppedTokens < 3;

    #sequential dependence: unigrams, exact adjacent pairs (#1) and unordered windows of 8 (#uw8)
    my @pairs = map { "$stoppedTokens[$_] $stoppedTokens[$_+1]" } 0 .. $#stoppedTokens - 1;
    my $ordered = join("", map { "#1($_) " } @pairs);
    my $unordered = join("", map { "#uw8($_) " } @pairs);
    return "#weight( 0.9 #combine(" . clean($query) . ") 0.05 #combine($ordered) 0.05 #combine($unordered))";
}

#Convert the TREC topics into an Indri query parameter file.
#NOTE: the output file is opened in append mode, so an existing file is extended, not overwritten.
open(my $outFh, '>>', $outfile) or die "$outfile: $!";
print {$outFh} "<parameters>\n";

my $inType = "";       #section being read: "title", "desc", "narr", or "" outside any section
my $currentQuery = ""; #text collected so far for the current topic

open(my $topicFh, '<', $infile) or die "$infile: $!";
while (my $line = <$topicFh>) {
    chomp $line;
    if ($line =~ m/<num>/) {
        #reduce the line to the bare topic number
        $line =~ s/.*Number\s*:\s*//;
        $line =~ s/<\/num>//; #core18 specific
        $line =~ s/\s+//g;
        print {$outFh} "<query>\n<number>$line</number>\n<text>";
    }
    elsif ($line =~ m/<top>/) {
        $currentQuery = "";
    }
    elsif ($line =~ m/<\/top>/) {
        #the topic is complete: emit its query and leave the current section
        print {$outFh} buildIndriQuery($currentQuery, $retrievalRule, $seqDependence, \%stopwords);
        print {$outFh} "</text>\n</query>\n";
        $inType = "";
    }
    elsif ($line =~ m/<title>/) {
        #always record the section, so that title continuation lines are not
        #attributed to another section when the title is not selected
        $inType = "title";
        if ($topicType =~ m/title/) {
            $line =~ s/<title>\s*//;
            #a closing tag on the same line must not leak the word "title" into the query
            $line =~ s/<\/title>//;
            $currentQuery = $currentQuery . " " . clean($line);
        }
    }
    elsif ($line =~ m/<desc>/) {
        #the rest of the tag line is not used; only the following lines are collected
        $inType = "desc";
    }
    elsif ($line =~ m/<narr>/) {
        #the rest of the tag line is not used; only the following lines are collected
        $inType = "narr";
    }
    elsif ($line =~ m/<\/title>/ || $line =~ m/<\/desc>/ || $line =~ m/<\/narr>/) { #core18 specific
        ; #closing tags carry no query text
    }
    elsif ($inType ne "" && index($topicType, $inType) >= 0) {
        #plain substring test: a regex built from an empty $inType would silently
        #reuse the last successfully matched pattern instead of matching ""
        $currentQuery = $currentQuery . " " . clean($line);
    }
}
print {$outFh} "</parameters>\n";
#buffered write errors only surface when the handle is closed
close($outFh) or die "$outfile: $!";
close($topicFh);