# co_occur_filter.pl
# Forked from wangruinlp/gbwe.
# This is for filtering the co-occurrence graph using PMI information.
use strict;
use locale;
# Accumulators shared by the three phases of this script:
#   %freq       pivot word              -> summed frequency
#   %cooc       pivot -> co-occurrent   -> summed co-occurrence count
#   %coocproba  pivot -> co-occurrent   -> association score (filled later)
my %cooc;
my %coocproba;
my %freq;

# Parse one input line (already chomped) and add its counts to the
# accumulators passed by reference.
#
# The line is matched as  <pivot> TAB <digits> TAB <rest>.  <rest> is split
# on runs of commas/TABs and every entry is split on '[' into a name and a
# count (presumably entries look like "word[count]" -- TODO confirm against
# the producer of the input).
#
# Parameters:
#   $line     the text of the line, without its trailing newline
#   $freq_of  hash ref: pivot -> frequency, updated in place
#   $cooc_of  hash ref: pivot -> co-occurrent -> count, updated in place
#
# Returns 1 when the line was accepted as a pivot line, 0 otherwise.
sub accumulate_line {
    my ( $line, $freq_of, $cooc_of ) = @_;

    # Capture variables are only meaningful after a successful match, so
    # take the captures from the match itself and bail out when it fails.
    # NOTE(review): the first character class excludes '[', '^' and TAB; if
    # only TAB was meant it should read [^\t].  The pattern is also not
    # anchored.  It is left unchanged here -- confirm against real data.
    my ( $pivot, $raw_freq, $rest ) = $line =~ /([^[^\t]*)\t([0-9]*)\t(.*)/
        or return 0;

    # Keep only pivots longer than one character seen more than 30 times.
    my $pivot_freq = int($raw_freq);
    return 0 unless length($pivot) > 1 && $pivot_freq > 30;

    $freq_of->{$pivot} += $pivot_freq;

    foreach my $entry ( split /[,\t]+/, $rest ) {
        my ( $cooccurrent, $raw_count ) = split /\[/, $entry;

        # An entry without a '[' part counts as zero.
        my $count = defined $raw_count ? int($raw_count) : 0;

        if ( $count > 2 && length($cooccurrent) > 1 ) {
            $cooc_of->{$pivot}{$cooccurrent} += $count;
        }
        elsif ( $count <= 2 ) {

            # Stop at the first low count; this presumably relies on the
            # entries being listed by decreasing count -- TODO confirm.
            last;
        }

        # Otherwise: count is high enough but the name is a single
        # character (or empty) -- skip it and keep scanning.
    }

    return 1;
}

# Phase 1: read the whole graph from STDIN.
while ( my $ligne = <STDIN> ) {
    chomp $ligne;
    accumulate_line( $ligne, \%freq, \%cooc );
}
# Fill %$score_of with an association score for every (pivot, co-occurrent)
# pair of %$cooc_of whose two words both have a positive frequency in
# %$freq_of:
#
#     score = cooc(pivot, word) * 10000 / ( freq(pivot) * freq(word) )
#
# Pairs for which either frequency is missing or not positive get no entry.
# Entries already present in %$score_of are kept unless overwritten.
#
# Parameters:
#   $cooc_of   hash ref: pivot -> co-occurrent -> count        (read only)
#   $freq_of   hash ref: word -> frequency                     (read only)
#   $score_of  hash ref: pivot -> co-occurrent -> score        (written)
sub score_cooccurrences {
    my ( $cooc_of, $freq_of, $score_of ) = @_;

    foreach my $pivot ( keys %$cooc_of ) {

        # The pivot's frequency does not change inside the inner loop, so
        # it is checked once per pivot.
        my $pivot_freq = $freq_of->{$pivot};
        next unless defined $pivot_freq && $pivot_freq > 0;

        # Walk the nested hash through its reference; copying it into a
        # temporary hash for every pivot is wasted time and memory.
        my $count_of = $cooc_of->{$pivot};
        foreach my $cooccurrent ( keys %$count_of ) {
            my $other_freq = $freq_of->{$cooccurrent};
            next unless defined $other_freq && $other_freq > 0;

            $score_of->{$pivot}{$cooccurrent} =
                $count_of->{$cooccurrent} * 10000 / ( $pivot_freq * $other_freq );
        }
    }

    return;
}

# Phase 2: turn raw co-occurrence counts into association scores.
score_cooccurrences( \%cooc, \%freq, \%coocproba );
# Return the co-occurrents of one pivot that are worth printing, best
# first.
#
# A co-occurrent is kept when its name is longer than three characters and
# its score is greater than 5.  The result is ordered by decreasing score;
# equal scores are ordered by name with cmp.
#
# Parameters:
#   $score_of  hash ref: co-occurrent -> score (read only)
#
# Returns the ordered list of names (call in list context).
sub ranked_neighbours {
    my ($score_of) = @_;

    # Filter before sorting: the selection does not depend on the order,
    # so the output is the same and fewer elements have to be sorted.
    my @kept = grep { length($_) > 3 && $score_of->{$_} > 5 } keys %$score_of;

    my @ranked =
        sort { $score_of->{$b} <=> $score_of->{$a} or $a cmp $b } @kept;

    return @ranked;
}

# The raw counts are not needed any more.
%cooc = ();

# Phase 3: one output line per pivot longer than three characters:
#   <pivot> TAB ':' TAB <word> TAB <word> TAB ... NEWLINE
# Every word is followed by a TAB, so a line with words ends in TAB NEWLINE.
foreach my $pivot ( sort keys %coocproba ) {
    next unless length($pivot) > 3;

    print $pivot, "\t:\t",
        ( map { $_, "\t" } ranked_neighbours( $coocproba{$pivot} ) ),
        "\n";
}