forked from Yang233666/CMVC
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmetrics.py
167 lines (133 loc) · 5 KB
/
metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
'''
Implementation of different metrics used for evaluating CESI results
C: Clusters produced by algorithm
E: Gold standard cluster
'''
import itertools, sys
import random
def macroPrecision(C_clust2ele, E_ele2clust):
    '''
    Fraction of clusters whose members agree on exactly one reference cluster.

    For every cluster in ``C_clust2ele`` the reference-cluster sets of its
    members (looked up in ``E_ele2clust``) are intersected; members missing
    from ``E_ele2clust`` are skipped. The cluster is counted when that
    intersection holds exactly one reference cluster. An intersection with
    more than one entry prints an error message and is not counted.

    Callers in this file obtain recall by swapping the roles of the two
    clusterings in the arguments.

    Args:
        C_clust2ele: mapping cluster id -> iterable of elements.
        E_ele2clust: mapping element -> set of cluster ids it belongs to.

    Returns:
        ``0`` when ``C_clust2ele`` is empty, otherwise the counted clusters
        divided by the number of clusters, as a float.
    '''
    cluster_count = len(C_clust2ele)
    matched_clusters = 0

    for members in C_clust2ele.values():
        # Reference sets of the members that are known to the other clustering.
        known_sets = [E_ele2clust[m] for m in members if m in E_ele2clust]

        if known_sets:
            shared = known_sets[0]
            for other in known_sets[1:]:
                shared = shared.intersection(other)
        else:
            shared = set()

        overlap = len(shared)
        if overlap == 1:
            matched_clusters += 1
        elif overlap > 1:
            print('ERROR In Clustering micro!!!')

    if cluster_count == 0:
        return 0
    return float(matched_clusters) / float(cluster_count)
def microPrecision(C_clust2ele, E_ele2clust):
    '''
    Element-weighted purity of the clusters in ``C_clust2ele``.

    For each cluster, every member found in ``E_ele2clust`` votes for each
    reference cluster it belongs to; the largest vote count is credited to
    the cluster. Members missing from ``E_ele2clust`` cast no vote but still
    count towards the denominator.

    Callers in this file obtain recall by swapping the roles of the two
    clusterings in the arguments.

    Args:
        C_clust2ele: mapping cluster id -> sized iterable of elements.
        E_ele2clust: mapping element -> iterable of cluster ids.

    Returns:
        ``0`` when there are no elements at all, otherwise the summed
        per-cluster best vote counts divided by the total number of
        elements, as a float.
    '''
    credited = 0
    element_total = 0

    for members in C_clust2ele.values():
        element_total += len(members)

        votes = {}
        for member in members:
            if member in E_ele2clust:
                for label in E_ele2clust[member]:
                    votes[label] = votes.get(label, 0) + 1

        # A cluster with no known members contributes nothing.
        credited += max(votes.values(), default=0)

    if element_total == 0:
        return 0
    return float(credited) / float(element_total)
def pairPrecision(C_clust2ele, E_ele2clust):
    '''
    Fraction of within-cluster element pairs that share exactly one reference cluster.

    Every unordered pair of members of every cluster in ``C_clust2ele`` is
    counted in the denominator. A pair is a hit only when both members are
    present in ``E_ele2clust`` and their reference-cluster sets intersect in
    exactly one cluster.

    Args:
        C_clust2ele: mapping cluster id -> iterable of elements.
        E_ele2clust: mapping element -> set of cluster ids.

    Returns:
        ``0`` when no pairs exist, otherwise hits divided by pairs as a float.
    '''
    hits = 0
    pair_total = 0

    for members in C_clust2ele.values():
        for left, right in itertools.combinations(members, 2):
            pair_total += 1
            if left in E_ele2clust and right in E_ele2clust:
                common = E_ele2clust[left].intersection(E_ele2clust[right])
                if len(common) == 1:
                    hits += 1

    if pair_total == 0:
        return 0
    return float(hits) / float(pair_total)
def streaming_combination_pairs(cluster, max_pairs_from_cluster):
    '''
    Randomly sample distinct unordered element pairs from ``cluster``.

    At most ``max_pairs_from_cluster`` pairs are returned, and never more
    than the number of distinct pairs the cluster contains. A pair never
    combines a position with itself, and the same two positions are never
    returned twice (in either order).

    Args:
        cluster: iterable of elements; it is copied into a list so that
            sets and other unordered containers can be indexed.
        max_pairs_from_cluster: upper bound on the number of pairs returned.

    Returns:
        A list of ``(element, element)`` tuples. Empty when the cluster has
        fewer than two elements or the bound is not positive.
    '''
    members = list(cluster)
    size = len(members)
    total_pairs = size * (size - 1) // 2
    wanted = min(max_pairs_from_cluster, total_pairs)
    if wanted <= 0:
        return []

    if total_pairs <= 2 * wanted:
        # Dense request: enumerating every pair costs at most twice the
        # output size and avoids a long tail of rejected draws.
        return random.sample(list(itertools.combinations(members, 2)), wanted)

    # Sparse request: rejection-sample index pairs. Less than half of the
    # pair space is requested, so most draws are accepted.
    chosen = set()
    while len(chosen) < wanted:
        i = random.randrange(size)
        j = random.randrange(size)
        if i == j:
            continue
        chosen.add((i, j) if i < j else (j, i))
    return [(members[i], members[j]) for i, j in chosen]
def capped_combinations(cluster, max_pairs_from_cluster):
    '''
    Yield the element pairs of ``cluster``, sampling when there are too many.

    When the number of unordered pairs in ``cluster`` does not exceed
    ``max_pairs_from_cluster`` every pair is produced via
    ``itertools.combinations``; otherwise the work is delegated to
    ``streaming_combination_pairs``.

    Args:
        cluster: sized iterable of elements.
        max_pairs_from_cluster: threshold on the number of pairs.

    Returns:
        An iterable of 2-tuples of elements.
    '''
    size = len(cluster)
    pair_count = size * (size - 1) / 2
    if pair_count <= max_pairs_from_cluster:
        return itertools.combinations(cluster, 2)
    return streaming_combination_pairs(cluster, max_pairs_from_cluster)
def pairwiseMetric(C_clust2ele, E_ele2clust, E_clust2ent, max_pairs_from_cluster=100000):
    '''
    Pairwise precision and recall of clustering C against clustering E.

    Pairs are taken per cluster through ``capped_combinations``, so a
    cluster contributes at most ``max_pairs_from_cluster`` pairs when it is
    too large to enumerate. A pair from C is a hit when both elements are
    present in ``E_ele2clust`` and their cluster sets there overlap.

    Args:
        C_clust2ele: mapping cluster id -> elements for clustering C.
        E_ele2clust: mapping element -> set of cluster ids in clustering E.
        E_clust2ent: mapping cluster id -> elements for clustering E.
        max_pairs_from_cluster: per-cluster cap handed to
            ``capped_combinations``.

    Returns:
        ``(precision, recall)`` where precision is hits over the C pairs
        visited and recall is hits over the E pairs visited. When either
        pair count is zero, ``(1e-6, 1e-6)`` is returned instead.
    '''
    hits = 0
    c_pair_total = 0
    for members in C_clust2ele.values():
        for first, second in capped_combinations(members, max_pairs_from_cluster):
            c_pair_total += 1
            if first not in E_ele2clust or second not in E_ele2clust:
                continue
            if len(E_ele2clust[first].intersection(E_ele2clust[second])) > 0:
                hits += 1

    # Only the number of pairs matters for the reference side.
    e_pair_total = 0
    for members in E_clust2ent.values():
        e_pair_total += sum(1 for _ in capped_combinations(members, max_pairs_from_cluster))

    if c_pair_total == 0 or e_pair_total == 0:
        return 1e-6, 1e-6

    precision = float(hits) / float(c_pair_total)
    recall = float(hits) / float(e_pair_total)
    return precision, recall
def calcF1(prec, recall):
    '''
    Harmonic mean of precision and recall.

    Args:
        prec: precision value.
        recall: recall value.

    Returns:
        ``0`` when the two values sum to zero, otherwise
        ``2 * prec * recall / (prec + recall)``.
    '''
    denominator = prec + recall
    if denominator == 0:
        return 0
    return 2 * (prec * recall) / denominator
def microF1(C_ele2clust, C_clust2ele, E_ele2clust, E_clust2ent):
    '''
    Micro F1 between clustering C and clustering E.

    Precision is ``microPrecision`` of C's clusters against E's element
    map; recall is the same measure with the two clusterings swapped.

    Args:
        C_ele2clust: mapping element -> clusters in C.
        C_clust2ele: mapping cluster -> elements in C.
        E_ele2clust: mapping element -> clusters in E.
        E_clust2ent: mapping cluster -> elements in E.

    Returns:
        The value of ``calcF1`` for that precision and recall.
    '''
    return calcF1(
        microPrecision(C_clust2ele, E_ele2clust),
        microPrecision(E_clust2ent, C_ele2clust),
    )
def macroF1(C_ele2clust, C_clust2ele, E_ele2clust, E_clust2ent):
    '''
    Macro F1 between clustering C and clustering E.

    Precision is ``macroPrecision`` of C's clusters against E's element
    map; recall is the same measure with the two clusterings swapped.

    Args:
        C_ele2clust: mapping element -> clusters in C.
        C_clust2ele: mapping cluster -> elements in C.
        E_ele2clust: mapping element -> clusters in E.
        E_clust2ent: mapping cluster -> elements in E.

    Returns:
        The value of ``calcF1`` for that precision and recall.
    '''
    return calcF1(
        macroPrecision(C_clust2ele, E_ele2clust),
        macroPrecision(E_clust2ent, C_ele2clust),
    )
def pairF1(C_ele2clust, C_clust2ele, E_ele2clust, E_clust2ent):
    '''
    Pairwise F1 between clustering C and clustering E.

    Precision and recall both come from a single ``pairwiseMetric`` call
    using its default per-cluster pair cap. ``C_ele2clust`` is accepted for
    signature parity with the other F1 helpers but is not used.

    Args:
        C_ele2clust: mapping element -> clusters in C (unused).
        C_clust2ele: mapping cluster -> elements in C.
        E_ele2clust: mapping element -> clusters in E.
        E_clust2ent: mapping cluster -> elements in E.

    Returns:
        The value of ``calcF1`` for the pairwise precision and recall.
    '''
    precision, recall = pairwiseMetric(C_clust2ele, E_ele2clust, E_clust2ent)
    return calcF1(precision, recall)
def evaluate(C_ele2clust, C_clust2ele, E_ele2clust, E_clust2ent):
    '''
    Compute macro, micro and pairwise precision / recall / F1 in one call.

    Macro and micro recall are obtained by calling the corresponding
    precision function with the two clusterings swapped. Pairwise precision
    and recall come from ``pairwiseMetric`` with its default pair cap.

    Args:
        C_ele2clust: mapping element -> clusters in C.
        C_clust2ele: mapping cluster -> elements in C.
        E_ele2clust: mapping element -> clusters in E.
        E_clust2ent: mapping cluster -> elements in E.

    Returns:
        A dict with the keys ``macro_prec``, ``macro_recall``, ``macro_f1``,
        ``micro_prec``, ``micro_recall``, ``micro_f1``, ``pair_prec``,
        ``pair_recall`` and ``pair_f1`` (in that order), each rounded to
        four decimal places.
    '''
    macro_p = macroPrecision(C_clust2ele, E_ele2clust)
    macro_r = macroPrecision(E_clust2ent, C_ele2clust)
    macro_f = calcF1(macro_p, macro_r)

    micro_p = microPrecision(C_clust2ele, E_ele2clust)
    micro_r = microPrecision(E_clust2ent, C_ele2clust)
    micro_f = calcF1(micro_p, micro_r)

    pair_p, pair_r = pairwiseMetric(C_clust2ele, E_ele2clust, E_clust2ent)
    pair_f = calcF1(pair_p, pair_r)

    scores = (
        ('macro_prec', macro_p),
        ('macro_recall', macro_r),
        ('macro_f1', macro_f),
        ('micro_prec', micro_p),
        ('micro_recall', micro_r),
        ('micro_f1', micro_f),
        ('pair_prec', pair_p),
        ('pair_recall', pair_r),
        ('pair_f1', pair_f),
    )
    return {name: round(value, 4) for name, value in scores}