-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_candidates.py
97 lines (72 loc) · 2.56 KB
/
generate_candidates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from enum import IntEnum
import os.path
from collections import Counter
from background_processing import get_background_popularity
from background_processing import get_prefix_tree
class CandidateScenario(IntEnum):
    """Integer-valued selector for the candidate-generation scenario.

    NOTE(review): the meaning of each member is not documented in this file;
    the names suggest "no suffixes", "10k suffixes" and "100k suffixes" --
    confirm against the code that consumes them.
    """

    NO_SUFFIXES = 1
    TENK_SUFFIXES = 2
    # Spelling ("HUNDERDK") is part of the public name and is kept as-is.
    HUNDERDK_SUFFIXES = 3
class DataSet(IntEnum):
    """Integer-valued identifier for one of the dataset files.

    ``load_dataset`` maps each member to the text file noted beside it.
    """

    TRAINING = 1    # training.txt
    TEST = 2        # test.txt
    VALIDATION = 3  # validation.txt
    BACKGROUND = 4  # background.txt
def load_dataset(set: DataSet):
    """Read one dataset file from the working directory and return its lines.

    Args:
        set: The dataset to load. (The name shadows the built-in ``set``; it
            is kept because it is part of the function's signature.)

    Returns:
        A list with one string per line of the file, line endings removed.
        The first line of the file is always dropped.
        NOTE(review): presumably that first line is a header written by
        ``parse_dataset.py`` -- confirm.

    Raises:
        ValueError: If ``set`` is not one of the ``DataSet`` members.
        FileNotFoundError: If the matching file does not exist.
    """
    file_names = {
        DataSet.TRAINING: "training.txt",
        DataSet.VALIDATION: "validation.txt",
        DataSet.TEST: "test.txt",
        DataSet.BACKGROUND: "background.txt",
    }
    # Only genuine DataSet members are accepted (a bare int is rejected).
    file_name = file_names.get(set) if isinstance(set, DataSet) else None
    if file_name is None:
        raise ValueError(f"Dataset not found: {set}")

    if not os.path.isfile(file_name):
        raise FileNotFoundError(
            f"{file_name} is not available. Please run the `parse_dataset.py` script."
        )

    print(f"Now loading {file_name}")
    # The context manager closes the file even if reading fails.
    # NOTE(review): no explicit encoding is given, so the platform default is
    # used -- confirm it matches what `parse_dataset.py` writes.
    with open(file_name, 'r') as f:
        data = f.read().splitlines()
    return data[1:]
def _query_prefixes(query):
    """Yield the prefixes of ``query`` that get scored, shortest first.

    The first prefix is the query's first space-delimited word. If a second
    word exists, every further prefix extends the previous one by a single
    character of that word; the separating space is appended but is not
    scored on its own. Text after the second word never becomes part of a
    prefix.
    """
    words = query.split(" ", 2)
    prefix = words[0]
    yield prefix
    if len(words) <= 1:
        return
    # words[1] lies between the first two spaces, so it cannot contain a
    # space itself; only the separator has to be skipped.
    prefix += " "
    for char in words[1]:
        prefix += char
        yield prefix


def most_popular_completion(scenario: CandidateScenario, dataset: DataSet):
    """Evaluate most-popular completion on a dataset and print its MRR.

    For each query line of the dataset, candidate completions are looked up
    in the background-popularity prefix tree for a series of prefixes (see
    ``_query_prefixes``), and the reciprocal rank of the full query among the
    candidates is accumulated. The mean over all scored prefixes is printed
    as ``MRR``. Progress and the running mean are printed every 1000 queries.

    Args:
        scenario: Accepted for interface compatibility; this function does
            not read it.
        dataset: Which dataset to evaluate; passed to ``load_dataset``.

    Returns:
        None. Results are reported on standard output only.
    """
    data = load_dataset(dataset)
    total = len(data)
    print(f"Size of set is {total}.")

    background_popularity = get_background_popularity()
    background_popularity_tree = get_prefix_tree(
        background_popularity, "background_popularity_tree.txt"
    )

    reciprocal_rank_sum = 0
    scored = 0
    for position, query in enumerate(data, start=1):
        if position % 1000 == 0:
            # `scored` is non-zero here: every earlier query contributed at
            # least one scored prefix.
            print(reciprocal_rank_sum / scored)
            print(f"Evaluating MostPopularCompletion: {position / total * 100:.2f}%")
        for prefix in _query_prefixes(query):
            candidates = get_full_query_candidates(background_popularity_tree, prefix)
            reciprocal_rank_sum += computeReciprocalRank(query, candidates)
            scored += 1

    # An empty dataset scores nothing; report 0.0 instead of dividing by zero.
    mean = reciprocal_rank_sum / scored if scored else 0.0
    print(f"MRR: {mean}")
def computeReciprocalRank(query, candidates):
    """Return the reciprocal rank of ``query`` within ``candidates``.

    Args:
        query: The value to look for.
        candidates: An iterable of two-item entries whose first item is the
            candidate compared against ``query``; the second item is ignored.

    Returns:
        ``1.0 / rank`` for the first entry whose candidate equals ``query``
        (ranks start at 1), or ``0`` when no entry matches.
    """
    for rank, (candidate, _) in enumerate(candidates, start=1):
        if query == candidate:
            return 1.0 / rank
    return 0
def get_full_query_candidates(background_popularity, prefix, max = 10):
    """Return the highest-valued entries stored under ``prefix``.

    Args:
        background_popularity: Object whose ``items(prefix)`` yields
            ``(key, value)`` pairs.
            NOTE(review): presumably a prefix tree of query -> popularity
            count; confirm against ``get_prefix_tree``.
        prefix: Passed unchanged to ``background_popularity.items``.
        max: Upper bound on the number of entries returned.

    Returns:
        A list of at most ``max`` ``(key, value)`` tuples, ordered from the
        largest value to the smallest.
    """
    matches = dict(background_popularity.items(prefix))
    ranked = Counter(matches)
    return ranked.most_common(max)
# Script entry point: evaluate the NO_SUFFIXES scenario on the background set.
# NOTE(review): this call is unguarded, so it also runs whenever the module is
# imported; consider an `if __name__ == "__main__":` guard.
most_popular_completion(
    scenario=CandidateScenario.NO_SUFFIXES,
    dataset=DataSet.BACKGROUND,
)