-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbackground_processing.py
105 lines (81 loc) · 3.26 KB
/
background_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import os.path
from collections import Counter
import pickle
import pygtrie as trie
import datrie
import string
# Computes end-grams of the background data.
def get_background_endgrams(directory=""):
    """Build (or load a cached) Counter of end-grams over the background queries.

    Reads ``background.txt`` under *directory* (plain path prefix, including any
    trailing separator), computes the end-grams of every query line via
    ``compute_end_grams``, and caches the resulting Counter as a pickle in
    ``background_ngrams.txt`` so later calls can skip the computation.

    Args:
        directory: Path prefix where the background files live.

    Returns:
        collections.Counter mapping end-gram string -> frequency.

    Raises:
        FileNotFoundError: if the background data file does not exist.
    """
    background_data_file = directory + "background.txt"
    background_ngrams_file = directory + "background_ngrams.txt"
    if not os.path.isfile(background_data_file):
        # FileNotFoundError is more specific than the bare Exception the
        # original raised, and still a subclass of Exception for old callers.
        raise FileNotFoundError("Background data is not available. Please run the `parse_dataset.py` script.")
    # Reuse the cached counter if it was already computed.
    if os.path.isfile(background_ngrams_file):
        print("N-grams were already computed, now loading the file.")
        with open(background_ngrams_file, 'rb') as cached:  # `with` avoids the leaked handle of pickle.load(open(...))
            return pickle.load(cached)
    print("Generating n-grams and storing to file.")
    # Load data: one query per line; the context manager guarantees the file is closed.
    with open(background_data_file, 'r') as f:
        background_data = f.read().splitlines()
    # Accumulate end-gram frequencies over all queries.
    counter = Counter()
    total = len(background_data)  # hoisted out of the loop
    for j, query in enumerate(background_data, start=1):
        if j % 100000 == 0:  # progress report every 100k queries
            print(f"{j / total * 100:.2f}%")
        counter.update(compute_end_grams(query))
    # Cache the counter so subsequent calls return immediately.
    with open(background_ngrams_file, 'wb') as outputfile:
        pickle.dump(counter, outputfile)
    return counter
# Computes popularity (frequency) scores of the background queries.
def get_background_popularity(directory=""):
    """Build (or load a cached) Counter of full-query frequencies.

    Reads ``background.txt`` under *directory* (plain path prefix, including any
    trailing separator), counts how often each exact query line occurs, and
    caches the resulting Counter as a pickle in ``background_popularity.txt``.

    Args:
        directory: Path prefix where the background files live.

    Returns:
        collections.Counter mapping query string -> occurrence count.

    Raises:
        FileNotFoundError: if the background data file does not exist.
    """
    background_data_file = directory + "background.txt"
    background_popularity_file = directory + "background_popularity.txt"
    if not os.path.isfile(background_data_file):
        # FileNotFoundError is more specific than the bare Exception the
        # original raised, and still a subclass of Exception for old callers.
        raise FileNotFoundError("Background data is not available. Please run the `parse_dataset.py` script.")
    # Reuse the cached counter if it was already computed.
    if os.path.isfile(background_popularity_file):
        print("Popular queries were already computed, now loading the file.")
        with open(background_popularity_file, 'rb') as cached:  # `with` avoids the leaked handle of pickle.load(open(...))
            return pickle.load(cached)
    print("Generating popularity scores and storing to file.")
    # Load data: one query per line; the context manager guarantees the file is closed.
    with open(background_data_file, 'r') as f:
        background_data = f.read().splitlines()
    # Count exact-query occurrences.
    counter = Counter()
    total = len(background_data)  # hoisted out of the loop
    for j, query in enumerate(background_data, start=1):
        if j % 100000 == 0:  # progress report every 100k queries
            print(f"Building popularity counter: {j / total * 100:.2f}%")
        counter[query] += 1  # clearer than the original's update({query}) single-element set
    # Cache the counter so subsequent calls return immediately.
    with open(background_popularity_file, 'wb') as outputfile:
        pickle.dump(counter, outputfile)
    return counter
def get_prefix_tree(counter, filename):
    """Build (or load a cached) datrie prefix tree from *counter*.

    Args:
        counter: collections.Counter mapping string -> count. Keys must be
            drawn from the trie alphabet below (lowercase ASCII letters,
            whitespace, digits); other characters would make insertion fail.
        filename: Path of the trie cache file.

    Returns:
        datrie.Trie mapping each counter key to its count.
    """
    # Reuse the saved tree if it was already computed.
    if os.path.isfile(filename):
        # Bug fix: the original printed the literal "(unknown)" — the f-string
        # had no placeholder. Interpolate the actual cache filename instead.
        print(f"{filename} was already computed, now loading the file.")
        return datrie.Trie.load(filename)
    # NOTE(review): alphabet excludes uppercase letters and punctuation —
    # assumes queries are already lowercased/normalized; confirm upstream.
    tree = datrie.Trie(string.ascii_lowercase + string.whitespace + string.digits)
    entries = counter.most_common()
    total = len(entries)  # hoisted out of the loop
    for j, (el, count) in enumerate(entries, start=1):
        if j % 100000 == 0:  # progress report every 100k entries
            print(f"Build prefix-tree: {j / total * 100:.2f}%")
        tree[el] = count
    # Store tree so subsequent calls can load it from disk.
    tree.save(filename)
    return tree
# For a query computes all end-grams.
def compute_end_grams(query):
    """Return the set of all word-suffix sequences ("end-grams") of *query*.

    Example: "new york city" -> {"city", "york city", "new york city"}.
    An empty or whitespace-only query yields the empty set.
    """
    words = query.split()
    endgrams = set()
    # Every suffix starting at word index `start` is one end-gram.
    for start in range(len(words)):
        endgrams.add(' '.join(words[start:]))
    return endgrams