# Searcher.py
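# Query-side searcher for the inverted index: stems the query words, pulls
# each word's posting list from its index barrel, buckets documents by how
# many query words they contain, and breaks ties with word-proximity scoring.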
import itertools
import numpy as np
from nltk.stem.snowball import SnowballStemmer
from ast import literal_eval

# stemmer used to reduce query words to the same stemmed form as the lexicon entries
snow_stemmer = SnowballStemmer(language='english')


# Scores how close pairs of query words appear within each document and
# returns final_list with those documents appended, sorted by rank (highest first).
def proximity_rank(final_list, fixdata, fixrank):
    # walk every document in the temporary forward index built for this query
    for d_id, fix in fixdata.items():
        nums = len(fix)
        # for each document, pair up the hit lists of every two query words
        for i in range(nums - 1):
            for j in range(i + 1, nums):
                carts = list(itertools.product(fix[i], fix[j]))
                tmin = 10000  # sentinel: smallest distance seen in the title
                cmin = 10000  # sentinel: smallest distance seen in the content
                # a hit is (field, position); field 10 marks the title, field 1
                # the content. Find the closest co-occurrence in each field.
                for k in carts:
                    if k[0][0] != k[1][0]:
                        continue
                    if k[0][0] == 10:
                        if abs(k[0][1] - k[1][1]) < tmin:
                            tmin = abs(k[0][1] - k[1][1])
                    if k[0][0] == 1:
                        if abs(k[0][1] - k[1][1]) < cmin:
                            cmin = abs(k[0][1] - k[1][1])
                # award rank by the shortest distance between the two words;
                # title proximity is worth more than content proximity
                if tmin <= 3:
                    fixrank[d_id] += 25
                elif tmin <= 5:
                    fixrank[d_id] += 20
                elif tmin <= 8:
                    fixrank[d_id] += 15
                if cmin <= 3:
                    fixrank[d_id] += 15
                elif cmin <= 5:
                    fixrank[d_id] += 10
                elif cmin <= 8:
                    fixrank[d_id] += 5
    # sort by rank in descending order and append to final_list
    nprank = np.array(list(fixrank.items()))
    nprank = nprank[nprank[:, 1].argsort()]
    nprank = np.flip(nprank, 0)
    final_list = np.append(final_list, nprank, axis=0)
    return final_list
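

# Worked example (hypothetical numbers): if two query words have hit lists
# [(10, 2), (1, 7)] and [(10, 4), (1, 30)], the title pair (10, 2)/(10, 4)
# gives tmin = |2 - 4| = 2, earning the top +25 title bonus, while the content
# pair gives cmin = |7 - 30| = 23, too far apart for any content bonus.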


# Returns the urls of the documents matching the given query, sorted by rank.
def WordSearch(word, filestreams, lexicon, docids, accumulativefreq):
    results = []
    # documents bucketed by how many query words they match
    thirdfix = {}   # highest priority: three or more words matched
    secondfix = {}  # second priority: two words matched
    firstfix = {}   # lowest priority: one word matched
    # rank of each document within its priority bucket
    thirdrank = {}
    secondrank = {}
    firstrank = {}
    # normalise the query: lower-case, stem, and keep only the unique words
    # that appear in the lexicon
    all_words = word.split(" ")
    all_words = [snow_stemmer.stem(wd.lower()) for wd in all_words]
    main_words = [wd for wd in all_words if wd in lexicon]
    main_words = list(dict.fromkeys(main_words))
    # for each query word, collect the documents it occurs in; a document's
    # priority rises with the number of query words it contains
    for wd in main_words:
        wid = lexicon[wd][0]
        barrelid = wid // 400
        offsetid = wid % 400
        # locate the word's line range in its inverted-index barrel with the
        # help of the cumulative frequencies
        if offsetid == 0:
            start = 0
            end = accumulativefreq[barrelid][offsetid]
        else:
            start = accumulativefreq[barrelid][offsetid - 1]
            end = accumulativefreq[barrelid][offsetid]
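        # e.g. (hypothetical values) with accumulativefreq[barrelid] = [4, 9, 13],
        # the word at offsetid 1 owns lines 4..8 of the barrel file: the running
        # totals turn per-word posting counts into [start, end) line ranges.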
        # weight words by rarity: words occurring in fewer documents score higher
        quality_rank = int(100000 / lexicon[wd][1])
        # read this word's slice of the inverted index to get each document's
        # rank and hit list
        for line in itertools.islice(filestreams[barrelid], start, end):
            did, _, rank, hits = line.split('#')
            did, rank, hits = int(did), int(rank), literal_eval(hits)
            rank += quality_rank
            # promote the document's priority each time another query word hits it
            if did in thirdfix:
                thirdfix[did].append(hits)
                thirdrank[did] += rank
            elif did in secondfix:
                thirdfix[did] = secondfix[did]
                thirdfix[did].append(hits)
                thirdrank[did] = secondrank[did] + rank
                secondfix.pop(did)
                secondrank.pop(did)
            elif did in firstfix:
                secondfix[did] = firstfix[did]
                secondfix[did].append(hits)
                secondrank[did] = firstrank[did] + rank
                firstfix.pop(did)
                firstrank.pop(did)
            else:
                firstfix[did] = [hits]
                firstrank[did] = rank
        # rewind the barrel in case another query word lives in the same file
        filestreams[barrelid].seek(0)
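    # At this point every candidate document sits in exactly one of firstfix,
    # secondfix, or thirdfix, according to whether it matched one, two, or
    # three-or-more query words.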
    final_list = np.array([[-1, -1]])  # sentinel row; real rows are (doc-id, rank)
    # rank the highest-priority bucket first; if the list still holds 50 rows
    # or fewer (sentinel included), fall through to the lower-priority buckets
    if len(thirdrank) != 0:
        final_list = proximity_rank(final_list, fixdata=thirdfix, fixrank=thirdrank)
    if len(final_list) <= 50 and len(secondrank) != 0:
        final_list = proximity_rank(final_list, fixdata=secondfix, fixrank=secondrank)
    if len(final_list) <= 50 and len(firstrank) != 0:
        final_list = proximity_rank(final_list, fixdata=firstfix, fixrank=firstrank)
    final_length = len(final_list)
    if final_length == 1:
        results.append("No match for the search!")
    else:
        # skip the sentinel row and return at most the top 49 documents
        for i in range(1, min(final_length, 50)):
            results.append(docids[final_list[i][0]][1])
            results.append(docids[final_list[i][0]][0])
    return results
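

# Minimal usage sketch (hypothetical paths and on-disk formats; the real
# loading code depends on how the indexer writes the lexicon, the doc-id
# table, the cumulative frequencies, and the barrel files):
if __name__ == '__main__':
    import json
    with open('lexicon.json') as f:           # word -> [word_id, doc_frequency]
        lexicon = json.load(f)
    with open('docids.json') as f:            # doc_id -> [url, title] (order assumed)
        docids = {int(k): v for k, v in json.load(f).items()}
    with open('accumulativefreq.json') as f:  # per-barrel cumulative line counts
        accumulativefreq = json.load(f)
    # one open stream per barrel; word_id // 400 selects the barrel
    filestreams = [open('barrel_%d.txt' % b) for b in range(len(accumulativefreq))]
    for entry in WordSearch('search engine', filestreams, lexicon, docids, accumulativefreq):
        print(entry)
    for stream in filestreams:
        stream.close()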