#Andrew Gasiorowski
import re as _re
import pandas as _pd
import random
import sys
from collections import Counter as _Counter
from math import ceil as _ceil
from math import log as _log
#this function takes a file name and returns a list of tuples
#tuple[0] is answer instance
#tuple[1] is word sense
#tuple[2] is context text
def extract_tuples(file_name):
    with open(file_name) as file:
        corpus = _re.findall(r'<answer instance="(.+?)" senseid="(.+?)"/>.+?<context>.(.+?).</context>', file.read(), _re.DOTALL)
    return corpus
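#a minimal sketch of the expected input, assuming a senseval-style lexical sample file
#(the instance id, sense id, and text below are hypothetical):
#  <answer instance="plant.001" senseid="living"/>
#  <context>
#  the <head>plant</head> grew towards the light
#  </context>
#extract_tuples would return [('plant.001', 'living', 'the <head>plant</head> grew towards the light')]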
#This function takes a list of tuples where the context (tuple[2]) is a single string.
#It returns a new list where tuple[0] and tuple[1] are the same but tuple[2] is now a list of words
def extract_context(corpus):
    enhanced_corpus = []
    word_senses = []
    for instance in corpus:
        #for every training sample
        if instance[1] not in word_senses:
            #if this word sense has not been seen before, add it to the list of known word senses
            word_senses.append(instance[1])
        words = _re.findall(r'(?=(?<!<|>)\b)(\w+)(?=(?!>)\b)', instance[2])
        #extract all the words that aren't the word we're trying to disambiguate (or its tag words)
        new_tuple = (instance[0], instance[1], words)
        enhanced_corpus.append(new_tuple)
    return enhanced_corpus, word_senses
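#continuing the hypothetical example above, extract_context would turn
#  ('plant.001', 'living', 'the <head>plant</head> grew towards the light')
#into
#  ('plant.001', 'living', ['the', 'grew', 'towards', 'the', 'light'])
#since the regex skips tokens adjacent to '<' or '>', i.e. the tagged head word itself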
#This function generates the indexes for folds in a sequential, top-to-bottom pass over the input file
def generate_k_fold_validation_indexes(corpus, senses, k):
    num_samples = len(corpus)
    indexes = []
    if num_samples % k != 0:
        #if we cannot evenly divide the training samples
        num_samples_not_last_fold = _ceil(num_samples / k)
        #the equal-size folds are of size ceil(#samples / #folds)
        num_samples_last_fold = num_samples - ((k - 1) * num_samples_not_last_fold)
        #the last fold takes all the remaining samples
        this_start = 0
        this_end = num_samples_not_last_fold
        for i in range(0, k - 1):
            #for each of the equal-sized folds, append a list of indexes
            indexes.append(list(range(this_start, this_end)))
            this_start = this_end
            this_end = this_end + num_samples_not_last_fold
            #the two lines above advance the index window to the next fold
        indexes.append(list(range(this_start, this_start + num_samples_last_fold)))
        #append a final list of indexes covering the remaining samples
    else:
        #if we can evenly divide our training samples
        num_samples_per_fold = num_samples // k
        this_start = 0
        this_end = num_samples_per_fold
        for i in range(0, k):
            #for each of the folds, append a list of indexes
            indexes.append(list(range(this_start, this_end)))
            this_start = this_end
            this_end = this_end + num_samples_per_fold
    return indexes
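#a small worked example: with 11 samples and k = 3, ceil(11 / 3) = 4,
#so the first two folds get indexes [0..3] and [4..7],
#and the last fold gets the remaining 11 - 2*4 = 3 indexes, [8, 9, 10]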
#this function is a randomized alternative to the previous index algorithm
#If we have M samples we generate M unique random numbers between 0 and M-1
#These numbers are stored in a list; we iterate through the list of randoms and generate the indexes of the folds
#It returns a list of lists where each nested list contains the indexes for a fold
def generate_rands(corpus, senses, k):
    num_samples = len(corpus)
    raw_indexes = _pd.Series(random.sample(range(num_samples), num_samples))
    #this generates the unique random numbers
    indexes = []
    this_start = 0
    if num_samples % k == 0:
        #if we can evenly split the number of samples
        num_samples_per_fold = num_samples // k
        #every fold will have the same number of samples
        this_end = num_samples_per_fold
        #this_end marks the list index of the end of the current fold
        for i in range(0, k):
            indexes.append(raw_indexes[this_start:this_end])
            #append this list of indexes as the ith fold
            this_start = this_end
            #the start of the next fold is the end of this fold
            this_end = this_end + num_samples_per_fold
            #the end of the next fold is the end of this fold plus the fold size
    else:
        #if we cannot evenly split the number of samples
        num_samples_not_last_fold = _ceil(num_samples / k)
        num_samples_last_fold = num_samples - ((k - 1) * num_samples_not_last_fold)
        #the last fold will contain the remainder of the examples
        this_end = num_samples_not_last_fold
        for i in range(0, k - 1):
            #assign all the evenly sized folds
            indexes.append(raw_indexes[this_start:this_end])
            this_start = this_end
            this_end = this_end + num_samples_not_last_fold
        indexes.append(raw_indexes[this_start:this_start + num_samples_last_fold])
        #assign the remaining examples to the last fold
    return indexes
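#a hypothetical run: with 6 samples and k = 3, random.sample might return
#[4, 0, 5, 2, 1, 3], which would be split into the folds [4, 0], [5, 2], [1, 3]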
#this function generates stratified folds: it spreads each sense's samples so that the
#per-sense counts are as even as possible across all folds
def even_folds(corpus, senses, k):
    sense_list_in_order = []
    for sample in corpus:
        sense_list_in_order.append(sample[1])
    num_per_sense = _Counter(sense_list_in_order)
    count_per_fold = {}
    for sense in senses:
        total = num_per_sense[sense]
        not_last_fold = _ceil(total / k)
        total_not_last = (k - 1) * not_last_fold
        last_fold = total - total_not_last
        #note: this assumes every sense has enough samples that last_fold is non-negative
        count_per_fold[sense] = (not_last_fold, last_fold)
    index_by_sense = {}
    sense_index_counter = {}
    for sense in senses:
        index_by_sense[sense] = []
        sense_index_counter[sense] = 0
    for idx, sample in enumerate(corpus):
        index_by_sense[sample[1]].append(idx)
    indexes = []
    for fold in range(k):
        fold_indexes = []
        for sense in senses:
            #we have n lists of indexes for n senses
            #we need to subset each of the lists into the appropriate fold sizes
            if fold < (k - 1):
                samples_this_fold = count_per_fold[sense][0]
            else:
                samples_this_fold = count_per_fold[sense][1]
            for _ in range(samples_this_fold):
                fold_indexes.append(index_by_sense[sense][sense_index_counter[sense]])
                sense_index_counter[sense] += 1
        indexes.append(fold_indexes)
    return indexes
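#a small worked example: with 9 samples (6 of sense A, 3 of sense B) and k = 3,
#ceil(6/3) = 2 and ceil(3/3) = 1, so every fold receives 2 A-samples and 1 B-sample,
#keeping the sense distribution the same across folds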
#Takes the corpus and the indices of the folds. Returns the corpus split by folds
def split_data_into_folds(corpus, indices):
    folds_list = []
    corpus_series = _pd.Series(corpus)
    #convert the list to a Series for easy subsetting
    for idx_list in indices:
        #for every fold of indexes, append the corresponding samples to folds_list
        folds_list.append(corpus_series[idx_list])
    return _pd.Series(folds_list)
#takes the corpus and senses, returns a dictionary of dictionaries (Counters)
#the outer dict key is a sense (meaning there is a dictionary for each sense) whose value is a Counter
#the nested dict key is a word and its value is the count of that word
def create_bag_of_words(corpus, senses):
    unique_word_counts = _Counter()
    senses_counter = _Counter()
    bow_dict = {}
    for sense in senses:
        bow_dict[sense] = _Counter()
        #make a Counter object for every word sense
    for instance in corpus:
        #for every sample
        for word in instance[2]:
            #for every word in the sample
            bow_dict[instance[1]][word] += 1
            #increment the count of this word for this sense
            unique_word_counts[word] += 1
            #increment the number of times we've seen this word overall
        senses_counter[instance[1]] += 1
        #increment the number of times we've seen this sense
    return bow_dict, senses_counter, unique_word_counts
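#a minimal sketch of the return values, assuming a hypothetical two-sample corpus
#[('x.1', 'A', ['big', 'dog']), ('x.2', 'B', ['big', 'cat'])]:
#  bow_dict           -> {'A': Counter({'big': 1, 'dog': 1}), 'B': Counter({'big': 1, 'cat': 1})}
#  senses_counter     -> Counter({'A': 1, 'B': 1})
#  unique_word_counts -> Counter({'big': 2, 'dog': 1, 'cat': 1})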
#bag_dict -> dictionary{key: sense, value: Counter{key: word, value: count_of_word}} -> gives count of word for sense
#test_set[n] -> Series[n][0]: sample_num, Series[n][1]: actual_sense, Series[n][2]: list of words
#senses -> list of senses
#num_training -> number of training samples
#examples_per_sense -> dictionary{key: sense, value: count_for_sense}
#sum(examples_per_sense) = len(training_set)
#unique_words_count -> dictionary{key: word, value: count_independent_of_sense}
#len(unique_words_count) = number of unique words in the training set
def new_naive_bayes(bag_dict, test_set, senses, num_training, examples_per_sense, unique_words_count, fold_num):
    #p(sense | words) is proportional to p(sense) * p(word[0] | sense) * ... * p(word[n] | sense)
    #p(sense) = count(examples_per_sense) / num_training_examples
    #p(word | sense) = (count(word, sense) + 1) / (count(sense) + V)  <- add-one (Laplace) smoothing
    #so we need the following counts:
    #count(examples_per_sense) -> examples_per_sense[sense]
    #num_training_examples -> num_training
    #count(word, sense) -> bag_dict[sense][word]
    #count(sense) -> word_count_by_sense[sense]
    #V -> _v
    #we must construct count(sense)
    word_count_by_sense = {}
    for sense in senses:
        word_count_by_sense[sense] = sum(bag_dict[sense].values())
        #the total number of words per sense is the sum of the counts of each word
    #end of constructing count(sense)
    #we must construct V, the vocabulary size
    _v = len(unique_words_count)
    #end of constructing V
    file_output_set = []
    file_output_set.append('fold ' + fold_num)
    answer_set = []
    #answer_set[0]=sample_num answer_set[1]=actual_output answer_set[2]=predicted_output
    for sentence in test_set:
        prob_sense = {}
        for sense in senses:
            prob_sense[sense] = _log(examples_per_sense[sense] / num_training, 10)
            #log probabilities are summed rather than multiplying raw probabilities, to avoid underflow
            for word in sentence[2]:
                prob_sense[sense] += _log((bag_dict[sense][word] + 1) / (word_count_by_sense[sense] + _v), 10)
        prediction = max(prob_sense, key=prob_sense.get)
        answer_set.append((sentence[0], sentence[1], prediction))
        file_output_set.append((sentence[0], prediction))
    print('fold', fold_num, str(round(calculate_accuracy(answer_set) * 100, 2)), '%')
    return answer_set, file_output_set
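#a worked example of the add-one smoothed estimate, with made-up counts: if 'light'
#occurs 3 times with sense 'living', sense 'living' spans 100 word tokens in training,
#and the vocabulary holds 50 unique words, then
#  p('light' | 'living') = (3 + 1) / (100 + 50) = 4/150 ≈ 0.0267
#a word never seen with a sense still gets (0 + 1) / 150, so no term ever zeroes out the product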
#calculates the accuracy for a single fold, or across all folds, depending on what is passed to it
def calculate_accuracy(result):
    num_samples = len(result)
    num_correct = 0
    wrong = []
    #wrong collects the sample numbers of misclassified examples (kept for debugging; not returned)
    for sample in result:
        if sample[1] == sample[2]:
            num_correct += 1
        else:
            wrong.append(sample[0])
    return num_correct / num_samples
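#a quick example of the expected input and output:
#  calculate_accuracy([('x.1', 'A', 'A'), ('x.2', 'B', 'A')]) -> 0.5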
#this writes the output to a file
def write_to_file(_disamb_word, _output_tup):
    out_file = _disamb_word + '.out'
    template = "{0:15}{1:5}"
    #left-aligned columns: 15 characters for the instance id, 5 for the predicted sense
    with open(out_file, 'w') as f:
        for item in _output_tup:
            if isinstance(item, str):
                f.write(item + '\n')
            else:
                line_tup = (item[0], item[1])
                f.write(template.format(*line_tup) + '\n')
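#with the template above, a fold header and a prediction line would look like this
#(hypothetical instance id and sense):
#  fold 0
#  plant.001      living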
file_name = sys.argv[1]
#gets the input filename from the command line
num_folds = 5
#number of folds
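#a hypothetical invocation (the input file name here is only an example):
#  python WSD.py plant.wsd
#this would cross-validate on plant.wsd and write the predictions to plant.wsd.out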
corpus = extract_tuples(file_name)
#extracts the sample number, the correct sense, and all the text
corpus, senses = extract_context(corpus)
#tokenizes the text for each sample and removes the sense word
indexes_of_folds = generate_k_fold_validation_indexes(corpus, senses, num_folds)
#indexes_of_folds = generate_rands(corpus, senses, num_folds)
#indexes_of_folds = even_folds(corpus, senses, num_folds)
#all three of these work; generate_k_fold_validation_indexes generates the folds in accordance with how they were presented in class
corpus_series_of_folds = split_data_into_folds(corpus, indexes_of_folds)
#this function takes the indexes and splits the data into its folds
results = []
write_to_file_list = []
#just some containers to hold the answers
for idx, fold in enumerate(corpus_series_of_folds):
    #for every fold in the set
    train_set = _pd.Series(dtype=object)
    test_set = fold
    #the testing set for this iteration is 'fold'
    for i, fld in enumerate(corpus_series_of_folds):
        #go through the folds
        if i != idx:
            train_set = _pd.concat([train_set, fld], ignore_index=True)
            #the training set is everything that is not the test set
    num_training_examples = len(train_set)
    word_freq_dict, examples_per_sense, unique_words_count = create_bag_of_words(train_set, senses)
    #creates a dictionary whose key is a sense and whose value is another dictionary (Counter -> dict subclass)
    #the nested dictionary maps each word to its count
    this_fold_answers, file_output = new_naive_bayes(word_freq_dict, test_set, senses, num_training_examples, examples_per_sense, unique_words_count, str(idx))
    results.extend(this_fold_answers)
    write_to_file_list.extend(file_output)
accuracy = calculate_accuracy(results)
write_to_file(file_name, write_to_file_list)
print(file_name, str(round(accuracy*100, 2)),'%')