naive_bayes_model.py
import collections as c
import numpy as np
from pre_processing import pre_process
from joblib import dump
########################### NAIVE BAYES MODEL ######################################
# LanguageModel class represents a language model with the following capabilities:
# represent an n-gram model
# apply laplace smoothing (optionally)
# train on given sentences
# score given sentences (average probability) or files (average probability and standard deviation)
# generate one or any number of sentences
class LanguageModel:
# the n-gram of this LanguageModel, default 2
n = 2
# if laplace smoothing will be applied, default False
laplace = False
# known vocabulary (n-grams) and their frequency of occurrence based on training data
vocab_counts = {}
# the counts of all known tokens used in laplace smoothing
token_counts = {}
# the total number of n-grams this was trained on
total_n_grams = 0
# known vocabulary (n-grams) and their probability of occurrence based on training data
# this probability is used for scoring while the vocab_counts and total_n_grams are
# used to calculate probability in sentence generation
vocab_probabilities = {}
# constants to define pseudo-word tokens
# access via self.UNK, for instance
UNK = "<UNK>"
SENT_BEGIN = "<s>"
SENT_END = "</s>"
def __init__(self, n_gram, is_laplace_smoothing):
"""Initializes an untrained LanguageModel
Parameters:
n_gram (int): the n-gram order of the language model to create, must be 1 or greater
is_laplace_smoothing (bool): whether or not to use Laplace smoothing
"""
self.n = n_gram
self.laplace = is_laplace_smoothing
def train(self, training_data):
"""Trains the language model on the given data. Assumes that the given data
has tokens that are white-space separated, has one sentence per line, and
that the sentences begin with <s> and end with </s>
Parameters:
training_data (List of List of string): the training data to read
Returns:
None
Saves learned data in the vocab_probabilities field
"""
# create n-grams and tokens from list of list of tokens
n_grams = []
bigrams_backoff = []
tokens = []
for sentence in training_data:
n_grams_sent = self.make_n_grams(sentence)
# if this is an n > 2 model, also collect bigrams so that backoff probabilities can be computed
if self.n > 2:
n_grams_sent_backoff = self.make_n_grams(sentence, True)
for bigram in n_grams_sent_backoff:
bigrams_backoff.append(bigram)
for n_gram in n_grams_sent:
n_grams.append(n_gram)
for token in sentence:
tokens.append(token)
# record the total number of n-grams trained on so that probabilities can be calculated
self.total_n_grams = len(n_grams)
# count the frequency of occurrence for each n-gram and token and save these values on the LanguageModel
self.vocab_counts = c.Counter(n_grams)
self.token_counts = c.Counter(tokens)
# calculate probabilities for each n_gram, applying laplace smoothing if specified
# and save this value on the Language Model so it can be used in scoring
self.vocab_probabilities = self.calculate_probabilities(
self.vocab_counts, self.total_n_grams)
# if this is an n>2 model, this will be used when there is an unknown n-gram context
if len(bigrams_backoff) > 0:
self.total_bigrams = len(bigrams_backoff)
self.bigram_counts = c.Counter(bigrams_backoff)
self.bigram_probabilities = self.calculate_probabilities(
self.bigram_counts, self.total_bigrams)
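# A small illustrative example of what train() produces (toy data, not from the
# original project): for a bigram model trained on the single tokenized sentence
# ["<s>", "the", "cat", "</s>"], the learned fields would be
#   total_n_grams       -> 3
#   vocab_counts        -> {("<s>", "the"): 1, ("the", "cat"): 1, ("cat", "</s>"): 1}
#   token_counts        -> {"<s>": 1, "the": 1, "cat": 1, "</s>": 1}
#   vocab_probabilities -> each bigram's count divided by the count of its one-word prefix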
def make_n_grams(self, tokens, backoff=False):
"""Creates tuples of n length representing the n-grams in the given tokens
Parameters:
tokens (list of str): the list of individual tokens to be grouped into n-grams
Returns:
n_grams (list of tuples of str): the list of n-grams created from the given tokens
"""
n_grams = []
# unigrams are just tokens and will never need to use backoff
if self.n == 1:
return tokens
n = self.n
# generate bigrams
if backoff:
n = 2
# generate n-grams where n is specified by the model
for i in range(n - 1, len(tokens)):
n_gram = [tokens[j] for j in range(i - (n - 1), i + 1)]
n_grams.append(tuple(n_gram))
return n_grams
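# Illustrative example (made-up tokens): for a trigram model (self.n == 3),
#   make_n_grams(["<s>", "<s>", "the", "cat", "</s>"])
# returns
#   [("<s>", "<s>", "the"), ("<s>", "the", "cat"), ("the", "cat", "</s>")]
# and with backoff=True the same call would instead return the bigrams
#   [("<s>", "<s>"), ("<s>", "the"), ("the", "cat"), ("cat", "</s>")].
# For a unigram model the tokens are returned unchanged.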
def calculate_probabilities(self, counts, num_n_grams):
"""Given the counts for each n_gram and the total number of n_grams calculate_probabilities calculates the
probability of occurence for each n_gram, using laplace smoothing if it is specified by the model
Parameters:
counts (Counter): mapping of n_grams to their frequency of occurence
num_n_grams (int): total number of n_grams seen
Returns:
probabilities, a mapping of n_grams to their probability of occurence
"""
if self.n == 1:
return self.calculate_probabilities_for_unigram(counts, num_n_grams)
else:
return self.calculate_probabilities_for_n_gram(counts)
def calculate_probabilities_for_unigram(self, counts, num_unigrams):
"""Given the counts for each unigram and the total number of unigrams this calculates the
probability of occurence for each unigram, using laplace smoothing if it is specified by the model
Parameters:
counts (Counter): mapping of unigrams to their frequency of occurence
num_unigrams (int): total number of unigrams seen
Returns:
probabilities, a mapping of unigrams to their probability of occurence
"""
probabilities = {}
if self.laplace:
for key in counts.keys():
probabilities[key] = (
(counts[key] + 1) / (num_unigrams + len(counts.keys()) - 1))
else:
for key in counts.keys():
probabilities[key] = (counts[key] / num_unigrams)
return probabilities
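# Worked example (toy counts, following the unigram formula as implemented above):
# with counts = {"<s>": 1, "the": 2, "cat": 1, "</s>": 1} and num_unigrams = 5,
#   without smoothing: P("the") = 2 / 5 = 0.4
#   with Laplace:      P("the") = (2 + 1) / (5 + 4 - 1) = 3 / 8 = 0.375
# where 4 is the number of distinct unigrams in counts.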
def calculate_probabilities_for_n_gram(self, counts):
"""Given the counts for all n_grams this calculates the probability of occurence for each n_gram,
using laplace smoothing if it is specified by the model
Parameters:
counts (Counter): mapping of n_grams to their frequency of occurence
Returns:
probabilities, a mapping of n_grams to their probability of occurence
"""
probabilities = {}
# total up the frequency of all n-grams that share the same prefix (context)
counts_of_similar_n_grams = c.Counter()
for key in counts.keys():
    counts_of_similar_n_grams[key[:-1]] += counts[key]
# divide count of this n-gram by total counts of all similar n-grams
for key in counts.keys():
if self.laplace:
probabilities[key] = (
counts[key] + 1) / (counts_of_similar_n_grams[key[:-1]] + len(self.token_counts.keys()))
else:
# avoid divide by zero error
if (counts_of_similar_n_grams[key[:-1]] == 0):
probabilities[key] = 0
else:
probabilities[key] = counts[key] / \
counts_of_similar_n_grams[key[:-1]]
return probabilities
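# Worked example (toy counts): for a bigram model with
#   counts = {("the", "cat"): 2, ("the", "dog"): 1}
# the prefix ("the",) has a total count of 3, so without smoothing
#   P(("the", "cat")) = 2 / 3 and P(("the", "dog")) = 1 / 3,
# and with Laplace smoothing the numerator gains 1 and the denominator gains the
# number of known token types, e.g. (2 + 1) / (3 + len(self.token_counts)).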
def generate_sentence(self, init):
"""Generates a single sentence from a trained language model using the Shannon technique.
Returns:
sentence: list of tokens, the generated sentence
"""
sentence = []
# add the necessary number of SENT_BEGIN tokens
# and randomly sample the probability distribution if this is a unigram model
if self.n == 1:
sentence.append(self.SENT_BEGIN)
sentence.append(init)
k = []
v = []
# don't include the sentence begin token
total_grams = self.total_n_grams - \
self.vocab_counts[self.SENT_BEGIN]
for key in self.vocab_counts.keys():
if key != self.SENT_BEGIN:
k.append(key)
v.append(self.vocab_counts[key] / total_grams)
while sentence[-1] != self.SENT_END:
sentence.append(np.random.choice(k, p=v))
# randomly sample the probability distribution if this is an n-gram model with n > 1
else:
# add the necessary number of SENT_BEGIN
for i in range(self.n - 1):
sentence.append(self.SENT_BEGIN)
sentence.append(init)
# while there is not yet a full n-gram of context, back off to a bigram model:
# sample the next token given only the last token in the sentence so far,
# until a full n-gram context is available.
while len(sentence) < self.n + 1:
ngram_sofar = sentence[-1]
# find all possible next words and their probabilities
possible_words_probs = self.get_possible_next_words_probs(
ngram_sofar, True)
# sample the probability distribution (at 0: words, at 1: probabilities) and add to the sentence
sentence.append(np.random.choice(
possible_words_probs[0], p=possible_words_probs[1]))
while sentence[-1] != self.SENT_END:
# get the prefix, n-gram so far
end = len(sentence)
start = end - (self.n - 1)
ngram_sofar = sentence[start:end]
# find all possible next words and their probabilities
possible_words_probs = self.get_possible_next_words_probs(
ngram_sofar)
# sample the probability distribution (at 0: words, at 1: probabilities) and add to the sentence
sentence.append(np.random.choice(
possible_words_probs[0], p=possible_words_probs[1]))
return sentence
def get_possible_next_words_probs(self, ngram_sofar, backoff=False):
''' Given the n-gram so far, generate a list of possible next words and calculate their probabilities.
Parameters:
ngram_sofar: the n-gram prefix generated so far (a single token when backoff is used)
backoff (bool): if True, match against the bigram counts instead of the full n-gram counts
Returns: a list of the possible next words and a corresponding list of their probabilities
'''
words = []
counts = []
probs = []
total_count = 0
dictionary = self.vocab_counts.items()
if backoff == True:
dictionary = self.bigram_counts.items()
# first pass
# find all possible n-grams given the n-gram so far
for key, count in dictionary:
# identify the prefix of the n-gram key, all words in n-gram but last
key_start = []
if backoff == True:
key_start = key[0]
else:
for w in key[:-1]:
key_start.append(w)
# if the prefix of the n-gram key matches the ngram_sofar
# then add the last word of the key to the list of possible words, and the count to counts
if key_start == ngram_sofar:
words.append(key[-1])
counts.append(count)
total_count += count
# divide the count of each possible word by the total count of all possible words
for count in counts:
probs.append(count / total_count)
return (words, probs)
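# Illustrative example (toy counts): for a trigram model with
#   vocab_counts = {("the", "cat", "sat"): 2, ("the", "cat", "ran"): 1, ("cat", "sat", "</s>"): 2}
# calling get_possible_next_words_probs(["the", "cat"]) would return
#   (["sat", "ran"], [2/3, 1/3])
# while a backoff call such as get_possible_next_words_probs("cat", True) matches
# against the stored bigram counts using only the single preceding token.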
def generate(self, n, topic_words):
"""Generates n sentences from a trained language model using the Shannon technique.
Parameters:
n (int): the number of sentences to generate
Returns:
list: a list containing strings, one per generated sentence
"""
text = []
for i in range(n):
text.append(self.generate_sentence(topic_words[i]))
return text
def verify_words(self, words):
"""Verifies that this model knows the given words,
if a word is not known, then remove it from the returned list
Parameters: list of words to check if in vocab
Returns: list of given words in vocab
"""
known_words = list(self.token_counts.keys())
given_known_words = []
for w in words:
if w in known_words:
given_known_words.append(w)
return given_known_words
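# Minimal usage sketch (illustrative only; the toy corpus and seed word below are
# made up, and the double <s> padding assumes sentences arrive padded with n-1
# begin tokens as the train() docstring expects). A real run would train on the
# pre-processed poetry data as in main() below.
def _example_usage():
    toy_corpus = [
        ["<s>", "<s>", "the", "cat", "sat", "</s>"],
        ["<s>", "<s>", "the", "dog", "sat", "</s>"],
    ]
    lm = LanguageModel(3, False)
    lm.train(toy_corpus)
    # keep only seed words that the model has actually seen
    seeds = lm.verify_words(["the"])
    # returns a list with one generated sentence, itself a list of tokens
    return lm.generate(1, seeds)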
######################## main method for training ##################################
def main():
"""
Train the trigram language model on the poetry dataset and save it to disk
"""
training_data_tri = pre_process("PoetryFoundationData.csv", 3)
print("done pre-processing data")
my_clf = LanguageModel(3, False)
my_clf.train(training_data_tri)
print("done training trigram model")
dump(my_clf, "my_clf.pkl")
print("done saving trigram model")
if __name__ == '__main__':
main()
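# A trained model saved by main() can later be restored with joblib, for example:
#   from joblib import load
#   my_clf = load("my_clf.pkl")
#   sentences = my_clf.generate(1, my_clf.verify_words(["love"]))
# ("love" is only an illustrative seed word; any token in the training vocabulary works.)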