# https://deeplearningcourses.com/c/unsupervised-machine-learning-hidden-markov-models-in-python
# https://udemy.com/unsupervised-machine-learning-hidden-markov-models-in-python
# http://lazyprogrammer.me
# Model and generate Robert Frost poems.
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future
import numpy as np
import string
import sys

initial = {}      # distribution over the first word of a line
second_word = {}  # first word -> possible second words
transitions = {}  # (w_{t-2}, w_{t-1}) -> possible next words

# str.translate works differently in Python 2 and Python 3,
# so pick the matching punctuation stripper for this interpreter
def remove_punctuation_2(s):
    return s.translate(None, string.punctuation)

def remove_punctuation_3(s):
    return s.translate(str.maketrans('', '', string.punctuation))

if sys.version.startswith('2'):
    remove_punctuation = remove_punctuation_2
else:
    remove_punctuation = remove_punctuation_3
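
# for example, string.punctuation covers ASCII punctuation, so:
#   remove_punctuation('two roads diverged in a yellow wood,')
#   returns 'two roads diverged in a yellow wood'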

def add2dict(d, k, v):
    # append v to the list stored under k, creating the list if needed
    if k not in d:
        d[k] = []
    d[k].append(v)
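
# e.g. calling add2dict(transitions, ('two', 'roads'), 'diverged') twice yields
# {('two', 'roads'): ['diverged', 'diverged']} -- duplicates in the list encode
# counts, which list2pdict below turns into probabilities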

with open('robert_frost.txt') as f:
    for line in f:
        tokens = remove_punctuation(line.rstrip().lower()).split()

        T = len(tokens)
        for i in range(T):
            t = tokens[i]
            if i == 0:
                # measure the distribution of the first word
                initial[t] = initial.get(t, 0.) + 1
            else:
                t_1 = tokens[i-1]
                if i == T - 1:
                    # measure probability of ending the line
                    add2dict(transitions, (t_1, t), 'END')
                if i == 1:
                    # measure distribution of second word
                    # given only first word
                    add2dict(second_word, t_1, t)
                else:
                    t_2 = tokens[i-2]
                    add2dict(transitions, (t_2, t_1), t)
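
# after this pass, a line like "the grass was tall" (a hypothetical input)
# contributes:
#   initial counts 'the' once,
#   second_word maps 'the' -> ['grass'],
#   transitions maps ('the', 'grass') -> ['was'], ('grass', 'was') -> ['tall'],
#   and ('was', 'tall') -> ['END'] marks the end of the line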

# normalize the distributions
initial_total = sum(initial.values())
for t, c in iteritems(initial):
    initial[t] = c / initial_total

def list2pdict(ts):
    # turn each list of possibilities into a dictionary of probabilities
    d = {}
    n = len(ts)
    for t in ts:
        d[t] = d.get(t, 0.) + 1
    for t, c in iteritems(d):
        d[t] = c / n
    return d
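
# e.g. list2pdict(['grass', 'grass', 'trees']) -> {'grass': 2/3, 'trees': 1/3}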

for t_1, ts in iteritems(second_word):
    # replace list with dictionary of probabilities
    second_word[t_1] = list2pdict(ts)

for k, ts in iteritems(transitions):
    transitions[k] = list2pdict(ts)
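
# optional sanity check: every distribution should now sum to ~1, e.g.
#   assert abs(sum(initial.values()) - 1) < 1e-10
#   for d in transitions.values():
#       assert abs(sum(d.values()) - 1) < 1e-10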

def sample_word(d):
    # draw one token from d, which maps token -> probability, by
    # inverse-CDF sampling: accumulate probabilities until the running
    # total passes a uniform random draw
    # print("d:", d)
    p0 = np.random.random()
    # print("p0:", p0)
    cumulative = 0
    for t, p in iteritems(d):
        cumulative += p
        if p0 < cumulative:
            return t
    assert(False)  # should never get here
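
# an equivalent draw using numpy directly (a sketch; relies on dicts
# preserving insertion order, which Python 3.7+ guarantees):
#   tokens, probs = zip(*d.items())
#   np.random.choice(tokens, p=probs)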

def generate():
    # generate 4 lines
    for i in range(4):
        sentence = []

        # initial word
        w0 = sample_word(initial)
        sentence.append(w0)

        # sample second word
        w1 = sample_word(second_word[w0])
        sentence.append(w1)

        # second-order transitions until END
        while True:
            w2 = sample_word(transitions[(w0, w1)])
            if w2 == 'END':
                break
            sentence.append(w2)
            w0 = w1
            w1 = w2
        print(' '.join(sentence))

generate()
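
# for reproducible output, seed numpy first, e.g. np.random.seed(1234)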
# exercise: make them rhyme!