Commit: python 3
Lazy Programmer committed Dec 24, 2017
1 parent e4d5fb7 commit 271399a
Showing 14 changed files with 357 additions and 286 deletions.
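Every file below gets the same Python 2 → 3 treatment: print statements become print() calls, xrange becomes range (supplied by builtins on Python 2), and dict.iteritems() becomes the iteritems() helper from future.utils. A minimal sketch of that compatibility pattern, separate from the commit itself:

# Sketch of the py2/py3 shim this commit applies in each file.
# Requires the `future` package: sudo pip install -U future
from __future__ import print_function, division
from future.utils import iteritems  # dict.iteritems() on py2, dict.items() on py3
from builtins import range          # lazy py3-style range on py2

counts = {'a': 2, 'b': 3}
total = sum(counts.values())
for k, c in iteritems(counts):   # instead of counts.iteritems()
    print(k, c / total)          # true division via __future__
for i in range(3):               # instead of xrange(3)
    print(i)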
37 changes: 28 additions & 9 deletions hmm_class/frost.py
@@ -2,17 +2,36 @@
# https://udemy.com/unsupervised-machine-learning-hidden-markov-models-in-python
# http://lazyprogrammer.me
# Model and generate Robert Frost poems.
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future



import numpy as np
import string
+import sys


initial = {} # start of a phrase
second_word = {}
transitions = {}

-def remove_punctuation(s):
+# unfortunately these work different ways
+def remove_punctuation_2(s):
    return s.translate(None, string.punctuation)

+def remove_punctuation_3(s):
+    return s.translate(str.maketrans('','',string.punctuation))
+
+if sys.version.startswith('2'):
+    remove_punctuation = remove_punctuation_2
+else:
+    remove_punctuation = remove_punctuation_3


def add2dict(d, k, v):
    if k not in d:
        d[k] = []
@@ -22,7 +41,7 @@ def add2dict(d, k, v):
    tokens = remove_punctuation(line.rstrip().lower()).split()

    T = len(tokens)
-    for i in xrange(T):
+    for i in range(T):
        t = tokens[i]
        if i == 0:
            # measure the distribution of the first word
@@ -43,7 +62,7 @@

# normalize the distributions
initial_total = sum(initial.values())
-for t, c in initial.iteritems():
+for t, c in iteritems(initial):
    initial[t] = c / initial_total

def list2pdict(ts):
@@ -52,15 +71,15 @@ def list2pdict(ts):
    n = len(ts)
    for t in ts:
        d[t] = d.get(t, 0.) + 1
-    for t, c in d.iteritems():
+    for t, c in iteritems(d):
        d[t] = c / n
    return d

-for t_1, ts in second_word.iteritems():
+for t_1, ts in iteritems(second_word):
    # replace list with dictionary of probabilities
    second_word[t_1] = list2pdict(ts)

-for k, ts in transitions.iteritems():
+for k, ts in iteritems(transitions):
    transitions[k] = list2pdict(ts)

# generate 4 lines
@@ -69,14 +88,14 @@ def sample_word(d):
    p0 = np.random.random()
    # print "p0:", p0
    cumulative = 0
-    for t, p in d.iteritems():
+    for t, p in iteritems(d):
        cumulative += p
        if p0 < cumulative:
            return t
    assert(False) # should never get here

def generate():
-    for i in xrange(4):
+    for i in range(4):
        sentence = []

        # initial word
@@ -95,7 +114,7 @@ def generate():
            sentence.append(w2)
            w0 = w1
            w1 = w2
-        print ' '.join(sentence)
+        print(' '.join(sentence))

generate()

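frost.py keeps its distributions as {token: probability} dicts and samples from them with a cumulative sum. Two quick checks of the ideas in this diff, using toy values of my own rather than anything from the file:

import numpy as np
import string

# The py3 remove_punctuation from the diff, verified on a made-up string:
table = str.maketrans('', '', string.punctuation)
assert "don't stop!".translate(table) == "dont stop"

# A toy distribution in the same {token: probability} format frost.py builds;
# np.random.choice does the same job as the file's cumulative-sum sample_word.
d = {'the': 0.5, 'a': 0.3, 'END': 0.2}

def sample_word_np(d):
    tokens = list(d.keys())
    probs = list(d.values())
    return np.random.choice(tokens, p=probs)

print(sample_word_np(d))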
11 changes: 9 additions & 2 deletions hmm_class/generate_ht.py
@@ -2,6 +2,13 @@
# https://udemy.com/unsupervised-machine-learning-hidden-markov-models-in-python
# http://lazyprogrammer.me
# Generate discrete data from an HMM.
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future


import numpy as np


@@ -16,7 +23,7 @@ def generate_sequence(N):
    s = np.random.choice(xrange(M), p=pi) # initial state
    x = np.random.choice(xrange(V), p=B[s]) # initial observation
    sequence = [x]
-    for n in xrange(N-1):
+    for n in range(N-1):
        s = np.random.choice(xrange(M), p=A[s]) # next state
        x = np.random.choice(xrange(V), p=B[s]) # next observation
        sequence.append(x)
@@ -25,7 +32,7 @@

def main():
    with open('coin_data.txt', 'w') as f:
-        for n in xrange(50):
+        for n in range(50):
            sequence = generate_sequence(30)
            sequence = ''.join(symbol_map[s] for s in sequence)
            print sequence
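The hunks above fix the loop counters, but the visible context lines still call np.random.choice(xrange(...)) and print sequence; whatever happens below the cut isn't shown. For reference, a fully Python 3 version of the generator, with made-up pi, A, B for a two-state coin HMM (the repo's actual parameters may differ):

import numpy as np

# Hypothetical 2-state, 2-symbol coin HMM; not the repo's actual parameters.
M, V = 2, 2
pi = np.array([0.5, 0.5])          # initial state distribution
A = np.array([[0.9, 0.1],          # state transition matrix
              [0.2, 0.8]])
B = np.array([[0.7, 0.3],          # emission probabilities P(symbol | state)
              [0.4, 0.6]])
symbol_map = ['H', 'T']

def generate_sequence(N):
    s = np.random.choice(range(M), p=pi)       # initial state
    x = np.random.choice(range(V), p=B[s])     # initial observation
    sequence = [x]
    for n in range(N - 1):
        s = np.random.choice(range(M), p=A[s])   # next state
        x = np.random.choice(range(V), p=B[s])   # next observation
        sequence.append(x)
    return sequence

print(''.join(symbol_map[x] for x in generate_sequence(30)))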
27 changes: 17 additions & 10 deletions hmm_class/hmm_classifier.py
@@ -2,11 +2,19 @@
# https://udemy.com/unsupervised-machine-learning-hidden-markov-models-in-python
# http://lazyprogrammer.me
# Demonstrate how HMMs can be used for classification.
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future



import string
import numpy as np
import matplotlib.pyplot as plt

-from hmmd_theano import HMM
+from hmmd_theano2 import HMM
from sklearn.utils import shuffle
from nltk import pos_tag, word_tokenize

@@ -18,14 +26,14 @@ def fit(self, X, Y, V):
        K = len(set(Y)) # number of classes - assume 0..K-1
        self.models = []
        self.priors = []
-        for k in xrange(K):
+        for k in range(K):
            # gather all the training data for this class
            thisX = [x for x, y in zip(X, Y) if y == k]
            C = len(thisX)
            self.priors.append(np.log(C))

            hmm = HMM(5)
-            hmm.fit(thisX, V=V, p_cost=0.1, print_period=1, learning_rate=1e-4, max_iter=100)
+            hmm.fit(thisX, V=V, print_period=1, learning_rate=1e-2, max_iter=80)
            self.models.append(hmm)

    def score(self, X, Y):
@@ -56,7 +64,7 @@ def get_data():
        for line in open(fn):
            line = line.rstrip()
            if line:
-                print line
+                print(line)
                # tokens = remove_punctuation(line.lower()).split()
                tokens = get_tags(line)
                if len(tokens) > 1:
@@ -69,26 +77,25 @@ def get_data():
                    X.append(sequence)
                    Y.append(label)
                    count += 1
-                    print count
+                    print(count)
                    if count >= 50:
                        break
-    print "Vocabulary:", word2idx.keys()
+    print("Vocabulary:", word2idx.keys())
    return X, Y, current_idx


def main():
    X, Y, V = get_data()
    # print "Finished loading data"
-    print "len(X):", len(X)
-    print "Vocabulary size:", V
+    print("len(X):", len(X))
+    print("Vocabulary size:", V)
    X, Y = shuffle(X, Y)
    N = 20 # number to test
    Xtrain, Ytrain = X[:-N], Y[:-N]
    Xtest, Ytest = X[-N:], Y[-N:]

    model = HMMClassifier()
    model.fit(Xtrain, Ytrain, V)
-    print "Score:", model.score(Xtest, Ytest)
+    print("Score:", model.score(Xtest, Ytest))


if __name__ == '__main__':
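The predict/score bodies are cut off above, but given that fit() stores one HMM per class and log class counts in self.priors, the natural decision rule is the Bayes classifier: argmax over classes of log-likelihood plus log prior. A sketch under that assumption (the likelihood method name is a guess, not taken from hmmd_theano2):

import numpy as np

def predict_one(models, priors, x):
    # models[k].log_likelihood(x) is assumed to return log P(x | class k);
    # the real method name in hmmd_theano2 may differ.
    scores = [m.log_likelihood(x) + p for m, p in zip(models, priors)]
    return int(np.argmax(scores))

def accuracy(models, priors, X, Y):
    # mean accuracy over labeled test sequences
    preds = [predict_one(models, priors, x) for x in X]
    return np.mean([p == y for p, y in zip(preds, Y)])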
(The remaining 11 changed files are not shown.)
