-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclassifier.py
executable file
·56 lines (45 loc) · 2.13 KB
/
classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env python
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
LEMMATIZER = WordNetLemmatizer()
STOP_SET = set(stopwords.words('english'))
def get_words(filename):
"""returns a generator that allows iteration over the words in the
document, so that the entire document doesn't need to be loaded into
memory. Read up about generators here: http://www.dabeaz.com/generators/
If a word is contained in the set of stop words, then it is ignored.
Examples of stop words include "the", "it", "so"
"""
for line in open('data/%s.txt' % filename):
for word in word_tokenize(line):
if word in STOP_SET:
continue
yield word
def lemmatize(word, lemmatizer=LEMMATIZER):
"""Lowercases and lemmatizes the given word using the given lemmatizer.
For information about lemmatization: http://en.wikipedia.org/wiki/Lemmatisation
"""
return lemmatizer.lemmatize(word.lower())
def word_features(filename):
"""iterate over a text file and construct a dictionary of lemmatized words
as keys, and a given key's existence in the dictionary as a feature by
which to train a model.
"""
words = get_words(filename)
return {"contains(%s)" % lemmatize(word): True for word in words}
if __name__=="__main__":
nytimes = [(word_features("nytimes"), "Nytimes")]
nytimes2 = [(word_features("nytimes2"), "Nytimes")]
nytimes3 = [(word_features("nytimes3"), "Nytimes")]
nytimes4 = [(word_features("nytimes4"), "Nytimes")]
nltk_data = [(word_features("nltk"), "NLTK")]
nltk_data2 = [(word_features("nltk2"), "NLTK")]
nltk_data3 = [(word_features("nltk3"), "NLTK")]
feature_set = nytimes + nytimes2 + nytimes3 + nytimes4 + nltk_data + nltk_data2 + nltk_data3
classifier = nltk.NaiveBayesClassifier.train(feature_set)
#classifier.show_most_informative_features(5)
print 'nltk4: ', classifier.classify(word_features("nltk4"))
print 'nytimes5: ', classifier.classify(word_features("nytimes5"))
print 'jezebel: ', classifier.classify(word_features("jezebel"))