# -*- coding: utf-8 -*-
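# combined_classifier.py
#
# End-to-end sentiment pipeline over product-review sentences: load the
# labeled training and held-out reviews through the local parse module,
# extract structural features via n_feature, train an NLTK decision-tree
# classifier, report accuracy on a 90/10 split and on the held-out set,
# and finally write polarity labels for the unlabeled test files.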
from __future__ import division
import nltk, string, math, csv, os, random, re, numpy
wnl = nltk.WordNetLemmatizer()  # not used below; kept for feature experiments
# Project-local feature and parsing modules (only n_feature and parse are
# active below; the others feed the commented-out experiments).
import h_features
import n_feature
import sFeature
import tristan_features
import parse
def get_features(sent):
    """Extract the feature dict for a single sentence.

    Only the structural features from n_feature are active; the other
    feature sets below were tried and left commented out.
    """
    #z.update(hn_feature.n_structural_features(sent))
    z = n_feature.n_structural_features(sent)
    #z = tristan_features.char_based_features(sent)
    #z.update(tristan_features.char_based_features(sent))
    #z.update(tristan_features.syntactic_features(sent))
    #z.update(n_feature.n_structural_features(sent))
    #z.update(sFeature.sFeature(sent))
    return z
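# A hypothetical example of the expected shape (the real feature names come
# from n_feature): get_features("Great battery life!") should return a dict
# such as {"length": 3, "num_exclaims": 1, ...}, since NLTK's classifiers
# expect feature dicts mapping feature names to values.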
# Load the labeled reviews into {sentence: score} dictionaries.
train_base_path = "data/training/"
train_files = ["Canon PowerShot SD500.txt", "Canon S100.txt", "Diaper Champ.txt",
               "Hitachi router.txt", "ipod.txt", "Linksys Router.txt",
               "MicroMP3.txt", "Nokia 6600.txt", "norton.txt"]
training_data = {}
for train_file in train_files:
    train_path = os.path.join(train_base_path, train_file)
    parse.read_txt_data(train_path, training_data)

held_base_path = "data/heldout/"
held_files = ["Apex AD2600 Progressive-scan DVD player.txt", "Canon G3.txt",
              "Creative Labs Nomad Jukebox Zen Xtra 40GB.txt",
              "Nikon coolpix 4300.txt", "Nokia 6610.txt"]
held_data = {}
for held_file in held_files:
    held_path = os.path.join(held_base_path, held_file)
    parse.read_txt_data(held_path, held_data)

# Collapse the numeric sentiment scores down to polarity labels.
training_data = parse.val_to_polarity(training_data)
held_data = parse.val_to_polarity(held_data)
#testing
# Build (features, label) pairs and hold out 10% as an in-domain test split.
feature_sets = [(get_features(n), v) for (n, v) in training_data.items()]
random.shuffle(feature_sets)
size = int(len(feature_sets) * 0.9)
print "training results"
# Train on the first 90% of the shuffled data; keep the last 10% for testing.
train_set, test_set = feature_sets[:size], feature_sets[size:]
#classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier = nltk.DecisionTreeClassifier.train(train_set)
#classifier = nltk.MaxentClassifier.train(train_set)
#classifier = nltk.weka.WekaClassifier.train(train_set)
# Score every held-out sentence and collect gold vs. predicted labels.
held_test_set = [(get_features(n), v) for (n, v) in held_data.items()]
gold = []
predicted = []
print len(held_test_set)
for t in held_test_set:
    gold.append(t[1])
    predicted.append(classifier.classify(t[0]))
correct = 0.0
incorrect = 0.0
# Count a prediction as correct when both labels fall into the same bucket
# under parse.rangefy.
for i in xrange(len(predicted)):
    if parse.rangefy(gold[i]) == parse.rangefy(predicted[i]):
        correct += 1
    else:
        incorrect += 1
perc = correct / (correct + incorrect)
print correct
print incorrect
print "TRUE VALUE: %s" % perc
print nltk.classify.accuracy(classifier, test_set)
#classifier.show_most_informative_features()
print "heldout results"
# Retrain on all of the training data before scoring the held-out set.
train_set = feature_sets
classifier = nltk.DecisionTreeClassifier.train(train_set)
test_set = [(get_features(n), v) for (n, v) in held_data.items()]
print nltk.classify.accuracy(classifier, test_set)
#classifier.show_most_informative_features()
# Classify the unlabeled test files, writing one tab-separated row per
# sentence. Title lines (marked with "[t]") are written with label 0.
final_test_files = ['product1.txt', 'product2.txt', 'product3.txt']
final_test_path = "data/testdata/"
output_file_path = "data/testdata/classified_output.txt"
output = open(output_file_path, 'w')
test_file_dict = {}  # file name -> (line numbers, sentences)
for final_test_file in final_test_files:
    key_list, val_list = parse.read_test_data(os.path.join(final_test_path, final_test_file))
    test_file_dict[final_test_file] = (key_list, val_list)
print len(test_file_dict)
for file_name in final_test_files:
    key_list, val_list = test_file_dict[file_name]
    for i in xrange(len(key_list)):
        line_num = key_list[i]
        sentence = val_list[i]
        if "[t]" not in sentence:
            output.write("%s\t%s\t%s\n" % (file_name, line_num, classifier.classify(get_features(sentence))))
        else:
            output.write("%s\t%s\t%s\n" % (file_name, line_num, 0))
output.close()
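# Each row of classified_output.txt should look something like the line
# below (hypothetical values; the label strings depend on what
# parse.val_to_polarity produces):
#   product1.txt    42    positive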
# Quick sanity checks on a few hand-written sentences.
samples = [
    "This is the worst product ever",
    "This is the worst product ever. I will never buy this piece of shit. terrible. terrible terrible! I mean, I can't believe how bad it is!. blah!",
    "This is pretty damn awesome. I would definitely buy again",
    "It is better than the competition, by far!",
]
for sample in samples:
    print sample
    print classifier.classify(get_features(sample))
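# To reproduce, run from the project root with the data/ tree and the local
# feature modules in place (Python 2 and a classic NLTK release assumed):
#   $ python combined_classifier.py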