# h_features.py
from __future__ import division
import os
import random

import nltk
import parse
'''(ii) Feature format: We will use a dict as our feature format. To avoid key-name
collisions, each contributor prefixes every key with their initial plus an
underscore ("t_len_sent", "h_adverb", etc.). We also normalize numeric values.
Because the dictionaries cannot collide on keys, combining them is
straightforward.
Training data is also a dictionary: it maps each sentence to its positive or
negative sentiment score, e.g. {"I hate ipods": -2, "Music quality is great": 2}.
It is up to the individual to convert the values to polarity (a helper function
is available in parse.py).
'''
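
# A minimal sketch of the dict-merge convention described above. The
# 'extractors' argument is hypothetical: in practice it would hold each
# contributor's prefixed feature function (e.g. get_function_features below).
def combine_features(sent, extractors):
    '''Merge per-contributor feature dicts into one; the initial-plus-underscore
    key prefixes guarantee update() never overwrites an existing key.'''
    combined = {}
    for extract in extractors:
        combined.update(extract(sent))
    return combined
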
# train_base_path = "data/training/"
# train_files = ["Canon PowerShot SD500.txt", "Canon S100.txt", "Diaper Champ.txt", "Hitachi router.txt", "ipod.txt", "Linksys Router.txt", "MicroMP3.txt","Nokia 6600.txt", "norton.txt"]
# training_data = {}
# for train_file in train_files:
# train_path = os.path.join(train_base_path, train_file)
# parse.read_txt_data(train_path, training_data)
# held_base_path = "data/heldout/"
# held_files = ["Apex AD2600 Progressive-scan DVD player.txt", "Canon G3.txt", "Creative Labs Nomad Jukebox Zen Xtra 40GB.txt", "Nikon coolpix 4300.txt", "Nokia 6610.txt"]
# held_data = {}
# for held_file in held_files:
# held_path = os.path.join(held_base_path, held_file)
# parse.read_txt_data(held_path, held_data)
# training_data = parse.val_to_polarity(training_data)
# held_data = parse.val_to_polarity(held_data)
#(i)
''' Takes a string as input, tokenizes it, and POS-tags the tokens.
Returns a dictionary with the following word-count features:
h_determiner, h_conjunction, h_interjection, h_adjective, h_adjective_comparitive,
h_adjective_superlative, h_adverb, h_adverb_comparitive, h_adverb_superlative,
h_gender_specific, h_female_specific, h_male_specific,
plus a "_normalized" variant and a 0/1 "_exists" flag for each.
'''
def get_function_features(sent):
    # Initialize every raw count to zero so the normalization pass below
    # sees a value for each feature key.
    func_feats = dict.fromkeys([
        'h_determiner', 'h_conjunction', 'h_interjection',
        'h_adjective', 'h_adjective_comparitive', 'h_adjective_superlative',
        'h_adverb', 'h_adverb_comparitive', 'h_adverb_superlative',
        'h_gender_specific', 'h_female_specific', 'h_male_specific'], 0)
female_words = set(["she", "her","herself"])
male_words = set(["he", "him","himself"])
#use backoff tagger
text = nltk.word_tokenize(sent)
pos = nltk.pos_tag(text)
    for word in pos:
        # Lowercase so sentence-initial "She"/"He" still match the word sets.
        if word[0].lower() in female_words:
            func_feats['h_female_specific'] += 1
            func_feats['h_gender_specific'] += 1
        elif word[0].lower() in male_words:
            func_feats['h_male_specific'] += 1
            func_feats['h_gender_specific'] += 1
if word[1] =='DT':
func_feats['h_determiner']+=1
elif word[1] =='CC' or word[1] =='IN':
func_feats['h_conjunction'] +=1
elif word[1] =='UH':
func_feats['h_interjection'] +=1
elif word[1] =='JJ':
func_feats['h_adjective'] +=1
elif word[1] =='JJR':
func_feats['h_adjective_comparitive']+=1
func_feats['h_adjective'] +=1
        elif word[1] == 'JJS':
            func_feats['h_adjective_superlative'] += 1
            func_feats['h_adjective'] += 1
elif word[1] =='RB':
func_feats['h_adverb'] +=1
elif word[1] =='RBR':
func_feats['h_adverb_comparitive']+=1
func_feats['h_adverb'] +=1
elif word[1] =='RBS':
func_feats['h_adverb_superlative']+=1
func_feats['h_adverb'] +=1
    # Min-max normalize the raw counts; the small epsilon keeps the
    # denominator nonzero when every count is identical.
    values = list(func_feats.values())
    min_val = min(values)
    max_val = max(values) + .000001
    # Snapshot the keys first, since the loop adds new entries to the dict.
    for key in list(func_feats.keys()):
        func_feats[key + "_normalized"] = (func_feats[key] - min_val) / (max_val - min_val)
        if func_feats[key] > 0:
            func_feats[key + "_exists"] = 1
        else:
            func_feats[key + "_exists"] = 0
    '''
    (iii) You must show for each feature that you tried to optimize it; for
    example, if you used unigrams, you tested for the effects of using stopwords
    and/or stemming, or for bigrams you tested for informative collocations
    (e.g., using mutual information, chi-square, or likelihood ratios) or
    part-of-speech patterns a la Turney.
    For each feature I compared the raw count, the normalized value, and the 0/1
    existence flag, and kept only the best variant in the dictionary below:
    - h_adjective_superlative was the same across all 3
    - h_adverb_exists was the highest
    - h_adjective_comparitive_exists and h_adjective_comparitive were the same
    - h_adjective_exists was the highest
    - h_female_specific_exists and h_female_specific were the same
    - h_determiner_exists was the highest
    - h_adverb_superlative was the same across all 3
    - h_male_specific was the same across all 3
    - h_interjection was the same across all 3
    - h_gender_specific was the same across all 3
    - h_conjunction_exists was the highest
    - h_adverb_comparitive and h_adverb_comparitive_normalized were the same
    '''
best_func_feats = {}
best_func_feats['h_adverb_exists'] = func_feats['h_adverb_exists']
best_func_feats['h_adjective_comparitive'] = func_feats['h_adjective_comparitive']
best_func_feats['h_adjective'] = func_feats['h_adjective']
best_func_feats['h_female_specific'] = func_feats['h_female_specific']
best_func_feats['h_determiner_exists'] = func_feats['h_determiner_exists']
best_func_feats['h_adverb_superlative'] = func_feats['h_adverb_superlative']
best_func_feats['h_male_specific'] = func_feats['h_male_specific']
best_func_feats['h_interjection'] = func_feats['h_interjection']
best_func_feats['h_gender_specific'] = func_feats['h_gender_specific']
best_func_feats['h_conjunction_exists'] = func_feats['h_conjunction_exists']
    best_func_feats['h_adverb_comparitive'] = func_feats['h_adverb_comparitive']
    # Return the full feature set; swap in the commented line below to use
    # only the best-performing variants identified above.
    # return best_func_feats
    return func_feats
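
# Example usage (hypothetical sentence; exact counts depend on the tagger's
# output, so the values below are illustrative rather than guaranteed):
#   feats = get_function_features("This is the best camera he has used")
#   feats['h_adjective_superlative'], feats['h_male_specific_exists']
#   # -> (1, 1)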
# # testing
# feature_sets = [(get_function_features(n), v) for (n, v) in training_data.items()]
# random.shuffle(feature_sets)
# size = int(len(feature_sets) * 0.9)
# print "training results"
# train_set, test_set = feature_sets[:size], feature_sets[size:]
# classifier = nltk.NaiveBayesClassifier.train(train_set)
# print nltk.classify.accuracy(classifier, test_set)
# classifier.show_most_informative_features()
# print "heldout results"
# train_set = feature_sets
# classifier = nltk.NaiveBayesClassifier.train(train_set)
# test_set = [(get_function_features(n), v) for (n, v) in held_data.items()]
# print nltk.classify.accuracy(classifier, test_set)
# classifier.show_most_informative_features()
# # loop through each feature individually...
# classifiers = []
# feature_sets_keys = feature_sets[0][0].keys()
# for i in range(len(feature_sets_keys)):
#     print i
#     key = feature_sets_keys[i]
#     feature_seti = [({key: n[key]}, v) for (n, v) in feature_sets]
#     random.shuffle(feature_seti)
#     size = int(len(feature_seti) * 0.9)
#     print "training results"
#     train_seti, test_seti = feature_seti[:size], feature_seti[size:]
#     classifiers += [nltk.NaiveBayesClassifier.train(train_seti)]
#     print nltk.classify.accuracy(classifiers[i], test_seti)
#     classifiers[i].show_most_informative_features()
# print feature_sets_keys
# test_set_keys = feature_sets_keys
# print "heldout results"
# for i in range(len(test_set_keys)):
#     print i
#     print test_set_keys[i]
#     test_seti = [({test_set_keys[i]: n[test_set_keys[i]]}, v) for (n, v) in test_set]
#     print nltk.classify.accuracy(classifiers[i], test_seti)
#     classifiers[i].show_most_informative_features()
# (iv) Each individual feature must be tried out on the classification task to
# see how well it performs on the training data and on the held-out data. These
# results must be reported and included in the writeup. Each person is expected
# to be able to write code that runs and tests the classifier in this manner on
# the features they produce.
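
# A minimal, runnable sketch of the per-feature evaluation described above.
# It assumes training_data and held_data have been loaded as in the
# commented-out section near the top of this file; the function name and
# signature are illustrative, not part of the assignment code.
def evaluate_single_feature(key, train_items, held_items):
    '''Train a NaiveBayesClassifier on one feature in isolation and return
    its accuracy on the held-out items. Both arguments are lists of
    (feature_dict, label) pairs, e.g. built with get_function_features.'''
    train_set = [({key: feats[key]}, label) for (feats, label) in train_items]
    held_set = [({key: feats[key]}, label) for (feats, label) in held_items]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    return nltk.classify.accuracy(classifier, held_set)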