-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtweet_features.py
146 lines (117 loc) · 3.92 KB
/
tweet_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
"""
@package tweet_features
Convert tweet to feature vector.
These routines help convert arbitrary tweets into feature vectors.
"""
import numpy
# Feature table: one (feature name, search patterns) pair per feature.
#  - A feature is considered present when ANY of its patterns occurs in the
#    tweet text.
#  - The order of the entries defines the element order of the NumPy feature
#    vectors built from this table, so do not reorder entries casually.
#  - A leading/trailing space inside a pattern is significant: the tweet is
#    padded with one space on each side before it is searched.
#  - ':D' is the only pattern that contains an upper-case character.
testFeatures = [
    ("hasAddict", (" addict",)),
    ("hasAwesome", ("awesome",)),
    ("hasBroken", ("broke",)),
    ("hasBad", (" bad",)),
    ("hasBug", (" bug",)),
    ("hasCant", ("cant", "can't")),
    ("hasCrash", ("crash",)),
    ("hasCool", ("cool",)),
    ("hasDifficult", ("difficult",)),
    ("hasDisaster", ("disaster",)),
    ("hasDown", (" down",)),
    ("hasDont", ("dont", "don't", "do not", "does not", "doesn't")),
    ("hasEasy", (" easy",)),
    ("hasExclaim", ("!",)),
    ("hasExcite", (" excite",)),
    ("hasExpense", ("expense", "expensive")),
    ("hasFail", (" fail",)),
    ("hasFast", (" fast",)),
    ("hasFix", (" fix",)),
    ("hasFree", (" free",)),
    ("hasFrowny", (":(", "):")),
    ("hasFuck", ("fuck",)),
    ("hasGood", ("good", "great")),
    ("hasHappy", (" happy", " happi")),
    ("hasHate", ("hate",)),
    ("hasHeart", ("heart", "<3")),
    ("hasIssue", (" issue",)),
    ("hasIncredible", ("incredible",)),
    ("hasInterest", ("interest",)),
    ("hasLike", (" like",)),
    ("hasLol", (" lol",)),
    ("hasLove", ("love", "loving")),
    ("hasLose", (" lose",)),
    ("hasNeat", ("neat",)),
    ("hasNever", (" never",)),
    ("hasNice", (" nice",)),
    ("hasPoor", ("poor",)),
    ("hasPerfect", ("perfect",)),
    ("hasPlease", ("please",)),
    ("hasSerious", ("serious",)),
    ("hasShit", ("shit",)),
    ("hasSlow", (" slow",)),
    ("hasSmiley", (":)", ":D", "(:")),
    ("hasSuck", ("suck",)),
    ("hasTerrible", ("terrible",)),
    ("hasThanks", ("thank",)),
    ("hasTrouble", ("trouble",)),
    ("hasUnhappy", ("unhappy",)),
    ("hasWin", (" win ", "winner", "winning")),
    ("hasWinky", (";)",)),
    ("hasWow", ("wow", "omg")),
]
def make_tweet_nparr( txt ):
    """
    Extract tweet feature vector as NumPy array.

    Element i of the result corresponds to testFeatures[i] and is 1.0 when
    any of that feature's search patterns occurs in the tweet, else 0.0.

    The tweet is padded with one space on each side before searching, so
    patterns that start or end with a space can also match at the very
    beginning or end of the text.

    Lower-case patterns are matched case-insensitively (against the
    lower-cased tweet).  A pattern that contains an upper-case character
    (such as ':D') is matched against the original-case text instead: it
    could never be found in a lower-cased tweet.

    @param txt  tweet text (string)
    @return     float NumPy array of length len(testFeatures)
    """
    padded = ' ' + txt + ' '
    lowered = padded.lower()

    def _occurs( pattern ):
        # pick the haystack in which this pattern is able to match at all
        haystack = lowered if pattern == pattern.lower() else padded
        return pattern in haystack

    flags = [ any( _occurs(p) for p in patterns )
              for _name, patterns in testFeatures ]

    # dtype=float keeps the float64 vector the callers have always received
    return numpy.array( flags, dtype=float )
def make_tweet_dict( txt ):
    """
    Extract tweet feature vector as dictionary.

    The result maps every feature name in testFeatures to True when any of
    that feature's search patterns occurs in the tweet, and to False
    otherwise.

    The tweet is padded with one space on each side before searching, so
    patterns that start or end with a space can also match at the very
    beginning or end of the text.

    Lower-case patterns are matched case-insensitively (against the
    lower-cased tweet).  A pattern that contains an upper-case character
    (such as ':D') is matched against the original-case text instead: it
    could never be found in a lower-cased tweet.

    @param txt  tweet text (string)
    @return     dict of feature name -> bool
    """
    padded = ' ' + txt + ' '
    lowered = padded.lower()

    def _occurs( pattern ):
        # pick the haystack in which this pattern is able to match at all
        haystack = lowered if pattern == pattern.lower() else padded
        return pattern in haystack

    return { name: any( _occurs(p) for p in patterns )
             for name, patterns in testFeatures }
def tweet_dict_to_nparr( dict ):
    """
    Convert dictionary feature vector to numpy array

    Values are read from the dictionary by feature name and stored in the
    order in which the features appear in testFeatures.  Every feature name
    in testFeatures is looked up, so each one must be present as a key.

    @param dict  mapping of feature name -> value
    @return      NumPy array of length len(testFeatures)
    """
    values = numpy.empty( len(testFeatures) )
    for slot, feature in enumerate( testFeatures ):
        # feature[0] is the feature name; the patterns are not needed here
        values[slot] = dict[ feature[0] ]
    return values
def tweet_nparr_to_dict( nparr, use_standard_features=False ):
    """
    Convert NumPy array to dictionary

    @param nparr                  indexable sequence of feature values
    @param use_standard_features  when True, the keys are the feature names
                                  from testFeatures (nparr must then have
                                  exactly len(testFeatures) elements);
                                  when False, the keys are the element
                                  positions as strings ('0', '1', ...)
    @return                       dict of key -> nparr element
    """
    positions = range( len(nparr) )

    if not use_standard_features:
        # anonymous features: key each value by its stringified position
        return { str(pos): nparr[pos] for pos in positions }

    # named features: the vector has to line up with the feature table
    assert len(nparr) == len(testFeatures)
    return { testFeatures[pos][0]: nparr[pos] for pos in positions }
def is_zero_dict( dict ):
    """
    Identifies empty feature vectors

    A feature vector counts as empty when none of its values is truthy;
    a dictionary without any entries is therefore empty as well.

    @param dict  mapping of feature name -> value
    @return      True if no value in the mapping is truthy, else False
    """
    return not any( dict[key] for key in dict )