# Note: Using the python-twitter API.
import csv
import re
from string import punctuation

import nltk
import pandas as pd
import twitter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# word_tokenize needs the 'punkt' tokenizer models in addition to the stopword list.
nltk.download('stopwords')
nltk.download('punkt')
# Constant for number of tweets fetched from Twitter API.
NUM_TWEETS = 10
""" Utility Functions """
def parse_csv(file_name):
    df = pd.read_csv(file_name)
    return df

def convert_to_df(record_lst):
    return pd.DataFrame(record_lst)
""" initialize API instance - source: https://towardsdatascience.com/creating-the-twitter-sentiment-analysis-program-in-python-with-naive-bayes-classification-672e5589a7ed
Example Use:
consumer_key='YOUR_CONSUMER_KEY',
consumer_secret='YOUR_CONSUMER_SECRET',
access_token_key='YOUR_ACCESS_TOKEN_KEY',
access_token_secret='YOUR_ACCESS_TOKEN_SECRET')
NOTE: consumer_key & consumer_secret listed as "API key" & "API key secret" on Twitter Developer page.
"""
api = twitter.Api(consumer_key='',
                  consumer_secret='',
                  access_token_key='',
                  access_token_secret='',
                  tweet_mode='extended')  # Return non-truncated tweet text; read it from status.full_text instead of status.text.
"""
Searches for tweets containing a specified term, returns an iterable containing twitter.Status objects.
NOTE: Twitter limits the number of request you can through the API to 180 requests per 15 minutes.
"""
def buildTestSet():
    try:
        search_term = input("Enter a word or phrase to be searched: ")
        # The search endpoint returns at most 100 tweets per request, regardless of the count requested.
        tweets_fetched = api.GetSearch(term=search_term, count=NUM_TWEETS, since='2021-01-01')
        print("\n")
        print("Fetched Tweet(s): " + str(len(tweets_fetched)) + "\nTweet(s) contain the following term: " + search_term)
        print("\n")
        return [{"user": status.user.screen_name, "user_id": status.user.id_str, "created_at": status.user.created_at, "text": status.full_text, "label": None, "Tweet ID": status.id} for status in tweets_fetched]
    except Exception as e:
        print("Something went wrong!", e)
        return None
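# Illustrative sketch of the shape of one element returned by buildTestSet()
# (all values hypothetical, not real Twitter data):
#   {"user": "some_user", "user_id": "12345", "created_at": "Mon Jan 04 12:00:00 +0000 2021",
#    "text": "example tweet text", "label": None, "Tweet ID": 12345}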
"""
Using Niek Sanders' Corpus of 5k+ classified tweets:
To build the training set, repeat until EOF: 180 requests per 15 minutes.
15 min * 60 sec = 900 sec
(900 seconds / 180 tweets)
Credit: https://gist.github.com/AnasAlmasri/33caec97bca7f33f8c1f8ab457a0b056#file-sentimentanalysis-py
Possibly a result of API requests (tweet IDs need to be ints not strings), tweet IDs will be rounded/truncated, making them pretty much useless.
You don't need them past this point anyways, but just something worth mentioning.
"""
def buildTrainingSet(trainingFile, tweetDataFile):
    import time
    corpus = []
    with open(trainingFile, "r") as csvfile:
        lineReader = csv.reader(csvfile, delimiter=',', quotechar="\"")
        for row in lineReader:
            if not row:  # Skip empty rows.
                continue
            corpus.append({"topic": row[0], "label": row[1], "tweet_id": row[2]})
    sleep_time = 900 / 180  # 5 seconds between requests keeps us under the rate limit.
    trainingDataSet = []
    for tweet in corpus:
        try:
            status = api.GetStatus(tweet["tweet_id"])
            print("Tweet fetched: " + status.full_text)
            tweet["text"] = status.full_text
            trainingDataSet.append(tweet)
            time.sleep(sleep_time)
        except Exception as e:
            print(e)
            continue
    print("\n\nTraining_Set Finished!\n\n")
    # Write the fetched tweets to a CSV file in the current directory.
    # newline='' stops csv.writer from inserting extra blank rows on Windows.
    with open(tweetDataFile, 'w', encoding='utf-8', newline='') as csvfile:
        linewriter = csv.writer(csvfile, delimiter=',', quotechar="\"")
        for tweet in trainingDataSet:
            try:
                linewriter.writerow([tweet["tweet_id"], tweet["text"], tweet["label"], tweet["topic"]])
            except Exception as e:
                print(e)
    return trainingDataSet
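# The corpus file is expected to contain one (topic, label, tweet_id) triple per row,
# e.g. (hypothetical values):  apple,positive,126415614616154112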
# Clean up tweets for classification (removes "stop" words).
class PreProcessTweets:
    def __init__(self):
        self.stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER', 'URL'])

    def processTweets(self, list_of_tweets):
        processed_tweets = []
        for tweet in list_of_tweets:
            processed_tweets.append((self.clean_tweets(tweet["text"]), tweet["label"]))
        return processed_tweets

    def clean_tweets(self, tweet):
        tweet = tweet.lower()  # convert text to lower-case
        tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)  # replace URLs with a placeholder
        tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)  # replace @usernames with a placeholder
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # remove the # in #hashtag
        tweet = word_tokenize(tweet)  # split the tweet into a list of word tokens
        return [word for word in tweet if word not in self.stopwords]
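# Rough sketch of the cleaning step (assumed input; exact tokens depend on NLTK's tokenizer):
#   PreProcessTweets().clean_tweets("Loving the new #Python release! https://example.com @someone")
#   -> roughly ['loving', 'new', 'python', 'release']
# The URL and username become the 'URL'/'AT_USER' placeholders, which are then dropped
# along with stopwords and punctuation.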
"""
Naive Bayes Classification:
1) Build vocabulary of all words present in the training data set.
word_features represents a list of distinct words - with # of occurences in the set as a key.
"""
def buildVocab(preprocessedTrainingData):
    vocab = []
    for (words, sentiment) in preprocessedTrainingData:
        vocab.extend(words)
    wordlist = nltk.FreqDist(vocab)
    features = wordlist.keys()
    return features
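# Minimal sketch with toy data (not from the real corpus):
#   buildVocab([(['good', 'movie'], 'positive'), (['good', 'plot'], 'negative')])
#   -> roughly dict_keys(['good', 'movie', 'plot'])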
"""
2) Parse training set data and compare every word against current tweet assigning a number as such:
label 1 (true): if word in vocabulary is found in tweet (present)
label 0 (false): if word in vocabulary is not found in tweet (absent)
"""
def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in tweet_words)  # feature key 'contains(X)' where X is the vocabulary word
    return features
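# Illustrative sketch (assumed two-word vocabulary): with word_features = ['good', 'bad'],
#   extract_features(['good', 'movie']) -> {'contains(good)': True, 'contains(bad)': False}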
# ================================================================= Auxiliary Functions =================================================================
"""
Removes blank rows from csv containing tweet data.
Add rows headers at the top of csv indicating tweet_id, text, label (sentiment), and topic.
"""
def clean_tweet_data():
    df = parse_csv('./tweet_data.csv')
    try:
        df = df.dropna(axis='index', how='all')
        # NOTE: pd.read_csv treats the first data row as the header, so that row is lost
        # when these column names are assigned.
        df.columns = ['tweet_id', 'text', 'label', 'topic']
    except Exception as e:
        print("In clean_tweet_data(): ", e)
    # Overwrite tweet_data.csv in current directory to avoid file confusion.
    df.to_csv('tweet_data.csv', index=False)
    print("Blank rows removed & column headers added to tweet_data.csv. Check csv file for changes.\n")
"""
In conjuction with clean_tweet_data(), this function avoid having to build the training set (which takes hours) each time you run the program
by taking tweetData formed in tweet_data.csv and placing into a dataframe-like array of dictionaries. Keys are tweet_id, text, label, topic respectively.
"""
def composeTweetData(tweetData):
    cleanedSet = []
    with open(tweetData, 'r', encoding='utf8') as csvfile:
        reader = csv.DictReader(csvfile)
        for tweetDict in reader:
            cleanedSet.append(tweetDict)
    return cleanedSet
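# Each element returned by composeTweetData() is a dict keyed by the headers written in
# clean_tweet_data(), e.g. (hypothetical values):
#   {'tweet_id': '126415614616154112', 'text': 'example tweet text', 'label': 'positive', 'topic': 'apple'}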
# ================================================================= Debugging Functions =================================================================
"""
Test fetching of tweets - debugging function.
Use this function conservatively to avoid racking up API requests.
"""
def fetchTweetsTest():
    tweet_count = 1
    tweets = buildTestSet()  # List of tweet dicts (or None if the search failed).
    if not tweets:
        return None
    # Iterate through the tweet dictionaries.
    for tweet in tweets:
        print("------------------------------------------------------------------------------------------------------------")
        print("Tweet #%d\n" % tweet_count)
        print("Tweet created at: %s" % tweet['created_at'])
        print("Tweet text: %s" % tweet['text'])
        print("\n\tTweet made by user: %s\n\tUser ID: %s" % (tweet['user'], tweet['user_id']))
        print("------------------------------------------------------------------------------------------------------------")
        print("\n")
        tweet_count += 1
    return None
# ================================================================= Control Flow ========================================================================
"""
This may look unusual to organize the function calls like this, but coming back to this code after a nearly semester-long hiatus was a headache
and a half. This helped immensely in trying to figure out the control flow.
"""
# test authentication
#print(twitter_api.VerifyCredentials())
# build test set for training
testDataSet = buildTestSet()
# print tweets fetched from buildTestData() - for debugging.
#fetchTweetsTest()
# Arguments for buildTrainingSet() - fetching the full corpus takes roughly 8 hours because of the rate limit.
trainingFile = "./corpus.csv"       # Niek Sanders' corpus of labelled tweet IDs.
tweetDataFile = "./tweet_data.csv"  # This file will be created once the function has run once.
# TODO: build the training set - only need to run once to create tweet_data.csv
#trainingData = buildTrainingSet(trainingFile, tweetDataFile)
# cleanup tweet_data.csv - removes blank rows & adds column headers
clean_tweet_data()
# convert tweet_data.csv to an array of dictionaries as below,
# so you don't have to rebuild the training set every time you run the program.
tweetDataDict = composeTweetData(tweetDataFile)
# tweet pre-processing before classification.
tweetProcessor = PreProcessTweets()
preprocessedTrainingData = tweetProcessor.processTweets(tweetDataDict)
preprocessedTestSet = tweetProcessor.processTweets(testDataSet)
# Naive Bayes Classification broken down into 4 steps:
# 1) Building the vocabulary.
# 2) Matching tweets against our vocabulary.
# 3) Building our feature vector.
word_features = buildVocab(preprocessedTrainingData)
trainingFeatures = nltk.classify.apply_features(extract_features, preprocessedTrainingData) # apply_features performs feature extraction.
# 4) Training the classifier.
NBayesClassifer = nltk.NaiveBayesClassifier.train(trainingFeatures)
# Model testing & rudimentary validation.
NBResultLabels = [NBayesClassifer.classify(extract_features(tweet[0])) for tweet in preprocessedTestSet]
# Get the majority vote.
pos = NBResultLabels.count('positive')
neg = NBResultLabels.count('negative')
neutral = NBResultLabels.count('neutral')
total = len(NBResultLabels)  # May be fewer than NUM_TWEETS if the search returned fewer tweets.
sentiments = [neg, neutral, pos]
if sentiments.index(max(sentiments)) == 0:
    print("\nOverall negative sentiment of: " + str(100 * (neg / total)) + "%\n")
elif sentiments.index(max(sentiments)) == 1:
    print("\nOverall neutral sentiment of: " + str(100 * (neutral / total)) + "%\n")
elif sentiments.index(max(sentiments)) == 2:
    print("\nOverall positive sentiment of: " + str(100 * (pos / total)) + "%\n")
# =======================================================================================================================================================