# classifier.py (113 lines, 97 loc, 4.73 KB)
# Forked from Ashkelso/VSChatbot.
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import NuSVC, SVC
def prepareData(data_url, testSize, labels=('assault', 'sexual abuse'), random_state=None):
    """Load the labelled spreadsheet and split it into train and test sets.

    Parameters:
        data_url: location of the Excel workbook, passed to pd.read_excel.
            The sheet must have a 'data' column and a 'labels' column.
        testSize: forwarded to train_test_split as test_size.
        labels: label values to keep; rows whose 'labels' value is not in
            this collection are dropped. The default keeps the two classes
            the original code was hard-wired to.
        random_state: forwarded to train_test_split. The default (None)
            keeps the original unseeded shuffle; pass an int to make the
            split repeatable between runs.

    Returns:
        (X_train, X_test, y_train, y_test, X, y) where X and y are the
        full filtered 'data' and 'labels' columns.
    """
    data = pd.read_excel(data_url)
    # Keep only the rows belonging to the requested classes.
    data = data.loc[data['labels'].isin(list(labels))]
    X = data['data']
    y = data['labels']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=testSize, random_state=random_state)
    return X_train, X_test, y_train, y_test, X, y
def makeModel(classifier, X_train, y_train):
    """Build a bag-of-words -> TF-IDF -> classifier pipeline and fit it.

    Parameters:
        classifier: the estimator placed at the end of the pipeline.
        X_train: training inputs handed to the pipeline's fit.
        y_train: training targets handed to the pipeline's fit.

    Returns:
        The fitted Pipeline.
    """
    steps = [
        ('bow', CountVectorizer()),       # strings -> token integer counts
        ('tfidf', TfidfTransformer()),    # integer counts -> weighted TF-IDF scores
        ('classifier', classifier),       # final estimator trained on the TF-IDF vectors
    ]
    # Pipeline.fit returns the pipeline itself, so it can be returned directly.
    return Pipeline(steps).fit(X_train, y_train)
def testPerformance(model, model_type, X_test, y_test):
    """Print a confusion matrix and classification report for a fitted model.

    Parameters:
        model: fitted object exposing predict(X_test).
        model_type: human-readable name inserted into the printed header.
        X_test: held-out inputs to predict on.
        y_test: true labels the predictions are compared against.

    Returns:
        None; the results are written to standard output.
    """
    print("results with "+model_type+" classifier: \n")
    # Predict once and reuse the result for both reports instead of
    # running the model over the test set twice.
    predictions = model.predict(X_test)
    print("confusion matrix: \n", confusion_matrix(y_test, predictions))
    print("\n"+classification_report(y_test, predictions))
# Prepare training and testing data.
# The default is the original author's local spreadsheet path; set the
# TRAINING_DATA_PATH environment variable to run against a copy elsewhere.
data_url = os.environ.get(
    "TRAINING_DATA_PATH",
    "C:/Users/Ashley/Dropbox/cs109/SML Chatbot Project/training data.xlsx",
)
# 20% of the filtered rows are held out for testing.
X_train, X_test, y_train, y_test, X, y = prepareData(data_url, testSize=0.2)
#######################
# testing the different classifiers
########################
# Random forest: hold-out evaluation, then 3-fold cross-validation on the
# training split.
print("Random Forest")
model = makeModel(RandomForestClassifier(), X_train, y_train)
testPerformance(model, "random forest", X_test, y_test)
print("\ncross validation: ")
# Unfitted pipeline with the same steps makeModel builds; cross_val_score
# is given this one rather than the already-fitted model.
ranforest = Pipeline([
    ('bow', CountVectorizer()),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', RandomForestClassifier()),  # train on TF-IDF vectors with classifier
])
r = cross_val_score(ranforest, X_train, y_train, cv=3, scoring='accuracy')
# np.std is the standard deviation of the fold scores, so label it as such
# (it was previously printed as "standard error").
print("mean accuracy: "+str(np.mean(r))+"\nstandard deviation: "+str(np.std(r)))
# Nu-Support Vector Machine (NuSVC): hold-out evaluation, then 3-fold
# cross-validation on the training split.
print("\nNu-Support Vector Machine")
model_1 = makeModel(NuSVC(), X_train, y_train)
# Label the results distinctly from the plain SVC run below; both were
# previously reported under the same misspelled "Suport Vector Machine" name.
testPerformance(model_1, "Nu-Support Vector Machine", X_test, y_test)
print("\ncross validation: ")
# Unfitted pipeline with the same steps makeModel builds.
NuSupport = Pipeline([
    ('bow', CountVectorizer()),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', NuSVC()),  # train on TF-IDF vectors with classifier
])
s = cross_val_score(NuSupport, X_train, y_train, cv=3, scoring='accuracy')
# np.std is the standard deviation of the fold scores, not a standard error.
print("mean accuracy: "+str(np.mean(s))+"\nstandard deviation: "+str(np.std(s)))
# Support vector machine (SVC): hold-out evaluation, then 3-fold
# cross-validation on the training split.
print("\nSupport Vector Machine")
model_2 = makeModel(SVC(), X_train, y_train)
testPerformance(model_2, "Support Vector Machine", X_test, y_test)
print("\ncross validation: ")
# Unfitted pipeline with the same steps makeModel builds.
Support = Pipeline([
    ('bow', CountVectorizer()),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', SVC()),  # train on TF-IDF vectors with classifier
])
s = cross_val_score(Support, X_train, y_train, cv=3, scoring='accuracy')
# np.std is the standard deviation of the fold scores, not a standard error.
print("mean accuracy: "+str(np.mean(s))+"\nstandard deviation: "+str(np.std(s)))
# Multinomial naive Bayes: hold-out evaluation, then 3-fold cross-validation
# on the training split.
print("\nMultinomial Naive Bayes")
model_3 = makeModel(MultinomialNB(), X_train, y_train)
testPerformance(model_3, "MultinomialNB", X_test, y_test)
print("\ncross validation: ")
# Unfitted pipeline with the same steps makeModel builds.
Bayes = Pipeline([
    ('bow', CountVectorizer()),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors with classifier
])
n = cross_val_score(Bayes, X_train, y_train, cv=3, scoring='accuracy')
# np.std is the standard deviation of the fold scores, not a standard error.
print("mean accuracy: "+str(np.mean(n))+"\nstandard deviation: "+str(np.std(n)))
################################
# Making the models
##############################
# # Training on 100% of the data
# forest = makeModel(RandomForestClassifier(), X, y)
# NewSVC = makeModel(NuSVC(), X, y)
# MultiNB = makeModel(MultinomialNB(), X, y)
# # Saving the classifiers
# pickle.dump(forest, open("randomForest.p","wb"))
# print("\nrandom forest saved")
# pickle.dump(NewSVC, open("NewSVC.p","wb"))
# print("\nsupport vector saved")
# pickle.dump(MultiNB, open("MultiNB.p","wb"))
# print("\nnaive bayes saved")