-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathtrain.py
221 lines (193 loc) · 8.15 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
'''
This file contains all the models used in training on the
[drebin] dataset; we evaluate the results using sklearn metrics.
Training with different models (SVM (c_values = [1, 10, 100, 1000]),
Recursive Feature Elimination, naive Bayes,
Random Forest (n_jobs = [None, 10] & n_estimators = [10, 1000]),
Extra Trees (n_estimators = [10, 100, 500, 1000] & nJobs = [100, 500, 1000]),
grid search based on the SVC model (kernel = [linear, rbf] & c = [1, 10]),
and grid search based on the RF model (n_estimators = [200, 500] &
max_features: [auto, sqrt, log2] &
max_depth: [4, 5, 6, 7, 8] &
criterion: [gini, entropy]).
'''
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (accuracy_score, confusion_matrix, precision_score,
recall_score, f1_score, classification_report)
import dask_searchcv as dcv
# from dask.diagnostics import ProgressBar
# from sklearn.model_selection import ParameterGrid
# Used to print the performance metrics for whatever model invokes this method
def print_metrices_out(y_predicted, y_test):
    """Print the standard evaluation metrics for a prediction run.

    Reports accuracy (percentage), confusion matrix, recall, precision,
    F1 score and the full sklearn classification report, comparing
    *y_predicted* against the ground-truth labels *y_test*.
    """
    accuracy_pct = accuracy_score(y_test, y_predicted) * 100
    print("Accuracy is %f (in percentage)" % accuracy_pct)
    print("Confusion Matrix: \n" + str(confusion_matrix(y_test, y_predicted)))
    print("Recall score is %f." % recall_score(y_test, y_predicted))
    print("Precision score is %f." % precision_score(y_test, y_predicted))
    print("F1 score is %f." % f1_score(y_test, y_predicted))
    report = str(classification_report(y_test, y_predicted))
    print("classification Report: \n" + report)
    print("-----------------------------------\n")
# This section contains the fitting of data in the model
# and the prediction of the test data passed to the parameter
# Note that you can add n_jobs=-1 to allow the model to use
# all the processor cores offered by your PC;
# for example with grid search CV, which is known for its
# expensive computations, it took 6 mins instead of 15 mins
# given the same dataset and the same PC
def train_svm(x_train, y_train, x_test, y_test):
    """Fit a default SVC (gamma='scale') on the training split and
    report its metrics on the test split."""
    print("\n-------------SVM Model-------------")
    classifier = SVC(gamma='scale')
    # Fit on the training data, then predict the held-out test set.
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    print("SVM Evaluation parameters:")
    print_metrices_out(predictions, y_test)
def train_svm_tuning_c_val(x_train, y_train, x_test, y_test, c_value):
    """Fit an SVC with an explicit regularisation constant *c_value*
    and report its metrics on the test split."""
    print("-------------SVM, C value Model-------------")
    print("C value: " + str(c_value))
    classifier = SVC(gamma='scale', C=c_value)
    # Fit on the training data, then predict the held-out test set.
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    print("SVM, tuned C val Evaluation parameters:")
    print_metrices_out(predictions, y_test)
def train_recursive_feature_elimination(x_train, y_train, x_test, y_test):
    """Select 4 features via RFE over a logistic-regression estimator,
    then report prediction metrics on the test split.

    Fix: the second argument of RFE was passed positionally
    (``RFE(model, 4)``); ``n_features_to_select`` became keyword-only
    in scikit-learn 1.1 (deprecated since 0.24), so the positional form
    raises TypeError on current versions. Use the keyword explicitly.
    """
    print("-------------RFE Model-------------")
    model = LogisticRegression(solver='lbfgs')
    rfe = RFE(model, n_features_to_select=4)
    # RFE Fit
    rfe.fit(x_train, y_train)
    # RFE Predict
    y_predicted = rfe.predict(x_test)
    # Header added for consistency with the other train_* helpers.
    print("RFE Evaluation parameters:")
    print_metrices_out(y_predicted, y_test)
def train_extra_trees(x_train, y_train, x_test, y_test):
    """Fit an ExtraTreesClassifier (100 estimators) and report its
    metrics on the test split."""
    print("-------------Extra Trees Model-------------")
    forest = ExtraTreesClassifier(n_estimators=100)
    # Fit on the training data, then predict the held-out test set.
    forest.fit(x_train, y_train)
    predictions = forest.predict(x_test)
    print("ET Evaluation parameters:")
    print_metrices_out(predictions, y_test)
def train_extra_trees_n_estimators(x_train, y_train, x_test, y_test,
                                   number_estimators):
    """Fit an ExtraTreesClassifier with a caller-supplied number of
    estimators and report its metrics on the test split."""
    print("-------------Extra Trees Model-------------")
    print("Number of estimators: " + str(number_estimators))
    forest = ExtraTreesClassifier(n_estimators=number_estimators)
    # Fit on the training data, then predict the held-out test set.
    forest.fit(x_train, y_train)
    predictions = forest.predict(x_test)
    print("ET Evaluation parameters:")
    print_metrices_out(predictions, y_test)
def train_extra_trees_n_jobs(x_train, y_train, x_test, y_test, n_jobs):
    """Fit an ExtraTreesClassifier with a caller-supplied ``n_jobs``
    parallelism level and report its metrics on the test split.

    Fix: ``n_estimators`` was left at the library default, which changed
    from 10 to 100 in scikit-learn 0.22 (with a FutureWarning before
    that), so results silently varied by sklearn version. Pin it to 100
    for consistency with the other Extra Trees / RF helpers in this file.
    """
    print("-------------Extra Trees Model-------------")
    print("Number of jobs: " + str(n_jobs))
    extra_trees = ExtraTreesClassifier(n_estimators=100, n_jobs=n_jobs)
    # ET Fit
    extra_trees.fit(x_train, y_train)
    # ET Predict
    y_predicted = extra_trees.predict(x_test)
    # ET Matrices
    print("ET Evaluation parameters:")
    print_metrices_out(y_predicted, y_test)
def train_rf(x_train, y_train, x_test, y_test):
    """Fit a RandomForestClassifier (100 estimators) and report its
    metrics on the test split."""
    print("-------------RF Model-------------")
    forest = RandomForestClassifier(n_estimators=100)
    # Fit on the training data, then predict the held-out test set.
    forest.fit(x_train, y_train)
    predictions = forest.predict(x_test)
    print("RF Evaluation parameters:")
    print_metrices_out(predictions, y_test)
def train_rf_number_jobs(x_train, y_train, x_test, y_test, number_of_jobs):
    """Fit a RandomForestClassifier (100 estimators) with a
    caller-supplied ``n_jobs`` and report metrics on the test split."""
    print("-------------RF Model with nJobs-------------")
    print("N_Jobs: " + str(number_of_jobs))
    forest = RandomForestClassifier(n_estimators=100,
                                    n_jobs=number_of_jobs)
    # Fit on the training data, then predict the held-out test set.
    forest.fit(x_train, y_train)
    predictions = forest.predict(x_test)
    print("RF Evaluation parameters:")
    print_metrices_out(predictions, y_test)
def train_rf_number_jobs_estimators(x_train, y_train, x_test, y_test,
                                    numbers_estimators, number_of_jobs):
    """Fit a RandomForestClassifier with caller-supplied estimator count
    and ``n_jobs``, then report metrics on the test split."""
    print("-------------RF Model with nJobs-------------")
    print("N_Jobs: " + str(number_of_jobs))
    print("N_Estimators: " + str(numbers_estimators))
    forest = RandomForestClassifier(n_estimators=numbers_estimators,
                                    n_jobs=number_of_jobs)
    # Fit on the training data, then predict the held-out test set.
    forest.fit(x_train, y_train)
    predictions = forest.predict(x_test)
    print("RF Evaluation parameters:")
    print_metrices_out(predictions, y_test)
def train_naive_bayes(x_train, y_train, x_test, y_test):
    """Fit a Bernoulli naive-Bayes classifier and report its metrics
    on the test split."""
    print("-------------NB Model-------------")
    classifier = BernoulliNB()
    # Fit on the training data, then predict the held-out test set.
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    print("NB Evaluation parameters:")
    print_metrices_out(predictions, y_test)
def train_grid_search_using_svm(x_train, y_train, x_test, y_test):
    """Grid-search an SVC over kernel {linear, rbf} x C {1, 10} with
    5-fold CV, refit the best estimator, and report test metrics."""
    print("-------------GS, SVC Model-------------")
    base_estimator = SVC(gamma="scale")
    parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
    # To print out the combinatoric product of the different parameters:
    # with cv=5, each parameter combination is cross-validated 5 times,
    # giving 20 fits in total here.
    # To use it, uncomment the sklearn.model_selection import:
    # pg = ParameterGrid(parameters)
    # print(len(pg))
    searcher = GridSearchCV(base_estimator, parameters, cv=5,
                            refit=True, n_jobs=-1, verbose=1)
    # Fit the search (refits the best model), then predict the test set.
    searcher.fit(x_train, y_train)
    predictions = searcher.predict(x_test)
    print("GS based on SVC model Evaluation parameters:")
    print_metrices_out(predictions, y_test)
def train_grid_search_using_rf(x_train, y_train, x_test, y_test):
    """Grid-search a RandomForestClassifier over estimator count,
    max_features, max_depth and split criterion with 5-fold CV, refit
    the best estimator, and report test metrics.

    Fix: this previously used ``dask_searchcv.GridSearchCV``;
    dask-searchcv is an abandoned project (long since folded into
    dask-ml) and no longer works with current scikit-learn. Use
    sklearn's own GridSearchCV — already imported at the top of this
    file — with ``n_jobs=-1`` to keep the parallelism the dask version
    provided. The searcher interface (fit/predict) is identical.
    """
    print("-------------GS, RF Model-------------")
    param_grid = {
        'n_estimators': [200, 500],
        # NOTE(review): 'auto' was removed for RandomForestClassifier in
        # scikit-learn 1.1 (it was an alias for 'sqrt'); drop it if
        # running on sklearn >= 1.1.
        'max_features': ['auto', 'sqrt', 'log2'],
        'max_depth': [4, 5, 6, 7, 8],
        'criterion': ['gini', 'entropy']
    }
    clf = GridSearchCV(estimator=RandomForestClassifier(verbose=3), cv=5,
                       refit=True,
                       param_grid=param_grid,
                       n_jobs=-1,
                       error_score=0)
    # GS, RF Fit
    clf.fit(x_train, y_train)
    # GS, RF Predict
    y_predicted = clf.predict(x_test)
    # GS, RF Matrices
    print("GS based on RF model Evaluation parameters:")
    print_metrices_out(y_predicted, y_test)