-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathmalware_analysis.py
107 lines (85 loc) · 3.83 KB
/
malware_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
'''
Building a malware analysis tool, based on automatic feature selection
Automatic Feature Selection: automatic feature weighting and selection
based on statistical properties of the training set, where features
are ranked based upon their significance.
'''
import features_selection
import read_dataset
from sklearn.model_selection import train_test_split
import train
import time
# Load the already-labelled dataset. Per the original note, x carries the
# per-file feature occurrences and y the {malware | not malware} labels.
x, y = read_dataset.read_data()

# Run each feature-selection strategy over the labelled data, printing a
# banner before it starts. Any value a selector returns is not used here.
for _banner, _select in (
        ('\nFeatures Selection based on KBest: ',
         features_selection.select_features_k_best),
        ('\nFeatures Selection based on Recursive Features Elimination: ',
         features_selection.select_features_recursive_feature_elimination),
        ('\nFeatures Selection based on Extra trees classifier: ',
         features_selection.select_features_extra_trees),
        ('\nFeatures Selection based on Random Forest classifier: ',
         features_selection.select_features_random_forest),
):
    print(_banner)
    _select(x, y)
# Hold out 20% of the samples for testing and keep 80% for training; the
# fixed random_state makes the partition repeatable from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

# Report the shape of each partition: features first, then labels.
for _heading, _samples, _targets in (
        ('\nTraining data shape (x, y): ', x_train, y_train),
        ('\nTesting data shape (x, y): ', x_test, y_test),
):
    print(_heading + str(_samples.shape), str(_targets.shape))
# Model training using different hyperparameters
# and evaluation of performance in terms of metrics and time


def _timed(trainer, *hyperparams):
    """Run one training routine on the current split and time it.

    Args:
        trainer: Callable taken from the ``train`` module. It is invoked as
            ``trainer(x_train, y_train, x_test, y_test, *hyperparams)``.
        *hyperparams: Extra positional arguments forwarded unchanged.

    Returns:
        Elapsed seconds for the call, rounded to two decimals.
    """
    # perf_counter() is monotonic and high resolution, so the measurement
    # cannot be skewed by wall-clock adjustments the way time.time() can.
    started = time.perf_counter()
    trainer(x_train, y_train, x_test, y_test, *hyperparams)
    return round(time.perf_counter() - started, 2)


# --- SVM: default settings, then three values passed to the C-tuning variant.
train.train_svm(x_train, y_train, x_test, y_test)
for _c_value in (10, 100, 1000):
    train.train_svm_tuning_c_val(x_train, y_train, x_test, y_test, _c_value)

# --- Random forest, timed.
print(_timed(train.train_rf), "Time to finish with default nJobs\n")
print(_timed(train.train_rf_number_jobs, 10),
      "Time to finish with 10 nJobs\n")
# NOTE(review): the label implies the extra arguments are
# (n_estimators=1000, n_jobs=10) -- confirm against the train module.
print(_timed(train.train_rf_number_jobs_estimators, 1000, 10),
      "Time to finish with 10 nJobs and 1000 nEstimators\n")

# --- Extra trees, timed.
# NOTE(review): "100 nEstimators" presumably reflects the default used
# inside train.train_extra_trees -- confirm.
print(_timed(train.train_extra_trees),
      "Time to finish ET with 100 nEstimators\n")
print(_timed(train.train_extra_trees_n_estimators, 500),
      "Time to finish ET with 500 nEstimators\n")
print(_timed(train.train_extra_trees_n_estimators, 1000),
      "Time to finish ET with 1000 nEstimators\n")
# NOTE(review): 100/500/1000 are unusually large if they really are worker
# counts; verify that train_extra_trees_n_jobs expects n_jobs here.
print(_timed(train.train_extra_trees_n_jobs, 100),
      "Time to finish ET with 100 nJobs\n")
print(_timed(train.train_extra_trees_n_jobs, 500),
      "Time to finish ET with 500 nJobs\n")
print(_timed(train.train_extra_trees_n_jobs, 1000),
      "Time to finish ET with 1000 nJobs\n")

# --- Remaining experiments, run untimed in the original order.
train.train_recursive_feature_elimination(x_train, y_train, x_test, y_test)
train.train_grid_search_using_rf(x_train, y_train, x_test, y_test)
train.train_grid_search_using_svm(x_train, y_train, x_test, y_test)
train.train_naive_bayes(x_train, y_train, x_test, y_test)