# malware_analysis.py

'''
Malware analysis tool built on automatic feature selection.

Automatic feature selection: features are weighted and selected
automatically, based on statistical properties of the training set,
and are ranked according to their significance.
'''
import features_selection
import read_dataset
from sklearn.model_selection import train_test_split
import train
import time


# Load the dataset. read_data() returns the per-file feature occurrences (x)
# and the matching labels (y) -- presumably malware / not malware; confirm
# against read_dataset.
x, y = read_dataset.read_data()

# Run every feature-selection strategy over the full labelled dataset,
# printing a banner before each one. The return values are not used here;
# each routine is called only for whatever it reports itself.
for banner, select in (
    ('\nFeatures Selection based on KBest: ',
     features_selection.select_features_k_best),
    ('\nFeatures Selection based on Recursive Features Elimination: ',
     features_selection.select_features_recursive_feature_elimination),
    ('\nFeatures Selection based on Extra trees classifier: ',
     features_selection.select_features_extra_trees),
    ('\nFeatures Selection based on Random Forest classifier: ',
     features_selection.select_features_random_forest),
):
    print(banner)
    select(x, y)

# Hold out 20% of the samples for testing and train on the remaining 80%.
# random_state is pinned so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2)

# Report the shape of each partition: features first, then labels.
for caption, features, labels in (
    ('\nTraining data shape (x, y): ', x_train, y_train),
    ('\nTesting data shape (x, y): ', x_test, y_test),
):
    print(caption + str(features.shape), str(labels.shape))

# Model training with different hyperparameters, evaluated in terms of
# metrics and time.

# SVM: one baseline run, then one run per value handed to
# train_svm_tuning_c_val (presumably the SVM C parameter, judging by the
# function name -- confirm in train).
train.train_svm(x_train, y_train, x_test, y_test)
for c_value in (10, 100, 1000):
    train.train_svm_tuning_c_val(x_train, y_train, x_test, y_test, c_value)

def _timed_run(trainer, *extra_args, label):
    '''Run one training routine on the shared split and print its duration.

    Args:
        trainer: callable invoked as
            trainer(x_train, y_train, x_test, y_test, *extra_args).
        *extra_args: additional positional arguments forwarded to trainer
            after the four data arguments.
        label: text printed after the elapsed time, which is reported in
            seconds rounded to two decimal places.
    '''
    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # interval cannot be skewed by system clock adjustments the way a
    # difference of time.time() readings can.
    started = time.perf_counter()
    trainer(x_train, y_train, x_test, y_test, *extra_args)
    elapsed = time.perf_counter() - started
    print(round(elapsed, 2), label)


# Random forest runs.
_timed_run(train.train_rf,
           label="Time to finish with default nJobs\n")
_timed_run(train.train_rf_number_jobs, 10,
           label="Time to finish with 10 nJobs\n")
# Going by the printed label, the extra arguments are presumably
# (n_estimators, n_jobs) -- confirm against train.
_timed_run(train.train_rf_number_jobs_estimators, 1000, 10,
           label="Time to finish with 10 nJobs and 1000 nEstimators\n")

# Extra trees runs with a varying number of estimators.
_timed_run(train.train_extra_trees,
           label="Time to finish ET with 100 nEstimators\n")
_timed_run(train.train_extra_trees_n_estimators, 500,
           label="Time to finish ET with 500 nEstimators\n")
_timed_run(train.train_extra_trees_n_estimators, 1000,
           label="Time to finish ET with 1000 nEstimators\n")

# Extra trees runs with a varying number of jobs.
# NOTE(review): 100/500/1000 parallel jobs is far above typical core
# counts -- confirm these values are really meant as n_jobs.
_timed_run(train.train_extra_trees_n_jobs, 100,
           label="Time to finish ET with 100 nJobs\n")
_timed_run(train.train_extra_trees_n_jobs, 500,
           label="Time to finish ET with 500 nJobs\n")
_timed_run(train.train_extra_trees_n_jobs, 1000,
           label="Time to finish ET with 1000 nJobs\n")


# Remaining training routines, each run once on the same train/test split
# and in this fixed order: recursive feature elimination, the two grid
# searches (random forest, then SVM), and finally naive Bayes.
for run_training in (
    train.train_recursive_feature_elimination,
    train.train_grid_search_using_rf,
    train.train_grid_search_using_svm,
    train.train_naive_bayes,
):
    run_training(x_train, y_train, x_test, y_test)