-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathData_prep.py
180 lines (158 loc) · 6.49 KB
/
Data_prep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
## =========================================================================== ##
#| Written by Cooper Coldwell, July 12, 2022 |#
#| If you need a better documented/commented version of this code, check the |#
#| notebook titled "Data_prep.ipynb". Everything is explained in detail there, |#
#| though it may take longer to run or require more memory. YMMV. |#
## =========================================================================== ##
from __future__ import absolute_import, division, print_function, unicode_literals
# import cupy as cp
import numpy as np
import pandas as pd
# import cudf as cd
import os, sys
import glob as glob
import binascii
import csv
import pickle
# import PIL.Image as Image
from scapy.all import *
from pathlib import Path
from tqdm.auto import tqdm
# Input folders, relative to the working directory. The trailing slash matters:
# the script builds file paths from these by plain string concatenation.
pathToNormal = 'Normal-1UE/'
pathToNormal2UE = 'Normal-2UE/'
pathToAttack = 'Attacks/'

# Output folder for every CSV / .npy file the script writes.
processedPath = 'NEW-PREPPED-DATA/'
# exist_ok=True makes reruns harmless, while a genuine failure (e.g. missing
# permissions) is raised here instead of surfacing later when the first CSV
# is written.
os.makedirs(processedPath, exist_ok=True)
# --- Stage 1: single-UE normal traffic -> normal_data.csv ---
# Path.glob is used instead of the bare name `glob`: `import glob as glob`
# binds that name to the module (calling it raises TypeError), and the star
# import from scapy may rebind it again, so it is not a dependable callable.
# Sorting only makes the processing order reproducible.
datasets = sorted(str(capture) for capture in Path(pathToNormal).glob('allcap*.pcapng'))
print('\nDatasets: \n',datasets,'\n')
print('Start processing normal-1ue data')
print('This could take up to an hour or more')

payloads = []
for file in tqdm(datasets):
    pcap = sniff(offline=file)
    # Keep the hex-encoded bytes of every packet's Raw layer; packets that
    # carry no Raw layer are skipped.
    payloads.extend(
        binascii.hexlify(packet[Raw].original) for packet in pcap if Raw in packet
    )
print('\tConverted',len(payloads),'packets to strings.')

print('\tShuffling and saving to csv')
data = {'raw':payloads}
# sample(frac=1) returns all rows in random order; the index is rebuilt so the
# CSV does not depend on the pre-shuffle positions.
df = pd.DataFrame(data=data).sample(frac=1).reset_index(drop=True)
df['label'] = 'normal'
df.to_csv(f"{processedPath}normal_data.csv", index=False)
print('The first 3 processed packets look like: \n',df.head(3))
# --- Stage 2: two-UE normal traffic -> normal_data_2ue.csv ---
print('\n\nStart processing normal-2ue data')
print('This may take a while')
# Path.glob avoids the bare name `glob`, which `import glob as glob` binds to
# the module rather than to a callable.
datasets = sorted(str(capture) for capture in Path(pathToNormal2UE).glob('allcap*.pcapng'))

payloads = []
for capture_file in datasets:
    # Read the capture named by the loop variable. Reading any other name here
    # (such as a leftover `file` from an earlier loop) would parse the wrong
    # capture once per iteration.
    pcap = sniff(offline=capture_file)
    # Keep the hex-encoded bytes of every packet's Raw layer; packets that
    # carry no Raw layer are skipped.
    payloads.extend(
        binascii.hexlify(packet[Raw].original) for packet in pcap if Raw in packet
    )
print('\tConverted',len(payloads),'packets to strings.')

print('\tPickling to avoid data loss in the event memory runs out')
with open('2ue.p','wb') as checkpoint:
    pickle.dump(payloads, checkpoint)
# The checkpoint is deliberately not read back: `payloads` is still in memory,
# and reloading would only hold a second copy of the same list.

print('\tShuffling and saving to csv')
data = {'raw':payloads,'label':['normal']*len(payloads)}
df = pd.DataFrame(data=data).sample(frac=1).reset_index(drop=True)
df.to_csv(f"{processedPath}normal_data_2ue.csv", index=False)
# --- Stage 3: attack traffic -> malicious_data.csv ---
print('\n\nStart processing attack data')
print('This should be quicker')

# Drop the large objects left over from the earlier stages. Each name is
# removed individually because a single `del a, b, c` stops at the first
# undefined name and then frees nothing.
for stale_name in ('datasets', 'pcap', 'payloads', 'data', 'df'):
    globals().pop(stale_name, None)

# One capture is taken from each entry of the attack folder.
sets = []
for i in os.listdir(pathToAttack):
    # Path.glob avoids the bare name `glob`, which `import glob as glob` binds
    # to the module rather than to a callable. Entries that are not
    # directories simply produce no matches.
    matches = sorted(Path(pathToAttack, i).glob('Attacks*.pcapng'))
    if matches:
        sets.append(str(matches[0]))
    else:
        print("Failed to find 'Attacks*.pcapng' file in folder: ", str(pathToAttack+i))

payloads = []
for file in sets:
    pcap = sniff(offline=file)
    # Keep the hex-encoded bytes of every packet's Raw layer; packets that
    # carry no Raw layer are skipped.
    payloads.extend(
        binascii.hexlify(packet[Raw].original) for packet in pcap if Raw in packet
    )
print('\tConverted',len(payloads),'packets to strings.')

# The rows are written in capture order (no shuffle happens in this stage),
# so the message says only what is actually done.
print('\tSaving to csv')
data = {'raw':payloads}
df = pd.DataFrame(data=data)
df['label'] = 'attack'
df.to_csv(f"{processedPath}malicious_data.csv", index=False)
del df
# --- Stage 4: build the mixed (attack + normal) data set ---
print('\n\nReading the data back in from the CSVs')
normal = pd.read_csv(f"{processedPath}normal_data.csv")
normal2UE = pd.read_csv(f"{processedPath}normal_data_2ue.csv")
malicious = pd.read_csv(f"{processedPath}malicious_data.csv")

print('\nCreating a data set with equal parts attack and normal')
attack_count = len(malicious)
# Split the normal share between the two normal sets. The second share takes
# the remainder so the two always add up to attack_count, even when it is odd
# (two floor-halves would leave the mixed set one row short).
share_1ue = attack_count // 2
share_2ue = attack_count - share_1ue
mixed = pd.concat([
    malicious.sample(frac=1,random_state=100),                   # every attack packet
    normal.sample(frac=1,random_state=100).iloc[:share_1ue],     # random normal-1ue packets
    normal2UE.sample(frac=1,random_state=100).iloc[:share_2ue],  # random normal-2ue packets
])
mixed = mixed.sample(frac=1,random_state=1)  # shuffle the data before processing

## Separate the labels (important for using the mixed data to evaluate an autoencoder)
mixed_labels = mixed.pop('label')
np.save(f'{processedPath}mixed_labels.npy',mixed_labels)
del mixed_labels

print('Packets in malicious: ',len(malicious))
print('Packets in mixed: ',len(mixed))
# Still prints False if either normal set holds fewer rows than its share.
print('Mixed set is of the expected size: ',len(malicious)*2==len(mixed))

print('\nPad the payloads to the same length, then convert to an array of bytes.')
print("The output is saved as:")
print('\t- mixed.npy')
# Number of characters every payload is padded or truncated to.
max_packet_length = 1024
def _fixed_length_payload(value, length):
    '''Return one ``raw`` cell as exactly ``length`` bytes, right-padded with ASCII "0".'''
    if isinstance(value, (bytes, bytearray)):
        # Cell still holds the hexlified bytes (frame not round-tripped via CSV).
        return bytes(value).ljust(length, b'0')[:length]
    if not isinstance(value, str):
        raise TypeError(f"expected a str or bytes payload, got {type(value).__name__}")
    # After a CSV round trip the cell is the text "b'<hex>'"; keep what sits
    # between the first pair of quotes. Text without quotes is used as-is.
    pieces = value.split("'")
    text = pieces[1] if len(pieces) > 1 else value
    # Pad first, then cut after encoding: the result is always `length` bytes.
    return text.ljust(length, '0').encode('utf8')[:length]


def ReshapePackets(dataFrame,saveToFilename,max_packet_length):
    '''Convert the payload strings of a DataFrame to a uint8 array and save it.

    Each entry of ``dataFrame['raw']`` is padded with "0" characters or
    truncated to ``max_packet_length`` characters, and every character is
    stored as its byte value (the hex text is not decoded back to the bytes it
    spells). The array has shape ``(rows, max_packet_length, 1, 1)``, also for
    an empty frame.

    Parameters:
        dataFrame: DataFrame with a ``raw`` column of payload strings, either
            ``"b'<hex>'"`` text as read back from the CSVs, plain hex text, or
            ``bytes`` objects.
        saveToFilename: destination passed to ``np.save``.
        max_packet_length: number of bytes kept per payload.

    Returns:
        None. The array is written to ``saveToFilename`` and its shape printed.

    Raises:
        TypeError: if a ``raw`` entry is neither ``str`` nor ``bytes``.
    '''
    raw_column = dataFrame['raw']
    # Fill one preallocated array instead of collecting per-row arrays in a
    # list and copying them afterwards; the input column is left untouched.
    payloads = np.empty((len(raw_column), max_packet_length), dtype=np.uint8)
    for row, value in enumerate(raw_column):
        payloads[row] = np.frombuffer(
            _fixed_length_payload(value, max_packet_length), dtype=np.uint8
        )
    # Two trailing singleton axes: one (max_packet_length, 1, 1) block per payload.
    payloads = payloads.reshape(len(raw_column), max_packet_length, 1, 1)
    print('New data shape: ',payloads.shape)
    np.save(saveToFilename,payloads)
# Write the mixed payload array, then drop the frame so its memory can be
# reclaimed before the next conversion.
ReshapePackets(mixed, f'{processedPath}mixed.npy', max_packet_length)
del mixed

print('\nCreating a data set with equal parts normal-1ue and normal-2ue')
shuffled_1ue = normal.sample(frac=1, random_state=2022)
shuffled_2ue = normal2UE.sample(frac=1, random_state=100)
# The 2-UE contribution is capped at the number of 1-UE packets.
totalNormal = pd.concat([shuffled_1ue, shuffled_2ue.iloc[:len(normal)]])
del shuffled_1ue, shuffled_2ue
# Shuffle once more so the two sources are interleaved.
totalNormal = totalNormal.sample(frac=1, random_state=2022)

print("\nPad the sets' payloads, then convert them to arrays of bytes.\nThe outputs are saved as:")
print("\t- normal.npy")
print("\t- normal2UE.npy")
print("\t- total_normal.npy")

# Each frame is released right after it has been converted and saved.
ReshapePackets(normal, f'{processedPath}normal.npy', max_packet_length)
del normal

ReshapePackets(normal2UE, f'{processedPath}normal2UE.npy', max_packet_length)
del normal2UE

ReshapePackets(totalNormal, f'{processedPath}total_normal.npy', max_packet_length)
del totalNormal