Merge pull request #70 from kookmin-sw/feature/pe-code-refactoring

refactor: experiment results and code cleanup
kookmin-sw · Jun 5, 2024 · e50ab1c
2 parents 05265f3 + babe363
commit e50ab1c
Show file tree

Hide file tree

Showing 12 changed files with 24,564 additions and 23,587 deletions.
diff --git a/GIPS/iot_signature.ipynb b/GIPS/iot_signature.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -11,7 +11,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -24,7 +24,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -194,7 +194,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -203,7 +203,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -212,17 +212,23 @@
      "text": [
       "('PING', 731)\n",
       "('PONG\\n', 707)\n",
-      "('root', 239)\n",
       "('Password: ', 26)\n"
      ]
     }
    ],
    "source": [
     "for cluster, sig in signature.items():\n",
     "    for token in sig:\n",
-    "        if token[1] not in filters:\n",
+    "        if token[0] not in filters:\n",
     "            print(token)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

diff --git a/PE/GIPS/core/JIG.py b/PE/GIPS/core/JIG.py
@@ -3,7 +3,7 @@
 from tqdm import tqdm
 
 
-def IORA(sum_vector_): # 이해 가능한 사람은 공유해주세요
+def IORA(sum_vector_):
     sum_vector = sorted(sum_vector_, reverse=True)
     total = sum(sum_vector)
 
@@ -19,30 +19,9 @@ def IORA(sum_vector_): # 이해 가능한 사람은 공유해주세요
         total -= sum_vector[idx]
     return thetaC
 
-'''
-vectors: virtual vector
-thetaJ: hyper parameter
-'''
 
 def JIG(vectors, thetaJ):
-    '''
-    의사 코드
-    M <= vectors의 크기
-    MV <= 빅그룹을 카운팅할 배열
-    big_group_indices <= 빅그룹으로 판별된 데이터를 저장할 집합
-    
-    for vector in vectors
-        mv에 vector 추가
-
-        thetaC <= 빅그룹 식별 임계값 IOPA 이용
-
-        reatio <= vector의 1인 값이 추가 되었을 때 NV의 그 값이 thetaC보다 클때 카운트
-        
-        if reatio / K > thetaJ
-            big_group_indices에 vector의 index추가
-
-    big_group_indices 반환
-    '''
+
     M = len(vectors[0])
     MV = np.zeros(M, dtype=np.int32)
     big_group_indices = []

diff --git a/PE/GIPS/core/MV2.py b/PE/GIPS/core/MV2.py
@@ -1,38 +1,15 @@
 import numpy as np
-from core.utils import AEchunking, minHash
+from core.utils import minHash
 from tqdm import tqdm
 
 
-'''
-paylaods: 입력 데이터(패킷 페이로드)
-window_size: CDC(cotent defined chuncking)으로 청킹하는 모듈
-K: minhash의 개수
-M: bitmap의 사이즈
-'''
-
-
-def MV2(payloads, window_size, K, M):
-    '''
-    의사 코드
-    minhashed_virtual_vectors <= 배열
-
-    for payload in playloads
-        chunks <= payload를 청킹한 값
-        encode_pos <= chunks를 민해싱한 값
-
-        vector <= 비트맵
-        encoding <= 해싱값에 맞는 위치에 인코딩한 값
-
-        minhashed_virtual_vector에 vector 추가
-
-    minhashed_virtual_vectors 반환
-    '''
+def MV2(payloads, K, M):
 
     minhashed_virtual_vectors = []
 
     print('make minhashed vector')
     for payload in tqdm(payloads):
-        chunks = list(payload)  # AEchunking(payload, W=window_size)
+        chunks = list(payload)
         encode_pos = minHash(chunks, K) % M
 
         vector = np.zeros(M, dtype=np.int8)
@@ -41,3 +18,4 @@ def MV2(payloads, window_size, K, M):
         minhashed_virtual_vectors.append(vector)
 
     return minhashed_virtual_vectors
+
diff --git a/PE/GIPS/core/SG2.py b/PE/GIPS/core/SG2.py
@@ -7,16 +7,15 @@
 from tqdm import tqdm
 
 
-def SG2(payloads, window_size, vector_size, eps, minpts, ngram, hh1_size, hh2_size, hh3_size, ratio):
+def SG2(payloads, vector_size, eps, minpts, hh1_size, hh2_size, ratio):
     fine_vectors = []
 
     print('chunking')
     for payload in tqdm(payloads):
-        chunks = payload  # AEchunking(payload, window_size)
+        chunks = payload
         vector = np.zeros(vector_size, dtype=np.int8)
         for chunk in chunks:
-            idx = int(hashlib.md5(chunk.encode()).hexdigest(),
-                      16) % vector_size
+            idx = int(hashlib.md5(chunk.encode()).hexdigest(), 16) % vector_size
             vector[idx] += 1
 
         fine_vectors.append(vector)

diff --git a/PE/GIPS/main.py b/PE/GIPS/main.py
@@ -4,9 +4,9 @@
 
 
 def GIPS(str_feature,
-		 window_size, K, M, # MV2 파라미터
+		 K, M, # MV2 파라미터
 		 thetaJ,  # JIG 파라미터
-		 vector_size, eps, minpts, ngram, hh1_size, hh2_size, hh3_size, ratio # SG2, AWL 파라미터
+		 vector_size, eps, minpts, hh1_size, hh2_size, ratio # SG2
 		):
 
 	print(f'data no: {len(str_feature)}')
@@ -16,11 +16,10 @@ def GIPS(str_feature,
 		feature = list(feature)
 
 	# 빅 그룹 식별
-	minhashed_virtual_vectors = MV2(payloads=str_feature, window_size=window_size, K=K, M=M)
+	minhashed_virtual_vectors = MV2(payloads=str_feature, K=K, M=M)
 
 	big_group_indices = JIG(vectors=minhashed_virtual_vectors, thetaJ=thetaJ)
 
-
 	big_group_payloads = []
 	non_big_group_paylaods = []
 
@@ -31,7 +30,7 @@ def GIPS(str_feature,
 			non_big_group_paylaods.append(payload)
 
 	# 시그니처 생성
-	cluster_signatures = SG2(payloads=big_group_payloads, window_size=window_size, vector_size=vector_size, 
-							 eps=eps, minpts=minpts, ngram=ngram, hh1_size=hh1_size, hh2_size=hh2_size, hh3_size=hh3_size, ratio=ratio)
+	cluster_signatures = SG2(payloads=big_group_payloads, vector_size=vector_size, eps=eps, 
+						  minpts=minpts, hh1_size=hh1_size, hh2_size=hh2_size, ratio=ratio)
 
 	return cluster_signatures