From ac00dcfa7699bedc5874943181dbbd69c511fd5d Mon Sep 17 00:00:00 2001
From: chkp-eyalit <eyalit@checkpoint.com>
Date: Wed, 27 May 2020 10:51:10 +0300
Subject: [PATCH 1/4] [Setup] Can switch to use the latest sark

sark was pinned to 2.0 when we moved to Python 3. As sark is now
more developed, we can use the latest version without Python issues.
Solves one of the cases in issue #41.
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index c4d7195..b1f95c2 100755
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@
       url='https://github.com/CheckPointSW/Karta',
       license='MIT',
       packages=find_packages(),
-      install_requires=['elementals', 'sark==2.0', 'pydocstyle', 'flake8', 'click', 'scikit-learn'],
+      install_requires=['elementals', 'sark', 'pydocstyle', 'flake8', 'click', 'scikit-learn'],
       classifiers=[
                     "Programming Language :: Python :: 3",
                     "License :: OSI Approved :: MIT License (MIT License)",

From c1c6f451e01fee90abfa2f09a26962d74cbd8217 Mon Sep 17 00:00:00 2001
From: chkp-eyalit <eyalit@checkpoint.com>
Date: Wed, 27 May 2020 11:20:55 +0300
Subject: [PATCH 2/4] [Bug Fix] struct.unpack("L") varies in size

Although python's documentation says otherwise, the "L" format stands
for long, and could be 4/8 bytes, depending on the compilation. Changed
it to use "I" (int) so the size will be 4 bytes fixed.

Fixes a bug reported in issue #41.
---
 src/thumbs_up/utils/function.py         | 2 +-
 src/thumbs_up/utils/pattern_observer.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/thumbs_up/utils/function.py b/src/thumbs_up/utils/function.py
index 16234a5..57cbef9 100644
--- a/src/thumbs_up/utils/function.py
+++ b/src/thumbs_up/utils/function.py
@@ -65,7 +65,7 @@ def __init__(self, analyzer, feature_size, inner_offset, classifiers_start_offse
         self._mixed_classifiers = {}
         self._type_classifier   = None
         # seed the random generator
-        numpy.random.seed(seed=struct.unpack("L", ida_nalt.retrieve_input_file_md5()[:4])[0])
+        numpy.random.seed(seed=struct.unpack("I", ida_nalt.retrieve_input_file_md5()[:4])[0])
 
     def isFuncStart(self, ea):
         """Check if the given effective address is the start of a known function.
diff --git a/src/thumbs_up/utils/pattern_observer.py b/src/thumbs_up/utils/pattern_observer.py
index 0c9dc65..d786a1f 100644
--- a/src/thumbs_up/utils/pattern_observer.py
+++ b/src/thumbs_up/utils/pattern_observer.py
@@ -82,7 +82,7 @@ def decide(self):
         # Now check for a basic alignment rule
         seen_eas = list(map(lambda x: x[0], self._records))
         # Deterministic results per binary, but still random
-        random.seed(struct.unpack("L", ida_nalt.retrieve_input_file_md5()[:4])[0])
+        random.seed(struct.unpack("I", ida_nalt.retrieve_input_file_md5()[:4])[0])
         while True:
             # Check against two random candidates, and always make sure the representative isn't rare
             measure_candidate = seen_eas[random.randint(0, len(seen_eas) - 1)]

From c7c30ef02345cad9ff85a9463f0d4ef6b268291a Mon Sep 17 00:00:00 2001
From: chkp-eyalit <eyalit@checkpoint.com>
Date: Wed, 27 May 2020 11:29:33 +0300
Subject: [PATCH 3/4] [Enhancement] Make sure the random seed is arch-agnostic

Using "!I" instead of "I" to make sure the results will be the same
on both Big-Endian and Little-Endian machines. A bit on the safe side,
but it does no harm.
---
 src/thumbs_up/utils/function.py         | 2 +-
 src/thumbs_up/utils/pattern_observer.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/thumbs_up/utils/function.py b/src/thumbs_up/utils/function.py
index 57cbef9..40523fb 100644
--- a/src/thumbs_up/utils/function.py
+++ b/src/thumbs_up/utils/function.py
@@ -65,7 +65,7 @@ def __init__(self, analyzer, feature_size, inner_offset, classifiers_start_offse
         self._mixed_classifiers = {}
         self._type_classifier   = None
         # seed the random generator
-        numpy.random.seed(seed=struct.unpack("I", ida_nalt.retrieve_input_file_md5()[:4])[0])
+        numpy.random.seed(seed=struct.unpack("!I", ida_nalt.retrieve_input_file_md5()[:4])[0])
 
     def isFuncStart(self, ea):
         """Check if the given effective address is the start of a known function.
diff --git a/src/thumbs_up/utils/pattern_observer.py b/src/thumbs_up/utils/pattern_observer.py
index d786a1f..718c9b1 100644
--- a/src/thumbs_up/utils/pattern_observer.py
+++ b/src/thumbs_up/utils/pattern_observer.py
@@ -82,7 +82,7 @@ def decide(self):
         # Now check for a basic alignment rule
         seen_eas = list(map(lambda x: x[0], self._records))
         # Deterministic results per binary, but still random
-        random.seed(struct.unpack("I", ida_nalt.retrieve_input_file_md5()[:4])[0])
+        random.seed(struct.unpack("!I", ida_nalt.retrieve_input_file_md5()[:4])[0])
         while True:
             # Check against two random candidates, and always make sure the representative isn't rare
             measure_candidate = seen_eas[random.randint(0, len(seen_eas) - 1)]

From 0e3ab0e2a93979c756bc5f2b29033aaa8714fa56 Mon Sep 17 00:00:00 2001
From: chkp-eyalit <eyalit@checkpoint.com>
Date: Wed, 27 May 2020 14:24:38 +0300
Subject: [PATCH 4/4] [Bug Fix] Enforce logic applies only to active code types

Fixes the main bug in issue #41. When there was only 1 active code
type (THUMB) and we were scanning a line from the other code type (ARM)
we triggered an exception.

Scanned the code to make sure no prediction will be made on code
types which are not active at the moment of the decision.
---
 src/thumbs_up/analyzer_utils.py     |  5 +++--
 src/thumbs_up/analyzers/analyzer.py | 15 +++++++++++++--
 src/thumbs_up/utils/fptr.py         |  2 +-
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/thumbs_up/analyzer_utils.py b/src/thumbs_up/analyzer_utils.py
index b844253..ca5c4b0 100644
--- a/src/thumbs_up/analyzer_utils.py
+++ b/src/thumbs_up/analyzer_utils.py
@@ -153,8 +153,10 @@ def functionScan(analyzer, scs):
             if search_func or analyzer.switch_identifier.isSwitchCase(line.start_ea):
                 line = line.next
                 continue
+            original_code_type = analyzer.codeType(line.start_ea)
             # If this is code, check that it matches the start of a function, and make it a function
-            if line.is_code and analyzer.func_classifier.predictFunctionStartMixed(line.start_ea):
+            if line.is_code and analyzer.supportedCodeType(original_code_type) and \
+                        analyzer.func_classifier.predictFunctionStartMixed(line.start_ea):
                 if not ida_funcs.add_func(line.start_ea):
                     line = line.next
                 else:
@@ -168,7 +170,6 @@ def functionScan(analyzer, scs):
             # If unknown, check if a function and don't try to keep the same code type
             if line.is_unknown:
                 guess_code_type = analyzer.func_classifier.predictFunctionStartType(line.start_ea)
-                original_code_type = analyzer.codeType(line.start_ea)
                 if analyzer.func_classifier.predictFunctionStart(line.start_ea, guess_code_type):
                     if original_code_type != guess_code_type:
                         analyzer.setCodeType(line.start_ea, line.start_ea + 1, guess_code_type)
diff --git a/src/thumbs_up/analyzers/analyzer.py b/src/thumbs_up/analyzers/analyzer.py
index 935b78e..cd48994 100644
--- a/src/thumbs_up/analyzers/analyzer.py
+++ b/src/thumbs_up/analyzers/analyzer.py
@@ -247,7 +247,7 @@ def isValidCodePtr(self, ptr_ea):
             True iff the code pointer is valid
         """
         ptr_type = self.ptrCodeType(ptr_ea)
-        return self.isCodeAligned(self.cleanPtr(ptr_ea), ptr_type) and ptr_type in self.activeCodeTypes()
+        return self.isCodeAligned(self.cleanPtr(ptr_ea), ptr_type) and self.supportedCodeType(ptr_type)
 
     def hasCodeTypes(self):
         """Check if the given CPU has multiple code types.
@@ -289,9 +289,20 @@ def disableCodeType(self, code_type):
         Args:
             code_type (int): code type to be disabled
         """
-        if code_type in self._active_code_types:
+        if self.supportedCodeType(code_type):
             self._active_code_types.remove(code_type)
 
+    def supportedCodeType(self, code_type):
+        """Check if a given code_type is actively supported.
+
+        Args:
+            code_type (int): code type to be checked
+
+        Return Value:
+            True iff the code type is currently one of the active code types
+        """
+        return code_type in self._active_code_types
+
     def ptrCodeType(self, ptr_ea):
         """Extract the code type of the annotated pointer.
 
diff --git a/src/thumbs_up/utils/fptr.py b/src/thumbs_up/utils/fptr.py
index a74bacb..bfbbd7f 100644
--- a/src/thumbs_up/utils/fptr.py
+++ b/src/thumbs_up/utils/fptr.py
@@ -200,7 +200,7 @@ def locateDataPtrs(self, scs, sds):
                     continue
                 # check for a function ptr
                 value = self._analyzer.parseAdderss(cur_ea)
-                # make sure it is valid
+                # make sure it is valid (enforces that the code_type is active)
                 if self.isValidCodePtr(value, scs):
                     func_value = self._analyzer.cleanPtr(value)
                     code_type  = self._analyzer.ptrCodeType(value)