peepdf update and test (kevoreilly#2491)

rkoumis · Feb 12, 2025 · df2152c · df2152c
1 parent f89c890
commit df2152c
Show file tree

Hide file tree

Showing 2 changed files with 52 additions and 9 deletions.
diff --git a/lib/cuckoo/common/integrations/peepdf.py b/lib/cuckoo/common/integrations/peepdf.py
@@ -44,8 +44,8 @@ def _set_base_uri(pdf):
     try:
         for version in range(pdf.updates + 1):
             trailer, _ = pdf.trailer[version]
-            if trailer is not None:
-                elem = trailer.dict.getElementByName("/Root")
+            if trailer:
+                elem = trailer.getTrailerDictionary().getElementByName("/Root")
                 if elem:
                     elem = _get_obj_val(pdf, version, elem)
                 if elem:
@@ -60,7 +60,7 @@ def _set_base_uri(pdf):
                     return elem.getValue()
     except Exception as e:
         log.exception(e)
-        return ""
+        return
 
 
 def peepdf_parse(filepath: str, pdfresult: Dict[str, Any]) -> Dict[str, Any]:
@@ -92,16 +92,16 @@ def peepdf_parse(filepath: str, pdfresult: Dict[str, Any]) -> Dict[str, Any]:
             metadata = metatmp
         objects = body.objects
         for index in objects:
-            oid = objects[index].id
+            oid = objects[index].thisId
             offset = objects[index].offset
             size = objects[index].size
-            details = objects[index].object
+            details = objects[index].obj
             obj_data = {
                 "Object ID": oid,
                 "Offset": offset,
                 "Size": size,
             }
-            if details.type == "stream":
+            if details.objType == "stream":
                 decoded_stream = details.decodedStream
                 if isJavascript(decoded_stream.strip()):
                     jsdata = None
@@ -129,7 +129,7 @@ def peepdf_parse(filepath: str, pdfresult: Dict[str, Any]) -> Dict[str, Any]:
                         ret_data += tmp
                     obj_data["Data"] = ret_data
                     retobjects.append(obj_data)
-            elif details.type == "dictionary" and details.containsJScode:
+            elif details.objType == "dictionary" and details.containsJScode:
                 js_elem = details.getElementByName("/JS")
                 if js_elem:
                     jsdata = None
@@ -157,7 +157,7 @@ def peepdf_parse(filepath: str, pdfresult: Dict[str, Any]) -> Dict[str, Any]:
                         ret_data += tmp
                     obj_data["Data"] = ret_data
                     retobjects.append(obj_data)
-            elif details.type == "dictionary" and details.hasElement("/A"):
+            elif details.objType == "dictionary" and details.hasElement("/A"):
                 # verify it to be a link type annotation
                 subtype_elem = details.getElementByName("/Subtype")
                 type_elem = details.getElementByName("/Type")
@@ -169,7 +169,7 @@ def peepdf_parse(filepath: str, pdfresult: Dict[str, Any]) -> Dict[str, Any]:
                     continue
                 a_elem = details.getElementByName("/A")
                 a_elem = _get_obj_val(pdf, i, a_elem)
-                if a_elem and a_elem.type == "dictionary" and a_elem.hasElement("/URI"):
+                if a_elem and a_elem.getType() == "dictionary" and a_elem.hasElement("/URI"):
                     uri_elem = a_elem.getElementByName("/URI")
                     if uri_elem:
                         uri_elem = _get_obj_val(pdf, i, uri_elem)

diff --git a/tests/test_peepdf.py b/tests/test_peepdf.py
@@ -0,0 +1,43 @@
+from pathlib import Path
+
+import pytest
+
+from lib.cuckoo.common.integrations.peepdf import peepdf_parse
+
+data_dir = Path(__file__).parent / "data" / "malware"  # sample directory next to this test module
+pdf_path = data_dir / "ad6cedb0d1244c1d740bf5f681850a275c4592281cdebb491ce533edd9d6a77d"
+
+expected_result = {  # compared after the test drops the first stream's "Data" entry
+    "Info": {
+        "Creator": "Scribus 1.3.3.12",
+        "Producer": "Scribus PDF Library 1.3.3.12",
+        "Author": "",
+    },
+    "Dates": [],
+    "Keywords": {},
+    "JSStreams": [
+        {
+            "Object ID": 13,
+            "Offset": 872,
+            "Size": 1255,
+        }
+    ],
+    "All_URLs": [],
+}
+
+pdfresult = {"Info": {}, "Dates": [], "Keywords": {}, "JSStreams": [], "All_URLs": []}
+
+
+@pytest.mark.skipif(not data_dir.exists(), reason="Required data file is not present")
+class TestPeepdf:
+    """Tests for peepdf_parse; skipped when the sample data directory is missing."""
+
+    @pytest.mark.skipif(not pdf_path.exists(), reason="Required data file is not present")
+    def test_peepdf_parse_valid_pdf(self):
+        """Parse the sample PDF and compare the result, minus the first stream's "Data"."""
+        # Fresh dict per call, in case peepdf_parse fills its argument in place.
+        fresh = {"Info": {}, "Dates": [], "Keywords": {}, "JSStreams": [], "All_URLs": []}
+        result = peepdf_parse(str(pdf_path), fresh)
+        assert result["JSStreams"], "expected at least one JavaScript stream"
+        del result["JSStreams"][0]["Data"]  # the "Data" entry is not part of the comparison
+        assert result == expected_result