handle bytestring

run-llama · Jan 3, 2024 · fc9c95f · fc9c95f
1 parent d45de6c
commit fc9c95f
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 3 deletions.
diff --git a/llama_hub/file/pdf/README.md b/llama_hub/file/pdf/README.md
@@ -4,7 +4,7 @@ This loader extracts the text from a local PDF file using the `PyPDF2` Python pa
 
 ## Usage
 
-To use this loader, you need to pass in a `Path` to a local file.
+To use this loader, you need to pass in a `Path` to a local file or a PDF byte stream.
 
 ```python
 from pathlib import Path

diff --git a/llama_hub/file/pdf/base.py b/llama_hub/file/pdf/base.py
@@ -21,7 +21,14 @@ def load_data(
             file = Path(file)
 
         # Open the file if it's not already open, else use it as it is
-        context = open(file, "rb") if isinstance(file, Path) else file
+        if isinstance(file, Path):
+            context = open(file, "rb")
+            if extra_info:
+                extra_info.update({"file_name": file.name})
+            else:
+                extra_info = {"file_name": file.name}
+        else:
+            context = file
 
         with context as fp:
             # Create a PDF object
@@ -36,7 +43,7 @@ def load_data(
                 # Extract the text from the page
                 page_text = pdf.pages[page].extract_text()
                 page_label = pdf.page_labels[page]
-                metadata = {"page_label": page_label, "file_name": file.name}
+                metadata = {"page_label": page_label}
 
                 if extra_info is not None:
                     metadata.update(extra_info)