Upload API takes longer time than uploading through the UI. #1853

llmwesee · 2024-09-25T08:46:49Z

The documentation provides sample code for uploading a file from the client side through the upload_api. However, I have observed that uploading files through this API takes longer than uploading through the UI. I would like to know the reason for this discrepancy and how I can make the process faster. Also, if multiple clients send files through this API, how is this handled?

import os
import ast
import time
from tqdm import tqdm
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from gradio_client import Client

class DocumentUploader:
    """Client-side helper that uploads, ingests and queries documents.

    Every server call goes through one ``Client`` instance created for the
    given host URL.
    """

    def __init__(self, host_url: str, api_key: str = 'EMPTY'):
        """Open a client for *host_url* and remember *api_key*.

        NOTE(review): ``api_key`` is stored on the instance, but none of the
        methods of this class pass it to the server -- confirm whether it
        should be forwarded.
        """
        self.client = Client(host_url)
        self.api_key = api_key

    def upload_document(self, local_file_path: str) -> str:
        """Send a local file to ``/upload_api`` and return its server path.

        The endpoint result is unpacked as a pair; the second element is
        returned. The progress bar is cosmetic: it jumps straight to 100
        once the call has returned rather than tracking transferred bytes.
        """
        bar_label = f"Uploading {os.path.basename(local_file_path)}"
        with tqdm(total=100, desc=bar_label, unit='%', ncols=80) as progress:
            upload_result = self.client.predict(
                local_file_path,
                api_name='/upload_api',
            )
            _, remote_path = upload_result
            progress.update(100)
        return remote_path

    def add_document_and_ocr(self, server_file_path: str, loaders: list) -> dict:
        """Register an uploaded file through ``/add_file_api``.

        The elements of *loaders* are appended, in order, after a fixed
        positional prefix. As in ``upload_document``, the progress bar only
        marks completion of the call. Returns whatever the endpoint returns.
        """
        # Fixed prefix sent with every request.
        # presumably: collection name, chunk flag, chunk size, embed flag
        # -- verify against the server's API description.
        positional_args = (server_file_path, "UserData", True, 512, True, *loaders)
        bar_label = f"Processing {os.path.basename(server_file_path)}"
        with tqdm(total=100, desc=bar_label, unit='%', ncols=80) as progress:
            outcome = self.client.predict(*positional_args, api_name='/add_file_api')
            progress.update(100)
        return outcome

    def query_document(self, langchain_mode: str, instruction: str) -> str:
        """Ask a question via ``/submit_nochat_api`` and return the answer.

        The request is the ``str()`` form of a dict holding both arguments;
        the reply is parsed as a Python literal and its ``'response'`` entry
        is returned.
        """
        request = {'langchain_mode': langchain_mode, 'instruction': instruction}
        raw_reply = self.client.predict(str(request), api_name='/submit_nochat_api')
        parsed_reply = ast.literal_eval(raw_reply)
        return parsed_reply['response']

    def process_file(self, file_path: str):
        """Upload one local file and then register it with fixed loaders."""
        # Loader arguments forwarded unchanged to add_document_and_ocr.
        loader_args = [
            ['Caption', 'CaptionBlip2', 'Pix2Struct', 'OCR', 'DocTR'],
            ['PyMuPDF', 'Unstructured', 'PyPDF', 'TryHTML', 'OCR'],
            None,
            None,
        ]
        print(f"Processing single file: {file_path}")
        remote_path = self.upload_document(file_path)
        self.add_document_and_ocr(remote_path, loader_args)

    def query_uploaded_documents(self, instruction: str):
        """Query the "UserData" collection and print the answer."""
        answer = self.query_document("UserData", instruction)
        print("Query response:", answer)


class NewFileHandler(FileSystemEventHandler):
    """Event handler that processes each new PDF created in the watched folder."""

    def __init__(self, uploader: DocumentUploader):
        """Store the uploader whose ``process_file`` is called for new PDFs."""
        super().__init__()
        self.uploader = uploader

    def on_created(self, event):
        """Process a newly created PDF file; ignore directories and other files.

        Failures while processing one file are reported and swallowed so that
        later events are still handled.
        """
        if event.is_directory:
            return

        path = event.src_path
        # Compare case-insensitively so "REPORT.PDF" is picked up as well.
        if not path.lower().endswith(".pdf"):
            return

        print(f"New file detected: {path}")
        # NOTE(review): the creation event can arrive while the file is still
        # being written/copied -- confirm whether a short settle delay is
        # needed before uploading.
        try:
            self.uploader.process_file(path)
        except Exception as exc:
            # This callback runs on the observer's dispatch thread; letting
            # the exception escape can stop delivery of all further events.
            print(f"Failed to process {path}: {exc}")


def monitor_folder(folder_path: str, uploader: DocumentUploader):
    """Watch *folder_path* (non-recursively) and process new files as they appear.

    Blocks the calling thread until the user presses Ctrl+C or the observer
    thread stops on its own. The observer is always stopped and joined before
    this function returns or propagates an exception.
    """
    event_handler = NewFileHandler(uploader)
    observer = Observer()
    observer.schedule(event_handler, folder_path, recursive=False)
    observer.start()
    print(f"Monitoring folder: {folder_path}")

    try:
        # Waiting on the observer (instead of sleeping forever) lets the loop
        # end if the observer thread dies, rather than idling silently.
        while observer.is_alive():
            observer.join(timeout=1)
    except KeyboardInterrupt:
        # Ctrl+C is the expected way to end monitoring; exit quietly.
        pass
    finally:
        observer.stop()
        observer.join()


# Usage example
# Guarded so that importing this file does not connect to the server or
# start the (blocking) folder monitor as a side effect.
if __name__ == "__main__":
    # Placeholder address; replace with the URL of the running server.
    host_url = "http://xx.xx.x.xx:7860/"
    folder_path = "data"

    uploader = DocumentUploader(host_url)

    # Start monitoring the folder; this call blocks while the folder is watched.
    monitor_folder(folder_path, uploader)

The text was updated successfully, but these errors were encountered:

pseudotensor · 2024-09-29T00:43:21Z

Hi, maybe the API and the UI use different default options for which handlers (e.g. DocTR, Unstructured, OCR, vision, etc.) are applied. It would be good to compare the logs for each.

Note that the API and the UI run the same code: Gradio generates the API directly from the UI definition.

h2oai deleted a comment Oct 24, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Upload API takes longer time than uploading through the UI. #1853

Upload API takes longer time than uploading through the UI. #1853

llmwesee commented Sep 25, 2024

pseudotensor commented Sep 29, 2024

Upload API takes longer time than uploading through the UI. #1853

Upload API takes longer time than uploading through the UI. #1853

Comments

llmwesee commented Sep 25, 2024

pseudotensor commented Sep 29, 2024