feat(deploy): add automated OmniParser deployment
* add working omniparser deploy.py, Dockerfile, pyproject.toml, README.md, .env.example, .dockerignore, client.py
abrichr authored Feb 19, 2025
1 parent acdbb7b commit 8acd7c0
Showing 7 changed files with 1,027 additions and 0 deletions.
3 changes: 3 additions & 0 deletions deploy/.env.example
@@ -0,0 +1,3 @@
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
AWS_REGION=
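
These credentials are presumably consumed by `deploy.py` (not expanded in this view) when provisioning the EC2 instance. A minimal sketch of one way to load them, assuming `python-dotenv` and `boto3`; the helper name is hypothetical and not part of this commit:

```
# Hypothetical sketch: build a boto3 session from deploy/.env.
# Assumes python-dotenv and boto3; the actual loading logic in deploy.py is not shown here.
import os

import boto3
from dotenv import load_dotenv


def get_aws_session() -> boto3.Session:  # hypothetical helper, not part of the commit
    load_dotenv()  # reads AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION into the environment
    return boto3.Session(
        aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
        aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
        region_name=os.environ["AWS_REGION"],
    )
```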
10 changes: 10 additions & 0 deletions deploy/README.md
@@ -0,0 +1,10 @@
```
# First time setup
cd deploy
uv venv
source .venv/bin/activate
uv pip install -e .
# Subsequent usage
python deploy/models/omniparser/deploy.py start
```
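
Once `start` has brought the service up, the client added in this commit (`client.py`, below) can exercise it. A minimal programmatic sketch; the import path is inferred from the file layout, and the URL assumes the default port 8000 from the Dockerfile:

```
# Sketch: drive the OmniParser client added in this commit against a running server.
# The import path, image path, and URL are assumptions; adjust to your layout and host.
from deploy.models.omniparser.client import parse_image

parse_image(image_path="screenshot.png", server_url="http://localhost:8000")
```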
20 changes: 20 additions & 0 deletions deploy/deploy/models/omniparser/.dockerignore
@@ -0,0 +1,20 @@
__pycache__
*.pyc
*.pyo
*.pyd
.Python
env
pip-log.txt
pip-delete-this-directory.txt
.tox
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.log
.pytest_cache
.env
.venv
.DS_Store
59 changes: 59 additions & 0 deletions deploy/deploy/models/omniparser/Dockerfile
@@ -0,0 +1,59 @@
FROM nvidia/cuda:12.3.1-devel-ubuntu22.04

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
    git-lfs \
    wget \
    libgl1 \
    libglib2.0-0 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* \
    && git lfs install

RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \
    bash miniconda.sh -b -p /opt/conda && \
    rm miniconda.sh
ENV PATH="/opt/conda/bin:$PATH"

RUN conda create -n omni python=3.12 && \
    echo "source activate omni" > ~/.bashrc
ENV CONDA_DEFAULT_ENV=omni
ENV PATH="/opt/conda/envs/omni/bin:$PATH"

WORKDIR /app

RUN git clone https://github.com/microsoft/OmniParser.git && \
    cd OmniParser && \
    git lfs install && \
    git lfs pull

WORKDIR /app/OmniParser

RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
    pip uninstall -y opencv-python opencv-python-headless && \
    pip install --no-cache-dir opencv-python-headless==4.8.1.78 && \
    pip install -r requirements.txt && \
    pip install huggingface_hub fastapi uvicorn

# Download V2 weights
RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
    mkdir -p /app/OmniParser/weights && \
    cd /app/OmniParser && \
    rm -rf weights/icon_detect weights/icon_caption weights/icon_caption_florence && \
    for folder in icon_caption icon_detect; do \
        huggingface-cli download microsoft/OmniParser-v2.0 --local-dir weights --repo-type model --include "$folder/*"; \
    done && \
    mv weights/icon_caption weights/icon_caption_florence

# Pre-download OCR models during build
RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
    cd /app/OmniParser && \
    python3 -c "import easyocr; reader = easyocr.Reader(['en']); print('Downloaded EasyOCR model')" && \
    python3 -c "from paddleocr import PaddleOCR; ocr = PaddleOCR(lang='en', use_angle_cls=False, use_gpu=False, show_log=False); print('Downloaded PaddleOCR model')"

CMD ["python3", "/app/OmniParser/omnitool/omniparserserver/omniparserserver.py", \
    "--som_model_path", "/app/OmniParser/weights/icon_detect/model.pt", \
    "--caption_model_path", "/app/OmniParser/weights/icon_caption_florence", \
    "--device", "cuda", \
    "--BOX_TRESHOLD", "0.05", \
    "--host", "0.0.0.0", \
    "--port", "8000"]
128 changes: 128 additions & 0 deletions deploy/deploy/models/omniparser/client.py
@@ -0,0 +1,128 @@
"""Client module for interacting with the OmniParser server."""

import base64
import fire
import requests

from loguru import logger
from PIL import Image, ImageDraw


def image_to_base64(image_path: str) -> str:
    """Convert an image file to a base64 string.

    Args:
        image_path: Path to the image file

    Returns:
        str: Base64 encoded string of the image
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def plot_results(
    original_image_path: str,
    som_image_base64: str,
    parsed_content_list: list[dict[str, list[float] | str]],
) -> None:
    """Plot parsing results on the original image.

    Args:
        original_image_path: Path to the original image
        som_image_base64: Base64 encoded SOM image
        parsed_content_list: List of parsed content with bounding boxes
    """
    # Open original image
    image = Image.open(original_image_path)
    width, height = image.size

    # Create drawable image
    draw = ImageDraw.Draw(image)

    # Draw bounding boxes and labels
    for item in parsed_content_list:
        # Get normalized coordinates and convert to pixel coordinates
        x1, y1, x2, y2 = item["bbox"]
        x1 = int(x1 * width)
        y1 = int(y1 * height)
        x2 = int(x2 * width)
        y2 = int(y2 * height)

        label = item["content"]

        # Draw rectangle
        draw.rectangle([(x1, y1), (x2, y2)], outline="red", width=2)

        # Draw label background
        text_bbox = draw.textbbox((x1, y1), label)
        draw.rectangle(
            [text_bbox[0] - 2, text_bbox[1] - 2, text_bbox[2] + 2, text_bbox[3] + 2],
            fill="white",
        )

        # Draw label text
        draw.text((x1, y1), label, fill="red")

    # Show image
    image.show()


def parse_image(
    image_path: str,
    server_url: str,
) -> None:
    """Parse an image using the OmniParser server.

    Args:
        image_path: Path to the image file
        server_url: URL of the OmniParser server
    """
    # Remove trailing slash from server_url if present
    server_url = server_url.rstrip("/")

    # Convert image to base64
    base64_image = image_to_base64(image_path)

    # Prepare request
    url = f"{server_url}/parse/"
    payload = {"base64_image": base64_image}

    try:
        # First, check if the server is available
        probe_url = f"{server_url}/probe/"
        probe_response = requests.get(probe_url)
        probe_response.raise_for_status()
        logger.info("Server is available")

        # Make request to API
        response = requests.post(url, json=payload)
        response.raise_for_status()

        # Parse response
        result = response.json()
        som_image_base64 = result["som_image_base64"]
        parsed_content_list = result["parsed_content_list"]

        # Plot results
        plot_results(image_path, som_image_base64, parsed_content_list)

        # Print latency
        logger.info(f"API Latency: {result['latency']:.2f} seconds")

    except requests.exceptions.ConnectionError:
        logger.error(f"Error: Could not connect to server at {server_url}")
        logger.error("Please check if the server is running and the URL is correct")
    except requests.exceptions.RequestException as e:
        logger.error(f"Error making request to API: {e}")
    except Exception as e:
        logger.error(f"Error: {e}")


def main() -> None:
    """Main entry point for the client application."""
    fire.Fire(parse_image)


if __name__ == "__main__":
    main()
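
For reference, each entry of `parsed_content_list` consumed by `plot_results` pairs a normalized bounding box with a text label. An illustrative item shape (the values below are made up):

```
# Illustrative item consumed by plot_results; coordinates are normalized to [0, 1].
example_item = {
    "bbox": [0.10, 0.20, 0.35, 0.28],  # x1, y1, x2, y2 as fractions of image width/height
    "content": "Submit button",        # label drawn next to the box
}
```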