Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

For image batching, run parallel jobs instead of putting many images in one batch; some models are just not good enough to handle multiple images #1737

Open
pseudotensor opened this issue Jul 13, 2024 · 1 comment
Assignees
Labels
type/feature Feature request

Comments

@pseudotensor
Copy link
Collaborator

import cv2
from openai import OpenAI

# Client for the chat-completions endpoint under test; `<ip>` is a
# placeholder that must be replaced with the address of the serving host.
client = OpenAI(base_url='http://<ip>:80/v1')
# Name of the model the request below is sent to.
model="OpenGVLab/InternVL2-26B"
# Alternative client/model pair (disabled).
#client = OpenAI(base_url='http://<ip>:80/v1')
#model = 'OpenGVLab/InternVL-Chat-V1-5'

# Response instructions followed by the actual question; this string is sent
# as the text part of the user message, after the image parts.
prompt = """<response_instructions>
- Act as a keen observer with a sharp eye for detail.
- Analyze the content within the images.
- Provide insights based on your observations.
- Avoid making up facts.
- Finally, according to our chat history, above documents, above figure captions, or given images, generate a well-structured response.
</response_instructions>
What tower do you see in the image?
"""

from PIL import Image
import base64
import requests
from io import BytesIO


# The encoding function I linked previously - but we actually don't use this function in the API server
def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str:
    """Serialize a PIL image and return the bytes as base64 text.

    The image is saved to an in-memory buffer using ``format``; when the
    format is ``'JPEG'`` it is converted to RGB mode first.  The buffer
    contents are then base64-encoded and decoded to a ``str``.
    """
    # Only the JPEG path converts the image; other formats are saved as-is.
    to_save = image.convert('RGB') if format == 'JPEG' else image

    sink = BytesIO()
    to_save.save(sink, format)
    raw_bytes = sink.getvalue()
    return base64.b64encode(raw_bytes).decode('utf-8')


# This is what we use in the API server to load the base64 string to image
def load_image_from_base64(image: str):
    """Decode base64 text and open the resulting bytes as a PIL image."""
    decoded = base64.b64decode(image)
    stream = BytesIO(decoded)
    return Image.open(stream)


# Local JPEG files used for the reproduction; each one is base64-encoded below
# and attached to the user message as its own image_url part.
image1 = '/tmp/image_file_764ae7bd-6b02-4ffb-b9d6-83e754c30952.jpeg'
image2 = '/tmp/image_file_1bfb88ea-a545-4b1f-a31f-051dbb90a378.jpeg'
image3 = '/tmp/image_file_ac5589e7-92a3-470f-a933-40d6bad38052.jpeg'

#from PIL import Image


def remove_padding(image_path, output_path, background_color=(255, 255, 255)):
    """Crop near-white padding from an image and write the result to disk.

    The image is converted to grayscale and thresholded at 240, so every
    pixel brighter than that counts as background.  External contours of the
    remaining (non-background) pixels are located, and the image is cropped
    to the contour whose bounding box has the largest area.

    Args:
        image_path: Path of the image to read with ``cv2.imread``.
        output_path: Path the cropped image is written to with
            ``cv2.imwrite``.
        background_color: Currently unused; detection relies on the fixed
            grayscale threshold.  Kept so existing callers keep working.

    Raises:
        OSError: If ``image_path`` cannot be read or decoded.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with a clear message rather than inside cvtColor.
        raise OSError(f"Could not read image: {image_path}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Pixels brighter than 240 become 255 (background), all others 0.
    _, binary = cv2.threshold(gray, 240, 255, cv2.THRESH_BINARY)

    # Invert so that content is non-zero, which is what findContours traces.
    content_mask = cv2.bitwise_not(binary)

    contours, _ = cv2.findContours(
        content_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    if len(contours) == 0:
        # Entirely background: there is nothing to crop to, so write the
        # image unchanged instead of failing on an empty contour list.
        cv2.imwrite(output_path, image)
        return

    # Bounding box (x, y, w, h) with the largest area; max() keeps the first
    # one on ties, matching a strict ">" comparison scan.
    x, y, w, h = max(
        (cv2.boundingRect(contour) for contour in contours),
        key=lambda box: box[2] * box[3],
    )

    cropped_image = image[y:y + h, x:x + w]
    cv2.imwrite(output_path, cropped_image)


# Example usage: change the condition to True to crop the padding from each
# image first and send the cropped copies (written next to the originals with
# the 'b.jpg' suffix) instead of the original files.
if False:
    ext = 'b.jpg'
    remove_padding(image1, image1 + ext)
    remove_padding(image2, image2 + ext)
    remove_padding(image3, image3 + ext)
else:
    ext = ''


def _read_file_base64(path):
    """Return the contents of the file at ``path`` as base64 text."""
    # The context manager closes the handle deterministically instead of
    # leaving it to garbage collection.
    with open(path, 'rb') as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


image1_64 = _read_file_base64(image1 + ext)
image2_64 = _read_file_base64(image2 + ext)
image3_64 = _read_file_base64(image3 + ext)

# System prompt; not sent at the moment (its message entry below is disabled).
system_prompt = "You are h2oGPTe, an expert question-answering AI system created by H2O.ai that performs like GPT-4 by OpenAI."

# One image_url part per encoded image, in order, each as a JPEG data URL.
image_parts = [
    {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,' + encoded}}
    for encoded in (image1_64, image2_64, image3_64)
]

# The text prompt comes after all image parts.
user_content = image_parts + [{'type': 'text', 'text': prompt}]

messages = [
    #{'role': 'system', 'content': system_prompt},
    {'role': 'user', 'content': user_content},
]

# Request parameters: at most 300 generated tokens, temperature 0.0.
request_options = {
    'model': model,
    'messages': messages,
    'max_tokens': 300,
    'temperature': 0.0,
}

response = client.chat.completions.create(**request_options)

# Show the first returned choice.
first_choice = response.choices[0]
print(first_choice)

gives:

The image does not show a tower. Instead, it shows two separate items:\n\n1. A receipt from a shopping store.\n2. A cake with a message congratulating Kate and Duke on their upcoming arrival.\n\nIf you have any specific questions about these items, please let me know!
@pseudotensor
Copy link
Collaborator Author


image_file_ac5589e7-92a3-470f-a933-40d6bad38052
image_file_1bfb88ea-a545-4b1f-a31f-051dbb90a378
image_file_764ae7bd-6b02-4ffb-b9d6-83e754c30952

@pseudotensor pseudotensor self-assigned this Jul 13, 2024
@pseudotensor pseudotensor added the type/feature Feature request label Jul 13, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
type/feature Feature request
Projects
None yet
Development

No branches or pull requests

1 participant