From bb9bc0c1b8d72fc34d2acce8421e15a00077e4c2 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 4 Nov 2024 19:07:22 +0000 Subject: [PATCH 1/9] Add class to describe image samples and loading logic for images from url --- src/guidellm/utils/images.py | 69 ++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 src/guidellm/utils/images.py diff --git a/src/guidellm/utils/images.py b/src/guidellm/utils/images.py new file mode 100644 index 0000000..5e96ce1 --- /dev/null +++ b/src/guidellm/utils/images.py @@ -0,0 +1,69 @@ +from PIL import Image +from bs4 import BeautifulSoup +from urllib.parse import urljoin, urlparse +from pydantic import Field, ConfigDict +from typing import List, Optional +from io import BytesIO + +from loguru import logger + +import requests + +from guidellm.config import settings +from guidellm.core.serializable import Serializable + +__all__ = ["load_images", "ImageDescriptor"] + +class ImageDescriptor(Serializable): + """ + A class to represent image data in serializable format. 
+ """ + model_config = ConfigDict(arbitrary_types_allowed=True) + + url: Optional[str] = Field(description="url address for image.") + image: Image.Image = Field(description="PIL image", exclude=True) + filename: Optional[int] = Field( + default=None, + description="Image filename.", + ) + + +def load_images(data: str) -> List[ImageDescriptor]: + """ + Load an HTML file from a path or URL + + :param data: the path or URL to load the HTML file from + :type data: Union[str, Path] + :return: Descriptor containing image url and the data in PIL.Image.Image format + :rtype: ImageDescriptor + """ + + images = [] + if not data: + return None + if isinstance(data, str) and data.startswith("http"): + response = requests.get(data, timeout=settings.request_timeout) + response.raise_for_status() + + soup = BeautifulSoup(response.text, 'html.parser') + for img_tag in soup.find_all("img"): + img_url = img_tag.get("src") + + if img_url: + # Handle relative URLs + img_url = urljoin(data, img_url) + + # Download the image + logger.debug("Loading image: {}", img_url) + img_response = requests.get(img_url) + img_response.raise_for_status() + + # Load image into Pillow + images.append( + ImageDescriptor( + url=img_url, + image=Image.open(BytesIO(img_response.content)), + ) + ) + + return images \ No newline at end of file From 59002b511339a22833a20a51023c40b680a1a3f5 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 4 Nov 2024 19:07:49 +0000 Subject: [PATCH 2/9] Add class to describe image samples and loading logic for images from url --- src/guidellm/utils/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/guidellm/utils/__init__.py b/src/guidellm/utils/__init__.py index 2fdd8ca..6f2f669 100644 --- a/src/guidellm/utils/__init__.py +++ b/src/guidellm/utils/__init__.py @@ -12,6 +12,7 @@ split_lines_by_punctuation, split_text, ) +from .images import load_images, ImageDescriptor from .transformers import ( load_transformers_dataset, resolve_transformers_dataset, 
From cb1f244ac5ce15dad230bbf84f541868b3ffa393 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 4 Nov 2024 19:10:01 +0000 Subject: [PATCH 3/9] Add url used to download images from for emulated requests --- src/guidellm/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/guidellm/config.py b/src/guidellm/config.py index c3d950e..df750ea 100644 --- a/src/guidellm/config.py +++ b/src/guidellm/config.py @@ -90,6 +90,7 @@ class EmulatedDataSettings(BaseModel): "force_new_line_punctuation": True, } ) + image_source: str = "https://www.gutenberg.org/cache/epub/1342/pg1342-images.html" class OpenAISettings(BaseModel): From 24e652721ef3fe8268754f5802cb49e878145384 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 4 Nov 2024 19:10:24 +0000 Subject: [PATCH 4/9] Add support to images in requests --- src/guidellm/backend/openai.py | 25 ++++++++++++++++++++++--- src/guidellm/core/request.py | 15 ++++++++++++++- src/guidellm/request/emulated.py | 17 +++++++++++++++-- 3 files changed, 51 insertions(+), 6 deletions(-) diff --git a/src/guidellm/backend/openai.py b/src/guidellm/backend/openai.py index 8c83f91..c740b34 100644 --- a/src/guidellm/backend/openai.py +++ b/src/guidellm/backend/openai.py @@ -1,4 +1,5 @@ from typing import AsyncGenerator, Dict, List, Optional +import io, base64 from loguru import logger from openai import AsyncOpenAI, OpenAI @@ -103,11 +104,11 @@ async def make_request( request_args.update(self._request_args) + messages = self._build_messages(request) + stream = await self._async_client.chat.completions.create( model=self.model, - messages=[ - {"role": "user", "content": request.prompt}, - ], + messages=messages, stream=True, **request_args, ) @@ -167,3 +168,21 @@ def validate_connection(self): except Exception as error: logger.error("Failed to validate OpenAI connection: {}", error) raise error + + def _build_messages(self, request: TextGenerationRequest) -> List[Dict]: + if request.number_images == 0: + messages = [{"role": 
"user", "content": request.prompt}] + else: + content = [] + for image in request.images: + stream = io.BytesIO() + im_format = image.image.format or "PNG" + image.image.save(stream, format=im_format) + im_b64 = base64.b64encode(stream.getvalue()).decode("ascii") + image_url = {"url": f"data:image/{im_format.lower()};base64,{im_b64}"} + content.append({"type": "image_url", "image_url": image_url}) + + content.append({"type": "text", "text": request.prompt}) + messages = [{"role": "user", "content": content}] + + return messages diff --git a/src/guidellm/core/request.py b/src/guidellm/core/request.py index 4f7315c..8f93b56 100644 --- a/src/guidellm/core/request.py +++ b/src/guidellm/core/request.py @@ -1,9 +1,10 @@ import uuid -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, List from pydantic import Field from guidellm.core.serializable import Serializable +from guidellm.utils import ImageDescriptor class TextGenerationRequest(Serializable): @@ -16,6 +17,10 @@ class TextGenerationRequest(Serializable): description="The unique identifier for the request.", ) prompt: str = Field(description="The input prompt for the text generation.") + images: Optional[List[ImageDescriptor]] = Field( + default=None, + description="Input images.", + ) prompt_token_count: Optional[int] = Field( default=None, description="The number of tokens in the input prompt.", @@ -29,6 +34,13 @@ class TextGenerationRequest(Serializable): description="The parameters for the text generation request.", ) + @property + def number_images(self) -> int: + if self.images is None: + return 0 + else: + return len(self.images) + def __str__(self) -> str: prompt_short = ( self.prompt[:32] + "..." 
@@ -41,4 +53,5 @@ def __str__(self) -> str: f"prompt={prompt_short}, prompt_token_count={self.prompt_token_count}, " f"output_token_count={self.output_token_count}, " f"params={self.params})" + f"images={self.number_images}" ) diff --git a/src/guidellm/request/emulated.py b/src/guidellm/request/emulated.py index 7d481cb..b7053de 100644 --- a/src/guidellm/request/emulated.py +++ b/src/guidellm/request/emulated.py @@ -11,7 +11,7 @@ from guidellm.config import settings from guidellm.core.request import TextGenerationRequest from guidellm.request.base import GenerationMode, RequestGenerator -from guidellm.utils import clean_text, filter_text, load_text, split_text +from guidellm.utils import clean_text, filter_text, load_text, split_text, load_images __all__ = ["EmulatedConfig", "EmulatedRequestGenerator", "EndlessTokens"] @@ -30,6 +30,7 @@ class EmulatedConfig: generated_tokens_variance (Optional[int]): Variance for generated tokens. generated_tokens_min (Optional[int]): Minimum number of generated tokens. generated_tokens_max (Optional[int]): Maximum number of generated tokens. + images (Optional[int]): Number of input images. 
""" @staticmethod @@ -47,7 +48,7 @@ def create_config(config: Optional[Union[str, Path, Dict]]) -> "EmulatedConfig": """ if not config: logger.debug("Creating default configuration") - return EmulatedConfig(prompt_tokens=1024, generated_tokens=256) + return EmulatedConfig(prompt_tokens=1024, generated_tokens=256, images=0) if isinstance(config, dict): logger.debug("Loading configuration from dict: {}", config) @@ -105,6 +106,8 @@ def create_config(config: Optional[Union[str, Path, Dict]]) -> "EmulatedConfig": generated_tokens_min: Optional[int] = None generated_tokens_max: Optional[int] = None + images: int = 0 + @property def prompt_tokens_range(self) -> Tuple[int, int]: """ @@ -327,6 +330,8 @@ def __init__( settings.emulated_data.filter_start, settings.emulated_data.filter_end, ) + if self._config.images > 0: + self._images = load_images(settings.emulated_data.image_source) self._rng = np.random.default_rng(random_seed) # NOTE: Must be after all the parameters since the queue population @@ -355,6 +360,7 @@ def create_item(self) -> TextGenerationRequest: logger.debug("Creating new text generation request") target_prompt_token_count = self._config.sample_prompt_tokens(self._rng) prompt = self.sample_prompt(target_prompt_token_count) + images = self.sample_images() prompt_token_count = len(self.tokenizer.tokenize(prompt)) output_token_count = self._config.sample_output_tokens(self._rng) logger.debug("Generated prompt: {}", prompt) @@ -363,6 +369,7 @@ def create_item(self) -> TextGenerationRequest: prompt=prompt, prompt_token_count=prompt_token_count, output_token_count=output_token_count, + images=images, ) def sample_prompt(self, tokens: int) -> str: @@ -395,3 +402,9 @@ def sample_prompt(self, tokens: int) -> str: right = mid return self._tokens.create_text(start_line_index, left) + + + def sample_images(self): + image_indices = self._rng.choice(len(self._images), size=self._config.images, replace=False) + + return [self._images[i] for i in image_indices] \ No 
newline at end of file From 394670999785536b696978eae411cbcf7c4583cd Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 4 Nov 2024 19:45:56 +0000 Subject: [PATCH 5/9] quality fixes --- src/guidellm/backend/openai.py | 7 ++++--- src/guidellm/core/request.py | 2 +- src/guidellm/request/emulated.py | 8 ++++---- src/guidellm/utils/__init__.py | 2 +- src/guidellm/utils/images.py | 27 +++++++++++++-------------- 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/guidellm/backend/openai.py b/src/guidellm/backend/openai.py index c740b34..f75bb3b 100644 --- a/src/guidellm/backend/openai.py +++ b/src/guidellm/backend/openai.py @@ -1,5 +1,6 @@ +import base64 +import io from typing import AsyncGenerator, Dict, List, Optional -import io, base64 from loguru import logger from openai import AsyncOpenAI, OpenAI @@ -182,7 +183,7 @@ def _build_messages(self, request: TextGenerationRequest) -> Dict: image_url = {"url": f"data:image/{im_format.lower()};base64,{im_b64}"} content.append({"type": "image_url", "image_url": image_url}) - content.append({"type": "text", "text": request.prompt}) + content.append({"type": "text", "text": request.prompt}) messages = [{"role": "user", "content": content}] - + return messages diff --git a/src/guidellm/core/request.py b/src/guidellm/core/request.py index 8f93b56..a1ff199 100644 --- a/src/guidellm/core/request.py +++ b/src/guidellm/core/request.py @@ -1,5 +1,5 @@ import uuid -from typing import Any, Dict, Optional, List +from typing import Any, Dict, List, Optional from pydantic import Field diff --git a/src/guidellm/request/emulated.py b/src/guidellm/request/emulated.py index b7053de..9dc3825 100644 --- a/src/guidellm/request/emulated.py +++ b/src/guidellm/request/emulated.py @@ -11,7 +11,7 @@ from guidellm.config import settings from guidellm.core.request import TextGenerationRequest from guidellm.request.base import GenerationMode, RequestGenerator -from guidellm.utils import clean_text, filter_text, load_text, 
split_text, load_images +from guidellm.utils import clean_text, filter_text, load_images, load_text, split_text __all__ = ["EmulatedConfig", "EmulatedRequestGenerator", "EndlessTokens"] @@ -402,9 +402,9 @@ def sample_prompt(self, tokens: int) -> str: right = mid return self._tokens.create_text(start_line_index, left) - - + + def sample_images(self): image_indices = self._rng.choice(len(self._images), size=self._config.images, replace=False) - return [self._images[i] for i in image_indices] \ No newline at end of file + return [self._images[i] for i in image_indices] diff --git a/src/guidellm/utils/__init__.py b/src/guidellm/utils/__init__.py index 6f2f669..1e51f22 100644 --- a/src/guidellm/utils/__init__.py +++ b/src/guidellm/utils/__init__.py @@ -1,3 +1,4 @@ +from .images import ImageDescriptor, load_images from .injector import create_report, inject_data from .progress import BenchmarkReportProgress from .text import ( @@ -12,7 +13,6 @@ split_lines_by_punctuation, split_text, ) -from .images import load_images, ImageDescriptor from .transformers import ( load_transformers_dataset, resolve_transformers_dataset, diff --git a/src/guidellm/utils/images.py b/src/guidellm/utils/images.py index 5e96ce1..5d73bc0 100644 --- a/src/guidellm/utils/images.py +++ b/src/guidellm/utils/images.py @@ -1,13 +1,12 @@ -from PIL import Image -from bs4 import BeautifulSoup -from urllib.parse import urljoin, urlparse -from pydantic import Field, ConfigDict -from typing import List, Optional from io import BytesIO - -from loguru import logger +from typing import List, Optional +from urllib.parse import urljoin import requests +from bs4 import BeautifulSoup +from loguru import logger +from PIL import Image +from pydantic import ConfigDict, Field from guidellm.config import settings from guidellm.core.serializable import Serializable @@ -19,14 +18,14 @@ class ImageDescriptor(Serializable): A class to represent image data in serializable format. 
""" model_config = ConfigDict(arbitrary_types_allowed=True) - + url: Optional[str] = Field(description="url address for image.") image: Image.Image = Field(description="PIL image", exclude=True) filename: Optional[int] = Field( default=None, description="Image filename.", ) - + def load_images(data: str) -> List[ImageDescriptor]: """ @@ -45,25 +44,25 @@ def load_images(data: str) -> List[ImageDescriptor]: response = requests.get(data, timeout=settings.request_timeout) response.raise_for_status() - soup = BeautifulSoup(response.text, 'html.parser') + soup = BeautifulSoup(response.text, "html.parser") for img_tag in soup.find_all("img"): img_url = img_tag.get("src") if img_url: # Handle relative URLs img_url = urljoin(data, img_url) - + # Download the image logger.debug("Loading image: {}", img_url) img_response = requests.get(img_url) img_response.raise_for_status() - + # Load image into Pillow images.append( ImageDescriptor( - url=img_url, + url=img_url, image=Image.open(BytesIO(img_response.content)), ) ) - return images \ No newline at end of file + return images From 7d93b020d34e28ebdf04e7bc39a13c867fa6ef97 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 4 Nov 2024 19:53:57 +0000 Subject: [PATCH 6/9] Quality fixes --- src/guidellm/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/guidellm/__init__.py b/src/guidellm/__init__.py index e562018..b10b445 100644 --- a/src/guidellm/__init__.py +++ b/src/guidellm/__init__.py @@ -6,6 +6,7 @@ # flake8: noqa import os + import transformers # type: ignore os.environ["TOKENIZERS_PARALLELISM"] = "false" # Silence warnings for tokenizers From a441dade284aa9e682c652430a739d7298c3e82e Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 4 Nov 2024 20:01:26 +0000 Subject: [PATCH 7/9] Quality fixes --- src/guidellm/request/emulated.py | 4 +++- src/guidellm/utils/__init__.py | 2 ++ src/guidellm/utils/images.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git 
a/src/guidellm/request/emulated.py b/src/guidellm/request/emulated.py index 9dc3825..f15387e 100644 --- a/src/guidellm/request/emulated.py +++ b/src/guidellm/request/emulated.py @@ -405,6 +405,8 @@ def sample_prompt(self, tokens: int) -> str: def sample_images(self): - image_indices = self._rng.choice(len(self._images), size=self._config.images, replace=False) + image_indices = self._rng.choice( + len(self._images), size=self._config.images, replace=False, + ) return [self._images[i] for i in image_indices] diff --git a/src/guidellm/utils/__init__.py b/src/guidellm/utils/__init__.py index 1e51f22..eb4931b 100644 --- a/src/guidellm/utils/__init__.py +++ b/src/guidellm/utils/__init__.py @@ -38,4 +38,6 @@ "resolve_transformers_dataset_split", "split_lines_by_punctuation", "split_text", + "ImageDescriptor", + "load_images", ] diff --git a/src/guidellm/utils/images.py b/src/guidellm/utils/images.py index 5d73bc0..5c5a727 100644 --- a/src/guidellm/utils/images.py +++ b/src/guidellm/utils/images.py @@ -65,4 +65,4 @@ def load_images(data: str) -> List[ImageDescriptor]: ) ) - return images + return images From 570670b6c2a24869a40635f0112af7d92da0e73c Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 5 Nov 2024 01:11:06 +0000 Subject: [PATCH 8/9] Quality fixes --- src/guidellm/backend/openai.py | 2 +- src/guidellm/utils/images.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/guidellm/backend/openai.py b/src/guidellm/backend/openai.py index f75bb3b..90d2791 100644 --- a/src/guidellm/backend/openai.py +++ b/src/guidellm/backend/openai.py @@ -179,7 +179,7 @@ def _build_messages(self, request: TextGenerationRequest) -> Dict: stream = io.BytesIO() im_format = image.image.format or "PNG" image.image.save(stream, format=im_format) - im_b64 = base64.b64encode(stream.getvalue()).decode("ascii") + im_b64 = base64.b64encode(stream.getvalue()).decode("utf-8") image_url = {"url": f"data:image/{im_format.lower()};base64,{im_b64}"} 
content.append({"type": "image_url", "image_url": image_url}) diff --git a/src/guidellm/utils/images.py b/src/guidellm/utils/images.py index 5c5a727..5d73bc0 100644 --- a/src/guidellm/utils/images.py +++ b/src/guidellm/utils/images.py @@ -65,4 +65,4 @@ def load_images(data: str) -> List[ImageDescriptor]: ) ) - return images + return images From 984da28e4423f1726888b6a37de30621029d3622 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 5 Nov 2024 02:43:04 +0000 Subject: [PATCH 9/9] Add new dependencies --- .pre-commit-config.yaml | 2 ++ pyproject.toml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2a085bb..6bcf150 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,6 +27,8 @@ repos: pyyaml, requests, rich, + beautifulsoup4, + pillow, transformers, # dev dependencies diff --git a/pyproject.toml b/pyproject.toml index 6ab2c6e..b83abfd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,8 @@ dependencies = [ "pyyaml>=6.0.0", "requests", "rich", + "beautifulsoup4", + "pillow", "transformers", ]