✨Deepsparse Backend implementation #29

Status: Open. Wants to merge 24 commits into `main` from `parfeniukink/features/deepsparse-backend`.
Commits (24):

- c3abc8d WIP (Aug 22, 2024)
- a6d9a05 ✅ Tests are fixed (Aug 22, 2024)
- d116c0c 📌 deepsparse is added to dependencies (Aug 22, 2024)
- c000dbf ✨ deepsparse backend integration is added (Aug 22, 2024)
- 52e1d3b deepsparse package limitations are applied (Aug 26, 2024)
- 7218795 ⚰️ removed `pytest.mark.asyncio()` due to pytest-asyncio module (Aug 27, 2024)
- a5357ca 📝 fixed class example (Aug 27, 2024)
- 68381a5 🧵 rollback `pytest.mark.asyncio` fixtures (Aug 28, 2024)
- 5acb3a8 ✨ Deepsparse Backend integration first implementation (Aug 28, 2024)
- 45e07d0 code quality is provided (Aug 28, 2024)
- 1753469 Merge branch 'main' into parfeniukink/features/deepsparse-backend (Aug 28, 2024)
- 1f1e038 fit Deepsparse Backend to work with new Backend abstraction (Aug 28, 2024)
- ce1c3ba 🔧 `GUIDELLM__LLM_MODEL` shared across all the backends (Aug 29, 2024)
- 8e88bae Test emulated data source constant -> settings value (Aug 29, 2024)
- 75e708b 💄 mdformat is happy (Aug 29, 2024)
- 3c03961 Merge branch 'main' into parfeniukink/features/deepsparse-backend (Aug 29, 2024)
- 913253f ✅ Tests are fixed according to a new Backend base implementation (Aug 29, 2024)
- e376ed9 🔨 tox tests include `deepsparse` dependency (Aug 30, 2024)
- 3a2c6c1 🏷️ Type annotations are added (Aug 30, 2024)
- 74a6dfd 🐛 Assert with config values instead of constants (Aug 30, 2024)
- 1a53951 📌 .[deepsparse] dependency is skipped if Python>3.11 (Aug 30, 2024)
- 39ffcb3 🚚 DeepsparseBackend is moved to another module (Aug 30, 2024)
- 29e38e4 ✅ Deepsparse tests are ignored if Python>=3.12 (Aug 30, 2024)
- 4b3b4b5 💚 Linters are happy (Aug 30, 2024)
40 changes: 27 additions & 13 deletions DEVELOPING.md
@@ -14,20 +14,31 @@ Before you begin, ensure you have the following installed:
- `pip` (Python package installer)
- `git` (version control system)

### Installation
### Clone the repository:

1. Clone the repository:
```sh
git clone https://github.com/neuralmagic/guidellm.git
cd guidellm
```

```bash
git clone https://github.com/neuralmagic/guidellm.git
cd guidellm
```
### Install dependencies:

2. Install the required dependencies:
All the dependencies are specified in the `pyproject.toml` file. You can install only the required dependencies, or include optional dependency groups as well.

```bash
pip install -e .[dev]
```
Install the required dependencies along with the optional `dev` dependencies:

```bash
git clone https://github.com/neuralmagic/guidellm.git
cd guidellm
pip install -e .[dev]
```

Some backends, such as `deepsparse`, have additional software constraints (for example, supported Python versions). To install the dependencies for a specific backend, run:

```sh
pip install -e .[deepsparse]
# or pip install -e '.[deepsparse]'
```
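To sanity-check an optional backend install, one quick smoke test (a sketch, not an official command) is to import the backend subpackage, which runs its guards:

```python
# A sketch: importing the subpackage triggers the Python-version and
# dependency guards defined in src/guidellm/backend/deepsparse/__init__.py.
from guidellm.backend.deepsparse import DeepsparseBackend  # noqa: F401

print("deepsparse backend is available")
```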

## Project Structure

@@ -46,8 +57,9 @@ guidellm/
└── README.md
```

- **src/guidellm/**: Main source code for the project.
- **tests/**: Test cases categorized into unit, integration, and end-to-end tests.
- `pyproject.toml`: Project metadata
- `src/guidellm/`: Main source code for the project.
- `tests/`: Test cases categorized into unit, integration, and end-to-end tests.

## Development Environment Setup

@@ -234,12 +246,14 @@ The project configuration entrypoint is represented by the lazy-loaded `settings` singleton
The project is fully configurable with environment variables. All the default values and

```py
class NestedIntoLogging(BaseModel):
class Nested(BaseModel):
    nested: str = "default value"


class LoggingSettings(BaseModel):
    # ...

    disabled: bool = False
    nested: Nested = Nested()


class Settings(BaseSettings):
```
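As a sketch of how this nesting resolves at runtime (the `GUIDELLM__LOGGING__DISABLED` variable appears in `src/guidellm/config.py` later in this PR; the `NESTED` path and attribute names are hypothetical illustrations based on the models above):

```python
import os

# Each nesting level maps to a "__"-delimited segment of the variable name,
# assuming the GUIDELLM__ prefix shown elsewhere in this PR.
os.environ["GUIDELLM__LOGGING__DISABLED"] = "true"
os.environ["GUIDELLM__LOGGING__NESTED__NESTED"] = "overridden"

from guidellm.config import settings  # noqa: E402  (lazy-loaded singleton)

assert settings.logging.disabled is True
assert settings.logging.nested.nested == "overridden"
```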
20 changes: 15 additions & 5 deletions pyproject.toml
@@ -71,6 +71,9 @@ dev = [
    "types-requests~=2.32.0",
    "types-toml",
]
deepsparse = [
    "deepsparse; python_version < '3.12'",
]


[project.entry-points.console_scripts]
@@ -104,6 +107,10 @@ exclude = ["venv", ".tox"]
# Check: https://mypy.readthedocs.io/en/latest/config_file.html#import-discovery
follow_imports = 'silent'

[[tool.mypy.overrides]]
module = ["deepsparse.*", "transformers.*"]
ignore_missing_imports = true


[tool.ruff]
line-length = 88
@@ -117,11 +124,14 @@ indent-style = "space"
[tool.ruff.lint]
ignore = [
    "PLR0913",
    "PLR2004", # allow numbers without constants definitions
    "RET505", # allow `else` block after `if (condition): return value` line
    "TCH001",
    "COM812",
    "ISC001",
    "TCH002",
    "PLW1514", # allow Path.open without encoding
    "S311", # allow standard pseudo-random generators
]
select = [
@@ -177,19 +187,19 @@ select = [
    "FIX", # flake8-fixme: detects FIXMEs and other temporary comments that should be resolved
]

[tool.ruff.lint.extend-per-file-ignores]
"tests/**/*.py" = [

[tool.ruff.lint.per-file-ignores]
"tests/*" = [
    "S101", # asserts allowed in tests
    "S105", # allow hardcoded passwords in tests
    "S106", # allow hardcoded passwords in tests
    "ARG", # Unused function args allowed in tests
    "PLR2004", # Magic value used in comparison
    "TCH002", # No import only type checking in tests
    "SLF001", # enable private member access in tests
    "S105", # allow hardcoded passwords in tests
    "S311", # allow standard pseudo-random generators in tests
    "PT011", # allow generic exceptions in tests
    "N806", # allow uppercase variable names in tests
    "PGH003", # allow general ignores in tests
    "S106", # allow hardcoded passwords in tests
    "PLR0915", # allow complex statements in tests
]

9 changes: 1 addition & 8 deletions src/guidellm/backend/__init__.py
@@ -1,10 +1,3 @@
from .base import Backend, BackendEngine, BackendEnginePublic, GenerativeResponse
from .openai import OpenAIBackend

__all__ = [
    "Backend",
    "BackendEngine",
    "BackendEnginePublic",
    "GenerativeResponse",
    "OpenAIBackend",
]
__all__ = ["Backend", "BackendEngine", "BackendEnginePublic", "GenerativeResponse"]
9 changes: 5 additions & 4 deletions src/guidellm/backend/base.py
@@ -15,7 +15,7 @@
__all__ = ["Backend", "BackendEngine", "BackendEnginePublic", "GenerativeResponse"]


BackendEnginePublic = Literal["openai_server"]
BackendEnginePublic = Literal["openai_server", "deepsparse"]
BackendEngine = Union[BackendEnginePublic, Literal["test"]]


@@ -117,9 +117,10 @@ def __init__(self, type_: BackendEngine, target: str, model: str):
        :param target: The target URL for the backend.
        :param model: The model used by the backend.
        """
        self._type = type_
        self._target = target
        self._model = model

        self._type: BackendEngine = type_
        self._target: str = target
        self._model: str = model

        self.test_connection()

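The `@Backend.register(backend_type=...)` decorator used by the new Deepsparse module implies a type-keyed registry on `Backend`. A minimal sketch of that pattern (an illustration with hypothetical names, not the project's actual code):

```python
from typing import Any, Callable, Dict, Type


class RegistryExample:
    """Toy stand-in for the Backend registration pattern."""

    _registry: Dict[str, Type["RegistryExample"]] = {}

    @classmethod
    def register(cls, backend_type: str) -> Callable:
        def decorator(impl: Type["RegistryExample"]) -> Type["RegistryExample"]:
            cls._registry[backend_type] = impl  # key the class by its type name
            return impl

        return decorator

    @classmethod
    def create(cls, backend_type: str, **kwargs: Any) -> "RegistryExample":
        if backend_type not in cls._registry:
            raise ValueError(f"Unknown backend type: {backend_type}")
        return cls._registry[backend_type](**kwargs)
```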
28 changes: 28 additions & 0 deletions src/guidellm/backend/deepsparse/__init__.py
@@ -0,0 +1,28 @@
"""
This package encapsulates the "Deepsparse Backend" implementation.

ref: https://github.com/neuralmagic/deepsparse

The `deepsparse` package supports Python 3.6 through Python 3.11,
while `guidellm` requires at least Python 3.8. The safe range of Python
versions for the Deepsparse Backend implementation is therefore
Python 3.8 through Python 3.11.

Finally, ensure that the `deepsparse` package is installed.
"""

from guidellm.utils import check_python_version, module_is_available

check_python_version(min_version="3.8", max_version="3.11")
module_is_available(
    module="deepsparse",
    helper=(
        "`deepsparse` package is not available. "
        "Please try `pip install -e '.[deepsparse]'`"
    ),
)

from .backend import DeepsparseBackend  # noqa: E402

__all__ = ["DeepsparseBackend"]
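`check_python_version` and `module_is_available` come from `guidellm.utils`; the real helpers may differ, but a rough sketch of what such guards can look like:

```python
import importlib.util
import sys
from typing import Tuple


def _parse(version: str) -> Tuple[int, ...]:
    return tuple(int(part) for part in version.split("."))


def check_python_version(min_version: str, max_version: str) -> None:
    """Raise if the interpreter falls outside [min_version, max_version]."""
    if not (_parse(min_version) <= sys.version_info[:2] <= _parse(max_version)):
        raise RuntimeError(
            f"Python {min_version}..{max_version} is required, but "
            f"{sys.version_info.major}.{sys.version_info.minor} is running"
        )


def module_is_available(module: str, helper: str) -> None:
    """Raise a helpful error if `module` cannot be imported."""
    if importlib.util.find_spec(module) is None:
        raise ModuleNotFoundError(helper)
```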
121 changes: 121 additions & 0 deletions src/guidellm/backend/deepsparse/backend.py
@@ -0,0 +1,121 @@
from typing import Any, AsyncGenerator, Dict, List, Optional

from deepsparse import Pipeline, TextGeneration
from loguru import logger

from guidellm.backend import Backend, GenerativeResponse
from guidellm.config import settings
from guidellm.core import TextGenerationRequest


@Backend.register(backend_type="deepsparse")
class DeepsparseBackend(Backend):
    """
    A Deepsparse backend implementation for generative AI results.
    """

    def __init__(self, model: Optional[str] = None, **request_args):
        self._request_args: Dict[str, Any] = request_args
        self._model = self._get_model(model)
        self.pipeline: Pipeline = TextGeneration(model=self._model)

        super().__init__(type_="deepsparse", model=self._model, target="not used")

        logger.info(f"Deepsparse Backend uses model {self._model}")

    def _get_model(self, model_from_cli: Optional[str] = None) -> str:
        """Provide the model according to the following priority:
        1. the function argument (comes from the CLI)
        2. the `GUIDELLM__LLM_MODEL` environment variable
        3. `self.default_model` from `self.available_models`
        """

        if model_from_cli is not None:
            return model_from_cli
        elif settings.llm_model is not None:
            logger.info(
                "Using Deepsparse model from environment variable: "
                f"{settings.llm_model}"
            )
            return settings.llm_model
        else:
            logger.info(f"Using default Deepsparse model: {self.default_model}")
            logger.info(
                "To customize the model either set the 'GUIDELLM__LLM_MODEL' "
                "environment variable or set the CLI argument '--model'"
            )
            return self.default_model

    async def make_request(
        self, request: TextGenerationRequest
    ) -> AsyncGenerator[GenerativeResponse, None]:
        """
        Make a request to the Deepsparse Python API client.

        :param request: The result request to submit.
        :type request: TextGenerationRequest
        :return: An asynchronous iterator over the generative responses.
        :rtype: AsyncGenerator[GenerativeResponse, None]
        """

        logger.debug(
            f"Making request to Deepsparse backend with prompt: {request.prompt}"
        )

        token_count = 0
        request_args = {
            **self._request_args,
            "streaming": True,
            "max_new_tokens": request.output_token_count,
        }

        if not (output := self.pipeline(prompt=request.prompt, **request_args)):
            yield GenerativeResponse(
                type_="final",
                prompt=request.prompt,
                prompt_token_count=request.prompt_token_count,
                output_token_count=token_count,
            )
            return

        for generation in output.generations:
            if not (token := generation.text):
                yield GenerativeResponse(
                    type_="final",
                    prompt=request.prompt,
                    prompt_token_count=request.prompt_token_count,
                    output_token_count=token_count,
                )
                return
            else:
                token_count += 1
                yield GenerativeResponse(
                    type_="token_iter",
                    add_token=token,
                    prompt=request.prompt,
                    prompt_token_count=request.prompt_token_count,
                    output_token_count=token_count,
                )

        yield GenerativeResponse(
            type_="final",
            prompt=request.prompt,
            prompt_token_count=request.prompt_token_count,
            output_token_count=token_count,
        )

    def available_models(self) -> List[str]:
        """
        Get the available models for the backend.

        :return: A list of available models.
        :rtype: List[str]
        """

        # WARNING: The default model from the documentation is defined here
        return ["hf:mgoin/TinyStories-33M-quant-deepsparse"]

    def _token_count(self, text: str) -> int:
        token_count = len(text.split())
        logger.debug(f"Token count for text '{text}': {token_count}")
        return token_count
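For context, a usage sketch of this backend; the `TextGenerationRequest` constructor fields are assumptions inferred from the attributes accessed in `make_request` above:

```python
import asyncio

from guidellm.backend.deepsparse import DeepsparseBackend
from guidellm.core import TextGenerationRequest


async def main() -> None:
    # The default model is the one listed in available_models() above.
    backend = DeepsparseBackend(model="hf:mgoin/TinyStories-33M-quant-deepsparse")

    # `prompt` mirrors the attribute used by make_request; other fields
    # (token counts) are assumed to be optional.
    request = TextGenerationRequest(prompt="Once upon a time")

    async for response in backend.make_request(request):
        if response.type_ == "token_iter" and response.add_token:
            print(response.add_token, end="", flush=True)


if __name__ == "__main__":
    asyncio.run(main())
```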
4 changes: 2 additions & 2 deletions src/guidellm/config.py
@@ -128,6 +128,7 @@ class Settings(BaseSettings):
```sh
export GUIDELLM__LOGGING__DISABLED=true
export GUIDELLM__OPENAI__API_KEY=******
export GUIDELLM__LLM_MODEL=******
```
"""

@@ -141,6 +142,7 @@ class Settings(BaseSettings):

    # general settings
    env: Environment = Environment.PROD
    llm_model: str = "mistralai/Mistral-7B-Instruct-v0.3"
    request_timeout: int = 30
    max_concurrency: int = 512
    num_sweep_profiles: int = 9
@@ -152,8 +154,6 @@

    # Request settings
    openai: OpenAISettings = OpenAISettings()

    # Report settings
    report_generation: ReportGenerationSettings = ReportGenerationSettings()

    @model_validator(mode="after")
14 changes: 8 additions & 6 deletions src/guidellm/executor/profile_generator.py
@@ -190,12 +190,14 @@ def next(self, current_report: TextGenerationBenchmarkReport) -> Optional[Profile]:
        elif self.mode == "sweep":
            profile = self.create_sweep_profile(
                self.generated_count,
                sync_benchmark=current_report.benchmarks[0]
                if current_report.benchmarks
                else None,
                throughput_benchmark=current_report.benchmarks[1]
                if len(current_report.benchmarks) > 1
                else None,
                sync_benchmark=(
                    current_report.benchmarks[0] if current_report.benchmarks else None
                ),
                throughput_benchmark=(
                    current_report.benchmarks[1]
                    if len(current_report.benchmarks) > 1
                    else None
                ),
            )
        else:
            err = ValueError(f"Invalid mode: {self.mode}")
12 changes: 3 additions & 9 deletions src/guidellm/scheduler/base.py
@@ -227,9 +227,7 @@ async def run(self) -> AsyncGenerator[SchedulerResult, None]:
        count_total = (
            self.max_number
            if self.max_number
            else round(self.max_duration)
            if self.max_duration
            else 0
            else round(self.max_duration) if self.max_duration else 0
        )

        # yield initial result for progress tracking
@@ -246,9 +244,7 @@ async def run(self) -> AsyncGenerator[SchedulerResult, None]:
        count_completed = (
            min(run_count, self.max_number)
            if self.max_number
            else round(time.time() - start_time)
            if self.max_duration
            else 0
            else round(time.time() - start_time) if self.max_duration else 0
        )

        yield SchedulerResult(
@@ -267,9 +263,7 @@
            count_completed=(
                benchmark.request_count + benchmark.error_count
                if self.max_number
                else round(time.time() - start_time)
                if self.max_duration
                else 0
                else round(time.time() - start_time) if self.max_duration else 0
            ),
            benchmark=benchmark,
        )