diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
deleted file mode 100644
index 4ea5d8a..0000000
--- a/.github/workflows/publish.yml
+++ /dev/null
@@ -1,40 +0,0 @@
-name: Publish Python distribution to PyPI
-
-on:
-  push:
-    tags:
-      - v*
-
-jobs:
-  build:
-    name: Build distribution
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.8"
-
-      - name: Install pypa/build nad pypa/twine
-        run: >-
-          python3 -m pip install build twine --user
-
-      - name: Build a binary wheel
-        run: python3 -m build
-
-  publish-to-pypi:
-    name: Publish Python distribution to PyPI
-    needs:
-      - build
-    if: startsWith(github.ref, 'refs/tags/v')
-    runs-on: ubuntu-latest
-    steps:
-      - name: 🚀📦 Publish to PyPI
-        env:
-          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
-          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
-
-        run: python -m twine upload dist/*
diff --git a/README.md b/README.md
index cafa03d..0a19a8c 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
@@ -9,14 +9,14 @@ Scale Efficiently: Evaluate and Optimize Your LLM Deployments for Real-World Inference
 
-[![GitHub Release](https://img.shields.io/github/release/neuralmagic/guidellm.svg?label=Version)](https://github.com/neuralmagic/guidellm/releases) [![Documentation](https://img.shields.io/badge/Documentation-8A2BE2?logo=read-the-docs&logoColor=%23ffffff&color=%231BC070)](https://github.com/neuralmagic/guidellm/tree/main/docs) [![License](https://img.shields.io/github/license/neuralmagic/guidellm.svg)](https://github.com/neuralmagic/guidellm/blob/main/LICENSE) [![PyPI Release](https://img.shields.io/pypi/v/guidellm.svg?label=PyPI%20Release)](https://pypi.python.org/pypi/guidellm) [![Pypi Release](https://img.shields.io/pypi/v/guidellm-nightly.svg?label=PyPI%20Nightly)](https://pypi.python.org/pypi/guidellm-nightly) [![Python Versions](https://img.shields.io/pypi/pyversions/guidellm.svg?label=Python)](https://pypi.python.org/pypi/guidellm) [![Nightly Build](https://img.shields.io/github/actions/workflow/status/neuralmagic/guidellm/nightly.yml?branch=main&label=Nightly%20Build)](https://github.com/neuralmagic/guidellm/actions/workflows/nightly.yml)
+[![GitHub Release](https://img.shields.io/github/release/neuralmagic/guidellm.svg?label=Version)](https://github.com/neuralmagic/guidellm/releases) [![Documentation](https://img.shields.io/badge/Documentation-8A2BE2?logo=read-the-docs&logoColor=%23ffffff&color=%231BC070)](https://github.com/neuralmagic/guidellm/tree/main/docs) [![License](https://img.shields.io/github/license/neuralmagic/guidellm.svg)](https://github.com/neuralmagic/guidellm/blob/main/LICENSE) [![PyPI Release](https://img.shields.io/pypi/v/guidellm.svg?label=PyPI%20Release)](https://pypi.python.org/pypi/guidellm) [![Pypi Release](https://img.shields.io/pypi/v/guidellm-nightly.svg?label=PyPI%20Nightly)](https://pypi.python.org/pypi/guidellm-nightly) [![Python Versions](https://img.shields.io/badge/Python-3.8--3.12-orange)](https://pypi.python.org/pypi/guidellm) [![Nightly Build](https://img.shields.io/github/actions/workflow/status/neuralmagic/guidellm/nightly.yml?branch=main&label=Nightly%20Build)](https://github.com/neuralmagic/guidellm/actions/workflows/nightly.yml)
 
 ## Overview
@@ -98,11 +98,11 @@ After the evaluation is completed, GuideLLM will summarize the results, includin
 The output results will start with a summary of the evaluation, followed by the requests data for each benchmark run. For example, the start of the output will look like the following:
 
-
+
 
 The end of the output will include important performance summary metrics such as request latency, time to first token (TTFT), inter-token latency (ITL), and more:
 
-
+
 
 #### 4. Use the Results
diff --git a/src/guidellm/core/report.py b/src/guidellm/core/report.py
index b6791e4..c48eed5 100644
--- a/src/guidellm/core/report.py
+++ b/src/guidellm/core/report.py
@@ -147,19 +147,15 @@ def _create_benchmark_report_data_tokens_summary(
     for benchmark in report.benchmarks_sorted:
         table.add_row(
             _benchmark_rate_id(benchmark),
-            f"{benchmark.prompt_token_distribution.mean:.2f}",
+            f"{benchmark.prompt_token:.2f}",
             ", ".join(
                 f"{percentile:.1f}"
-                for percentile in benchmark.prompt_token_distribution.percentiles(
-                    [1, 5, 50, 95, 99]
-                )
+                for percentile in benchmark.prompt_token_percentiles
             ),
-            f"{benchmark.output_token_distribution.mean:.2f}",
+            f"{benchmark.output_token:.2f}",
             ", ".join(
                 f"{percentile:.1f}"
-                for percentile in benchmark.output_token_distribution.percentiles(
-                    [1, 5, 50, 95, 99]
-                )
+                for percentile in benchmark.output_token_percentiles
             ),
         )
     logger.debug("Created data tokens summary table for the report.")
@@ -181,7 +177,7 @@ def _create_benchmark_report_dist_perf_summary(
         "Benchmark",
         "Request Latency [1%, 5%, 10%, 50%, 90%, 95%, 99%] (sec)",
         "Time to First Token [1%, 5%, 10%, 50%, 90%, 95%, 99%] (ms)",
-        "Inter Token Latency [1%, 5%, 10%, 50%, 90% 95%, 99%] (ms)",
+        "Inter Token Latency [1%, 5%, 10%, 50%, 90%, 95%, 99%] (ms)",
         title="[magenta]Performance Stats by Benchmark[/magenta]",
         title_style="bold",
         title_justify="left",
@@ -193,21 +189,15 @@
             _benchmark_rate_id(benchmark),
             ", ".join(
                 f"{percentile:.2f}"
-                for percentile in benchmark.request_latency_distribution.percentiles(
-                    [1, 5, 10, 50, 90, 95, 99]
-                )
+                for percentile in benchmark.request_latency_percentiles
             ),
             ", ".join(
                 f"{percentile * 1000:.1f}"
-                for percentile in benchmark.ttft_distribution.percentiles(
-                    [1, 5, 10, 50, 90, 95, 99]
-                )
+                for percentile in benchmark.time_to_first_token_percentiles
             ),
             ", ".join(
                 f"{percentile * 1000:.1f}"
-                for percentile in benchmark.itl_distribution.percentiles(
-                    [1, 5, 10, 50, 90, 95, 99]
-                )
+                for percentile in benchmark.inter_token_latency_percentiles
             ),
         )
     logger.debug("Created distribution performance summary table for the report.")
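Editor's note: the report tables above now read fixed percentile lists (for example `benchmark.request_latency_percentiles`) instead of calling `Distribution.percentiles([...])` inline at render time. As a rough illustration of what such a helper returns, here is a minimal, self-contained percentile function using linear interpolation. It is only a sketch of the idea, not guidellm's `Distribution` implementation, and the sample latencies are made up.

```python
from typing import List


def percentiles(data: List[float], points: List[float]) -> List[float]:
    """Linearly interpolated percentiles, e.g. points=[1, 5, 10, 50, 90, 95, 99]."""
    if not data:
        return [0.0] * len(points)
    values = sorted(data)
    results = []
    for point in points:
        rank = (point / 100) * (len(values) - 1)  # fractional index into sorted data
        lower = int(rank)
        upper = min(lower + 1, len(values) - 1)
        fraction = rank - lower
        results.append(values[lower] * (1 - fraction) + values[upper] * fraction)
    return results


# Example: request latencies in seconds -> the values a report row would join with ", ".
latencies = [0.84, 1.02, 1.31, 1.97, 4.20]
print(", ".join(f"{value:.2f}" for value in percentiles(latencies, [1, 50, 99])))
```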
diff --git a/src/guidellm/core/result.py b/src/guidellm/core/result.py
index f218784..aebd176 100644
--- a/src/guidellm/core/result.py
+++ b/src/guidellm/core/result.py
@@ -2,7 +2,7 @@
 from typing import Any, Dict, List, Literal, Optional, Union
 
 from loguru import logger
-from pydantic import Field
+from pydantic import Field, computed_field
 
 from guidellm.core.distribution import Distribution
 from guidellm.core.request import TextGenerationRequest
@@ -221,6 +221,7 @@ def __iter__(self):
         """
         return iter(self.results)
 
+    @computed_field  # type: ignore[misc]
     @property
     def request_count(self) -> int:
         """
@@ -231,6 +232,7 @@ def request_count(self) -> int:
         """
         return len(self.results)
 
+    @computed_field  # type: ignore[misc]
     @property
     def error_count(self) -> int:
         """
@@ -241,6 +243,7 @@ def error_count(self) -> int:
         """
         return len(self.errors)
 
+    @computed_field  # type: ignore[misc]
     @property
     def total_count(self) -> int:
         """
@@ -251,6 +254,7 @@ def total_count(self) -> int:
         """
         return self.request_count + self.error_count
 
+    @computed_field  # type: ignore[misc]
     @property
     def start_time(self) -> Optional[float]:
         """
@@ -264,6 +268,7 @@ def start_time(self) -> Optional[float]:
 
         return self.results[0].start_time
 
+    @computed_field  # type: ignore[misc]
     @property
     def end_time(self) -> Optional[float]:
         """
@@ -277,6 +282,7 @@ def end_time(self) -> Optional[float]:
 
         return self.results[-1].end_time
 
+    @computed_field  # type: ignore[misc]
     @property
     def duration(self) -> float:
         """
@@ -290,6 +296,7 @@ def duration(self) -> float:
 
         return self.end_time - self.start_time
 
+    @computed_field  # type: ignore[misc]
     @property
     def completed_request_rate(self) -> float:
         """
@@ -303,6 +310,7 @@ def completed_request_rate(self) -> float:
 
         return len(self.results) / self.duration
 
+    @computed_field  # type: ignore[misc]
     @property
     def request_latency(self) -> float:
         """
@@ -332,6 +340,19 @@ def request_latency_distribution(self) -> Distribution:
             ]
         )
 
+    @computed_field  # type: ignore[misc]
+    @property
+    def request_latency_percentiles(self) -> List[float]:
+        """
+        Get standard percentiles of request latency in seconds.
+
+        :return: List of percentile request latency in seconds
+        :rtype: List[float]
+        """
+        return self.request_latency_distribution.percentiles([1, 5, 10, 50, 90, 95, 99])
+
+
+    @computed_field  # type: ignore[misc]
     @property
     def time_to_first_token(self) -> float:
         """
@@ -361,6 +382,20 @@ def ttft_distribution(self) -> Distribution:
             ]
         )
 
+    @computed_field  # type: ignore[misc]
+    @property
+    def time_to_first_token_percentiles(self) -> List[float]:
+        """
+        Get standard percentiles for time taken to decode the first token
+        in seconds.
+
+        :return: List of percentile time taken to decode the first token
+            in seconds.
+        :rtype: List[float]
+        """
+        return self.ttft_distribution.percentiles([1, 5, 10, 50, 90, 95, 99])
+
+    @computed_field  # type: ignore[misc]
     @property
     def inter_token_latency(self) -> float:
         """
@@ -388,6 +423,18 @@ def itl_distribution(self) -> Distribution:
             ]
         )
 
+    @computed_field  # type: ignore[misc]
+    @property
+    def inter_token_latency_percentiles(self) -> List[float]:
+        """
+        Get standard percentiles for the time between tokens in seconds.
+
+        :return: List of percentiles for the average time between tokens.
+        :rtype: List[float]
+        """
+        return self.itl_distribution.percentiles([1, 5, 10, 50, 90, 95, 99])
+
+    @computed_field  # type: ignore[misc]
     @property
     def output_token_throughput(self) -> float:
         """
@@ -403,6 +450,17 @@ def output_token_throughput(self) -> float:
 
         return total_tokens / self.duration
 
+    @computed_field  # type: ignore[misc]
+    @property
+    def prompt_token(self) -> float:
+        """
+        Get the average number of prompt tokens.
+
+        :return: The average number of prompt tokens.
+        :rtype: float
+        """
+        return self.prompt_token_distribution.mean
+
     @property
     def prompt_token_distribution(self) -> Distribution:
         """
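Editor's note: the recurring pattern in this file, `@computed_field` stacked on top of `@property`, is pydantic v2's mechanism for including a derived value in serialization. The property is computed from other fields but appears in `model_dump()` / `model_dump_json()` like a stored field, which is what lets these summary metrics travel with a saved report; a plain `@property` would still work in Python code but would be dropped from dumped output. A minimal standalone sketch of that behavior (the `LatencySummary` model is illustrative only, not a guidellm class):

```python
from typing import List

from pydantic import BaseModel, Field, computed_field


class LatencySummary(BaseModel):
    """Illustrative model only; not part of guidellm."""

    samples: List[float] = Field(default_factory=list)

    @computed_field  # type: ignore[misc]
    @property
    def mean_latency(self) -> float:
        # Computed on access, but serialized alongside the regular fields.
        return sum(self.samples) / len(self.samples) if self.samples else 0.0


summary = LatencySummary(samples=[0.8, 1.2, 1.6])
print(summary.mean_latency)  # ~1.2
print(summary.model_dump())  # {'samples': [0.8, 1.2, 1.6], 'mean_latency': ...}
```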
@@ -413,6 +471,28 @@ def prompt_token_distribution(self) -> Distribution:
         """
         return Distribution(data=[result.prompt_token_count for result in self.results])
 
+    @computed_field  # type: ignore[misc]
+    @property
+    def prompt_token_percentiles(self) -> List[float]:
+        """
+        Get standard percentiles for number of prompt tokens.
+
+        :return: List of percentiles of number of prompt tokens.
+        :rtype: List[float]
+        """
+        return self.prompt_token_distribution.percentiles([1, 5, 50, 95, 99])
+
+    @computed_field  # type: ignore[misc]
+    @property
+    def output_token(self) -> float:
+        """
+        Get the average number of output tokens.
+
+        :return: The average number of output tokens.
+        :rtype: float
+        """
+        return self.output_token_distribution.mean
+
     @property
     def output_token_distribution(self) -> Distribution:
         """
@@ -423,6 +503,18 @@ def output_token_distribution(self) -> Distribution:
         """
         return Distribution(data=[result.output_token_count for result in self.results])
 
+    @computed_field  # type: ignore[misc]
+    @property
+    def output_token_percentiles(self) -> List[float]:
+        """
+        Get standard percentiles for number of output tokens.
+
+        :return: List of percentiles of number of output tokens.
+        :rtype: List[float]
+        """
+        return self.output_token_distribution.percentiles([1, 5, 50, 95, 99])
+
+    @computed_field  # type: ignore[misc]
     @property
     def overloaded(self) -> bool:
         if (
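Editor's note: one practical consequence of the new computed fields is that a benchmark serialized to JSON now carries the percentile lists directly, so downstream tooling can read them without reconstructing a `Distribution`. A hypothetical consumer sketch follows; the `benchmark.json` filename and the flat layout are assumptions made for illustration, and only the field name comes from the `request_latency_percentiles` property added above.

```python
import json

# Hypothetical: assumes a benchmark was dumped to JSON with its computed fields included.
with open("benchmark.json") as handle:
    benchmark = json.load(handle)

# Mirrors the [1, 5, 10, 50, 90, 95, 99] ordering used by request_latency_percentiles.
labels = ["p1", "p5", "p10", "p50", "p90", "p95", "p99"]
for label, seconds in zip(labels, benchmark["request_latency_percentiles"]):
    print(f"{label}: {seconds:.3f} s")
```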