From 7da4f0bd44ec13f87ef535a607122786b4b285ca Mon Sep 17 00:00:00 2001
From: David Yastremsky
Date: Fri, 22 Nov 2024 16:08:12 -0800
Subject: [PATCH 1/2] Disable TRT-LLM echo by default

---
 genai-perf/genai_perf/inputs/converters/tensorrtllm_converter.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/genai-perf/genai_perf/inputs/converters/tensorrtllm_converter.py b/genai-perf/genai_perf/inputs/converters/tensorrtllm_converter.py
index 551b05d7..e30c0961 100644
--- a/genai-perf/genai_perf/inputs/converters/tensorrtllm_converter.py
+++ b/genai-perf/genai_perf/inputs/converters/tensorrtllm_converter.py
@@ -60,6 +60,7 @@ def convert(
                 "model": model_name,
                 "text_input": [text],
                 "max_tokens": [DEFAULT_TENSORRTLLM_MAX_TOKENS],  # default
+                "exclude_input_from_output": True,  # default
             }
             self._add_request_params(payload, config)
             request_body["data"].append(payload)

From a2f62cdb454cdfef401452f488e3b6c8c9da0c91 Mon Sep 17 00:00:00 2001
From: David Yastremsky
Date: Fri, 22 Nov 2024 16:13:44 -0800
Subject: [PATCH 2/2] Update docs

---
 genai-perf/README.md                                       | 5 ++---
 .../genai_perf/inputs/converters/tensorrtllm_converter.py  | 2 +-
 genai-perf/genai_perf/parser.py                            | 5 +----
 genai-perf/tests/test_triton_tensorrtllm_converter.py      | 4 ++++
 templates/genai-perf-templates/README_template             | 5 ++---
 5 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/genai-perf/README.md b/genai-perf/README.md
index 57c399ce..56ae86c9 100644
--- a/genai-perf/README.md
+++ b/genai-perf/README.md
@@ -388,9 +388,8 @@ a request in order. Random means that assignment is uniformly random
 
 ##### `--backend {tensorrtllm,vllm}`
 
-When using the "triton" service-kind, this is the backend of the model. For the
-TRT-LLM backend, you currently must set `exclude_input_in_output` to true in the
-model config to not echo the input tokens in the output. (default: tensorrtllm)
+When using the "triton" service-kind, this is the backend of the model.
+(default: tensorrtllm)
 
 ##### `--endpoint <str>`
 
diff --git a/genai-perf/genai_perf/inputs/converters/tensorrtllm_converter.py b/genai-perf/genai_perf/inputs/converters/tensorrtllm_converter.py
index e30c0961..3db2bc32 100644
--- a/genai-perf/genai_perf/inputs/converters/tensorrtllm_converter.py
+++ b/genai-perf/genai_perf/inputs/converters/tensorrtllm_converter.py
@@ -60,7 +60,7 @@ def convert(
                 "model": model_name,
                 "text_input": [text],
                 "max_tokens": [DEFAULT_TENSORRTLLM_MAX_TOKENS],  # default
-                "exclude_input_from_output": True,  # default
+                "exclude_input_in_output": [True],  # default
             }
             self._add_request_params(payload, config)
             request_body["data"].append(payload)

diff --git a/genai-perf/genai_perf/parser.py b/genai-perf/genai_perf/parser.py
index e9df28a1..204fcdb0 100644
--- a/genai-perf/genai_perf/parser.py
+++ b/genai-perf/genai_perf/parser.py
@@ -644,10 +644,7 @@ def _add_endpoint_args(parser):
         default="tensorrtllm",
         required=False,
         help=f'When using the "triton" service-kind, '
-        "this is the backend of the model. "
-        "For the TENSORRT-LLM backend, you currently must set "
-        "'exclude_input_in_output' to true in the model config to "
-        "not echo the input tokens in the output.",
+        "this is the backend of the model.",
", ) endpoint_group.add_argument( diff --git a/genai-perf/tests/test_triton_tensorrtllm_converter.py b/genai-perf/tests/test_triton_tensorrtllm_converter.py index 95d3315a..9bd95bf1 100644 --- a/genai-perf/tests/test_triton_tensorrtllm_converter.py +++ b/genai-perf/tests/test_triton_tensorrtllm_converter.py @@ -75,11 +75,13 @@ def test_convert_default(self): "model": "test_model", "text_input": ["text input one"], "max_tokens": [DEFAULT_TENSORRTLLM_MAX_TOKENS], + "exclude_input_in_output": [True], }, { "model": "test_model", "text_input": ["text input two"], "max_tokens": [DEFAULT_TENSORRTLLM_MAX_TOKENS], + "exclude_input_in_output": [True], }, ] } @@ -116,6 +118,7 @@ def test_convert_with_request_parameters(self): "max_tokens": [1234], "stream": [True], "additional_key": ["additional_value"], + "exclude_input_in_output": [True], }, { "model": "test_model", @@ -124,6 +127,7 @@ def test_convert_with_request_parameters(self): "max_tokens": [1234], "stream": [True], "additional_key": ["additional_value"], + "exclude_input_in_output": [True], }, ] } diff --git a/templates/genai-perf-templates/README_template b/templates/genai-perf-templates/README_template index 86ab1ea0..ac8aa086 100644 --- a/templates/genai-perf-templates/README_template +++ b/templates/genai-perf-templates/README_template @@ -386,9 +386,8 @@ a request in order. Random means that assignment is uniformly random ##### `--backend {tensorrtllm,vllm}` -When using the "triton" service-kind, this is the backend of the model. For the -TRT-LLM backend, you currently must set `exclude_input_in_output` to true in the -model config to not echo the input tokens in the output. (default: tensorrtllm) +When using the "triton" service-kind, this is the backend of the model. +(default: tensorrtllm) ##### `--endpoint `