triton-inference-server · dyastremsky · Nov 23, 2024 · Nov 23, 2024
diff --git a/genai-perf/README.md b/genai-perf/README.md
@@ -388,9 +388,8 @@ a request in order. Random means that assignment is uniformly random
 
 ##### `--backend {tensorrtllm,vllm}`
 
-When using the "triton" service-kind, this is the backend of the model. For the
-TRT-LLM backend, you currently must set `exclude_input_in_output` to true in the
-model config to not echo the input tokens in the output. (default: tensorrtllm)
+When using the "triton" service-kind, this is the backend of the model.
+(default: tensorrtllm)
 
 ##### `--endpoint <str>`
 

diff --git a/genai-perf/genai_perf/inputs/converters/tensorrtllm_converter.py b/genai-perf/genai_perf/inputs/converters/tensorrtllm_converter.py
@@ -60,6 +60,7 @@ def convert(
                     "model": model_name,
                     "text_input": [text],
                     "max_tokens": [DEFAULT_TENSORRTLLM_MAX_TOKENS],  # default
+                    "exclude_input_in_output": [True],  # default: no input echo
                 }
                 self._add_request_params(payload, config)
                 request_body["data"].append(payload)

diff --git a/genai-perf/genai_perf/parser.py b/genai-perf/genai_perf/parser.py
@@ -644,10 +644,7 @@ def _add_endpoint_args(parser):
         default="tensorrtllm",
         required=False,
         help=f'When using the "triton" service-kind, '
-        "this is the backend of the model. "
-        "For the TENSORRT-LLM backend, you currently must set "
-        "'exclude_input_in_output' to true in the model config to "
-        "not echo the input tokens in the output.",
+        "this is the backend of the model. ",
     )
 
     endpoint_group.add_argument(

diff --git a/genai-perf/tests/test_triton_tensorrtllm_converter.py b/genai-perf/tests/test_triton_tensorrtllm_converter.py
@@ -75,11 +75,13 @@ def test_convert_default(self):
                     "model": "test_model",
                     "text_input": ["text input one"],
                     "max_tokens": [DEFAULT_TENSORRTLLM_MAX_TOKENS],
+                    "exclude_input_in_output": [True],
                 },
                 {
                     "model": "test_model",
                     "text_input": ["text input two"],
                     "max_tokens": [DEFAULT_TENSORRTLLM_MAX_TOKENS],
+                    "exclude_input_in_output": [True],
                 },
             ]
         }
@@ -116,6 +118,7 @@ def test_convert_with_request_parameters(self):
                     "max_tokens": [1234],
                     "stream": [True],
                     "additional_key": ["additional_value"],
+                    "exclude_input_in_output": [True],
                 },
                 {
                     "model": "test_model",
@@ -124,6 +127,7 @@ def test_convert_with_request_parameters(self):
                     "max_tokens": [1234],
                     "stream": [True],
                     "additional_key": ["additional_value"],
+                    "exclude_input_in_output": [True],
                 },
             ]
         }

diff --git a/templates/genai-perf-templates/README_template b/templates/genai-perf-templates/README_template
@@ -386,9 +386,8 @@ a request in order. Random means that assignment is uniformly random
 
 ##### `--backend {tensorrtllm,vllm}`
 
-When using the "triton" service-kind, this is the backend of the model. For the
-TRT-LLM backend, you currently must set `exclude_input_in_output` to true in the
-model config to not echo the input tokens in the output. (default: tensorrtllm)
+When using the "triton" service-kind, this is the backend of the model.
+(default: tensorrtllm)
 
 ##### `--endpoint <str>`