From 7da4f0bd44ec13f87ef535a607122786b4b285ca Mon Sep 17 00:00:00 2001
From: David Yastremsky
Date: Fri, 22 Nov 2024 16:08:12 -0800
Subject: [PATCH 1/2] Disable TRT-LLM echo by default

---
 genai-perf/genai_perf/inputs/converters/tensorrtllm_converter.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/genai-perf/genai_perf/inputs/converters/tensorrtllm_converter.py b/genai-perf/genai_perf/inputs/converters/tensorrtllm_converter.py
index 551b05d7..e30c0961 100644
--- a/genai-perf/genai_perf/inputs/converters/tensorrtllm_converter.py
+++ b/genai-perf/genai_perf/inputs/converters/tensorrtllm_converter.py
@@ -60,6 +60,7 @@ def convert(
                 "model": model_name,
                 "text_input": [text],
                 "max_tokens": [DEFAULT_TENSORRTLLM_MAX_TOKENS],  # default
+                "exclude_input_from_output": True,  # default
             }
             self._add_request_params(payload, config)
             request_body["data"].append(payload)

From a2f62cdb454cdfef401452f488e3b6c8c9da0c91 Mon Sep 17 00:00:00 2001
From: David Yastremsky
Date: Fri, 22 Nov 2024 16:13:44 -0800
Subject: [PATCH 2/2] Update docs

---
 genai-perf/README.md                                       | 5 ++---
 .../genai_perf/inputs/converters/tensorrtllm_converter.py  | 2 +-
 genai-perf/genai_perf/parser.py                            | 5 +----
 genai-perf/tests/test_triton_tensorrtllm_converter.py      | 4 ++++
 templates/genai-perf-templates/README_template             | 5 ++---
 5 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/genai-perf/README.md b/genai-perf/README.md
index 57c399ce..56ae86c9 100644
--- a/genai-perf/README.md
+++ b/genai-perf/README.md
@@ -388,9 +388,8 @@ a request in order. Random means that assignment is uniformly random
 
 ##### `--backend {tensorrtllm,vllm}`
 
-When using the "triton" service-kind, this is the backend of the model. For the
-TRT-LLM backend, you currently must set `exclude_input_in_output` to true in the
-model config to not echo the input tokens in the output. (default: tensorrtllm)
+When using the "triton" service-kind, this is the backend of the model.
+(default: tensorrtllm)
 
 ##### `--endpoint <str>`
 
diff --git a/genai-perf/genai_perf/inputs/converters/tensorrtllm_converter.py b/genai-perf/genai_perf/inputs/converters/tensorrtllm_converter.py
index e30c0961..3db2bc32 100644
--- a/genai-perf/genai_perf/inputs/converters/tensorrtllm_converter.py
+++ b/genai-perf/genai_perf/inputs/converters/tensorrtllm_converter.py
@@ -60,7 +60,7 @@ def convert(
                 "model": model_name,
                 "text_input": [text],
                 "max_tokens": [DEFAULT_TENSORRTLLM_MAX_TOKENS],  # default
-                "exclude_input_from_output": True,  # default
+                "exclude_input_in_output": [True],  # default
             }
             self._add_request_params(payload, config)
             request_body["data"].append(payload)

diff --git a/genai-perf/genai_perf/parser.py b/genai-perf/genai_perf/parser.py
index e9df28a1..204fcdb0 100644
--- a/genai-perf/genai_perf/parser.py
+++ b/genai-perf/genai_perf/parser.py
@@ -644,10 +644,7 @@ def _add_endpoint_args(parser):
         default="tensorrtllm",
         required=False,
         help=f'When using the "triton" service-kind, '
-        "this is the backend of the model. "
-        "For the TENSORRT-LLM backend, you currently must set "
-        "'exclude_input_in_output' to true in the model config to "
-        "not echo the input tokens in the output.",
+        "this is the backend of the model.",
", ) endpoint_group.add_argument( diff --git a/genai-perf/tests/test_triton_tensorrtllm_converter.py b/genai-perf/tests/test_triton_tensorrtllm_converter.py index 95d3315a..9bd95bf1 100644 --- a/genai-perf/tests/test_triton_tensorrtllm_converter.py +++ b/genai-perf/tests/test_triton_tensorrtllm_converter.py @@ -75,11 +75,13 @@ def test_convert_default(self): "model": "test_model", "text_input": ["text input one"], "max_tokens": [DEFAULT_TENSORRTLLM_MAX_TOKENS], + "exclude_input_in_output": [True], }, { "model": "test_model", "text_input": ["text input two"], "max_tokens": [DEFAULT_TENSORRTLLM_MAX_TOKENS], + "exclude_input_in_output": [True], }, ] } @@ -116,6 +118,7 @@ def test_convert_with_request_parameters(self): "max_tokens": [1234], "stream": [True], "additional_key": ["additional_value"], + "exclude_input_in_output": [True], }, { "model": "test_model", @@ -124,6 +127,7 @@ def test_convert_with_request_parameters(self): "max_tokens": [1234], "stream": [True], "additional_key": ["additional_value"], + "exclude_input_in_output": [True], }, ] } diff --git a/templates/genai-perf-templates/README_template b/templates/genai-perf-templates/README_template index 86ab1ea0..ac8aa086 100644 --- a/templates/genai-perf-templates/README_template +++ b/templates/genai-perf-templates/README_template @@ -386,9 +386,8 @@ a request in order. Random means that assignment is uniformly random ##### `--backend {tensorrtllm,vllm}` -When using the "triton" service-kind, this is the backend of the model. For the -TRT-LLM backend, you currently must set `exclude_input_in_output` to true in the -model config to not echo the input tokens in the output. (default: tensorrtllm) +When using the "triton" service-kind, this is the backend of the model. +(default: tensorrtllm) ##### `--endpoint `