build: Upgrade to 24.08, TRT-LLM 0.12.0, and Triton CLI v0.0.11 (#83)
Co-authored-by: David Yastremsky <[email protected]>
rmccorm4 and dyastremsky authored Sep 5, 2024
1 parent dda4ca8 commit 1d872d7
Showing 20 changed files with 956 additions and 2,236 deletions.
27 changes: 17 additions & 10 deletions README.md
@@ -22,8 +22,8 @@ and running the CLI from within the latest corresponding `tritonserver`
 container image, which should have all necessary system dependencies installed.
 
 For vLLM and TRT-LLM, you can use their respective images:
-- `nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3`
-- `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
+- `nvcr.io/nvidia/tritonserver:24.08-vllm-python-py3`
+- `nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3`
 
 If you decide to run the CLI on the host or in a custom image, please
 see this list of [additional dependencies](#additional-dependencies-for-custom-environments)
@@ -38,6 +38,7 @@ matrix below:
 
 | Triton CLI Version | TRT-LLM Version | Triton Container Tag |
 |:------------------:|:---------------:|:--------------------:|
+| 0.0.11 | v0.12.0 | 24.08 |
 | 0.0.10 | v0.11.0 | 24.07 |
 | 0.0.9 | v0.10.0 | 24.06 |
 | 0.0.8 | v0.9.0 | 24.05 |
@@ -57,7 +58,7 @@ It is also possible to install from a specific branch name, a commit hash
 or a tag name. For example to install `triton_cli` with a specific tag:
 
 ```bash
-GIT_REF="0.0.10"
+GIT_REF="0.0.11"
 pip install git+https://github.com/triton-inference-server/triton_cli.git@${GIT_REF}
 ```

@@ -92,7 +93,7 @@ triton -h
 triton import -m gpt2
 
 # Start server pointing at the default model repository
-triton start --image nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3
+triton start --image nvcr.io/nvidia/tritonserver:24.08-vllm-python-py3
 
 # Infer with CLI
 triton infer -m gpt2 --prompt "machine learning is"
@@ -135,6 +136,8 @@ The following models have currently been tested for vLLM through the CLI:
 - `llama-2-7b-chat`
 - `llama-3-8b`
 - `llama-3-8b-instruct`
+- `llama-3.1-8b`
+- `llama-3.1-8b-instruct`
 
 
 #### Example
@@ -146,10 +149,10 @@ docker run -ti \
 --shm-size=1g --ulimit memlock=-1 \
 -v ${HOME}/models:/root/models \
 -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3
+nvcr.io/nvidia/tritonserver:24.08-vllm-python-py3
 
 # Install the Triton CLI
-pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.10
+pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.11
 
 # Authenticate with huggingface for restricted models like Llama-2 and Llama-3
 huggingface-cli login
@@ -193,10 +196,14 @@ engine builds through the CLI:
 - `llama-2-7b-chat`
 - `llama-3-8b`
 - `llama-3-8b-instruct`
+- `llama-3.1-8b`
+- `llama-3.1-8b-instruct`
 
 > [!NOTE]
-> Building a TRT-LLM engine for Llama-2-7B or Llama-3-8B models
-> may require system RAM of at least 48GB of RAM.
+> 1. Building a TRT-LLM engine for Llama-2-7B, Llama-3-8B, or Llama-3.1-8B
+>    models may require system RAM of at least 48GB of RAM.
+>
+> 2. Llama 3.1 may require `pip install transformers>=4.43.1`
 
 #### Example
@@ -215,10 +222,10 @@ docker run -ti \
 -v /tmp:/tmp \
 -v ${HOME}/models:/root/models \
 -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3
 
 # Install the Triton CLI
-pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.10
+pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.11
 
 # Authenticate with huggingface for restricted models like Llama-2 and Llama-3
 huggingface-cli login
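As a quick sanity check against the compatibility matrix above, the installed CLI version can be read straight from the package. A minimal sketch (it assumes the CLI is already installed in the active environment):

```python
# Verify the installed Triton CLI version pairs with the intended container tag.
from triton_cli import __version__

assert __version__ == "0.0.11", f"expected 0.0.11 for the 24.08 containers, got {__version__}"
print(__version__)
```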
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -47,10 +47,10 @@ keywords = []
 requires-python = ">=3.10,<4"
 # TODO: Add [gpu] set of dependencies for trtllm once it's available on pypi
 dependencies = [
-"grpcio>=1.64.0",
+"grpcio>=1.65.5",
 "directory-tree == 0.0.4", # may remove in future
 "docker == 6.1.3",
-"genai-perf @ git+https://github.com/triton-inference-server/client.git@r24.07#subdirectory=src/c++/perf_analyzer/genai-perf",
+"genai-perf @ git+https://github.com/triton-inference-server/perf_analyzer.git@r24.08#subdirectory=genai-perf",
 # TODO: rely on tritonclient to pull in protobuf and numpy dependencies?
 "numpy >=1.21,<2",
 "protobuf>=3.7.0",
@@ -59,7 +59,7 @@ dependencies = [
 "rich == 13.5.2",
 # TODO: Test on cpu-only machine if [cuda] dependency is an issue,
 # Use explicit client version matching genai-perf version for tagged release
-"tritonclient[all] == 2.48",
+"tritonclient[all] == 2.49",
 "huggingface-hub >= 0.19.4",
 # Testing
 "pytest >= 8.1.1", # may remove later
2 changes: 1 addition & 1 deletion src/triton_cli/__init__.py
@@ -24,4 +24,4 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-__version__ = "0.0.10"
+__version__ = "0.0.11"
8 changes: 5 additions & 3 deletions src/triton_cli/docker/Dockerfile
@@ -1,9 +1,11 @@
 # TRT-LLM image contains engine building and runtime dependencies
-FROM nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+FROM nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3
 
 # Setup vLLM Triton backend
 RUN mkdir -p /opt/tritonserver/backends/vllm && \
-    wget -P /opt/tritonserver/backends/vllm https://raw.githubusercontent.com/triton-inference-server/vllm_backend/r24.07/src/model.py
+    git clone -b r24.08 https://github.com/triton-inference-server/vllm_backend.git /tmp/vllm_backend && \
+    cp -r /tmp/vllm_backend/src/* /opt/tritonserver/backends/vllm && \
+    rm -r /tmp/vllm_backend
 
 # vLLM runtime dependencies
-RUN pip install "vllm==0.5.0.post1"
+RUN pip install "vllm==0.5.3.post1" "setuptools==74.0.0"
8 changes: 1 addition & 7 deletions src/triton_cli/profile.py
@@ -34,7 +34,7 @@
 # ================================================
 def build_command(args: argparse.Namespace, executable: str):
     skip_args = ["func"]
-    cmd = [executable]
+    cmd = [executable, "profile"]
     for arg, value in vars(args).items():
         if arg in skip_args:
             pass
@@ -45,12 +45,6 @@ def build_command(args: argparse.Namespace, executable: str):
                 cmd += [f"-{arg}"]
             else:
                 cmd += [f"--{arg}"]
-        # [DLIS-6656] - Remove backend renaming.
-        # This allows "tensorrtllm" to be used as the backend for consistency.
-        # Once GenAI-Perf releases 24.05, "tensorrtllm" as the backend value
-        # will be supported by default.
-        elif arg == "backend" and value in ["tensorrtllm", "trtllm"]:
-            cmd += ["--backend", "tensorrtllm"]
         else:
             if len(arg) == 1:
                 cmd += [f"-{arg}", f"{value}"]
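For context, here is a rough, self-contained sketch of what the updated `build_command` assembles now that `profile` is prepended and the backend-renaming special case is gone; the argparse namespace is a hypothetical stand-in for the CLI's parsed profile arguments, and the None/False handling is simplified:

```python
import argparse

def build_command(args: argparse.Namespace, executable: str):
    # Rough re-creation of the diff above: flags are derived from attribute names.
    skip_args = ["func"]
    cmd = [executable, "profile"]
    for arg, value in vars(args).items():
        if arg in skip_args or value in (None, False):
            continue
        flag = f"-{arg}" if len(arg) == 1 else f"--{arg}"
        cmd += [flag] if value is True else [flag, str(value)]
    return cmd

# Hypothetical namespace standing in for the CLI's parsed profile arguments.
args = argparse.Namespace(func=None, m="gpt2", backend="tensorrtllm", verbose=True)
print(build_command(args, "genai-perf"))
# ['genai-perf', 'profile', '-m', 'gpt2', '--backend', 'tensorrtllm', '--verbose']
```

With the removed `elif`, "tensorrtllm" is passed through to GenAI-Perf unchanged, since current GenAI-Perf releases accept it as a backend value by default.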
6 changes: 6 additions & 0 deletions src/triton_cli/repository.py
@@ -92,6 +92,12 @@
     "meta-llama/Meta-Llama-3-8B-Instruct": {
         "hf_allow_patterns": ["*.safetensors", "*.json"],
     },
+    "meta-llama/Meta-Llama-3.1-8B": {
+        "hf_allow_patterns": ["*.safetensors", "*.json"],
+    },
+    "meta-llama/Meta-Llama-3.1-8B-Instruct": {
+        "hf_allow_patterns": ["*.safetensors", "*.json"],
+    },
     "gpt2": {
         "hf_allow_patterns": ["*.safetensors", "*.json"],
         "hf_ignore_patterns": ["onnx/*"],
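The diff only adds registry entries; how the CLI consumes them is not shown here. As an assumption, `hf_allow_patterns` values like these are typically forwarded to `huggingface_hub.snapshot_download`, roughly as sketched below (the repo id is taken from the new entries, the rest is illustrative and requires `huggingface-cli login` for gated Llama weights):

```python
from huggingface_hub import snapshot_download

# Illustrative mapping mirroring the new Meta-Llama-3.1 entries above.
source = {
    "meta-llama/Meta-Llama-3.1-8B-Instruct": {
        "hf_allow_patterns": ["*.safetensors", "*.json"],
    },
}

repo_id, config = next(iter(source.items()))
local_dir = snapshot_download(
    repo_id,
    allow_patterns=config.get("hf_allow_patterns"),
    ignore_patterns=config.get("hf_ignore_patterns"),  # None when unset
)
print(local_dir)
```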
21 changes: 20 additions & 1 deletion src/triton_cli/templates/trt_llm/postprocessing/1/model.py
@@ -142,6 +142,10 @@ def execute(self, requests):
             generation_logits = pb_utils.get_input_tensor_by_name(
                 request, 'GENERATION_LOGITS')
 
+            # Get the batch index
+            batch_index = pb_utils.get_input_tensor_by_name(
+                request, 'BATCH_INDEX')
+
             # Reshape Input
             # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]])
             # tokens_batch = tokens_batch.T
@@ -197,6 +201,15 @@ def execute(self, requests):
                     np.array([[[[0.0]]]], dtype=np.float32))
                 outputs.append(out_generation_logits)
 
+            if batch_index:
+                out_batch_index = pb_utils.Tensor('OUT_BATCH_INDEX',
+                                                  batch_index.as_numpy())
+                outputs.append(out_batch_index)
+            else:
+                out_batch_index = pb_utils.Tensor(
+                    'OUT_BATCH_INDEX', np.array([[0]], dtype=np.int32))
+                outputs.append(out_batch_index)
+
             # Create InferenceResponse. You can set an error here in case
             # there was a problem with handling this inference request.
             # Below is an example of how you can set errors in inference
@@ -224,8 +237,14 @@ def _postprocessing(self, tokens_batch, sequence_lengths):
         for batch_idx, beam_tokens in enumerate(tokens_batch):
             for beam_idx, tokens in enumerate(beam_tokens):
                 seq_len = sequence_lengths[batch_idx][beam_idx]
+                # Exclude fake ids in multimodal models
+                fake_id_len = 0
+                for i in range(seq_len):
+                    if tokens[i] < len(self.tokenizer.vocab):
+                        fake_id_len = i
+                        break
                 output = self.tokenizer.decode(
-                    tokens[:seq_len],
+                    tokens[fake_id_len:seq_len],
                     skip_special_tokens=self.skip_special_tokens)
                 outputs.append(output.encode('utf8'))
         return outputs
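To see what the new trimming does, here is a small stand-alone illustration with toy token IDs and a made-up vocabulary size (not the real tokenizer): leading IDs at or above the vocabulary size are treated as multimodal "fake" IDs and excluded before decoding.

```python
# Toy illustration of the fake-id exclusion above (hypothetical values).
VOCAB_SIZE = 32000  # stand-in for len(self.tokenizer.vocab)
tokens = [32010, 32011, 32012, 15043, 3186, 29991]  # three fake ids, then real ids
seq_len = len(tokens)

fake_id_len = 0
for i in range(seq_len):
    if tokens[i] < VOCAB_SIZE:
        fake_id_len = i  # index of the first real token
        break

print(tokens[fake_id_len:seq_len])  # [15043, 3186, 29991] -> what gets decoded
```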
11 changes: 11 additions & 0 deletions src/triton_cli/templates/trt_llm/postprocessing/config.pbtxt
@@ -61,6 +61,12 @@ input [
     data_type: TYPE_FP32
     dims: [ -1, -1, -1 ]
     optional: true
+  },
+  {
+    name: "BATCH_INDEX"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    optional: true
   }
 ]
 output [
@@ -88,6 +94,11 @@ output [
     name: "OUT_GENERATION_LOGITS"
     data_type: TYPE_FP32
     dims: [ -1, -1, -1 ]
+  },
+  {
+    name: "OUT_BATCH_INDEX"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
   }
 ]
 
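For completeness, a sketch of exercising the new optional tensors by calling the generated `postprocessing` model directly with `tritonclient` (normally it runs as part of the ensemble). The `TOKENS_BATCH`, `SEQUENCE_LENGTH`, and `OUTPUT` names and shapes are assumptions based on the standard TRT-LLM postprocessing template and are not shown in this diff:

```python
import numpy as np
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient("localhost:8001")

# Assumed template inputs: token ids for one request and its sequence length.
tokens = grpcclient.InferInput("TOKENS_BATCH", [1, 1, 3], "INT32")
tokens.set_data_from_numpy(np.array([[[15043, 3186, 29991]]], dtype=np.int32))

seq_len = grpcclient.InferInput("SEQUENCE_LENGTH", [1, 1], "INT32")
seq_len.set_data_from_numpy(np.array([[3]], dtype=np.int32))

# New optional input added by this commit; omitting it returns the [[0]] default.
batch_index = grpcclient.InferInput("BATCH_INDEX", [1, 1], "INT32")
batch_index.set_data_from_numpy(np.array([[0]], dtype=np.int32))

result = client.infer(
    "postprocessing",
    inputs=[tokens, seq_len, batch_index],
    outputs=[
        grpcclient.InferRequestedOutput("OUTPUT"),
        grpcclient.InferRequestedOutput("OUT_BATCH_INDEX"),
    ],
)
print(result.as_numpy("OUT_BATCH_INDEX"))
```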