From c7fe48335eb7b8c230097c1920c5a4c8d8f99232 Mon Sep 17 00:00:00 2001
From: Funtowicz Morgan
Date: Fri, 26 Apr 2024 14:55:49 +0200
Subject: [PATCH] Forward arguments from TGI launcher to the model (#28)

* Include revision

* Expose match_batch_size as envvar for TGI entrypoint

* Remove Intellij files from git

* Remove unused variable in entrypoint

* again

* Fix TGI_MAX_INPUT_LENGTH to TGI_MAX_INPUT_TOKENS to stay in tokens

* Let's allow to use specific TGI commit

* Delete comments

* Makes it possible to install specific commit of TGI also in tgi_test

* Oops missing one file

* leverage forwarded variables from the launcher to allocate the model

* Fix invalid variable name

* Add missing find-links argument to make the dependend tests running

* Update tests with new args

* Revert using git and use curl + github archive

* let's define max-batch-prefill-tokens too

* Let's map the model_id to the value provided by

* Remove overriding TGI entrypoint
---
 .github/workflows/check_code_quality.yml       |  2 +-
 .github/workflows/doc-build.yml                |  2 +-
 .github/workflows/doc-pr-build.yml             |  2 +-
 .gitignore                                     |  3 +-
 Makefile                                       |  2 +-
 optimum/tpu/modeling.py                        |  1 +
 text-generation-inference/Dockerfile           | 12 ++----
 text-generation-inference/entrypoint.sh        | 43 +++++++++++++++----
 text-generation-inference/server/Makefile      |  6 +--
 .../server/text_generation_server/cli.py       | 14 +++++-
 .../text_generation_server/generator.py        | 10 ++++-
 .../server/text_generation_server/server.py    | 10 ++++-
 text-generation-inference/tests/test_gemma.py  |  2 +-
 text-generation-inference/tests/test_gpt2.py   |  8 ++--
 14 files changed, 84 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/check_code_quality.yml b/.github/workflows/check_code_quality.yml
index 1ca2450b..ecc93d29 100644
--- a/.github/workflows/check_code_quality.yml
+++ b/.github/workflows/check_code_quality.yml
@@ -44,7 +44,7 @@ jobs:
         run: |
           source venv/bin/activate
           pip install --upgrade pip
-          pip install .[quality]
+          pip install .[quality] -f https://storage.googleapis.com/libtpu-releases/index.html
       - name: Check style with ruff
         run: |
           source venv/bin/activate
diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml
index e3942951..5f299b52 100644
--- a/.github/workflows/doc-build.yml
+++ b/.github/workflows/doc-build.yml
@@ -47,7 +47,7 @@ jobs:
       - name: Setup environment
         run: |
           pip install -U pip
-          pip install ".[quality]"
+          pip install ".[quality]" -f https://storage.googleapis.com/libtpu-releases/index.html
 
       - name: Make documentation
         shell: bash
diff --git a/.github/workflows/doc-pr-build.yml b/.github/workflows/doc-pr-build.yml
index e951b80a..c6536ecd 100644
--- a/.github/workflows/doc-pr-build.yml
+++ b/.github/workflows/doc-pr-build.yml
@@ -32,7 +32,7 @@ jobs:
       - name: Setup environment
         run: |
           pip install -U pip
-          pip install ".[quality]"
+          pip install ".[quality]" -f https://storage.googleapis.com/libtpu-releases/index.html
 
       - name: Make documentation
         shell: bash
diff --git a/.gitignore b/.gitignore
index 0e689919..7beb0e7f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -132,4 +132,5 @@ dmypy.json
 
 # Models
 *.pt
-.vscode
\ No newline at end of file
+.vscode
+.idea/
\ No newline at end of file
diff --git a/Makefile b/Makefile
index a29fdee8..a9b8f125 100644
--- a/Makefile
+++ b/Makefile
@@ -19,7 +19,7 @@ REAL_CLONE_URL = $(if $(CLONE_URL),$(CLONE_URL),$(DEFAULT_CLONE_URL))
 
 .PHONY: build_dist style style_check clean
 
-TGI_VERSION ?= 2.0.0
+TGI_VERSION ?= 5bc3d65dd32ba1f979540caeccbf3dd8798dd9df
 
 rwildcard=$(wildcard $1) $(foreach d,$1,$(call rwildcard,$(addsuffix /$(notdir $d),$(wildcard $(dir $d)*))))
 
diff --git a/optimum/tpu/modeling.py b/optimum/tpu/modeling.py
index d45e386c..f5244770 100644
--- a/optimum/tpu/modeling.py
+++ b/optimum/tpu/modeling.py
@@ -54,6 +54,7 @@ def from_pretrained(
         cls = config_name_to_class(pretrained_model_name_or_path)
         model = cls.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         model.to(device)
+        # Update config with specific data)
         if task is not None or getattr(model.config, "task", None) is None:
             model.config.task = task
 
diff --git a/text-generation-inference/Dockerfile b/text-generation-inference/Dockerfile
index 2ba86d0f..669a8b02 100644
--- a/text-generation-inference/Dockerfile
+++ b/text-generation-inference/Dockerfile
@@ -3,7 +3,7 @@ FROM alpine AS tgi
 ARG TGI_VERSION
 RUN test -n ${TGI_VERSION:?}
 RUN mkdir -p /tgi
-ADD https://github.com/huggingface/text-generation-inference/archive/refs/tags/v${TGI_VERSION}.tar.gz /tgi/sources.tar.gz
+ADD https://github.com/huggingface/text-generation-inference/archive/${TGI_VERSION}.tar.gz /tgi/sources.tar.gz
 RUN tar -C /tgi -xf /tgi/sources.tar.gz --strip-components=1
 
 # Build cargo components (adapted from TGI original Dockerfile)
@@ -120,15 +120,9 @@ COPY --from=pyserver /pyserver/build/dist dist
 RUN pip install dist/text_generation_server*.tar.gz
 
 # TPU compatible image
-FROM tpu_base as tpu_entrypoint
+FROM tpu_base
 
 COPY text-generation-inference/entrypoint.sh entrypoint.sh
 RUN chmod +x entrypoint.sh
 
-ENTRYPOINT ["./entrypoint.sh"]
-
-# Final image
-FROM tpu_base
-
-ENTRYPOINT ["text-generation-launcher"]
-CMD ["--json-output"]
+ENTRYPOINT ["./entrypoint.sh"]
\ No newline at end of file
diff --git a/text-generation-inference/entrypoint.sh b/text-generation-inference/entrypoint.sh
index f79c0f64..97e12b3a 100644
--- a/text-generation-inference/entrypoint.sh
+++ b/text-generation-inference/entrypoint.sh
@@ -1,17 +1,44 @@
 #!/bin/bash
 
-if [[ -z "${HF_MODEL_ID}" ]]; then
-  echo "HF_MODEL_ID must be set"
+# Hugging Face Hub related
+if [[ -z "${MODEL_ID}" ]]; then
+  echo "MODEL_ID must be set"
   exit 1
 fi
-export MODEL_ID="${HF_MODEL_ID}"
+export MODEL_ID="${MODEL_ID}"
 
-if [[ -n "${HF_MODEL_REVISION}" ]]; then
-  export REVISION="${HF_MODEL_REVISION}"
+# TGI related
+if [[ -n "${TGI_MAX_CONCURRENT_REQUESTS}" ]]; then
+  export TGI_MAX_CONCURRENT_REQUESTS="${TGI_MAX_CONCURRENT_REQUESTS}"
+else
+  export TGI_MAX_CONCURRENT_REQUESTS 4
 fi
 
-if [[ -n "${HF_MODEL_TRUST_REMOTE_CODE}" ]]; then
-  export TRUST_REMOTE_CODE="${HF_MODEL_TRUST_REMOTE_CODE}"
+if [[ -n "${TGI_MAX_BATCH_SIZE}" ]]; then
+  export TGI_MAX_BATCH_SIZE="${TGI_MAX_BATCH_SIZE}"
+else
+  export TGI_MAX_BATCH_SIZE 1
 fi
 
-text-generation-launcher --port 8080
+if [[ -n "${TGI_MAX_INPUT_TOKENS}" ]]; then
+  export TGI_MAX_INPUT_TOKENS="${TGI_MAX_INPUT_TOKENS}"
+else
+  export TGI_MAX_INPUT_TOKENS 128
+fi
+
+if [[ -n "${TGI_MAX_TOTAL_TOKENS}" ]]; then
+  export TGI_MAX_TOTAL_TOKENS="${TGI_MAX_TOTAL_TOKENS}"
+else
+  export TGI_MAX_TOTAL_TOKENS 256
+fi
+
+TGI_MAX_BATCH_PREFILL_TOKENS=$(( TGI_MAX_BATCH_SIZE*TGI_MAX_INPUT_TOKENS ))
+
+text-generation-launcher --port 8080 \
+  --max-concurrent-requests ${TGI_MAX_CONCURRENT_REQUESTS} \
+  --max-batch-size ${TGI_MAX_BATCH_SIZE} \
+  --max-batch-prefill-tokens ${TGI_MAX_BATCH_PREFILL_TOKENS} \
+  --max-input-tokens ${TGI_MAX_INPUT_TOKENS} \
+  --max-total-tokens ${TGI_MAX_TOTAL_TOKENS} \
+  --model-id ${MODEL_ID}
+
diff --git a/text-generation-inference/server/Makefile b/text-generation-inference/server/Makefile
index 95be9970..7be0b3e4 100644
--- a/text-generation-inference/server/Makefile
+++ b/text-generation-inference/server/Makefile
@@ -2,7 +2,7 @@ pkg_name := text_generation_server
 
 BUILDDIR ?= $(CURDIR)/build
 VERSION ?= 0.0.1
-TGI_VERSION ?= 1.4.2
+TGI_VERSION ?= v2.0.1
 mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
 mkfile_dir := $(dir $(mkfile_path))
 pkg_dir := $(BUILDDIR)/$(pkg_name)
@@ -39,7 +39,7 @@ endif
 
 $(BUILDDIR)/tgi/proto/%.proto:
 	install -d $(BUILDDIR)/tgi
-	curl -L https://github.com/huggingface/text-generation-inference/archive/refs/tags/v${TGI_VERSION}.tar.gz --output $(BUILDDIR)/tgi/sources.tar.gz
+	curl -L https://github.com/huggingface/text-generation-inference/archive/${TGI_VERSION}.tar.gz --output $(BUILDDIR)/tgi/sources.tar.gz
 	tar -C $(BUILDDIR)/tgi -xf $(BUILDDIR)/tgi/sources.tar.gz --strip-components=1
 
 # Three python files are generated for each protobuf
@@ -57,4 +57,4 @@ $(pkg_pb_dir)/%_pb2.py $(pkg_pb_dir)/%_pb2.pyi $(pkg_pb_dir)/%_pb2_grpc.py: $(PR
 	sed -i -e 's/^\(import.*pb2\)/from . \1/g' $(pkg_pb_dir)/$*_pb2_grpc.py
 
 gen-server: $(BUILDDIR)/pyproject.toml $(deployed_sources) $(generated_sources)
-	python -m build $(BUILDDIR)
+	python -m build $(BUILDDIR)
\ No newline at end of file
diff --git a/text-generation-inference/server/text_generation_server/cli.py b/text-generation-inference/server/text_generation_server/cli.py
index 0d2ae236..8a292e0c 100644
--- a/text-generation-inference/server/text_generation_server/cli.py
+++ b/text-generation-inference/server/text_generation_server/cli.py
@@ -1,3 +1,4 @@
+import os
 import sys
 from typing import Optional
 
@@ -58,8 +59,19 @@ def serve(
     from optimum.tpu.model import fetch_model
     from .server import serve
 
+    # Read environment variables forwarded by the launcher
+    max_batch_size = int(os.environ.get("MAX_BATCH_SIZE", "1"))
+    max_total_tokens = int(os.environ.get("MAX_TOTAL_TOKENS", "64"))
+
+    # Start the server
     model_path = fetch_model(model_id, revision)
-    serve(model_path, uds_path)
+    serve(
+        model_path,
+        revision=revision,
+        max_batch_size=max_batch_size,
+        max_sequence_length=max_total_tokens,
+        uds_path=uds_path
+    )
 
 
 @app.command()
diff --git a/text-generation-inference/server/text_generation_server/generator.py b/text-generation-inference/server/text_generation_server/generator.py
index d13ec221..d0e65f22 100644
--- a/text-generation-inference/server/text_generation_server/generator.py
+++ b/text-generation-inference/server/text_generation_server/generator.py
@@ -621,6 +621,9 @@ def _clear(self, request_ids: List):
     def from_pretrained(
         cls,
         model_path: str,
+        revision: str,
+        max_batch_size: int,
+        max_sequence_length: int
     ):
         """Instantiate a TpuGenerator.
 
@@ -633,7 +636,12 @@ def from_pretrained(
         """
         logger.info("Loading model (this can take a few minutes).")
         start = time.time()
-        model = AutoModelForCausalLM.from_pretrained(model_path)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            revision=revision,
+            batch_size=max_batch_size,
+            sequence_length=max_sequence_length
+        )
         end = time.time()
         logger.info(f"Model successfully loaded in {end - start:.2f} s.")
         tokenizer = AutoTokenizer.from_pretrained(model_path)
diff --git a/text-generation-inference/server/text_generation_server/server.py b/text-generation-inference/server/text_generation_server/server.py
index 22361876..186d3c5d 100644
--- a/text-generation-inference/server/text_generation_server/server.py
+++ b/text-generation-inference/server/text_generation_server/server.py
@@ -50,6 +50,9 @@ async def Decode(self, request, context):
 
 def serve(
     model_path: str,
+    revision: str,
+    max_batch_size: int,
+    max_sequence_length: int,
     uds_path: Path,
 ):
     async def serve_inner(model_path: str):
@@ -58,7 +61,12 @@ async def serve_inner(model_path: str):
         server_urls = [local_url]
 
         try:
-            generator = TpuGenerator.from_pretrained(model_path)
+            generator = TpuGenerator.from_pretrained(
+                model_path,
+                revision=revision,
+                max_batch_size=max_batch_size,
+                max_sequence_length=max_sequence_length
+            )
         except Exception:
             logger.exception("Error when initializing model")
             raise
diff --git a/text-generation-inference/tests/test_gemma.py b/text-generation-inference/tests/test_gemma.py
index a1ec51c1..fe361a05 100644
--- a/text-generation-inference/tests/test_gemma.py
+++ b/text-generation-inference/tests/test_gemma.py
@@ -57,7 +57,7 @@ def test_decode_single(model_path):
     max_new_tokens = 20
     generated_text = "\n\nThe first thing I noticed was the smell of the rain. It was a smell I had never"
 
-    generator = TpuGenerator.from_pretrained(model_path)
+    generator = TpuGenerator.from_pretrained(model_path, revision="", max_batch_size=1, max_sequence_length=SEQUENCE_LENGTH)
     request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=False)
     batch = Batch(id=0, requests=[request], size=1, max_tokens=SEQUENCE_LENGTH)
     generations, next_batch = generator.prefill(batch)
diff --git a/text-generation-inference/tests/test_gpt2.py b/text-generation-inference/tests/test_gpt2.py
index abe8dd16..1f26b9cf 100644
--- a/text-generation-inference/tests/test_gpt2.py
+++ b/text-generation-inference/tests/test_gpt2.py
@@ -24,7 +24,7 @@ def model_path():
 
 
 def test_info(model_path):
-    generator = TpuGenerator.from_pretrained(model_path)
+    generator = TpuGenerator.from_pretrained(model_path, revision="", max_batch_size=1, max_sequence_length=1)
     info = generator.info
     assert info.requires_padding is True
     assert info.device_type == "xla"
@@ -81,7 +81,7 @@ def create_request(
 )
 @pytest.mark.parametrize("batch_size", [1, 4], ids=["single", "multiple"])
 def test_prefill(input_text, token_id, token_text, do_sample, batch_size, model_path):
-    generator = TpuGenerator.from_pretrained(model_path)
+    generator = TpuGenerator.from_pretrained(model_path, revision="", max_batch_size=batch_size, max_sequence_length=SEQUENCE_LENGTH)
     requests = []
     max_new_tokens = 20
     for i in range(batch_size):
@@ -120,7 +120,7 @@ def test_prefill(input_text, token_id, token_text, do_sample, batch_size, model_
     ids=["greedy", "sample"],
 )
 def test_decode_single(input_text, max_new_tokens, generated_text, do_sample, model_path):
-    generator = TpuGenerator.from_pretrained(model_path)
+    generator = TpuGenerator.from_pretrained(model_path, revision="", max_batch_size=1, max_sequence_length=SEQUENCE_LENGTH)
     request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample)
     batch = Batch(id=0, requests=[request], size=1, max_tokens=SEQUENCE_LENGTH)
     generations, next_batch = generator.prefill(batch)
@@ -140,7 +140,7 @@ def test_decode_single(input_text, max_new_tokens, generated_text, do_sample, mo
 
 
 def test_decode_multiple(model_path):
-    generator = TpuGenerator.from_pretrained(model_path)
+    generator = TpuGenerator.from_pretrained(model_path, revision="", max_batch_size=1, max_sequence_length=SEQUENCE_LENGTH)
     input_text = "Once upon a time"
     max_new_tokens = 20
     # Prefill a single request, remembering the generated token