diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
index 2e4aecdd3e16e..a378bc6baa5a5 100644
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -4,11 +4,11 @@
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
 # Note that we have 400 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/3792 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300))
+VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
 
 
 def print_top_10_largest_files(zip_file):
diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh
index 3e4e409466b8a..99972afa21d1e 100644
--- a/.buildkite/run-gh200-test.sh
+++ b/.buildkite/run-gh200-test.sh
@@ -23,6 +23,6 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and test offline inference
-docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-    python3 examples/offline_inference/basic.py
+docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
+    python3 examples/offline_inference/cli.py --model meta-llama/Llama-3.2-1B
 '
diff --git a/Dockerfile b/Dockerfile
index 0b9f74e08dc68..7ecb643f46272 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -127,7 +127,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
 # sync the default value with .buildkite/check-wheel-size.py
-ARG VLLM_MAX_SIZE_MB=300
+ARG VLLM_MAX_SIZE_MB=400
 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
 ARG RUN_WHEEL_CHECK=true
 RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
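
For context, a minimal sketch of how a check like this consumes the bumped default: the environment read and `print_top_10_largest_files` come from the diff above, while the `check_wheel_size` helper and single-wheel-path CLI below are assumptions for illustration, not the script's actual interface.

```python
import os
import sys
import zipfile

# Default synced with the Dockerfile's ARG VLLM_MAX_SIZE_MB=400;
# CI can still override it through the environment.
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))


def check_wheel_size(wheel_path: str) -> int:
    # Compare the wheel's on-disk size against the quota-derived limit.
    size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
    if size_mb > VLLM_MAX_SIZE_MB:
        print(f'{wheel_path} is {size_mb:.1f} MiB, over the '
              f'{VLLM_MAX_SIZE_MB} MiB limit')
        # Listing the largest archive members helps identify what to trim.
        with zipfile.ZipFile(wheel_path) as zf:
            for info in sorted(zf.infolist(), key=lambda i: i.file_size,
                               reverse=True)[:10]:
                size = info.file_size / (1024 * 1024)
                print(f'{info.filename}: {size:.1f} MiB')
        return 1
    return 0


if __name__ == '__main__':
    sys.exit(check_wheel_size(sys.argv[1]))
```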