Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Pipeline generator steps #32

Merged
merged 11 commits into from
Sep 23, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,12 @@ terraform.rc

.env

.vscode/
.vscode/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

.cache
*.log
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
click==8.1.7
pydantic==2.9.2
Empty file added scripts/__init__.py
Empty file.
Empty file.
60 changes: 60 additions & 0 deletions scripts/pipeline_generator/step.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from pydantic import BaseModel, Field
from typing import List, Dict, Any, Optional

from .utils import AgentQueue


class TestStep(BaseModel):
    """A test step as declared in the test configuration YAML file.

    Fields mirror what a test entry may specify; unmarshaling from the
    YAML file is handled elsewhere.
    """
    label: str
    # Include this step in the fast-check pipeline.
    fast_check: bool = False
    # Additional hardware backends to mirror this step onto.
    mirror_hardwares: List[str] = Field(default_factory=list)
    # Requested GPU type (e.g. "a100"); empty string means no preference.
    gpu: str = ""
    num_gpus: int = 1
    num_nodes: int = 1
    working_dir: str = "/vllm-workspace/tests"
    # Source paths whose changes should trigger this step — presumably
    # used for change-based filtering; confirm against the generator.
    source_file_dependencies: List[str] = Field(default_factory=list)
    # Run on a CPU-only agent queue.
    no_gpu: bool = False
    # When True, a failure does not fail the whole build.
    soft_fail: bool = False
    parallelism: int = 1
    optional: bool = False
    # A single command or a list of commands may be given.
    command: Optional[str] = None
    commands: Optional[List[str]] = None


class BuildkiteStep(BaseModel):
    """This class represents a step in Buildkite format."""
    # Display name of the step in the Buildkite UI.
    label: str
    # Unique identifier other steps can reference via depends_on.
    key: str
    # Agent targeting; defaults to the AWS CPU queue.
    agents: Dict[str, Any] = {"queue": AgentQueue.AWS_CPU}
    # Shell commands to run, in order.
    commands: Optional[List[str]] = None
    # Buildkite plugin configurations — NOTE(review): schema not visible here.
    plugins: Optional[List[Dict]] = None
    # Number of parallel jobs to spawn for this step.
    parallelism: Optional[int] = None
    # When True, failures do not fail the whole build.
    soft_fail: Optional[bool] = None
    # Defaults to waiting on the "build" step.
    depends_on: Optional[str] = "build"
    # Extra environment variables for the step.
    env: Optional[Dict[str, str]] = None
    # Buildkite retry configuration — NOTE(review): confirm expected schema.
    retry: Optional[Dict[str, Any]] = None


class BuildkiteBlockStep(BaseModel):
    """A manual-approval (block) step in Buildkite format."""
    # Label shown on the block step in the Buildkite UI.
    block: str
    # Step key this block step waits on; defaults to the "build" step.
    depends_on: Optional[str] = "build"
    # Unique identifier for this block step.
    key: str


def get_step_key(step_label: str) -> str:
    """Convert a step label into a Buildkite step key.

    The label is lowercased, spaces and commas become dashes, and the
    characters ``()%`` are dropped. Consecutive separators (e.g. ", " or
    double spaces) are collapsed into a single dash, so "Test A,  B"
    yields "test-a-b" rather than "test-a--b".
    """
    separators = ", "
    skip_chars = "()%"
    step_key = ""
    for char in step_label.lower():
        if char in separators:
            # Collapse runs of separators into one dash.
            if not step_key.endswith("-"):
                step_key += "-"
        elif char not in skip_chars:
            step_key += char

    return step_key


def get_block_step(step_label: str) -> BuildkiteBlockStep:
    """Build the manual-confirmation block step for the given test step."""
    step_key = get_step_key(step_label)
    return BuildkiteBlockStep(
        block=f"Run {step_label}",
        key=f"block-{step_key}",
    )
62 changes: 62 additions & 0 deletions scripts/pipeline_generator/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import enum
from typing import Optional, List

# Constants
# Hugging Face cache location inside the CI container.
HF_HOME = "/root/.cache/huggingface"
# Default directory test commands run from.
DEFAULT_WORKING_DIR = "/vllm-workspace/tests"
# Public ECR registry hosting vLLM CI images.
VLLM_ECR_URL = "public.ecr.aws/q9t5s3a7"
VLLM_ECR_REPO = f"{VLLM_ECR_URL}/vllm-ci-test-repo"
# Docker Hub repository for AMD ROCm CI images.
AMD_REPO = "rocm/vllm-ci"
# GPU type string that routes a step to the A100 queue (see get_agent_queue).
A100_GPU = "a100"

# File paths
TEST_PATH = ".buildkite/test-pipeline.yaml"
EXTERNAL_HARDWARE_TEST_PATH = ".buildkite/external-tests.yaml"
PIPELINE_FILE_PATH = ".buildkite/pipeline.yaml"
MULTI_NODE_TEST_SCRIPT = ".buildkite/run-multi-node-test.sh"

# Step keys to gate behind a manual block step — presumably populated as
# steps need to be temporarily disabled; empty by default.
STEPS_TO_BLOCK = []


class AgentQueue(str, enum.Enum):
    """Buildkite agent queue names; str-valued so members serialize directly."""
    # AWS CPU-only queues.
    AWS_CPU = "cpu_queue"
    AWS_SMALL_CPU = "small_cpu_queue"
    # AWS GPU queues (1x and 4x L4).
    AWS_1xL4 = "gpu_1_queue"
    AWS_4xL4 = "gpu_4_queue"
    A100 = "a100-queue"
    # AMD hardware queues.
    AMD_GPU = "amd"
    AMD_CPU = "amd-cpu"


def get_agent_queue(no_gpu: Optional[bool], gpu_type: Optional[str], num_gpus: Optional[int]) -> AgentQueue:
    """Pick the Buildkite agent queue for a step's hardware requirements.

    CPU-only steps go to the small CPU queue; an explicit A100 request
    wins over the GPU count; otherwise a single GPU maps to the 1xL4
    queue and anything else to the 4xL4 queue.
    """
    if no_gpu:
        return AgentQueue.AWS_SMALL_CPU
    if gpu_type == A100_GPU:
        return AgentQueue.A100
    if num_gpus == 1:
        return AgentQueue.AWS_1xL4
    return AgentQueue.AWS_4xL4


def get_full_test_command(test_commands: List[str], step_working_dir: str) -> str:
    """Convert test commands into one-line command with the right directory.

    A falsy working dir falls back to DEFAULT_WORKING_DIR; the commands
    are chained with "; " after the cd.
    """
    target_dir = step_working_dir or DEFAULT_WORKING_DIR
    chained_commands = "; ".join(test_commands)
    return f"cd {target_dir}; {chained_commands}"


def get_multi_node_test_command(
        test_commands: List[str],
        working_dir: str,
        num_nodes: int,
        num_gpus: int,
        docker_image_path: str
    ) -> str:
    """Assemble the invocation of the multi-node test script.

    Produces: "<script> <working_dir> <num_nodes> <num_gpus> <image>
    '<cmd1>' '<cmd2>' ...". Each test command is single-quoted so it is
    passed to the script as one argument.
    """
    quoted_commands = [f"'{command}'" for command in test_commands]
    invocation = [
        MULTI_NODE_TEST_SCRIPT,
        working_dir or DEFAULT_WORKING_DIR,
        str(num_nodes),
        str(num_gpus),
        docker_image_path,
    ] + quoted_commands
    return " ".join(invocation)
Empty file added scripts/tests/__init__.py
Empty file.
Empty file.
35 changes: 35 additions & 0 deletions scripts/tests/pipeline_generator/test_step.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import pytest
import sys


from scripts.pipeline_generator.step import get_step_key, get_block_step, BuildkiteBlockStep


@pytest.mark.parametrize(
    ("label", "expected_key"),
    [
        ("Test Step", "test-step"),
        ("Test Step 2", "test-step-2"),
        ("Test (Step)", "test-step"),
        ("Test A, B, C", "test-a-b-c"),
    ],
)
def test_get_step_key(label: str, expected_key: str):
    """Labels are lowercased, separators become dashes, '()%' are dropped."""
    assert get_step_key(label) == expected_key


@pytest.mark.parametrize(
    ("label", "expected_block_step"),
    [
        ("Test Step", BuildkiteBlockStep(block="Run Test Step", key="block-test-step")),
        ("Test Step 2", BuildkiteBlockStep(block="Run Test Step 2", key="block-test-step-2")),
        ("Test (Step)", BuildkiteBlockStep(block="Run Test (Step)", key="block-test-step")),
        ("Test A, B, C", BuildkiteBlockStep(block="Run Test A, B, C", key="block-test-a-b-c")),
    ],
)
def test_get_block_step(label: str, expected_block_step: BuildkiteBlockStep):
    """get_block_step wraps the label in a 'Run ...' block with a block- key."""
    assert get_block_step(label) == expected_block_step


# Allow running this test module directly (outside a pytest invocation).
if __name__ == "__main__":
    sys.exit(pytest.main(["-v", __file__]))
66 changes: 66 additions & 0 deletions scripts/tests/pipeline_generator/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import pytest
import sys
from typing import List

from scripts.pipeline_generator.utils import (
get_agent_queue,
get_full_test_command,
get_multi_node_test_command,
AgentQueue,
MULTI_NODE_TEST_SCRIPT,
)


@pytest.mark.parametrize(
    ("no_gpu", "gpu_type", "num_gpus", "expected_queue"),
    [
        # CPU-only steps route to the small CPU queue regardless of GPU args.
        (True, None, None, AgentQueue.AWS_SMALL_CPU),
        # An explicit A100 request wins over the GPU count.
        (False, "a100", None, AgentQueue.A100),
        (False, None, 1, AgentQueue.AWS_1xL4),
        (False, None, 4, AgentQueue.AWS_4xL4),
    ],
)
def test_get_agent_queue(no_gpu: bool, gpu_type: str, num_gpus: int, expected_queue: AgentQueue):
    """get_agent_queue maps step resource requirements onto agent queues."""
    assert get_agent_queue(no_gpu, gpu_type, num_gpus) == expected_queue


@pytest.mark.parametrize(
    ("commands", "step_working_dir", "expected_command"),
    [
        (["echo 'hello'"], None, "cd /vllm-workspace/tests; echo 'hello'"),
        (["echo 'hello'"], "/vllm-workspace/tests", "cd /vllm-workspace/tests; echo 'hello'"),
        (["echo 'hello1'", "echo 'hello2'"], None, "cd /vllm-workspace/tests; echo 'hello1'; echo 'hello2'"),
    ],
)
def test_get_full_test_command(commands: List[str], step_working_dir: str, expected_command: str):
    """Commands are chained with '; ' after a cd into the working directory."""
    assert get_full_test_command(commands, step_working_dir) == expected_command


def test_get_multi_node_test_command():
    """The multi-node command is '<script> <dir> <nodes> <gpus> <image>' plus
    each test command as a single-quoted argument."""
    commands = [
        (
            "distributed/test_same_node.py;"
            "pytest -v -s distributed/test_multi_node_assignment.py;"
            "pytest -v -s distributed/test_pipeline_parallel.py"
        ),
        "distributed/test_same_node.py",
    ]
    working_dir = "/vllm-workspace/tests"
    image = "ecr-path/vllm-ci-test-repo:latest"
    quoted = " ".join(f"'{command}'" for command in commands)
    expected_result = f"{MULTI_NODE_TEST_SCRIPT} {working_dir} 2 4 {image} {quoted}"
    assert get_multi_node_test_command(commands, working_dir, 2, 4, image) == expected_result


# Allow running this test module directly (outside a pytest invocation).
if __name__ == "__main__":
    sys.exit(pytest.main(["-v", __file__]))
Loading