[3/n][pipeline-gen] Add TestStep and BuildkiteStep along with validators (#39)

* p Signed-off-by: kevin <[email protected]> * p Signed-off-by: kevin <[email protected]> * p Signed-off-by: kevin <[email protected]> --------- Signed-off-by: kevin <[email protected]>
vllm-project · Oct 3, 2024 · be76542 · be76542
1 parent 10d0c43
commit be76542
Show file tree

Hide file tree

Showing 3 changed files with 163 additions and 7 deletions.
diff --git a/scripts/pipeline_generator/step.py b/scripts/pipeline_generator/step.py
@@ -1,9 +1,73 @@
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, root_validator, model_validator
 from typing import List, Dict, Any, Optional
+from typing_extensions import Self
 
-from .utils import AgentQueue
+from .utils import AgentQueue, GPUType
 
 BUILD_STEP_KEY = "build"
+DEFAULT_TEST_WORKING_DIR = "/vllm-workspace/tests"
+
+class TestStep(BaseModel):
+    """A test step as declared in the test configuration file.
+
+    Exactly one of ``command`` / ``commands`` must be provided. A single
+    ``command`` is folded into a one-element ``commands`` list before field
+    validation, so consumers only need to read ``commands``.
+    """
+    label: str
+    working_dir: Optional[str] = DEFAULT_TEST_WORKING_DIR
+    optional: Optional[bool] = False
+    fast_check: Optional[bool] = None
+    mirror_hardwares: Optional[List[str]] = None
+    no_gpu: Optional[bool] = None
+    gpu: Optional[GPUType] = None
+    num_gpus: Optional[int] = None
+    num_nodes: Optional[int] = None
+    source_file_dependencies: Optional[List[str]] = None
+    soft_fail: Optional[bool] = None
+    parallelism: Optional[int] = None
+    command: Optional[str] = None
+    commands: Optional[List[str]] = None
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_and_convert_command(cls, values: Any) -> Any:
+        """Require exactly one of 'command'/'commands' and normalize to 'commands'.
+
+        Returns:
+            The input data, with 'command' replaced by a one-element
+            'commands' list when 'command' was given.
+
+        Raises:
+            ValueError: if neither or both of the two keys hold a truthy value.
+        """
+        # A "before" validator receives the raw input, which is not guaranteed
+        # to be a dict; leave anything else to pydantic's own validation
+        # instead of failing with AttributeError on `.get`.
+        if not isinstance(values, dict):
+            return values
+        command = values.get("command")
+        commands = values.get("commands")
+        if not command and not commands:
+            raise ValueError("Either 'command' or 'commands' must be defined.")
+        if command and commands:
+            raise ValueError("Only one of 'command' or 'commands' can be defined.")
+        if command:
+            # Build a new dict rather than editing the input in place, so a
+            # dict handed in by the caller is not mutated by validation.
+            values = {**values, "commands": [command]}
+            del values["command"]
+        return values
+
+    @model_validator(mode="after")
+    def validate_gpu(self) -> Self:
+        """Reject steps that both request a GPU type and set 'no_gpu'.
+
+        Raises:
+            ValueError: if 'gpu' and 'no_gpu' are both truthy.
+        """
+        if self.gpu and self.no_gpu:
+            raise ValueError("Both 'gpu' and 'no_gpu' cannot be defined together.")
+        return self
+
+    @model_validator(mode="after")
+    def validate_multi_node(self) -> Self:
+        """Check the multi-node constraints; a no-op when 'num_nodes' is unset.
+
+        Raises:
+            ValueError: if 'num_nodes' is set without 'num_gpus', or if the
+                number of commands differs from 'num_nodes'.
+        """
+        if not self.num_nodes:
+            return self
+        if not self.num_gpus:
+            raise ValueError("'num_gpus' must be defined if 'num_nodes' is defined.")
+        # `commands` is Optional at the type level; treat None as empty so a
+        # mismatch is reported as a validation error rather than a TypeError.
+        if len(self.commands or []) != self.num_nodes:
+            raise ValueError("Number of commands must match the number of nodes.")
+        return self
+
+
+class BuildkiteStep(BaseModel):
+    """This class represents a step in Buildkite format.
+
+    Only ``label`` and ``commands`` are required. ``agents`` defaults to the
+    AWS CPU queue and ``depends_on`` defaults to the build step key.
+    """
+    label: str
+    # default_factory gives every instance its own dict instead of relying on
+    # a shared mutable literal.
+    agents: Dict[str, AgentQueue] = Field(
+        default_factory=lambda: {"queue": AgentQueue.AWS_CPU}
+    )
+    commands: List[str]
+    key: Optional[str] = None
+    plugins: Optional[List[Dict]] = None
+    parallelism: Optional[int] = None
+    soft_fail: Optional[bool] = None
+    # Reuse the module constant so this default cannot drift from the key of
+    # the build step.
+    depends_on: Optional[str] = BUILD_STEP_KEY
+    env: Optional[Dict[str, str]] = None
+    retry: Optional[Dict[str, Any]] = None
+
 
 class BuildkiteBlockStep(BaseModel):
     """This class represents a block step in Buildkite format."""

diff --git a/scripts/pipeline_generator/utils.py b/scripts/pipeline_generator/utils.py
@@ -7,7 +7,6 @@
 VLLM_ECR_URL = "public.ecr.aws/q9t5s3a7"
 VLLM_ECR_REPO = f"{VLLM_ECR_URL}/vllm-ci-test-repo"
 AMD_REPO = "rocm/vllm-ci"
-A100_GPU = "a100"
 
 # File paths
 TEST_PATH = ".buildkite/test-pipeline.yaml"
@@ -23,6 +22,8 @@
 
 STEPS_TO_BLOCK = []
 
+class GPUType(str, enum.Enum):
+    """GPU type identifiers; string-valued, so members compare equal to their raw value."""
+    A100 = "a100"
 
 class AgentQueue(str, enum.Enum):
     AWS_CPU = "cpu_queue"
@@ -37,7 +38,7 @@ class AgentQueue(str, enum.Enum):
 def get_agent_queue(no_gpu: Optional[bool], gpu_type: Optional[str], num_gpus: Optional[int]) -> AgentQueue:
+    """Select the AgentQueue for a step.
+
+    ``no_gpu`` takes precedence and selects the small CPU queue. Otherwise an
+    A100 ``gpu_type`` selects the A100 queue. Otherwise the 1xL4 queue is used
+    when ``num_gpus`` is unset, zero or 1, and the 4xL4 queue for any other count.
+    """
     if no_gpu:
         return AgentQueue.AWS_SMALL_CPU
-    if gpu_type == A100_GPU:
+    if gpu_type == GPUType.A100.value:
         return AgentQueue.A100
     return AgentQueue.AWS_1xL4 if not num_gpus or num_gpus == 1 else AgentQueue.AWS_4xL4
 

diff --git a/scripts/tests/pipeline_generator/test_step.py b/scripts/tests/pipeline_generator/test_step.py
@@ -1,9 +1,9 @@
 import pytest
 import sys
+from pydantic import ValidationError
 
-
-from scripts.pipeline_generator.step import get_step_key, get_block_step, BuildkiteBlockStep
-
+from scripts.pipeline_generator.step import get_step_key, get_block_step, BuildkiteBlockStep, TestStep, DEFAULT_TEST_WORKING_DIR, BuildkiteStep
+from scripts.pipeline_generator.utils import AgentQueue, GPUType
 
 @pytest.mark.parametrize(
     ("step_label", "expected_result"),
@@ -30,6 +30,97 @@ def test_get_step_key(step_label: str, expected_result: str):
 def test_get_block_step(step_label: str, expected_result: BuildkiteBlockStep):
     assert get_block_step(step_label) == expected_result
 
+def test_create_test_step_with_command():
+    """A lone 'command' is accepted and normalized into 'commands'."""
+    step = TestStep(label="Test Step", command="echo 'hello'")
+
+    assert step.label == "Test Step"
+    # 'command' is folded into a one-element 'commands' list.
+    assert step.commands == ["echo 'hello'"]
+    assert step.command is None
+    # Fields that were not supplied fall back to their defaults.
+    assert step.working_dir == DEFAULT_TEST_WORKING_DIR
+    assert step.optional is False
+
+
+def test_create_test_step_fail_duplicate_command():
+    """Supplying both 'command' and 'commands' is rejected."""
+    # Pin the message: without `match`, any validation error would satisfy
+    # this test, not just the duplicate-command check.
+    with pytest.raises(ValueError, match="Only one of 'command' or 'commands'"):
+        TestStep(
+            label="Test Step",
+            command="echo 'hello'",
+            commands=["echo 'hello'"],
+        )
+
+def test_create_test_step_fail_gpu_and_no_gpu():
+    """Requesting a GPU type together with 'no_gpu' is rejected."""
+    conflicting = {
+        "label": "Test Step",
+        "command": "echo 'hello'",
+        "gpu": "a100",
+        "no_gpu": True,
+    }
+    with pytest.raises(ValueError, match="cannot be defined together"):
+        TestStep(**conflicting)
+
+def test_create_test_step_fail_gpu():
+    """A 'gpu' value that is not a GPUType member fails validation."""
+    invalid = {"label": "Test Step", "command": "echo 'hello'", "gpu": "abc100"}
+    with pytest.raises(ValidationError):
+        TestStep(**invalid)
+
+def test_create_test_step_multi_node():
+    """Multi-node steps need 'num_gpus' and exactly one command per node."""
+    # 'num_nodes' without 'num_gpus' is rejected.
+    with pytest.raises(ValueError, match="'num_gpus' must be defined if 'num_nodes' is defined."):
+        TestStep(label="Test Step", command="echo 'hello'", num_nodes=2)
+
+    # Three commands for two nodes is rejected.
+    too_many = ["echo 'hello1'", "echo 'hello2'", "echo 'hello3'"]
+    with pytest.raises(ValueError, match="Number of commands must match the number of nodes."):
+        TestStep(label="Test Step", num_nodes=2, num_gpus=2, commands=too_many)
+
+    # Two commands for two nodes is accepted as given.
+    per_node = ["echo 'hello1'", "echo 'hello2'"]
+    step = TestStep(label="Test Step", num_nodes=2, num_gpus=2, commands=per_node)
+    assert step.label == "Test Step"
+    assert step.num_nodes == 2
+    assert step.num_gpus == 2
+    assert step.commands == ["echo 'hello1'", "echo 'hello2'"]
+
+def test_create_buildkite_step():
+    """A minimal BuildkiteStep keeps its inputs and fills in the defaults."""
+    step = BuildkiteStep(label="Test Step", key="test-step", commands=["echo 'hello'"])
+
+    # Supplied values are preserved.
+    assert step.label == "Test Step"
+    assert step.key == "test-step"
+    # Omitted values take the documented defaults.
+    assert step.agents == {"queue": AgentQueue.AWS_CPU}
+    assert step.depends_on == "build"
+
+def test_create_buildkite_step_fail_no_command():
+    """'commands' is required, so leaving it out fails validation."""
+    without_commands = {"label": "Test Step", "key": "test-step"}
+    with pytest.raises(ValidationError):
+        BuildkiteStep(**without_commands)
+
+def test_create_buildkite_step_fail_wrong_agent_queue():
+    """An agent queue that is not an AgentQueue member fails validation."""
+    # 'commands' must be supplied here: it is a required field, so omitting it
+    # would raise ValidationError even if the queue were valid and the test
+    # would pass for the wrong reason.
+    with pytest.raises(ValidationError) as exc_info:
+        BuildkiteStep(
+            label="Test Step",
+            key="test-step",
+            commands=["echo 'hello'"],
+            agents={"queue": "wrong-queue"},
+        )
+    # Every reported error must point at the 'agents' field.
+    assert all(err["loc"][0] == "agents" for err in exc_info.value.errors())
+
 
 if __name__ == "__main__":
     sys.exit(pytest.main(["-v", __file__]))