Skip to content

Commit

Permalink
[3/n][pipeline-gen] Add TestStep and BuildkiteStep along with validat…
Browse files Browse the repository at this point in the history
…ors (#39)

* p

Signed-off-by: kevin <[email protected]>

* p

Signed-off-by: kevin <[email protected]>

* p

Signed-off-by: kevin <[email protected]>

---------

Signed-off-by: kevin <[email protected]>
  • Loading branch information
khluu authored Oct 3, 2024
1 parent 10d0c43 commit be76542
Show file tree
Hide file tree
Showing 3 changed files with 163 additions and 7 deletions.
68 changes: 66 additions & 2 deletions scripts/pipeline_generator/step.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,73 @@
from pydantic import BaseModel, Field
from pydantic import BaseModel, Field, root_validator, model_validator
from typing import List, Dict, Any, Optional
from typing_extensions import Self

from .utils import AgentQueue
from .utils import AgentQueue, GPUType

# Key of the build step; presumably what dependent steps reference — confirm against callers.
BUILD_STEP_KEY = "build"
# Working directory a test step falls back to when it does not set `working_dir`.
DEFAULT_TEST_WORKING_DIR = "/vllm-workspace/tests"

class TestStep(BaseModel):
    """A test step as defined in the test configuration file.

    Exactly one of ``command`` / ``commands`` must be supplied. A single
    ``command`` is folded into a one-element ``commands`` list before field
    validation runs.
    """

    label: str
    # Falls back to DEFAULT_TEST_WORKING_DIR when not given.
    working_dir: Optional[str] = DEFAULT_TEST_WORKING_DIR
    optional: Optional[bool] = False
    fast_check: Optional[bool] = None
    mirror_hardwares: Optional[List[str]] = None
    # `no_gpu` and `gpu` are mutually exclusive (see validate_gpu).
    no_gpu: Optional[bool] = None
    gpu: Optional[GPUType] = None
    num_gpus: Optional[int] = None
    # When set, `num_gpus` is required and there must be one command per node
    # (see validate_multi_node).
    num_nodes: Optional[int] = None
    source_file_dependencies: Optional[List[str]] = None
    soft_fail: Optional[bool] = None
    parallelism: Optional[int] = None
    command: Optional[str] = None
    commands: Optional[List[str]] = None

    @model_validator(mode="before")
    @classmethod
    def validate_and_convert_command(cls, values) -> Any:
        """Require exactly one of 'command' / 'commands' and normalize to 'commands'.

        Args:
            values: Raw input handed to the model before field validation.

        Returns:
            A new dict in which a given 'command' has been replaced by a
            one-element 'commands' list, or ``values`` itself when it is not
            a dict.

        Raises:
            ValueError: If neither or both of 'command' and 'commands' are set.
        """
        if not isinstance(values, dict):
            # Raw input is not guaranteed to be a dict; return it untouched so
            # pydantic reports the type problem instead of an AttributeError
            # escaping from `.get`.
            return values

        # Shallow copy: the caller's dict must not be modified as a side
        # effect of validation (e.g. when passed to `model_validate`).
        values = dict(values)
        command = values.get("command")
        commands = values.get("commands")

        if not command and not commands:
            raise ValueError("Either 'command' or 'commands' must be defined.")
        if command and commands:
            raise ValueError("Only one of 'command' or 'commands' can be defined.")
        if command:
            values["commands"] = [values.pop("command")]
        return values

    @model_validator(mode="after")
    def validate_gpu(self) -> Self:
        """Reject steps that both request a GPU type and set 'no_gpu'.

        Raises:
            ValueError: If 'gpu' and 'no_gpu' are both truthy.
        """
        if self.gpu and self.no_gpu:
            raise ValueError("Both 'gpu' and 'no_gpu' cannot be defined together.")
        return self

    @model_validator(mode="after")
    def validate_multi_node(self) -> Self:
        """Check the constraints that apply when 'num_nodes' is set.

        Raises:
            ValueError: If 'num_nodes' is set without 'num_gpus', or if the
                number of commands differs from 'num_nodes'.
        """
        if self.num_nodes and not self.num_gpus:
            raise ValueError("'num_gpus' must be defined if 'num_nodes' is defined.")
        if self.num_nodes and len(self.commands) != self.num_nodes:
            raise ValueError("Number of commands must match the number of nodes.")
        return self


class BuildkiteStep(BaseModel):
    """A step in Buildkite format."""

    label: str
    # Runs on the AWS CPU queue unless another queue is given.
    agents: Dict[str, AgentQueue] = {"queue": AgentQueue.AWS_CPU}
    commands: List[str]
    key: Optional[str] = None
    plugins: Optional[List[Dict]] = None
    parallelism: Optional[int] = None
    soft_fail: Optional[bool] = None
    # Depends on the build step by default; uses the shared constant rather
    # than repeating the "build" literal so the two cannot drift apart.
    depends_on: Optional[str] = BUILD_STEP_KEY
    env: Optional[Dict[str, str]] = None
    retry: Optional[Dict[str, Any]] = None


class BuildkiteBlockStep(BaseModel):
"""This class represents a block step in Buildkite format."""
Expand Down
5 changes: 3 additions & 2 deletions scripts/pipeline_generator/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
# Image repository locations (purpose inferred from names/values — confirm against callers).
VLLM_ECR_URL = "public.ecr.aws/q9t5s3a7"
# Built from VLLM_ECR_URL, so it follows any change to the base URL.
VLLM_ECR_REPO = f"{VLLM_ECR_URL}/vllm-ci-test-repo"
AMD_REPO = "rocm/vllm-ci"
A100_GPU = "a100"

# File paths
TEST_PATH = ".buildkite/test-pipeline.yaml"
Expand All @@ -23,6 +22,8 @@

STEPS_TO_BLOCK = []

class GPUType(str, enum.Enum):
    """GPU types that can be named explicitly.

    Members are also ``str`` instances, so they compare equal to their raw
    string values.
    """

    A100 = "a100"

class AgentQueue(str, enum.Enum):
AWS_CPU = "cpu_queue"
Expand All @@ -37,7 +38,7 @@ class AgentQueue(str, enum.Enum):
def get_agent_queue(no_gpu: Optional[bool], gpu_type: Optional[str], num_gpus: Optional[int]) -> AgentQueue:
    """Choose the agent queue for a step from its GPU requirements.

    Args:
        no_gpu: Truthy when the step does not need a GPU.
        gpu_type: Requested GPU type as a string, if any.
        num_gpus: Number of GPUs requested, if any.

    Returns:
        ``AgentQueue.AWS_SMALL_CPU`` when ``no_gpu`` is truthy;
        ``AgentQueue.A100`` when ``gpu_type`` equals the A100 GPU type value;
        otherwise ``AgentQueue.AWS_1xL4`` when ``num_gpus`` is unset/zero or 1,
        and ``AgentQueue.AWS_4xL4`` for any other count.
    """
    if no_gpu:
        return AgentQueue.AWS_SMALL_CPU
    # Compare against the enum value; the old module-level A100_GPU constant
    # no longer exists.
    if gpu_type == GPUType.A100.value:
        return AgentQueue.A100
    return AgentQueue.AWS_1xL4 if not num_gpus or num_gpus == 1 else AgentQueue.AWS_4xL4

Expand Down
97 changes: 94 additions & 3 deletions scripts/tests/pipeline_generator/test_step.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import pytest
import sys
from pydantic import ValidationError


from scripts.pipeline_generator.step import get_step_key, get_block_step, BuildkiteBlockStep

from scripts.pipeline_generator.step import get_step_key, get_block_step, BuildkiteBlockStep, TestStep, DEFAULT_TEST_WORKING_DIR, BuildkiteStep
from scripts.pipeline_generator.utils import AgentQueue, GPUType

@pytest.mark.parametrize(
("step_label", "expected_result"),
Expand All @@ -30,6 +30,97 @@ def test_get_step_key(step_label: str, expected_result: str):
def test_get_block_step(step_label: str, expected_result: BuildkiteBlockStep):
assert get_block_step(step_label) == expected_result

def test_create_test_step_with_command():
    """A lone ``command`` ends up in ``commands`` and unset fields take defaults."""
    step = TestStep(label="Test Step", command="echo 'hello'")

    assert step.label == "Test Step"
    # Fields that were not supplied fall back to their declared defaults.
    assert step.working_dir == DEFAULT_TEST_WORKING_DIR
    assert step.optional is False
    # The single command is exposed in list form only.
    assert step.commands == ["echo 'hello'"]
    assert step.command is None


def test_create_test_step_fail_duplicate_command():
    """Supplying both ``command`` and ``commands`` is rejected."""
    # Pin the message, as the sibling tests do, so an unrelated validation
    # failure cannot satisfy this test.
    with pytest.raises(ValueError, match="Only one of 'command' or 'commands' can be defined."):
        TestStep(
            label="Test Step",
            command="echo 'hello'",
            commands=["echo 'hello'"],
        )

def test_create_test_step_fail_gpu_and_no_gpu():
    """Requesting a GPU type while also setting ``no_gpu`` is rejected."""
    conflicting_fields = {"gpu": "a100", "no_gpu": True}
    with pytest.raises(ValueError, match="cannot be defined together"):
        TestStep(label="Test Step", command="echo 'hello'", **conflicting_fields)

def test_create_test_step_fail_gpu():
    """A ``gpu`` value outside the known GPU types fails validation."""
    unknown_gpu = "abc100"
    with pytest.raises(ValidationError):
        TestStep(label="Test Step", command="echo 'hello'", gpu=unknown_gpu)

def test_create_test_step_multi_node():
    """Multi-node steps need ``num_gpus`` and exactly one command per node."""
    # num_nodes without num_gpus is rejected.
    with pytest.raises(ValueError, match="'num_gpus' must be defined if 'num_nodes' is defined."):
        TestStep(label="Test Step", command="echo 'hello'", num_nodes=2)

    # Three commands for two nodes is a mismatch.
    too_many_commands = ["echo 'hello1'", "echo 'hello2'", "echo 'hello3'"]
    with pytest.raises(ValueError, match="Number of commands must match the number of nodes."):
        TestStep(label="Test Step", num_nodes=2, num_gpus=2, commands=too_many_commands)

    # One command per node validates and the fields round-trip unchanged.
    per_node_commands = ["echo 'hello1'", "echo 'hello2'"]
    step = TestStep(label="Test Step", num_nodes=2, num_gpus=2, commands=per_node_commands)
    assert step.label == "Test Step"
    assert step.num_nodes == 2
    assert step.num_gpus == 2
    assert step.commands == ["echo 'hello1'", "echo 'hello2'"]

def test_create_buildkite_step():
    """A minimal Buildkite step keeps its inputs and picks up the defaults."""
    step = BuildkiteStep(label="Test Step", key="test-step", commands=["echo 'hello'"])

    # Explicitly supplied fields.
    assert step.label == "Test Step"
    assert step.key == "test-step"
    # Defaults for fields that were not supplied.
    assert step.agents == {"queue": AgentQueue.AWS_CPU}
    assert step.depends_on == "build"

def test_create_buildkite_step_fail_no_command():
    """``commands`` is required, so leaving it out fails validation."""
    fields_without_commands = {"label": "Test Step", "key": "test-step"}
    with pytest.raises(ValidationError):
        BuildkiteStep(**fields_without_commands)

def test_create_buildkite_step_fail_wrong_agent_queue():
    """An ``agents`` queue that is not a known agent queue fails validation."""
    with pytest.raises(ValidationError):
        BuildkiteStep(
            label="Test Step",
            key="test-step",
            # Supply the required `commands` so the only invalid input is the
            # queue; otherwise the missing field alone would raise and the
            # queue check would never be exercised.
            commands=["echo 'hello'"],
            agents={"queue": "wrong-queue"},
        )


if __name__ == "__main__":
    # Allow running this test module directly; exit with pytest's status code.
    exit_status = pytest.main(["-v", __file__])
    sys.exit(exit_status)

0 comments on commit be76542

Please sign in to comment.