-
Notifications
You must be signed in to change notification settings - Fork 17
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Pipeline generator steps #32
Changes from 9 commits
717b163
e3aac9c
49043e5
72f1fd2
3cdd2b3
ab323d9
231cd2f
df69376
6bea5dc
5a086b9
3c45849
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
click==8.1.7 | ||
pydantic==2.9.2 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
from pydantic import BaseModel, Field | ||
from typing import List, Dict, Any, Optional | ||
|
||
from .utils import AgentQueue | ||
|
||
|
||
class TestStep(BaseModel):
    """This class represents a test step defined in the test configuration file.

    Only ``label`` is required; every other field has a default, so a step
    needs to declare only the settings it overrides.
    """
    # Required: the only field without a default.
    label: str
    fast_check: bool = False
    # default_factory gives each instance its own empty list.
    mirror_hardwares: List[str] = Field(default_factory=list)
    # Empty string is the default when no GPU type is given.
    gpu: str = ""
    num_gpus: int = 1
    num_nodes: int = 1
    # Same path as DEFAULT_WORKING_DIR in utils; the literal is repeated here.
    working_dir: str = "/vllm-workspace/tests"
    source_file_dependencies: List[str] = Field(default_factory=list)
    no_gpu: bool = False
    soft_fail: bool = False
    parallelism: int = 1
    optional: bool = False
    # Both spellings are accepted and both default to None; nothing in this
    # class checks that exactly one of them is set.
    command: Optional[str] = None
    commands: Optional[List[str]] = None
|
||
|
||
class BuildkiteStep(BaseModel):
    """This class represents a step in Buildkite format.

    ``label`` and ``key`` are required; the remaining fields are optional and
    default to ``None`` unless noted below.
    """
    label: str
    key: str
    # Defaults to the AWS CPU queue when no agent selection is supplied.
    agents: Dict[str, Any] = {"queue": AgentQueue.AWS_CPU}
    commands: Optional[List[str]] = None
    plugins: Optional[List[Dict]] = None
    parallelism: Optional[int] = None
    soft_fail: Optional[bool] = None
    # Defaults to the key "build"; pass None explicitly to drop the dependency.
    depends_on: Optional[str] = "build"
    env: Optional[Dict[str, str]] = None
    retry: Optional[Dict[str, Any]] = None
|
||
|
||
class BuildkiteBlockStep(BaseModel):
    """This class represents a block step in Buildkite format.

    ``block`` and ``key`` are required; ``depends_on`` defaults to "build".
    """
    block: str
    # NOTE(review): "build" is also the default depends_on of BuildkiteStep;
    # a shared module-level constant would keep the two in sync.
    depends_on: Optional[str] = "build"
    key: str
|
||
|
||
def get_step_key(step_label: str) -> str:
    """Derive a step key from a human-readable step label.

    The label is lower-cased, the characters ``(``, ``)`` and ``%`` are
    dropped, and every run of separators (comma, space or dash) is collapsed
    into a single ``-``. Collapsing runs means labels such as ``"A, B"``,
    ``"A  B"`` or ``"A (x) B"``-style inputs never produce ``--`` in the key.

    Args:
        step_label: The label of the step, e.g. ``"Test A, B, C"``.

    Returns:
        The derived key, e.g. ``"test-a-b-c"``. An empty label yields an
        empty key.
    """
    separators = ", -"
    skip_chars = "()%"
    key_chars = []
    for char in step_label.lower():
        if char in separators:
            # Emit one dash per run of separators; a dropped character such
            # as "(" between two spaces must not split the run either.
            if key_chars[-1:] != ["-"]:
                key_chars.append("-")
        elif char not in skip_chars:
            key_chars.append(char)

    return "".join(key_chars)
|
||
|
||
def get_block_step(step_label: str) -> BuildkiteBlockStep:
    """Create the BuildkiteBlockStep for the step labelled *step_label*.

    The block text is ``"Run <label>"`` and the key is ``"block-"`` followed
    by the key that get_step_key derives from the label.
    """
    step_key = get_step_key(step_label)
    return BuildkiteBlockStep(
        block=f"Run {step_label}",
        key=f"block-{step_key}",
    )
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import enum | ||
from typing import Optional, List | ||
|
||
# Constants
HF_HOME = "/root/.cache/huggingface"
# Fallback working directory used when a step does not provide one.
DEFAULT_WORKING_DIR = "/vllm-workspace/tests"
VLLM_ECR_URL = "public.ecr.aws/q9t5s3a7"
# Derived from VLLM_ECR_URL.
VLLM_ECR_REPO = f"{VLLM_ECR_URL}/vllm-ci-test-repo"
AMD_REPO = "rocm/vllm-ci"
# GPU type string that get_agent_queue maps to the A100 queue.
A100_GPU = "a100"

# File paths
TEST_PATH = ".buildkite/test-pipeline.yaml"
EXTERNAL_HARDWARE_TEST_PATH = ".buildkite/external-tests.yaml"
PIPELINE_FILE_PATH = ".buildkite/pipeline.yaml"
# Script invoked by the command that get_multi_node_test_command builds.
MULTI_NODE_TEST_SCRIPT = ".buildkite/run-multi-node-test.sh"

# Empty in this revision.
STEPS_TO_BLOCK = []
|
||
|
||
class AgentQueue(str, enum.Enum):
    """Names of the agent queues a step can be assigned to.

    The ``str`` mixin makes each member compare equal to its raw queue-name
    string.
    """
    AWS_CPU = "cpu_queue"
    AWS_SMALL_CPU = "small_cpu_queue"
    AWS_1xL4 = "gpu_1_queue"
    AWS_4xL4 = "gpu_4_queue"
    A100 = "a100-queue"
    AMD_GPU = "amd"
    AMD_CPU = "amd-cpu"
|
||
|
||
def get_agent_queue(no_gpu: Optional[bool], gpu_type: Optional[str], num_gpus: Optional[int]) -> AgentQueue:
    """Pick the agent queue for a step from its GPU requirements.

    Args:
        no_gpu: When truthy, the step runs on the small CPU queue and the
            other arguments are ignored.
        gpu_type: GPU type requested by the step; ``A100_GPU`` selects the
            A100 queue regardless of ``num_gpus``.
        num_gpus: Number of GPUs requested. ``None`` is treated as one GPU.

    Returns:
        The AgentQueue member the step should be scheduled on.
    """
    if no_gpu:
        return AgentQueue.AWS_SMALL_CPU
    if gpu_type == A100_GPU:
        return AgentQueue.A100
    # An unspecified GPU count must not fall through to the 4-GPU queue:
    # `None == 1` is False, so None needs its own check.
    if num_gpus is None or num_gpus == 1:
        return AgentQueue.AWS_1xL4
    return AgentQueue.AWS_4xL4
|
||
|
||
def get_full_test_command(test_commands: List[str], step_working_dir: str) -> str:
    """Collapse the test commands into one shell line run from the step's directory.

    The result is ``cd <dir>; <cmd1>; <cmd2>; ...``. When *step_working_dir*
    is empty or None, DEFAULT_WORKING_DIR is used instead.
    """
    target_dir = step_working_dir if step_working_dir else DEFAULT_WORKING_DIR
    joined_commands = "; ".join(test_commands)
    return f"cd {target_dir}; {joined_commands}"
|
||
|
||
def get_multi_node_test_command(
    test_commands: List[str],
    working_dir: str,
    num_nodes: int,
    num_gpus: int,
    docker_image_path: str
) -> str:
    """Build the one-line invocation of the multi-node test script.

    The line has the form::

        <MULTI_NODE_TEST_SCRIPT> <working_dir> <num_nodes> <num_gpus> <image> '<cmd1>' '<cmd2>' ...

    Args:
        test_commands: Commands to pass to the script; each becomes one
            single-quoted shell word.
        working_dir: Directory passed to the script; DEFAULT_WORKING_DIR is
            used when it is empty or None.
        num_nodes: Number of nodes, passed through as text.
        num_gpus: Number of GPUs, passed through as text.
        docker_image_path: Image path passed through unchanged.

    Returns:
        The space-joined command line.
    """
    # Inside single quotes the shell takes every character literally, so an
    # embedded ' has to close the quote, emit an escaped ', and reopen it
    # ('\''). Commands without single quotes come out exactly as before.
    quoted_commands = [
        "'" + str(command).replace("'", "'\\''") + "'"
        for command in test_commands
    ]
    multi_node_command = [
        MULTI_NODE_TEST_SCRIPT,
        working_dir or DEFAULT_WORKING_DIR,
        str(num_nodes),
        str(num_gpus),
        docker_image_path,
        *quoted_commands,
    ]
    return " ".join(map(str, multi_node_command))
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import pytest | ||
import sys | ||
|
||
|
||
from scripts.pipeline_generator.step import get_step_key, get_block_step, BuildkiteBlockStep | ||
|
||
|
||
@pytest.mark.parametrize(
    ("label", "expected_key"),
    [
        ("Test Step", "test-step"),
        ("Test Step 2", "test-step-2"),
        ("Test (Step)", "test-step"),
        ("Test A, B, C", "test-a-b-c"),
    ],
)
def test_get_step_key(label: str, expected_key: str):
    """get_step_key turns each label into the expected key."""
    actual_key = get_step_key(label)
    assert actual_key == expected_key
|
||
|
||
@pytest.mark.parametrize(
    ("label", "expected_block", "expected_key"),
    [
        ("Test Step", "Run Test Step", "block-test-step"),
        ("Test Step 2", "Run Test Step 2", "block-test-step-2"),
        ("Test (Step)", "Run Test (Step)", "block-test-step"),
        ("Test A, B, C", "Run Test A, B, C", "block-test-a-b-c"),
    ],
)
def test_get_block_step(label: str, expected_block: str, expected_key: str):
    """get_block_step builds a block step with the expected text and key."""
    expected_step = BuildkiteBlockStep(block=expected_block, key=expected_key)
    assert get_block_step(label) == expected_step
|
||
|
||
if __name__ == "__main__":
    # Run this module's tests verbosely and exit with pytest's status code.
    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
import pytest | ||
import sys | ||
from typing import List | ||
|
||
from scripts.pipeline_generator.utils import ( | ||
get_agent_queue, | ||
get_full_test_command, | ||
get_multi_node_test_command, | ||
AgentQueue, | ||
MULTI_NODE_TEST_SCRIPT, | ||
) | ||
|
||
|
||
@pytest.mark.parametrize(
    ("no_gpu", "gpu_type", "num_gpus", "expected_queue"),
    [
        (True, None, None, AgentQueue.AWS_SMALL_CPU),
        (False, "a100", None, AgentQueue.A100),
        (False, None, 1, AgentQueue.AWS_1xL4),
        (False, None, 4, AgentQueue.AWS_4xL4),
    ],
)
def test_get_agent_queue(no_gpu: bool, gpu_type: str, num_gpus: int, expected_queue: AgentQueue):
    """get_agent_queue returns the expected queue for each GPU requirement."""
    selected_queue = get_agent_queue(no_gpu, gpu_type, num_gpus)
    assert selected_queue == expected_queue
|
||
|
||
@pytest.mark.parametrize(
    ("commands", "working_dir", "expected_command"),
    [
        (["echo 'hello'"], None, "cd /vllm-workspace/tests; echo 'hello'"),
        (["echo 'hello'"], "/vllm-workspace/tests", "cd /vllm-workspace/tests; echo 'hello'"),
        (["echo 'hello1'", "echo 'hello2'"], None, "cd /vllm-workspace/tests; echo 'hello1'; echo 'hello2'"),
    ],
)
def test_get_full_test_command(commands: List[str], working_dir: str, expected_command: str):
    """get_full_test_command prefixes the cd and joins commands with '; '."""
    full_command = get_full_test_command(commands, working_dir)
    assert full_command == expected_command
|
||
|
||
def test_get_multi_node_test_command():
    """The multi-node command lists the script, its settings, then each quoted command."""
    first_command = (
        "distributed/test_same_node.py;"
        "pytest -v -s distributed/test_multi_node_assignment.py;"
        "pytest -v -s distributed/test_pipeline_parallel.py"
    )
    second_command = "distributed/test_same_node.py"
    working_dir = "/vllm-workspace/tests"
    num_nodes = 2
    num_gpus = 4
    docker_image_path = "ecr-path/vllm-ci-test-repo:latest"

    expected_result = (
        f"{MULTI_NODE_TEST_SCRIPT} {working_dir} {num_nodes} {num_gpus} "
        f"{docker_image_path} '{first_command}' '{second_command}'"
    )

    actual_result = get_multi_node_test_command(
        [first_command, second_command],
        working_dir,
        num_nodes,
        num_gpus,
        docker_image_path,
    )
    assert actual_result == expected_result
|
||
|
||
if __name__ == "__main__":
    # Run this module's tests verbosely and exit with pytest's status code.
    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why are these in this PR? Do you perhaps want to test unmarshaling these data structures?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The method that unmarshals them from the YAML file will be in the next PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Then why is this checked in as part of this PR? It is not referenced anywhere in this PR.