Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Pipeline generator steps #32

Merged
merged 11 commits into from
Sep 23, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,12 @@ terraform.rc

.env

.vscode/
.vscode/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

.cache
*.log
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
click==8.1.7
pydantic==2.9.2
Empty file added scripts/__init__.py
Empty file.
Empty file.
60 changes: 60 additions & 0 deletions scripts/pipeline_generator/step.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from pydantic import BaseModel, Field
from typing import List, Dict, Any, Optional

from .utils import AgentQueue


class TestStep(BaseModel):
    """A test step as declared in the test configuration YAML file.

    Fields mirror what a test entry may specify; unmarshaling from the
    YAML file is handled elsewhere.
    """
    label: str
    # Include this step in the fast-check pipeline.
    fast_check: bool = False
    # Additional hardware backends to mirror this step onto.
    mirror_hardwares: List[str] = Field(default_factory=list)
    # Requested GPU type (e.g. "a100"); empty string means no preference.
    gpu: str = ""
    num_gpus: int = 1
    num_nodes: int = 1
    working_dir: str = "/vllm-workspace/tests"
    # Source paths whose changes should trigger this step — presumably
    # used for change-based filtering; confirm against the generator.
    source_file_dependencies: List[str] = Field(default_factory=list)
    # Run on a CPU-only agent queue.
    no_gpu: bool = False
    # When True, a failure does not fail the whole build.
    soft_fail: bool = False
    parallelism: int = 1
    optional: bool = False
    # A single command or a list of commands may be given.
    command: Optional[str] = None
    commands: Optional[List[str]] = None


class BuildkiteStep(BaseModel):
    """This class represents a step in Buildkite format."""
    # Display name of the step in the Buildkite UI.
    label: str
    # Unique identifier other steps can reference via depends_on.
    key: str
    # Agent targeting; defaults to the AWS CPU queue.
    agents: Dict[str, Any] = {"queue": AgentQueue.AWS_CPU}
    # Shell commands to run, in order.
    commands: Optional[List[str]] = None
    # Buildkite plugin configurations — NOTE(review): schema not visible here.
    plugins: Optional[List[Dict]] = None
    # Number of parallel jobs to spawn for this step.
    parallelism: Optional[int] = None
    # When True, failures do not fail the whole build.
    soft_fail: Optional[bool] = None
    # Defaults to waiting on the "build" step.
    depends_on: Optional[str] = "build"
    # Extra environment variables for the step.
    env: Optional[Dict[str, str]] = None
    # Buildkite retry configuration — NOTE(review): confirm expected schema.
    retry: Optional[Dict[str, Any]] = None


class BuildkiteBlockStep(BaseModel):
    """A manual-approval (block) step in Buildkite format."""
    # Label shown on the block step in the Buildkite UI.
    block: str
    # Step key this block step waits on; defaults to the "build" step.
    depends_on: Optional[str] = "build"
    # Unique identifier for this block step.
    key: str


def get_step_key(step_label: str) -> str:
    """Convert a step label into a Buildkite step key.

    The label is lowercased, spaces and commas become dashes, and the
    characters ``()%`` are dropped. Consecutive separators (e.g. ", " or
    double spaces) are collapsed into a single dash, so "Test A,  B"
    yields "test-a-b" rather than "test-a--b".
    """
    separators = ", "
    skip_chars = "()%"
    step_key = ""
    for char in step_label.lower():
        if char in separators:
            # Collapse runs of separators into one dash.
            if not step_key.endswith("-"):
                step_key += "-"
        elif char not in skip_chars:
            step_key += char

    return step_key


def get_block_step(step_label: str) -> BuildkiteBlockStep:
    """Build the manual-confirmation block step for the given test step."""
    step_key = get_step_key(step_label)
    return BuildkiteBlockStep(
        block=f"Run {step_label}",
        key=f"block-{step_key}",
    )
62 changes: 62 additions & 0 deletions scripts/pipeline_generator/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import enum
from typing import Optional, List

# Constants
# Hugging Face cache location inside the CI container.
HF_HOME = "/root/.cache/huggingface"
# Default directory test commands run from.
DEFAULT_WORKING_DIR = "/vllm-workspace/tests"
# Public ECR registry hosting vLLM CI images.
VLLM_ECR_URL = "public.ecr.aws/q9t5s3a7"
VLLM_ECR_REPO = f"{VLLM_ECR_URL}/vllm-ci-test-repo"
# Docker Hub repository for AMD ROCm CI images.
AMD_REPO = "rocm/vllm-ci"
# GPU type string that routes a step to the A100 queue (see get_agent_queue).
A100_GPU = "a100"

# File paths
TEST_PATH = ".buildkite/test-pipeline.yaml"
EXTERNAL_HARDWARE_TEST_PATH = ".buildkite/external-tests.yaml"
PIPELINE_FILE_PATH = ".buildkite/pipeline.yaml"
MULTI_NODE_TEST_SCRIPT = ".buildkite/run-multi-node-test.sh"

# Step keys to gate behind a manual block step — presumably populated as
# steps need to be temporarily disabled; empty by default.
STEPS_TO_BLOCK = []


class AgentQueue(str, enum.Enum):
    """Buildkite agent queue names; str-valued so members serialize directly."""
    # AWS CPU-only queues.
    AWS_CPU = "cpu_queue"
    AWS_SMALL_CPU = "small_cpu_queue"
    # AWS GPU queues (1x and 4x L4).
    AWS_1xL4 = "gpu_1_queue"
    AWS_4xL4 = "gpu_4_queue"
    A100 = "a100-queue"
    # AMD hardware queues.
    AMD_GPU = "amd"
    AMD_CPU = "amd-cpu"


def get_agent_queue(no_gpu: Optional[bool], gpu_type: Optional[str], num_gpus: Optional[int]) -> AgentQueue:
    """Pick the Buildkite agent queue for a step's hardware requirements.

    CPU-only steps go to the small CPU queue; an explicit A100 request
    wins over the GPU count; otherwise a single GPU maps to the 1xL4
    queue and anything else to the 4xL4 queue.
    """
    if no_gpu:
        return AgentQueue.AWS_SMALL_CPU
    if gpu_type == A100_GPU:
        return AgentQueue.A100
    if num_gpus == 1:
        return AgentQueue.AWS_1xL4
    return AgentQueue.AWS_4xL4


def get_full_test_command(test_commands: List[str], step_working_dir: str) -> str:
    """Convert test commands into one-line command with the right directory.

    A falsy working dir falls back to DEFAULT_WORKING_DIR; the commands
    are chained with "; " after the cd.
    """
    target_dir = step_working_dir or DEFAULT_WORKING_DIR
    chained_commands = "; ".join(test_commands)
    return f"cd {target_dir}; {chained_commands}"


def get_multi_node_test_command(
        test_commands: List[str],
        working_dir: str,
        num_nodes: int,
        num_gpus: int,
        docker_image_path: str
    ) -> str:
    """Assemble the invocation of the multi-node test script.

    Produces: "<script> <working_dir> <num_nodes> <num_gpus> <image>
    '<cmd1>' '<cmd2>' ...". Each test command is single-quoted so it is
    passed to the script as one argument.
    """
    quoted_commands = [f"'{command}'" for command in test_commands]
    invocation = [
        MULTI_NODE_TEST_SCRIPT,
        working_dir or DEFAULT_WORKING_DIR,
        str(num_nodes),
        str(num_gpus),
        docker_image_path,
    ] + quoted_commands
    return " ".join(invocation)
Empty file added scripts/tests/__init__.py
Empty file.
Empty file.
35 changes: 35 additions & 0 deletions scripts/tests/pipeline_generator/test_step.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import pytest
import sys


from scripts.pipeline_generator.step import get_step_key, get_block_step, BuildkiteBlockStep


@pytest.mark.parametrize(
    ("label", "expected_key"),
    [
        ("Test Step", "test-step"),
        ("Test Step 2", "test-step-2"),
        ("Test (Step)", "test-step"),
        ("Test A, B, C", "test-a-b-c"),
    ],
)
def test_get_step_key(label: str, expected_key: str):
    """Labels are lowercased, separators become dashes, '()%' are dropped."""
    assert get_step_key(label) == expected_key


@pytest.mark.parametrize(
    ("label", "expected_block_step"),
    [
        ("Test Step", BuildkiteBlockStep(block="Run Test Step", key="block-test-step")),
        ("Test Step 2", BuildkiteBlockStep(block="Run Test Step 2", key="block-test-step-2")),
        ("Test (Step)", BuildkiteBlockStep(block="Run Test (Step)", key="block-test-step")),
        ("Test A, B, C", BuildkiteBlockStep(block="Run Test A, B, C", key="block-test-a-b-c")),
    ],
)
def test_get_block_step(label: str, expected_block_step: BuildkiteBlockStep):
    """get_block_step wraps the label in a 'Run ...' block with a block- key."""
    assert get_block_step(label) == expected_block_step


# Allow running this test module directly (outside a pytest invocation).
if __name__ == "__main__":
    sys.exit(pytest.main(["-v", __file__]))
66 changes: 66 additions & 0 deletions scripts/tests/pipeline_generator/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import pytest
import sys
from typing import List

from scripts.pipeline_generator.utils import (
get_agent_queue,
get_full_test_command,
get_multi_node_test_command,
AgentQueue,
MULTI_NODE_TEST_SCRIPT,
)


@pytest.mark.parametrize(
    ("no_gpu", "gpu_type", "num_gpus", "expected_queue"),
    [
        # CPU-only steps route to the small CPU queue regardless of GPU args.
        (True, None, None, AgentQueue.AWS_SMALL_CPU),
        # An explicit A100 request wins over the GPU count.
        (False, "a100", None, AgentQueue.A100),
        (False, None, 1, AgentQueue.AWS_1xL4),
        (False, None, 4, AgentQueue.AWS_4xL4),
    ],
)
def test_get_agent_queue(no_gpu: bool, gpu_type: str, num_gpus: int, expected_queue: AgentQueue):
    """get_agent_queue maps step resource requirements onto agent queues."""
    assert get_agent_queue(no_gpu, gpu_type, num_gpus) == expected_queue


@pytest.mark.parametrize(
    ("commands", "step_working_dir", "expected_command"),
    [
        (["echo 'hello'"], None, "cd /vllm-workspace/tests; echo 'hello'"),
        (["echo 'hello'"], "/vllm-workspace/tests", "cd /vllm-workspace/tests; echo 'hello'"),
        (["echo 'hello1'", "echo 'hello2'"], None, "cd /vllm-workspace/tests; echo 'hello1'; echo 'hello2'"),
    ],
)
def test_get_full_test_command(commands: List[str], step_working_dir: str, expected_command: str):
    """Commands are chained with '; ' after a cd into the working directory."""
    assert get_full_test_command(commands, step_working_dir) == expected_command


def test_get_multi_node_test_command():
    """The multi-node command is '<script> <dir> <nodes> <gpus> <image>' plus
    each test command as a single-quoted argument."""
    commands = [
        (
            "distributed/test_same_node.py;"
            "pytest -v -s distributed/test_multi_node_assignment.py;"
            "pytest -v -s distributed/test_pipeline_parallel.py"
        ),
        "distributed/test_same_node.py",
    ]
    working_dir = "/vllm-workspace/tests"
    image = "ecr-path/vllm-ci-test-repo:latest"
    quoted = " ".join(f"'{command}'" for command in commands)
    expected_result = f"{MULTI_NODE_TEST_SCRIPT} {working_dir} 2 4 {image} {quoted}"
    assert get_multi_node_test_command(commands, working_dir, 2, 4, image) == expected_result


# Allow running this test module directly (outside a pytest invocation).
if __name__ == "__main__":
    sys.exit(pytest.main(["-v", __file__]))
Loading