Skip to content

Commit

Permalink
p
Browse files Browse the repository at this point in the history
Signed-off-by: kevin <[email protected]>
  • Loading branch information
khluu committed Sep 26, 2024
1 parent aeb85b6 commit e05d0eb
Show file tree
Hide file tree
Showing 7 changed files with 379 additions and 93 deletions.
12 changes: 7 additions & 5 deletions scripts/ci_aws_bootstrap.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,16 @@ generate_pipeline() {
# Download necessary files
echo "Downloading pipeline generator scripts..."
echo "VLLM CI Branch: $VLLM_CI_BRANCH"
for FILE in pipeline_generator.py plugin.py step.py utils.py; do
curl -o ".buildkite/$FILE" "https://raw.githubusercontent.com/vllm-project/buildkite-ci/$VLLM_CI_BRANCH/scripts/pipeline_generator/$FILE"
mkdir -p .buildkite/pipeline_generator
for FILE in pipeline_generator.py plugin.py step.py utils.py __init__.py; do
curl -o ".buildkite/pipeline_generator/$FILE" "https://raw.githubusercontent.com/vllm-project/buildkite-ci/$VLLM_CI_BRANCH/scripts/pipeline_generator/$FILE"
done

# Generate and upload pipeline
python .buildkite/pipeline_generator.py --run_all=$RUN_ALL --list_file_diff="$LIST_FILE_DIFF"
cat .buildkite/pipeline.yaml
buildkite-agent pipeline upload .buildkite/pipeline.yaml
cd .buildkite
python -m pipeline_generator.pipeline_generator --run_all=$RUN_ALL --list_file_diff="$LIST_FILE_DIFF"
cat pipeline.yaml
buildkite-agent pipeline upload pipeline.yaml
exit 0
}

Expand Down
Empty file.
48 changes: 24 additions & 24 deletions scripts/pipeline_generator/pipeline_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
from typing import List, Dict, Union
import os

from plugin import (
from .plugin import (
get_kubernetes_plugin_config,
get_docker_plugin_config,
)
from utils import (
from .utils import (
AgentQueue,
AMD_REPO,
A100_GPU,
Expand All @@ -21,7 +21,7 @@
get_full_test_command,
get_multi_node_test_command,
)
from step import (
from .step import (
TestStep,
BuildkiteStep,
BuildkiteBlockStep,
Expand All @@ -43,8 +43,6 @@ def read_test_steps(self, file_path: str) -> List[TestStep]:

def step_should_run(self, step: TestStep) -> bool:
"""Determine whether the step should automatically run or not."""
if step.gpu != A100_GPU:
return False
if step.optional:
return False
if not step.source_file_dependencies or self.run_all:
Expand All @@ -58,9 +56,6 @@ def process_step(self, step: TestStep) -> List[Union[BuildkiteStep, BuildkiteBlo
steps = []
current_step = self.create_buildkite_step(step)

if step.num_nodes > 1:
self._configure_multi_node_step(current_step, step)

if not self.step_should_run(step):
block_step = get_block_step(step.label)
steps.append(block_step)
Expand All @@ -72,7 +67,7 @@ def process_step(self, step: TestStep) -> List[Union[BuildkiteStep, BuildkiteBlo
def generate_build_step(self) -> BuildkiteStep:
"""Build the Docker image and push it to ECR."""
docker_image = f"{VLLM_ECR_REPO}:{self.commit}"
build_commands = self._get_build_commands(docker_image)
build_commands = self.get_build_commands(docker_image)

return BuildkiteStep(
label=":docker: build image",
Expand Down Expand Up @@ -110,7 +105,6 @@ def get_plugin_config(self, step: TestStep) -> Dict:
get_full_test_command(test_step_commands, step.working_dir)
]
container_image = f"{VLLM_ECR_REPO}:{self.commit}"

if step.gpu == A100_GPU:
return get_kubernetes_plugin_config(
container_image,
Expand All @@ -124,26 +118,30 @@ def get_plugin_config(self, step: TestStep) -> Dict:
)

def create_buildkite_step(self, step: TestStep) -> BuildkiteStep:
return BuildkiteStep(
buildkite_step = BuildkiteStep(
label=step.label,
key=get_step_key(step.label),
parallelism=step.parallelism,
soft_fail=step.soft_fail,
soft_fail=step.soft_fail,
plugins=[self.get_plugin_config(step)],
agents={"queue": get_agent_queue(step.no_gpu, step.gpu, step.num_gpus).value}
)
if step.num_nodes and step.num_nodes > 1:
self._configure_multi_node_step(buildkite_step, step)
return buildkite_step

def _configure_multi_node_step(self, current_step: BuildkiteStep, step: TestStep):
current_step.commands = get_multi_node_test_command(
step.commands,
step.working_dir,
step.num_nodes,
step.num_gpus,
f"{VLLM_ECR_REPO}:{self.commit}"
)
current_step.commands = [get_multi_node_test_command(
step.commands,
step.working_dir,
step.num_nodes,
step.num_gpus,
f"{VLLM_ECR_REPO}:{self.commit}"
)
]
current_step.plugins = None

def _get_build_commands(self, docker_image: str) -> List[str]:
def get_build_commands(self, docker_image: str) -> List[str]:
ecr_login_command = (
"aws ecr-public get-login-password --region us-east-1 | "
f"docker login --username AWS --password-stdin {VLLM_ECR_URL}"
Expand Down Expand Up @@ -177,6 +175,8 @@ def _process_external_hardware_steps(self) -> List[Union[BuildkiteStep, Buildkit
step["commands"] = [cmd.replace("DOCKER_IMAGE_AMD", amd_docker_image) for cmd in step["commands"]]
buildkite_step = BuildkiteStep(**step)
buildkite_step.depends_on = "bootstrap"

# Add block step if step is in blocklist
if buildkite_step.key in STEPS_TO_BLOCK:
block_step = get_block_step(buildkite_step.label)
buildkite_steps.append(block_step)
Expand All @@ -185,9 +185,9 @@ def _process_external_hardware_steps(self) -> List[Union[BuildkiteStep, Buildkit
return buildkite_steps

def _mirror_amd_test_steps(self, test_steps: List[TestStep]) -> List[BuildkiteStep]:
mirrored_steps = []
mirrored_buildkite_steps = []
for test_step in test_steps:
if "amd" in test_step.mirror_hardwares:
if test_step.mirror_hardwares and "amd" in test_step.mirror_hardwares:
test_commands = [test_step.command] if test_step.command else test_step.commands
amd_test_command = [
"bash",
Expand All @@ -203,8 +203,8 @@ def _mirror_amd_test_steps(self, test_steps: List[TestStep]) -> List[BuildkiteSt
env = {"DOCKER_BUILDKIT": "1"},
commands = [" ".join(amd_test_command)],
)
mirrored_steps.append(mirrored_buildkite_step)
return mirrored_steps
mirrored_buildkite_steps.append(mirrored_buildkite_step)
return mirrored_buildkite_steps

@click.command()
@click.option("--run_all", type=str)
Expand Down
75 changes: 41 additions & 34 deletions scripts/pipeline_generator/plugin.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,43 @@
from pydantic import BaseModel, Field
from typing import List, Dict, Any, Optional

from utils import HF_HOME
from .utils import HF_HOME

DOCKER_PLUGIN_NAME = "docker#v5.2.0"
KUBERNETES_PLUGIN_NAME = "kubernetes"

DEFAULT_DOCKER_ENVIRONMENT_VARIBLES = [
f"HF_HOME={HF_HOME}",
"VLLM_USAGE_SOURCE=ci-test",
"HF_TOKEN",
"BUILDKITE_ANALYTICS_TOKEN"
]
DEFAULT_DOCKER_VOLUMES = [
"/dev/shm:/dev/shm",
f"{HF_HOME}:{HF_HOME}"
]
DEFAULT_KUBERNETES_CONTAINER_VOLUME_MOUNTS = [
{"name": "devshm", "mountPath": "/dev/shm"},
{"name": "hf-cache", "mountPath": HF_HOME}
]
DEFAULT_KUBERNETES_CONTAINER_ENVIRONMENT_VARIABLES = [
{"name": "HF_HOME", "value": HF_HOME},
{"name": "VLLM_USAGE_SOURCE", "value": "ci-test"},
{
"name": "HF_TOKEN",
"valueFrom": {
"secretKeyRef": {
"name": "hf-token-secret",
"key": "token"
}
}
},
]
DEFAULT_KUBERNETES_POD_VOLUMES = [
{"name": "devshm", "emptyDir": {"medium": "Memory"}},
{"name": "hf-cache", "hostPath": {"path": HF_HOME, "type": "Directory"}}
]
DEFAULT_KUBERNETES_NODE_SELECTOR = {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-80GB"}

class DockerPluginConfig(BaseModel):
"""
Expand All @@ -19,16 +51,8 @@ class DockerPluginConfig(BaseModel):
gpus: Optional[str] = "all"
mount_buildkite_agent: Optional[bool] = Field(default=False, alias="mount-buildkite-agent")
command: List[str] = Field(default_factory=list)
environment: List[str] = [
f"HF_HOME={HF_HOME}",
"VLLM_USAGE_SOURCE=ci-test",
"HF_TOKEN",
"BUILDKITE_ANALYTICS_TOKEN"
]
volumes: List[str] = [
"/dev/shm:/dev/shm",
f"{HF_HOME}:{HF_HOME}"
]
environment: List[str] = DEFAULT_DOCKER_ENVIRONMENT_VARIBLES
volumes: List[str] = DEFAULT_DOCKER_VOLUMES


class KubernetesPodContainerConfig(BaseModel):
Expand All @@ -40,25 +64,10 @@ class KubernetesPodContainerConfig(BaseModel):
resources: Dict[str, Dict[str, int]]
volume_mounts: List[Dict[str, str]] = Field(
alias="volumeMounts",
default=[
{"name": "devshm", "mountPath": "/dev/shm"},
{"name": "hf-cache", "mountPath": HF_HOME}
]
default=DEFAULT_KUBERNETES_CONTAINER_VOLUME_MOUNTS
)
env: List[Dict[str, str]] = Field(
default=[
{"name": "HF_HOME", "value": HF_HOME},
{"name": "VLLM_USAGE_SOURCE", "value": "ci-test"},
{
"name": "HF_TOKEN",
"valueFrom": {
"secretKeyRef": {
"name": "hf-token-secret",
"key": "token"
}
}
},
],
default=DEFAULT_KUBERNETES_CONTAINER_ENVIRONMENT_VARIABLES,
)


Expand All @@ -69,14 +78,11 @@ class KubernetesPodSpec(BaseModel):
containers: List[KubernetesPodContainerConfig]
priority_class_name: str = Field(default="ci", alias="priorityClassName")
node_selector: Dict[str, Any] = Field(
default={"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-80GB"},
default=DEFAULT_KUBERNETES_NODE_SELECTOR,
alias="nodeSelector"
)
volumes: List[Dict[str, Any]] = Field(
default=[
{"name": "devshm", "emptyDir": {"medium": "Memory"}},
{"name": "hf-cache", "hostPath": {"path": HF_HOME, "type": "Directory"}}
]
default=DEFAULT_KUBERNETES_POD_VOLUMES
)


Expand All @@ -88,11 +94,12 @@ class KubernetesPluginConfig(BaseModel):


def get_kubernetes_plugin_config(container_image: str, test_bash_command: List[str], num_gpus: int) -> Dict:
test_bash_command[-1] = f'"{test_bash_command[-1]}"'
pod_spec = KubernetesPodSpec(
containers=[
KubernetesPodContainerConfig(
image=container_image,
command=test_bash_command,
command=[" ".join(test_bash_command)],
resources={"limits": {"nvidia.com/gpu": num_gpus}}
)
]
Expand Down
20 changes: 10 additions & 10 deletions scripts/pipeline_generator/step.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
from pydantic import BaseModel, Field
from typing import List, Dict, Any, Optional

from utils import AgentQueue
from .utils import AgentQueue

BUILD_STEP_KEY = "build"

class TestStep(BaseModel):
"""This class represents a test step defined in the test configuration file."""
label: str
fast_check: bool = False
mirror_hardwares: List[str] = Field(default_factory=list)
gpu: str = ""
num_gpus: int = 1
num_nodes: int = 1
fast_check: Optional[bool] = None
mirror_hardwares: Optional[List[str]] = None
gpu: Optional[str] = None
num_gpus: Optional[int] = None
num_nodes: Optional[int] = None
working_dir: str = "/vllm-workspace/tests"
source_file_dependencies: List[str] = Field(default_factory=list)
no_gpu: bool = False
soft_fail: bool = False
parallelism: int = 1
source_file_dependencies: Optional[List[str]] = None
no_gpu: Optional[bool] = None
soft_fail: Optional[bool] = None
parallelism: Optional[int] = None
optional: bool = False
command: Optional[str] = None
commands: Optional[List[str]] = None
Expand Down
2 changes: 1 addition & 1 deletion scripts/pipeline_generator/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def get_agent_queue(no_gpu: Optional[bool], gpu_type: Optional[str], num_gpus: O
return AgentQueue.AWS_SMALL_CPU
if gpu_type == A100_GPU:
return AgentQueue.A100
return AgentQueue.AWS_1xL4 if num_gpus == 1 else AgentQueue.AWS_4xL4
return AgentQueue.AWS_1xL4 if not num_gpus or num_gpus == 1 else AgentQueue.AWS_4xL4


def get_full_test_command(test_commands: List[str], step_working_dir: str) -> str:
Expand Down
Loading

0 comments on commit e05d0eb

Please sign in to comment.