Skip to content

Commit

Permalink
[v1.12.x-aws] .ci/aws: Get rid of ODCR/Region/PlacementGroup from Jenkinsfile
Browse files Browse the repository at this point in the history

Signed-off-by: Seth Zegelstein <[email protected]>
(cherry picked from commit 7b8fa6d)
  • Loading branch information
a-szegel committed Nov 5, 2024
1 parent 6b72511 commit 19f6f67
Showing 1 changed file with 37 additions and 45 deletions.
82 changes: 37 additions & 45 deletions .ci/aws/Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -43,31 +43,41 @@ def get_persistent_cluster_name(build_tag, os, instance_type) {
return "PluginPRCI_PersistentManualCluster_${instance_type_prefix}"
}

def run_test_orchestrator_once_persistent(run_name, build_tag, os, instance_type, instance_count, region, odcr, addl_args) {
def get_persistent_cluster_region(cluster_name) {
    /*
     * Look up the region recorded for a persistent cluster.
     *
     * Reads the SSM parameter whose name is the cluster name (the query itself is
     * issued against the region held in REGION) and returns its value reduced to
     * the characters a-z, 0-9 and '-'.
     *
     * param@ cluster_name: name of the SSM parameter to read (the persistent cluster name).
     * return@: the sanitized parameter value.
     *
     * The build fails if the aws command exits non-zero or if the sanitized value is empty.
     */

    // No shell pipe here: with "aws ... | tr ..." the exit status of the pipeline is
    // that of tr, so a failed lookup would silently produce an empty region.
    def raw_value = sh (
        script: "aws ssm get-parameter --region ${REGION} --name ${cluster_name} --query Parameter.Value --output text",
        returnStdout: true
    )

    // Same filtering as `tr -cd 'a-z0-9-'`; this also strips the trailing newline.
    def region = raw_value.replaceAll('[^a-z0-9-]', '')
    if (!region) {
        error("Unable to determine the region for cluster ${cluster_name}: SSM parameter value is empty")
    }
    return region
}

def run_test_orchestrator_once_persistent(run_name, build_tag, os, instance_type, instance_count, addl_args) {
    /*
     * Run PortaFiducia/tests/test_orchestrator.py with given command line arguments on an already existing persistent cluster
     *
     * param@ run_name: name of the run (not used when building the command line).
     * param@ build_tag: passed to get_persistent_cluster_name() to derive the cluster name.
     * param@ os: value for --os; also used to derive the cluster name.
     * param@ instance_type: value for --instance-type; also used to derive the cluster name.
     * param@ instance_count: value for --instance-count.
     * param@ addl_args: additional arguments appended verbatim to the command line.
     */

    def buildNumber = env.BUILD_NUMBER as int
    def cluster_name = get_persistent_cluster_name(build_tag, os, instance_type)
    // The region is looked up from the cluster name instead of being passed in by the caller.
    def region = get_persistent_cluster_region(cluster_name)
    // Single declaration of args; the JUnit report is named after the cluster and the build number.
    def args = "--os ${os} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}-PR${buildNumber}.xml"
    sh ". venv/bin/activate; ./PortaFiducia/tests/test_orchestrator.py ${args}"

}

def run_test_orchestrator_once_container(run_name, build_tag, os, container_os, instance_type, instance_count, region, odcr, addl_args) {
def run_test_orchestrator_once_container(run_name, build_tag, os, container_os, instance_type, instance_count, addl_args) {
    /*
     * Run PortaFiducia/tests/test_orchestrator.py with given command line arguments on an already existing persistent cluster on a container with specified OS
     *
     * param@ run_name: name of the run (not used when building the command line).
     * param@ build_tag: passed to get_persistent_cluster_name() to derive the cluster name.
     * param@ os: value for --os; also used to derive the cluster name.
     * param@ container_os: value for --container-os.
     * param@ instance_type: value for --instance-type; also used to derive the cluster name.
     * param@ instance_count: value for --instance-count.
     * param@ addl_args: additional arguments appended verbatim to the command line.
     */

    def buildNumber = env.BUILD_NUMBER as int
    def cluster_name = get_persistent_cluster_name(build_tag, os, instance_type)
    // The region is looked up from the cluster name instead of being passed in by the caller.
    def region = get_persistent_cluster_region(cluster_name)
    // Single declaration of args; the JUnit report name also carries both OS names so that
    // runs sharing one cluster write distinct files.
    def args = "--os ${os} --container-os ${container_os} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}-PR${buildNumber}-${os}-${container_os}.xml"
    sh ". venv/bin/activate; ./PortaFiducia/tests/test_orchestrator.py ${args}"

}

def get_test_stage_with_lock_persistent(stage_name, build_tag, os, instance_type, region, lock_label, lock_count, odcr, addl_args) {
def get_test_stage_with_lock_persistent(stage_name, build_tag, os, instance_type, lock_label, lock_count, addl_args) {
/*
* Generate a single test stage that run test_orchestrator.py with the given parameters on an already existing persistent cluster.
* The job will queue until it acquires the given number of locks. The locks will be released
Expand All @@ -76,23 +86,21 @@ def get_test_stage_with_lock_persistent(stage_name, build_tag, os, instance_type
* param@ build_tag: the BUILD_TAG env generated by Jenkins
* param@ os: the operating system for the test stage.
* param@ instance_type: the instance type for the test stage.
* param@ region: the (default) aws region where the tests are run.
* param@ lock_label: str, the label of the lockable resources.
* param@ lock_count: int, the quantity of the lockable resources.
* param@ odcr: The on demand capacity reservation ID to create instances in
* param@ addl_args: additional arguments passed to test_orchestrator.py
* return@: the test stage.
*/
return {
stage("${stage_name}") {
lock(label: lock_label, quantity: lock_count) {
this.run_test_orchestrator_once_persistent(stage_name, build_tag, os, instance_type, lock_count, region, odcr, addl_args)
this.run_test_orchestrator_once_persistent(stage_name, build_tag, os, instance_type, lock_count, addl_args)
}
}
}
}

def get_test_stage_with_lock_container(stage_name, build_tag, os, container_os, instance_type, region, lock_label, lock_count, odcr, addl_args) {
def get_test_stage_with_lock_container(stage_name, build_tag, os, container_os, instance_type, lock_label, lock_count, addl_args) {
/*
* Generate a single test stage that run test_orchestrator.py with the given parameters on an already existing persistent cluster on a container of specified OS.
* The job will queue until it acquires the given number of locks. The locks will be released
Expand All @@ -102,17 +110,15 @@ def get_test_stage_with_lock_container(stage_name, build_tag, os, container_os,
* param@ os: the operating system on the instance for the test stage.
* param@ container_os: the operating system on the container for the test stage.
* param@ instance_type: the instance type for the test stage.
* param@ region: the (default) aws region where the tests are run.
* param@ lock_label: str, the label of the lockable resources.
* param@ lock_count: int, the quantity of the lockable resources.
* param@ odcr: The on demand capacity reservation ID to create instances in
* param@ addl_args: additional arguments passed to test_orchestrator.py
* return@: the test stage.
*/
return {
stage("${stage_name}") {
lock(label: lock_label, quantity: lock_count) {
this.run_test_orchestrator_once_container(stage_name, build_tag, os, container_os, instance_type, lock_count, region, odcr, addl_args)
this.run_test_orchestrator_once_container(stage_name, build_tag, os, container_os, instance_type, lock_count, addl_args)
}
}
}
Expand Down Expand Up @@ -175,62 +181,48 @@ pipeline {
def nccl_test_iter = "--test-aws-ofi-nccl-nccltest-iterations 5"
def efa_installer = "--use-prebuilt-ami-with-efa-installer true"

def persistent_manual_cluster_addl_args = " --keep-cluster --skip-fixture-setup --skip-health-checks --use-existing-installer --cleanup-pf-directory"
def persistent_manual_cluster_addl_args = " --keep-cluster --skip-fixture-setup --skip-health-checks --use-existing-installer --cleanup-pf-directory --enable-placement-group false"
def container_addl_args = " --test-in-containers-on-ec2"

def base_args = "${efa_installer} ${nccl_version} ${timeout} ${cluster_type} ${test_target} ${test_type} ${build_type} ${pr_num} ${nccl_test_iter} ${persistent_manual_cluster_addl_args}"

def num_instances = 4
def p3dn_lock_label = "p3dn-1-4node"
def p3dn_region = "ap-northeast-1"
def p3dn_odcr = "cr-08ecd03c0644442e4"
def p3dn_addl_args = "${base_args} --odcr-placement-group-name p3dn-placement-group ${container_addl_args} --test-list test_nccl_test test_ofi_nccl_functional"
def p4d_lock_label = "p4d-1-4node"
def p4d_region = "us-east-2"
def p4d_odcr = "cr-0e5eebb3c896f6af0"
def p4_addl_args = "${base_args} ${container_addl_args} --test-list test_nccl_test test_ofi_nccl_functional"
def p5_lock_label = "p5-1-4node"
def p5_region = "ap-southeast-3"
def p5_odcr = "cr-091dbf6e0516dbba1"
def p5_addl_args = "${base_args} ${container_addl_args} --test-list test_nccl_test"
def p3_p4_p5_base_os = "alinux2"

def g4dn_lock_label = "g4dn-1-4node"
def g4dn_region = "us-west-2"
def g4dn_odcr = "cr-0e2f9cac30bb5ad5f"
def g4dn_addl_args = "${base_args} --odcr-placement-group-name g4dn-placement-group --test-list test_nccl_test test_ofi_nccl_functional"
def trn1_lock_label = "trn1-1-4node"
def trn1_region = "us-east-2"
def trn1_odcr = "cr-0e9366fb7fa2772f1"
def trn1_addl_args = "${base_args} --odcr-placement-group-name trn1-placement-group --test-list test_nccom_test"
def trn1n_lock_label = "trn1n-1-4node"
def trn1n_region = "us-east-1"
def trn1n_odcr = "cr-07342cf6439332dce"
def trn1n_addl_args = "${base_args} --odcr-placement-group-name trn1n-placement-group --test-list test_nccom_test"

def p3_p4_p5_base_os = "alinux2"
def p3_p4_addl_args = "${base_args} ${container_addl_args} --test-list test_nccl_test test_ofi_nccl_functional"
def p5_addl_args = "${base_args} ${container_addl_args} --test-list test_nccl_test"
def g4dn_addl_args = "${base_args} --test-list test_nccl_test test_ofi_nccl_functional"
def neuron_addl_args = "${base_args} --test-list test_nccom_test"

// p3dn tests
stages["4_p3dn_al2"] = get_test_stage_with_lock_container("4_p3dn_al2", env.BUILD_TAG, p3_p4_p5_base_os, "alinux2", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, p3dn_odcr, p3dn_addl_args)
stages["4_p3dn_ubuntu2004"] = get_test_stage_with_lock_container("4_p3dn_ubuntu2004", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2004", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, p3dn_odcr, p3dn_addl_args)
stages["4_p3dn_ubuntu2204"] = get_test_stage_with_lock_container("4_p3dn_ubuntu2204", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2204", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, p3dn_odcr, p3dn_addl_args)
stages["4_p3dn_al2"] = get_test_stage_with_lock_container("4_p3dn_al2", env.BUILD_TAG, p3_p4_p5_base_os, "alinux2", "p3dn.24xlarge", p3dn_lock_label, num_instances, p3_p4_addl_args)
stages["4_p3dn_ubuntu2004"] = get_test_stage_with_lock_container("4_p3dn_ubuntu2004", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2004", "p3dn.24xlarge", p3dn_lock_label, num_instances, p3_p4_addl_args)
stages["4_p3dn_ubuntu2204"] = get_test_stage_with_lock_container("4_p3dn_ubuntu2204", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2204", "p3dn.24xlarge", p3dn_lock_label, num_instances, p3_p4_addl_args)

// p4d tests
stages["4_p4d_alinux2"] = get_test_stage_with_lock_container("4_p4d_alinux2", env.BUILD_TAG, p3_p4_p5_base_os, "alinux2", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, p4_addl_args)
stages["4_p4d_ubuntu2004"] = get_test_stage_with_lock_container("4_p4d_ubuntu2004", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2004", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, p4_addl_args)
stages["4_p4d_ubuntu2204"] = get_test_stage_with_lock_container("4_p4d_ubuntu2204", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2204", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, p4_addl_args)
stages["4_p4d_alinux2"] = get_test_stage_with_lock_container("4_p4d_alinux2", env.BUILD_TAG, p3_p4_p5_base_os, "alinux2", "p4d.24xlarge", p4d_lock_label, num_instances, p3_p4_addl_args)
stages["4_p4d_ubuntu2004"] = get_test_stage_with_lock_container("4_p4d_ubuntu2004", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2004", "p4d.24xlarge", p4d_lock_label, num_instances, p3_p4_addl_args)
stages["4_p4d_ubuntu2204"] = get_test_stage_with_lock_container("4_p4d_ubuntu2204", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2204", "p4d.24xlarge", p4d_lock_label, num_instances, p3_p4_addl_args)

// p5 tests
stages["4_p5_alinux2"] = get_test_stage_with_lock_container("4_p5_alinux2", env.BUILD_TAG, p3_p4_p5_base_os, "alinux2", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, p5_addl_args)
stages["4_p5_ubuntu2004"] = get_test_stage_with_lock_container("4_p5_ubuntu2004", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2004", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, p5_addl_args)
stages["4_p5_ubuntu2204"] = get_test_stage_with_lock_container("4_p5_ubuntu2204", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2204", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, p5_addl_args)
stages["4_p5_alinux2"] = get_test_stage_with_lock_container("4_p5_alinux2", env.BUILD_TAG, p3_p4_p5_base_os, "alinux2", "p5.48xlarge", p5_lock_label, num_instances, p5_addl_args)
stages["4_p5_ubuntu2004"] = get_test_stage_with_lock_container("4_p5_ubuntu2004", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2004", "p5.48xlarge", p5_lock_label, num_instances, p5_addl_args)
stages["4_p5_ubuntu2204"] = get_test_stage_with_lock_container("4_p5_ubuntu2204", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2204", "p5.48xlarge", p5_lock_label, num_instances, p5_addl_args)

// g4dn tests
stages["4_g4dn_ubuntu2204"] = get_test_stage_with_lock_persistent("4_g4dn_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "g4dn.12xlarge", g4dn_region, g4dn_lock_label, num_instances, g4dn_odcr, g4dn_addl_args)
stages["4_g4dn_ubuntu2204"] = get_test_stage_with_lock_persistent("4_g4dn_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "g4dn.12xlarge", g4dn_lock_label, num_instances, g4dn_addl_args)

// trn1 tests
stages["4_trn1_ubuntu2004"] = get_test_stage_with_lock_persistent("4_trn1_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "trn1.32xlarge", trn1_region, trn1_lock_label, num_instances, trn1_odcr, trn1_addl_args)
stages["4_trn1_ubuntu2004"] = get_test_stage_with_lock_persistent("4_trn1_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "trn1.32xlarge", trn1_lock_label, num_instances, neuron_addl_args)

// trn1n tests
stages["4_trn1n_ubuntu2004"] = get_test_stage_with_lock_persistent("4_trn1n_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "trn1n.32xlarge", trn1n_region, trn1n_lock_label, num_instances, trn1n_odcr, trn1n_addl_args)
stages["4_trn1n_ubuntu2004"] = get_test_stage_with_lock_persistent("4_trn1n_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "trn1n.32xlarge", trn1n_lock_label, num_instances, neuron_addl_args)

parallel stages
}
Expand Down

0 comments on commit 19f6f67

Please sign in to comment.