diff --git a/.ci/aws/Jenkinsfile b/.ci/aws/Jenkinsfile index 7f27207d0..6425d3ff1 100644 --- a/.ci/aws/Jenkinsfile +++ b/.ci/aws/Jenkinsfile @@ -43,31 +43,48 @@ def get_persistent_cluster_name(build_tag, os, instance_type) { return "PluginPRCI_PersistentManualCluster_${instance_type_prefix}" } -def run_test_orchestrator_once_persistent(run_name, build_tag, os, instance_type, instance_count, region, odcr, addl_args) { +def get_persistent_cluster_region(cluster_name) { + /* + * Return the value of the SSM parameter named after the given cluster (queried in ${REGION}), reduced to the characters a-z, 0-9 and '-'; callers pass it to test_orchestrator.py as --region + */ + def region = sh ( + script: "aws ssm get-parameter --region ${REGION} --name ${cluster_name} --query Parameter.Value --output text | tr -cd 'a-z0-9-'", + returnStdout: true + ) + /* The exit status of the shell pipeline is tr's, so a failed aws call surfaces only as empty output; fail here with a clear message instead of passing an empty --region downstream */ + if (!region) { + error("Could not read the region of ${cluster_name} from SSM") + } + return region +} + +def run_test_orchestrator_once_persistent(run_name, build_tag, os, instance_type, instance_count, addl_args) { /* * Run PortaFiducia/tests/test_orchestrator.py with given command line arguments on an already existing persistent cluster */ def buildNumber = env.BUILD_NUMBER as int def cluster_name = get_persistent_cluster_name(build_tag, os, instance_type) - def args = "--os ${os} --odcr ${odcr} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}-PR${buildNumber}.xml" + def region = get_persistent_cluster_region(cluster_name) + def args = "--os ${os} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}-PR${buildNumber}.xml" sh ". 
venv/bin/activate; ./PortaFiducia/tests/test_orchestrator.py ${args}" } -def run_test_orchestrator_once_container(run_name, build_tag, os, container_os, instance_type, instance_count, region, odcr, addl_args) { +def run_test_orchestrator_once_container(run_name, build_tag, os, container_os, instance_type, instance_count, addl_args) { /* * Run PortaFiducia/tests/test_orchestrator.py with given command line arguments on an already existing persistent cluster on a container with specified OS */ def buildNumber = env.BUILD_NUMBER as int def cluster_name = get_persistent_cluster_name(build_tag, os, instance_type) - def args = "--os ${os} --container-os ${container_os} --odcr ${odcr} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}-PR${buildNumber}-${os}-${container_os}.xml" + def region = get_persistent_cluster_region(cluster_name) + def args = "--os ${os} --container-os ${container_os} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}-PR${buildNumber}-${os}-${container_os}.xml" sh ". venv/bin/activate; ./PortaFiducia/tests/test_orchestrator.py ${args}" } -def get_test_stage_with_lock_persistent(stage_name, build_tag, os, instance_type, region, lock_label, lock_count, odcr, addl_args) { +def get_test_stage_with_lock_persistent(stage_name, build_tag, os, instance_type, lock_label, lock_count, addl_args) { /* * Generate a single test stage that run test_orchestrator.py with the given parameters on an already existing persistent cluster. * The job will queue until it acquires the given number of locks. The locks will be released @@ -76,23 +86,21 @@ def get_test_stage_with_lock_persistent(stage_name, build_tag, os, instance_type * param@ build_tag: the BUILD_TAG env generated by Jenkins * param@ os: the operating system for the test stage. 
* param@ instance_type: the instance type for the test stage. - * param@ region: the (default) aws region where the tests are run. * param@ lock_label: str, the label of the lockable resources. * param@ lock_count: int, the quantity of the lockable resources. - * param@ odcr: The on demand capacity reservation ID to create instances in * param@ addl_args: additional arguments passed to test_orchestrator.py * return@: the test stage. */ return { stage("${stage_name}") { lock(label: lock_label, quantity: lock_count) { - this.run_test_orchestrator_once_persistent(stage_name, build_tag, os, instance_type, lock_count, region, odcr, addl_args) + this.run_test_orchestrator_once_persistent(stage_name, build_tag, os, instance_type, lock_count, addl_args) } } } } -def get_test_stage_with_lock_container(stage_name, build_tag, os, container_os, instance_type, region, lock_label, lock_count, odcr, addl_args) { +def get_test_stage_with_lock_container(stage_name, build_tag, os, container_os, instance_type, lock_label, lock_count, addl_args) { /* * Generate a single test stage that run test_orchestrator.py with the given parameters on an already existing persistent cluster on a container of specified OS. * The job will queue until it acquires the given number of locks. The locks will be released @@ -102,17 +110,15 @@ def get_test_stage_with_lock_container(stage_name, build_tag, os, container_os, * param@ os: the operating system on the instance for the test stage. * param@ container_os: the operating system on the container for the test stage. * param@ instance_type: the instance type for the test stage. - * param@ region: the (default) aws region where the tests are run. * param@ lock_label: str, the label of the lockable resources. * param@ lock_count: int, the quantity of the lockable resources. - * param@ odcr: The on demand capacity reservation ID to create instances in * param@ addl_args: additional arguments passed to test_orchestrator.py * return@: the test stage. 
*/ return { stage("${stage_name}") { lock(label: lock_label, quantity: lock_count) { - this.run_test_orchestrator_once_container(stage_name, build_tag, os, container_os, instance_type, lock_count, region, odcr, addl_args) + this.run_test_orchestrator_once_container(stage_name, build_tag, os, container_os, instance_type, lock_count, addl_args) } } } @@ -175,62 +181,48 @@ pipeline { def nccl_test_iter = "--test-aws-ofi-nccl-nccltest-iterations 5" def efa_installer = "--use-prebuilt-ami-with-efa-installer true" - def persistent_manual_cluster_addl_args = " --keep-cluster --skip-fixture-setup --skip-health-checks --use-existing-installer --cleanup-pf-directory" + def persistent_manual_cluster_addl_args = " --keep-cluster --skip-fixture-setup --skip-health-checks --use-existing-installer --cleanup-pf-directory --enable-placement-group false" def container_addl_args = " --test-in-containers-on-ec2" def base_args = "${efa_installer} ${nccl_version} ${timeout} ${cluster_type} ${test_target} ${test_type} ${build_type} ${pr_num} ${nccl_test_iter} ${persistent_manual_cluster_addl_args}" def num_instances = 4 def p3dn_lock_label = "p3dn-1-4node" - def p3dn_region = "ap-northeast-1" - def p3dn_odcr = "cr-08ecd03c0644442e4" - def p3dn_addl_args = "${base_args} --odcr-placement-group-name p3dn-placement-group ${container_addl_args} --test-list test_nccl_test test_ofi_nccl_functional" def p4d_lock_label = "p4d-1-4node" - def p4d_region = "us-east-2" - def p4d_odcr = "cr-0e5eebb3c896f6af0" - def p4_addl_args = "${base_args} ${container_addl_args} --test-list test_nccl_test test_ofi_nccl_functional" def p5_lock_label = "p5-1-4node" - def p5_region = "ap-southeast-3" - def p5_odcr = "cr-091dbf6e0516dbba1" - def p5_addl_args = "${base_args} ${container_addl_args} --test-list test_nccl_test" - def p3_p4_p5_base_os = "alinux2" - def g4dn_lock_label = "g4dn-1-4node" - def g4dn_region = "us-west-2" - def g4dn_odcr = "cr-0e2f9cac30bb5ad5f" - def g4dn_addl_args = "${base_args} 
--odcr-placement-group-name g4dn-placement-group --test-list test_nccl_test test_ofi_nccl_functional" def trn1_lock_label = "trn1-1-4node" - def trn1_region = "us-east-2" - def trn1_odcr = "cr-0e9366fb7fa2772f1" - def trn1_addl_args = "${base_args} --odcr-placement-group-name trn1-placement-group --test-list test_nccom_test" def trn1n_lock_label = "trn1n-1-4node" - def trn1n_region = "us-east-1" - def trn1n_odcr = "cr-07342cf6439332dce" - def trn1n_addl_args = "${base_args} --odcr-placement-group-name trn1n-placement-group --test-list test_nccom_test" + + def p3_p4_p5_base_os = "alinux2" + def p3_p4_addl_args = "${base_args} ${container_addl_args} --test-list test_nccl_test test_ofi_nccl_functional" + def p5_addl_args = "${base_args} ${container_addl_args} --test-list test_nccl_test" + def g4dn_addl_args = "${base_args} --test-list test_nccl_test test_ofi_nccl_functional" + def neuron_addl_args = "${base_args} --test-list test_nccom_test" // p3dn tests - stages["4_p3dn_al2"] = get_test_stage_with_lock_container("4_p3dn_al2", env.BUILD_TAG, p3_p4_p5_base_os, "alinux2", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, p3dn_odcr, p3dn_addl_args) - stages["4_p3dn_ubuntu2004"] = get_test_stage_with_lock_container("4_p3dn_ubuntu2004", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2004", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, p3dn_odcr, p3dn_addl_args) - stages["4_p3dn_ubuntu2204"] = get_test_stage_with_lock_container("4_p3dn_ubuntu2204", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2204", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, p3dn_odcr, p3dn_addl_args) + stages["4_p3dn_al2"] = get_test_stage_with_lock_container("4_p3dn_al2", env.BUILD_TAG, p3_p4_p5_base_os, "alinux2", "p3dn.24xlarge", p3dn_lock_label, num_instances, p3_p4_addl_args) + stages["4_p3dn_ubuntu2004"] = get_test_stage_with_lock_container("4_p3dn_ubuntu2004", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2004", "p3dn.24xlarge", p3dn_lock_label, num_instances, 
p3_p4_addl_args) + stages["4_p3dn_ubuntu2204"] = get_test_stage_with_lock_container("4_p3dn_ubuntu2204", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2204", "p3dn.24xlarge", p3dn_lock_label, num_instances, p3_p4_addl_args) // p4d tests - stages["4_p4d_alinux2"] = get_test_stage_with_lock_container("4_p4d_alinux2", env.BUILD_TAG, p3_p4_p5_base_os, "alinux2", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, p4_addl_args) - stages["4_p4d_ubuntu2004"] = get_test_stage_with_lock_container("4_p4d_ubuntu2004", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2004", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, p4_addl_args) - stages["4_p4d_ubuntu2204"] = get_test_stage_with_lock_container("4_p4d_ubuntu2204", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2204", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, p4_addl_args) + stages["4_p4d_alinux2"] = get_test_stage_with_lock_container("4_p4d_alinux2", env.BUILD_TAG, p3_p4_p5_base_os, "alinux2", "p4d.24xlarge", p4d_lock_label, num_instances, p3_p4_addl_args) + stages["4_p4d_ubuntu2004"] = get_test_stage_with_lock_container("4_p4d_ubuntu2004", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2004", "p4d.24xlarge", p4d_lock_label, num_instances, p3_p4_addl_args) + stages["4_p4d_ubuntu2204"] = get_test_stage_with_lock_container("4_p4d_ubuntu2204", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2204", "p4d.24xlarge", p4d_lock_label, num_instances, p3_p4_addl_args) // p5 tests - stages["4_p5_alinux2"] = get_test_stage_with_lock_container("4_p5_alinux2", env.BUILD_TAG, p3_p4_p5_base_os, "alinux2", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, p5_addl_args) - stages["4_p5_ubuntu2004"] = get_test_stage_with_lock_container("4_p5_ubuntu2004", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2004", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, p5_addl_args) - stages["4_p5_ubuntu2204"] = get_test_stage_with_lock_container("4_p5_ubuntu2204", env.BUILD_TAG, p3_p4_p5_base_os, 
"ubuntu2204", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, p5_addl_args) + stages["4_p5_alinux2"] = get_test_stage_with_lock_container("4_p5_alinux2", env.BUILD_TAG, p3_p4_p5_base_os, "alinux2", "p5.48xlarge", p5_lock_label, num_instances, p5_addl_args) + stages["4_p5_ubuntu2004"] = get_test_stage_with_lock_container("4_p5_ubuntu2004", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2004", "p5.48xlarge", p5_lock_label, num_instances, p5_addl_args) + stages["4_p5_ubuntu2204"] = get_test_stage_with_lock_container("4_p5_ubuntu2204", env.BUILD_TAG, p3_p4_p5_base_os, "ubuntu2204", "p5.48xlarge", p5_lock_label, num_instances, p5_addl_args) // g4dn tests - stages["4_g4dn_ubuntu2204"] = get_test_stage_with_lock_persistent("4_g4dn_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "g4dn.12xlarge", g4dn_region, g4dn_lock_label, num_instances, g4dn_odcr, g4dn_addl_args) + stages["4_g4dn_ubuntu2204"] = get_test_stage_with_lock_persistent("4_g4dn_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "g4dn.12xlarge", g4dn_lock_label, num_instances, g4dn_addl_args) // trn1 tests - stages["4_trn1_ubuntu2004"] = get_test_stage_with_lock_persistent("4_trn1_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "trn1.32xlarge", trn1_region, trn1_lock_label, num_instances, trn1_odcr, trn1_addl_args) + stages["4_trn1_ubuntu2004"] = get_test_stage_with_lock_persistent("4_trn1_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "trn1.32xlarge", trn1_lock_label, num_instances, neuron_addl_args) // trn1n tests - stages["4_trn1n_ubuntu2004"] = get_test_stage_with_lock_persistent("4_trn1n_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "trn1n.32xlarge", trn1n_region, trn1n_lock_label, num_instances, trn1n_odcr, trn1n_addl_args) + stages["4_trn1n_ubuntu2004"] = get_test_stage_with_lock_persistent("4_trn1n_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "trn1n.32xlarge", trn1n_lock_label, num_instances, neuron_addl_args) parallel stages }