Skip to content

Commit

Permalink
.ci/aws: Merge common.groovy and Jenkinsfile
Browse files Browse the repository at this point in the history
Signed-off-by: Seth Zegelstein <[email protected]>
  • Loading branch information
a-szegel committed Jul 29, 2024
1 parent bacae0c commit 7f41bc6
Show file tree
Hide file tree
Showing 2 changed files with 160 additions and 168 deletions.
176 changes: 160 additions & 16 deletions .ci/aws/Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,154 @@ def buildNumber = env.BUILD_NUMBER as int
// Pass the previous build's milestone first; build #1 has no predecessor, so it skips this step.
// NOTE(review): presumably the usual Jenkins idiom for aborting older in-flight builds of this job — confirm.
if (buildNumber > 1) milestone(buildNumber - 1)
// Then pass this build's own milestone.
milestone(buildNumber)


import groovy.transform.Field
@Field boolean build_ok = true

def get_portafiducia_download_path() {
    /*
     * Build the S3 URI of the stable PortaFiducia tarball.
     * The bucket name embeds the AWS account ID reported by
     * `aws sts get-caller-identity`; `tr -dc 0-9` keeps only digits, so the
     * trailing newline of the CLI output never ends up inside the URI.
     * The "us-west-2" bucket suffix is hard-coded.
     * return@: the S3 URI of portafiducia.tar.gz
     */
    def account_id = sh(
        returnStdout: true,
        script: "aws sts get-caller-identity --query Account --output text | tr -dc 0-9"
    )
    def bucket = "libfabric-ci-${account_id}-us-west-2"
    return "s3://${bucket}/portafiducia/portafiducia.tar.gz"
}

def download_and_extract_portafiducia(outputDir) {
    /*
     * Copy the PortaFiducia tarball from S3 to /tmp and unpack it into outputDir
     * (created if missing). The three shell steps are chained with `&&`, so the
     * first failing step determines the exit status.
     * A non-zero status marks the build/stage unstable instead of aborting.
     * param@ outputDir: directory the tarball is extracted into
     */
    def tarball = "/tmp/portafiducia.tar.gz"
    def s3_uri = this.get_portafiducia_download_path()
    def command = "mkdir -p ${outputDir} && aws s3 cp ${s3_uri} ${tarball} && " +
                  "tar xf ${tarball} -C ${outputDir}"

    def status = sh(script: command, returnStatus: true)
    if (status == 0) {
        return
    }
    unstable('Failed to download and extract PortaFiducia')
}

def install_porta_fiducia() {
    /*
     * Create a Python virtual environment named "venv" in the current
     * directory, upgrade pip and awscli inside it, and install the
     * PortaFiducia directory in editable mode.
     */
    def setup_script = '''
python3 -m venv venv
. venv/bin/activate
pip install --upgrade pip
pip install --upgrade awscli
pip install -e PortaFiducia
'''
    sh(script: setup_script)
}

def kill_all_clusters(instance_type, region) {
    /*
     * Run delete_manual_cluster.py against every cluster whose name contains
     * the instance type (with "." removed, matching how cluster names are built).
     * Best effort: `|| true` keeps the step from failing when the script fails.
     * param@ instance_type: instance type, e.g. "p3dn.24xlarge"
     * param@ region: region passed to the delete script
     */
    def name_fragment = sh(
        returnStdout: true,
        script: "echo ${instance_type} | tr -d '.\\n'"
    )
    def delete_cmd = "./PortaFiducia/scripts/delete_manual_cluster.py" +
                     " --cluster-name '*${name_fragment}*' --region ${region}"
    sh ". venv/bin/activate; ${delete_cmd} || true"
}

def wait_for_odcr_capacity(region, instance_count, odcr) {
    /*
     * Run PortaFiducia's wait_for_odcr_capacity.py inside the virtual environment.
     * param@ region: passed as --region
     * param@ instance_count: passed as --required-capacity
     * param@ odcr: passed as --odcr-id
     */
    def wait_args = "--region ${region} --odcr-id ${odcr} --required-capacity ${instance_count}"
    sh(script: ". venv/bin/activate; ./PortaFiducia/scripts/wait_for_odcr_capacity.py ${wait_args}")
}

def run_test_orchestrator_once(run_name, build_tag, os, instance_type, instance_count, region, config, odcr, addl_args) {
    /*
     * Run PortaFiducia/tests/test_orchestrator.py once with the given command line arguments
     * and translate its exit status into a Jenkins result:
     *   - 0:     nothing to report.
     *   - 65:    the stage is marked unstable.
     *   - other: build_ok is cleared and the stage is marked FAILURE, while the
     *            overall build result is left untouched so sibling stages keep running.
     * param@ run_name: currently unused; kept so existing callers do not change
     * param@ build_tag: the BUILD_TAG env generated by Jenkins, used in the cluster name
     * param@ os: the operating system for the test run
     * param@ instance_type: the instance type for the test run
     * param@ instance_count: number of instances to launch / capacity to wait for
     * param@ region: the aws region where the tests are run
     * param@ config: the name of the PortaFiducia config file
     * param@ odcr: the on demand capacity reservation ID to create instances in
     * param@ addl_args: additional arguments passed to test_orchestrator.py
     */

    /*
     * This is a temporary workaround to deal with clusters not getting cleaned up.
     * Attempt to clean up all clusters of this instance type in the region once we hold the lock.
     * This is required because milestones send multiple SIG_TERM, followed by a SIG_KILL after 20s,
     * which stops us from being able to add additional capacity to the Jenkins service.
     */
    kill_all_clusters(instance_type, region)
    wait_for_odcr_capacity(region, instance_count, odcr)

    /*
     * p3dn clusters are getting ICE'ed within an ODCR when we try to launch them back to back.
     * This is a non-deterministic workaround to increase our chances of not getting ICE'ed.
     * Each launch is delayed by 150 seconds.
     * NOTE(review): the original comment cited a 15 minute worst case for PR results — confirm.
     */
    if (instance_type == "p3dn.24xlarge") {
        sh "sleep 150"
    }

    def cluster_name = get_cluster_name(build_tag, os, instance_type)
    def args = "--config ${config} --os ${os} --odcr ${odcr} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}.xml"
    def ret = sh (
        script: ". venv/bin/activate; ./PortaFiducia/tests/test_orchestrator.py ${args}",
        returnStatus: true
    )

    /*
     * Braces are required here: without them only `build_ok = false` belongs to
     * the `else if`, and catchError would run for every exit status, turning an
     * unstable (65) stage into a FAILURE and spawning a needless shell on success.
     */
    if (ret == 65) {
        unstable('Scripts exited with status 65')
    } else if (ret != 0) {
        build_ok = false
        /* Re-raise the exit status inside catchError so only this stage is marked FAILURE. */
        catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') {
            sh "exit ${ret}"
        }
    }
}

def get_random_string(len) {
    /*
     * Produce a random alphanumeric string by filtering /dev/urandom down to
     * the characters A-Za-z0-9 and keeping the first `len` of them.
     * param@ len: number of characters to return
     * return@: the random string (no trailing newline, since head -c cuts at len bytes)
     */
    return sh(
        returnStdout: true,
        script: "cat /dev/urandom | LC_ALL=C tr -dc A-Za-z0-9 | head -c ${len}"
    )
}

def get_cluster_name(build_tag, os, instance_type) {
    /*
     * Compose the cluster name as
     *   <build tag, max 28 chars>-<os, max 10 chars>-<instance type>-<8 random chars>
     * Pcluster requires a cluster name under 60 characters, and the name cannot contain ".".
     * Jenkins does not allow groovy to use the replace() method of string, so the
     * text clean-up is done with shell tools instead:
     *   - sed drops the leading "jenkins-" and all spaces from the build tag;
     *   - the final `tr -d '.\n'` removes every "." and any newline, including the
     *     one the sed pipeline leaves at the end of the build tag.
     * param@ build_tag: the BUILD_TAG env generated by Jenkins
     * param@ os: the operating system name
     * param@ instance_type: the instance type, e.g. "p3dn.24xlarge"
     * return@: the cluster name
     */
    def short_tag = sh(
        returnStdout: true,
        script: "echo ${build_tag} | sed \"s/^jenkins-//g\" | sed \"s/ //g\""
    )

    def prefix = "${short_tag.take(28)}-${os.take(10)}-${instance_type}-"
    def suffix = get_random_string(8)

    return sh(
        returnStdout: true,
        script: "echo '${prefix}'${suffix} | tr -d '.\\n'"
    )
}


def get_test_stage_with_lock(stage_name, build_tag, os, instance_type, region, lock_label, lock_count, config, odcr, addl_args) {
    /*
     * Build (but do not run) a single test stage that runs test_orchestrator.py with the
     * given parameters. When executed, the stage queues until it acquires `lock_count`
     * lockable resources labelled `lock_label`; the locks are released when the stage finishes.
     * The lock count is also used as the instance count of the test run.
     * param@ stage_name: the name of the stage
     * param@ build_tag: the BUILD_TAG env generated by Jenkins
     * param@ os: the operating system for the test stage.
     * param@ instance_type: the instance type for the test stage.
     * param@ region: the (default) aws region where the tests are run.
     * param@ lock_label: str, the label of the lockable resources.
     * param@ lock_count: int, the quantity of the lockable resources.
     * param@ config: the name of the PortaFiducia config file
     * param@ odcr: The on demand capacity reservation ID to create instances in
     * param@ addl_args: additional arguments passed to test_orchestrator.py
     * return@: a closure wrapping the test stage.
     */
    def locked_stage = {
        stage("${stage_name}") {
            lock(label: lock_label, quantity: lock_count) {
                this.run_test_orchestrator_once(
                    stage_name, build_tag, os, instance_type, lock_count,
                    region, config, odcr, addl_args
                )
            }
        }
    }
    return locked_stage
}


pipeline {
agent {
ecs {
Expand Down Expand Up @@ -35,24 +183,21 @@ pipeline {
steps {
script {
sh 'printenv'
def common = load ".ci/aws/common.groovy"
common.download_and_extract_portafiducia('PortaFiducia')
download_and_extract_portafiducia('PortaFiducia')
}
}
}
stage("Install PortaFiducia") {
steps {
script {
def common = load ".ci/aws/common.groovy"
common.install_porta_fiducia()
install_porta_fiducia()
}

}
}
stage("Test OFI NCCL Plugin") {
steps {
script {
def common = load ".ci/aws/common.groovy"
def stages = [:]
def nccl_version = "v2.20.5-1"
def addl_args_pr = "--test-aws-ofi-nccl-pr $env.CHANGE_ID --test-nccl-version ${nccl_version}"
Expand All @@ -72,19 +217,19 @@ pipeline {
def p4_p5_addl_args = "${addl_args_pr} --use-prebuilt-ami-with-efa-installer true"

// p3dn tests
stages["4_p3dn_al2"] = common.get_test_stage_with_lock("4_p3dn_al2", env.BUILD_TAG, "alinux2", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, config, p3dn_odcr, p3dn_al2_addl_args)
stages["4_p3dn_ubuntu2004"] = common.get_test_stage_with_lock("4_p3dn_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, config, p3dn_odcr, p3dn_addl_args)
stages["4_p3dn_ubuntu2204"] = common.get_test_stage_with_lock("4_p3dn_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, config, p3dn_odcr, p3dn_addl_args)
stages["4_p3dn_al2"] = get_test_stage_with_lock("4_p3dn_al2", env.BUILD_TAG, "alinux2", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, config, p3dn_odcr, p3dn_al2_addl_args)
stages["4_p3dn_ubuntu2004"] = get_test_stage_with_lock("4_p3dn_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, config, p3dn_odcr, p3dn_addl_args)
stages["4_p3dn_ubuntu2204"] = get_test_stage_with_lock("4_p3dn_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, config, p3dn_odcr, p3dn_addl_args)

// p4d tests
stages["4_p4d_alinux2"] = common.get_test_stage_with_lock("4_p4d_alinux2", env.BUILD_TAG, "alinux2", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, config, p4d_odcr, p4_p5_addl_args)
stages["4_p4d_ubuntu2004"] = common.get_test_stage_with_lock("4_p4d_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, config, p4d_odcr, p4_p5_addl_args)
stages["4_p4d_ubuntu2204"] = common.get_test_stage_with_lock("4_p4d_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, config, p4d_odcr, p4_p5_addl_args)
stages["4_p4d_alinux2"] = get_test_stage_with_lock("4_p4d_alinux2", env.BUILD_TAG, "alinux2", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, config, p4d_odcr, p4_p5_addl_args)
stages["4_p4d_ubuntu2004"] = get_test_stage_with_lock("4_p4d_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, config, p4d_odcr, p4_p5_addl_args)
stages["4_p4d_ubuntu2204"] = get_test_stage_with_lock("4_p4d_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, config, p4d_odcr, p4_p5_addl_args)

// p5 tests
stages["4_p5_alinux2"] = common.get_test_stage_with_lock("4_p5_alinux2", env.BUILD_TAG, "alinux2", "p5.48xlarge", p5_region, p5_lock_label, num_instances, config, p5_odcr, p4_p5_addl_args)
stages["4_p5_ubuntu2004"] = common.get_test_stage_with_lock("4_p5_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p5.48xlarge", p5_region, p5_lock_label, num_instances, config, p5_odcr, p4_p5_addl_args)
stages["4_p5_ubuntu2204"] = common.get_test_stage_with_lock("4_p5_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p5.48xlarge", p5_region, p5_lock_label, num_instances, config, p5_odcr, p4_p5_addl_args)
stages["4_p5_alinux2"] = get_test_stage_with_lock("4_p5_alinux2", env.BUILD_TAG, "alinux2", "p5.48xlarge", p5_region, p5_lock_label, num_instances, config, p5_odcr, p4_p5_addl_args)
stages["4_p5_ubuntu2004"] = get_test_stage_with_lock("4_p5_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p5.48xlarge", p5_region, p5_lock_label, num_instances, config, p5_odcr, p4_p5_addl_args)
stages["4_p5_ubuntu2204"] = get_test_stage_with_lock("4_p5_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p5.48xlarge", p5_region, p5_lock_label, num_instances, config, p5_odcr, p4_p5_addl_args)

parallel stages
}
Expand All @@ -93,8 +238,7 @@ pipeline {
stage('check build_ok') {
steps {
script {
def common = load ".ci/aws/common.groovy"
if (common.build_ok) {
if (build_ok) {
currentBuild.result = "SUCCESS"
}
else {
Expand Down
152 changes: 0 additions & 152 deletions .ci/aws/common.groovy

This file was deleted.

0 comments on commit 7f41bc6

Please sign in to comment.