Commit
export logic outside of profile scope into custom functions
Joon-Klaps committed Jan 9, 2025
1 parent b20292d commit 6d07e08
Showing 1 changed file with 158 additions and 117 deletions.
275 changes: 158 additions & 117 deletions conf/vsc_kul_uhasselt.config
@@ -1,9 +1,9 @@
// Default to /tmp directory if $VSC_SCRATCH scratch env is not available,
// see: https://github.com/nf-core/configs?tab=readme-ov-file#adding-a-new-config
scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp"
tier1_project = System.getenv("SLURM_ACCOUNT") ?: null
def avail_queues = System.getenv("VSC_DEDICATED_QUEUES") ?: ""
def availQueues = avail_queues.toString().split(',')
def SCRATCH_DIR = System.getenv("VSC_SCRATCH") ?: "/tmp"
def TIER2_PROJECT = System.getenv("SLURM_ACCOUNT") ?: null
def DEDICATED_QUEUES = System.getenv("VSC_DEDICATED_QUEUES") ?: ""
def AVAILABLE_QUEUES = DEDICATED_QUEUES.toString().split(',')
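// Editor's note (illustrative assumption, not part of this commit): VSC_DEDICATED_QUEUES
// is read as a comma-separated list, so access to dedicated partitions would typically
// be exposed before launching the pipeline, for example:
//   export VSC_DEDICATED_QUEUES=dedicated_big_bigmem,dedicated_big_gpu
//   export SLURM_ACCOUNT=lp_my_project   (hypothetical project name)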

// Perform work directory cleanup when the run has successfully completed
// cleanup = true
@@ -30,7 +30,7 @@ process {
singularity {
enabled = true
autoMounts = true
cacheDir = "$scratch_dir/.singularity"
cacheDir = "$SCRATCH_DIR/.singularity"
pullTimeout = "30 min"
}

@@ -40,20 +40,141 @@ params {
}

env {
APPTAINER_TMPDIR="$scratch_dir/.apptainer/tmp"
APPTAINER_CACHEDIR="$scratch_dir/.apptainer/cache"
APPTAINER_TMPDIR="$SCRATCH_DIR/.apptainer/tmp"
APPTAINER_CACHEDIR="$SCRATCH_DIR/.apptainer/cache"
}

// Maximum retries for AWS errors (so the pipeline doesn't fail if a download fails once)
aws {
maxErrorRetry = 3
}

// Function to limit task time when dedicated queues are not available
/*
* Queue Selection Utility Functions for HPC Environments
* ==================================================
* This module provides functions to determine appropriate HPC queues based on task requirements
* for both GENIUS and WICE clusters.
*/

/*
* Constants:
* ----------
* TIME_THRESHOLD: 72 hours - Threshold for determining long-running jobs
* MEMORY_THRESHOLD (GENIUS): 175GB - Memory threshold for bigmem queues
* MEMORY_THRESHOLD (WICE): 239GB - Memory threshold for high-memory queues
*/
def TIME_THRESHOLD = 72.h
def MEMORY_THRESHOLD_GENIUS = 175.GB
def MEMORY_THRESHOLD_WICE = 239.GB

/*
* ---------
* Functions:
* ----------
* These functions are designed to select the appropriate HPC queues of
* VSC_KUL_UHASSELT based on task requirements. They handle both standard
* and GPU queues, considering memory requirements, execution time, and
* queue availability.
*/

/*
* limitTaskTime(time, maxTime)
* Ensures task time doesn't exceed the maximum allowed time
* @param time Current task time
* @param maxTime Maximum allowed time
* @return Limited task time
*/
def limitTaskTime(time, maxTime) {
return time > maxTime ? maxTime : time
}
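
/*
 * Editor's sketch (not part of this commit): with Nextflow duration literals,
 * the helper simply caps a requested walltime at the given maximum, e.g.
 *   limitTaskTime(100.h, 72.h)  // returns 72.h (capped)
 *   limitTaskTime(24.h, 72.h)   // returns 24.h (unchanged)
 */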

/*
* determineGeniusQueue(task)
* Selects appropriate CPU queue for GENIUS cluster
* @param task Nextflow task object containing memory and time requirements
* @return Queue name based on task requirements
*/
def determineGeniusQueue = { task ->
if (task.memory >= MEMORY_THRESHOLD_GENIUS) {
if (task.time >= TIME_THRESHOLD) {
return AVAILABLE_QUEUES.contains('dedicated_big_bigmem') ? 'dedicated_big_bigmem' : 'bigmem_long'
}
return 'bigmem'
}
return task.time >= TIME_THRESHOLD ? 'batch_long' : 'batch'
}
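
/*
 * Editor's sketch (illustrative task requests, not part of this commit):
 *   200.GB, 80.h -> 'dedicated_big_bigmem' if listed in VSC_DEDICATED_QUEUES,
 *                   otherwise 'bigmem_long'
 *   200.GB, 10.h -> 'bigmem'
 *    32.GB, 80.h -> 'batch_long'
 *    32.GB, 10.h -> 'batch'
 */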

/*
* determineGeniusGpuQueue(task)
* Selects appropriate GPU queue for GENIUS cluster
* @param task Nextflow task object containing memory and time requirements
* @return GPU queue name based on task requirements
*/
def determineGeniusGpuQueue = { task ->
if (task.memory >= MEMORY_THRESHOLD_GENIUS) {
return task.time >= TIME_THRESHOLD ? 'gpu_v100_long' : 'gpu_v100'
}
if (task.time >= TIME_THRESHOLD) {
return AVAILABLE_QUEUES.contains('dedicated_rega_gpu') ? 'dedicated_rega_gpu' : 'gpu_p100_long,amd_long'
}
return 'gpu_p100,amd'
}
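
/*
 * Editor's sketch (illustrative task requests, not part of this commit):
 *   200.GB       -> 'gpu_v100_long' when time >= 72.h, else 'gpu_v100'
 *    32.GB, 80.h -> 'dedicated_rega_gpu' if available, else 'gpu_p100_long,amd_long'
 *    32.GB, 10.h -> 'gpu_p100,amd'
 */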

/*
* determineWiceQueue(task)
* Selects appropriate CPU queue for WICE cluster
* @param task Nextflow task object containing memory and time requirements
* @return Queue name based on task requirements and availability
*/
def determineWiceQueue = { task ->
if (task.memory >= MEMORY_THRESHOLD_WICE) {
if (AVAILABLE_QUEUES.contains('dedicated_big_bigmem')) {
return 'dedicated_big_bigmem'
} else {
task.time = limitTaskTime(task.time, TIME_THRESHOLD)
return 'bigmem,hugemem'
}
}

return task.time >= TIME_THRESHOLD ?
'batch_long,batch_icelake_long,batch_sapphirerapids_long' :
'batch,batch_sapphirerapids,batch_icelake'
}
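
/*
 * Editor's sketch (illustrative task requests, not part of this commit):
 *   300.GB        -> 'dedicated_big_bigmem' if available; otherwise the task
 *                    time is capped at 72.h and the job goes to 'bigmem,hugemem'
 *    64.GB, 100.h -> 'batch_long,batch_icelake_long,batch_sapphirerapids_long'
 *    64.GB, 10.h  -> 'batch,batch_sapphirerapids,batch_icelake'
 */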

/*
* determineWiceGpuQueue(task)
* Selects appropriate GPU queue for WICE cluster
* @param task Nextflow task object containing memory and time requirements
* @return GPU queue name based on task requirements
*/
def determineWiceGpuQueue = { task ->
def isHighMemory = task.memory >= MEMORY_THRESHOLD_WICE
def isDedicatedQueue = isHighMemory ?
AVAILABLE_QUEUES.contains('dedicated_big_gpu_h100') :
AVAILABLE_QUEUES.contains('dedicated_big_gpu')

if (task.time >= TIME_THRESHOLD && !isDedicatedQueue) {
task.time = limitTaskTime(task.time, TIME_THRESHOLD)
}

if (isHighMemory) {
return isDedicatedQueue ? 'dedicated_big_gpu_h100' : 'gpu_h100'
} else {
return isDedicatedQueue ? 'dedicated_big_gpu' : 'gpu_a100,gpu'
}
}
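
/*
 * Editor's sketch (illustrative task requests, not part of this commit):
 *   300.GB -> 'dedicated_big_gpu_h100' if available, else 'gpu_h100' with the
 *             task time capped at 72.h when it exceeds the threshold
 *    64.GB -> 'dedicated_big_gpu' if available, else 'gpu_a100,gpu' with the
 *             same 72.h cap applied
 */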

/*
* ========
* Profiles
* ========
* These profiles define the resource limits, queue selection, and cluster options
* for WICE and GENIUS clusters. They also include GPU-specific configurations.
* Details of the resource limits can be found in for genius at
* https://docs.vscentrum.be/leuven/tier2_hardware/genius_hardware.html
* and for wice at https://docs.vscentrum.be/leuven/tier2_hardware/wice_hardware.html
*/
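
/*
 * Editor's note (assumed nf-core/configs usage pattern, not stated in this
 * commit): a cluster profile is typically selected alongside the institutional
 * config on the command line, e.g.
 *   nextflow run <pipeline> -profile vsc_kul_uhasselt,wice
 */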

// Define profiles for each cluster
profiles {
genius {
@@ -62,31 +183,22 @@ profiles {
process {
// 768 - 65 so 65GB for overhead, max is 720000MB
resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ]
beforeScript = 'module load cluster/genius'
clusterOptions = { "--clusters=genius --account=$tier1_project" }

queue = {
task.memory >= 175.GB ?
(task.time >= 72.h ? 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long' : 'bigmem') :
(task.time >= 72.h ? 'batch_long' : 'batch')
beforeScript = 'module load cluster/genius'
queue = { determineGeniusQueue(task) }
clusterOptions = {
determineGeniusQueue(task) =~ /dedicated/ ?
"--clusters=genius --account=lp_big_genius_cpu" :
"--clusters=genius --account=$TIER2_PROJECT"
}

withLabel: '.*gpu.*'{
resourceLimits = [ memory: 703.GB, cpus: 36 , time: 168.h ]
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'

// Set clusteroptions
queue = { determineGeniusGpuQueue(task) }
clusterOptions = {
// suggested to use 9 cpus per gpu
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int)
"--gres=gpu:${gpus} --clusters=genius --account=$tier1_project"
}

queue = {
task.memory >= 175.GB ?
(task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') :
(task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd')
"--gres=gpu:${gpus} --clusters=genius --account=$TIER2_PROJECT"
}
}
}
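/*
 * Editor's worked example (not part of this commit): with no explicit
 * accelerator request, a 36-cpu task on GENIUS gets
 * max(1, floor(36 / 9)) = 4 GPUs; the WICE profiles divide by 16 instead.
 */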
@@ -101,15 +213,10 @@ profiles {
// 768 - 65 so 65GB for overhead, max is 720000MB
resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h]
beforeScript = 'module load cluster/genius'
queue = { determineGeniusGpuQueue(task) }
clusterOptions = {
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int)
"--gres=gpu:${gpus} --clusters=genius --account=$tier1_project"
}

queue = {
task.memory >= 175.GB ?
(task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') :
(task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd')
"--gres=gpu:${gpus} --clusters=genius --account=$TIER2_PROJECT"
}
}
}
@@ -121,66 +228,24 @@ profiles {
// max is 2016000
resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ]
beforeScript = 'module load cluster/wice'

// Set queue
// The task time is limited to 72 hours if the memory is larger than 239GB
// and dedicated queues are not available
queue = {
def maxTime = 72.h
if (task.memory >= 239.GB) {
task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_bigmem') ?
limitTaskTime(task.time, maxTime) : task.time
return availQueues.contains('dedicated_big_bigmem') ? 'dedicated_big_bigmem' : 'bigmem,hugemem'
} else {
return task.time >= maxTime ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake'
}
}

// Set clusterOptions, changing account based on queue
queue = { determineWiceQueue(task) }
clusterOptions = {
def queueValue = {
task.memory >= 239.GB ?
(task.time >= 72.h && availQueues.contains('dedicated_big_bigmem') ? 'dedicated_big_bigmem' : 'bigmem,hugemem') :
(task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake')
}
queueValue() =~ /dedicated/ ? "--clusters=wice --account=lp_big_wice_cpu" : "--clusters=wice --account=$tier1_project"
determineWiceQueue(task) =~ /dedicated/ ?
"--clusters=wice --account=lp_big_wice_cpu" :
"--clusters=wice --account=$TIER2_PROJECT"
}

withLabel: '.*gpu.*' {
resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'

// Set queue
// The task time is limited to 72 hours if the memory is larger than 239GB
// and dedicated queues are not available
queue = {
def maxTime = 72.h
if (task.memory >= 239.GB) {
task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu_h100') ?
limitTaskTime(task.time, maxTime) : task.time
return availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100'
} else {
task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu') ?
limitTaskTime(task.time, maxTime) : task.time
return availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu'
}
}

clusterOptions = {
// suggested to use 16 cpus per gpu
queue = { determineWiceGpuQueue(task) }
clusterOptions = {
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int)
// Do same queue evaluation as above
def queueValue = {
task.memory >= 239.GB ?
(task.time >= 72.h && availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
(task.time >= 72.h && availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
}

// Set clusterOptions, changing account based on queue
queueValue() =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" :
queueValue() =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" :
"--clusters=wice --account=$tier1_project --gres=gpu:${gpus}"
def queueValue = determineWiceGpuQueue(task)
queueValue =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" :
queueValue =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" :
"--clusters=wice --account=$TIER2_PROJECT --gres=gpu:${gpus}"
}
}
}
@@ -193,39 +258,15 @@ profiles {

process {
// 768 - 65 so 65GB for overhead, max is 720000MB
resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
beforeScript = 'module load cluster/wice'
// Set queue
// The task time is limited to 72 hours if the memory is larger than 239GB
// and dedicated queues are not available
queue = {
def maxTime = 72.h
if (task.memory >= 239.GB) {
task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu_h100') ?
limitTaskTime(task.time, maxTime) : task.time
return availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100'
} else {
task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu') ?
limitTaskTime(task.time, maxTime) : task.time
return availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu'
}
}

// Set clusteroptions
clusterOptions = {
// suggested to use 16 cpus per gpu
beforeScript = 'module load cluster/wice'
resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
queue = { determineWiceGpuQueue(task) }
clusterOptions = {
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int)
// Do same queue evaluation as above, without adjusting task.time
def queueValue = {
task.memory >= 239.GB ?
(task.time >= 72.h && availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
(task.time >= 72.h && availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
}

// Set clusterOptions, changing account based on queue
queueValue() =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" :
queueValue() =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" :
"--clusters=wice --account=$tier1_project --gres=gpu:${gpus}"
def queueValue = determineWiceGpuQueue(task)
queueValue =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" :
queueValue =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" :
"--clusters=wice --account=$TIER2_PROJECT --gres=gpu:${gpus}"
}
}
}
@@ -234,7 +275,7 @@ profiles {
params.config_profile_description = 'superdome profile for use on the genius cluster of the VSC HPC.'

process {
clusterOptions = {"--clusters=genius --account=$tier1_project"}
clusterOptions = {"--clusters=genius --account=$TIER2_PROJECT"}
beforeScript = 'module load cluster/genius/superdome'
// 6000 - 228 so 228GB for overhead, max is 5910888MB
resourceLimits = [ memory: 5772.GB, cpus: 14, time: 168.h]
