Commit
export logic outside of profile scope into custom functions
Joon-Klaps committed Jan 9, 2025
1 parent b20292d commit 6d07e08
Showing 1 changed file with 158 additions and 117 deletions.
275 changes: 158 additions & 117 deletions conf/vsc_kul_uhasselt.config
@@ -1,9 +1,9 @@
// Default to /tmp directory if $VSC_SCRATCH scratch env is not available,
// see: https://github.com/nf-core/configs?tab=readme-ov-file#adding-a-new-config
scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp"
tier1_project = System.getenv("SLURM_ACCOUNT") ?: null
def avail_queues = System.getenv("VSC_DEDICATED_QUEUES") ?: ""
def availQueues = avail_queues.toString().split(',')
def SCRATCH_DIR = System.getenv("VSC_SCRATCH") ?: "/tmp"
def TIER2_PROJECT = System.getenv("SLURM_ACCOUNT") ?: null
def DEDICATED_QUEUES = System.getenv("VSC_DEDICATED_QUEUES") ?: ""
def AVAILABLE_QUEUES = DEDICATED_QUEUES.toString().split(',')
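// Editor's note (illustrative assumption, not part of this commit): VSC_DEDICATED_QUEUES
// is read as a comma-separated list, so access to dedicated partitions would typically
// be exposed before launching the pipeline, for example:
//   export VSC_DEDICATED_QUEUES=dedicated_big_bigmem,dedicated_big_gpu
//   export SLURM_ACCOUNT=lp_my_project   (hypothetical project name)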

// Perform work directory cleanup when the run has successfully completed
// cleanup = true
@@ -30,7 +30,7 @@ process {
singularity {
enabled = true
autoMounts = true
cacheDir = "$scratch_dir/.singularity"
cacheDir = "$SCRATCH_DIR/.singularity"
pullTimeout = "30 min"
}

@@ -40,20 +40,141 @@ params {
}

env {
APPTAINER_TMPDIR="$scratch_dir/.apptainer/tmp"
APPTAINER_CACHEDIR="$scratch_dir/.apptainer/cache"
APPTAINER_TMPDIR="$SCRATCH_DIR/.apptainer/tmp"
APPTAINER_CACHEDIR="$SCRATCH_DIR/.apptainer/cache"
}

// Maximum retries for AWS errors (so the pipeline doesn't fail if a download fails once)
aws {
maxErrorRetry = 3
}

// Function to limit task time when dedicated queues are not available
/*
* Queue Selection Utility Functions for HPC Environments
* ==================================================
* This module provides functions to determine appropriate HPC queues based on task requirements
* for both GENIUS and WICE clusters.
*/

/*
* Constants:
* ----------
* TIME_THRESHOLD: 72 hours - Threshold for determining long-running jobs
* MEMORY_THRESHOLD (GENIUS): 175GB - Memory threshold for bigmem queues
* MEMORY_THRESHOLD (WICE): 239GB - Memory threshold for high-memory queues
*/
def TIME_THRESHOLD = 72.h
def MEMORY_THRESHOLD_GENIUS = 175.GB
def MEMORY_THRESHOLD_WICE = 239.GB

/*
* ---------
* Functions:
* ----------
* These functions are designed to select the appropriate HPC queues of
* VSC_KUL_UHASSELT based on task requirements. They handle both standard
* and GPU queues, considering memory requirements, execution time, and
* queue availability.
*/

/*
* limitTaskTime(time, maxTime)
* Ensures task time doesn't exceed the maximum allowed time
* @param time Current task time
* @param maxTime Maximum allowed time
* @return Limited task time
*/
def limitTaskTime(time, maxTime) {
return time > maxTime ? maxTime : time
}
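
/*
 * Editor's sketch (not part of this commit): with Nextflow duration literals,
 * the helper simply caps a requested walltime at the given maximum, e.g.
 *   limitTaskTime(100.h, 72.h)  // returns 72.h (capped)
 *   limitTaskTime(24.h, 72.h)   // returns 24.h (unchanged)
 */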

/*
* determineGeniusQueue(task)
* Selects appropriate CPU queue for GENIUS cluster
* @param task Nextflow task object containing memory and time requirements
* @return Queue name based on task requirements
*/
def determineGeniusQueue = { task ->
if (task.memory >= MEMORY_THRESHOLD_GENIUS) {
if (task.time >= TIME_THRESHOLD) {
return AVAILABLE_QUEUES.contains('dedicated_big_bigmem') ? 'dedicated_big_bigmem' : 'bigmem_long'
}
return 'bigmem'
}
return task.time >= TIME_THRESHOLD ? 'batch_long' : 'batch'
}
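
/*
 * Editor's sketch (illustrative task requests, not part of this commit):
 *   200.GB, 80.h -> 'dedicated_big_bigmem' if listed in VSC_DEDICATED_QUEUES,
 *                   otherwise 'bigmem_long'
 *   200.GB, 10.h -> 'bigmem'
 *    32.GB, 80.h -> 'batch_long'
 *    32.GB, 10.h -> 'batch'
 */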

/*
* determineGeniusGpuQueue(task)
* Selects appropriate GPU queue for GENIUS cluster
* @param task Nextflow task object containing memory and time requirements
* @return GPU queue name based on task requirements
*/
def determineGeniusGpuQueue = { task ->
if (task.memory >= MEMORY_THRESHOLD_GENIUS) {
return task.time >= TIME_THRESHOLD ? 'gpu_v100_long' : 'gpu_v100'
}
if (task.time >= TIME_THRESHOLD) {
return AVAILABLE_QUEUES.contains('dedicated_rega_gpu') ? 'dedicated_rega_gpu' : 'gpu_p100_long,amd_long'
}
return 'gpu_p100,amd'
}
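
/*
 * Editor's sketch (illustrative task requests, not part of this commit):
 *   200.GB       -> 'gpu_v100_long' when time >= 72.h, else 'gpu_v100'
 *    32.GB, 80.h -> 'dedicated_rega_gpu' if available, else 'gpu_p100_long,amd_long'
 *    32.GB, 10.h -> 'gpu_p100,amd'
 */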

/*
* determineWiceQueue(task)
* Selects appropriate CPU queue for WICE cluster
* @param task Nextflow task object containing memory and time requirements
* @return Queue name based on task requirements and availability
*/
def determineWiceQueue = { task ->
if (task.memory >= MEMORY_THRESHOLD_WICE) {
if (AVAILABLE_QUEUES.contains('dedicated_big_bigmem')) {
return 'dedicated_big_bigmem'
} else {
task.time = limitTaskTime(task.time, TIME_THRESHOLD)
return 'bigmem,hugemem'
}
}

return task.time >= TIME_THRESHOLD ?
'batch_long,batch_icelake_long,batch_sapphirerapids_long' :
'batch,batch_sapphirerapids,batch_icelake'
}
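
/*
 * Editor's sketch (illustrative task requests, not part of this commit):
 *   300.GB        -> 'dedicated_big_bigmem' if available; otherwise the task
 *                    time is capped at 72.h and the job goes to 'bigmem,hugemem'
 *    64.GB, 100.h -> 'batch_long,batch_icelake_long,batch_sapphirerapids_long'
 *    64.GB, 10.h  -> 'batch,batch_sapphirerapids,batch_icelake'
 */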

/*
* determineWiceGpuQueue(task)
* Selects appropriate GPU queue for WICE cluster
* @param task Nextflow task object containing memory and time requirements
* @return GPU queue name based on task requirements
*/
def determineWiceGpuQueue = { task ->
def isHighMemory = task.memory >= MEMORY_THRESHOLD_WICE
def isDedicatedQueue = isHighMemory ?
AVAILABLE_QUEUES.contains('dedicated_big_gpu_h100') :
AVAILABLE_QUEUES.contains('dedicated_big_gpu')

if (task.time >= TIME_THRESHOLD && !isDedicatedQueue) {
task.time = limitTaskTime(task.time, TIME_THRESHOLD)
}

if (isHighMemory) {
return isDedicatedQueue ? 'dedicated_big_gpu_h100' : 'gpu_h100'
} else {
return isDedicatedQueue ? 'dedicated_big_gpu' : 'gpu_a100,gpu'
}
}
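
/*
 * Editor's sketch (illustrative task requests, not part of this commit):
 *   300.GB -> 'dedicated_big_gpu_h100' if available, else 'gpu_h100' with the
 *             task time capped at 72.h when it exceeds the threshold
 *    64.GB -> 'dedicated_big_gpu' if available, else 'gpu_a100,gpu' with the
 *             same 72.h cap applied
 */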

/*
* ========
* Profiles
* ========
* These profiles define the resource limits, queue selection, and cluster options
* for WICE and GENIUS clusters. They also include GPU-specific configurations.
* Details of the resource limits can be found in for genius at
* https://docs.vscentrum.be/leuven/tier2_hardware/genius_hardware.html
* and for wice at https://docs.vscentrum.be/leuven/tier2_hardware/wice_hardware.html
*/
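
/*
 * Editor's note (assumed nf-core/configs usage pattern, not stated in this
 * commit): a cluster profile is typically selected alongside the institutional
 * config on the command line, e.g.
 *   nextflow run <pipeline> -profile vsc_kul_uhasselt,wice
 */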

// Define profiles for each cluster
profiles {
genius {
@@ -62,31 +183,22 @@ profiles {
process {
// 768 - 65 so 65GB for overhead, max is 720000MB
resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ]
beforeScript = 'module load cluster/genius'
clusterOptions = { "--clusters=genius --account=$tier1_project" }

queue = {
task.memory >= 175.GB ?
(task.time >= 72.h ? 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long' : 'bigmem') :
(task.time >= 72.h ? 'batch_long' : 'batch')
beforeScript = 'module load cluster/genius'
queue = { determineGeniusQueue(task) }
clusterOptions = {
determineGeniusQueue(task) =~ /dedicated/ ?
"--clusters=genius --account=lp_big_genius_cpu" :
"--clusters=genius --account=$TIER2_PROJECT"
}

withLabel: '.*gpu.*'{
resourceLimits = [ memory: 703.GB, cpus: 36 , time: 168.h ]
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'

// Set clusteroptions
queue = { determineGeniusGpuQueue(task) }
clusterOptions = {
// suggested to use 9 cpus per gpu
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int)
"--gres=gpu:${gpus} --clusters=genius --account=$tier1_project"
}

queue = {
task.memory >= 175.GB ?
(task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') :
(task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd')
"--gres=gpu:${gpus} --clusters=genius --account=$TIER2_PROJECT"
}
}
}
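/*
 * Editor's worked example (not part of this commit): with no explicit
 * accelerator request, a 36-cpu task on GENIUS gets
 * max(1, floor(36 / 9)) = 4 GPUs; the WICE profiles divide by 16 instead.
 */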
@@ -101,15 +213,10 @@ profiles {
// 768 - 65 so 65GB for overhead, max is 720000MB
resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h]
beforeScript = 'module load cluster/genius'
queue = { determineGeniusGpuQueue(task) }
clusterOptions = {
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int)
"--gres=gpu:${gpus} --clusters=genius --account=$tier1_project"
}

queue = {
task.memory >= 175.GB ?
(task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') :
(task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd')
"--gres=gpu:${gpus} --clusters=genius --account=$TIER2_PROJECT"
}
}
}
@@ -121,66 +228,24 @@ profiles {
// max is 2016000
resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ]
beforeScript = 'module load cluster/wice'

// Set queue
// The task time is limited to 72 hours if the memory is larger than 239GB
// and dedicated queues are not available
queue = {
def maxTime = 72.h
if (task.memory >= 239.GB) {
task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_bigmem') ?
limitTaskTime(task.time, maxTime) : task.time
return availQueues.contains('dedicated_big_bigmem') ? 'dedicated_big_bigmem' : 'bigmem,hugemem'
} else {
return task.time >= maxTime ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake'
}
}

// Set clusterOptions, changing account based on queue
queue = { determineWiceQueue(task) }
clusterOptions = {
def queueValue = {
task.memory >= 239.GB ?
(task.time >= 72.h && availQueues.contains('dedicated_big_bigmem') ? 'dedicated_big_bigmem' : 'bigmem,hugemem') :
(task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake')
}
queueValue() =~ /dedicated/ ? "--clusters=wice --account=lp_big_wice_cpu" : "--clusters=wice --account=$tier1_project"
determineWiceQueue(task) =~ /dedicated/ ?
"--clusters=wice --account=lp_big_wice_cpu" :
"--clusters=wice --account=$TIER2_PROJECT"
}

withLabel: '.*gpu.*' {
resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'

// Set queue
// The task time is limited to 72 hours if the memory is larger than 239GB
// and dedicated queues are not available
queue = {
def maxTime = 72.h
if (task.memory >= 239.GB) {
task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu_h100') ?
limitTaskTime(task.time, maxTime) : task.time
return availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100'
} else {
task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu') ?
limitTaskTime(task.time, maxTime) : task.time
return availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu'
}
}

clusterOptions = {
// suggested to use 16 cpus per gpu
queue = { determineWiceGpuQueue(task) }
clusterOptions = {
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int)
// Do same queue evaluation as above
def queueValue = {
task.memory >= 239.GB ?
(task.time >= 72.h && availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
(task.time >= 72.h && availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
}

// Set clusterOptions, changing account based on queue
queueValue() =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" :
queueValue() =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" :
"--clusters=wice --account=$tier1_project --gres=gpu:${gpus}"
def queueValue = determineWiceGpuQueue(task)
queueValue =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" :
queueValue =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" :
"--clusters=wice --account=$TIER2_PROJECT --gres=gpu:${gpus}"
}
}
}
@@ -193,39 +258,15 @@ profiles {

process {
// 768 - 65 so 65GB for overhead, max is 720000MB
resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
beforeScript = 'module load cluster/wice'
// Set queue
// The task time is limited to 72 hours if the memory is larger than 239GB
// and dedicated queues are not available
queue = {
def maxTime = 72.h
if (task.memory >= 239.GB) {
task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu_h100') ?
limitTaskTime(task.time, maxTime) : task.time
return availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100'
} else {
task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu') ?
limitTaskTime(task.time, maxTime) : task.time
return availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu'
}
}

// Set clusteroptions
clusterOptions = {
// suggested to use 16 cpus per gpu
beforeScript = 'module load cluster/wice'
resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
queue = { determineWiceGpuQueue(task) }
clusterOptions = {
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int)
// Do same queue evaluation as above, without adjusting task.time
def queueValue = {
task.memory >= 239.GB ?
(task.time >= 72.h && availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
(task.time >= 72.h && availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
}

// Set clusterOptions, changing account based on queue
queueValue() =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" :
queueValue() =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" :
"--clusters=wice --account=$tier1_project --gres=gpu:${gpus}"
def queueValue = determineWiceGpuQueue(task)
queueValue =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" :
queueValue =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" :
"--clusters=wice --account=$TIER2_PROJECT --gres=gpu:${gpus}"
}
}
}
@@ -234,7 +275,7 @@ profiles {
params.config_profile_description = 'superdome profile for use on the genius cluster of the VSC HPC.'

process {
clusterOptions = {"--clusters=genius --account=$tier1_project"}
clusterOptions = {"--clusters=genius --account=$TIER2_PROJECT"}
beforeScript = 'module load cluster/genius/superdome'
// 6000 - 228 so 228GB for overhead, max is 5910888MB
resourceLimits = [ memory: 5772.GB, cpus: 14, time: 168.h]
