From f5c36dde3ee04c89381f113c13b39c6fe64fe367 Mon Sep 17 00:00:00 2001 From: phue Date: Wed, 16 Jun 2021 16:36:39 +0200 Subject: [PATCH 1/4] cbe: send SIGUSR2 upon job termination Previously, if a process hit the walltime limit and received SIGKILL from the Slurm scheduler, singularity did not properly propagate this (soft) kill signal. This prevented the exit code from being caught, e.g. for resubmission purposes. This commit introduces a workaround using Slurm's --signal directive to send SIGUSR2 to the singularity process itself (instead of container child processes, which presumably was happening before). Effectively, once a job reaches the walltime limit, this will result in exit code 140, which is typically caught by the errorStrategy in nf-core pipelines. See also: https://slurm.schedmd.com/sbatch.html#OPT_signal https://github.com/nextflow-io/nextflow/issues/2163 https://github.com/nextflow-io/nextflow/issues/1561 --- conf/cbe.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/cbe.config b/conf/cbe.config index 18f72dc..2ff0b6b 100755 --- a/conf/cbe.config +++ b/conf/cbe.config @@ -8,7 +8,7 @@ params { process { executor = 'slurm' queue = { task.memory <= 170.GB ? 'c' : 'm' } - clusterOptions = { task.time <= 1.h ? '--qos rapid' : task.time <= 8.h ? '--qos short': task.time <= 48.h ? '--qos medium' : '--qos long' } + clusterOptions = { def qos = task.time <= 1.h ? '--qos rapid' : { task.time <= 8.h ? '--qos short': { task.time <= 48.h ? 
'--qos medium' : '--qos long' } }; qos << ' --signal B:USR2' } module = 'anaconda3/2019.10' } From 45213cf6db28a432ce5e9684787591f3345687a0 Mon Sep 17 00:00:00 2001 From: phue Date: Thu, 17 Jun 2021 11:12:36 +0200 Subject: [PATCH 2/4] add note about upcoming upstream fix also refactor the closure to make it slightly more readable --- conf/cbe.config | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/conf/cbe.config b/conf/cbe.config index 2ff0b6b..8540abe 100755 --- a/conf/cbe.config +++ b/conf/cbe.config @@ -8,8 +8,10 @@ params { process { executor = 'slurm' queue = { task.memory <= 170.GB ? 'c' : 'm' } - clusterOptions = { def qos = task.time <= 1.h ? '--qos rapid' : { task.time <= 8.h ? '--qos short': { task.time <= 48.h ? '--qos medium' : '--qos long' } }; qos << ' --signal B:USR2' } module = 'anaconda3/2019.10' + + // --signal option will be handled by nextflow after 21.10.0 release (see https://github.com/nextflow-io/nextflow/issues/2163) + clusterOptions = { '--signal B:USR2 ' << ( task.time <= 1.h ? '--qos rapid' : ( task.time <= 8.h ? '--qos short': ( task.time <= 48.h ? 
'--qos medium' : '--qos long' ) ) ) } } singularity { From 6c4998fa8a05c8c833c9e40dd7b063a2d806d86e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20H=C3=BCther?= Date: Fri, 27 Aug 2021 14:01:34 +0200 Subject: [PATCH 3/4] reduce max_memory to 1.8TB --- conf/cbe.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/cbe.config b/conf/cbe.config index 8540abe..4f505cc 100755 --- a/conf/cbe.config +++ b/conf/cbe.config @@ -22,6 +22,6 @@ singularity { params { params.max_time = 14.d params.max_cpus = 36 - params.max_memory = 4.TB + params.max_memory = 1800.GB igenomes_base = '/resources/references/igenomes' } From 1cbe2759c46038d27a415ac2d12d29d8ef28debb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20H=C3=BCther?= Date: Wed, 22 Sep 2021 11:54:27 +0200 Subject: [PATCH 4/4] fix invalid rapid qos selection it only applies to the c partition --- conf/cbe.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/cbe.config b/conf/cbe.config index 4f505cc..89da3db 100755 --- a/conf/cbe.config +++ b/conf/cbe.config @@ -11,7 +11,7 @@ process { module = 'anaconda3/2019.10' // --signal option will be handled by nextflow after 21.10.0 release (see https://github.com/nextflow-io/nextflow/issues/2163) - clusterOptions = { '--signal B:USR2 ' << ( task.time <= 1.h ? '--qos rapid' : ( task.time <= 8.h ? '--qos short': ( task.time <= 48.h ? '--qos medium' : '--qos long' ) ) ) } + clusterOptions = { '--signal B:USR2 ' << ( (queue == 'c' & task.time <= 1.h) ? '--qos rapid' : ( task.time <= 8.h ? '--qos short': ( task.time <= 48.h ? '--qos medium' : '--qos long' ) ) ) } } singularity {