From 81c5857c516c80ca1ab31a0d97aea9a9a95b1077 Mon Sep 17 00:00:00 2001
From: Matthias De Smet <11850640+matthdsm@users.noreply.github.com>
Date: Wed, 26 Oct 2022 11:07:14 +0200
Subject: [PATCH 1/2] Update vsc_ugent.config

---
Review notes (free-form area, not part of the commit message):
  * executor.submitRateLimit changes from '3 sec' to '30/1min'.
  * A top-level process block is added: errorStrategy is a closure that
    sleeps for Math.pow(2, task.attempt) * 200 (cast to long; presumably
    milliseconds -- confirm) and then returns 'retry'; maxRetries = 5.
  * The per-profile 'maxRetries = 2' is removed from skitty, swalot,
    victini, kirlia and doduo.
  * config_profile_contact becomes 'ict@cmgg.be' in all five profiles.
  * NOTE(review): this series was restored from a whitespace-collapsed copy.
    Hunk structure was checked against the hunk line counts and the diffstat,
    but leading indentation of the config lines is assumed to be 4 spaces per
    level. If context fails to match, apply with --ignore-whitespace.

 conf/vsc_ugent.config | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/conf/vsc_ugent.config b/conf/vsc_ugent.config
index b46b347..d2ce550 100644
--- a/conf/vsc_ugent.config
+++ b/conf/vsc_ugent.config
@@ -10,10 +10,16 @@ workDir = "$scratch_dir/work"
 // Reduce the job submit rate to about 3 per second, this way the server won't be bombarded with jobs
 // Limit queueSize to keep job rate under control and avoid timeouts
 executor {
-    submitRateLimit = '3 sec'
+    submitRateLimit = '30/1min'
     queueSize = 50
 }
 
+// Add backoff strategy to catch cluster timeouts
+process {
+    errorStrategy = { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' }
+    maxRetries = 5
+}
+
 // Specify that singularity should be used and where the cache dir will be for the images
 singularity {
     enabled = true
@@ -30,7 +36,7 @@ profiles {
     skitty {
         params {
             config_profile_description = 'HPC_SKITTY profile for use on the Skitty cluster of the VSC HPC.'
-            config_profile_contact = 'Nicolas Vannieuwkerke (@nvnieuwk)'
+            config_profile_contact = 'ict@cmgg.be'
             config_profile_url = 'https://www.ugent.be/hpc/en'
             max_memory = 177.GB
             max_cpus = 36
@@ -40,7 +46,6 @@ profiles {
         process {
             executor = 'slurm'
             queue = 'skitty'
-            maxRetries = 2
             scratch = "$scratch_dir"
         }
     }
@@ -48,7 +53,7 @@ profiles {
     swalot {
         params {
             config_profile_description = 'HPC_SWALOT profile for use on the Swalot cluster of the VSC HPC.'
-            config_profile_contact = 'Nicolas Vannieuwkerke (@nvnieuwk)'
+            config_profile_contact = 'ict@cmgg.be'
             config_profile_url = 'https://www.ugent.be/hpc/en'
             max_memory = 116.GB
             max_cpus = 20
@@ -58,7 +63,6 @@ profiles {
         process {
             executor = 'slurm'
             queue = 'swalot'
-            maxRetries = 2
             scratch = "$scratch_dir"
         }
     }
@@ -66,7 +70,7 @@ profiles {
     victini {
         params {
             config_profile_description = 'HPC_VICTINI profile for use on the Victini cluster of the VSC HPC.'
-            config_profile_contact = 'Nicolas Vannieuwkerke (@nvnieuwk)'
+            config_profile_contact = 'ict@cmgg.be'
             config_profile_url = 'https://www.ugent.be/hpc/en'
             max_memory = 88.GB
             max_cpus = 36
@@ -76,7 +80,6 @@ profiles {
         process {
             executor = 'slurm'
             queue = 'victini'
-            maxRetries = 2
             scratch = "$scratch_dir"
         }
     }
@@ -84,7 +87,7 @@ profiles {
     kirlia {
         params {
             config_profile_description = 'HPC_KIRLIA profile for use on the Kirlia cluster of the VSC HPC.'
-            config_profile_contact = 'Nicolas Vannieuwkerke (@nvnieuwk)'
+            config_profile_contact = 'ict@cmgg.be'
             config_profile_url = 'https://www.ugent.be/hpc/en'
             max_memory = 738.GB
             max_cpus = 36
@@ -94,7 +97,6 @@ profiles {
         process {
             executor = 'slurm'
             queue = 'kirlia'
-            maxRetries = 2
             scratch = "$scratch_dir"
         }
     }
@@ -102,7 +104,7 @@ profiles {
     doduo {
         params {
             config_profile_description = 'HPC_DODUO profile for use on the Doduo cluster of the VSC HPC.'
-            config_profile_contact = 'Nicolas Vannieuwkerke (@nvnieuwk)'
+            config_profile_contact = 'ict@cmgg.be'
             config_profile_url = 'https://www.ugent.be/hpc/en'
             max_memory = 250.GB
             max_cpus = 96
@@ -112,7 +114,6 @@ profiles {
         process {
             executor = 'slurm'
             queue = 'doduo'
-            maxRetries = 2
             scratch = "$scratch_dir"
         }
     }

From ec190ac0136b0a796b8a799ee07177a5400561a2 Mon Sep 17 00:00:00 2001
From: Matthias De Smet <11850640+matthdsm@users.noreply.github.com>
Date: Wed, 26 Oct 2022 11:09:45 +0200
Subject: [PATCH 2/2] Update conf/vsc_ugent.config

Co-authored-by: nvnieuwk <101190534+nvnieuwk@users.noreply.github.com>
---
Review notes (free-form area, not part of the commit message):
  * Comment-only change: the rate-limit comment now says "30 per minute",
    matching submitRateLimit = '30/1min' introduced in PATCH 1/2.
  * NOTE(review): restored from a whitespace-collapsed copy; indentation of
    the config lines is assumed (4 spaces per level).

 conf/vsc_ugent.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conf/vsc_ugent.config b/conf/vsc_ugent.config
index d2ce550..4a3733a 100644
--- a/conf/vsc_ugent.config
+++ b/conf/vsc_ugent.config
@@ -7,7 +7,7 @@ workDir = "$scratch_dir/work"
 // Perform work directory cleanup when the run has succesfully completed
 // cleanup = true
 
-// Reduce the job submit rate to about 3 per second, this way the server won't be bombarded with jobs
+// Reduce the job submit rate to about 30 per minute, this way the server won't be bombarded with jobs
 // Limit queueSize to keep job rate under control and avoid timeouts
 executor {
     submitRateLimit = '30/1min'