diff --git a/conf/sage.config b/conf/sage.config
index e5bfa8b..bfe1e09 100644
--- a/conf/sage.config
+++ b/conf/sage.config
@@ -1,19 +1,52 @@
+// Config profile metadata
 params {
     config_profile_description = 'The Sage Bionetworks profile'
     config_profile_contact = 'Bruno Grande (@BrunoGrandePhD)'
     config_profile_url = 'https://github.com/Sage-Bionetworks-Workflows'
 }
 
+// Leverage us-east-1 mirror of select human and mouse genomes
+params {
+    igenomes_base = 's3://sage-igenomes/igenomes'
+    max_memory = '128.GB'
+    max_cpus = 16
+    max_time = '240.h'
+}
+
+// Enable retries globally for certain exit codes
+process {
+    errorStrategy = { task.exitStatus in [143,137,104,134,139,247] ? 'retry' : 'finish' }
+    maxRetries = 5
+    maxErrors = '-1'
+}
+
+// Increase time limit to allow file transfers to finish
+// The default is 12 hours, which results in timeouts
+threadPool.FileTransfer.maxAwait = '24 hour'
+
+// Configure Nextflow to be more reliable on AWS
+aws {
+    region = "us-east-1"
+    client {
+        uploadChunkSize = 209715200
+    }
+}
+executor {
+    name = 'awsbatch'
+    // Ensure unlimited queue size on AWS Batch
+    queueSize = 100000
+    // Slow down the rate at which AWS Batch jobs accumulate in
+    // the queue (an attempt to prevent orphaned EBS volumes)
+    submitRateLimit = '5 / 1 sec'
+}
+
+// Adjust default resource allocations (see `../docs/sage.md`)
 process {
 
     cpus = { check_max( 1 * slow(task.attempt), 'cpus' ) }
     memory = { check_max( 6.GB * task.attempt, 'memory' ) }
     time = { check_max( 24.h * task.attempt, 'time' ) }
 
-    errorStrategy = { task.exitStatus in [143,137,104,134,139,247] ? 'retry' : 'finish' }
-    maxRetries = 5
-    maxErrors = '-1'
-
     // Process-specific resource requirements
     withLabel:process_low {
         cpus = { check_max( 4 * slow(task.attempt), 'cpus' ) }
@@ -37,32 +70,13 @@ process {
         memory = { check_max( 128.GB * task.attempt, 'memory' ) }
     }
 
-    // Preventing Sarek labels from using the actual maximums
-    withLabel:memory_max {
-        memory = { check_max( 128.GB * task.attempt, 'memory' ) }
-    }
-    withLabel:cpus_max {
-        cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) }
-    }
-
-}
-
-aws {
-    region = "us-east-1"
-}
-
-params {
-    igenomes_base = 's3://sage-igenomes/igenomes'
-    max_memory = 500.GB
-    max_cpus = 64
-    max_time = 168.h // One week
 }
 
 // Function to slow the increase of the resource multipler
-// as attempts are made. The rationale is that some CPUs
-// don't need to be increased as fast as memory.
+// as attempts are made. The rationale is that the number
+// of CPU cores isn't a limiting factor as often as memory.
 def slow(attempt, factor = 2) {
-  return Math.ceil( attempt / factor) as int
+    return Math.ceil( attempt / factor) as int
 }
diff --git a/docs/sage.md b/docs/sage.md
index 133ccec..1e36fed 100644
--- a/docs/sage.md
+++ b/docs/sage.md
@@ -5,11 +5,14 @@ To use this custom configuration, run the pipeline with `-profile sage`. This wi
 This global configuration includes the following tweaks:
 
 - Update the default value for `igenomes_base` to `s3://sage-igenomes`
-- Increase the default time limits because we run pipelines on AWS
 - Enable retries by default when exit codes relate to insufficient memory
 - Allow pending jobs to finish if the number of retries are exhausted
-- Slow the increase in the number of allocated CPU cores on retries
+- Increase the amount of time allowed for file transfers
+- Increase the default chunk size for multipart uploads to S3
+- Slow down job submission rate to avoid overwhelming any APIs
 - Define the `check_max()` function, which is missing in Sarek v2
+- Slow the increase in the number of allocated CPU cores on retries
+- Increase the default time limits because we run pipelines on AWS
 
 ## Additional information about iGenomes
 
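Note for reviewers: the `check_max()` helper called by the resource closures above (and referenced in the docs bullet) is defined in `conf/sage.config` outside these hunks, so it does not appear in the diff. It is expected to follow the standard nf-core boilerplate; a minimal sketch of that boilerplate, assuming the usual nf-core signature and the `params.max_*` values set in this profile, looks like this:

```groovy
// Sketch of the standard nf-core 'check_max()' boilerplate this profile is
// expected to define; the actual copy lives in conf/sage.config outside the
// hunks above and may differ slightly.
def check_max(obj, type) {
    if (type == 'memory') {
        try {
            // Clamp the requested memory to params.max_memory
            if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1)
                return params.max_memory as nextflow.util.MemoryUnit
            else
                return obj
        } catch (all) {
            println "   ### ERROR ###   Max memory '${params.max_memory}' is not valid! Using default value: $obj"
            return obj
        }
    } else if (type == 'time') {
        try {
            // Clamp the requested walltime to params.max_time
            if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1)
                return params.max_time as nextflow.util.Duration
            else
                return obj
        } catch (all) {
            println "   ### ERROR ###   Max time '${params.max_time}' is not valid! Using default value: $obj"
            return obj
        }
    } else if (type == 'cpus') {
        try {
            // Clamp the requested CPU count to params.max_cpus
            return Math.min(obj, params.max_cpus as int)
        } catch (all) {
            println "   ### ERROR ###   Max cpus '${params.max_cpus}' is not valid! Using default value: $obj"
            return obj
        }
    }
}
```

With this profile's caps (`max_memory = '128.GB'`, `max_cpus = 16`, `max_time = '240.h'`), each retry's scaled request from the `check_max( ... * task.attempt, ... )` closures is clamped to those ceilings before the job is submitted to AWS Batch.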