From 317e5a16cb8c769115b0c7554c060d638426b134 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Thu, 25 Aug 2022 16:24:52 -0700
Subject: [PATCH] Improve AWS-related config for Sage profile

---
 conf/sage.config | 112 +++++++++++++++++++++++++++++------------------
 docs/sage.md     |   7 ++-
 2 files changed, 74 insertions(+), 45 deletions(-)

diff --git a/conf/sage.config b/conf/sage.config
index e5bfa8b..7477d83 100644
--- a/conf/sage.config
+++ b/conf/sage.config
@@ -1,62 +1,88 @@
+// Config profile metadata
 params {
     config_profile_description = 'The Sage Bionetworks profile'
     config_profile_contact = 'Bruno Grande (@BrunoGrandePhD)'
     config_profile_url = 'https://github.com/Sage-Bionetworks-Workflows'
 }
 
+// Leverage us-east-1 mirror of select human and mouse genomes
+params {
+    igenomes_base = 's3://sage-igenomes/igenomes'
+}
+
+// Enable retries globally for certain exit codes
 process {
-
-    cpus = { check_max( 1 * slow(task.attempt), 'cpus' ) }
-    memory = { check_max( 6.GB * task.attempt, 'memory' ) }
-    time = { check_max( 24.h * task.attempt, 'time' ) }
-
     errorStrategy = { task.exitStatus in [143,137,104,134,139,247] ? 'retry' : 'finish' }
     maxRetries = 5
     maxErrors = '-1'
-
-    // Process-specific resource requirements
-    withLabel:process_low {
-        cpus = { check_max( 4 * slow(task.attempt), 'cpus' ) }
-        memory = { check_max( 12.GB * task.attempt, 'memory' ) }
-        time = { check_max( 24.h * task.attempt, 'time' ) }
-    }
-    withLabel:process_medium {
-        cpus = { check_max( 12 * slow(task.attempt), 'cpus' ) }
-        memory = { check_max( 36.GB * task.attempt, 'memory' ) }
-        time = { check_max( 48.h * task.attempt, 'time' ) }
-    }
-    withLabel:process_high {
-        cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) }
-        memory = { check_max( 72.GB * task.attempt, 'memory' ) }
-        time = { check_max( 96.h * task.attempt, 'time' ) }
-    }
-    withLabel:process_long {
-        time = { check_max( 192.h * task.attempt, 'time' ) }
-    }
-    withLabel:process_high_memory {
-        memory = { check_max( 128.GB * task.attempt, 'memory' ) }
-    }
-
-    // Preventing Sarek labels from using the actual maximums
-    withLabel:memory_max {
-        memory = { check_max( 128.GB * task.attempt, 'memory' ) }
-    }
-    withLabel:cpus_max {
-        cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) }
-    }
-
 }
 
+// Increase time limit to allow file transfers to finish
+// The default is 12 hours, which results in timeouts
+threadPool.FileTransfer.maxAwait = '24 hour'
+
+// Configure Nextflow to be more reliable on AWS
 aws {
     region = "us-east-1"
+    client {
+        uploadChunkSize = 209715200
+    }
+}
+executor {
+    name = 'awsbatch'
+    // Ensure unlimited queue size on AWS Batch
+    queueSize = 100000
+    // Slow down the rate at which AWS Batch jobs accumulate in
+    // the queue (an attempt to prevent orphaned EBS volumes)
+    submitRateLimit = '5 / 1 sec'
 }
 
-params {
-    igenomes_base = 's3://sage-igenomes/igenomes'
-    max_memory = 500.GB
-    max_cpus = 64
-    max_time = 168.h // One week
-}
+// Disabling resource allocation tweaks for now
+//
+// params {
+//     max_memory = 500.GB
+//     max_cpus = 64
+//     max_time = 168.h // One week
+// }
+//
+// process {
+//
+//     cpus = { check_max( 1 * slow(task.attempt), 'cpus' ) }
+//     memory = { check_max( 6.GB * task.attempt, 'memory' ) }
+//     time = { check_max( 24.h * task.attempt, 'time' ) }
+//
+//     // Process-specific resource requirements
+//     withLabel:process_low {
+//         cpus = { check_max( 4 * slow(task.attempt), 'cpus' ) }
+//         memory = { check_max( 12.GB * task.attempt, 'memory' ) }
+//         time = { check_max( 24.h * task.attempt, 'time' ) }
+//     }
+//     withLabel:process_medium {
+//         cpus = { check_max( 12 * slow(task.attempt), 'cpus' ) }
+//         memory = { check_max( 36.GB * task.attempt, 'memory' ) }
+//         time = { check_max( 48.h * task.attempt, 'time' ) }
+//     }
+//     withLabel:process_high {
+//         cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) }
+//         memory = { check_max( 72.GB * task.attempt, 'memory' ) }
+//         time = { check_max( 96.h * task.attempt, 'time' ) }
+//     }
+//     withLabel:process_long {
+//         time = { check_max( 192.h * task.attempt, 'time' ) }
+//     }
+//     withLabel:process_high_memory {
+//         memory = { check_max( 128.GB * task.attempt, 'memory' ) }
+//     }
+//
+//     // Preventing Sarek labels from using the actual maximums
+//     withLabel:memory_max {
+//         memory = { check_max( 128.GB * task.attempt, 'memory' ) }
+//     }
+//     withLabel:cpus_max {
+//         cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) }
+//     }
+//
+// }
 
 // Function to slow the increase of the resource multipler
 // as attempts are made. The rationale is that some CPUs
diff --git a/docs/sage.md b/docs/sage.md
index 133ccec..d503b42 100644
--- a/docs/sage.md
+++ b/docs/sage.md
@@ -5,11 +5,14 @@ To use this custom configuration, run the pipeline with `-profile sage`. This wi
 This global configuration includes the following tweaks:
 
 - Update the default value for `igenomes_base` to `s3://sage-igenomes`
-- Increase the default time limits because we run pipelines on AWS
 - Enable retries by default when exit codes relate to insufficient memory
 - Allow pending jobs to finish if the number of retries are exhausted
-- Slow the increase in the number of allocated CPU cores on retries
+- Increase the amount of time allowed for file transfers
+- Increase the default chunk size for multipart uploads to S3
+- Slow down job submission rate to avoid overwhelming any APIs
 - Define the `check_max()` function, which is missing in Sarek v2
+- (Disabled temporarily) Slow the increase in the number of allocated CPU cores on retries
+- (Disabled temporarily) Increase the default time limits because we run pipelines on AWS
 
 ## Additional information about iGenomes
 
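The commented-out resource tweaks above still reference a `slow()` helper, and the hunk's trailing context shows only the start of the comment describing it ("Function to slow the increase of the resource multipler as attempts are made"). The function body lies outside this hunk, so the snippet below is only a minimal sketch consistent with that description; it is not taken from the patch, and the actual definition in `conf/sage.config` may differ.

```groovy
// Hypothetical sketch (not part of the patch): slow the growth of the retry
// multiplier so that resource requests scale up only every `factor` attempts,
// e.g. attempts 1-2 -> 1x, attempts 3-4 -> 2x, attempts 5-6 -> 3x, ...
def slow(attempt, factor = 2) {
    return Math.ceil(attempt / factor) as int
}
```

Used as `slow(task.attempt)` inside the `check_max(...)` calls, a helper like this keeps CPU requests from doubling on every retry the way the memory requests do.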