From 317e5a16cb8c769115b0c7554c060d638426b134 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Thu, 25 Aug 2022 16:24:52 -0700 Subject: [PATCH 1/3] Improve AWS-related config for Sage profile --- conf/sage.config | 112 +++++++++++++++++++++++++++++------------------ docs/sage.md | 7 ++- 2 files changed, 74 insertions(+), 45 deletions(-) diff --git a/conf/sage.config b/conf/sage.config index e5bfa8b..7477d83 100644 --- a/conf/sage.config +++ b/conf/sage.config @@ -1,62 +1,88 @@ +// Config profile metadata params { config_profile_description = 'The Sage Bionetworks profile' config_profile_contact = 'Bruno Grande (@BrunoGrandePhD)' config_profile_url = 'https://github.com/Sage-Bionetworks-Workflows' } +// Leverage us-east-1 mirror of select human and mouse genomes +params { + igenomes_base = 's3://sage-igenomes/igenomes' +} + +// Enable retries globally for certain exit codes process { - - cpus = { check_max( 1 * slow(task.attempt), 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 24.h * task.attempt, 'time' ) } - errorStrategy = { task.exitStatus in [143,137,104,134,139,247] ? 'retry' : 'finish' } maxRetries = 5 maxErrors = '-1' - - // Process-specific resource requirements - withLabel:process_low { - cpus = { check_max( 4 * slow(task.attempt), 'cpus' ) } - memory = { check_max( 12.GB * task.attempt, 'memory' ) } - time = { check_max( 24.h * task.attempt, 'time' ) } - } - withLabel:process_medium { - cpus = { check_max( 12 * slow(task.attempt), 'cpus' ) } - memory = { check_max( 36.GB * task.attempt, 'memory' ) } - time = { check_max( 48.h * task.attempt, 'time' ) } - } - withLabel:process_high { - cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) } - memory = { check_max( 72.GB * task.attempt, 'memory' ) } - time = { check_max( 96.h * task.attempt, 'time' ) } - } - withLabel:process_long { - time = { check_max( 192.h * task.attempt, 'time' ) } - } - withLabel:process_high_memory { - memory = { check_max( 128.GB * task.attempt, 'memory' ) } - } - - // Preventing Sarek labels from using the actual maximums - withLabel:memory_max { - memory = { check_max( 128.GB * task.attempt, 'memory' ) } - } - withLabel:cpus_max { - cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) } - } - } +// Increase time limit to allow file transfers to finish +// The default is 12 hours, which results in timeouts +threadPool.FileTransfer.maxAwait = '24 hour' + +// Configure Nextflow to be more reliable on AWS aws { region = "us-east-1" + client { + uploadChunkSize = 209715200 + } +} +executor { + name = 'awsbatch' + // Ensure unlimited queue size on AWS Batch + queueSize = 100000 + // Slow down the rate at which AWS Batch jobs accumulate in + // the queue (an attempt to prevent orphaned EBS volumes) + submitRateLimit = '5 / 1 sec' } -params { - igenomes_base = 's3://sage-igenomes/igenomes' - max_memory = 500.GB - max_cpus = 64 - max_time = 168.h // One week -} +// Disabling resource allocation tweaks for now +// +// params { +// max_memory = 500.GB +// max_cpus = 64 +// max_time = 168.h // One week +// } +// +// process { +// +// cpus = { check_max( 1 * slow(task.attempt), 'cpus' ) } +// memory = { check_max( 6.GB * task.attempt, 'memory' ) } +// time = { check_max( 24.h * task.attempt, 'time' ) } +// +// // Process-specific resource requirements +// withLabel:process_low { +// cpus = { check_max( 4 * slow(task.attempt), 'cpus' ) } +// memory = { check_max( 12.GB * task.attempt, 'memory' ) } +// time = { check_max( 24.h * task.attempt, 'time' ) } +// } +// withLabel:process_medium { +// cpus = { check_max( 12 * slow(task.attempt), 'cpus' ) } +// memory = { check_max( 36.GB * task.attempt, 'memory' ) } +// time = { check_max( 48.h * task.attempt, 'time' ) } +// } +// withLabel:process_high { +// cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) } +// memory = { check_max( 72.GB * task.attempt, 'memory' ) } +// time = { check_max( 96.h * task.attempt, 'time' ) } +// } +// withLabel:process_long { +// time = { check_max( 192.h * task.attempt, 'time' ) } +// } +// withLabel:process_high_memory { +// memory = { check_max( 128.GB * task.attempt, 'memory' ) } +// } +// +// // Preventing Sarek labels from using the actual maximums +// withLabel:memory_max { +// memory = { check_max( 128.GB * task.attempt, 'memory' ) } +// } +// withLabel:cpus_max { +// cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) } +// } +// +// } // Function to slow the increase of the resource multipler // as attempts are made. The rationale is that some CPUs diff --git a/docs/sage.md b/docs/sage.md index 133ccec..d503b42 100644 --- a/docs/sage.md +++ b/docs/sage.md @@ -5,11 +5,14 @@ To use this custom configuration, run the pipeline with `-profile sage`. This wi This global configuration includes the following tweaks: - Update the default value for `igenomes_base` to `s3://sage-igenomes` -- Increase the default time limits because we run pipelines on AWS - Enable retries by default when exit codes relate to insufficient memory - Allow pending jobs to finish if the number of retries are exhausted -- Slow the increase in the number of allocated CPU cores on retries +- Increase the amount of time allowed for file transfers +- Increase the default chunk size for multipart uploads to S3 +- Slow down job submission rate to avoid overwhelming any APIs - Define the `check_max()` function, which is missing in Sarek v2 +- (Disabled temporarily) Slow the increase in the number of allocated CPU cores on retries +- (Disabled temporarily) Increase the default time limits because we run pipelines on AWS ## Additional information about iGenomes From c8837235591ad4925fcefcb4f6b96036a14cd53b Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Wed, 31 Aug 2022 09:10:30 -0700 Subject: [PATCH 2/3] Simplify resource adjustments --- conf/sage.config | 83 ++++++++++++++++++++---------------------------- docs/sage.md | 4 +-- 2 files changed, 36 insertions(+), 51 deletions(-) diff --git a/conf/sage.config b/conf/sage.config index 7477d83..615da63 100644 --- a/conf/sage.config +++ b/conf/sage.config @@ -37,58 +37,43 @@ executor { submitRateLimit = '5 / 1 sec' } -// Disabling resource allocation tweaks for now -// -// params { -// max_memory = 500.GB -// max_cpus = 64 -// max_time = 168.h // One week -// } -// -// process { -// -// cpus = { check_max( 1 * slow(task.attempt), 'cpus' ) } -// memory = { check_max( 6.GB * task.attempt, 'memory' ) } -// time = { check_max( 24.h * task.attempt, 'time' ) } -// -// // Process-specific resource requirements -// withLabel:process_low { -// cpus = { check_max( 4 * slow(task.attempt), 'cpus' ) } -// memory = { check_max( 12.GB * task.attempt, 'memory' ) } -// time = { check_max( 24.h * task.attempt, 'time' ) } -// } -// withLabel:process_medium { -// cpus = { check_max( 12 * slow(task.attempt), 'cpus' ) } -// memory = { check_max( 36.GB * task.attempt, 'memory' ) } -// time = { check_max( 48.h * task.attempt, 'time' ) } -// } -// withLabel:process_high { -// cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) } -// memory = { check_max( 72.GB * task.attempt, 'memory' ) } -// time = { check_max( 96.h * task.attempt, 'time' ) } -// } -// withLabel:process_long { -// time = { check_max( 192.h * task.attempt, 'time' ) } -// } -// withLabel:process_high_memory { -// memory = { check_max( 128.GB * task.attempt, 'memory' ) } -// } -// -// // Preventing Sarek labels from using the actual maximums -// withLabel:memory_max { -// memory = { check_max( 128.GB * task.attempt, 'memory' ) } -// } -// withLabel:cpus_max { -// cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) } -// } -// -// } +// Adjust default resource allocations (see `../docs/sage.md`) +process { + + cpus = { check_max( 1 * slow(task.attempt), 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 24.h * task.attempt, 'time' ) } + + // Process-specific resource requirements + withLabel:process_low { + cpus = { check_max( 4 * slow(task.attempt), 'cpus' ) } + memory = { check_max( 12.GB * task.attempt, 'memory' ) } + time = { check_max( 24.h * task.attempt, 'time' ) } + } + withLabel:process_medium { + cpus = { check_max( 12 * slow(task.attempt), 'cpus' ) } + memory = { check_max( 36.GB * task.attempt, 'memory' ) } + time = { check_max( 48.h * task.attempt, 'time' ) } + } + withLabel:process_high { + cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) } + memory = { check_max( 72.GB * task.attempt, 'memory' ) } + time = { check_max( 96.h * task.attempt, 'time' ) } + } + withLabel:process_long { + time = { check_max( 192.h * task.attempt, 'time' ) } + } + withLabel:process_high_memory { + memory = { check_max( 128.GB * task.attempt, 'memory' ) } + } + +} // Function to slow the increase of the resource multipler -// as attempts are made. The rationale is that some CPUs -// don't need to be increased as fast as memory. +// as attempts are made. The rationale is that the number +// of CPU cores isn't a limiting factor as often as memory. def slow(attempt, factor = 2) { - return Math.ceil( attempt / factor) as int + return Math.ceil( attempt / factor) as int } diff --git a/docs/sage.md b/docs/sage.md index d503b42..1e36fed 100644 --- a/docs/sage.md +++ b/docs/sage.md @@ -11,8 +11,8 @@ This global configuration includes the following tweaks: - Increase the default chunk size for multipart uploads to S3 - Slow down job submission rate to avoid overwhelming any APIs - Define the `check_max()` function, which is missing in Sarek v2 -- (Disabled temporarily) Slow the increase in the number of allocated CPU cores on retries -- (Disabled temporarily) Increase the default time limits because we run pipelines on AWS +- Slow the increase in the number of allocated CPU cores on retries +- Increase the default time limits because we run pipelines on AWS ## Additional information about iGenomes From 179b343bd20995bd48fc57b44a6da07340995025 Mon Sep 17 00:00:00 2001 From: Bruno Grande Date: Wed, 31 Aug 2022 09:18:27 -0700 Subject: [PATCH 3/3] Incorporate resource limits --- conf/sage.config | 3 +++ 1 file changed, 3 insertions(+) diff --git a/conf/sage.config b/conf/sage.config index 615da63..bfe1e09 100644 --- a/conf/sage.config +++ b/conf/sage.config @@ -8,6 +8,9 @@ params { // Leverage us-east-1 mirror of select human and mouse genomes params { igenomes_base = 's3://sage-igenomes/igenomes' + max_memory = '128.GB' + max_cpus = 16 + max_time = '240.h' } // Enable retries globally for certain exit codes