Merge pull request #407 from Sage-Bionetworks-Workflows/bgrande/sage-aws

Improve AWS-related config for Sage profile
2024-12-23 10:58:16 +00:00 · 2022-09-01 13:29:49 -07:00 · 2022-09-01 13:29:49 -07:00 · 47fa890ad7
commit 47fa890ad7
parent 6b83603325 80831358f9
2 changed files with 45 additions and 28 deletions
--- a/conf/sage.config
+++ b/conf/sage.config
@@ -1,19 +1,52 @@
+// Config profile metadata
 params {
  config_profile_description = 'The Sage Bionetworks profile'
  config_profile_contact = 'Bruno Grande (@BrunoGrandePhD)'
  config_profile_url = 'https://github.com/Sage-Bionetworks-Workflows'
 }

+// Leverage us-east-1 mirror of select human and mouse genomes
+params {
+  igenomes_base = 's3://sage-igenomes/igenomes'
+  max_memory    = '128.GB'
+  max_cpus      = 16
+  max_time      = '240.h'
+}
+
+// Enable retries globally for certain exit codes
+process {
+  errorStrategy = { task.exitStatus in [143,137,104,134,139,247] ? 'retry' : 'finish' }
+  maxRetries    = 5
+  maxErrors     = '-1'
+}
+
+// Increase time limit to allow file transfers to finish
+// The default is 12 hours, which results in timeouts
+threadPool.FileTransfer.maxAwait = '24 hour'
+
+// Configure Nextflow to be more reliable on AWS
+aws {
+  region = "us-east-1"
+  client {
+    uploadChunkSize = 209715200
+  }
+}
+executor {
+    name = 'awsbatch'
+    // Set a very large queue size so it is effectively unlimited on AWS Batch
+    queueSize = 100000
+    // Slow down the rate at which AWS Batch jobs accumulate in
+    // the queue (an attempt to prevent orphaned EBS volumes)
+    submitRateLimit = '5 / 1 sec'
+}
+
+// Adjust default resource allocations (see `../docs/sage.md`)
 process {

  cpus   = { check_max( 1    * slow(task.attempt), 'cpus'   ) }
  memory = { check_max( 6.GB * task.attempt,       'memory' ) }
  time   = { check_max( 24.h * task.attempt,       'time'   ) }

-  errorStrategy = { task.exitStatus in [143,137,104,134,139,247] ? 'retry' : 'finish' }
-  maxRetries    = 5
-  maxErrors     = '-1'
-
  // Process-specific resource requirements
  withLabel:process_low {
    cpus   = { check_max( 4     * slow(task.attempt),  'cpus'   ) }
@@ -37,32 +70,13 @@ process {
    memory = { check_max( 128.GB * task.attempt,   'memory' ) }
  }

-  // Preventing Sarek labels from using the actual maximums
-  withLabel:memory_max {
-    memory = { check_max( 128.GB * task.attempt,   'memory' ) }
-  }
-  withLabel:cpus_max {
-    cpus   = { check_max( 24    * slow(task.attempt), 'cpus'   ) }
-  }
-
-}
-
-aws {
-  region = "us-east-1"
-}
-
-params {
-  igenomes_base = 's3://sage-igenomes/igenomes'
-  max_memory    = 500.GB
-  max_cpus      = 64
-  max_time      = 168.h  // One week
 }

 // Function to slow the increase of the resource multiplier
-// as attempts are made. The rationale is that some CPUs
-// don't need to be increased as fast as memory.
+// as attempts are made. The rationale is that the number
+// of CPU cores isn't a limiting factor as often as memory.
 def slow(attempt, factor = 2) {
-  return Math.ceil( attempt / factor) as int
+ return Math.ceil( attempt / factor) as int
 }


--- a/docs/sage.md
+++ b/docs/sage.md
@@ -5,11 +5,14 @@ To use this custom configuration, run the pipeline with `-profile sage`. This wi
 This global configuration includes the following tweaks:

 - Update the default value for `igenomes_base` to `s3://sage-igenomes/igenomes`
-- Increase the default time limits because we run pipelines on AWS
 - Enable retries by default when exit codes relate to insufficient memory
 - Allow pending jobs to finish if the number of retries is exhausted
-- Slow the increase in the number of allocated CPU cores on retries
+- Increase the amount of time allowed for file transfers
+- Increase the default chunk size for multipart uploads to S3
+- Slow down job submission rate to avoid overwhelming any APIs
 - Define the `check_max()` function, which is missing in Sarek v2
+- Slow the increase in the number of allocated CPU cores on retries
+- Increase the default time limits because we run pipelines on AWS

 ## Additional information about iGenomes