Merge pull request #407 from Sage-Bionetworks-Workflows/bgrande/sage-aws

Improve AWS-related config for Sage profile
2024-11-23 08:49:54 +00:00 · 2022-09-01 13:29:49 -07:00 · 2022-09-01 13:29:49 -07:00 · 47fa890ad7
commit 47fa890ad7
parent 6b83603325 80831358f9
2 changed files with 45 additions and 28 deletions
--- a/conf/sage.config
+++ b/conf/sage.config
@ -1,19 +1,52 @@
 // Config profile metadata
 params {
  config_profile_description = 'The Sage Bionetworks profile'
  config_profile_contact = 'Bruno Grande (@BrunoGrandePhD)'
  config_profile_url = 'https://github.com/Sage-Bionetworks-Workflows'
 }
 // Leverage us-east-1 mirror of select human and mouse genomes
 params {
  igenomes_base = 's3://sage-igenomes/igenomes'
  max_memory    = '128.GB'
  max_cpus      = 16
  max_time      = '240.h'
 }
 // Enable retries globally for certain exit codes
 process {
  // Retry a task only when it exits with one of the listed statuses;
  // any other failure falls back to the 'finish' strategy.
  errorStrategy = { task.exitStatus in [143,137,104,134,139,247] ? 'retry' : 'finish' }
  // Allow up to 5 retries per task
  maxRetries    = 5
  // NOTE(review): '-1' presumably removes the cap on the total number of
  // errors for a process; confirm against the Nextflow documentation
  maxErrors     = '-1'
 }
 // Increase time limit to allow file transfers to finish
 // The default is 12 hours, which results in timeouts
 threadPool.FileTransfer.maxAwait = '24 hour'
 // Configure Nextflow to be more reliable on AWS
 aws {
  // Same region as the us-east-1 iGenomes mirror used by this profile
  region = "us-east-1"
  client {
    // Chunk size in bytes for multipart uploads to S3:
    // 209715200 = 200 * 1024 * 1024 (200 MiB)
    uploadChunkSize = 209715200
  }
 }
 executor {
    name = 'awsbatch'
    // Ensure unlimited queue size on AWS Batch
    queueSize = 100000
    // Slow down the rate at which AWS Batch jobs accumulate in
    // the queue (an attempt to prevent orphaned EBS volumes)
    submitRateLimit = '5 / 1 sec'
 }
 // Adjust default resource allocations (see `../docs/sage.md`)
 process {
  cpus   = { check_max( 1    * slow(task.attempt), 'cpus'   ) }
  memory = { check_max( 6.GB * task.attempt,       'memory' ) }
  time   = { check_max( 24.h * task.attempt,       'time'   ) }
  errorStrategy = { task.exitStatus in [143,137,104,134,139,247] ? 'retry' : 'finish' }
  maxRetries    = 5
  maxErrors     = '-1'
  // Process-specific resource requirements
  withLabel:process_low {
    cpus   = { check_max( 4     * slow(task.attempt),  'cpus'   ) }
@ -37,30 +70,11 @@ process {
    memory = { check_max( 128.GB * task.attempt,   'memory' ) }
  }
  // Preventing Sarek labels from using the actual maximums
  withLabel:memory_max {
    memory = { check_max( 128.GB * task.attempt,   'memory' ) }
  }
  withLabel:cpus_max {
    cpus   = { check_max( 24    * slow(task.attempt), 'cpus'   ) }
  }
 }
 aws {
  region = "us-east-1"
 }
 params {
  igenomes_base = 's3://sage-igenomes/igenomes'
  max_memory    = 500.GB
  max_cpus      = 64
  max_time      = 168.h  // One week
 }
 // Function to slow the increase of the resource multiplier
-// as attempts are made. The rationale is that some CPUs
+// as attempts are made. The rationale is that the number
-// don't need to be increased as fast as memory.
+// of CPU cores isn't a limiting factor as often as memory.
 def slow(attempt, factor = 2) {
 // Divide the attempt number by `factor` (default 2), round up to the
 // nearest whole number and cast to int. With the default factor,
 // attempts 1-2 yield 1, attempts 3-4 yield 2, and so on.
 return Math.ceil( attempt / factor) as int
 }
--- a/docs/sage.md
+++ b/docs/sage.md
@ -5,11 +5,14 @@ To use this custom configuration, run the pipeline with `-profile sage`. This wi
 This global configuration includes the following tweaks:
 - Update the default value for `igenomes_base` to `s3://sage-igenomes`
 - Increase the default time limits because we run pipelines on AWS
 - Enable retries by default when exit codes relate to insufficient memory
 - Allow pending jobs to finish if the number of retries is exhausted
- Slow the increase in the number of allocated CPU cores on retries
+- Increase the amount of time allowed for file transfers
 - Increase the default chunk size for multipart uploads to S3
 - Slow down job submission rate to avoid overwhelming any APIs
 - Define the `check_max()` function, which is missing in Sarek v2
 - Slow the increase in the number of allocated CPU cores on retries
 - Increase the default time limits because we run pipelines on AWS
 ## Additional information about iGenomes