Improve AWS-related config for Sage profile

parent 6f0d9e6c43
commit 317e5a16cb

2 changed files with 74 additions and 45 deletions

conf/sage.config (110 changes)
conf/sage.config
@@ -1,62 +1,88 @@

+// Config profile metadata
 params {
     config_profile_description = 'The Sage Bionetworks profile'
     config_profile_contact = 'Bruno Grande (@BrunoGrandePhD)'
     config_profile_url = 'https://github.com/Sage-Bionetworks-Workflows'
 }

+// Leverage us-east-1 mirror of select human and mouse genomes
+params {
+    igenomes_base = 's3://sage-igenomes/igenomes'
+}
+
+// Enable retries globally for certain exit codes
 process {
-    cpus = { check_max( 1 * slow(task.attempt), 'cpus' ) }
-    memory = { check_max( 6.GB * task.attempt, 'memory' ) }
-    time = { check_max( 24.h * task.attempt, 'time' ) }
-
     errorStrategy = { task.exitStatus in [143,137,104,134,139,247] ? 'retry' : 'finish' }
     maxRetries = 5
     maxErrors = '-1'
-
-    // Process-specific resource requirements
-    withLabel:process_low {
-        cpus = { check_max( 4 * slow(task.attempt), 'cpus' ) }
-        memory = { check_max( 12.GB * task.attempt, 'memory' ) }
-        time = { check_max( 24.h * task.attempt, 'time' ) }
-    }
-    withLabel:process_medium {
-        cpus = { check_max( 12 * slow(task.attempt), 'cpus' ) }
-        memory = { check_max( 36.GB * task.attempt, 'memory' ) }
-        time = { check_max( 48.h * task.attempt, 'time' ) }
-    }
-    withLabel:process_high {
-        cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) }
-        memory = { check_max( 72.GB * task.attempt, 'memory' ) }
-        time = { check_max( 96.h * task.attempt, 'time' ) }
-    }
-    withLabel:process_long {
-        time = { check_max( 192.h * task.attempt, 'time' ) }
-    }
-    withLabel:process_high_memory {
-        memory = { check_max( 128.GB * task.attempt, 'memory' ) }
-    }
 }

-    // Preventing Sarek labels from using the actual maximums
-    withLabel:memory_max {
-        memory = { check_max( 128.GB * task.attempt, 'memory' ) }
-    }
-    withLabel:cpus_max {
-        cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) }
-    }
-}
+// Increase time limit to allow file transfers to finish
+// The default is 12 hours, which results in timeouts
+threadPool.FileTransfer.maxAwait = '24 hour'

+// Configure Nextflow to be more reliable on AWS
 aws {
     region = "us-east-1"
+    client {
+        uploadChunkSize = 209715200
+    }
+}
+
+executor {
+    name = 'awsbatch'
+    // Ensure unlimited queue size on AWS Batch
+    queueSize = 100000
+    // Slow down the rate at which AWS Batch jobs accumulate in
+    // the queue (an attempt to prevent orphaned EBS volumes)
+    submitRateLimit = '5 / 1 sec'
 }

-params {
-    igenomes_base = 's3://sage-igenomes/igenomes'
-    max_memory = 500.GB
-    max_cpus = 64
-    max_time = 168.h // One week
-}
+// Disabling resource allocation tweaks for now
+//
+// params {
+//     max_memory = 500.GB
+//     max_cpus = 64
+//     max_time = 168.h // One week
+// }
+//
+// process {
+//
+//     cpus = { check_max( 1 * slow(task.attempt), 'cpus' ) }
+//     memory = { check_max( 6.GB * task.attempt, 'memory' ) }
+//     time = { check_max( 24.h * task.attempt, 'time' ) }
+//
+//     // Process-specific resource requirements
+//     withLabel:process_low {
+//         cpus = { check_max( 4 * slow(task.attempt), 'cpus' ) }
+//         memory = { check_max( 12.GB * task.attempt, 'memory' ) }
+//         time = { check_max( 24.h * task.attempt, 'time' ) }
+//     }
+//     withLabel:process_medium {
+//         cpus = { check_max( 12 * slow(task.attempt), 'cpus' ) }
+//         memory = { check_max( 36.GB * task.attempt, 'memory' ) }
+//         time = { check_max( 48.h * task.attempt, 'time' ) }
+//     }
+//     withLabel:process_high {
+//         cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) }
+//         memory = { check_max( 72.GB * task.attempt, 'memory' ) }
+//         time = { check_max( 96.h * task.attempt, 'time' ) }
+//     }
+//     withLabel:process_long {
+//         time = { check_max( 192.h * task.attempt, 'time' ) }
+//     }
+//     withLabel:process_high_memory {
+//         memory = { check_max( 128.GB * task.attempt, 'memory' ) }
+//     }
+//
+//     // Preventing Sarek labels from using the actual maximums
+//     withLabel:memory_max {
+//         memory = { check_max( 128.GB * task.attempt, 'memory' ) }
+//     }
+//     withLabel:cpus_max {
+//         cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) }
+//     }
+//
+// }

 // Function to slow the increase of the resource multiplier
 // as attempts are made. The rationale is that some CPUs
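
Two notes on the values above. `uploadChunkSize = 209715200` is 200 × 1024 × 1024 bytes, i.e. 200 MiB per multipart chunk; since S3 allows at most 10,000 parts per multipart upload, this supports single uploads of up to roughly 10,000 × 200 MiB ≈ 2 TB. Second, the hunk ends at the comment introducing `slow()`, the helper the `cpus` directives call, but its body is cut off in this view. A minimal sketch of a helper matching that description (hypothetical reconstruction, not necessarily the committed code):

    // Hypothetical sketch: divide the retry attempt by a factor before
    // using it as a multiplier, so CPU requests grow more slowly than
    // memory requests across retries.
    // With factor = 2: attempts 1, 2, 3, 4 -> multipliers 1, 1, 2, 2.
    def slow(attempt, factor = 2) {
        return Math.ceil(attempt / factor) as int
    }

Under this sketch, a directive like `cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) }` requests 24 cores on attempts 1 and 2, then 48 on attempts 3 and 4.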
@@ -5,11 +5,14 @@ To use this custom configuration, run the pipeline with `-profile sage`. This wi

 This global configuration includes the following tweaks:

 - Update the default value for `igenomes_base` to `s3://sage-igenomes`
-- Increase the default time limits because we run pipelines on AWS
 - Enable retries by default when exit codes relate to insufficient memory
 - Allow pending jobs to finish if the number of retries is exhausted
-- Slow the increase in the number of allocated CPU cores on retries
+- Increase the amount of time allowed for file transfers
+- Increase the default chunk size for multipart uploads to S3
+- Slow down job submission rate to avoid overwhelming any APIs
 - Define the `check_max()` function, which is missing in Sarek v2
+- (Disabled temporarily) Slow the increase in the number of allocated CPU cores on retries
+- (Disabled temporarily) Increase the default time limits because we run pipelines on AWS

 ## Additional information about iGenomes
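
The documentation references `check_max()`, which nf-core pipelines normally inherit from their `base.config` but Sarek v2 does not define, so this profile must supply its own. A trimmed sketch in the style of the widely used nf-core helper (assumed shape; the exact code is not shown in this diff) that caps each requested resource at the corresponding `params.max_*` value:

    // Sketch of an nf-core-style check_max(): cap a requested resource at
    // the configured params.max_memory / params.max_time / params.max_cpus.
    def check_max(obj, type) {
        try {
            if (type == 'memory') {
                def max = params.max_memory as nextflow.util.MemoryUnit
                return obj.compareTo(max) == 1 ? max : obj
            } else if (type == 'time') {
                def max = params.max_time as nextflow.util.Duration
                return obj.compareTo(max) == 1 ? max : obj
            } else if (type == 'cpus') {
                return Math.min(obj as int, params.max_cpus as int)
            }
        } catch (all) {
            // params.max_* unset or invalid (e.g. while the resource
            // tweaks above are disabled): fall back to the raw request.
        }
        return obj
    }

As the hunk's context line notes, the profile is selected at run time with `-profile sage`, e.g. `nextflow run <pipeline> -profile sage` (the pipeline name here is a placeholder).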