From 317e5a16cb8c769115b0c7554c060d638426b134 Mon Sep 17 00:00:00 2001
From: Bruno Grande
Date: Thu, 25 Aug 2022 16:24:52 -0700
Subject: [PATCH] Improve AWS-related config for Sage profile

---
 conf/sage.config | 112 +++++++++++++++++++++++++++++------------------
 docs/sage.md     |   7 ++-
 2 files changed, 74 insertions(+), 45 deletions(-)

diff --git a/conf/sage.config b/conf/sage.config
index e5bfa8b..7477d83 100644
--- a/conf/sage.config
+++ b/conf/sage.config
@@ -1,62 +1,88 @@
+// Config profile metadata
 params {
     config_profile_description = 'The Sage Bionetworks profile'
     config_profile_contact = 'Bruno Grande (@BrunoGrandePhD)'
     config_profile_url = 'https://github.com/Sage-Bionetworks-Workflows'
 }
 
+// Leverage us-east-1 mirror of select human and mouse genomes
+params {
+    igenomes_base = 's3://sage-igenomes/igenomes'
+}
+
+// Enable retries globally for certain exit codes
 process {
-
-    cpus = { check_max( 1 * slow(task.attempt), 'cpus' ) }
-    memory = { check_max( 6.GB * task.attempt, 'memory' ) }
-    time = { check_max( 24.h * task.attempt, 'time' ) }
-
     errorStrategy = { task.exitStatus in [143,137,104,134,139,247] ? 'retry' : 'finish' }
     maxRetries = 5
     maxErrors = '-1'
-
-    // Process-specific resource requirements
-    withLabel:process_low {
-        cpus = { check_max( 4 * slow(task.attempt), 'cpus' ) }
-        memory = { check_max( 12.GB * task.attempt, 'memory' ) }
-        time = { check_max( 24.h * task.attempt, 'time' ) }
-    }
-    withLabel:process_medium {
-        cpus = { check_max( 12 * slow(task.attempt), 'cpus' ) }
-        memory = { check_max( 36.GB * task.attempt, 'memory' ) }
-        time = { check_max( 48.h * task.attempt, 'time' ) }
-    }
-    withLabel:process_high {
-        cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) }
-        memory = { check_max( 72.GB * task.attempt, 'memory' ) }
-        time = { check_max( 96.h * task.attempt, 'time' ) }
-    }
-    withLabel:process_long {
-        time = { check_max( 192.h * task.attempt, 'time' ) }
-    }
-    withLabel:process_high_memory {
-        memory = { check_max( 128.GB * task.attempt, 'memory' ) }
-    }
-
-    // Preventing Sarek labels from using the actual maximums
-    withLabel:memory_max {
-        memory = { check_max( 128.GB * task.attempt, 'memory' ) }
-    }
-    withLabel:cpus_max {
-        cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) }
-    }
-
 }
 
+// Increase time limit to allow file transfers to finish
+// The default is 12 hours, which results in timeouts
+threadPool.FileTransfer.maxAwait = '24 hour'
+
+// Configure Nextflow to be more reliable on AWS
 aws {
     region = "us-east-1"
+    client {
+        uploadChunkSize = 209715200
+    }
+}
+executor {
+    name = 'awsbatch'
+    // Ensure unlimited queue size on AWS Batch
+    queueSize = 100000
+    // Slow down the rate at which AWS Batch jobs accumulate in
+    // the queue (an attempt to prevent orphaned EBS volumes)
+    submitRateLimit = '5 / 1 sec'
 }
 
-params {
-    igenomes_base = 's3://sage-igenomes/igenomes'
-    max_memory = 500.GB
-    max_cpus = 64
-    max_time = 168.h // One week
-}
+// Disabling resource allocation tweaks for now
+//
+// params {
+//     max_memory = 500.GB
+//     max_cpus = 64
+//     max_time = 168.h // One week
+// }
+//
+// process {
+//
+//     cpus = { check_max( 1 * slow(task.attempt), 'cpus' ) }
+//     memory = { check_max( 6.GB * task.attempt, 'memory' ) }
+//     time = { check_max( 24.h * task.attempt, 'time' ) }
+//
+//     // Process-specific resource requirements
+//     withLabel:process_low {
+//         cpus = { check_max( 4 * slow(task.attempt), 'cpus' ) }
+//         memory = { check_max( 12.GB * task.attempt, 'memory' ) }
+//         time = { check_max( 24.h * task.attempt, 'time' ) }
+//     }
+//     withLabel:process_medium {
+//         cpus = { check_max( 12 * slow(task.attempt), 'cpus' ) }
+//         memory = { check_max( 36.GB * task.attempt, 'memory' ) }
+//         time = { check_max( 48.h * task.attempt, 'time' ) }
+//     }
+//     withLabel:process_high {
+//         cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) }
+//         memory = { check_max( 72.GB * task.attempt, 'memory' ) }
+//         time = { check_max( 96.h * task.attempt, 'time' ) }
+//     }
+//     withLabel:process_long {
+//         time = { check_max( 192.h * task.attempt, 'time' ) }
+//     }
+//     withLabel:process_high_memory {
+//         memory = { check_max( 128.GB * task.attempt, 'memory' ) }
+//     }
+//
+//     // Preventing Sarek labels from using the actual maximums
+//     withLabel:memory_max {
+//         memory = { check_max( 128.GB * task.attempt, 'memory' ) }
+//     }
+//     withLabel:cpus_max {
+//         cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) }
+//     }
+//
+// }
 
 // Function to slow the increase of the resource multipler
 // as attempts are made. The rationale is that some CPUs
diff --git a/docs/sage.md b/docs/sage.md
index 133ccec..d503b42 100644
--- a/docs/sage.md
+++ b/docs/sage.md
@@ -5,11 +5,14 @@ To use this custom configuration, run the pipeline with `-profile sage`. This wi
 This global configuration includes the following tweaks:
 
 - Update the default value for `igenomes_base` to `s3://sage-igenomes`
-- Increase the default time limits because we run pipelines on AWS
 - Enable retries by default when exit codes relate to insufficient memory
 - Allow pending jobs to finish if the number of retries are exhausted
-- Slow the increase in the number of allocated CPU cores on retries
+- Increase the amount of time allowed for file transfers
+- Increase the default chunk size for multipart uploads to S3
+- Slow down job submission rate to avoid overwhelming any APIs
 - Define the `check_max()` function, which is missing in Sarek v2
+- (Disabled temporarily) Slow the increase in the number of allocated CPU cores on retries
+- (Disabled temporarily) Increase the default time limits because we run pipelines on AWS
 
 ## Additional information about iGenomes
 
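The commented-out resource tweaks above still reference a `slow()` helper, and the hunk's trailing context shows only the start of the comment describing it ("Function to slow the increase of the resource multipler as attempts are made"). The function body lies outside this hunk, so the snippet below is only a minimal sketch consistent with that description; it is not taken from the patch, and the actual definition in `conf/sage.config` may differ.

```groovy
// Hypothetical sketch (not part of the patch): slow the growth of the retry
// multiplier so that resource requests scale up only every `factor` attempts,
// e.g. attempts 1-2 -> 1x, attempts 3-4 -> 2x, attempts 5-6 -> 3x, ...
def slow(attempt, factor = 2) {
    return Math.ceil(attempt / factor) as int
}
```

Used as `slow(task.attempt)` inside the `check_max(...)` calls, a helper like this keeps CPU requests from doubling on every retry the way the memory requests do.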