From 317e5a16cb8c769115b0c7554c060d638426b134 Mon Sep 17 00:00:00 2001
From: Bruno Grande <bruno.grande@sagebase.org>
Date: Thu, 25 Aug 2022 16:24:52 -0700
Subject: [PATCH 1/3] Improve AWS-related config for Sage profile

---
 conf/sage.config | 112 +++++++++++++++++++++++++++++------------------
 docs/sage.md     |   7 ++-
 2 files changed, 74 insertions(+), 45 deletions(-)

diff --git a/conf/sage.config b/conf/sage.config
index e5bfa8b..7477d83 100644
--- a/conf/sage.config
+++ b/conf/sage.config
@@ -1,62 +1,88 @@
+// Config profile metadata
 params {
   config_profile_description = 'The Sage Bionetworks profile'
   config_profile_contact = 'Bruno Grande (@BrunoGrandePhD)'
   config_profile_url = 'https://github.com/Sage-Bionetworks-Workflows'
 }
 
+// Leverage us-east-1 mirror of select human and mouse genomes
+params {
+  igenomes_base = 's3://sage-igenomes/igenomes'
+}
+
+// Enable retries globally for certain exit codes
 process {
-
-  cpus   = { check_max( 1    * slow(task.attempt), 'cpus'   ) }
-  memory = { check_max( 6.GB * task.attempt,       'memory' ) }
-  time   = { check_max( 24.h * task.attempt,       'time'   ) }
-
   errorStrategy = { task.exitStatus in [143,137,104,134,139,247] ? 'retry' : 'finish' }
   maxRetries    = 5
   maxErrors     = '-1'
-
-  // Process-specific resource requirements
-  withLabel:process_low {
-    cpus   = { check_max( 4     * slow(task.attempt),  'cpus'   ) }
-    memory = { check_max( 12.GB * task.attempt,        'memory' ) }
-    time   = { check_max( 24.h  * task.attempt,        'time'   ) }
-  }
-  withLabel:process_medium {
-    cpus   = { check_max( 12    * slow(task.attempt), 'cpus'   ) }
-    memory = { check_max( 36.GB * task.attempt,       'memory' ) }
-    time   = { check_max( 48.h  * task.attempt,       'time'   ) }
-  }
-  withLabel:process_high {
-    cpus   = { check_max( 24    * slow(task.attempt), 'cpus'   ) }
-    memory = { check_max( 72.GB * task.attempt,       'memory' ) }
-    time   = { check_max( 96.h  * task.attempt,       'time'   ) }
-  }
-  withLabel:process_long {
-    time   = { check_max( 192.h  * task.attempt,   'time'   ) }
-  }
-  withLabel:process_high_memory {
-    memory = { check_max( 128.GB * task.attempt,   'memory' ) }
-  }
-
-  // Preventing Sarek labels from using the actual maximums
-  withLabel:memory_max {
-    memory = { check_max( 128.GB * task.attempt,   'memory' ) }
-  }
-  withLabel:cpus_max {
-    cpus   = { check_max( 24    * slow(task.attempt), 'cpus'   ) }
-  }
-
 }
 
+// Increase time limit to allow file transfers to finish
+// The default is 12 hours, which results in timeouts
+threadPool.FileTransfer.maxAwait = '24 hour'
+
+// Configure Nextflow to be more reliable on AWS
 aws {
   region = "us-east-1"
+  client {
+    uploadChunkSize = 209715200  // 200 MiB (200 * 1024 * 1024 bytes) per multipart upload chunk
+  }
+}
+executor {
+    name = 'awsbatch'
+    // Effectively unlimited queue size on AWS Batch (a large finite cap)
+    queueSize = 100000
+    // Slow down the rate at which AWS Batch jobs accumulate in
+    // the queue (an attempt to prevent orphaned EBS volumes)
+    submitRateLimit = '5 / 1 sec'
 }
 
-params {
-  igenomes_base = 's3://sage-igenomes/igenomes'
-  max_memory    = 500.GB
-  max_cpus      = 64
-  max_time      = 168.h  // One week
-}
+// Disabling resource allocation tweaks for now
+//
+// params {
+//   max_memory    = 500.GB
+//   max_cpus      = 64
+//   max_time      = 168.h  // One week
+// }
+//
+// process {
+//
+//   cpus   = { check_max( 1    * slow(task.attempt), 'cpus'   ) }
+//   memory = { check_max( 6.GB * task.attempt,       'memory' ) }
+//   time   = { check_max( 24.h * task.attempt,       'time'   ) }
+//
+//   // Process-specific resource requirements
+//   withLabel:process_low {
+//     cpus   = { check_max( 4     * slow(task.attempt),  'cpus'   ) }
+//     memory = { check_max( 12.GB * task.attempt,        'memory' ) }
+//     time   = { check_max( 24.h  * task.attempt,        'time'   ) }
+//   }
+//   withLabel:process_medium {
+//     cpus   = { check_max( 12    * slow(task.attempt), 'cpus'   ) }
+//     memory = { check_max( 36.GB * task.attempt,       'memory' ) }
+//     time   = { check_max( 48.h  * task.attempt,       'time'   ) }
+//   }
+//   withLabel:process_high {
+//     cpus   = { check_max( 24    * slow(task.attempt), 'cpus'   ) }
+//     memory = { check_max( 72.GB * task.attempt,       'memory' ) }
+//     time   = { check_max( 96.h  * task.attempt,       'time'   ) }
+//   }
+//   withLabel:process_long {
+//     time   = { check_max( 192.h  * task.attempt,   'time'   ) }
+//   }
+//   withLabel:process_high_memory {
+//     memory = { check_max( 128.GB * task.attempt,   'memory' ) }
+//   }
+//
+//   // Preventing Sarek labels from using the actual maximums
+//   withLabel:memory_max {
+//     memory = { check_max( 128.GB * task.attempt,   'memory' ) }
+//   }
+//   withLabel:cpus_max {
+//     cpus   = { check_max( 24    * slow(task.attempt), 'cpus'   ) }
+//   }
+//
+// }
 
 // Function to slow the increase of the resource multipler
 // as attempts are made. The rationale is that some CPUs
diff --git a/docs/sage.md b/docs/sage.md
index 133ccec..d503b42 100644
--- a/docs/sage.md
+++ b/docs/sage.md
@@ -5,11 +5,14 @@ To use this custom configuration, run the pipeline with `-profile sage`. This wi
 This global configuration includes the following tweaks:
 
 - Update the default value for `igenomes_base` to `s3://sage-igenomes`
-- Increase the default time limits because we run pipelines on AWS
 - Enable retries by default when exit codes relate to insufficient memory
 - Allow pending jobs to finish if the number of retries are exhausted
-- Slow the increase in the number of allocated CPU cores on retries
+- Increase the amount of time allowed for file transfers
+- Increase the default chunk size for multipart uploads to S3
+- Slow down job submission rate to avoid overwhelming any APIs
 - Define the `check_max()` function, which is missing in Sarek v2
+- (Disabled temporarily) Slow the increase in the number of allocated CPU cores on retries
+- (Disabled temporarily) Increase the default time limits because we run pipelines on AWS
 
 ## Additional information about iGenomes
 

From c8837235591ad4925fcefcb4f6b96036a14cd53b Mon Sep 17 00:00:00 2001
From: Bruno Grande <bruno.grande@sagebase.org>
Date: Wed, 31 Aug 2022 09:10:30 -0700
Subject: [PATCH 2/3] Simplify resource adjustments

---
 conf/sage.config | 83 ++++++++++++++++++++----------------------------
 docs/sage.md     |  4 +--
 2 files changed, 36 insertions(+), 51 deletions(-)

diff --git a/conf/sage.config b/conf/sage.config
index 7477d83..615da63 100644
--- a/conf/sage.config
+++ b/conf/sage.config
@@ -37,58 +37,43 @@ executor {
     submitRateLimit = '5 / 1 sec'
 }
 
-// Disabling resource allocation tweaks for now
-//
-// params {
-//   max_memory    = 500.GB
-//   max_cpus      = 64
-//   max_time      = 168.h  // One week
-// }
-//
-// process {
-//
-//   cpus   = { check_max( 1    * slow(task.attempt), 'cpus'   ) }
-//   memory = { check_max( 6.GB * task.attempt,       'memory' ) }
-//   time   = { check_max( 24.h * task.attempt,       'time'   ) }
-//
-//   // Process-specific resource requirements
-//   withLabel:process_low {
-//     cpus   = { check_max( 4     * slow(task.attempt),  'cpus'   ) }
-//     memory = { check_max( 12.GB * task.attempt,        'memory' ) }
-//     time   = { check_max( 24.h  * task.attempt,        'time'   ) }
-//   }
-//   withLabel:process_medium {
-//     cpus   = { check_max( 12    * slow(task.attempt), 'cpus'   ) }
-//     memory = { check_max( 36.GB * task.attempt,       'memory' ) }
-//     time   = { check_max( 48.h  * task.attempt,       'time'   ) }
-//   }
-//   withLabel:process_high {
-//     cpus   = { check_max( 24    * slow(task.attempt), 'cpus'   ) }
-//     memory = { check_max( 72.GB * task.attempt,       'memory' ) }
-//     time   = { check_max( 96.h  * task.attempt,       'time'   ) }
-//   }
-//   withLabel:process_long {
-//     time   = { check_max( 192.h  * task.attempt,   'time'   ) }
-//   }
-//   withLabel:process_high_memory {
-//     memory = { check_max( 128.GB * task.attempt,   'memory' ) }
-//   }
-//
-//   // Preventing Sarek labels from using the actual maximums
-//   withLabel:memory_max {
-//     memory = { check_max( 128.GB * task.attempt,   'memory' ) }
-//   }
-//   withLabel:cpus_max {
-//     cpus   = { check_max( 24    * slow(task.attempt), 'cpus'   ) }
-//   }
-//
-// }
+// Adjust default resource allocations (see `../docs/sage.md`)
+process {
+  // Defaults for all processes: memory/time scale with task.attempt, CPUs with slow(task.attempt)
+  cpus   = { check_max( 1    * slow(task.attempt), 'cpus'   ) }
+  memory = { check_max( 6.GB * task.attempt,       'memory' ) }
+  time   = { check_max( 24.h * task.attempt,       'time'   ) }
+
+  // Process-specific resource requirements
+  withLabel:process_low {
+    cpus   = { check_max( 4     * slow(task.attempt),  'cpus'   ) }
+    memory = { check_max( 12.GB * task.attempt,        'memory' ) }
+    time   = { check_max( 24.h  * task.attempt,        'time'   ) }
+  }
+  withLabel:process_medium {
+    cpus   = { check_max( 12    * slow(task.attempt), 'cpus'   ) }
+    memory = { check_max( 36.GB * task.attempt,       'memory' ) }
+    time   = { check_max( 48.h  * task.attempt,       'time'   ) }
+  }
+  withLabel:process_high {
+    cpus   = { check_max( 24    * slow(task.attempt), 'cpus'   ) }  // NOTE(review): 24 exceeds params.max_cpus (16); presumably capped by check_max() - confirm
+    memory = { check_max( 72.GB * task.attempt,       'memory' ) }
+    time   = { check_max( 96.h  * task.attempt,       'time'   ) }
+  }
+  withLabel:process_long {
+    time   = { check_max( 192.h  * task.attempt,   'time'   ) }
+  }
+  withLabel:process_high_memory {
+    memory = { check_max( 128.GB * task.attempt,   'memory' ) }
+  }
+
+}
 
 // Function to slow the increase of the resource multipler
-// as attempts are made. The rationale is that some CPUs
-// don't need to be increased as fast as memory.
+// as attempts are made. The rationale is that the number
+// of CPU cores isn't a limiting factor as often as memory.
 def slow(attempt, factor = 2) {
-  return Math.ceil( attempt / factor) as int
+  return Math.ceil( attempt / factor) as int  // e.g. attempts 1,2,3,4 -> 1,1,2,2 when factor = 2
 }
 
 
diff --git a/docs/sage.md b/docs/sage.md
index d503b42..1e36fed 100644
--- a/docs/sage.md
+++ b/docs/sage.md
@@ -11,8 +11,8 @@ This global configuration includes the following tweaks:
 - Increase the default chunk size for multipart uploads to S3
 - Slow down job submission rate to avoid overwhelming any APIs
 - Define the `check_max()` function, which is missing in Sarek v2
-- (Disabled temporarily) Slow the increase in the number of allocated CPU cores on retries
-- (Disabled temporarily) Increase the default time limits because we run pipelines on AWS
+- Slow the increase in the number of allocated CPU cores on retries
+- Increase the default time limits because we run pipelines on AWS
 
 ## Additional information about iGenomes
 

From 179b343bd20995bd48fc57b44a6da07340995025 Mon Sep 17 00:00:00 2001
From: Bruno Grande <bruno.grande@sagebase.org>
Date: Wed, 31 Aug 2022 09:18:27 -0700
Subject: [PATCH 3/3] Incorporate resource limits

---
 conf/sage.config | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/conf/sage.config b/conf/sage.config
index 615da63..bfe1e09 100644
--- a/conf/sage.config
+++ b/conf/sage.config
@@ -8,6 +8,9 @@ params {
 // Leverage us-east-1 mirror of select human and mouse genomes
 params {
   igenomes_base = 's3://sage-igenomes/igenomes'
+  max_memory    = '128.GB'  // Resource ceilings (separate from the iGenomes mirror setting above)
+  max_cpus      = 16        // presumably enforced through check_max() - TODO confirm
+  max_time      = '240.h'   // 240 hours = 10 days
 }
 
 // Enable retries globally for certain exit codes