nf-configs/conf/sage.config

// Metadata identifying this config profile: description, contact and URL
params.config_profile_description = 'The Sage Bionetworks profile'
params.config_profile_contact     = 'Bruno Grande (@BrunoGrandePhD)'
params.config_profile_url         = 'https://github.com/Sage-Bionetworks-Workflows'

// Leverage the us-east-1 mirror of select human and mouse genomes
params.igenomes_base = 's3://sage-igenomes/igenomes'

// Upper bounds that check_max() applies to per-process resource requests
params.max_memory = '128.GB'
params.max_cpus   = 16
params.max_time   = '240.h'

// Global retry policy: a task is retried (up to 5 times) when it exits
// with one of the listed statuses; any other exit status falls back to
// the 'finish' error strategy.
process.errorStrategy = { (task.exitStatus in [143, 137, 104, 134, 139, 247]) ? 'retry' : 'finish' }
process.maxRetries    = 5
process.maxErrors     = '-1'

// Raise the maximum time Nextflow waits on the file-transfer thread pool
// to 24 hours so that long-running file transfers are allowed to finish.
// The default is 12 hours, which results in timeouts.
threadPool.FileTransfer.maxAwait = '24 hour'

// Configure Nextflow to be more reliable on AWS
aws.region = "us-east-1"

// S3 client: upload in chunks of 209715200 (200 * 1024 * 1024) using at most 4 threads
aws.client.uploadChunkSize  = 209715200
aws.client.uploadMaxThreads = 4

// AWS Batch transfers: one transfer at a time, up to 5 attempts each,
// waiting 120 seconds between attempts
aws.batch.maxParallelTransfers = 1
aws.batch.maxTransferAttempts  = 5
aws.batch.delayBetweenAttempts = '120 sec'
// Run tasks on AWS Batch
executor.name = 'awsbatch'
// Cap the executor queue size at 500
executor.queueSize = 500
// Throttle submissions to 5 per second to slow down the rate at which
// AWS Batch jobs accumulate in the queue (an attempt to prevent
// orphaned EBS volumes)
executor.submitRateLimit = '5 / 1 sec'

// Adjust default resource allocations (see `../docs/sage.md`)
//
// Every directive below is a closure evaluated per task attempt:
//   - memory and time grow linearly with `task.attempt`
//   - cpus grow with `slow(task.attempt)`, i.e. more slowly than memory/time
//   - `check_max` caps each request at the matching `params.max_*` value
process {

  // Baseline requests: 1 CPU, 6 GB of memory and 24 hours on the first attempt
  cpus   = { check_max( 1    * slow(task.attempt), 'cpus'   ) }
  memory = { check_max( 6.GB * task.attempt,       'memory' ) }
  time   = { check_max( 24.h * task.attempt,       'time'   ) }

  // Process-specific resource requirements
  withLabel:process_low {
    cpus   = { check_max( 4     * slow(task.attempt),  'cpus'   ) }
    memory = { check_max( 12.GB * task.attempt,        'memory' ) }
    time   = { check_max( 24.h  * task.attempt,        'time'   ) }
  }
  withLabel:process_medium {
    cpus   = { check_max( 12    * slow(task.attempt), 'cpus'   ) }
    memory = { check_max( 36.GB * task.attempt,       'memory' ) }
    time   = { check_max( 48.h  * task.attempt,       'time'   ) }
  }
  // NOTE: the 24-CPU base request exceeds the `max_cpus = 16` set in this
  // profile, so `check_max` caps it at 16 unless `max_cpus` is raised.
  withLabel:process_high {
    cpus   = { check_max( 24    * slow(task.attempt), 'cpus'   ) }
    memory = { check_max( 72.GB * task.attempt,       'memory' ) }
    time   = { check_max( 96.h  * task.attempt,       'time'   ) }
  }
  // Only the time limit is set for this label
  withLabel:process_long {
    time   = { check_max( 192.h  * task.attempt,   'time'   ) }
  }
  // Only memory is set for this label. The 128 GB base request already
  // equals this profile's `max_memory`, so retries cannot grow it unless
  // `max_memory` is raised.
  withLabel:process_high_memory {
    memory = { check_max( 128.GB * task.attempt,   'memory' ) }
  }

}

// Function to slow the increase of the resource multiplier
// as attempts are made. The rationale is that the number
// of CPU cores isn't a limiting factor as often as memory.
//
// Returns ceil(attempt / factor) as an int; with the default
// factor of 2, attempts 1-2 yield 1, attempts 3-4 yield 2, etc.
def slow(attempt, factor = 2) {
    def ratio = attempt / factor
    def roundedUp = Math.ceil(ratio)
    return roundedUp as int
}


// Function to ensure that resource requirements don't go
// beyond a maximum limit (copied here for Sarek v2)
//
// Arguments:
//   obj  - the requested amount (a memory value, a duration or a CPU count)
//   type - which limit to apply: 'memory', 'time' or 'cpus'
//
// Returns the smaller of `obj` and the matching `params.max_*` value.
// If the configured maximum cannot be interpreted, or `type` is not
// recognised, an error is printed and `obj` is returned unchanged.
def check_max(obj, type) {
    if (type == 'memory') {
        try {
            def limit = params.max_memory as nextflow.util.MemoryUnit
            // compareTo only guarantees the sign of its result, so test
            // for "> 0" rather than for the exact value 1
            return obj.compareTo(limit) > 0 ? limit : obj
        } catch (all) {
            println "   ### ERROR ###   Max memory '${params.max_memory}' is not valid! Using default value: $obj"
            return obj
        }
    } else if (type == 'time') {
        try {
            def limit = params.max_time as nextflow.util.Duration
            // Same sign-based comparison as for memory
            return obj.compareTo(limit) > 0 ? limit : obj
        } catch (all) {
            println "   ### ERROR ###   Max time '${params.max_time}' is not valid! Using default value: $obj"
            return obj
        }
    } else if (type == 'cpus') {
        try {
            return Math.min( obj, params.max_cpus as int )
        } catch (all) {
            println "   ### ERROR ###   Max cpus '${params.max_cpus}' is not valid! Using default value: $obj"
            return obj
        }
    } else {
        // Previously an unknown type fell through and silently returned
        // null; report it and keep the requested value, consistent with
        // the other error branches
        println "   ### ERROR ###   Resource type '${type}' is not valid! Using default value: $obj"
        return obj
    }
}