1
0
Fork 0
mirror of https://github.com/MillironX/nf-configs.git synced 2024-11-23 08:49:54 +00:00

Merge pull request #407 from Sage-Bionetworks-Workflows/bgrande/sage-aws

Improve AWS-related config for Sage profile
This commit is contained in:
Bruno Grande 2022-09-01 13:29:49 -07:00 committed by GitHub
commit 47fa890ad7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 45 additions and 28 deletions

View file

@@ -1,19 +1,52 @@
// Config profile metadata
params { params {
config_profile_description = 'The Sage Bionetworks profile' config_profile_description = 'The Sage Bionetworks profile'
config_profile_contact = 'Bruno Grande (@BrunoGrandePhD)' config_profile_contact = 'Bruno Grande (@BrunoGrandePhD)'
config_profile_url = 'https://github.com/Sage-Bionetworks-Workflows' config_profile_url = 'https://github.com/Sage-Bionetworks-Workflows'
} }
// Leverage us-east-1 mirror of select human and mouse genomes
params {
igenomes_base = 's3://sage-igenomes/igenomes'
max_memory = '128.GB'
max_cpus = 16
max_time = '240.h'
}
// Enable retries globally for certain exit codes
process {
errorStrategy = { task.exitStatus in [143,137,104,134,139,247] ? 'retry' : 'finish' }
maxRetries = 5
maxErrors = '-1'
}
// Increase time limit to allow file transfers to finish
// The default is 12 hours, which results in timeouts
threadPool.FileTransfer.maxAwait = '24 hour'
// Configure Nextflow to be more reliable on AWS
aws {
region = "us-east-1"
client {
uploadChunkSize = 209715200
}
}
executor {
name = 'awsbatch'
// Ensure unlimited queue size on AWS Batch
queueSize = 100000
// Slow down the rate at which AWS Batch jobs accumulate in
// the queue (an attempt to prevent orphaned EBS volumes)
submitRateLimit = '5 / 1 sec'
}
// Adjust default resource allocations (see `../docs/sage.md`)
process { process {
cpus = { check_max( 1 * slow(task.attempt), 'cpus' ) } cpus = { check_max( 1 * slow(task.attempt), 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) } memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 24.h * task.attempt, 'time' ) } time = { check_max( 24.h * task.attempt, 'time' ) }
errorStrategy = { task.exitStatus in [143,137,104,134,139,247] ? 'retry' : 'finish' }
maxRetries = 5
maxErrors = '-1'
// Process-specific resource requirements // Process-specific resource requirements
withLabel:process_low { withLabel:process_low {
cpus = { check_max( 4 * slow(task.attempt), 'cpus' ) } cpus = { check_max( 4 * slow(task.attempt), 'cpus' ) }
@@ -37,32 +70,13 @@ process {
memory = { check_max( 128.GB * task.attempt, 'memory' ) } memory = { check_max( 128.GB * task.attempt, 'memory' ) }
} }
// Preventing Sarek labels from using the actual maximums
withLabel:memory_max {
memory = { check_max( 128.GB * task.attempt, 'memory' ) }
}
withLabel:cpus_max {
cpus = { check_max( 24 * slow(task.attempt), 'cpus' ) }
}
}
aws {
region = "us-east-1"
}
params {
igenomes_base = 's3://sage-igenomes/igenomes'
max_memory = 500.GB
max_cpus = 64
max_time = 168.h // One week
} }
// Function to slow the increase of the resource multiplier // Function to slow the increase of the resource multiplier
// as attempts are made. The rationale is that some CPUs // as attempts are made. The rationale is that the number
// don't need to be increased as fast as memory. // of CPU cores isn't a limiting factor as often as memory.
def slow(attempt, factor = 2) { def slow(attempt, factor = 2) {
return Math.ceil( attempt / factor) as int return Math.ceil( attempt / factor) as int
} }

View file

@@ -5,11 +5,14 @@ To use this custom configuration, run the pipeline with `-profile sage`. This wi
This global configuration includes the following tweaks: This global configuration includes the following tweaks:
- Update the default value for `igenomes_base` to `s3://sage-igenomes` - Update the default value for `igenomes_base` to `s3://sage-igenomes`
- Increase the default time limits because we run pipelines on AWS
- Enable retries by default when exit codes relate to insufficient memory - Enable retries by default when exit codes relate to insufficient memory
- Allow pending jobs to finish if the number of retries is exhausted - Allow pending jobs to finish if the number of retries is exhausted
- Slow the increase in the number of allocated CPU cores on retries - Increase the amount of time allowed for file transfers
- Increase the default chunk size for multipart uploads to S3
- Slow down job submission rate to avoid overwhelming any APIs
- Define the `check_max()` function, which is missing in Sarek v2 - Define the `check_max()` function, which is missing in Sarek v2
- Slow the increase in the number of allocated CPU cores on retries
- Increase the default time limits because we run pipelines on AWS
## Additional information about iGenomes ## Additional information about iGenomes