Merge pull request #1700 from SusiJo/gatk_spark

Fix gatk/markduplicatesspark
This commit is contained in:
SusiJo 2022-06-09 11:09:06 +02:00 committed by GitHub
commit c587fd1fe3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 87 additions and 26 deletions

View file

@ -2,7 +2,7 @@ process GATK4_MARKDUPLICATES_SPARK {
tag "$meta.id"
label 'process_high'
conda (params.enable_conda ? "bioconda::gatk4=4.2.3.0" : null)
conda (params.enable_conda ? "bioconda::gatk4=4.2.3.0 conda-forge::openjdk=8.0.312" : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/gatk4:4.2.3.0--hdfd78af_0' :
'broadinstitute/gatk:4.2.3.0' }"
@ -14,8 +14,9 @@ process GATK4_MARKDUPLICATES_SPARK {
path dict
output:
tuple val(meta), path("${prefix}"), emit: output
path "versions.yml" , emit: versions
tuple val(meta), path("${prefix}"), emit: output
tuple val(meta), path("*.metrics"), emit: metrics, optional: true
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
@ -25,6 +26,7 @@ process GATK4_MARKDUPLICATES_SPARK {
prefix = task.ext.prefix ?: "${meta.id}"
def input_list = bam.collect{"--input $it"}.join(' ')
def avail_mem = 3
if (!task.memory) {
log.info '[GATK MarkDuplicatesSpark] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
@ -32,8 +34,6 @@ process GATK4_MARKDUPLICATES_SPARK {
avail_mem = task.memory.giga
}
"""
export SPARK_USER=spark3
gatk --java-options "-Xmx${avail_mem}g" MarkDuplicatesSpark \\
$input_list \\
--output $prefix \\
@ -45,6 +45,7 @@ process GATK4_MARKDUPLICATES_SPARK {
cat <<-END_VERSIONS > versions.yml
"${task.process}":
gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
openjdk: \$(echo \$(java -version 2>&1) | grep version | sed 's/\"//g' | cut -f3 -d ' ')
END_VERSIONS
"""
}

View file

@ -58,3 +58,4 @@ authors:
- "@ajodeh-juma"
- "@FriederikeHanssen"
- "@maxulysse"
- "@SusiJo"

View file

@ -3,26 +3,55 @@
nextflow.enable.dsl = 2
include { GATK4_MARKDUPLICATES_SPARK } from '../../../../modules/gatk4/markduplicatesspark/main.nf'
include { GATK4_MARKDUPLICATES_SPARK as GATK4_MARKDUPLICATES_SPARK_CRAM } from '../../../../modules/gatk4/markduplicatesspark/main.nf'
include { GATK4_MARKDUPLICATES_SPARK as GATK4_MARKDUPLICATES_SPARK_METRICS } from '../../../../modules/gatk4/markduplicatesspark/main.nf'
workflow test_gatk4_markduplicates_spark {
input = [ [ id:'test', single_end:false ], // meta map
file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true)
file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true)
]
fasta = file(params.test_data['homo_sapiens']['genome']['genome_21_fasta'], checkIfExists: true)
fai = file(params.test_data['homo_sapiens']['genome']['genome_21_fasta_fai'], checkIfExists: true)
dict = file(params.test_data['homo_sapiens']['genome']['genome_21_dict'], checkIfExists: true)
fasta = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
fai = file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true)
dict = file(params.test_data['sarscov2']['genome']['genome_dict'], checkIfExists: true)
GATK4_MARKDUPLICATES_SPARK ( input, fasta, fai, dict )
}
// chr 22
workflow test_gatk4_markduplicates_spark_multiple_bams {
input = [ [ id:'test', single_end:false ], // meta map
[ file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true),
file(params.test_data['homo_sapiens']['illumina']['test2_paired_end_sorted_bam'], checkIfExists: true)
[ file(params.test_data['homo_sapiens']['illumina']['test_paired_end_name_sorted_bam'], checkIfExists: true),
file(params.test_data['homo_sapiens']['illumina']['test2_paired_end_name_sorted_bam'], checkIfExists: true)
] ]
fasta = file(params.test_data['homo_sapiens']['genome']['genome_21_fasta'], checkIfExists: true)
fai = file(params.test_data['homo_sapiens']['genome']['genome_21_fasta_fai'], checkIfExists: true)
dict = file(params.test_data['homo_sapiens']['genome']['genome_21_dict'], checkIfExists: true)
fasta = file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)
fai = file(params.test_data['homo_sapiens']['genome']['genome_fasta_fai'], checkIfExists: true)
dict = file(params.test_data['homo_sapiens']['genome']['genome_dict'], checkIfExists: true)
GATK4_MARKDUPLICATES_SPARK ( input, fasta, fai, dict )
}
// chr 22
// Test entry point: runs the module under its GATK4_MARKDUPLICATES_SPARK_CRAM alias
// on two BAM files that share a single meta map. The alias exists so that the test
// config can give this run its own ext.prefix ("${meta.id}.cram") via a withName selector.
workflow test_gatk4_markduplicates_spark_multiple_bams_cram_out {
// Input tuple: [ meta map, [ bam, bam ] ] - both BAMs are passed together as one list.
input = [ [ id:'test', single_end:false ], // meta map
[ file(params.test_data['homo_sapiens']['illumina']['test_paired_end_name_sorted_bam'], checkIfExists: true),
file(params.test_data['homo_sapiens']['illumina']['test2_paired_end_name_sorted_bam'], checkIfExists: true)
] ]
// Reference inputs: the genome_fasta, genome_fasta_fai and genome_dict entries of the
// homo_sapiens test data.
fasta = file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)
fai = file(params.test_data['homo_sapiens']['genome']['genome_fasta_fai'], checkIfExists: true)
dict = file(params.test_data['homo_sapiens']['genome']['genome_dict'], checkIfExists: true)
// Argument order must match the process inputs: reads tuple, then fasta, fai, dict.
GATK4_MARKDUPLICATES_SPARK_CRAM ( input, fasta, fai, dict )
}
// chr 22
// Test entry point: runs the module under its GATK4_MARKDUPLICATES_SPARK_METRICS alias
// on two BAM files that share a single meta map. The alias exists so that the test
// config can attach ext.args = '--metrics-file test.metrics' to this run only.
workflow test_gatk4_markduplicates_spark_multiple_bams_metrics {
// Input tuple: [ meta map, [ bam, bam ] ] - both BAMs are passed together as one list.
input = [ [ id:'test', single_end:false ], // meta map
[ file(params.test_data['homo_sapiens']['illumina']['test_paired_end_name_sorted_bam'], checkIfExists: true),
file(params.test_data['homo_sapiens']['illumina']['test2_paired_end_name_sorted_bam'], checkIfExists: true)
] ]
// Reference inputs: the genome_fasta, genome_fasta_fai and genome_dict entries of the
// homo_sapiens test data.
fasta = file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)
fai = file(params.test_data['homo_sapiens']['genome']['genome_fasta_fai'], checkIfExists: true)
dict = file(params.test_data['homo_sapiens']['genome']['genome_dict'], checkIfExists: true)
// Argument order must match the process inputs: reads tuple, then fasta, fai, dict.
GATK4_MARKDUPLICATES_SPARK_METRICS ( input, fasta, fai, dict )
}

View file

@ -2,4 +2,18 @@ process {
publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
// Default invocation: the module passes ext.prefix verbatim to `--output`, so the
// prefix must carry the file extension; here the output file is "<meta.id>.bam".
withName: GATK4_MARKDUPLICATES_SPARK {
ext.prefix = { "${meta.id}.bam" }
}
// CRAM test alias: same module, but the output file is named "<meta.id>.cram".
// NOTE(review): presumably GATK picks the output format from this extension - confirm.
withName: GATK4_MARKDUPLICATES_SPARK_CRAM {
ext.prefix = { "${meta.id}.cram" }
}
// Metrics test alias: BAM output plus an extra `--metrics-file test.metrics` argument.
// "test.metrics" matches the module's optional `*.metrics` output glob.
// NOTE(review): assumes the module forwards task.ext.args to the gatk command line;
// that line is not visible in this diff - confirm.
withName: GATK4_MARKDUPLICATES_SPARK_METRICS {
ext.args = '--metrics-file test.metrics'
ext.prefix = { "${meta.id}.bam" }
}
}
// override tests/config/nextflow.config
docker.userEmulation = false

View file

@ -1,25 +1,41 @@
- name: gatk4 markduplicates test_gatk4_markduplicates_spark
command: nextflow run tests/modules/gatk4/markduplicatesspark -entry test_gatk4_markduplicates_spark -c tests/config/nextflow.config -c ./tests/modules/gatk4/markduplicatesspark/nextflow.config
- name: gatk4 markduplicatesspark test_gatk4_markduplicates_spark
command: nextflow run ./tests/modules/gatk4/markduplicatesspark -entry test_gatk4_markduplicates_spark -c ./tests/config/nextflow.config -c ./tests/modules/gatk4/markduplicatesspark/nextflow.config
tags:
- gatk4
- gatk4/markduplicatesspark
files:
- path: output/gatk4/test.bai
md5sum: e9c125e82553209933883b4fe2b8d7c2
- path: output/gatk4/test.bam
md5sum: 2efd50b2e6b7fd9bdf242cd9e266cfa9
- path: output/gatk4/test.metrics
md5sum: dc1a09ac6371aab7c50d1a554baa06d3
- path: output/gatk4/versions.yml
- name: gatk4 markduplicates test_gatk4_markduplicates_spark_multiple_bams
command: nextflow run tests/modules/gatk4/markduplicatesspark -entry test_gatk4_markduplicates_spark_multiple_bams -c tests/config/nextflow.config -c ./tests/modules/gatk4/markduplicatesspark/nextflow.config
- name: gatk4 markduplicatesspark test_gatk4_markduplicates_spark_multiple_bams
command: nextflow run ./tests/modules/gatk4/markduplicatesspark -entry test_gatk4_markduplicates_spark_multiple_bams -c ./tests/config/nextflow.config -c ./tests/modules/gatk4/markduplicatesspark/nextflow.config
tags:
- gatk4
- gatk4/markduplicatesspark
files:
- path: output/gatk4/test.bai
md5sum: bad71df9c876e72a5bc0a3e0fd755f92
- path: output/gatk4/test.bam
md5sum: 8187febc6108ffef7f907e89b9c091a4
- path: output/gatk4/test.metrics
md5sum: 898cb0a6616897d8ada90bab53bf0837
- path: output/gatk4/versions.yml
- name: gatk4 markduplicatesspark test_gatk4_markduplicates_spark_multiple_bams_cram_out
command: nextflow run ./tests/modules/gatk4/markduplicatesspark -entry test_gatk4_markduplicates_spark_multiple_bams_cram_out -c ./tests/config/nextflow.config -c ./tests/modules/gatk4/markduplicatesspark/nextflow.config
tags:
- gatk4
- gatk4/markduplicatesspark
files:
- path: output/gatk4/test.cram
md5sum: 2271016de5e4199736598f39d12d7587
- path: output/gatk4/versions.yml
- name: gatk4 markduplicatesspark test_gatk4_markduplicates_spark_multiple_bams_metrics
command: nextflow run ./tests/modules/gatk4/markduplicatesspark -entry test_gatk4_markduplicates_spark_multiple_bams_metrics -c ./tests/config/nextflow.config -c ./tests/modules/gatk4/markduplicatesspark/nextflow.config
tags:
- gatk4
- gatk4/markduplicatesspark
files:
- path: output/gatk4/test.bam
md5sum: 898cb0a6616897d8ada90bab53bf0837
- path: output/gatk4/test.metrics
contains: ["## METRICS CLASS", "org.broadinstitute.hellbender.utils.read.markduplicates.GATKDuplicationMetrics"]
- path: output/gatk4/versions.yml