New module: elprep filter (#1524)

* first commit

* syntax fix

* fix input

* output sam during test for md5sum

* replace md5sum with contains

* add new test data, add extra in/outputs

* cli fixes

* fix outputs

* Update modules/elprep/filter/main.nf

Co-authored-by: James A. Fellows Yates <jfy133@gmail.com>

* Update modules/elprep/filter/meta.yml

Co-authored-by: James A. Fellows Yates <jfy133@gmail.com>

* Update modules/elprep/filter/meta.yml

Co-authored-by: James A. Fellows Yates <jfy133@gmail.com>

* fix suggestions by @jfy133

* Bit more verbose explanation for bool vals

* define variables

* fix prettier

Co-authored-by: James A. Fellows Yates <jfy133@gmail.com>
This commit is contained in:
Matthias De Smet 2022-04-22 11:08:03 +02:00 committed by GitHub
parent 90b203d3e9
commit 9e3daae8ef
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 240 additions and 0 deletions

View file

@ -0,0 +1,89 @@
process ELPREP_FILTER {
tag "$meta.id"
label 'process_high'
conda (params.enable_conda ? "bioconda::elprep=5.1.2" : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/elprep:5.1.2--he881be0_0':
'quay.io/biocontainers/elprep:5.1.2--he881be0_0' }"
input:
tuple val(meta), path(bam)
val(run_haplotypecaller)
val(run_bqsr)
path(reference_sequences)
path(filter_regions_bed)
path(reference_elfasta)
path(known_sites_elsites)
path(target_regions_bed)
path(intermediate_bqsr_tables)
val(bqsr_tables_only)
val(get_activity_profile)
val(get_assembly_regions)
output:
tuple val(meta), path("**.{bam,sam}") ,emit: bam
tuple val(meta), path("*.metrics.txt") ,optional: true, emit: metrics
tuple val(meta), path("*.recall") ,optional: true, emit: recall
tuple val(meta), path("*.vcf.gz") ,optional: true, emit: gvcf
tuple val(meta), path("*.table") ,optional: true, emit: table
tuple val(meta), path("*.activity_profile.igv") ,optional: true, emit: activity_profile
tuple val(meta), path("*.assembly_regions.igv") ,optional: true, emit: assembly_regions
path "versions.yml" ,emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def suffix = args.contains("--output-type sam") ? "sam" : "bam"
// filter args
def reference_sequences_cmd = reference_sequences ? " --replace-reference-sequences ${reference_sequences}" : ""
def filter_regions_cmd = filter_regions_bed ? " --filter-non-overlapping-reads ${filter_regions_bed}" : ""
// markdup args
def markdup_cmd = args.contains("--mark-duplicates") ? " --mark-optical-duplicates ${prefix}.metrics.txt": ""
// variant calling args
def haplotyper_cmd = run_haplotypecaller ? " --haplotypecaller ${prefix}.g.vcf.gz": ""
def fasta_cmd = reference_elfasta ? " --reference ${reference_elfasta}": ""
def known_sites_cmd = known_sites_elsites ? " --known-sites ${known_sites_elsites}": ""
def target_regions_cmd = target_regions_bed ? " --target-regions ${target_regions_bed}": ""
// bqsr args
def bqsr_cmd = run_bqsr ? " --bqsr ${prefix}.recall": ""
def bqsr_tables_only_cmd = bqsr_tables_only ? " --bqsr-tables-only ${prefix}.table": ""
def intermediate_bqsr_cmd = intermediate_bqsr_tables ? " --bqsr-apply .": ""
// misc
def activity_profile_cmd = get_activity_profile ? " --activity-profile ${prefix}.activity_profile.igv": ""
def assembly_regions_cmd = get_assembly_regions ? " --assembly-regions ${prefix}.assembly_regions.igv": ""
"""
elprep filter ${bam} ${prefix}.${suffix} \\
${reference_sequences_cmd} \\
${filter_regions_cmd} \\
${markdup_cmd} \\
${haplotyper_cmd} \\
${fasta_cmd} \\
${known_sites_cmd} \\
${target_regions_cmd} \\
${bqsr_cmd} \\
${bqsr_tables_only_cmd} \\
${intermediate_bqsr_cmd} \\
${activity_profile_cmd} \\
${assembly_regions_cmd} \\
--nr-of-threads ${task.cpus} \\
$args
cat <<-END_VERSIONS > versions.yml
"${task.process}":
elprep: \$(elprep 2>&1 | head -n2 | tail -n1 |sed 's/^.*version //;s/ compiled.*\$//')
END_VERSIONS
"""
}

View file

@ -0,0 +1,106 @@
name: "elprep_filter"
description: "Filter, sort and markdup sam/bam files, with optional BQSR and variant calling."
keywords:
- sort
- bam
- sam
- filter
- variant calling
tools:
- "elprep":
description: "elPrep is a high-performance tool for preparing .sam/.bam files for variant calling in sequencing pipelines. It can be used as a drop-in replacement for SAMtools/Picard/GATK4."
homepage: "https://github.com/ExaScience/elprep"
documentation: "https://github.com/ExaScience/elprep"
tool_dev_url: "https://github.com/ExaScience/elprep"
doi: "10.1371/journal.pone.0244471"
licence: "['AGPL v3']"
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- bam:
type: file
description: Input SAM/BAM file
pattern: "*.{bam,sam}"
- run_haplotypecaller:
type: boolean
description: Run variant calling on the input files. Needed to generate gvcf output.
- run_bqsr:
type: boolean
description: Run BQSR on the input files. Needed to generate recall metrics.
- reference_sequences:
type: file
description: Optional SAM header to replace existing header.
pattern: "*.sam"
- filter_regions_bed:
type: file
description: Optional BED file containing regions to filter.
pattern: "*.bed"
- reference_elfasta:
type: file
description: Elfasta file, required for BQSR and variant calling.
pattern: "*.elfasta"
- known_sites:
type: file
description: Optional elsites file containing known SNPs for BQSR.
pattern: "*.elsites"
- target_regions_bed:
type: file
description: Optional BED file containing target regions for BQSR and variant calling.
pattern: "*.bed"
- intermediate_bqsr_tables:
type: file
description: Optional list of BQSR tables, used when parsing files created by `elprep split`
pattern: "*.table"
- bqsr_tables_only:
type: boolean
description: Write intermediate BQSR tables, used when parsing files created by `elprep split`.
- get_activity_profile:
type: boolean
description: Get the activity profile calculated by the haplotypecaller to the given file in IGV format.
- get_assembly_regions:
type: boolean
description: Get the assembly regions calculated by haplotypecaller to the speficied file in IGV format.
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- bam:
type: file
description: Sorted, markdup, optionally BQSR BAM/SAM file
pattern: "*.{bam,sam}"
- metrics:
type: file
description: Optional duplicate metrics file generated by elprep
pattern: "*.{metrics.txt}"
- recall:
type: file
description: Optional recall metrics file generated by elprep
pattern: "*.{recall}"
- gvcf:
type: file
description: Optional GVCF output file
pattern: "*.{vcf.gz}"
- table:
type: file
description: Optional intermediate BQSR table output file
pattern: "*.{table}"
- activity_profile:
type: file
description: Optional activity profile output file
pattern: "*.{activity_profile.igv}"
- assembly_regions:
type: file
description: Optional activity regions output file
pattern: "*.{assembly_regions.igv}"
authors:
- "@matthdsm"

View file

@ -599,6 +599,10 @@ ectyper:
- modules/ectyper/** - modules/ectyper/**
- tests/modules/ectyper/** - tests/modules/ectyper/**
elprep/filter:
- modules/elprep/filter/**
- tests/modules/elprep/filter/**
elprep/split: elprep/split:
- modules/elprep/split/** - modules/elprep/split/**
- tests/modules/elprep/split/** - tests/modules/elprep/split/**

View file

@ -112,6 +112,7 @@ params {
} }
'homo_sapiens' { 'homo_sapiens' {
'genome' { 'genome' {
genome_elfasta = "${test_data_dir}/genomics/homo_sapiens/genome/genome.elfasta"
genome_fasta = "${test_data_dir}/genomics/homo_sapiens/genome/genome.fasta" genome_fasta = "${test_data_dir}/genomics/homo_sapiens/genome/genome.fasta"
genome_fasta_fai = "${test_data_dir}/genomics/homo_sapiens/genome/genome.fasta.fai" genome_fasta_fai = "${test_data_dir}/genomics/homo_sapiens/genome/genome.fasta.fai"
genome_dict = "${test_data_dir}/genomics/homo_sapiens/genome/genome.dict" genome_dict = "${test_data_dir}/genomics/homo_sapiens/genome/genome.dict"
@ -123,6 +124,7 @@ params {
genome_header = "${test_data_dir}/genomics/homo_sapiens/genome/genome.header" genome_header = "${test_data_dir}/genomics/homo_sapiens/genome/genome.header"
genome_bed_gz = "${test_data_dir}/genomics/homo_sapiens/genome/genome.bed.gz" genome_bed_gz = "${test_data_dir}/genomics/homo_sapiens/genome/genome.bed.gz"
genome_bed_gz_tbi = "${test_data_dir}/genomics/homo_sapiens/genome/genome.bed.gz.tbi" genome_bed_gz_tbi = "${test_data_dir}/genomics/homo_sapiens/genome/genome.bed.gz.tbi"
genome_elsites = "${test_data_dir}/genomics/homo_sapiens/genome/genome.elsites"
transcriptome_fasta = "${test_data_dir}/genomics/homo_sapiens/genome/transcriptome.fasta" transcriptome_fasta = "${test_data_dir}/genomics/homo_sapiens/genome/transcriptome.fasta"
genome2_fasta = "${test_data_dir}/genomics/homo_sapiens/genome/genome2.fasta" genome2_fasta = "${test_data_dir}/genomics/homo_sapiens/genome/genome2.fasta"
genome_chain_gz = "${test_data_dir}/genomics/homo_sapiens/genome/genome.chain.gz" genome_chain_gz = "${test_data_dir}/genomics/homo_sapiens/genome/genome.chain.gz"
@ -136,6 +138,7 @@ params {
genome_21_multi_interval_bed_gz_tbi = "${test_data_dir}/genomics/homo_sapiens/genome/chr21/sequence/multi_intervals.bed.gz.tbi" genome_21_multi_interval_bed_gz_tbi = "${test_data_dir}/genomics/homo_sapiens/genome/chr21/sequence/multi_intervals.bed.gz.tbi"
genome_21_chromosomes_dir = "${test_data_dir}/genomics/homo_sapiens/genome/chr21/sequence/chromosomes.tar.gz" genome_21_chromosomes_dir = "${test_data_dir}/genomics/homo_sapiens/genome/chr21/sequence/chromosomes.tar.gz"
dbsnp_146_hg38_elsites = "${test_data_dir}/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.elsites"
dbsnp_146_hg38_vcf_gz = "${test_data_dir}/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz" dbsnp_146_hg38_vcf_gz = "${test_data_dir}/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz"
dbsnp_146_hg38_vcf_gz_tbi = "${test_data_dir}/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz.tbi" dbsnp_146_hg38_vcf_gz_tbi = "${test_data_dir}/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz.tbi"
gnomad_r2_1_1_vcf_gz = "${test_data_dir}/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz" gnomad_r2_1_1_vcf_gz = "${test_data_dir}/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz"

View file

@ -0,0 +1,18 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { ELPREP_FILTER } from '../../../../modules/elprep/filter/main.nf'
workflow test_elprep_filter {
input = [
[ id:'test', single_end:false ], // meta map
file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true)
]
reference_elfasta = file(params.test_data['homo_sapiens']['genome']['genome_elfasta'], checkIfExists: true)
known_sites_elsites = file(params.test_data['homo_sapiens']['genome']['dbsnp_146_hg38_elsites'], checkIfExists: true)
target_regions_bed = file(params.test_data['homo_sapiens']['genome']['genome_bed'], checkIfExists: true)
ELPREP_FILTER ( input, true, true, [], [], reference_elfasta, known_sites_elsites, target_regions_bed, [], [], true, true)
}

View file

@ -0,0 +1,7 @@
process {
publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
withName: ELPREP_FILTER {
ext.args = "--mark-duplicates "
}
}

View file

@ -0,0 +1,13 @@
- name: elprep filter test_elprep_filter
command: nextflow run tests/modules/elprep/filter -entry test_elprep_filter -c tests/config/nextflow.config
tags:
- elprep
- elprep/filter
files:
- path: output/elprep/test.activity_profile.igv
- path: output/elprep/test.assembly_regions.igv
- path: output/elprep/test.bam
- path: output/elprep/test.g.vcf.gz
- path: output/elprep/test.metrics.txt
- path: output/elprep/test.recall
- path: output/elprep/versions.yml