From a68c563e54d5f3720fc57ef6e34ff08c4b3ec398 Mon Sep 17 00:00:00 2001 From: Francesco L <53608000+lescai@users.noreply.github.com> Date: Thu, 9 Dec 2021 11:16:40 +0100 Subject: [PATCH] Added UMI sub-workflow (#1098) * added code for subworkflow fgbio call umi consensus * ironing out a few typos etc * fixing last things * fixed md5sum - lets see if it changes * removing file accidentally deleted * tidy indents * added bwamem2 alternative * fixed entry for both tests * changed name second test workflow entry * fixed workflow entry names * fixed md5sum for file generated with bwamem2 * added syntax new DSL2 * added new config location in test command line * added new config location in test command line * use of prefix instead of suffix because modules have been changed in this way * explicit alias to bwa mem1 to avoid confusion * removed param that should be an ext optional argument in fgbio groupreadsbyumi * missing colon in config * missing colon in module config too * order list alphabetically Co-authored-by: Maxime U. Garcia * remove params from body Co-authored-by: Maxime U. Garcia * improving readability of input structure Co-authored-by: Mahesh Binzer-Panchal * reverting to mandatory input * fixed tests and workflow take values * remove param Co-authored-by: Maxime U. Garcia * simplify tests params Co-authored-by: Maxime U. Garcia * formatting inputs for readability * factoring in changes to bwamem2_mem and bwa_mem sort/view inputs * updating test md5sum for grouped file following code update in bwamem Co-authored-by: Maxime U. Garcia Co-authored-by: Maxime U. 
Garcia Co-authored-by: Mahesh Binzer-Panchal --- .../fgbio_create_umi_consensus/main.nf | 86 +++++++++++++++++++ .../fgbio_create_umi_consensus/meta.yml | 67 +++++++++++++++ tests/modules/fgbio/groupreadsbyumi/main.nf | 3 +- .../fgbio_create_umi_consensus/main.nf | 33 +++++++ .../nextflow.config | 31 +++++++ .../fgbio_create_umi_consensus/test.yml | 22 +++++ 6 files changed, 240 insertions(+), 2 deletions(-) create mode 100644 subworkflows/nf-core/fgbio_create_umi_consensus/main.nf create mode 100644 subworkflows/nf-core/fgbio_create_umi_consensus/meta.yml create mode 100644 tests/subworkflows/nf-core/fgbio_create_umi_consensus/main.nf create mode 100644 tests/subworkflows/nf-core/fgbio_create_umi_consensus/nextflow.config create mode 100644 tests/subworkflows/nf-core/fgbio_create_umi_consensus/test.yml diff --git a/subworkflows/nf-core/fgbio_create_umi_consensus/main.nf b/subworkflows/nf-core/fgbio_create_umi_consensus/main.nf new file mode 100644 index 00000000..042d0bbd --- /dev/null +++ b/subworkflows/nf-core/fgbio_create_umi_consensus/main.nf @@ -0,0 +1,86 @@ +// +// Runs FGBIO tools to remove UMI tags from FASTQ reads +// Convert them to unmapped BAM file, map them to the reference genome, +// use the mapped information to group UMIs and generate consensus reads +// + + +include { BWAMEM2_INDEX } from '../../../modules/bwamem2/index/main.nf' +include { BWAMEM2_MEM } from '../../../modules/bwamem2/mem/main' +include { BWA_INDEX as BWAMEM1_INDEX } from '../../../modules/bwa/index/main.nf' +include { BWA_MEM as BWAMEM1_MEM } from '../../../modules/bwa/mem/main' +include { FGBIO_CALLMOLECULARCONSENSUSREADS as CALLUMICONSENSUS } from '../../../modules/fgbio/callmolecularconsensusreads/main.nf' +include { FGBIO_FASTQTOBAM as FASTQTOBAM } from '../../../modules/fgbio/fastqtobam/main' +include { FGBIO_GROUPREADSBYUMI as GROUPREADSBYUMI } from '../../../modules/fgbio/groupreadsbyumi/main' +include { SAMBLASTER } from '../../../modules/samblaster/main' +include { 
SAMTOOLS_BAM2FQ as BAM2FASTQ } from '../../../modules/samtools/bam2fq/main.nf' + + +workflow CREATE_UMI_CONSENSUS { + take: + reads // channel: [mandatory] [ val(meta), [ reads ] ] + fasta // channel: [mandatory] /path/to/reference/fasta + read_structure // string: [mandatory] "read_structure" + groupreadsbyumi_strategy // string: [mandatory] grouping strategy - default: "Adjacency" + aligner // string: [mandatory] "bwa-mem" or "bwa-mem2" + + main: + ch_versions = Channel.empty() + + // using information in val(read_structure) FASTQ reads are converted into + // a tagged unmapped BAM file (uBAM) + FASTQTOBAM ( reads, read_structure ) + ch_versions = ch_versions.mix(FASTQTOBAM.out.version) + + // in order to map uBAM using BWA MEM, we need to convert uBAM to FASTQ + // but keep the appropriate UMI tags in the FASTQ comment field and produce + // an interleaved FASTQ file (hence, split = false) + split = false + BAM2FASTQ ( FASTQTOBAM.out.umibam, split ) + ch_versions = ch_versions.mix(BAM2FASTQ.out.versions) + + // the user can choose here to use either bwa-mem (default) or bwa-mem2 + aligned_bam = Channel.empty() + + if (aligner == "bwa-mem") { + // reference is indexed + BWAMEM1_INDEX ( fasta ) + ch_versions = ch_versions.mix(BWAMEM1_INDEX.out.versions) + + // appropriately tagged interleaved FASTQ reads are mapped to the reference + BWAMEM1_MEM ( BAM2FASTQ.out.reads, BWAMEM1_INDEX.out.index, false ) + ch_versions = ch_versions.mix(BWAMEM1_MEM.out.versions) + aligned_bam = BWAMEM1_MEM.out.bam + } else { + // reference is indexed + BWAMEM2_INDEX ( fasta ) + ch_versions = ch_versions.mix(BWAMEM2_INDEX.out.versions) + + // appropriately tagged interleaved FASTQ reads are mapped to the reference + BWAMEM2_MEM ( BAM2FASTQ.out.reads, BWAMEM2_INDEX.out.index, false ) + ch_versions = ch_versions.mix(BWAMEM2_MEM.out.versions) + aligned_bam = BWAMEM2_MEM.out.bam + } + + // samblaster is used in order to tag mates information in the BAM file + // this is used in order to 
group reads by UMI + SAMBLASTER ( aligned_bam ) + ch_versions = ch_versions.mix(SAMBLASTER.out.versions) + + // appropriately tagged reads are now grouped by UMI information + GROUPREADSBYUMI ( SAMBLASTER.out.bam, groupreadsbyumi_strategy ) + ch_versions = ch_versions.mix(GROUPREADSBYUMI.out.versions) + + // using the above created groups, a consensus across reads in the same group + // can be called + // this will emit a consensus BAM file + CALLUMICONSENSUS ( GROUPREADSBYUMI.out.bam ) + ch_versions = ch_versions.mix(CALLUMICONSENSUS.out.versions) + + emit: + ubam = FASTQTOBAM.out.umibam // channel: [ val(meta), [ bam ] ] + groupbam = GROUPREADSBYUMI.out.bam // channel: [ val(meta), [ bam ] ] + consensusbam = CALLUMICONSENSUS.out.bam // channel: [ val(meta), [ bam ] ] + versions = ch_versions // channel: [ versions.yml ] +} + diff --git a/subworkflows/nf-core/fgbio_create_umi_consensus/meta.yml b/subworkflows/nf-core/fgbio_create_umi_consensus/meta.yml new file mode 100644 index 00000000..2cb61206 --- /dev/null +++ b/subworkflows/nf-core/fgbio_create_umi_consensus/meta.yml @@ -0,0 +1,67 @@ +name: fgbio_create_umi_consensus +description: | + This workflow uses the suite FGBIO to identify and remove UMI tags from FASTQ reads + convert them to unmapped BAM file, map them to the reference genome, + and finally use the mapped information to group UMIs and generate consensus reads in each group +keywords: + - fgbio + - umi + - samblaster + - samtools + - bwa +modules: + - bwa/index + - bwa/mem + - fgbio/fastqtobam + - fgbio/groupreadsbyumi + - fgbio/callmolecularconsensusreads + - samblaster + - samtools/bam2fq +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test' ] + - reads: + type: list + description: list umi-tagged reads + pattern: "[ *.{fastq.gz/fq.gz} ]" + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - read_structure: + type: string + description: | + A read structure should always be provided for each of the fastq files. + If single end, the string will contain only one structure (i.e. "2M11S+T"), if paired-end the string + will contain two structures separated by a blank space (i.e. "2M11S+T 2M11S+T"). + If the read does not contain any UMI, the structure will be +T (i.e. only template of any length). + https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures + - groupreadsbyumi_strategy: + type: string + description: | + Required argument: defines the UMI assignment strategy. + Must be chosen among: Identity, Edit, Adjacency, Paired. +output: + - versions: + type: file + description: File containing software versions + pattern: 'versions.yml' + - ubam: + type: file + description: unmapped bam file + pattern: '*.bam' + - groupbam: + type: file + description: mapped bam file, where reads are grouped by UMI tag + pattern: '*.bam' + - consensusbam: + type: file + description: | + mapped bam file, where reads are created as consensus of those + belonging to the same UMI group + pattern: '*.bam' +authors: + - '@lescai' diff --git a/tests/modules/fgbio/groupreadsbyumi/main.nf b/tests/modules/fgbio/groupreadsbyumi/main.nf index 1d5fb474..b9bb350a 100644 --- a/tests/modules/fgbio/groupreadsbyumi/main.nf +++ b/tests/modules/fgbio/groupreadsbyumi/main.nf @@ -10,7 +10,6 @@ workflow test_fgbio_groupreadsbyumi { [ id:'test', single_end:false ], // meta map file(params.test_data['homo_sapiens']['illumina']['test_paired_end_umi_unsorted_tagged_bam'], checkIfExists: true) ] - strategy = "Adjacency" - FGBIO_GROUPREADSBYUMI ( input, strategy ) + FGBIO_GROUPREADSBYUMI ( input, 'Adjacency' ) } diff --git a/tests/subworkflows/nf-core/fgbio_create_umi_consensus/main.nf 
b/tests/subworkflows/nf-core/fgbio_create_umi_consensus/main.nf new file mode 100644 index 00000000..6b02bbc8 --- /dev/null +++ b/tests/subworkflows/nf-core/fgbio_create_umi_consensus/main.nf @@ -0,0 +1,33 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +include { CREATE_UMI_CONSENSUS } from '../../../../subworkflows/nf-core/fgbio_create_umi_consensus/main' + +workflow test_fgbio_create_umi_consensus_mem1 { + reads = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['homo_sapiens']['illumina']['test_umi_1_fastq_gz'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_umi_2_fastq_gz'], checkIfExists: true) + ] + ] + fasta = file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + read_structure = "+T 12M11S+T" + + CREATE_UMI_CONSENSUS( reads, fasta, read_structure, "Adjacency", "bwa-mem" ) +} + +workflow test_fgbio_create_umi_consensus_mem2 { + reads = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['homo_sapiens']['illumina']['test_umi_1_fastq_gz'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_umi_2_fastq_gz'], checkIfExists: true) + ] + ] + fasta = file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + read_structure = "+T 12M11S+T" + + CREATE_UMI_CONSENSUS( reads, fasta, read_structure, "Adjacency", "bwa-mem2" ) +} diff --git a/tests/subworkflows/nf-core/fgbio_create_umi_consensus/nextflow.config b/tests/subworkflows/nf-core/fgbio_create_umi_consensus/nextflow.config new file mode 100644 index 00000000..a55a4213 --- /dev/null +++ b/tests/subworkflows/nf-core/fgbio_create_umi_consensus/nextflow.config @@ -0,0 +1,31 @@ +process { + + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + + withName: SAMTOOLS_BAM2FQ { + ext.args = '-T RX' + } + + withName: BWA_MEM { + ext.args = '-p -C -M' + } + + withName: BWAMEM2_MEM { + 
ext.args = '-p -C -M' + } + + withName: FGBIO_CALLMOLECULARCONSENSUSREADS { + ext.args = '-M 1 -S Coordinate' + ext.prefix = { "${meta.id}_umiconsensus" } + } + + withName: SAMTOOLS_BAM2FQ { + ext.args = '-T RX' + } + + withName: SAMBLASTER { + ext.args = '-M --addMateTags' + ext.prefix = { "${meta.id}_processed" } + } + +} diff --git a/tests/subworkflows/nf-core/fgbio_create_umi_consensus/test.yml b/tests/subworkflows/nf-core/fgbio_create_umi_consensus/test.yml new file mode 100644 index 00000000..2db70d3f --- /dev/null +++ b/tests/subworkflows/nf-core/fgbio_create_umi_consensus/test.yml @@ -0,0 +1,22 @@ +- name: fgbio_create_umi_consensus_bwamem1 + command: nextflow run ./tests/subworkflows/nf-core/fgbio_create_umi_consensus -entry test_fgbio_create_umi_consensus_mem1 -c ./tests/config/nextflow.config -c ./tests/subworkflows/nf-core/fgbio_create_umi_consensus/nextflow.config + tags: + - subworkflows/fgbio_create_umi_consensus + files: + - path: ./output/fastqtobam/test_umi_converted.bam + md5sum: 9510735554e5eff29244077a72075fb6 + - path: ./output/groupreadsbyumi/test_umi-grouped.bam + md5sum: 44f31da850d5a8100b43b629426f2e17 + - path: ./output/callumiconsensus/test_umiconsensus.bam + md5sum: 24b48e3543de0ae7e8a95c116d5ca6a6 +- name: fgbio_create_umi_consensus_bwamem2 + command: nextflow run ./tests/subworkflows/nf-core/fgbio_create_umi_consensus -entry test_fgbio_create_umi_consensus_mem2 -c ./tests/config/nextflow.config -c ./tests/subworkflows/nf-core/fgbio_create_umi_consensus/nextflow.config + tags: + - subworkflows/fgbio_create_umi_consensus_bwamem2 + files: + - path: ./output/fastqtobam/test_umi_converted.bam + md5sum: 9510735554e5eff29244077a72075fb6 + - path: ./output/groupreadsbyumi/test_umi-grouped.bam + md5sum: c69333155038b9a968fd096627d4dfb0 + - path: ./output/callumiconsensus/test_umiconsensus.bam + md5sum: 24b48e3543de0ae7e8a95c116d5ca6a6