Added UMI sub-workflow (#1098)

* added code for subworkflow fgbio call umi consensus

* ironing out a few typos etc

* fixing last things

* fixed md5sum - lets see if it changes

* removing file accidentally deleted

* tidy indents

* added bwamem2 alternative

* fixed entry for both tests

* changed name second test workflow entry

* fixed workflow entry names

* fixed md5sum for file generated with bwamem2

* added syntax new DSL2

* added new config location in test command line

* added new config location in test command line

* use of prefix instead of suffix because modules have been changed in this way

* explicit alias to bwa mem1 to avoid confusion

* removed param that should be an ext optional argument in fgbio groupreadsbyumi

* missing colon in config

* missing colon in module config too

* order list alphabetically

Co-authored-by: Maxime U. Garcia <maxime.garcia@scilifelab.se>

* remove params from body

Co-authored-by: Maxime U. Garcia <maxime.garcia@scilifelab.se>

* improving readability of input structure

Co-authored-by: Mahesh Binzer-Panchal <mahesh.binzer-panchal@nbis.se>

* reverting to mandatory input

* fixed tests and workflow take values

* remove param

Co-authored-by: Maxime U. Garcia <maxime.garcia@scilifelab.se>

* simplify tests params

Co-authored-by: Maxime U. Garcia <maxime.garcia@scilifelab.se>

* formatting inputs for readability

* factoring in changes to bwamem2_mem and bwa_mem sort/view inputs

* updating test md5sum for grouped file following code update in bwamem

Co-authored-by: Maxime U. Garcia <maxime.garcia@scilifelab.se>
Co-authored-by: Maxime U. Garcia <max.u.garcia@gmail.com>
Co-authored-by: Mahesh Binzer-Panchal <mahesh.binzer-panchal@nbis.se>
This commit is contained in:
Francesco L 2021-12-09 11:16:40 +01:00 committed by GitHub
parent 37c5cb495d
commit a68c563e54
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 240 additions and 2 deletions

View file

@ -0,0 +1,86 @@
//
// Runs FGBIO tools to extract UMI tags from FASTQ reads and convert the reads
// to an unmapped BAM file, maps them to the reference genome, then uses the
// mapping information to group reads by UMI and call consensus reads per group
//
include { BWAMEM2_INDEX } from '../../../modules/bwamem2/index/main.nf'
include { BWAMEM2_MEM } from '../../../modules/bwamem2/mem/main'
include { BWA_INDEX as BWAMEM1_INDEX } from '../../../modules/bwa/index/main.nf'
include { BWA_MEM as BWAMEM1_MEM } from '../../../modules/bwa/mem/main'
include { FGBIO_CALLMOLECULARCONSENSUSREADS as CALLUMICONSENSUS } from '../../../modules/fgbio/callmolecularconsensusreads/main.nf'
include { FGBIO_FASTQTOBAM as FASTQTOBAM } from '../../../modules/fgbio/fastqtobam/main'
include { FGBIO_GROUPREADSBYUMI as GROUPREADSBYUMI } from '../../../modules/fgbio/groupreadsbyumi/main'
include { SAMBLASTER } from '../../../modules/samblaster/main'
include { SAMTOOLS_BAM2FQ as BAM2FASTQ } from '../../../modules/samtools/bam2fq/main.nf'
workflow CREATE_UMI_CONSENSUS {

    take:
    reads                     // channel: [mandatory] [ val(meta), [ reads ] ]
    fasta                     // channel: [mandatory] /path/to/reference/fasta
    read_structure            // string:  [mandatory] "read_structure"
    groupreadsbyumi_strategy  // string:  [mandatory] grouping strategy - default: "Adjacency"
    aligner                   // string:  [mandatory] "bwa-mem" or "bwa-mem2"

    main:
    // collects the versions.yml emitted by every step run below
    ch_versions = Channel.empty()

    //
    // FASTQ -> tagged unmapped BAM (uBAM), driven by val(read_structure)
    //
    FASTQTOBAM ( reads, read_structure )
    // NOTE(review): this output is referenced as `version` (singular) while every
    // other step below uses `versions` - confirm against the FASTQTOBAM module
    ch_versions = ch_versions.mix(FASTQTOBAM.out.version)

    //
    // uBAM -> FASTQ, so that the reads can be mapped with BWA MEM.
    // The UMI tags are kept in the FASTQ comment field and a single
    // interleaved FASTQ file is produced (hence, split = false)
    //
    split = false
    BAM2FASTQ ( FASTQTOBAM.out.umibam, split )
    ch_versions = ch_versions.mix(BAM2FASTQ.out.versions)

    //
    // Index the reference and map the interleaved, tagged FASTQ reads to it.
    // "bwa-mem" selects bwa mem; any other value falls through to bwa-mem2
    //
    aligned_bam = Channel.empty()

    if (aligner == "bwa-mem") {
        BWAMEM1_INDEX ( fasta )
        BWAMEM1_MEM ( BAM2FASTQ.out.reads, BWAMEM1_INDEX.out.index, false )

        ch_versions = ch_versions.mix(
            BWAMEM1_INDEX.out.versions,
            BWAMEM1_MEM.out.versions
        )
        aligned_bam = BWAMEM1_MEM.out.bam
    } else {
        BWAMEM2_INDEX ( fasta )
        BWAMEM2_MEM ( BAM2FASTQ.out.reads, BWAMEM2_INDEX.out.index, false )

        ch_versions = ch_versions.mix(
            BWAMEM2_INDEX.out.versions,
            BWAMEM2_MEM.out.versions
        )
        aligned_bam = BWAMEM2_MEM.out.bam
    }

    //
    // samblaster tags mate information in the BAM file;
    // this is needed in order to group the reads by UMI
    //
    SAMBLASTER ( aligned_bam )
    ch_versions = ch_versions.mix(SAMBLASTER.out.versions)

    //
    // Group the tagged reads by UMI using the requested strategy
    //
    GROUPREADSBYUMI ( SAMBLASTER.out.bam, groupreadsbyumi_strategy )
    ch_versions = ch_versions.mix(GROUPREADSBYUMI.out.versions)

    //
    // Call a consensus across the reads of each UMI group;
    // this emits the consensus BAM file
    //
    CALLUMICONSENSUS ( GROUPREADSBYUMI.out.bam )
    ch_versions = ch_versions.mix(CALLUMICONSENSUS.out.versions)

    emit:
    ubam         = FASTQTOBAM.out.umibam     // channel: [ val(meta), [ bam ] ]
    groupbam     = GROUPREADSBYUMI.out.bam   // channel: [ val(meta), [ bam ] ]
    consensusbam = CALLUMICONSENSUS.out.bam  // channel: [ val(meta), [ bam ] ]
    versions     = ch_versions               // channel: [ versions.yml ]
}

View file

@ -0,0 +1,67 @@
# Metadata for the fgbio_create_umi_consensus subworkflow:
# lists the modules it chains together and documents its inputs and outputs.
name: fgbio_create_umi_consensus
description: |
  This workflow uses the suite FGBIO to identify and remove UMI tags from FASTQ reads
  convert them to unmapped BAM file, map them to the reference genome,
  and finally use the mapped information to group UMIs and generate consensus reads in each group
keywords:
  - fgbio
  - umi
  - samblaster
  - samtools
  - bwa
# every module included by the subworkflow, both aligner alternatives included
modules:
  - bwa/index
  - bwa/mem
  - bwamem2/index
  - bwamem2/mem
  - fgbio/fastqtobam
  - fgbio/groupreadsbyumi
  - fgbio/callmolecularconsensusreads
  - samblaster
  - samtools/bam2fq
input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test' ]
  - reads:
      type: list
      description: list umi-tagged reads
      pattern: "[ *.{fastq.gz,fq.gz} ]"
  - fasta:
      type: file
      description: The reference fasta file
      pattern: "*.fasta"
  - read_structure:
      type: string
      description: |
        A read structure should always be provided for each of the fastq files.
        If single end, the string will contain only one structure (i.e. "2M11S+T"), if paired-end the string
        will contain two structures separated by a blank space (i.e. "2M11S+T 2M11S+T").
        If the read does not contain any UMI, the structure will be +T (i.e. only template of any length).
        https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures
  - groupreadsbyumi_strategy:
      type: string
      description: |
        Required argument: defines the UMI assignment strategy.
        Must be chosen among: Identity, Edit, Adjacency, Paired.
  - aligner:
      type: string
      description: |
        Required argument: defines the aligner used to map the reads to the reference.
        Must be either "bwa-mem" or "bwa-mem2".
output:
  - versions:
      type: file
      description: File containing software versions
      pattern: 'versions.yml'
  - ubam:
      type: file
      description: unmapped bam file
      pattern: '*.bam'
  - groupbam:
      type: file
      description: mapped bam file, where reads are grouped by UMI tag
      pattern: '*.bam'
  - consensusbam:
      type: file
      description: |
        mapped bam file, where reads are created as consensus of those
        belonging to the same UMI group
      pattern: '*.bam'
authors:
  - '@lescai'

View file

@ -10,7 +10,6 @@ workflow test_fgbio_groupreadsbyumi {
[ id:'test', single_end:false ], // meta map
file(params.test_data['homo_sapiens']['illumina']['test_paired_end_umi_unsorted_tagged_bam'], checkIfExists: true)
]
strategy = "Adjacency"
FGBIO_GROUPREADSBYUMI ( input, strategy )
FGBIO_GROUPREADSBYUMI ( input, 'Adjacency' )
}

View file

@ -0,0 +1,33 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { CREATE_UMI_CONSENSUS } from '../../../../subworkflows/nf-core/fgbio_create_umi_consensus/main'
//
// Runs CREATE_UMI_CONSENSUS on the paired-end UMI test reads with aligner "bwa-mem"
//
workflow test_fgbio_create_umi_consensus_mem1 {

    // paired-end UMI test FASTQ files
    illumina   = params.test_data['homo_sapiens']['illumina']
    umi_fastqs = [
        file(illumina['test_umi_1_fastq_gz'], checkIfExists: true),
        file(illumina['test_umi_2_fastq_gz'], checkIfExists: true)
    ]

    // [ meta map, [ reads ] ]
    sample = [ [ id:'test', single_end:false ], umi_fastqs ]

    reference = file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)

    // two read structures separated by a blank, one per FASTQ file
    structure = "+T 12M11S+T"

    CREATE_UMI_CONSENSUS( sample, reference, structure, "Adjacency", "bwa-mem" )
}
//
// Runs CREATE_UMI_CONSENSUS on the paired-end UMI test reads with aligner "bwa-mem2"
//
workflow test_fgbio_create_umi_consensus_mem2 {

    // paired-end UMI test FASTQ files
    illumina   = params.test_data['homo_sapiens']['illumina']
    umi_fastqs = [
        file(illumina['test_umi_1_fastq_gz'], checkIfExists: true),
        file(illumina['test_umi_2_fastq_gz'], checkIfExists: true)
    ]

    // [ meta map, [ reads ] ]
    sample = [ [ id:'test', single_end:false ], umi_fastqs ]

    reference = file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)

    // two read structures separated by a blank, one per FASTQ file
    structure = "+T 12M11S+T"

    CREATE_UMI_CONSENSUS( sample, reference, structure, "Adjacency", "bwa-mem2" )
}

View file

@ -0,0 +1,31 @@
//
// Per-process settings for the fgbio_create_umi_consensus subworkflow test.
// Selectors are listed alphabetically, one per process.
//
process {

    // publish into a folder named after the part of the process name before the
    // first underscore, lower-cased (last element of the ':'-separated process path)
    publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }

    // same mapping arguments for both aligner alternatives
    withName: BWA_MEM {
        ext.args = '-p -C -M'
    }

    withName: BWAMEM2_MEM {
        ext.args = '-p -C -M'
    }

    withName: FGBIO_CALLMOLECULARCONSENSUSREADS {
        ext.args   = '-M 1 -S Coordinate'
        ext.prefix = { "${meta.id}_umiconsensus" }
    }

    withName: SAMBLASTER {
        ext.args   = '-M --addMateTags'
        ext.prefix = { "${meta.id}_processed" }
    }

    withName: SAMTOOLS_BAM2FQ {
        ext.args = '-T RX'
    }

}

View file

@ -0,0 +1,22 @@
# pytest-workflow tests for the fgbio_create_umi_consensus subworkflow,
# one per aligner alternative. Both carry the shared subworkflow tag so that
# selecting by that tag runs both of them.
- name: fgbio_create_umi_consensus_bwamem1
  command: nextflow run ./tests/subworkflows/nf-core/fgbio_create_umi_consensus -entry test_fgbio_create_umi_consensus_mem1 -c ./tests/config/nextflow.config -c ./tests/subworkflows/nf-core/fgbio_create_umi_consensus/nextflow.config
  tags:
    - subworkflows/fgbio_create_umi_consensus
  files:
    - path: ./output/fastqtobam/test_umi_converted.bam
      md5sum: 9510735554e5eff29244077a72075fb6
    - path: ./output/groupreadsbyumi/test_umi-grouped.bam
      md5sum: 44f31da850d5a8100b43b629426f2e17
    - path: ./output/callumiconsensus/test_umiconsensus.bam
      md5sum: 24b48e3543de0ae7e8a95c116d5ca6a6

- name: fgbio_create_umi_consensus_bwamem2
  command: nextflow run ./tests/subworkflows/nf-core/fgbio_create_umi_consensus -entry test_fgbio_create_umi_consensus_mem2 -c ./tests/config/nextflow.config -c ./tests/subworkflows/nf-core/fgbio_create_umi_consensus/nextflow.config
  tags:
    - subworkflows/fgbio_create_umi_consensus
    # original tag kept so that existing tag-based selections still match
    - subworkflows/fgbio_create_umi_consensus_bwamem2
  files:
    - path: ./output/fastqtobam/test_umi_converted.bam
      md5sum: 9510735554e5eff29244077a72075fb6
    # the grouped BAM checksum differs from the bwa-mem test
    - path: ./output/groupreadsbyumi/test_umi-grouped.bam
      md5sum: c69333155038b9a968fd096627d4dfb0
    - path: ./output/callumiconsensus/test_umiconsensus.bam
      md5sum: 24b48e3543de0ae7e8a95c116d5ca6a6