From a68c563e54d5f3720fc57ef6e34ff08c4b3ec398 Mon Sep 17 00:00:00 2001
From: Francesco L <53608000+lescai@users.noreply.github.com>
Date: Thu, 9 Dec 2021 11:16:40 +0100
Subject: [PATCH] Added UMI sub-workflow (#1098)

* added code for subworkflow fgbio call umi consensus

* ironing out a few typos etc

* fixing last things

* fixed md5sum - lets see if it changes

* removing file accidentally deleted

* tidy indents

* added bwamem2 alternative

* fixed entry for both tests

* changed name second test workflow entry

* fixed workflow entry names

* fixed md5sum for file generated with bwamem2

* added syntax new DSL2

* added new config location in test command line

* added new config location in test command line

* use of prefix instead of suffix because modules have been changed in this way

* explicit alias to bwa mem1 to avoid confusion

* removed param that should be an ext optional argument in fgbio groupreadsbyumi

* missing colon in config

* missing colon in module config too

* order list alphabetically

Co-authored-by: Maxime U. Garcia <maxime.garcia@scilifelab.se>

* remove params from body

Co-authored-by: Maxime U. Garcia <maxime.garcia@scilifelab.se>

* improving readability of input structure

Co-authored-by: Mahesh Binzer-Panchal <mahesh.binzer-panchal@nbis.se>

* reverting to mandatory input

* fixed tests and workflow take values

* remove param

Co-authored-by: Maxime U. Garcia <maxime.garcia@scilifelab.se>

* simplify tests params

Co-authored-by: Maxime U. Garcia <maxime.garcia@scilifelab.se>

* formatting inputs for readability

* factoring in changes to bwamem2_mem and bwa_mem sort/view inputs

* updating test md5sum for grouped file following code update in bwamem

Co-authored-by: Maxime U. Garcia <maxime.garcia@scilifelab.se>
Co-authored-by: Maxime U. Garcia <max.u.garcia@gmail.com>
Co-authored-by: Mahesh Binzer-Panchal <mahesh.binzer-panchal@nbis.se>
---
 .../fgbio_create_umi_consensus/main.nf        | 86 +++++++++++++++++++
 .../fgbio_create_umi_consensus/meta.yml       | 67 +++++++++++++++
 tests/modules/fgbio/groupreadsbyumi/main.nf   |  3 +-
 .../fgbio_create_umi_consensus/main.nf        | 33 +++++++
 .../nextflow.config                           | 31 +++++++
 .../fgbio_create_umi_consensus/test.yml       | 22 +++++
 6 files changed, 240 insertions(+), 2 deletions(-)
 create mode 100644 subworkflows/nf-core/fgbio_create_umi_consensus/main.nf
 create mode 100644 subworkflows/nf-core/fgbio_create_umi_consensus/meta.yml
 create mode 100644 tests/subworkflows/nf-core/fgbio_create_umi_consensus/main.nf
 create mode 100644 tests/subworkflows/nf-core/fgbio_create_umi_consensus/nextflow.config
 create mode 100644 tests/subworkflows/nf-core/fgbio_create_umi_consensus/test.yml

diff --git a/subworkflows/nf-core/fgbio_create_umi_consensus/main.nf b/subworkflows/nf-core/fgbio_create_umi_consensus/main.nf
new file mode 100644
index 00000000..042d0bbd
--- /dev/null
+++ b/subworkflows/nf-core/fgbio_create_umi_consensus/main.nf
@@ -0,0 +1,86 @@
+//
+// Runs FGBIO tools to remove UMI tags from FASTQ reads,
+// converts them to an unmapped BAM file, maps them to the reference genome,
+// and uses the mapping information to group UMIs and generate consensus reads
+//
+
+
+include { BWAMEM2_INDEX                                         } from '../../../modules/bwamem2/index/main.nf'
+include { BWAMEM2_MEM                                           } from '../../../modules/bwamem2/mem/main'
+include { BWA_INDEX                         as BWAMEM1_INDEX    } from '../../../modules/bwa/index/main.nf'
+include { BWA_MEM                           as BWAMEM1_MEM      } from '../../../modules/bwa/mem/main'
+include { FGBIO_CALLMOLECULARCONSENSUSREADS as CALLUMICONSENSUS } from '../../../modules/fgbio/callmolecularconsensusreads/main.nf'
+include { FGBIO_FASTQTOBAM                  as FASTQTOBAM       } from '../../../modules/fgbio/fastqtobam/main'
+include { FGBIO_GROUPREADSBYUMI             as GROUPREADSBYUMI  } from '../../../modules/fgbio/groupreadsbyumi/main'
+include { SAMBLASTER                                            } from '../../../modules/samblaster/main'
+include { SAMTOOLS_BAM2FQ                   as BAM2FASTQ        } from '../../../modules/samtools/bam2fq/main.nf'
+
+
+workflow CREATE_UMI_CONSENSUS {
+    take:
+    reads                     // channel: [mandatory] [ val(meta), [ reads ] ]
+    fasta                     // channel: [mandatory] /path/to/reference/fasta
+    read_structure            // string:  [mandatory] "read_structure"
+    groupreadsbyumi_strategy  // string:  [mandatory] grouping strategy, e.g. "Adjacency"
+    aligner                   // string:  [mandatory] "bwa-mem"; any other value selects bwa-mem2
+
+    main:
+    ch_versions = Channel.empty()
+
+    // using information in val(read_structure) FASTQ reads are converted into
+    // a tagged unmapped BAM file (uBAM)
+    FASTQTOBAM ( reads, read_structure )
+    ch_versions = ch_versions.mix(FASTQTOBAM.out.version) // NOTE(review): singular 'version', unlike '.versions' on every other module here - confirm emit name
+
+    // in order to map uBAM using BWA MEM, we need to convert uBAM to FASTQ
+    // but keep the appropriate UMI tags in the FASTQ comment field and produce
+    // an interleaved FASTQ file (hence, split = false)
+    split = false
+    BAM2FASTQ ( FASTQTOBAM.out.umibam, split )
+    ch_versions = ch_versions.mix(BAM2FASTQ.out.versions)
+
+    // aligner selection: exactly "bwa-mem" runs BWA MEM; every other value falls through to bwa-mem2
+    aligned_bam = Channel.empty() // placeholder, reassigned by whichever branch runs below
+
+    if (aligner == "bwa-mem") {
+        // reference is indexed
+        BWAMEM1_INDEX ( fasta )
+        ch_versions = ch_versions.mix(BWAMEM1_INDEX.out.versions)
+
+        // appropriately tagged interleaved FASTQ reads are mapped to the reference
+        BWAMEM1_MEM ( BAM2FASTQ.out.reads, BWAMEM1_INDEX.out.index, false )
+        ch_versions = ch_versions.mix(BWAMEM1_MEM.out.versions)
+        aligned_bam = BWAMEM1_MEM.out.bam
+    } else {
+        // reference is indexed
+        BWAMEM2_INDEX ( fasta )
+        ch_versions = ch_versions.mix(BWAMEM2_INDEX.out.versions)
+
+        // appropriately tagged interleaved FASTQ reads are mapped to the reference
+        BWAMEM2_MEM ( BAM2FASTQ.out.reads, BWAMEM2_INDEX.out.index, false )
+        ch_versions = ch_versions.mix(BWAMEM2_MEM.out.versions)
+        aligned_bam = BWAMEM2_MEM.out.bam
+    }
+
+    // samblaster is used in order to tag mates information in the BAM file
+    // this is used in order to group reads by UMI
+    SAMBLASTER ( aligned_bam )
+    ch_versions = ch_versions.mix(SAMBLASTER.out.versions)
+
+    // appropriately tagged reads are now grouped by UMI information
+    GROUPREADSBYUMI ( SAMBLASTER.out.bam, groupreadsbyumi_strategy )
+    ch_versions = ch_versions.mix(GROUPREADSBYUMI.out.versions)
+
+    // using the groups created above, a consensus across reads in the same group
+    // can be called
+    // this will emit a consensus BAM file
+    CALLUMICONSENSUS ( GROUPREADSBYUMI.out.bam )
+    ch_versions = ch_versions.mix(CALLUMICONSENSUS.out.versions)
+
+    emit:
+    ubam           = FASTQTOBAM.out.umibam          // channel: [ val(meta), [ bam ] ]
+    groupbam       = GROUPREADSBYUMI.out.bam        // channel: [ val(meta), [ bam ] ]
+    consensusbam   = CALLUMICONSENSUS.out.bam       // channel: [ val(meta), [ bam ] ]
+    versions       = ch_versions                    // channel: [ versions.yml ]
+}
+
diff --git a/subworkflows/nf-core/fgbio_create_umi_consensus/meta.yml b/subworkflows/nf-core/fgbio_create_umi_consensus/meta.yml
new file mode 100644
index 00000000..2cb61206
--- /dev/null
+++ b/subworkflows/nf-core/fgbio_create_umi_consensus/meta.yml
@@ -0,0 +1,67 @@
+name: fgbio_create_umi_consensus
+description: |
+  This workflow uses the FGBIO suite to identify and remove UMI tags from FASTQ reads,
+  convert them to an unmapped BAM file, map them to the reference genome,
+  and finally use the mapping information to group UMIs and generate consensus reads in each group
+keywords:
+  - fgbio
+  - umi
+  - samblaster
+  - samtools
+  - bwa
+modules:
+  - bwa/index
+  - bwa/mem
+  - fgbio/fastqtobam
+  - fgbio/groupreadsbyumi
+  - fgbio/callmolecularconsensusreads
+  - samblaster
+  - samtools/bam2fq
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test' ]
+  - reads:
+      type: list
+      description: list of UMI-tagged reads
+      pattern: "[ *.{fastq.gz/fq.gz} ]"
+  - fasta:
+      type: file
+      description: The reference fasta file
+      pattern: "*.fasta"
+  - read_structure:
+      type: string
+      description: |
+        A read structure should always be provided for each of the fastq files.
+        If single end, the string will contain only one structure (i.e. "2M11S+T"), if paired-end the string
+        will contain two structures separated by a blank space (i.e. "2M11S+T 2M11S+T").
+        If the read does not contain any UMI, the structure will be +T (i.e. only template of any length).
+        https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures
+  - groupreadsbyumi_strategy:
+      type: string
+      description: |
+        Required argument: defines the UMI assignment strategy.
+        Must be chosen among: Identity, Edit, Adjacency, Paired.
+output:
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: 'versions.yml'
+  - ubam:
+      type: file
+      description: unmapped bam file
+      pattern: '*.bam'
+  - groupbam:
+      type: file
+      description: mapped bam file, where reads are grouped by UMI tag
+      pattern: '*.bam'
+  - consensusbam:
+      type: file
+      description: |
+        mapped bam file, where reads are created as consensus of those
+        belonging to the same UMI group
+      pattern: '*.bam'
+authors:
+  - '@lescai'
diff --git a/tests/modules/fgbio/groupreadsbyumi/main.nf b/tests/modules/fgbio/groupreadsbyumi/main.nf
index 1d5fb474..b9bb350a 100644
--- a/tests/modules/fgbio/groupreadsbyumi/main.nf
+++ b/tests/modules/fgbio/groupreadsbyumi/main.nf
@@ -10,7 +10,6 @@ workflow test_fgbio_groupreadsbyumi {
         [ id:'test', single_end:false ], // meta map
         file(params.test_data['homo_sapiens']['illumina']['test_paired_end_umi_unsorted_tagged_bam'], checkIfExists: true)
     ]
-    strategy = "Adjacency"
 
-    FGBIO_GROUPREADSBYUMI ( input, strategy )
+    FGBIO_GROUPREADSBYUMI ( input, 'Adjacency' )
 }
diff --git a/tests/subworkflows/nf-core/fgbio_create_umi_consensus/main.nf b/tests/subworkflows/nf-core/fgbio_create_umi_consensus/main.nf
new file mode 100644
index 00000000..6b02bbc8
--- /dev/null
+++ b/tests/subworkflows/nf-core/fgbio_create_umi_consensus/main.nf
@@ -0,0 +1,33 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+include { CREATE_UMI_CONSENSUS } from '../../../../subworkflows/nf-core/fgbio_create_umi_consensus/main'
+
+workflow test_fgbio_create_umi_consensus_mem1 {
+    reads = [
+        [ id:'test', single_end:false ], // meta map
+        [
+            file(params.test_data['homo_sapiens']['illumina']['test_umi_1_fastq_gz'], checkIfExists: true),
+            file(params.test_data['homo_sapiens']['illumina']['test_umi_2_fastq_gz'], checkIfExists: true)
+        ]
+    ]
+    fasta          =    file(params.test_data['homo_sapiens']['genome']['genome_fasta'],            checkIfExists: true)
+    read_structure =    "+T 12M11S+T" // one structure per FASTQ, space-separated; "+T" marks read 1 as template-only (no UMI)
+    // positional arguments: reads, fasta, read_structure, grouping strategy, aligner ("bwa-mem" branch)
+    CREATE_UMI_CONSENSUS( reads, fasta, read_structure, "Adjacency", "bwa-mem" )
+}
+
+workflow test_fgbio_create_umi_consensus_mem2 {
+    reads = [
+        [ id:'test', single_end:false ], // meta map
+        [
+            file(params.test_data['homo_sapiens']['illumina']['test_umi_1_fastq_gz'], checkIfExists: true),
+            file(params.test_data['homo_sapiens']['illumina']['test_umi_2_fastq_gz'], checkIfExists: true)
+        ]
+    ]
+    fasta          =    file(params.test_data['homo_sapiens']['genome']['genome_fasta'],            checkIfExists: true)
+    read_structure =    "+T 12M11S+T" // one structure per FASTQ, space-separated; "+T" marks read 1 as template-only (no UMI)
+    // positional arguments: reads, fasta, read_structure, grouping strategy, aligner (bwa-mem2 branch)
+    CREATE_UMI_CONSENSUS( reads, fasta, read_structure, "Adjacency", "bwa-mem2" )
+}
diff --git a/tests/subworkflows/nf-core/fgbio_create_umi_consensus/nextflow.config b/tests/subworkflows/nf-core/fgbio_create_umi_consensus/nextflow.config
new file mode 100644
index 00000000..a55a4213
--- /dev/null
+++ b/tests/subworkflows/nf-core/fgbio_create_umi_consensus/nextflow.config
@@ -0,0 +1,31 @@
+process {
+    // publish into <outdir>/<last ':'-separated part of the process name, cut at the first '_', lower-cased>
+    publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
+    // selectors below use the modules' original process names, not the aliases given in the subworkflow's include lines
+    withName: SAMTOOLS_BAM2FQ {
+        ext.args = '-T RX' // presumably carries the RX (UMI) tag into the FASTQ comment field - confirm against samtools docs
+    }
+
+    withName: BWA_MEM {
+        ext.args = '-p -C -M' // input is interleaved FASTQ with UMI comments; flag meanings per bwa docs - TODO confirm
+    }
+
+    withName: BWAMEM2_MEM {
+        ext.args = '-p -C -M' // kept identical to the BWA_MEM arguments above
+    }
+
+    withName: FGBIO_CALLMOLECULARCONSENSUSREADS {
+        ext.args = '-M 1 -S Coordinate'
+        ext.prefix = { "${meta.id}_umiconsensus" }
+    }
+
+    withName: SAMTOOLS_BAM2FQ { // NOTE(review): repeats the identical SAMTOOLS_BAM2FQ selector above; one of the two can be dropped
+        ext.args = '-T RX'
+    }
+
+    withName: SAMBLASTER {
+        ext.args = '-M --addMateTags'
+        ext.prefix = { "${meta.id}_processed" }
+    }
+
+}
diff --git a/tests/subworkflows/nf-core/fgbio_create_umi_consensus/test.yml b/tests/subworkflows/nf-core/fgbio_create_umi_consensus/test.yml
new file mode 100644
index 00000000..2db70d3f
--- /dev/null
+++ b/tests/subworkflows/nf-core/fgbio_create_umi_consensus/test.yml
@@ -0,0 +1,22 @@
+- name: fgbio_create_umi_consensus_bwamem1
+  command: nextflow run ./tests/subworkflows/nf-core/fgbio_create_umi_consensus -entry test_fgbio_create_umi_consensus_mem1 -c ./tests/config/nextflow.config -c ./tests/subworkflows/nf-core/fgbio_create_umi_consensus/nextflow.config
+  tags:
+    - subworkflows/fgbio_create_umi_consensus
+  files:
+    - path: ./output/fastqtobam/test_umi_converted.bam
+      md5sum: 9510735554e5eff29244077a72075fb6
+    - path: ./output/groupreadsbyumi/test_umi-grouped.bam
+      md5sum: 44f31da850d5a8100b43b629426f2e17
+    - path: ./output/callumiconsensus/test_umiconsensus.bam
+      md5sum: 24b48e3543de0ae7e8a95c116d5ca6a6
+- name: fgbio_create_umi_consensus_bwamem2
+  command: nextflow run ./tests/subworkflows/nf-core/fgbio_create_umi_consensus -entry test_fgbio_create_umi_consensus_mem2 -c ./tests/config/nextflow.config -c ./tests/subworkflows/nf-core/fgbio_create_umi_consensus/nextflow.config
+  tags:
+    - subworkflows/fgbio_create_umi_consensus_bwamem2
+  files:
+    - path: ./output/fastqtobam/test_umi_converted.bam
+      md5sum: 9510735554e5eff29244077a72075fb6
+    - path: ./output/groupreadsbyumi/test_umi-grouped.bam
+      md5sum: c69333155038b9a968fd096627d4dfb0
+    - path: ./output/callumiconsensus/test_umiconsensus.bam
+      md5sum: 24b48e3543de0ae7e8a95c116d5ca6a6