From 482112bb42ddd314de8e92b6a3c1a94e530828f7 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 3 Apr 2022 07:58:40 +0200 Subject: [PATCH] Start work on host removal --- modules.json | 11 ++- modules/nf-core/modules/bowtie2/align/main.nf | 77 +++++++++++++++++++ .../nf-core/modules/bowtie2/align/meta.yml | 51 ++++++++++++ modules/nf-core/modules/bowtie2/build/main.nf | 30 ++++++++ .../nf-core/modules/bowtie2/build/meta.yml | 33 ++++++++ .../nf-core/modules/samtools/flagstat/main.nf | 34 ++++++++ .../modules/samtools/flagstat/meta.yml | 49 ++++++++++++ nextflow.config | 4 + subworkflows/local/shortread_hostremoval.nf | 39 ++++++++++ workflows/taxprofiler.nf | 12 ++- 10 files changed, 336 insertions(+), 4 deletions(-) create mode 100644 modules/nf-core/modules/bowtie2/align/main.nf create mode 100644 modules/nf-core/modules/bowtie2/align/meta.yml create mode 100644 modules/nf-core/modules/bowtie2/build/main.nf create mode 100644 modules/nf-core/modules/bowtie2/build/meta.yml create mode 100644 modules/nf-core/modules/samtools/flagstat/main.nf create mode 100644 modules/nf-core/modules/samtools/flagstat/meta.yml create mode 100644 subworkflows/local/shortread_hostremoval.nf diff --git a/modules.json b/modules.json index dcfbd3f..7c3facc 100644 --- a/modules.json +++ b/modules.json @@ -6,6 +6,12 @@ "adapterremoval": { "git_sha": "f0800157544a82ae222931764483331a81812012" }, + "bowtie2/align": { + "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + }, + "bowtie2/build": { + "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + }, "cat/fastq": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, @@ -30,9 +36,12 @@ "porechop": { "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046" }, + "samtools/flagstat": { + "git_sha": "1ad73f1b2abdea9398680d6d20014838135c9a35" + }, "untar": { "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918" } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/bowtie2/align/main.nf b/modules/nf-core/modules/bowtie2/align/main.nf new file mode 100644 index 0000000..7e8a965 --- /dev/null +++ b/modules/nf-core/modules/bowtie2/align/main.nf @@ -0,0 +1,77 @@ +process BOWTIE2_ALIGN { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? 'bioconda::bowtie2=2.4.4 bioconda::samtools=1.14 conda-forge::pigz=2.6' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:4d235f41348a00533f18e47c9669f1ecb327f629-0' : + 'quay.io/biocontainers/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:4d235f41348a00533f18e47c9669f1ecb327f629-0' }" + + input: + tuple val(meta), path(reads) + path index + val save_unaligned + + output: + tuple val(meta), path('*.bam') , emit: bam + tuple val(meta), path('*.log') , emit: log + tuple val(meta), path('*fastq.gz'), emit: fastq, optional:true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if (meta.single_end) { + def unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : '' + """ + INDEX=`find -L ./ -name "*.rev.1.bt2" | sed 's/.rev.1.bt2//'` + bowtie2 \\ + -x \$INDEX \\ + -U $reads \\ + --threads $task.cpus \\ + $unaligned \\ + $args \\ + 2> ${prefix}.bowtie2.log \\ + | samtools view -@ $task.cpus $args2 -bhS -o ${prefix}.bam - + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + } else { + def unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : '' + """ + INDEX=`find -L ./ -name "*.rev.1.bt2" | sed 's/.rev.1.bt2//'` + bowtie2 \\ + -x \$INDEX \\ + -1 ${reads[0]} \\ + -2 ${reads[1]} \\ + --threads $task.cpus \\ + $unaligned \\ + $args \\ + 2> ${prefix}.bowtie2.log \\ + | samtools view -@ $task.cpus $args2 -bhS -o ${prefix}.bam - + + if [ -f ${prefix}.unmapped.fastq.1.gz ]; then + mv ${prefix}.unmapped.fastq.1.gz ${prefix}.unmapped_1.fastq.gz + fi + if [ -f ${prefix}.unmapped.fastq.2.gz ]; then + mv ${prefix}.unmapped.fastq.2.gz ${prefix}.unmapped_2.fastq.gz + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/modules/bowtie2/align/meta.yml b/modules/nf-core/modules/bowtie2/align/meta.yml new file mode 100644 index 0000000..f80421e --- /dev/null +++ b/modules/nf-core/modules/bowtie2/align/meta.yml @@ -0,0 +1,51 @@ +name: bowtie2_align +description: Align reads to a reference genome using bowtie2 +keywords: + - align + - fasta + - genome + - reference +tools: + - bowtie2: + description: | + Bowtie 2 is an ultrafast and memory-efficient tool for aligning + sequencing reads to long reference sequences. + homepage: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml + documentation: http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml + doi: 10.1038/nmeth.1923 + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - index: + type: file + description: Bowtie2 genome index files + pattern: "*.ebwt" +output: + - bam: + type: file + description: Output BAM file containing read alignments + pattern: "*.{bam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fastq: + type: file + description: Unaligned FastQ files + pattern: "*.fastq.gz" + - log: + type: file + description: Aligment log + pattern: "*.log" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/modules/bowtie2/build/main.nf b/modules/nf-core/modules/bowtie2/build/main.nf new file mode 100644 index 0000000..a4da62d --- /dev/null +++ b/modules/nf-core/modules/bowtie2/build/main.nf @@ -0,0 +1,30 @@ +process BOWTIE2_BUILD { + tag "$fasta" + label 'process_high' + + conda (params.enable_conda ? 'bioconda::bowtie2=2.4.4' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bowtie2:2.4.4--py39hbb4e92a_0' : + 'quay.io/biocontainers/bowtie2:2.4.4--py39hbb4e92a_0' }" + + input: + path fasta + + output: + path 'bowtie2' , emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + mkdir bowtie2 + bowtie2-build $args --threads $task.cpus $fasta bowtie2/${fasta.baseName} + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/bowtie2/build/meta.yml b/modules/nf-core/modules/bowtie2/build/meta.yml new file mode 100644 index 0000000..2da9a21 --- /dev/null +++ b/modules/nf-core/modules/bowtie2/build/meta.yml @@ -0,0 +1,33 @@ +name: bowtie2_build +description: Builds bowtie index for reference genome +keywords: + - build + - index + - fasta + - genome + - reference +tools: + - bowtie2: + description: | + Bowtie 2 is an ultrafast and memory-efficient tool for aligning + sequencing reads to long reference sequences. + homepage: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml + documentation: http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml + doi: 10.1038/nmeth.1923 + licence: ["GPL-3.0-or-later"] +input: + - fasta: + type: file + description: Input genome fasta file +output: + - index: + type: file + description: Bowtie2 genome index files + pattern: "*.bt2" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/modules/samtools/flagstat/main.nf b/modules/nf-core/modules/samtools/flagstat/main.nf new file mode 100644 index 0000000..9e3440a --- /dev/null +++ b/modules/nf-core/modules/samtools/flagstat/main.nf @@ -0,0 +1,34 @@ +process SAMTOOLS_FLAGSTAT { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::samtools=1.15" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.15--h1170115_1' : + 'quay.io/biocontainers/samtools:1.15--h1170115_1' }" + + input: + tuple val(meta), path(bam), path(bai) + + output: + tuple val(meta), path("*.flagstat"), emit: flagstat + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + flagstat \\ + --threads ${task.cpus-1} \\ + $bam \\ + > ${bam}.flagstat + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/samtools/flagstat/meta.yml b/modules/nf-core/modules/samtools/flagstat/meta.yml new file mode 100644 index 0000000..9526906 --- /dev/null +++ b/modules/nf-core/modules/samtools/flagstat/meta.yml @@ -0,0 +1,49 @@ +name: samtools_flagstat +description: Counts the number of alignments in a BAM/CRAM/SAM file for each FLAG type +keywords: + - stats + - mapping + - counts + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: hhttp://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - bai: + type: file + description: Index for BAM/CRAM/SAM file + pattern: "*.{bai,crai,sai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - flagstat: + type: file + description: File containing samtools flagstat output + pattern: "*.{flagstat}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" diff --git a/nextflow.config b/nextflow.config index 7be36a6..8aa8722 100644 --- a/nextflow.config +++ b/nextflow.config @@ -65,6 +65,10 @@ params { shortread_clipmerge_minlength = 15 longread_clip = false + // Host Removal + shortread_hostremoval_reference = null + shortread_hostremoval_index = null + // MALT run_malt = false malt_mode = 'BlastN' diff --git a/subworkflows/local/shortread_hostremoval.nf b/subworkflows/local/shortread_hostremoval.nf new file mode 100644 index 0000000..4b0861c --- /dev/null +++ b/subworkflows/local/shortread_hostremoval.nf @@ -0,0 +1,39 @@ +// +// Remove host reads via alignment and export off-target reads +// + +include { BOWTIE2_ALIGN } from '../../../modules/nf-core/modules/bowtie2/align/main' +include { BOWTIE2_BUILD } from '../../../modules/nf-core/modules/bowtie2/build/main' +include { SAMTOOLS_VIEW } from '../../../modules/nf-core/modules/samtools/view/main' +include { SAMTOOLS_FASTQ } from '../../../modules/nf-core/modules/samtools/fastq/main' +include { SAMTOOLS_FLAGSTAT } from '../../../modules/nf-core/modules/samtools/flagstat/main' + +workflow SHORTREAD_PREPROCESSING { + take: + reads // [ [ meta ], [ reads ] ] + reference // /path/to/fasta + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + if ( !params.shortread_hostremoval_index ) { + file( , checkIfExists: true ) + BOWTIE2_BUILD ( reference ) + ch_versions = ch_versions.mix( BOWTIE2_BUILD.out.versions ) + } + + BOWTIE2_ALIGN ( reads, BOWTIE2_BUILD.out.index ) + ch_versions = ch_versions.mix( BOWTIE2_BUILD.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( SAMTOOLS_FLAGSTAT.out.log ) + + SAMTOOLS_FLAGSTAT ( BOWTIE2_ALIGN.out.bam ) + ch_versions = ch_versions.mix( SAMTOOLS_FLAGSTAT.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( SAMTOOLS_FLAGSTAT.out.flagstat ) + + emit: + reads = BOWTIE2_ALIGN.out.fastq // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index ce89a91..c5678f1 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -11,15 +11,21 @@ WorkflowTaxprofiler.initialise(params, log) // TODO nf-core: Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.databases, params.multiqc_config ] +def checkPathParamList = [ params.input, params.databases, params.shortread_hostremoval_reference, + params.shortread_hostremoval_index, params.multiqc_config + ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters -if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } -if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } +if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } +if (params.databases ) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] warning: MALT does not except uncollapsed paired-reads. Pairs will be profiled as separate files." if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "[nf-core/taxprofiler] error: cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs" +// TODO Add check if index but no reference exit 1 +if (params.shortread_hostremoval_reference ) { ch_reference = file(params.shortread_hostremoval_reference) } else { } +if (params.shortread_hostremoval_index) { ch_reference_index = file(params.shortread_hostremoval_index ) } else { ch_reference_index = [] } + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES