diff --git a/CHANGELOG.md b/CHANGELOG.md index edcdf11..6d289cb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#271](https://github.com/nf-core/taxprofiler/pull/271/files) Improved standardised table generation documentation nd mOTUs manual database download tutorial (♥ to @prototaxites for reporting, fix by @jfy133) - [#269](https://github.com/nf-core/taxprofiler/pull/269/files) Reduced output files in AWS full test output due to very large files - [#270](https://github.com/nf-core/taxprofiler/pull/270/files) Fixed warning for host removal index parameter, and improved index checks (♥ to @prototaxites for reporting, fix by @jfy133) +- [#274](https://github.com/nf-core/taxprofiler/pull/274/files) Substituted the samtools/bam2fq module with samtools/fastq module (fix by @sofstam) ### `Dependencies` diff --git a/conf/modules.config b/conf/modules.config index ee1a3d2..78d8153 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -250,12 +250,12 @@ process { ext.prefix = { "${meta.id}_${meta.run_accession}.unmapped" } } - withName: SAMTOOLS_BAM2FQ { + withName: SAMTOOLS_FASTQ { ext.prefix = { "${meta.id}_${meta.run_accession}.unmapped" } publishDir = [ - path: { "${params.outdir}/samtools/bam2fq" }, + path: { "${params.outdir}/samtools/fastq" }, mode: params.publish_dir_mode, - pattern: '*.fq.gz', + pattern: '*.fastq.gz', enabled: params.save_hostremoval_unmapped ] } diff --git a/docs/output.md b/docs/output.md index 4ef6770..4240b25 100644 --- a/docs/output.md +++ b/docs/output.md @@ -21,7 +21,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [Bowtie2](#bowtie2) - Host removal for Illumina reads - [minimap2](#minimap2) - Host removal for Nanopore reads - [SAMtools stats](#samtools-stats) - Statistics from host removal -- [SAMtools bam2fq](#samtools-bam2fq) - Converts unmapped BAM file to fastq format (minimap2 only) +- [SAMtools fastq](#samtools-fastq) - Converts unmapped BAM file to fastq format (minimap2 only) - [Bracken](#bracken) - Taxonomic classifier using k-mers and abundance estimations - [Kraken2](#kraken2) - Taxonomic classifier using exact k-mer matches - [KrakenUniq](#krakenuniq) - Taxonomic classifier that combines the k-mer-based classification and the number of unique k-mers found in each species @@ -201,7 +201,7 @@ It is used with nf-core/taxprofiler to allow removal of 'host' (e.g. human) and/ By default nf-core/taxprofiler will only provide the `.log` file if host removal is turned on. You will only have a `.bam` file if you specify `--save_hostremoval_bam`. This will contain _both_ mapped and unmapped reads. You will only get FASTQ files if you specify to save `--save_hostremoval_unmapped` - these contain only unmapped reads. -> ℹ️ Unmapped reads in FASTQ are only found in this directory for short-reads, for long-reads see [`samtools/bam2fq/`](#samtools-bam2fq) +> ℹ️ Unmapped reads in FASTQ are only found in this directory for short-reads, for long-reads see [`samtools/fastq/`](#samtools-fastq) > ⚠️ The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as run merging etc.. @@ -228,11 +228,11 @@ By default, nf-core/taxprofiler will only provide the `.bam` file containing map > ℹ️ minimap2 is not yet supported as a module in MultiQC and therefore there is no dedicated section in the MultiQC HTML. Rather, alignment statistics to host genome is reported via samtools stats module in MultiQC report. -> ℹ️ Unlike Bowtie2, minimap2 does not produce an unmapped FASTQ file by itself. See [`samtools/bam2fq`](#samtools-bam2fq) +> ℹ️ Unlike Bowtie2, minimap2 does not produce an unmapped FASTQ file by itself. See [`samtools/fastq`](#samtools-fastq) -### SAMtools bam2fq +### SAMtools fastq -[SAMtools bam2fq](http://www.htslib.org/doc/1.1/samtools.html) converts a `.sam`, `.bam`, or `.cram` alignment file to FASTQ format +[SAMtools fastq](http://www.htslib.org/doc/1.1/samtools.html) converts a `.sam`, `.bam`, or `.cram` alignment file to FASTQ format
Output files diff --git a/modules.json b/modules.json index 1a5cd64..9aa617a 100644 --- a/modules.json +++ b/modules.json @@ -187,9 +187,9 @@ "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"] }, - "samtools/bam2fq": { + "samtools/fastq": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", "installed_by": ["modules"] }, "samtools/index": { diff --git a/modules/nf-core/samtools/bam2fq/main.nf b/modules/nf-core/samtools/bam2fq/main.nf deleted file mode 100644 index 0b06352..0000000 --- a/modules/nf-core/samtools/bam2fq/main.nf +++ /dev/null @@ -1,56 +0,0 @@ -process SAMTOOLS_BAM2FQ { - tag "$meta.id" - label 'process_low' - - conda "bioconda::samtools=1.16.1" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : - 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" - - input: - tuple val(meta), path(inputbam) - val split - - output: - tuple val(meta), path("*.fq.gz"), emit: reads - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - - if (split){ - """ - samtools \\ - bam2fq \\ - $args \\ - -@ $task.cpus \\ - -1 ${prefix}_1.fq.gz \\ - -2 ${prefix}_2.fq.gz \\ - -0 ${prefix}_other.fq.gz \\ - -s ${prefix}_singleton.fq.gz \\ - $inputbam - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') - END_VERSIONS - """ - } else { - """ - samtools \\ - bam2fq \\ - $args \\ - -@ $task.cpus \\ - $inputbam | gzip --no-name > ${prefix}_interleaved.fq.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') - END_VERSIONS - """ - } -} diff --git a/modules/nf-core/samtools/bam2fq/meta.yml b/modules/nf-core/samtools/bam2fq/meta.yml deleted file mode 100644 index 319a60c..0000000 --- a/modules/nf-core/samtools/bam2fq/meta.yml +++ /dev/null @@ -1,55 +0,0 @@ -name: samtools_bam2fq -description: | - The module uses bam2fq method from samtools to - convert a SAM, BAM or CRAM file to FASTQ format -keywords: - - bam2fq - - samtools - - fastq -tools: - - samtools: - description: Tools for dealing with SAM, BAM and CRAM files - homepage: None - documentation: http://www.htslib.org/doc/1.1/samtools.html - tool_dev_url: None - doi: "" - licence: ["MIT"] - -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - inputbam: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - - split: - type: boolean - description: | - TRUE/FALSE value to indicate if reads should be separated into - /1, /2 and if present other, or singleton. - Note: choosing TRUE will generate 4 different files. - Choosing FALSE will produce a single file, which will be interleaved in case - the input contains paired reads. - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - reads: - type: file - description: | - FASTQ files, which will be either a group of 4 files (read_1, read_2, other and singleton) - or a single interleaved .fq.gz file if the user chooses not to split the reads. - pattern: "*.fq.gz" - -authors: - - "@lescai" diff --git a/modules/nf-core/samtools/fastq/main.nf b/modules/nf-core/samtools/fastq/main.nf new file mode 100644 index 0000000..c0b36f6 --- /dev/null +++ b/modules/nf-core/samtools/fastq/main.nf @@ -0,0 +1,44 @@ +process SAMTOOLS_FASTQ { + tag "$meta.id" + label 'process_low' + + conda "bioconda::samtools=1.16.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : + 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" + + input: + tuple val(meta), path(input) + val(interleave) + + output: + tuple val(meta), path("*_{1,2}.fastq.gz") , optional:true, emit: fastq + tuple val(meta), path("*_interleaved.fastq.gz"), optional:true, emit: interleaved + tuple val(meta), path("*_singleton.fastq.gz") , optional:true, emit: singleton + tuple val(meta), path("*_other.fastq.gz") , optional:true, emit: other + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def output = ( interleave && ! meta.single_end ) ? "> ${prefix}_interleaved.fastq.gz" : + meta.single_end ? "-1 ${prefix}_1.fastq.gz -s ${prefix}_singleton.fastq.gz" : + "-1 ${prefix}_1.fastq.gz -2 ${prefix}_2.fastq.gz -s ${prefix}_singleton.fastq.gz" + """ + samtools \\ + fastq \\ + $args \\ + --threads ${task.cpus-1} \\ + -0 ${prefix}_other.fastq.gz \\ + $input \\ + $output + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/fastq/meta.yml b/modules/nf-core/samtools/fastq/meta.yml new file mode 100644 index 0000000..b1a1ed3 --- /dev/null +++ b/modules/nf-core/samtools/fastq/meta.yml @@ -0,0 +1,62 @@ +name: samtools_fastq +description: Converts a SAM/BAM/CRAM file to FASTQ +keywords: + - bam + - sam + - cram + - fastq +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - interleave: + type: boolean + description: Set true for interleaved fastq file + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fastq: + type: file + description: Compressed FASTQ file(s) with reads with either the READ1 or READ2 flag set in separate files. + pattern: "*_{1,2}.fastq.gz" + - interleaved: + type: file + description: Compressed FASTQ file with reads with either the READ1 or READ2 flag set in a combined file. Needs collated input file. + pattern: "*_interleaved.fastq.gz" + - singleton: + type: file + description: Compressed FASTQ file with singleton reads + pattern: "*_singleton.fastq.gz" + - other: + type: file + description: Compressed FASTQ file with reads with either both READ1 and READ2 flags set or unset + pattern: "*_other.fastq.gz" + +authors: + - "@priyanka-surana" + - "@suzannejin" diff --git a/nextflow_schema.json b/nextflow_schema.json index 1941893..943d436 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -304,7 +304,7 @@ "type": "boolean", "fa_icon": "fas fa-save", "description": "Save reads from samples that went through the host-removal step", - "help_text": "Save only the reads NOT mapped to the reference genome in FASTQ format (as exported from `samtools view` and `bam2fq`).\n\nThis can be useful if you wish to perform other analyses on the off-target reads from the host mapping, such as manual profiling or _de novo_ assembly." + "help_text": "Save only the reads NOT mapped to the reference genome in FASTQ format (as exported from `samtools view` and `fastq`).\n\nThis can be useful if you wish to perform other analyses on the off-target reads from the host mapping, such as manual profiling or _de novo_ assembly." } }, "fa_icon": "fas fa-user-times" diff --git a/subworkflows/local/longread_hostremoval.nf b/subworkflows/local/longread_hostremoval.nf index 83660fe..f05e6a6 100644 --- a/subworkflows/local/longread_hostremoval.nf +++ b/subworkflows/local/longread_hostremoval.nf @@ -5,7 +5,7 @@ include { MINIMAP2_INDEX } from '../../modules/nf-core/minimap2/index/main' include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main' include { SAMTOOLS_VIEW } from '../../modules/nf-core/samtools/view/main' -include { SAMTOOLS_BAM2FQ } from '../../modules/nf-core/samtools/bam2fq/main' +include { SAMTOOLS_FASTQ } from '../../modules/nf-core/samtools/fastq/main' include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' include { SAMTOOLS_STATS } from '../../modules/nf-core/samtools/stats/main' @@ -38,8 +38,8 @@ workflow LONGREAD_HOSTREMOVAL { SAMTOOLS_VIEW ( ch_minimap2_mapped , [], [] ) ch_versions = ch_versions.mix( SAMTOOLS_VIEW.out.versions.first() ) - SAMTOOLS_BAM2FQ ( SAMTOOLS_VIEW.out.bam, false ) - ch_versions = ch_versions.mix( SAMTOOLS_BAM2FQ.out.versions.first() ) + SAMTOOLS_FASTQ ( SAMTOOLS_VIEW.out.bam, false ) + ch_versions = ch_versions.mix( SAMTOOLS_FASTQ.out.versions.first() ) // Indexing whole BAM for host removal statistics SAMTOOLS_INDEX ( MINIMAP2_ALIGN.out.bam ) @@ -54,7 +54,7 @@ workflow LONGREAD_HOSTREMOVAL { emit: stats = SAMTOOLS_STATS.out.stats //channel: [val(meta), [reads ] ] - reads = SAMTOOLS_BAM2FQ.out.reads // channel: [ val(meta), [ reads ] ] + reads = SAMTOOLS_FASTQ.out.fastq.mix( SAMTOOLS_FASTQ.out.other) // channel: [ val(meta), [ reads ] ] versions = ch_versions // channel: [ versions.yml ] mqc = ch_multiqc_files }