1
0
Fork 0
mirror of https://github.com/MillironX/taxprofiler.git synced 2024-11-14 18:13:09 +00:00

Fix exporting of mapped/unmapped reads for consistency across both short and long reads

This commit is contained in:
James Fellows Yates 2023-02-07 11:43:40 +01:00
parent de5bdc36c5
commit 059cd17114
7 changed files with 31 additions and 38 deletions

View file

@ -201,6 +201,7 @@ process {
] ]
} }
// Saving unmapped reads as FQ comes via input channel!
withName: BOWTIE2_ALIGN { withName: BOWTIE2_ALIGN {
ext.prefix = { "${meta.id}_${meta.run_accession}" } ext.prefix = { "${meta.id}_${meta.run_accession}" }
publishDir = [ publishDir = [
@ -212,7 +213,7 @@ process {
[ [
path: { "${params.outdir}/bowtie2/align" }, path: { "${params.outdir}/bowtie2/align" },
mode: params.publish_dir_mode, mode: params.publish_dir_mode,
enabled: params.save_hostremoval_mapped, enabled: params.save_hostremoval_bam,
pattern: '*.bam' pattern: '*.bam'
], ],
[ [
@ -239,24 +240,18 @@ process {
publishDir = [ publishDir = [
path: { "${params.outdir}/minimap2/align" }, path: { "${params.outdir}/minimap2/align" },
mode: params.publish_dir_mode, mode: params.publish_dir_mode,
enabled: params.save_hostremoval_mapped, enabled: params.save_hostremoval_bam,
pattern: '*.bam' pattern: '*.bam'
] ]
} }
withName: SAMTOOLS_VIEW { withName: SAMTOOLS_VIEW {
ext.args = '-f 4' ext.args = '-f 4'
ext.prefix = { "${meta.id}.mapped.sorted" } ext.prefix = { "${meta.id}_${meta.run_accession}.unmapped" }
publishDir = [
path: { "${params.outdir}/samtools/view" },
mode: params.publish_dir_mode,
enabled: params.save_hostremoval_unmapped,
pattern: '*.bam'
]
} }
withName: SAMTOOLS_BAM2FQ { withName: SAMTOOLS_BAM2FQ {
ext.prefix = { "${meta.id}_${meta.run_accession}" } ext.prefix = { "${meta.id}_${meta.run_accession}.unmapped" }
publishDir = [ publishDir = [
path: { "${params.outdir}/samtools/bam2fq" }, path: { "${params.outdir}/samtools/bam2fq" },
mode: params.publish_dir_mode, mode: params.publish_dir_mode,

View file

@ -30,7 +30,7 @@ params {
perform_shortread_hostremoval = true perform_shortread_hostremoval = true
perform_longread_hostremoval = true perform_longread_hostremoval = true
save_hostremoval_index = true save_hostremoval_index = true
save_hostremoval_mapped = true save_hostremoval_bam = true
save_hostremoval_unmapped = true save_hostremoval_unmapped = true
perform_runmerging = true perform_runmerging = true

View file

@ -23,7 +23,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [Bowtie2](#bowtie2) - Host removal for Illumina reads - [Bowtie2](#bowtie2) - Host removal for Illumina reads
- [minimap2](#minimap2) - Host removal for Nanopore reads - [minimap2](#minimap2) - Host removal for Nanopore reads
- [SAMtools stats](#samtoolsstats) - Statistics from host removal - [SAMtools stats](#samtoolsstats) - Statistics from host removal
- [SAMtools fastq](#samtoolsfastq) - Converts the alignment file in fastq format - [SAMtools bam2fq](#samtools-bam2fq) - Converts unmapped BAM file to fastq format (minimap2 only)
- [Bracken](#bracken) - Taxonomic classifier using k-mers and abundance estimations - [Bracken](#bracken) - Taxonomic classifier using k-mers and abundance estimations
- [Kraken2](#kraken2) - Taxonomic classifier using exact k-mer matches - [Kraken2](#kraken2) - Taxonomic classifier using exact k-mer matches
- [KrakenUniq](#krakenuniq) - Taxonomic classifier that combines the k-mer-based classification and the number of unique k-mers found in each species - [KrakenUniq](#krakenuniq) - Taxonomic classifier that combines the k-mer-based classification and the number of unique k-mers found in each species
@ -179,7 +179,7 @@ You will only find the `.fastq` files in the results directory if you provide `
### Bowtie2 ### Bowtie2
[Bowtie 2](https://bowtie-bio.sourceforge.net/bowtie2/index.shtml) is an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences. It is particularly good at aligning reads of about 50 up to 100s or 1,000s of characters, and particularly good at aligning to relatively long (e.g. mammalian) genomes. [Bowtie 2](https://bowtie-bio.sourceforge.net/bowtie2/index.shtml) is an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences. It is particularly good at aligning reads of about 50 up to 100s or 1,000s of characters, and particularly good at aligning to relatively long (e.g. mammalian) genomes.
It is used with nf-core/taxprofiler to allow removal of 'host' (e.g. human) and/or other possible contaminant reads (e.g. Phi X) from short-read `.fastq` files prior to profiling. It is used with nf-core/taxprofiler to allow removal of 'host' (e.g. human) and/or other possible contaminant reads (e.g. Phi X) from short-read `.fastq` files prior to profiling.
@ -187,13 +187,15 @@ It is used with nf-core/taxprofiler to allow removal of 'host' (e.g. human) and/
<summary>Output files</summary> <summary>Output files</summary>
- `bowtie2/` - `bowtie2/`
- `<sample_id>.bam`: reads that aligned against the user-supplied reference genome - `<sample_id>.bam`: BAM file containing reads that aligned against the user-supplied reference genome as well as unmapped reads
- `<sample_id>.bowtie2.log`: log file about the mapped reads - `<sample_id>.bowtie2.log`: log file about the mapped reads
- `<sample_id>.unmapped.fastq.gz`: the off-target reads from the mapping that is used in downstream steps. - `<sample_id>.unmapped.fastq.gz`: the off-target reads from the mapping that is used in downstream steps.
</details> </details>
By default nf-core/taxprofiler will only provide the `.log` file if host removal is turned on. You will only see the mapped (host) and unmapped reads in `.bam` format or the off-target reads in `.fastq` format in your results directory if you provide `--save_hostremoval_mapped` and ` --save_hostremoval_unmapped` respectively. By default nf-core/taxprofiler will only provide the `.log` file if host removal is turned on. You will only have a `.bam` file if you specify `--save_hostremoval_bam`. This will contain _both_ mapped and unmapped reads. You will only get FASTQ files if you specify `--save_hostremoval_unmapped` - these contain only unmapped reads.
> Unmapped reads in FASTQ are only found in this directory for short-reads, for long-reads see [`samtools/bam2fq/`](#samtools-bam2fq)
> ⚠️ The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as run merging etc. > ⚠️ The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as run merging etc.
@ -209,27 +211,31 @@ It is used with nf-core/taxprofiler to allow removal of 'host' (e.g. human) or o
<summary>Output files</summary> <summary>Output files</summary>
- `minimap2` - `minimap2`
- `<sample_id>.bam`: Alignment file in BAM format - `<sample_id>.bam`: Alignment file in BAM format containing both mapped and unmapped reads.
</details> </details>
By default, nf-core taxprofiler will only provide the `.bam` file containing mapped and unmapped if host removal for long reads is turned on (i.e., `--save_hostremoval_mapped` and ` --save_hostremoval_unmapped`). By default, nf-core/taxprofiler will only provide the `.bam` file containing mapped and unmapped reads if saving of host removal for long reads is turned on via `--save_hostremoval_bam`.
> minimap2 is not yet supported as a module in MultiQC and therefore there is no dedicated section in the MultiQC HTML. Rather, alignment statistics against the host genome are reported via the samtools stats module in the MultiQC report. > minimap2 is not yet supported as a module in MultiQC and therefore there is no dedicated section in the MultiQC HTML. Rather, alignment statistics against the host genome are reported via the samtools stats module in the MultiQC report.
### SAMtools fastq > Unlike Bowtie2, minimap2 does not produce an unmapped FASTQ file by itself. See [`samtools/bam2fq`](#samtools-bam2fq)
[SAMtools fastq](http://www.htslib.org/doc/1.1/samtools.html) converts a `.sam`, `.bam`, or `.cram` alignment file to FASTQ format ### SAMtools bam2fq
[SAMtools bam2fq](http://www.htslib.org/doc/1.1/samtools.html) converts a `.sam`, `.bam`, or `.cram` alignment file to FASTQ format
<details markdown="1"> <details markdown="1">
<summary>Output files</summary> <summary>Output files</summary>
- `samtools/bam2fq` - `samtools/bam2fq`
- `<sample_id>.fq.gz`: Alignment file in FASTQ gzip format. - `<sample_id>_interleaved.fq.gz`: Unmapped reads only in FASTQ gzip format
</details> </details>
This directory will be present and contain the unmapped reads from the `.fastq` format from long-read minimap2 host removal (for short-read unmapped reads, see [bowtie2](#bowtie2)), if `--save_hostremoval_unmapped` is supplied. This directory will be present and contain the unmapped reads in `.fastq` format from long-read minimap2 host removal, if `--save_hostremoval_unmapped` is supplied.
> For short-read unmapped reads, see [bowtie2](#bowtie2).
### SAMtools stats ### SAMtools stats

View file

@ -103,7 +103,7 @@ params {
shortread_hostremoval_index = null shortread_hostremoval_index = null
longread_hostremoval_index = null longread_hostremoval_index = null
save_hostremoval_index = false save_hostremoval_index = false
save_hostremoval_mapped = false save_hostremoval_bam = false
save_hostremoval_unmapped = false save_hostremoval_unmapped = false

View file

@ -293,7 +293,7 @@
"description": "Save mapping index of input reference when not already supplied by user", "description": "Save mapping index of input reference when not already supplied by user",
"help_text": "Save the output files of the in-built indexing of the host genome.\n\nThis is recommended to be turned on if you plan to use the same reference genome multiple times, as supplying the directory or file to `--shortread_hostremoval_index` or `--longread_hostremoval_index` respectively can speed up runtime of future runs. Once generated, we recommend you place this file _outside_ of your run results directory in a central 'cache' directory you and others using your machine can access and supply to the pipeline." "help_text": "Save the output files of the in-built indexing of the host genome.\n\nThis is recommended to be turned on if you plan to use the same reference genome multiple times, as supplying the directory or file to `--shortread_hostremoval_index` or `--longread_hostremoval_index` respectively can speed up runtime of future runs. Once generated, we recommend you place this file _outside_ of your run results directory in a central 'cache' directory you and others using your machine can access and supply to the pipeline."
}, },
"save_hostremoval_mapped": { "save_hostremoval_bam": {
"type": "boolean", "type": "boolean",
"fa_icon": "fas fa-save", "fa_icon": "fas fa-save",
"description": "Save mapped and unmapped reads in BAM format from host removal", "description": "Save mapped and unmapped reads in BAM format from host removal",
@ -303,7 +303,7 @@
"type": "boolean", "type": "boolean",
"fa_icon": "fas fa-save", "fa_icon": "fas fa-save",
"description": "Save unmapped reads in FASTQ format from host removal", "description": "Save unmapped reads in FASTQ format from host removal",
"help_text": "Save the unreads mapped to the reference genome in FASTQ format (as exported from `samtools view`).\n\nThis can be useful if you wish to perform other analyses on the off-target reads from the host mapping, such as manual profiling or _de novo_ assembly." "help_text": "Save only the reads NOT mapped to the reference genome in FASTQ format (as exported from `samtools view` and `bam2fq`).\n\nThis can be useful if you wish to perform other analyses on the off-target reads from the host mapping, such as manual profiling or _de novo_ assembly."
} }
}, },
"fa_icon": "fas fa-user-times" "fa_icon": "fas fa-user-times"

View file

@ -34,14 +34,15 @@ workflow LONGREAD_HOSTREMOVAL {
[ meta, reads, [] ] [ meta, reads, [] ]
} }
// Generate unmapped reads FASTQ for downstream taxprofiling
SAMTOOLS_VIEW ( ch_minimap2_mapped , [], [] ) SAMTOOLS_VIEW ( ch_minimap2_mapped , [], [] )
ch_versions = ch_versions.mix( SAMTOOLS_VIEW.out.versions.first() ) ch_versions = ch_versions.mix( SAMTOOLS_VIEW.out.versions.first() )
SAMTOOLS_BAM2FQ ( SAMTOOLS_VIEW.out.bam, false ) SAMTOOLS_BAM2FQ ( SAMTOOLS_VIEW.out.bam, false )
ch_versions = ch_versions.mix( SAMTOOLS_BAM2FQ.out.versions.first() ) ch_versions = ch_versions.mix( SAMTOOLS_BAM2FQ.out.versions.first() )
SAMTOOLS_INDEX ( SAMTOOLS_VIEW.out.bam ) // Indexing whole BAM for host removal statistics
SAMTOOLS_INDEX ( MINIMAP2_ALIGN.out.bam )
ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions.first() ) ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions.first() )
bam_bai = MINIMAP2_ALIGN.out.bam bam_bai = MINIMAP2_ALIGN.out.bam
@ -51,7 +52,6 @@ workflow LONGREAD_HOSTREMOVAL {
ch_versions = ch_versions.mix(SAMTOOLS_STATS.out.versions.first()) ch_versions = ch_versions.mix(SAMTOOLS_STATS.out.versions.first())
ch_multiqc_files = ch_multiqc_files.mix( SAMTOOLS_STATS.out.stats ) ch_multiqc_files = ch_multiqc_files.mix( SAMTOOLS_STATS.out.stats )
emit: emit:
stats = SAMTOOLS_STATS.out.stats //channel: [val(meta), [reads ] ] stats = SAMTOOLS_STATS.out.stats //channel: [val(meta), [reads ] ]
reads = SAMTOOLS_BAM2FQ.out.reads // channel: [ val(meta), [ reads ] ] reads = SAMTOOLS_BAM2FQ.out.reads // channel: [ val(meta), [ reads ] ]

View file

@ -6,7 +6,6 @@ include { BOWTIE2_BUILD } from '../../modules/nf-core/bowtie2/build/
include { BOWTIE2_ALIGN } from '../../modules/nf-core/bowtie2/align/main' include { BOWTIE2_ALIGN } from '../../modules/nf-core/bowtie2/align/main'
include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main'
include { SAMTOOLS_STATS } from '../../modules/nf-core/samtools/stats/main' include { SAMTOOLS_STATS } from '../../modules/nf-core/samtools/stats/main'
include { SAMTOOLS_VIEW } from '../../modules/nf-core/samtools/view/main'
workflow SHORTREAD_HOSTREMOVAL { workflow SHORTREAD_HOSTREMOVAL {
take: take:
@ -25,20 +24,13 @@ workflow SHORTREAD_HOSTREMOVAL {
ch_bowtie2_index = index.first() ch_bowtie2_index = index.first()
} }
// Map, generate BAM with all reads and unmapped reads in FASTQ for downstream
BOWTIE2_ALIGN ( reads, ch_bowtie2_index, true, true) BOWTIE2_ALIGN ( reads, ch_bowtie2_index, true, true)
ch_versions = ch_versions.mix( BOWTIE2_ALIGN.out.versions.first() ) ch_versions = ch_versions.mix( BOWTIE2_ALIGN.out.versions.first() )
ch_multiqc_files = ch_multiqc_files.mix( BOWTIE2_ALIGN.out.log ) ch_multiqc_files = ch_multiqc_files.mix( BOWTIE2_ALIGN.out.log )
ch_bowtie2_mapped = BOWTIE2_ALIGN.out.bam // Indexing whole BAM for host removal statistics
.map { SAMTOOLS_INDEX ( BOWTIE2_ALIGN.out.bam )
meta, reads ->
[ meta, reads, [] ]
}
SAMTOOLS_VIEW ( ch_bowtie2_mapped, [], [] )
ch_versions = ch_versions.mix( SAMTOOLS_VIEW.out.versions.first() )
SAMTOOLS_INDEX ( SAMTOOLS_VIEW.out.bam )
ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions.first() ) ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions.first() )
bam_bai = BOWTIE2_ALIGN.out.bam bam_bai = BOWTIE2_ALIGN.out.bam