diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7817114..2e3d35e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### `Added`
+- [#276](https://github.com/nf-core/taxprofiler/pull/276) Implemented batching in the KrakenUniq samples processing. (added by @Midnighter)
+- [#272](https://github.com/nf-core/taxprofiler/pull/272) Add saving of final 'analysis-ready-reads' to dedicated directory. (❤️ to @alexhbnr for reporting, added by @jfy133)
+
### `Fixed`
- [#271](https://github.com/nf-core/taxprofiler/pull/271/files) Improved standardised table generation documentation nd mOTUs manual database download tutorial (♥ to @prototaxites for reporting, fix by @jfy133)
@@ -15,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#274](https://github.com/nf-core/taxprofiler/pull/274/files) Substituted the samtools/bam2fq module with samtools/fastq module (fix by @sofstam)
- [#275](https://github.com/nf-core/taxprofiler/pull/275/files) Replaced function used for error reporting to more Nextflow friendly method (fix by @jfy133)
- [#285](https://github.com/nf-core/taxprofiler/pull/285/files) Fixed overly large log files in Kraken2 output (♥ to @prototaxites for reporting, fix by @Midnighter & @jfy133)
+- [#286](https://github.com/nf-core/taxprofiler/pull/286/files) Runtime optimisation of MultiQC step via improved log file processing (fix by @Midnighter & @jfy133)
### `Dependencies`
diff --git a/LICENSE b/LICENSE
index 7d74b6c..694c995 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
MIT License
-Copyright (c) James A. Fellows Yates, Sofia Stamouli, Moritz E. Beber, Lauri Mesilaakso, Thomas A. Christensen II, Jianhong Ou, Mahwash Jamy, Maxime Borry, Rafal Stepien, Tanja Normark
+Copyright (c) James A. Fellows Yates, Sofia Stamouli, Moritz E. Beber, and the nf-core/taxprofiler team
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index ba08835..218cfe1 100644
--- a/README.md
+++ b/README.md
@@ -81,14 +81,28 @@ nf-core/taxprofiler was originally written by [James A. Fellows Yates](https://g
We thank the following people for their contributions to the development of this pipeline:
-[Lauri Mesilaakso](https://github.com/ljmesi), [Tanja Normark](https://github.com/talnor), [Maxime Borry](https://github.com/maxibor),[Thomas A. Christensen II](https://github.com/MillironX), [Jianhong Ou](https://github.com/jianhong), [Rafal Stepien](https://github.com/rafalstepien), [Mahwash Jamy](https://github.com/mjamy), and the [nf-core/community](https://nf-co.re/community).
+### Team
+
+- [Lauri Mesilaakso](https://github.com/ljmesi)
+- [Tanja Normark](https://github.com/talnor)
+- [Maxime Borry](https://github.com/maxibor)
+- [Thomas A. Christensen II](https://github.com/MillironX)
+- [Jianhong Ou](https://github.com/jianhong)
+- [Rafal Stepien](https://github.com/rafalstepien)
+- [Mahwash Jamy](https://github.com/mjamy)
+
+### Acknowledgments
We also are grateful for the feedback and comments from:
-- [Alex Hübner](https://github.com/alexhbnr)
-- [LilyAnderssonLee](https://github.com/LilyAnderssonLee)
+- The general [nf-core/community](https://nf-co.re/community)
-Credit and thanks also goes to [Zandra Fagernäs](https://github.com/ZandraFagernas) for the logo.
+And specifically to
+
+- [Alex Hübner](https://github.com/alexhbnr)
+- [Lily Andersson Lee](https://github.com/LilyAnderssonLee)
+
+❤️ also goes to [Zandra Fagernäs](https://github.com/ZandraFagernas) for the logo.
## Contributions and Support
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
index f13d775..deaf219 100644
--- a/assets/multiqc_config.yml
+++ b/assets/multiqc_config.yml
@@ -37,8 +37,7 @@ run_modules:
sp:
diamond:
- contents: "diamond v"
- num_lines: 10
+ fn_re: ".*.diamond.log$"
fastqc/data:
fn_re: ".*(fastqc|falco)_data.txt$"
fastqc/zip:
diff --git a/conf/modules.config b/conf/modules.config
index 78d8153..25480ea 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -71,6 +71,14 @@ process {
path: { "${params.outdir}/fastp" },
mode: params.publish_dir_mode,
pattern: '*.{log,html,json}'
+ ],
+ [
+ path: { "${params.outdir}/analysis_ready_fastqs" },
+ mode: params.publish_dir_mode,
+ pattern: '*.fastq.gz',
+ enabled: params.save_analysis_ready_fastqs,
+ // Don't know why `!` doesn't work here, but `== false` makes it work...
+ saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && !params.perform_shortread_complexityfilter && params.perform_shortread_qc && params.save_analysis_ready_fastqs ? it : null }
]
]
}
@@ -99,6 +107,13 @@ process {
path: { "${params.outdir}/fastp" },
mode: params.publish_dir_mode,
pattern: '*.{log,html,json}'
+ ],
+ [
+ path: { "${params.outdir}/analysis_ready_fastqs" },
+ mode: params.publish_dir_mode,
+ pattern: params.shortread_qc_mergepairs ? '*merged.fastq.gz' : '*.fastp.fastq.gz',
+ enabled: params.save_analysis_ready_fastqs,
+ saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && !params.perform_shortread_complexityfilter && params.perform_shortread_qc && params.save_analysis_ready_fastqs ? it : null }
]
]
}
@@ -122,6 +137,13 @@ process {
path: { "${params.outdir}/adapterremoval" },
mode: params.publish_dir_mode,
pattern: '*.settings'
+ ],
+ [
+ path: { "${params.outdir}/analysis_ready_fastqs" },
+ mode: params.publish_dir_mode,
+ pattern: '*truncated.fastq.gz',
+ enabled: params.save_analysis_ready_fastqs,
+ saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && !params.perform_shortread_complexityfilter && params.perform_shortread_qc && params.save_analysis_ready_fastqs ? it : null }
]
]
}
@@ -148,6 +170,27 @@ process {
path: { "${params.outdir}/adapterremoval" },
mode: params.publish_dir_mode,
pattern: '*.settings'
+ ],
+ [
+ path: { "${params.outdir}/analysis_ready_fastqs" },
+ mode: params.publish_dir_mode,
+ pattern: '*{truncated.fastq,singleton.truncated}.gz',
+ enabled: params.save_analysis_ready_fastqs,
+ saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && !params.perform_shortread_complexityfilter && params.perform_shortread_qc && !params.shortread_qc_mergepairs && params.save_analysis_ready_fastqs ? it : null}
+ ]
+ ]
+ }
+
+ // AdapterRemoval separate output merging
+ withName: CAT_FASTQ {
+ ext.prefix = { "${meta.id}_${meta.run_accession}" }
+ publishDir = [
+ [
+ path: { "${params.outdir}/analysis_ready_fastqs" },
+ mode: params.publish_dir_mode,
+ pattern: '*.fastq.gz',
+ enabled: params.save_analysis_ready_fastqs,
+ saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && !params.perform_shortread_complexityfilter && params.perform_shortread_qc && params.save_analysis_ready_fastqs ? it : null }
]
]
}
@@ -165,6 +208,13 @@ process {
path: { "${params.outdir}/porechop" },
mode: params.publish_dir_mode,
pattern: '*.log'
+ ],
+ [
+ path: { "${params.outdir}/analysis_ready_fastqs" },
+ mode: params.publish_dir_mode,
+ pattern: '*_porechopped.fastq.gz',
+ enabled: params.save_analysis_ready_fastqs,
+ saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_longread_hostremoval && params.longread_qc_skipqualityfilter && !params.longread_qc_skipadaptertrim && params.perform_longread_qc && params.save_analysis_ready_fastqs ? it : null }
]
]
}
@@ -188,16 +238,82 @@ process {
path: { "${params.outdir}/filtlong" },
mode: params.publish_dir_mode,
pattern: '*.log'
+ ],
+ [
+ path: { "${params.outdir}/analysis_ready_fastqs" },
+ mode: params.publish_dir_mode,
+ pattern: '*.fastq.gz',
+ enabled: params.save_analysis_ready_fastqs,
+ saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_longread_hostremoval && !params.longread_qc_skipqualityfilter && params.perform_longread_qc && params.save_analysis_ready_fastqs ? it : null }
+ ]
+ ]
+ }
+
+ withName: BBMAP_BBDUK {
+ ext.args = [
+ "entropy=${params.shortread_complexityfilter_entropy}",
+ "entropywindow=${params.shortread_complexityfilter_bbduk_windowsize}",
+ params.shortread_complexityfilter_bbduk_mask ? "entropymask=t" : "entropymask=f"
+ ].join(' ').trim()
+ ext.prefix = { "${meta.id}_${meta.run_accession}" }
+ publishDir = [
+ [
+ path: { "${params.outdir}/bbduk/" },
+ mode: params.publish_dir_mode,
+ pattern: '*.{fastq.gz}',
+ enabled: params.save_complexityfiltered_reads
+ ],
+ [
+ path: { "${params.outdir}/bbduk/" },
+ mode: params.publish_dir_mode,
+ pattern: '*.log'
+ ],
+ [ // Final reads are published from here only if neither host removal nor run merging of this sample follows
+ path: { "${params.outdir}/analysis_ready_fastqs" },
+ mode: params.publish_dir_mode,
+ pattern: '*.fastq.gz',
+ enabled: params.save_analysis_ready_fastqs,
+ saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && params.perform_shortread_complexityfilter && params.save_analysis_ready_fastqs ? it : null }
+ ]
+ ]
+ }
+
+ withName: PRINSEQPLUSPLUS {
+ ext.args = [
+ params.shortread_complexityfilter_prinseqplusplus_mode == 'dust' ? "-lc_dust=${params.shortread_complexityfilter_prinseqplusplus_dustscore}" : "-lc_entropy=${params.shortread_complexityfilter_entropy}",
+ "-trim_qual_left=0 -trim_qual_left=0 -trim_qual_window=0 -trim_qual_step=0",
+ ].join(' ').trim()
+ ext.prefix = { "${meta.id}_${meta.run_accession}" }
+ publishDir = [
+ [
+ path: { "${params.outdir}/prinseqplusplus/" },
+ mode: params.publish_dir_mode,
+ pattern: '*{_good_out.fastq.gz,_good_out_R1.fastq.gz,_good_out_R2.fastq.gz}',
+ enabled: params.save_complexityfiltered_reads
+ ],
+ [
+ path: { "${params.outdir}/prinseqplusplus/" },
+ mode: params.publish_dir_mode,
+ pattern: '*.log'
+ ],
+ [ // Final reads are published from here only if neither host removal nor run merging of this sample follows
+ path: { "${params.outdir}/analysis_ready_fastqs" },
+ mode: params.publish_dir_mode,
+ pattern: '*{_good_out.fastq.gz,_good_out_R1.fastq.gz,_good_out_R2.fastq.gz}',
+ enabled: params.save_analysis_ready_fastqs,
+ saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && params.perform_shortread_complexityfilter && params.save_analysis_ready_fastqs ? it : null }
]
]
}
withName: BOWTIE2_BUILD {
publishDir = [
+ [
path: { "${params.outdir}/bowtie2/build" },
mode: params.publish_dir_mode,
pattern: 'bowtie2',
enabled: params.save_hostremoval_index
+ ]
]
}
@@ -221,6 +337,14 @@ process {
mode: params.publish_dir_mode,
pattern: '*.fastq.gz',
enabled: params.save_hostremoval_unmapped
+ ],
+ [
+ // `enabled` must appear only once per map literal; the host-removal check lives in `saveAs` below
+ path: { "${params.outdir}/analysis_ready_fastqs" },
+ mode: params.publish_dir_mode,
+ pattern: '*.fastq.gz',
+ enabled: params.save_analysis_ready_fastqs,
+ saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && params.perform_shortread_hostremoval && params.save_analysis_ready_fastqs ? it : null }
]
]
}
@@ -253,10 +377,19 @@ process {
withName: SAMTOOLS_FASTQ {
ext.prefix = { "${meta.id}_${meta.run_accession}.unmapped" }
publishDir = [
- path: { "${params.outdir}/samtools/fastq" },
- mode: params.publish_dir_mode,
- pattern: '*.fastq.gz',
- enabled: params.save_hostremoval_unmapped
+ [
+ path: { "${params.outdir}/samtools/fastq" },
+ mode: params.publish_dir_mode,
+ pattern: '*.fastq.gz',
+ enabled: params.save_hostremoval_unmapped
+ ],
+ [
+ path: { "${params.outdir}/analysis_ready_fastqs" },
+ mode: params.publish_dir_mode,
+ pattern: '*.fq.gz',
+ enabled: params.save_analysis_ready_fastqs,
+ saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun) ) && params.perform_longread_hostremoval && params.save_analysis_ready_fastqs ? it : null }
+ ]
]
}
@@ -269,56 +402,21 @@ process {
]
}
- withName: BBMAP_BBDUK {
- ext.args = [
- "entropy=${params.shortread_complexityfilter_entropy}",
- "entropywindow=${params.shortread_complexityfilter_bbduk_windowsize}",
- params.shortread_complexityfilter_bbduk_mask ? "entropymask=t" : "entropymask=f"
- ].join(' ').trim()
- ext.prefix = { "${meta.id}_${meta.run_accession}" }
- publishDir = [
- [
- path: { "${params.outdir}/bbduk/" },
- mode: params.publish_dir_mode,
- pattern: '*.{fastq.gz,log}',
- enabled: params.save_complexityfiltered_reads
- ],
- [
- path: { "${params.outdir}/bbduk/" },
- mode: params.publish_dir_mode,
- pattern: '*.log'
- ]
- ]
- }
-
- withName: PRINSEQPLUSPLUS {
- ext.args = [
- params.shortread_complexityfilter_prinseqplusplus_mode == 'dust' ? "-lc_dust=${params.shortread_complexityfilter_prinseqplusplus_dustscore}" : "-lc_entropy=${params.shortread_complexityfilter_entropy}",
- "-trim_qual_left=0 -trim_qual_left=0 -trim_qual_window=0 -trim_qual_step=0",
- ].join(' ').trim()
- ext.prefix = { "${meta.id}_${meta.run_accession}" }
- publishDir = [
- [
- path: { "${params.outdir}/prinseqplusplus/" },
- mode: params.publish_dir_mode,
- pattern: '*{_good_out.fastq.gz,_good_out_R1.fastq.gz,_good_out_R2.fastq.gz}',
- enabled: params.save_complexityfiltered_reads
- ],
- [
- path: { "${params.outdir}/prinseqplusplus/" },
- mode: params.publish_dir_mode,
- pattern: '*.log'
- ]
- ]
- }
-
- withName: CAT_FASTQ {
+ withName: MERGE_RUNS {
ext.prefix = { "${meta.id}" }
publishDir = [
- path: { "${params.outdir}/run_merging/" },
- mode: params.publish_dir_mode,
- pattern: '*.fastq.gz',
- enabled: params.save_runmerged_reads
+ [
+ path: { "${params.outdir}/run_merging/" },
+ mode: params.publish_dir_mode,
+ pattern: '*.fastq.gz',
+ enabled: params.save_runmerged_reads
+ ],
+ [
+ path: { "${params.outdir}/analysis_ready_fastqs" },
+ mode: params.publish_dir_mode,
+ pattern: '*.fastq.gz',
+ enabled: params.perform_runmerging && params.save_analysis_ready_fastqs
+ ]
]
}
diff --git a/docs/output.md b/docs/output.md
index 4240b25..c1aedaa 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -22,6 +22,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [minimap2](#minimap2) - Host removal for Nanopore reads
- [SAMtools stats](#samtools-stats) - Statistics from host removal
- [SAMtools fastq](#samtools-fastq) - Converts unmapped BAM file to fastq format (minimap2 only)
+- [Analysis Ready Reads](#analysis-ready-reads) - Optional results directory containing the final processed reads used as input for classification/profiling.
- [Bracken](#bracken) - Taxonomic classifier using k-mers and abundance estimations
- [Kraken2](#kraken2) - Taxonomic classifier using exact k-mer matches
- [KrakenUniq](#krakenuniq) - Taxonomic classifier that combines the k-mer-based classification and the number of unique k-mers found in each species
@@ -102,7 +103,7 @@ You can change the default value for low complexity filtering by using the argum
By default nf-core/taxprofiler will only provide the `.settings` file if AdapterRemoval is selected.
-You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. If this is selected, you may receive different combinations of `.fastq` files for each sample depending on the input types - e.g. whether you have merged or not, or if you're supplying both single- and paired-end reads.
+You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. If this is selected, you may receive different combinations of `.fastq` files for each sample depending on the input types - e.g. whether you have merged or not, or if you're supplying both single- and paired-end reads. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_fastqs`.
> ⚠️ The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as complexity filtering, host removal, run merging etc..
@@ -121,7 +122,7 @@ You will only find the `.fastq` files in the results directory if you provide `
The output logs are saved in the output folder and are part of MultiQC report.You do not normally need to check these manually.
-You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`.
+You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_fastqs`.
> ⚠️ We do **not** recommend using Porechop if you are already trimming the adapters with ONT's basecaller Guppy.
@@ -140,7 +141,7 @@ It is used in nf-core/taxprofiler for complexity filtering using different algor
-By default nf-core/taxprofiler will only provide the `.log` file if BBDuk is selected as the complexity filtering tool. You will only find the complexity filtered reads in your results directory if you provide ` --save_complexityfiltered_reads` .
+By default nf-core/taxprofiler will only provide the `.log` file if BBDuk is selected as the complexity filtering tool. You will only find the complexity filtered reads in your results directory if you provide ` --save_complexityfiltered_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_fastqs`.
> ⚠️ The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as host removal, run merging etc..
@@ -159,7 +160,7 @@ It is used in nf-core/taxprofiler for complexity filtering using different algor
-By default nf-core/taxprofiler will only provide the `.log` file if PRINSEQ++ is selected as the complexity filtering tool. You will only find the complexity filtered `.fastq` files in your results directory if you supply ` --save_complexityfiltered_reads` .
+By default nf-core/taxprofiler will only provide the `.log` file if PRINSEQ++ is selected as the complexity filtering tool. You will only find the complexity filtered `.fastq` files in your results directory if you supply ` --save_complexityfiltered_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_fastqs`.
> ⚠️ The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as host removal, run merging etc..
@@ -176,7 +177,7 @@ By default nf-core/taxprofiler will only provide the `.log` file if PRINSEQ++ is
-You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`.
+You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_fastqs`.
> ⚠️ We do **not** recommend using Filtlong if you are performing filtering of low quality reads with ONT's basecaller Guppy.
@@ -199,7 +200,7 @@ It is used with nf-core/taxprofiler to allow removal of 'host' (e.g. human) and/
-By default nf-core/taxprofiler will only provide the `.log` file if host removal is turned on. You will only have a `.bam` file if you specify `--save_hostremoval_bam`. This will contain _both_ mapped and unmapped reads. You will only get FASTQ files if you specify to save `--save_hostremoval_unmapped` - these contain only unmapped reads.
+By default nf-core/taxprofiler will only provide the `.log` file if host removal is turned on. You will only have a `.bam` file if you specify `--save_hostremoval_bam`. This will contain _both_ mapped and unmapped reads. You will only get FASTQ files if you specify to save `--save_hostremoval_unmapped` - these contain only unmapped reads. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_fastqs`.
> ℹ️ Unmapped reads in FASTQ are only found in this directory for short-reads, for long-reads see [`samtools/fastq/`](#samtools-fastq)
@@ -242,10 +243,26 @@ By default, nf-core/taxprofiler will only provide the `.bam` file containing map
-This directory will be present and contain the unmapped reads from the `.fastq` format from long-read minimap2 host removal, if `--save_hostremoval_unmapped` is supplied
+This directory will be present and contain the unmapped reads from the `.fastq` format from long-read minimap2 host removal, if `--save_hostremoval_unmapped` is supplied. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_fastqs`.
> ℹ️ For short-read unmapped reads, see [bowtie2](#bowtie2).
+### Analysis Ready Reads
+
+> ℹ️ This optional results directory will only be present in the pipeline results when supplying `--save_analysis_ready_fastqs`.
+
+
+Output files
+
+- `analysis_ready_fastqs`
+ - `_{fq,fastq}.gz`: Final reads that underwent preprocessing and were sent for classification/profiling.
+
+
+
+The results directory will contain the 'final' processed reads used as input for classification/profiling. It will _only_ include the output of the _last_ step of any combinations of preprocessing steps that may have been specified in the run configuration. For example, if you perform the read QC and host-removal preprocessing steps, the final reads that are sent to classification/profiling are the host-removed FASTQ files - those will be the ones present in this directory.
+
+> ⚠️ If you turn off all preprocessing steps, then no results will be present in this directory. This happens independently for short- and long-reads. I.e. you will only have FASTQ files for short reads in this directory if you skip all long-read preprocessing.
+
### SAMtools stats
[SAMtools stats](http://www.htslib.org/doc/samtools-stats.html) collects statistics from a `.sam`, `.bam`, or `.cram` alignment file and outputs in a text format.
@@ -264,7 +281,7 @@ In most cases you do not need to check this file, as it is rendered in the Multi
nf-core/taxprofiler offers the option to merge FASTQ files of multiple sequencing runs or libraries that derive from the same sample, as specified in the input samplesheet.
-This is the last preprocessing step, so if you have multiple runs or libraries (and run merging turned on), this will represent the final reads that will go into classification/profiling steps.
+This is the last possible preprocessing step, so if you have multiple runs or libraries (and run merging turned on), this will represent the final reads that will go into classification/profiling steps.
Output files
@@ -276,7 +293,7 @@ This is the last preprocessing step, so if you have multiple runs or libraries (
Note that you will only find samples that went through the run merging step in this directory. For samples that had a single run or library will not go through this step of the pipeline and thus will not be present in this directory.
-⚠️ You must make sure to turn on the saving of the reads from the previous preprocessing step you may have turned on, if you have single-run or library reads in your pipeline run, and wish to save the final reads that go into classification/profiling!
+This directory and its FASTQ files will only be present if you supply `--save_runmerged_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_fastqs`.
### Bracken
diff --git a/docs/usage.md b/docs/usage.md
index ee36dca..a98da95 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -168,6 +168,8 @@ nf-core/taxprofiler offers four main preprocessing steps for preprocessing raw s
- [**Host read-removal**](#host-read-removal): removal of reads aligning to reference genome(s) of a host.
- [**Run merging**](#run-merging): concatenation of multiple FASTQ chunks/sequencing runs/libraries of a sample.
+> ℹ️ You can save the 'final' reads used for classification/profiling from any combination of these steps with `--save_analysis_ready_fastqs`.
+
#### Read Processing
Raw sequencing read processing in the form of adapter clipping and paired-end read merging can be activated via the `--perform_shortread_qc` or `--perform_longread_qc` flags.
diff --git a/nextflow.config b/nextflow.config
index 05cd7ea..c155769 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -105,6 +105,8 @@ params {
save_hostremoval_bam = false
save_hostremoval_unmapped = false
+ // Publishing final reads going into profiling
+ save_analysis_ready_fastqs = false
// MALT
run_malt = false
@@ -123,6 +125,7 @@ params {
krakenuniq_ram_chunk_size = '16G'
krakenuniq_save_reads = false // added directly to module in profiling.nf
krakenuniq_save_readclassifications = false // added directly to module in profiling.nf
+ krakenuniq_batch_size = 20
// Bracken
run_bracken = false
@@ -300,7 +303,7 @@ dag {
manifest {
name = 'nf-core/taxprofiler'
- author = """James A. Fellows Yates, Sofia Stamouli, Moritz E. Beber, Lauri Mesilaakso, Thomas A. Christensen II, Jianhong Ou, Mahwash Jamy, Maxime Borry, Rafal Stepien, Tanja Normark"""
+ author = """James A. Fellows Yates, Sofia Stamouli, Moritz E. Beber, and the nf-core/taxprofiler team"""
homePage = 'https://github.com/nf-core/taxprofiler'
description = """Taxonomic classification and profiling of shotgun metagenomic data"""
mainScript = 'main.nf'
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 943d436..60c22df 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -69,6 +69,12 @@
"fa_icon": "fas fa-save",
"description": "Save reads from samples that went through the adapter clipping, pair-merging, and length filtering steps for both short and long reads",
"help_text": "This saves the FASTQ output from the following tools:\n\n- fastp\n- AdapterRemoval\n- Porechop\n- Filtlong\n\nThese reads will be a mixture of: adapter clipped, quality trimmed, pair-merged, and length filtered, depending on the parameters you set."
+ },
+ "save_analysis_ready_fastqs": {
+ "type": "boolean",
+ "fa_icon": "fas fa-save",
+ "description": "Save only the final reads from all read processing steps (that are sent to classification/profiling) in results directory.",
+ "help_text": "This flag will generate the directory `results/analysis_ready_fastqs` that contains the reads from the last preprocessing (QC, host removal, run merging etc.) step of the pipeline run. \n\nThis can be useful if you wish to re-use the final cleaned-up and prepared reads - the data actually used for the actual classification/profiling steps of the pipeline - for other analyses or purposes (e.g., to reduce redundant preprocessing between different pipelines, e.g. [nf-core/mag](https://nf-co.re/mag)).\n\nIn most cases this will be preferred over similar parameters e.g. ` --save_preprocessed_reads` or ` --save_complexityfiltered_reads`, unless you wish to explore in more detail the output of each specific preprocessing step independently.\n\nNote if you do no preprocessing of any kind, nothing will be present in this directory. "
}
},
"fa_icon": "fas fa-users-cog"
@@ -426,6 +432,13 @@
"description": "Turn on saving of KrakenUniq per-read taxonomic assignment file",
"help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on specific taxonomic taxonomic assignment that that read recieved.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--output`"
},
+ "krakenuniq_batch_size": {
+ "type": "integer",
+ "default": 20,
+ "fa_icon": "far fa-window-restore",
+ "description": "Specify the number of samples for each KrakenUniq run",
+ "help_text": "Specify the batch size for KrakenUniq. The reference database for KrakenUniq is loaded into memory once per nextflow process and then used to classify many samples. When you have many samples, a single KrakenUniq run can be rather slow. Alternatively, we can split up KrakenUniq runs for a 'batch' of samples, allowing a balance between shared use of database for multiple samples, but also faster parallelised KrakenUniq runs. This parameter determines for how many samples at a time."
+ },
"run_bracken": {
"type": "boolean",
"description": "Turn on Bracken (and the required Kraken2 prerequisite step).",
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 2388173..1274c39 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -9,35 +9,58 @@ workflow INPUT_CHECK {
samplesheet // file: /path/to/samplesheet.csv
main:
- parsed_samplesheet = SAMPLESHEET_CHECK ( samplesheet )
+
+ // Table to list, group per sample, detect if sample has multi-run,
+ // then spread back to per-run rows but with multi-run info added to meta
+ ch_split_samplesheet = SAMPLESHEET_CHECK ( samplesheet )
.csv
.splitCsv ( header:true, sep:',' )
+ .map{
+ row ->
+ [ [ row.sample.toString() ], row ]
+ }
+ .groupTuple()
+ .map {
+ sample, rows ->
+ def is_multirun = rows.size() > 1
+ [ rows, is_multirun ]
+ }
+ .transpose(by: 0)
+ .map {
+ row, is_multirun ->
+ row['is_multirun'] = is_multirun
+ return row
+ }
+
+ // Split for context-dependent channel generation
+ ch_parsed_samplesheet = ch_split_samplesheet
.branch { row ->
fasta: row.fasta != ''
nanopore: row.instrument_platform == 'OXFORD_NANOPORE'
fastq: true
}
- fastq = parsed_samplesheet.fastq
+ // Channel generation
+ ch_fastq = ch_parsed_samplesheet.fastq
.map { create_fastq_channel(it) }
- nanopore = parsed_samplesheet.nanopore
+ ch_nanopore = ch_parsed_samplesheet.nanopore
.map { create_fastq_channel(it) }
- fasta = parsed_samplesheet.fasta
+ ch_fasta = ch_parsed_samplesheet.fasta
.map { create_fasta_channel(it) }
emit:
- fastq = fastq ?: [] // channel: [ val(meta), [ reads ] ]
- nanopore = nanopore ?: [] // channel: [ val(meta), [ reads ] ]
- fasta = fasta ?: [] // channel: [ val(meta), fasta ]
- versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
+ fastq = ch_fastq ?: [] // channel: [ val(meta), [ reads ] ]
+ nanopore = ch_nanopore ?: [] // channel: [ val(meta), [ reads ] ]
+ fasta = ch_fasta ?: [] // channel: [ val(meta), fasta ]
+ versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
}
// Function to get list of [ meta, [ fastq_1, fastq_2 ] ]
def create_fastq_channel(LinkedHashMap row) {
// create meta map
- def meta = row.subMap(['sample', 'run_accession', 'instrument_platform'])
+ def meta = row.subMap(['sample', 'run_accession', 'instrument_platform', 'is_multirun'])
meta.id = meta.sample
meta.single_end = row.single_end.toBoolean()
meta.is_fasta = false
@@ -66,7 +89,9 @@ def create_fastq_channel(LinkedHashMap row) {
// Function to get list of [ meta, fasta ]
def create_fasta_channel(LinkedHashMap row) {
- def meta = row.subMap(['sample', 'run_accession', 'instrument_platform'])
+
+ // don't include multi-run information as we don't do FASTA run merging
+ def meta = row.subMap(['sample', 'run_accession', 'instrument_platform' ])
meta.id = meta.sample
meta.single_end = true
meta.is_fasta = true
diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf
index d328a9c..7f8b943 100644
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@@ -315,16 +315,20 @@ workflow PROFILING {
if ( params.run_krakenuniq ) {
ch_input_for_krakenuniq = ch_input_for_profiling.krakenuniq
- .map {
- meta, reads, db_meta, db ->
- [[id: db_meta.db_name, single_end: meta.single_end], reads, db_meta, db]
- }
- .groupTuple(by: [0,2,3])
- .multiMap {
- single_meta, reads, db_meta, db ->
- reads: [ single_meta + db_meta, reads.flatten() ]
- db: db
- }
+ .map {
+ meta, reads, db_meta, db ->
+ [[id: db_meta.db_name, single_end: meta.single_end], reads, db_meta, db]
+ }
+ .groupTuple(by: [0,2,3])
+ .flatMap { single_meta, reads, db_meta, db ->
+ def batches = reads.collate(params.krakenuniq_batch_size)
+ return batches.collect { batch -> [ single_meta + db_meta, batch.flatten(), db ]}
+ }
+ .multiMap {
+ meta, reads, db ->
+ reads: [ meta, reads ]
+ db: db
+ }
// Hardcode to _always_ produce the report file (which is our basic output, and goes into)
KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( ch_input_for_krakenuniq.reads, ch_input_for_krakenuniq.db, params.krakenuniq_ram_chunk_size, params.krakenuniq_save_reads, true, params.krakenuniq_save_readclassifications )
ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report )
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index cf888e2..2fe8ecd 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -92,7 +92,7 @@ include { FASTQC } from '../modules/nf-core/fastqc/main'
include { FALCO } from '../modules/nf-core/falco/main'
include { MULTIQC } from '../modules/nf-core/multiqc/main'
include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
-include { CAT_FASTQ } from '../modules/nf-core/cat/fastq/main'
+include { CAT_FASTQ as MERGE_RUNS } from '../modules/nf-core/cat/fastq/main'
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -122,6 +122,9 @@ workflow TAXPROFILER {
)
ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
+ // TODO: save final FASTA reads if requested, as otherwise no processing occurs on FASTA
+
+
DB_CHECK (
ch_databases
)
@@ -211,7 +214,7 @@ workflow TAXPROFILER {
skip: true
}
- ch_reads_runmerged = CAT_FASTQ ( ch_reads_for_cat_branch.cat ).reads
+ ch_reads_runmerged = MERGE_RUNS ( ch_reads_for_cat_branch.cat ).reads
.mix( ch_reads_for_cat_branch.skip )
.map {
meta, reads ->
@@ -219,7 +222,7 @@ workflow TAXPROFILER {
}
.mix( INPUT_CHECK.out.fasta )
- ch_versions = ch_versions.mix(CAT_FASTQ.out.versions)
+ ch_versions = ch_versions.mix(MERGE_RUNS.out.versions)
} else {
ch_reads_runmerged = ch_shortreads_hostremoved
@@ -270,7 +273,12 @@ workflow TAXPROFILER {
ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
if ( params.preprocessing_qc_tool == 'falco' ) {
- ch_multiqc_files = ch_multiqc_files.mix(FALCO.out.txt.collect{it[1]}.ifEmpty([]))
+ // only mix in files actually used by MultiQC
+ ch_multiqc_files = ch_multiqc_files.mix(FALCO.out.txt
+ .map { meta, reports -> reports }
+ .flatten()
+ .filter { path -> path.name.endsWith('_data.txt')}
+ .ifEmpty([]))
} else {
ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([]))
}