diff --git a/README.md b/README.md index e976f73..f1d59d5 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ -**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic profiling of shotgun metagenomic data. It allows for in-parallel profiling against multiple profiling tools and databases and produces standardised output tables. +**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic profiling of shotgun metagenomic data. It allows for in-parallel profiling with multiple profiling tools against multiple databases, produces standardised output tables. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! @@ -32,20 +32,20 @@ On release, automated continuous integration tests run the pipeline on a full-si 1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) 2. Performs optional read pre-processing - - Adapter clipping and merging (short, and nanopore reads) - - Low complexity filtering - - Host read removal + - Adapter clipping and merging (short read: [fastp](https://github.com/OpenGene/fastp), [AdapterRemoval2](https://github.com/MikkelSchubert/adapterremoval); long read: [porechop](https://github.com/rrwick/Porechop)) + - Low complexity filtering ([bbduk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/), [PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus)) + - Host read removal ([BowTie2](http://bowtie-bio.sourceforge.net/bowtie2/)) - Run merging -3. Performs taxonomic profiling a choice of: - - Kraken2 - - MetaPhlAn3 - - MALT - - DIAMOND - - Centrifuge - - Kaiju - - mOTUs +3. Performs taxonomic profiling using one or more of: + - [Kraken2](https://ccb.jhu.edu/software/kraken2/) + - [MetaPhlAn3](https://huttenhower.sph.harvard.edu/metaphlan/) + - [MALT](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/malt/) + - [DIAMOND](https://github.com/bbuchfink/diamond) + - [Centrifuge](https://ccb.jhu.edu/software/centrifuge/) + - [Kaiju](https://kaiju.binf.ku.dk/) + - [mOTUs](https://motu-tool.org/) 4. Perform optional post-processing with: - - bracken + - [bracken](https://ccb.jhu.edu/software/bracken/) 5. Standardises output tables 6. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) @@ -70,10 +70,8 @@ On release, automated continuous integration tests run the pipeline on a full-si 4. Start running your own analysis! - - ```console - nextflow run nf-core/taxprofiler --input samplesheet.csv --outdir --genome GRCh37 -profile + nextflow run nf-core/taxprofiler --input samplesheet.csv --databases database.csv --outdir --run_ --run_ -profile ``` ## Documentation @@ -86,7 +84,7 @@ nf-core/taxprofiler was originally written by nf-core community. We thank the following people for their extensive assistance in the development of this pipeline: - +[James A. Fellows Yates](https://github.com/jfy133), [Moritz Beber](https://github.com/Midnighter), [Lauri Mesilaakso](https://github.com/ljmesi), [Sofia Stamouli](https://github.com/sofsam), [Maxime Borry](https://github.com/maxibor). ## Contributions and Support diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 5f653ab..82565b1 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,6 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, +sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta +2611,ERR5766174,ILLUMINA,,,///fasta/ERX5474930_ERR5766174_1.fa.gz +2612,ERR5766176,ILLUMINA,///fastq/ERX5474932_ERR5766176_1.fastq.gz,///fastq/ERX5474932_ERR5766176_2.fastq.gz, +2612,ERR5766180,ILLUMINA,///fastq/ERX5474936_ERR5766180_1.fastq.gz,, +2613,ERR5766181,ILLUMINA,///fastq/ERX5474937_ERR5766181_1.fastq.gz,///fastq/ERX5474937_ERR5766181_2.fastq.gz, +ERR3201952,ERR3201952,OXFORD_NANOPORE,///fastq/ERR3201952.fastq.gz,, diff --git a/conf/modules.config b/conf/modules.config index 9bd2e30..e3b662a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -185,7 +185,7 @@ process { publishDir = [ path: { "${params.outdir}/malt/${meta.db_name}" }, mode: params.publish_dir_mode, - pattern: '*.{rma6,tab,text,sam,log}' + pattern: '*.{log}' ] } @@ -195,7 +195,7 @@ process { publishDir = [ path: { "${params.outdir}/kraken2/${meta.db_name}" }, mode: params.publish_dir_mode, - pattern: '*.{fastq.gz,txt}' + pattern: '*.{txt}' ] } @@ -208,6 +208,16 @@ process { ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } } + withName: CENTRIFUGE_CENTRIFUGE { + publishDir = [ + path: { "${params.outdir}/centrifuge/${meta.db_name}" }, + mode: params.publish_dir_mode, + pattern: '*.txt' + ] + ext.args = { "${meta.db_params}" } + ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" } + } + withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, @@ -216,4 +226,12 @@ process { ] } + withName: MULTIQC { + publishDir = [ + path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } diff --git a/conf/test.config b/conf/test.config index 39bae58..616f82e 100644 --- a/conf/test.config +++ b/conf/test.config @@ -22,15 +22,16 @@ params { // Input data // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets // TODO nf-core: Give any required params for the test so that command line flags are not needed + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' run_kraken2 = true run_malt = true run_metaphlan3 = true + run_centrifuge = true shortread_clipmerge = true longread_clip = false shortread_complexityfilter = true - shortread_clipmerge = true shortread_hostremoval = true shortread_hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' } diff --git a/docs/usage.md b/docs/usage.md index bd840a4..239a55d 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -8,56 +8,90 @@ -## Samplesheet input +## Samplesheet inputs -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row as shown in the examples below. Furthermother, nf-core/taxprofiler also requires a second comma-separated file of 3 columns with a header row as in the examples below. + +This samplesheet is then specified on the command line as follows: ```console ---input '[path to samplesheet file]' +--input '[path to samplesheet file]' --databases '[path to database sheet file]' ``` ### Multiple runs of the same sample -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: +The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will process reads before performing profiling. Below is an example for the same sample sequenced across 3 lanes: ```console -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz +sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta +2612,run1,ILLUMINA,2612_run1_R1.fq.gz,, +2612,run2,ILLUMINA,2612_run2_R1.fq.gz,, +2612,run3,ILLUMINA,2612_run3_R1.fq.gz,2612_run3_R2.fq.gz, + ``` +> ⚠️ Runs of the same sample sequenced on Illumina platforms with a combination of single and paired-end data will **not** be run-wise concatenated, unless pair-merging is specified. In the example above, `run3` will be profiled independently of `run1` and `run2` if pairs are not merged. + ### Full samplesheet -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. +The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 6 columns to match those defined in the table below. -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. +A final samplesheet file consisting of both single- and paired-end data, as well as long-read FASTA fies may look something like the one below. This is for 6 samples, where `2612` has been sequenced twice. ```console -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, +2611,ERR5766174,ILLUMINA,,,///fasta/ERX5474930_ERR5766174_1.fa.gz +2612,ERR5766176,ILLUMINA,///fastq/ERX5474932_ERR5766176_1.fastq.gz,///fastq/ERX5474932_ERR5766176_2.fastq.gz, +2612,ERR5766180,ILLUMINA,///fastq/ERX5474936_ERR5766180_1.fastq.gz,, +2613,ERR5766181,ILLUMINA,///fastq/ERX5474937_ERR5766181_1.fastq.gz,///fastq/ERX5474937_ERR5766181_2.fastq.gz, +ERR3201952,ERR3201952,OXFORD_NANOPORE,///fastq/ERR3201952.fastq.gz,, ``` -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1 or Nanopore reads. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| Column | Description | +| --------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Unique sample name [required]. | +| `run_accession` | Run ID or name unique for each (pairs of) file(s) .Can also supply sample name again here, if only a single run was generated [required]. | +| `instrument_platform` | Sequencing platform reads generated on, selected from the EBI ENA [controlled vocabulary](https://www.ebi.ac.uk/ena/portal/api/controlledVocab?field=instrument_platform) [required]. | +| `fastq_1` | Path or URL to sequencing reads or for Illumina R1 sequencing reads in FASTQ format. GZipped compressed files accepted. Can be left empty if data in FASTA is specifed. Cannot be combined with `fasta`. | +| `fastq_2` | Path or URL to Illumina R2 sequencing reads in FASTQ format. GZipped compressed files accepted. Can be left empty if single end data. Cannot be combined with `fasta`. | +| `fasta` | Path or URL to long-reads or contigs in FASTA format. GZipped compressed files accepted. Can be left empty if data in FASTA is specifed. Cannot be combined with `fastq_1` or `fastq_2`. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +### Full database sheet + +nf-core/taxprofiler supports multiple databases being profiled in parallel for each tool. These databases, and specific parameters for each, can be specified in a 4 column comma-separated sheet. + +> ⚠️ nf-core/taxprofiler does not provide any databases by default, nor does it currently generate them for you. This must be performed manually by the user. + +An example database sheet can look as follows, where 4 tools are being used, and `malt` and `kraken2` will be used against two databases each. + +```console +tool,db_name,db_params,db_path +malt,malt85,-id 85,///malt/testdb-malt/ +malt,malt95,-id 90,///malt/testdb-malt.tar.gz +kraken2,db1,,///kraken2/testdb-kraken2.tar.gz +kraken2,db2,--quick,///kraken2/testdb-kraken2.tar.gz +centrifuge,db1,,///centrifuge/minigut_cf.tar.gz +metaphlan3,db1,,///metaphlan3/metaphlan_database/ +``` + +Column specifications are as follows: + +| Column | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tool` | Taxonomic profiling tool (supported by nf-core/taxprofiler) that the database has been indexed for [required]. | +| `db_name` | A unique name of the particular database [required]. | +| `db_params` | Any parameters of the given taxonomic profiler that you wish to specify that the taxonomic profiling tool should use when profiling against this specific. Can be empty to use taxonomic profiler defaults Must not be surrounded by quotes [required]. | +| `db_path` | Path to the database. Can either be a path to a directory containing the database index files or a `.tar.gz` file which contains the compressed database directory with the same name as the tar archive, minus `.tar.gz` [required]. | + +> 💡 You can also specify the same database directory/file twice (ensuring unique `db_name`s) and specify different parameters for each database to compare the effect of different parameters during profiling. + ## Running the pipeline The typical command for running the pipeline is as follows: ```console -nextflow run nf-core/taxprofiler --input samplesheet.csv --outdir --genome GRCh37 -profile docker +nextflow run nf-core/taxprofiler --input samplesheet.csv --databases databases.csv --outdir -profile docker --run_ --run_ ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. @@ -66,7 +100,7 @@ Note that the pipeline will create the following files in your working directory ```console work # Directory containing the nextflow working files - # Finished results in specified location (defined with --outdir) + # Finished results in specified location (defined with --outdir) .nextflow_log # Log file from Nextflow # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` diff --git a/modules.json b/modules.json index aa06e9c..2be0b7f 100644 --- a/modules.json +++ b/modules.json @@ -18,6 +18,9 @@ "cat/fastq": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, + "centrifuge/centrifuge": { + "git_sha": "d2726fcf75063960f06b36d2229a4c0966614108" + }, "custom/dumpsoftwareversions": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, diff --git a/modules/nf-core/modules/centrifuge/centrifuge/main.nf b/modules/nf-core/modules/centrifuge/centrifuge/main.nf new file mode 100644 index 0000000..3d23fc9 --- /dev/null +++ b/modules/nf-core/modules/centrifuge/centrifuge/main.nf @@ -0,0 +1,61 @@ +process CENTRIFUGE_CENTRIFUGE { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? "bioconda::centrifuge=1.0.4_beta" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4_beta--h9a82719_6' : + 'quay.io/biocontainers/centrifuge:1.0.4_beta--h9a82719_6' }" + + input: + tuple val(meta), path(reads) + path db + val save_unaligned + val save_aligned + val sam_format + + output: + tuple val(meta), path('*report.txt') , emit: report + tuple val(meta), path('*results.txt') , emit: results + tuple val(meta), path('*.sam') , optional: true, emit: sam + tuple val(meta), path('*.mapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_mapped + tuple val(meta), path('*.unmapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_unmapped + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired = meta.single_end ? "-U ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}" + def unaligned = '' + def aligned = '' + if (meta.single_end) { + unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : '' + aligned = save_aligned ? "--al-gz ${prefix}.mapped.fastq.gz" : '' + } else { + unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : '' + aligned = save_aligned ? "--al-conc-gz ${prefix}.mapped.fastq.gz" : '' + } + def sam_output = sam_format ? "--out-fmt 'sam'" : '' + """ + ## we add "-no-name ._" to ensure silly Mac OSX metafiles files aren't included + db_name=`find -L ${db} -name "*.1.cf" -not -name "._*" | sed 's/.1.cf//'` + centrifuge \\ + -x \$db_name \\ + -p $task.cpus \\ + $paired \\ + --report-file ${prefix}.report.txt \\ + -S ${prefix}.results.txt \\ + $unaligned \\ + $aligned \\ + $sam_output \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + centrifuge: \$( centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/centrifuge/centrifuge/meta.yml b/modules/nf-core/modules/centrifuge/centrifuge/meta.yml new file mode 100644 index 0000000..a252c00 --- /dev/null +++ b/modules/nf-core/modules/centrifuge/centrifuge/meta.yml @@ -0,0 +1,66 @@ +name: centrifuge_centrifuge +description: Classifies metagenomic sequence data +keywords: + - classify + - metagenomics + - fastq + - db +tools: + - centrifuge: + description: Centrifuge is a classifier for metagenomic sequences. + homepage: https://ccb.jhu.edu/software/centrifuge/ + documentation: https://ccb.jhu.edu/software/centrifuge/manual.shtml + doi: 10.1101/gr.210641.116 + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - db: + type: directory + description: Path to directory containing centrifuge database files + - save_unaligned: + type: value + description: If true unmapped fastq files are saved + - save_aligned: + type: value + description: If true mapped fastq files are saved +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - report: + type: file + description: | + File containing a classification summary + pattern: "*.{report.txt}" + - results: + type: file + description: | + File containing classification results + pattern: "*.{results.txt}" + - fastq_unmapped: + type: file + description: Unmapped fastq files + pattern: "*.unmapped.fastq.gz" + - fastq_mapped: + type: file + description: Mapped fastq files + pattern: "*.mapped.fastq.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@sofstam" + - "@jfy133" + - "@sateeshperi" diff --git a/nextflow.config b/nextflow.config index 52cd655..0c20e7d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -89,6 +89,11 @@ params { // kraken2 run_kraken2 = false + // centrifuge + run_centrifuge = false + centrifuge_save_unaligned = false + centrifuge_save_aligned = false + centrifuge_sam_format = false // metaphlan3 run_metaphlan3 = false } diff --git a/nextflow_schema.json b/nextflow_schema.json index 6e491bb..4c77670 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -281,6 +281,18 @@ "run_kraken2": { "type": "boolean" }, + "run_centrifuge": { + "type": "boolean" + }, + "centrifuge_save_unaligned": { + "type": "boolean" + }, + "centrifuge_save_aligned": { + "type": "boolean" + }, + "centrifuge_sam_format": { + "type": "boolean" + }, "run_metaphlan3": { "type": "boolean", "description": "Enable MetaPhlAn for taxonomic profiling" diff --git a/nf-core/modules/centrifuge/centrifuge/main.nf b/nf-core/modules/centrifuge/centrifuge/main.nf new file mode 100644 index 0000000..3d23fc9 --- /dev/null +++ b/nf-core/modules/centrifuge/centrifuge/main.nf @@ -0,0 +1,61 @@ +process CENTRIFUGE_CENTRIFUGE { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? "bioconda::centrifuge=1.0.4_beta" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4_beta--h9a82719_6' : + 'quay.io/biocontainers/centrifuge:1.0.4_beta--h9a82719_6' }" + + input: + tuple val(meta), path(reads) + path db + val save_unaligned + val save_aligned + val sam_format + + output: + tuple val(meta), path('*report.txt') , emit: report + tuple val(meta), path('*results.txt') , emit: results + tuple val(meta), path('*.sam') , optional: true, emit: sam + tuple val(meta), path('*.mapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_mapped + tuple val(meta), path('*.unmapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_unmapped + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired = meta.single_end ? "-U ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}" + def unaligned = '' + def aligned = '' + if (meta.single_end) { + unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : '' + aligned = save_aligned ? "--al-gz ${prefix}.mapped.fastq.gz" : '' + } else { + unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : '' + aligned = save_aligned ? "--al-conc-gz ${prefix}.mapped.fastq.gz" : '' + } + def sam_output = sam_format ? "--out-fmt 'sam'" : '' + """ + ## we add "-no-name ._" to ensure silly Mac OSX metafiles files aren't included + db_name=`find -L ${db} -name "*.1.cf" -not -name "._*" | sed 's/.1.cf//'` + centrifuge \\ + -x \$db_name \\ + -p $task.cpus \\ + $paired \\ + --report-file ${prefix}.report.txt \\ + -S ${prefix}.results.txt \\ + $unaligned \\ + $aligned \\ + $sam_output \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + centrifuge: \$( centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //') + END_VERSIONS + """ +} diff --git a/nf-core/modules/centrifuge/centrifuge/meta.yml b/nf-core/modules/centrifuge/centrifuge/meta.yml new file mode 100644 index 0000000..a252c00 --- /dev/null +++ b/nf-core/modules/centrifuge/centrifuge/meta.yml @@ -0,0 +1,66 @@ +name: centrifuge_centrifuge +description: Classifies metagenomic sequence data +keywords: + - classify + - metagenomics + - fastq + - db +tools: + - centrifuge: + description: Centrifuge is a classifier for metagenomic sequences. + homepage: https://ccb.jhu.edu/software/centrifuge/ + documentation: https://ccb.jhu.edu/software/centrifuge/manual.shtml + doi: 10.1101/gr.210641.116 + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - db: + type: directory + description: Path to directory containing centrifuge database files + - save_unaligned: + type: value + description: If true unmapped fastq files are saved + - save_aligned: + type: value + description: If true mapped fastq files are saved +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - report: + type: file + description: | + File containing a classification summary + pattern: "*.{report.txt}" + - results: + type: file + description: | + File containing classification results + pattern: "*.{results.txt}" + - fastq_unmapped: + type: file + description: Unmapped fastq files + pattern: "*.unmapped.fastq.gz" + - fastq_mapped: + type: file + description: Mapped fastq files + pattern: "*.mapped.fastq.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@sofstam" + - "@jfy133" + - "@sateeshperi" diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf index 356915a..a718fa9 100644 --- a/subworkflows/local/db_check.nf +++ b/subworkflows/local/db_check.nf @@ -22,7 +22,7 @@ workflow DB_CHECK { ch_dbs_for_untar = parsed_samplesheet .branch { - untar: it[1].toString().endsWith(".tar.gz") && it[0]['tool'] != 'centrifuge' + untar: it[1].toString().endsWith(".tar.gz") skip: true } diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index e8669b3..db46f2b 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -63,13 +63,12 @@ def create_fastq_channel(LinkedHashMap row) { if (!file(row.fastq_2).exists()) { exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" } - fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] + fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] } + } return fastq_meta -} - -// Function to get list of [ meta, fasta ] +}// Function to get list of [ meta, fasta ] def create_fasta_channel(LinkedHashMap row) { def meta = [:] meta.id = row.sample diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf new file mode 100644 index 0000000..c74c583 --- /dev/null +++ b/subworkflows/local/profiling.nf @@ -0,0 +1,119 @@ +// +// Run profiling +// + +include { MALT_RUN } from '../../modules/nf-core/modules/malt/run/main' +include { KRAKEN2_KRAKEN2 } from '../../modules/nf-core/modules/kraken2/kraken2/main' +include { CENTRIFUGE_CENTRIFUGE } from '../../modules/nf-core/modules/centrifuge/centrifuge/main' +include { METAPHLAN3 } from '../../modules/nf-core/modules/metaphlan3/main' + +workflow PROFILING { + take: + shortreads // [ [ meta ], [ reads ] ] + longreads // [ [ meta ], [ reads ] ] + databases // [ [ meta ], path ] + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + +/* + COMBINE READS WITH POSSIBLE DATABASES + */ + + // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] + ch_input_for_profiling = shortreads + .mix( longreads ) + .combine(databases) + .branch { + malt: it[2]['tool'] == 'malt' + kraken2: it[2]['tool'] == 'kraken2' + metaphlan3: it[2]['tool'] == 'metaphlan3' + centrifuge: it[2]['tool'] == 'centrifuge' + unknown: true + } + + /* + PREPARE PROFILER INPUT CHANNELS + */ + + // Each tool as a slightly different input structure and generally separate + // input channels for reads vs databases. We restructure the channel tuple + // for each tool and make liberal use of multiMap to keep reads/databases + // channel element order in sync with each other + + // MALT: We groupTuple to have all samples in one channel for MALT as database + // loading takes a long time, so we only want to run it once per database + // TODO document somewhere we only accept illumina short reads for MALT? + ch_input_for_malt = ch_input_for_profiling.malt + .filter { it[0]['instrument_platform'] == 'ILLUMINA' } + .map { + it -> + def temp_meta = [ id: it[2]['db_name']] + it[2] + def db = it[3] + [ temp_meta, it[1], db ] + } + .groupTuple(by: [0,2]) + .multiMap { + it -> + reads: [ it[0], it[1].flatten() ] + db: it[2] + } + + // All subsequent tools can easily run on a per-sample basis + + ch_input_for_kraken2 = ch_input_for_profiling.kraken2 + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + + ch_input_for_centrifuge = ch_input_for_profiling.centrifuge + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + + ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3 + .multiMap { + it -> + reads: [it[0] + it[2], it[1]] + db: it[3] + } + + /* + RUN PROFILING + */ + + if ( params.run_malt ) { + MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) + ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) + ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) + } + + if ( params.run_kraken2 ) { + KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) + ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) + } + + if ( params.run_centrifuge ) { + CENTRIFUGE_CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_unaligned, params.centrifuge_save_aligned, params.centrifuge_sam_format ) + ch_versions = ch_versions.mix( CENTRIFUGE_CENTRIFUGE.out.versions.first() ) + } + + if ( params.run_metaphlan3 ) { + METAPHLAN3 ( ch_input_for_metaphlan3.reads, ch_input_for_metaphlan3.db ) + ch_versions = ch_versions.mix( METAPHLAN3.out.versions.first() ) + } + + + emit: + // TODO work out if there is enough standardisation of output to export as one? + //output = ch_filtered_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf index 18baf17..9fb9425 100644 --- a/subworkflows/local/shortread_fastp.nf +++ b/subworkflows/local/shortread_fastp.nf @@ -10,7 +10,7 @@ workflow SHORTREAD_FASTP { reads // [[meta], [reads]] main: - ch_versions = Channel.empty() + ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() ch_input_for_fastp = reads diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 81bde9e..abb330d 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -54,6 +54,7 @@ include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_ include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing' include { SHORTREAD_HOSTREMOVAL } from '../subworkflows/local/shortread_hostremoval' include { SHORTREAD_COMPLEXITYFILTERING } from '../subworkflows/local/shortread_complexityfiltering' +include { PROFILING } from '../subworkflows/local/profiling' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -69,9 +70,6 @@ include { MULTIQC } from '../modules/nf-core/modules/multiqc include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main' -include { MALT_RUN } from '../modules/nf-core/modules/malt/run/main' -include { KRAKEN2_KRAKEN2 } from '../modules/nf-core/modules/kraken2/kraken2/main' -include { METAPHLAN3 } from '../modules/nf-core/modules/metaphlan3/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -97,6 +95,7 @@ workflow TAXPROFILER { DB_CHECK ( ch_databases ) + ch_versions = ch_versions.mix(DB_CHECK.out.versions) /* MODULE: Run FastQC @@ -113,6 +112,7 @@ workflow TAXPROFILER { SUBWORKFLOW: PERFORM PREPROCESSING */ if ( params.shortread_clipmerge ) { + ch_shortreads_preprocessed = SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq ).reads } else { ch_shortreads_preprocessed = INPUT_CHECK.out.fastq @@ -146,73 +146,13 @@ workflow TAXPROFILER { } else { ch_shortreads_hostremoved = ch_shortreads_filtered } - + /* - COMBINE READS WITH POSSIBLE DATABASES + SUBWORKFLOW: PROFILING */ - // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] - ch_input_for_profiling = ch_shortreads_hostremoved - .mix( ch_longreads_preprocessed ) - .combine(DB_CHECK.out.dbs) - .branch { - malt: it[2]['tool'] == 'malt' - kraken2: it[2]['tool'] == 'kraken2' - metaphlan3: it[2]['tool'] == 'metaphlan3' - unknown: true - } - - /* - PREPARE PROFILER INPUT CHANNELS - */ - - // We groupTuple to have all samples in one channel for MALT as database - // loading takes a long time, so we only want to run it once per database - // TODO document somewhere we only accept illumina short reads for MALT? - ch_input_for_malt = ch_input_for_profiling.malt - .filter { it[0]['instrument_platform'] == 'ILLUMINA' } - .map { - it -> - def temp_meta = [ id: it[2]['db_name']] + it[2] - def db = it[3] - [ temp_meta, it[1], db ] - } - .groupTuple(by: [0,2]) - .multiMap { - it -> - reads: [ it[0], it[1].flatten() ] - db: it[2] - } - - // We can run Kraken2 one-by-one sample-wise - ch_input_for_kraken2 = ch_input_for_profiling.kraken2 - .multiMap { - it -> - reads: [ it[0] + it[2], it[1] ] - db: it[3] - } - - ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3 - .multiMap { - it -> - reads: [it[0] + it[2], it[1]] - db: it[3] - } - - /* - MODULE: RUN PROFILING - */ - if ( params.run_malt ) { - MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) - } - - if ( params.run_kraken2 ) { - KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) - } - - if ( params.run_metaphlan3 ) { - METAPHLAN3 ( ch_input_for_metaphlan3.reads, ch_input_for_metaphlan3.db ) - } + PROFILING ( ch_shortreads_hostremoved ch_longreads_preprocessed, DB_CHECK.out.dbs ) + ch_versions = ch_versions.mix( PROFILING.out.versions ) /* MODULE: MultiQC @@ -251,18 +191,9 @@ workflow TAXPROFILER { if (params.shortread_hostremoval) { ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_HOSTREMOVAL.out.mqc.collect{it[1]}.ifEmpty([])) } + + ch_multiqc_files = ch_multiqc_files.mix( PROFILING.out.mqc ) - if (params.run_kraken2) { - ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) - ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) - } - - if (params.run_malt) { - ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) - ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() ) - } - - // TODO Versions for Karken/MALT not report? // TODO create multiQC module for metaphlan MULTIQC ( ch_multiqc_files.collect()