diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a8078e5..b8975b5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -46,6 +46,14 @@ jobs:
           wget -qO- get.nextflow.io | bash
           sudo mv nextflow /usr/local/bin/
 
+      - name: Show current locale
+        run: locale
+
+      - name: Set UTF-8 enabled locale
+        run: |
+          sudo locale-gen en_US.UTF-8
+          sudo update-locale LANG=en_US.UTF-8
+
       - name: Run pipeline with test data
         # TODO nf-core: You can customise CI pipeline run tests as required
         # For example: adding multiple test runs with different parameters
diff --git a/CITATIONS.md b/CITATIONS.md
index 8f286b0..b18f841 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -34,6 +34,10 @@
 
   > Vågene, Åshild J., Alexander Herbig, Michael G. Campana, Nelly M. Robles García, Christina Warinner, Susanna Sabin, Maria A. Spyrou, et al. 2018. “Salmonella Enterica Genomes from Victims of a Major Sixteenth-Century Epidemic in Mexico.” Nature Ecology & Evolution 2 (3): 520-28. doi: 10.1038/s41559-017-0446-6.
 
+- [MetaPhlAn3](https://doi.org/10.7554/eLife.65088)
+
+  > Beghini, Francesco, Lauren J McIver, Aitor Blanco-Míguez, Leonard Dubois, Francesco Asnicar, Sagun Maharjan, Ana Mailyan, et al. 2021. “Integrating Taxonomic, Functional, and Strain-Level Profiling of Diverse Microbial Communities with BioBakery 3.” Edited by Peter Turnbaugh, Eduardo Franco, and C Titus Brown. ELife 10 (May): e65088.
+
 ## Software packaging/containerisation tools
 
 - [Anaconda](https://anaconda.com)
diff --git a/conf/modules.config b/conf/modules.config
index 41faa62..3fa2c98 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -187,6 +187,15 @@ process {
         ]
     }
 
+    withName: METAPHLAN3 {
+        publishDir = [
+            path: { "${params.outdir}/metaphlan3/${meta.db_name}" },
+            mode: params.publish_dir_mode,
+            pattern: '*.{biom,txt}'
+        ]
+        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
+    }
+
     withName: CUSTOM_DUMPSOFTWAREVERSIONS {
         publishDir = [
             path: { "${params.outdir}/pipeline_info" },
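Note: the `ext.prefix` closure in the METAPHLAN3 block above is resolved per task against that task's `meta` map; the same fields drive the per-database `publishDir` path. A minimal Groovy sketch of how it resolves (the field values here are hypothetical, not taken from the PR):

    // Hypothetical meta map, with the fields the pipeline attaches upstream
    def meta = [ id: 'sample1', run_accession: 'ERR0000001', db_name: 'mpa_v30' ]

    // Same closure shape as in conf/modules.config
    def prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }

    assert prefix() == 'sample1-ERR0000001-mpa_v30'
    // ...so the module writes e.g. sample1-ERR0000001-mpa_v30_profile.txt
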
diff --git a/conf/test.config b/conf/test.config
index 90ea241..cf1d616 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -26,6 +26,7 @@ params {
     databases        = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
     run_kraken2      = true
     run_malt         = true
+    run_metaphlan3   = true
     shortread_clipmerge = true
     shortread_hostremoval = true
     shortread_hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
diff --git a/modules.json b/modules.json
index 7395d68..398c351 100644
--- a/modules.json
+++ b/modules.json
@@ -30,6 +30,9 @@
         "malt/run": {
             "git_sha": "72b96f4e504eef673f2b5c13560a9d90b669129b"
         },
+        "metaphlan3": {
+            "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
+        },
         "multiqc": {
             "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
         },
diff --git a/modules/local/ensure_fastq_extension.nf b/modules/local/ensure_fastq_extension.nf
new file mode 100644
index 0000000..6de223b
--- /dev/null
+++ b/modules/local/ensure_fastq_extension.nf
@@ -0,0 +1,31 @@
+process ENSURE_FASTQ_EXTENSION {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda (params.enable_conda ? "conda-forge::bash=5.0" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv2/biocontainers_v1.2.0_cv2.img' :
+        'biocontainers/biocontainers:v1.2.0_cv2' }"
+
+
+    input:
+    tuple val(meta), path(reads)
+
+    output:
+    tuple val(meta), path('*.fastq.gz'), emit: reads
+
+    script:
+    if (meta.single_end) {
+        fastq = "${reads.baseName}.fastq.gz"
+        """
+        ln -s '${reads}' '${fastq}'
+        """
+    } else {
+        first = "${reads[0].baseName}.fastq.gz"
+        second = "${reads[1].baseName}.fastq.gz"
+        """
+        ln -s '${reads[0]}' '${first}'
+        ln -s '${reads[1]}' '${second}'
+        """
+    }
+}
diff --git a/modules/nf-core/modules/metaphlan3/main.nf b/modules/nf-core/modules/metaphlan3/main.nf
new file mode 100644
index 0000000..3fc6b27
--- /dev/null
+++ b/modules/nf-core/modules/metaphlan3/main.nf
@@ -0,0 +1,45 @@
+process METAPHLAN3 {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda (params.enable_conda ? 'bioconda::metaphlan=3.0.12' : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/metaphlan:3.0.12--pyhb7b1952_0' :
+        'quay.io/biocontainers/metaphlan:3.0.12--pyhb7b1952_0' }"
+
+    input:
+    tuple val(meta), path(input)
+    path metaphlan_db
+
+    output:
+    tuple val(meta), path("*_profile.txt")   , emit: profile
+    tuple val(meta), path("*.biom")          , emit: biom
+    tuple val(meta), path('*.bowtie2out.txt'), optional:true, emit: bt2out
+    path "versions.yml"                      , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def input_type = ("$input".endsWith(".fastq.gz")) ? "--input_type fastq" : ("$input".contains(".fasta")) ? "--input_type fasta" : ("$input".endsWith(".bowtie2out.txt")) ? "--input_type bowtie2out" : "--input_type sam"
+    def input_data = ("$input_type".contains("fastq")) && !meta.single_end ? "${input[0]},${input[1]}" : "$input"
+    def bowtie2_out = "$input_type" == "--input_type bowtie2out" || "$input_type" == "--input_type sam" ? '' : "--bowtie2out ${prefix}.bowtie2out.txt"
+
+    """
+    metaphlan \\
+        --nproc $task.cpus \\
+        $input_type \\
+        $input_data \\
+        $args \\
+        $bowtie2_out \\
+        --bowtie2db ${metaphlan_db} \\
+        --biom ${prefix}.biom \\
+        --output_file ${prefix}_profile.txt
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        metaphlan3: \$(metaphlan --version 2>&1 | awk '{print \$3}')
+    END_VERSIONS
+    """
+}
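Note: METAPHLAN3 infers `--input_type` purely from the input file name, via the chained ternary in the script block. This also appears to be the motivation for the local ENSURE_FASTQ_EXTENSION module above: AdapterRemoval outputs such as `*.collapsed.gz` carry no `.fastq.gz` suffix and would otherwise fall through to `--input_type sam`. A standalone Groovy sketch of the mapping, with made-up file names:

    // Reproduces the extension-sniffing logic of the module's script block
    def inputTypeFor = { String name ->
        name.endsWith('.fastq.gz')       ? '--input_type fastq'      :
        name.contains('.fasta')          ? '--input_type fasta'      :
        name.endsWith('.bowtie2out.txt') ? '--input_type bowtie2out' :
                                           '--input_type sam'
    }

    assert inputTypeFor('sample1_1.fastq.gz')     == '--input_type fastq'
    assert inputTypeFor('sample1.fasta.gz')       == '--input_type fasta'
    assert inputTypeFor('sample1.bowtie2out.txt') == '--input_type bowtie2out'
    assert inputTypeFor('sample1.collapsed.gz')   == '--input_type sam'   // why the renaming matters
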
diff --git a/modules/nf-core/modules/metaphlan3/meta.yml b/modules/nf-core/modules/metaphlan3/meta.yml
new file mode 100644
index 0000000..d10a27d
--- /dev/null
+++ b/modules/nf-core/modules/metaphlan3/meta.yml
@@ -0,0 +1,52 @@
+name: metaphlan3
+description: MetaPhlAn is a tool for profiling the composition of microbial communities from metagenomic shotgun sequencing data.
+keywords:
+  - metagenomics
+  - classification
+  - fastq
+  - bam
+  - fasta
+tools:
+  - metaphlan3:
+      description: Identify clades (phyla to species) present in the metagenome obtained from a microbiome sample and their relative abundance
+      homepage: https://huttenhower.sph.harvard.edu/metaphlan/
+      documentation: https://github.com/biobakery/MetaPhlAn
+      doi: "10.7554/eLife.65088"
+      licence: ["MIT License"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - input:
+      type: file
+      description: MetaPhlAn 3.0 can classify the metagenome from a variety of input data types, including FASTQ files (single-end and paired-end), FASTA, bowtie2-produced SAM files (produced from alignments to the MetaPhlAn marker database) and intermediate bowtie2 alignment files (bowtie2out)
+      pattern: "*.{fastq.gz,fasta,fasta.gz,sam,bowtie2out.txt}"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - profile:
+      type: file
+      description: Tab-separated output file of the predicted taxon relative abundances
+      pattern: "*.{txt}"
+  - biom:
+      type: file
+      description: General-use format for representing biological sample by observation contingency tables
+      pattern: "*.{biom}"
+  - bowtie2out:
+      type: file
+      description: Intermediate Bowtie2 output produced from mapping the metagenome against the MetaPhlAn marker database (not compatible with `bowtie2out` files generated with MetaPhlAn versions below 3)
+      pattern: "*.{bowtie2out.txt}"
+
+authors:
+  - "@MGordon09"
diff --git a/nextflow.config b/nextflow.config
index e559a85..384bb3c 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -76,6 +76,9 @@ params {
 
     // kraken2
     run_kraken2    = false
+
+    // metaphlan3
+    run_metaphlan3 = false
 }
 
 // Load base.config by default for all pipelines
@@ -160,7 +163,7 @@ if (!params.igenomes_ignore) {
 // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable.
 
 env {
-    PYTHONNOUSERSITE = 1
+    PYTHONNOUSERSITE = '1'
     R_PROFILE_USER   = "/.Rprofile"
     R_ENVIRON_USER   = "/.Renviron"
     JULIA_DEPOT_PATH = "/usr/local/share/julia"
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 0b5162b..4f14daa 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -281,6 +281,10 @@
         "run_kraken2": {
             "type": "boolean"
         },
+        "run_metaphlan3": {
+            "type": "boolean",
+            "description": "Enable MetaPhlAn3 for taxonomic profiling"
+        },
         "shortread_clipmerge_tool": {
             "type": "string",
             "default": "fastp",
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index ec404c2..e8669b3 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -31,9 +31,9 @@ workflow INPUT_CHECK {
         .set { fasta }
 
     emit:
-    fastq                                     // channel: [ val(meta), [ reads ] ]
-    nanopore                                  // channel: [ val(meta), [ reads ] ]
-    fasta                                     // channel: [ val(meta), fasta ]
+    fastq = fastq ?: []                       // channel: [ val(meta), [ reads ] ]
+    nanopore = nanopore ?: []                 // channel: [ val(meta), [ reads ] ]
+    fasta = fasta ?: []                       // channel: [ val(meta), fasta ]
     versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
 }
 
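Note: the `?:` (Elvis) operator in the emit block substitutes an empty list when the corresponding channel variable is null, so downstream operators never receive a null. A two-assert Groovy sketch of the semantics (values hypothetical):

    def fastq = null
    assert (fastq ?: []) == []                       // null falls back to the empty list
    fastq = [ [id:'sample1'], 'sample1_1.fastq.gz' ]
    assert (fastq ?: []) == [ [id:'sample1'], 'sample1_1.fastq.gz' ]  // non-null passes through
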
diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf
index 5e005db..e522a1a 100644
--- a/subworkflows/local/shortread_adapterremoval.nf
+++ b/subworkflows/local/shortread_adapterremoval.nf
@@ -5,6 +5,11 @@ Process short raw reads with AdapterRemoval
 include { ADAPTERREMOVAL as ADAPTERREMOVAL_SINGLE } from '../../modules/nf-core/modules/adapterremoval/main'
 include { ADAPTERREMOVAL as ADAPTERREMOVAL_PAIRED } from '../../modules/nf-core/modules/adapterremoval/main'
 include { CAT_FASTQ                               } from '../../modules/nf-core/modules/cat/fastq/main'
+include {
+    ENSURE_FASTQ_EXTENSION as ENSURE_FASTQ_EXTENSION1;
+    ENSURE_FASTQ_EXTENSION as ENSURE_FASTQ_EXTENSION2;
+    ENSURE_FASTQ_EXTENSION as ENSURE_FASTQ_EXTENSION3;
+} from '../../modules/local/ensure_fastq_extension'
 
 workflow SHORTREAD_ADAPTERREMOVAL {
 
@@ -24,63 +29,101 @@
     ADAPTERREMOVAL_SINGLE ( ch_input_for_adapterremoval.single, [] )
     ADAPTERREMOVAL_PAIRED ( ch_input_for_adapterremoval.paired, [] )
 
-    // due to the slightly ugly output implementation of the current AdapterRemoval2 version, each file
-    // has to be exported in a separate channel, and we must manually recombine when necessary
+    /*
+     * Due to the ~slightly~ very ugly output implementation of the current AdapterRemoval2 version, each file
+     * has to be exported in a separate channel and we must manually recombine when necessary.
+     */
     if ( params.shortread_clipmerge_mergepairs && !params.shortread_clipmerge_excludeunmerged ) {
-        ch_adapterremoval_for_cat = ADAPTERREMOVAL_PAIRED.out.collapsed
-            .mix(
-                ADAPTERREMOVAL_PAIRED.out.collapsed_truncated,
-                ADAPTERREMOVAL_PAIRED.out.singles_truncated,
-                ADAPTERREMOVAL_PAIRED.out.pair1_truncated,
-                ADAPTERREMOVAL_PAIRED.out.pair2_truncated
-            )
-            .map {
-                meta, reads ->
-                    def meta_new = meta.clone()
-                    meta_new.single_end = true
-                    [ meta_new, reads ]
-            }
-            .groupTuple()
+        ENSURE_FASTQ_EXTENSION1(
+            Channel.empty().mix(
+                ADAPTERREMOVAL_PAIRED.out.collapsed,
+                ADAPTERREMOVAL_PAIRED.out.collapsed_truncated,
+                ADAPTERREMOVAL_PAIRED.out.singles_truncated,
+                ADAPTERREMOVAL_PAIRED.out.pair1_truncated,
+                ADAPTERREMOVAL_PAIRED.out.pair2_truncated
+            )
+            .map { meta, reads ->
+                meta.single_end = true
+                [meta, reads]
+            }
+        )
 
-        ch_adapterremoval_reads_prepped = CAT_FASTQ ( ch_adapterremoval_for_cat ).reads
-            .mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated )
+        CAT_FASTQ(
+            ENSURE_FASTQ_EXTENSION1.out.reads
+                .groupTuple()
+        )
+
+        ENSURE_FASTQ_EXTENSION2(ADAPTERREMOVAL_SINGLE.out.singles_truncated)
+
+        ch_adapterremoval_reads_prepped = CAT_FASTQ.out.reads
+            .mix(ENSURE_FASTQ_EXTENSION2.out.reads)
 
     } else if ( params.shortread_clipmerge_mergepairs && params.shortread_clipmerge_excludeunmerged ) {
-        ch_adapterremoval_for_cat = ADAPTERREMOVAL_PAIRED.out.collapsed
-            .mix( ADAPTERREMOVAL_PAIRED.out.collapsed_truncated )
-            .map {
-                meta, reads ->
-                    def meta_new = meta.clone()
-                    meta_new['single_end'] = true
-                    [ meta_new, reads ]
-            }
-            .groupTuple(by: 0)
+        ENSURE_FASTQ_EXTENSION1(
+            Channel.empty().mix(
+                ADAPTERREMOVAL_PAIRED.out.collapsed,
+                ADAPTERREMOVAL_PAIRED.out.collapsed_truncated
+            )
+            .map { meta, reads ->
+                meta.single_end = true
+                [meta, reads]
+            }
+        )
 
-        ch_adapterremoval_reads_prepped = CAT_FASTQ ( ch_adapterremoval_for_cat ).reads
-            .mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated )
+        CAT_FASTQ(
+            ENSURE_FASTQ_EXTENSION1.out.reads
+                .groupTuple()
+        )
+
+        ENSURE_FASTQ_EXTENSION2(ADAPTERREMOVAL_SINGLE.out.singles_truncated)
+
+        ch_adapterremoval_reads_prepped = CAT_FASTQ.out.reads
+            .mix(ENSURE_FASTQ_EXTENSION2.out.reads)
 
     } else {
-        ch_adapterremoval_reads_prepped = ADAPTERREMOVAL_PAIRED.out.pair1_truncated
-            .join( ADAPTERREMOVAL_PAIRED.out.pair2_truncated )
-            .groupTuple()
-            .map { meta, pair1, pair2 ->
-                [ meta, [ pair1, pair2 ].flatten() ]
-            }
-            .mix( ADAPTERREMOVAL_SINGLE.out.singles_truncated )
-    }
+        ENSURE_FASTQ_EXTENSION1(
+            ADAPTERREMOVAL_PAIRED.out.pair1_truncated
+                .map { meta, reads ->
+                    meta.single_end = true
+                    [meta, reads]
+                }
+        )
 
-    ch_processed_reads = ch_adapterremoval_reads_prepped
+        ENSURE_FASTQ_EXTENSION2(
+            ADAPTERREMOVAL_PAIRED.out.pair2_truncated
+                .map { meta, reads ->
+                    meta.single_end = true
+                    [meta, reads]
+                }
+        )
+
+        ENSURE_FASTQ_EXTENSION3(ADAPTERREMOVAL_SINGLE.out.singles_truncated)
+
+        ch_adapterremoval_reads_prepped = ENSURE_FASTQ_EXTENSION1.out.reads
+            .join(ENSURE_FASTQ_EXTENSION2.out.reads)
+            .groupTuple()
+            .map { meta, pair1, pair2 ->
+                meta.single_end = false
+                [ meta, [ pair1, pair2 ].flatten() ]
+            }
+            .mix(ENSURE_FASTQ_EXTENSION3.out.reads)
+
+    }
 
     ch_versions = ch_versions.mix( ADAPTERREMOVAL_SINGLE.out.versions.first() )
     ch_versions = ch_versions.mix( ADAPTERREMOVAL_PAIRED.out.versions.first() )
-    ch_multiqc_files = ch_multiqc_files.mix( ADAPTERREMOVAL_PAIRED.out.log.collect{it[1]}, ADAPTERREMOVAL_SINGLE.out.log.collect{it[1]} )
+    ch_multiqc_files = ch_multiqc_files.mix(
+        ADAPTERREMOVAL_PAIRED.out.log.collect{it[1]},
+        ADAPTERREMOVAL_SINGLE.out.log.collect{it[1]}
+    )
 
     emit:
-    reads    = ch_processed_reads              // channel: [ val(meta), [ reads ] ]
-    versions = ch_versions                     // channel: [ versions.yml ]
+    reads    = ch_adapterremoval_reads_prepped // channel: [ val(meta), [ reads ] ]
+    versions = ch_versions                     // channel: [ versions.yml ]
     mqc      = ch_multiqc_files
 }
+
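Note: in the unmerged branch above, pair1 and pair2 travel through ENSURE_FASTQ_EXTENSION as separate single-end channels and are only recombined afterwards. A standalone Nextflow sketch of that `join`/`groupTuple`/`map` recombination, with hypothetical sample and file names:

    workflow {
        ch_pair1 = Channel.of( [ [id:'sample1', single_end:true], 'sample1_1.fastq.gz' ] )
        ch_pair2 = Channel.of( [ [id:'sample1', single_end:true], 'sample1_2.fastq.gz' ] )

        ch_pair1
            .join(ch_pair2)                           // key = meta -> [ meta, read1, read2 ]
            .groupTuple()
            .map { meta, pair1, pair2 ->
                meta.single_end = false
                [ meta, [ pair1, pair2 ].flatten() ]  // -> [ meta, [ read1, read2 ] ]
            }
            .view()
    }
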
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index 631aee6..496ef88 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -70,7 +70,7 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/
 include { CAT_FASTQ                   } from '../modules/nf-core/modules/cat/fastq/main'
 include { MALT_RUN                    } from '../modules/nf-core/modules/malt/run/main'
 include { KRAKEN2_KRAKEN2             } from '../modules/nf-core/modules/kraken2/kraken2/main'
-
+include { METAPHLAN3                  } from '../modules/nf-core/modules/metaphlan3/main'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -147,6 +147,7 @@ workflow TAXPROFILER {
         .branch {
             malt: it[2]['tool'] == 'malt'
             kraken2: it[2]['tool'] == 'kraken2'
+            metaphlan3: it[2]['tool'] == 'metaphlan3'
             unknown: true
         }
 
@@ -180,6 +181,14 @@ workflow TAXPROFILER {
                 db: it[3]
         }
 
+    ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3
+        .dump(tag: "input_metaphlan3")
+        .multiMap {
+            it ->
+                reads: [it[0] + it[2], it[1]]
+                db: it[3]
+        }
+
     /*
         MODULE: RUN PROFILING
     */
@@ -191,6 +200,10 @@ workflow TAXPROFILER {
         KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db )
     }
 
+    if ( params.run_metaphlan3 ) {
+        METAPHLAN3 ( ch_input_for_metaphlan3.reads, ch_input_for_metaphlan3.db )
+    }
+
     /*
         MODULE: MultiQC
     */
@@ -227,6 +240,7 @@ workflow TAXPROFILER {
 
     // TODO MALT results overwriting per database?
     // TODO Versions for Kraken/MALT not reported?
+    // TODO create MultiQC module for MetaPhlAn
     MULTIQC (
         ch_multiqc_files.collect()
    )
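Note: in the `multiMap` above, `it[0] + it[2]` merges the sample meta map with the database meta map before the reads reach METAPHLAN3, which appears to be what exposes `meta.db_name` to the `ext.prefix` and `publishDir` closures in conf/modules.config. A Groovy sketch of the map merge, with hypothetical values:

    def sample_meta = [ id: 'sample1', run_accession: 'ERR0000001', single_end: true ]
    def db_meta     = [ tool: 'metaphlan3', db_name: 'mpa_v30' ]

    // Groovy map addition: right-hand keys are appended (and override on clash)
    assert sample_meta + db_meta == [
        id: 'sample1', run_accession: 'ERR0000001', single_end: true,
        tool: 'metaphlan3', db_name: 'mpa_v30'
    ]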