Merge branch 'dev' into hostremoval

2024-11-26 01:19:54 +00:00 · 2022-04-07 14:00:10 +02:00 · 2022-04-07 14:00:10 +02:00 · b554aa3e4d
commit b554aa3e4d
parent a76576c16b 4c4475c316
19 changed files with 503 additions and 160 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -29,8 +29,16 @@ jobs:
          - NXF_VER: ""
            NXF_EDGE: "1"
        parameters:
          - "--longread_clip false"
          - "--shortread_clip false"
          - "--shortread_clipmerge_tool fastp"
          - "--shortread_clipmerge_tool fastp --shortread_clipmerge_mergepairs --shortread_clipmerge_excludeunmerged"
          - "--shortread_clipmerge_tool fastp --shortread_clipmerge_mergepairs"
          - "--shortread_clipmerge_tool adapterremoval"
          - "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs --shortread_clipmerge_excludeunmerged"
          - "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs"
          - "--shortread_complexityfilter_tool bbduk"
          - "--shortread_complexityfilter_tool prinseq"
    steps:
      - name: Check out pipeline code
--- a/CITATIONS.md
+++ b/CITATIONS.md
@ -18,21 +18,27 @@
 - [fastp](https://doi.org/10.1093/bioinformatics/bty560)
-  > Chen, Shifu, Yanqing Zhou, Yaru Chen, and Jia Gu. 2018. “Fastp: An Ultra-Fast All-in-One FASTQ Preprocessor.” Bioinformatics 34 (17): i884-90. 10.1093/bioinformatics/bty560.
+  > Chen, Shifu, Yanqing Zhou, Yaru Chen, and Jia Gu. 2018. Fastp: An Ultra-Fast All-in-One FASTQ Preprocessor. Bioinformatics 34 (17): i884-90. 10.1093/bioinformatics/bty560.
 - [AdapterRemoval2](https://doi.org/10.1186/s13104-016-1900-2)
-  > Schubert, Mikkel, Stinus Lindgreen, and Ludovic Orlando. 2016. “AdapterRemoval v2: Rapid Adapter Trimming, Identification, and Read Merging.” BMC Research Notes 9 (February): 88. doi:10.1186/s13104-016-1900-2.
+  > Schubert, Mikkel, Stinus Lindgreen, and Ludovic Orlando. 2016. AdapterRemoval v2: Rapid Adapter Trimming, Identification, and Read Merging. BMC Research Notes 9 (February): 88. doi:10.1186/s13104-016-1900-2.
 - [Porechop](https://github.com/rrwick/Porechop)
 - [BBTools](http://sourceforge.net/projects/bbmap/)
 - [PRINSEQ++](https://doi.org/10.7287/peerj.preprints.27553v1)
  > Cantu, Vito Adrian, Jeffrey Sadural, and Robert Edwards. 2019. PRINSEQ++, a Multi-Threaded Tool for Fast and Efficient Quality Control and Preprocessing of Sequencing Datasets. e27553v1. PeerJ Preprints. doi: 10.7287/peerj.preprints.27553v1.
 - [Kraken2](https://doi.org/10.1186/s13059-019-1891-0)
-  > Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. “Improved Metagenomic Analysis with Kraken 2.” Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0.
+  > Wood, Derrick E., Jennifer Lu, and Ben Langmead. 2019. Improved Metagenomic Analysis with Kraken 2. Genome Biology 20 (1): 257. doi: 10.1186/s13059-019-1891-0.
 - [MALT](https://doi.org/10.1038/s41559-017-0446-6)
-  > Vågene, Åshild J., Alexander Herbig, Michael G. Campana, Nelly M. Robles García, Christina Warinner, Susanna Sabin, Maria A. Spyrou, et al. 2018. “Salmonella Enterica Genomes from Victims of a Major Sixteenth-Century Epidemic in Mexico.” Nature Ecology & Evolution 2 (3): 520-28. doi: 10.1038/s41559-017-0446-6.
+  > Vågene, Åshild J., Alexander Herbig, Michael G. Campana, Nelly M. Robles García, Christina Warinner, Susanna Sabin, Maria A. Spyrou, et al. 2018. Salmonella Enterica Genomes from Victims of a Major Sixteenth-Century Epidemic in Mexico. Nature Ecology & Evolution 2 (3): 520-28. doi: 10.1038/s41559-017-0446-6.
 - [MetaPhlAn3](https://doi.org/10.7554/eLife.65088)
--- a/conf/modules.config
+++ b/conf/modules.config
@ -12,12 +12,6 @@
 process {
    publishDir = [
        path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
        mode: params.publish_dir_mode,
        saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
    ]
    withName: SAMPLESHEET_CHECK {
        publishDir = [
            path: { "${params.outdir}/pipeline_info" },
@ -34,20 +28,12 @@ process {
        ]
    }
    withName: UNTAR {
        publishDir = [
            path: { "${params.outdir}/databases" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }
    withName: FASTQC {
        ext.args = '--quiet'
        ext.prefix = { "${meta.id}_${meta.run_accession}_raw" }
        publishDir = [
            path: { "${params.outdir}/fastqc/raw" },
-            mode: 'copy',
+            mode: params.publish_dir_mode,
            pattern: '*.html'
        ]
    }
@ -57,7 +43,7 @@ process {
        ext.prefix = { "${meta.id}_${meta.run_accession}_processed" }
        publishDir = [
            path: { "${params.outdir}/fastqc/processed" },
-            mode: 'copy',
+            mode: params.publish_dir_mode,
            pattern: '*.html'
        ]
    }
@ -73,8 +59,9 @@ process {
        ext.prefix = { "${meta.id}_${meta.run_accession}" }
        publishDir = [
            path: { "${params.outdir}/fastp" },
-            mode: 'copy',
+            mode: params.publish_dir_mode,
-            pattern: '*.fastq.gz'
+            pattern: '*.fastq.gz',
            enabled: params.save_preprocessed_reads
        ]
    }
@ -92,8 +79,9 @@ process {
        ext.prefix = { "${meta.id}_${meta.run_accession}" }
        publishDir = [
            path: { "${params.outdir}/fastp" },
-            mode: 'copy',
+            mode: params.publish_dir_mode,
-            pattern: '*.fastq.gz'
+            pattern: '*.fastq.gz',
            enabled: params.save_preprocessed_reads
        ]
    }
@ -108,8 +96,9 @@ process {
        ext.prefix = { "${meta.id}_${meta.run_accession}" }
        publishDir = [
            path: { "${params.outdir}/adapterremoval" },
-            mode: 'copy',
+            mode: params.publish_dir_mode,
-            pattern: '*.fastq.gz'
+            pattern: '*.fastq.gz',
            enabled: params.save_preprocessed_reads
        ]
    }
@ -127,8 +116,9 @@ process {
        ext.prefix = { "${meta.id}_${meta.run_accession}" }
        publishDir = [
            path: { "${params.outdir}/adapterremoval" },
-            mode: 'copy',
+            mode: params.publish_dir_mode,
-            pattern: '*.fastq.gz'
+            pattern: '*.fastq.gz',
            enabled: params.save_preprocessed_reads
        ]
    }
@ -136,8 +126,9 @@ process {
        ext.prefix = { "${meta.id}_${meta.run_accession}" }
        publishDir = [
            path: { "${params.outdir}/porechop" },
-            mode: 'copy',
+            mode: params.publish_dir_mode,
-            pattern: '*.fastq.gz'
+            pattern: '*.fastq.gz',
            enabled: params.save_preprocessed_reads
        ]
    }
@ -145,7 +136,7 @@ process {
        ext.prefix = { "${meta.id}_${meta.run_accession}" }
        publishDir = [
            path: { "${params.outdir}/bowtie2/build" },
-            mode: 'copy',
+            mode: params.publish_dir_mode,
            pattern: '*.bt2'
        ]
    }
@ -154,16 +145,37 @@ process {
        ext.prefix = { "${meta.id}_${meta.run_accession}" }
        publishDir = [
            path: { "${params.outdir}/bowtie2/align" },
-            mode: 'copy',
+            mode: params.publish_dir_mode,
            pattern: '*.{fastq.gz,bam}'
        ]
    }
-    withName: CAT_FASTQ {
+    withName: BBMAP_BBDUK {
        ext.args =  [
                "entropy=${params.shortread_complexityfilter_entropy}",
                "entropywindow=${params.shortread_complexityfilter_bbduk_windowsize}",
                params.shortread_complexityfilter_bbduk_mask ?  "entropymask=t" : "entropymask=f"
            ].join(' ').trim()
        ext.prefix = { "${meta.id}-${meta.run_accession}" }
        publishDir = [
-            path: { "${params.outdir}/prepared_sequences" },
+            path: { "${params.outdir}/bbduk/" },
-            mode: 'copy',
+            mode: params.publish_dir_mode,
-            pattern: '*.fastq.gz'
+            pattern: '*.{fastq.gz,log}',
            enabled: params.save_complexityfiltered_reads
        ]
    }
    // PRINSEQ++ complexity filtering.
    // The low-complexity filter is chosen from the pipeline parameters:
    //   mode == 'dust' -> -lc_dust    using shortread_complexityfilter_prinseqplusplus_dustscore
    //   otherwise      -> -lc_entropy using shortread_complexityfilter_entropy
    // Filtered reads and the log are only published when
    // params.save_complexityfiltered_reads is set.
    withName: PRINSEQPLUSPLUS {
        ext.args =  [
                params.shortread_complexityfilter_prinseqplusplus_mode == 'dust' ? "-lc_dust=${params.shortread_complexityfilter_prinseqplusplus_dustscore}" : "-lc_entropy=${params.shortread_complexityfilter_entropy}",
                // Quality-trimming thresholds are zeroed for BOTH read ends. The 3'
                // option (-trim_qual_right) was previously never passed because
                // -trim_qual_left was given twice.
                "-trim_qual_left=0 -trim_qual_right=0 -trim_qual_window=0 -trim_qual_step=0"
            ].join(' ').trim()
        ext.prefix = { "${meta.id}-${meta.run_accession}" }
        publishDir = [
            path: { "${params.outdir}/prinseqplusplus/" },
            mode: params.publish_dir_mode,
            pattern: '*{_good_out.fastq.gz,_good_out_R1.fastq.gz,_good_out_R2.fastq.gz,log}',
            enabled: params.save_complexityfiltered_reads
        ]
    }
@ -172,7 +184,7 @@ process {
        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
        publishDir = [
            path: { "${params.outdir}/malt/${meta.db_name}" },
-            mode: 'copy',
+            mode: params.publish_dir_mode,
            pattern: '*.{rma6,tab,text,sam,log}'
        ]
    }
@ -182,7 +194,7 @@ process {
        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
        publishDir = [
            path: { "${params.outdir}/kraken2/${meta.db_name}" },
-            mode: 'copy',
+            mode: params.publish_dir_mode,
            pattern: '*.{fastq.gz,txt}'
        ]
    }
--- a/conf/test.config
+++ b/conf/test.config
@ -28,7 +28,9 @@ params {
    run_malt                        = true
    run_metaphlan3                  = true
    shortread_clipmerge             = true
    longread_clip                   = false
    shortread_complexityfilter      = true
    shortread_clipmerge             = true
    shortread_hostremoval           = true
    shortread_hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
 }
--- a/modules.json
+++ b/modules.json
@ -4,7 +4,10 @@
    "repos": {
        "nf-core/modules": {
            "adapterremoval": {
-                "git_sha": "f0800157544a82ae222931764483331a81812012"
+                "git_sha": "879d42c5e28661fe0a5e744c9e2c515868f9e08a"
            },
            "bbmap/bbduk": {
                "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
            },
            "bowtie2/align": {
                "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
@ -39,6 +42,9 @@
            "porechop": {
                "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046"
            },
            "prinseqplusplus": {
                "git_sha": "f1c5384c31e985591716afdd732cf8c2ae29d05b"
            },
            "untar": {
                "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918"
            }
--- a/modules/nf-core/modules/adapterremoval/main.nf
+++ b/modules/nf-core/modules/adapterremoval/main.nf
@ -12,14 +12,13 @@ process ADAPTERREMOVAL {
    path(adapterlist)
    output:
-    tuple val(meta), path("${prefix}.truncated.gz")            , optional: true, emit: singles_truncated
+    tuple val(meta), path("${prefix}.truncated.fastq.gz")            , optional: true, emit: singles_truncated
-    tuple val(meta), path("${prefix}.discarded.gz")            , optional: true, emit: discarded
+    tuple val(meta), path("${prefix}.discarded.fastq.gz")            , optional: true, emit: discarded
-    tuple val(meta), path("${prefix}.pair1.truncated.gz")      , optional: true, emit: pair1_truncated
+    tuple val(meta), path("${prefix}.pair{1,2}.truncated.fastq.gz")  , optional: true, emit: paired_truncated
-    tuple val(meta), path("${prefix}.pair2.truncated.gz")      , optional: true, emit: pair2_truncated
+    tuple val(meta), path("${prefix}.collapsed.fastq.gz")            , optional: true, emit: collapsed
-    tuple val(meta), path("${prefix}.collapsed.gz")            , optional: true, emit: collapsed
+    tuple val(meta), path("${prefix}.collapsed.truncated.fastq.gz")  , optional: true, emit: collapsed_truncated
-    tuple val(meta), path("${prefix}.collapsed.truncated.gz")  , optional: true, emit: collapsed_truncated
+    tuple val(meta), path("${prefix}.paired.fastq.gz")               , optional: true, emit: paired_interleaved
-    tuple val(meta), path("${prefix}.paired.gz")               , optional: true, emit: paired_interleaved
+    tuple val(meta), path('*.settings')                              , emit: settings
    tuple val(meta), path('*.log')                             , emit: log
    path "versions.yml"                                              , emit: versions
    when:
@ -38,10 +37,19 @@ process ADAPTERREMOVAL {
            $adapterlist \\
            --basename ${prefix} \\
            --threads ${task.cpus} \\
            --settings ${prefix}.log \\
            --seed 42 \\
            --gzip
        ensure_fastq() {
            if [ -f "\${1}" ]; then
                mv "\${1}" "\${1::-3}.fastq.gz"
            fi
        }
        ensure_fastq '${prefix}.truncated.gz'
        ensure_fastq '${prefix}.discarded.gz'
        cat <<-END_VERSIONS > versions.yml
        "${task.process}":
            adapterremoval: \$(AdapterRemoval --version 2>&1 | sed -e "s/AdapterRemoval ver. //g")
@ -56,10 +64,24 @@ process ADAPTERREMOVAL {
            $adapterlist \\
            --basename ${prefix} \\
            --threads $task.cpus \\
            --settings ${prefix}.log \\
            --seed 42 \\
            --gzip
        ensure_fastq() {
            if [ -f "\${1}" ]; then
                mv "\${1}" "\${1::-3}.fastq.gz"
            fi
        }
        ensure_fastq '${prefix}.truncated.gz'
        ensure_fastq '${prefix}.discarded.gz'
        ensure_fastq '${prefix}.pair1.truncated.gz'
        ensure_fastq '${prefix}.pair2.truncated.gz'
        ensure_fastq '${prefix}.collapsed.gz'
        ensure_fastq '${prefix}.collapsed.truncated.gz'
        ensure_fastq '${prefix}.paired.gz'
        cat <<-END_VERSIONS > versions.yml
        "${task.process}":
            adapterremoval: \$(AdapterRemoval --version 2>&1 | sed -e "s/AdapterRemoval ver. //g")
--- a/modules/nf-core/modules/adapterremoval/meta.yml
+++ b/modules/nf-core/modules/adapterremoval/meta.yml
@ -43,43 +43,43 @@ output:
        Adapter trimmed FastQ files of either single-end reads, or singleton
        'orphaned' reads from merging of paired-end data (i.e., one of the pair
        was lost due to filtering thresholds).
-      pattern: "*.truncated.gz"
+      pattern: "*.truncated.fastq.gz"
  - discarded:
      type: file
      description: |
        Adapter trimmed FastQ files of reads that did not pass filtering
        thresholds.
-      pattern: "*.discarded.gz"
+      pattern: "*.discarded.fastq.gz"
  - pair1_truncated:
      type: file
      description: |
        Adapter trimmed R1 FastQ files of paired-end reads that did not merge
        with their respective R2 pair due to long templates. The respective pair
        is stored in 'pair2_truncated'.
-      pattern: "*.pair1.truncated.gz"
+      pattern: "*.pair1.truncated.fastq.gz"
  - pair2_truncated:
      type: file
      description: |
        Adapter trimmed R2 FastQ files of paired-end reads that did not merge
        with their respective R1 pair due to long templates. The respective pair
        is stored in 'pair1_truncated'.
-      pattern: "*.pair2.truncated.gz"
+      pattern: "*.pair2.truncated.fastq.gz"
  - collapsed:
      type: file
      description: |
        Collapsed FastQ of paired-end reads that successfully merged with their
        respective R1 pair but were not trimmed.
-      pattern: "*.collapsed.gz"
+      pattern: "*.collapsed.fastq.gz"
  - collapsed_truncated:
      type: file
      description: |
        Collapsed FastQ of paired-end reads that successfully merged with their
        respective R1 pair and were trimmed of adapter due to sufficient overlap.
-      pattern: "*.collapsed.truncated.gz"
+      pattern: "*.collapsed.truncated.fastq.gz"
  - log:
      type: file
      description: AdapterRemoval log file
-      pattern: "*.log"
+      pattern: "*.settings"
  - versions:
      type: file
      description: File containing software versions
--- a/modules/nf-core/modules/bbmap/bbduk/main.nf
+++ b/modules/nf-core/modules/bbmap/bbduk/main.nf
@ -0,0 +1,43 @@
 // Runs bbduk.sh from the BBMap suite on one sample's FASTQ file(s).
 //
 // Inputs:
 //   meta         - sample map; this process reads meta.id and meta.single_end
 //   reads        - one FASTQ (single-end) or two FASTQs (paired-end)
 //   contaminants - optional reference file; passed as `ref=` only when non-empty
 // Outputs:
 //   reads    - every *.fastq.gz produced in the work dir
 //   log      - every *.log produced (the script writes ${prefix}.bbduk.log)
 //   versions - versions.yml with the output of bbversion.sh
 process BBMAP_BBDUK {
    tag "$meta.id"
    label 'process_medium'
    conda (params.enable_conda ? "bioconda::bbmap=38.90" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/bbmap:38.90--he522d1c_1' :
        'quay.io/biocontainers/bbmap:38.90--he522d1c_1' }"
    input:
    tuple val(meta), path(reads)
    path contaminants
    output:
    tuple val(meta), path('*.fastq.gz'), emit: reads
    tuple val(meta), path('*.log')     , emit: log
    path "versions.yml"                , emit: versions
    when:
    task.ext.when == null || task.ext.when
    script:
    // Extra bbduk arguments and the output prefix come from the `ext` config;
    // the prefix falls back to the sample id.
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    // bbduk takes in=/out= for single-end data and in1=/in2=, out1=/out2= for pairs.
    def raw      = meta.single_end ? "in=${reads[0]}" : "in1=${reads[0]} in2=${reads[1]}"
    def trimmed  = meta.single_end ? "out=${prefix}.fastq.gz" : "out1=${prefix}_1.fastq.gz out2=${prefix}_2.fastq.gz"
    // An empty/falsy `contaminants` value yields no ref= argument at all.
    def contaminants_fa = contaminants ? "ref=$contaminants" : ''
    // The script rewrites the task memory string from "<n> GB" to "<n>g" for -Xmx,
    // and redirects both stdout and stderr of bbduk.sh into ${prefix}.bbduk.log.
    // NOTE(review): the sed expression only matches the " GB" unit; a memory value
    // rendered in another unit (e.g. MB) would pass through unchanged — confirm
    // that tasks always request whole GB.
    """
    maxmem=\$(echo \"$task.memory\"| sed 's/ GB/g/g')
    bbduk.sh \\
        -Xmx\$maxmem \\
        $raw \\
        $trimmed \\
        threads=$task.cpus \\
        $args \\
        $contaminants_fa \\
        &> ${prefix}.bbduk.log
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        bbmap: \$(bbversion.sh)
    END_VERSIONS
    """
 }
--- a/modules/nf-core/modules/bbmap/bbduk/meta.yml
+++ b/modules/nf-core/modules/bbmap/bbduk/meta.yml
@ -0,0 +1,52 @@
 name: bbmap_bbduk
 description: Adapter and quality trimming of sequencing reads
 keywords:
  - trimming
  - adapter trimming
  - quality trimming
 tools:
  - bbmap:
      description: BBMap is a short read aligner, as well as various other bioinformatic tools.
      homepage: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/
      documentation: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/
      tool_dev_url: None
      doi: ""
      licence: ["UC-LBL license (see package)"]
 input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - reads:
      type: file
      description: |
        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
        respectively.
  - contaminants:
      type: file
      description: |
        Reference files containing adapter and/or contaminant sequences for sequence kmer matching
 output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - reads:
      type: file
      description: The trimmed/modified fastq reads
      pattern: "*fastq.gz"
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
  - log:
      type: file
      description: Bbduk log file
      pattern: "*bbduk.log"
 authors:
  - "@MGordon09"
--- a/modules/nf-core/modules/prinseqplusplus/main.nf
+++ b/modules/nf-core/modules/prinseqplusplus/main.nf
@ -0,0 +1,61 @@
 // Runs prinseq++ on one sample's FASTQ file(s) with gzipped output.
 //
 // Inputs:
 //   meta  - sample map; this process reads meta.id and meta.single_end
 //   reads - one FASTQ (single-end) or two FASTQs (paired-end)
 // Outputs:
 //   good_reads   - *_good_out*.fastq.gz (required)
 //   single_reads - *_single_out*.fastq.gz (optional)
 //   bad_reads    - *_bad_out*.fastq.gz (optional)
 //   log          - *.log (the script tees prinseq++ stdout into ${prefix}.log)
 //   versions     - versions.yml with the prinseq++ version
 process PRINSEQPLUSPLUS {
    tag "$meta.id"
    label 'process_low'
    conda (params.enable_conda ? "bioconda::prinseq-plus-plus=1.2.3" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/prinseq-plus-plus:1.2.3--hc90279e_1':
        'quay.io/biocontainers/prinseq-plus-plus:1.2.3--hc90279e_1' }"
    input:
    tuple val(meta), path(reads)
    output:
    tuple val(meta), path("*_good_out*.fastq.gz")                  , emit: good_reads
    tuple val(meta), path("*_single_out*.fastq.gz"), optional: true, emit: single_reads
    tuple val(meta), path("*_bad_out*.fastq.gz")   , optional: true, emit: bad_reads
    tuple val(meta), path("*.log")                                 , emit: log
    path "versions.yml"                                            , emit: versions
    when:
    task.ext.when == null || task.ext.when
    script:
    // Extra prinseq++ arguments and the output prefix come from the `ext` config;
    // the prefix falls back to the sample id.
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    // The two branches differ only in how the input is passed: a single -fastq
    // for single-end data, -fastq plus -fastq2 for paired-end data.
    // NOTE(review): prinseq++ is piped into `tee`, so its exit status is only
    // propagated if the task shell runs with pipefail — confirm in nextflow.config.
    if (meta.single_end) {
        """
        prinseq++ \\
            -threads $task.cpus \\
            -fastq ${reads} \\
            -out_name ${prefix} \\
            -out_gz \\
            -VERBOSE 1 \\
            $args \\
            | tee ${prefix}.log
        cat <<-END_VERSIONS > versions.yml
        "${task.process}":
            prinseqplusplus: \$(echo \$(prinseq++ --version | cut -f 2 -d ' ' ))
        END_VERSIONS
        """
    } else {
        """
        prinseq++ \\
            -threads $task.cpus \\
            -fastq ${reads[0]} \\
            -fastq2 ${reads[1]} \\
            -out_name ${prefix} \\
            -out_gz \\
            -VERBOSE 1 \\
            $args \\
            | tee ${prefix}.log
        cat <<-END_VERSIONS > versions.yml
        "${task.process}":
            prinseqplusplus: \$(echo \$(prinseq++ --version | cut -f 2 -d ' ' ))
        END_VERSIONS
        """
    }
 }
--- a/modules/nf-core/modules/prinseqplusplus/meta.yml
+++ b/modules/nf-core/modules/prinseqplusplus/meta.yml
@ -0,0 +1,60 @@
 name: "prinseqplusplus"
 description: PRINSEQ++ is a C++ implementation of the prinseq-lite.pl program. It can be used to filter, reformat or trim genomic and metagenomic sequence data
 keywords:
  - fastq
  - fasta
  - filter
  - trim
 tools:
  - "prinseqplusplus":
      description: "PRINSEQ++ - Multi-threaded C++ sequence cleaning"
      homepage: "https://github.com/Adrian-Cantu/PRINSEQ-plus-plus"
      documentation: "https://github.com/Adrian-Cantu/PRINSEQ-plus-plus"
      tool_dev_url: "https://github.com/Adrian-Cantu/PRINSEQ-plus-plus"
      doi: "10.7287/peerj.preprints.27553v1"
      licence: "['GPL v2']"
 input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - reads:
      type: file
      description: |
        List of input FastQ files of size 1 and 2 for single-end and paired-end
        data, respectively.
 output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
  - good_reads:
      type: file
      description: Reads passing filter(s) in gzipped FASTQ format
      pattern: "*_good_out_{R1,R2}.fastq.gz"
  - single_reads:
      type: file
      description: |
        Single reads without the pair passing filter(s) in gzipped FASTQ format
      pattern: "*_single_out_{R1,R2}.fastq.gz"
  - bad_reads:
      type: file
      description: |
        Reads not passing filter(s) in gzipped FASTQ format
      pattern: "*_bad_out_{R1,R2}.fastq.gz"
  - log:
      type: file
      description: |
        Verbose level 1 STDOUT information in a log file
      pattern: "*.log"
 authors:
  - "@jfy133"
--- a/nextflow.config
+++ b/nextflow.config
@ -51,7 +51,7 @@ params {
    max_cpus                   = 16
    max_time                   = '240.h'
-    // Databaess
+    // Databases
    databases = null
    // FASTQ preprocessing
@ -64,6 +64,18 @@ params {
    shortread_clipmerge_adapter2            = null
    shortread_clipmerge_minlength           = 15
    longread_clip                           = false
    save_preprocessed_reads                 = false
    // Complexity filtering
    shortread_complexityfilter                           = false
    shortread_complexityfilter_tool                      = 'bbduk'
    shortread_complexityfilter_entropy                   = 0.3
    shortread_complexityfilter_bbduk_windowsize          = 50
    shortread_complexityfilter_bbduk_mask                = false
    shortread_complexityfilter_prinseqplusplus_mode      = 'entropy'
    shortread_complexityfilter_prinseqplusplus_dustscore = 0.5
    save_complexityfiltered_reads                        = false
    // Host Removal
    shortread_hostremoval           = false
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@ -308,6 +308,41 @@
            "type": "integer",
            "default": 15
        },
        "save_preprocessed_reads": {
            "type": "boolean",
            "default": false
        },
        "shortread_complexityfilter_tool": {
            "type": "string",
            "default": "bbduk"
        },
        "shortread_complexityfilter_bbduk_windowsize": {
            "type": "integer",
            "default": 50
        },
        "shortread_complexityfilter_bbduk_mask": {
            "type": "boolean"
        },
        "shortread_complexityfilter": {
            "type": "boolean"
        },
        "shortread_complexityfilter_entropy": {
            "type": "number",
            "default": 0.3
        },
        "shortread_complexityfilter_prinseqplusplus_mode": {
            "type": "string",
            "default": "entropy",
            "enum": ["entropy", "dust"]
        },
        "shortread_complexityfilter_prinseqplusplus_dustscore": {
            "type": "number",
            "default": 0.5
        },
        "save_complexityfiltered_reads": {
            "type": "boolean",
            "default": false
        },
        "shortread_hostremoval": {
            "type": "boolean"
        },
--- a/subworkflows/local/longread_preprocessing.nf
+++ b/subworkflows/local/longread_preprocessing.nf
@ -1,6 +1,6 @@
-/*
+//
-Process long raw reads with porechop
+// Process long raw reads with porechop
-*/
+//
 include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/modules/fastqc/main'
 include { PORECHOP                   } from '../../modules/nf-core/modules/porechop/main'
@ -25,7 +25,7 @@ workflow LONGREAD_PREPROCESSING {
    FASTQC_PROCESSED ( PORECHOP.out.reads )
    ch_versions = ch_versions.mix(PORECHOP.out.versions.first())
-    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} )
+    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip )
    emit:
--- a/subworkflows/local/shortread_adapterremoval.nf
+++ b/subworkflows/local/shortread_adapterremoval.nf
@ -1,15 +1,10 @@
-/*
+//
-Process short raw reads with AdapterRemoval
+// Process short raw reads with AdapterRemoval
-*/
+//
 include { ADAPTERREMOVAL as ADAPTERREMOVAL_SINGLE       } from '../../modules/nf-core/modules/adapterremoval/main'
 include { ADAPTERREMOVAL as ADAPTERREMOVAL_PAIRED       } from '../../modules/nf-core/modules/adapterremoval/main'
 include { CAT_FASTQ                                     } from '../../modules/nf-core/modules/cat/fastq/main'
 include {
    ENSURE_FASTQ_EXTENSION as ENSURE_FASTQ_EXTENSION1;
    ENSURE_FASTQ_EXTENSION as ENSURE_FASTQ_EXTENSION2;
    ENSURE_FASTQ_EXTENSION as ENSURE_FASTQ_EXTENSION3;
 } from '../../modules/local/ensure_fastq_extension'
 workflow SHORTREAD_ADAPTERREMOVAL {
@ -36,89 +31,63 @@ workflow SHORTREAD_ADAPTERREMOVAL {
    if ( params.shortread_clipmerge_mergepairs && !params.shortread_clipmerge_excludeunmerged ) {
-        ENSURE_FASTQ_EXTENSION1(
+        ch_concat_fastq = Channel.empty()
-            Channel.empty().mix(
+            .mix(
                ADAPTERREMOVAL_PAIRED.out.collapsed,
                ADAPTERREMOVAL_PAIRED.out.collapsed_truncated,
                ADAPTERREMOVAL_PAIRED.out.singles_truncated,
-                ADAPTERREMOVAL_PAIRED.out.pair1_truncated,
+                ADAPTERREMOVAL_PAIRED.out.paired_truncated
                ADAPTERREMOVAL_PAIRED.out.pair2_truncated
            )
            .map { meta, reads ->
-                meta.single_end = true
+                def meta_new = meta.clone()
-                [meta, reads]
+                meta_new.single_end = true
                [meta_new, reads]
            }
        )
        CAT_FASTQ(
            ENSURE_FASTQ_EXTENSION1.out.reads
            .groupTuple()
-        )
+            // Paired-end reads cause a nested tuple during grouping.
            // We want to present a flat list of files to `CAT_FASTQ`.
            .map { meta, fastq -> [meta, fastq.flatten()] }
-        ENSURE_FASTQ_EXTENSION2(ADAPTERREMOVAL_SINGLE.out.singles_truncated)
+
        CAT_FASTQ(ch_concat_fastq)
        ch_adapterremoval_reads_prepped = CAT_FASTQ.out.reads
-            .mix(ENSURE_FASTQ_EXTENSION2.out.reads)
+            .mix(ADAPTERREMOVAL_SINGLE.out.singles_truncated)
    } else if ( params.shortread_clipmerge_mergepairs && params.shortread_clipmerge_excludeunmerged ) {
-        ENSURE_FASTQ_EXTENSION1(
+        ch_concat_fastq = Channel.empty()
-            Channel.empty().mix(
+            .mix(
                ADAPTERREMOVAL_PAIRED.out.collapsed,
                ADAPTERREMOVAL_PAIRED.out.collapsed_truncated
            )
            .map { meta, reads ->
-                meta.single_end = true
+                def meta_new = meta.clone()
-                [meta, reads]
+                meta_new.single_end = true
                [meta_new, reads]
            }
        )
        CAT_FASTQ(
            ENSURE_FASTQ_EXTENSION1.out.reads
            .groupTuple()
-        )
+            .map { meta, fastq -> [meta, fastq.flatten()] }
-        ENSURE_FASTQ_EXTENSION2(ADAPTERREMOVAL_SINGLE.out.singles_truncated)
+
        CAT_FASTQ(ch_concat_fastq)
        ch_adapterremoval_reads_prepped = CAT_FASTQ.out.reads
-            .mix(ENSURE_FASTQ_EXTENSION2.out.reads)
+            .mix(ADAPTERREMOVAL_SINGLE.out.singles_truncated)
    } else {
-        ENSURE_FASTQ_EXTENSION1(
+        ch_adapterremoval_reads_prepped = ADAPTERREMOVAL_PAIRED.out.paired_truncated
-            ADAPTERREMOVAL_PAIRED.out.pair1_truncated
+            .mix(ADAPTERREMOVAL_SINGLE.out.singles_truncated)
            .map { meta, reads ->
                meta.single_end = true
                [meta, reads]
            }
        )
        ENSURE_FASTQ_EXTENSION2(
            ADAPTERREMOVAL_PAIRED.out.pair2_truncated
            .map { meta, reads ->
                meta.single_end = true
                [meta, reads]
            }
        )
        ENSURE_FASTQ_EXTENSION3(ADAPTERREMOVAL_SINGLE.out.singles_truncated)
        ch_adapterremoval_reads_prepped = ENSURE_FASTQ_EXTENSION1.out.reads
            .join(ENSURE_FASTQ_EXTENSION2.out.reads)
            .groupTuple()
            .map { meta, pair1, pair2 ->
                meta.single_end = false
                [ meta, [ pair1, pair2 ].flatten() ]
            }
            .mix(ENSURE_FASTQ_EXTENSION3.out.reads)
    }
    ch_versions = ch_versions.mix( ADAPTERREMOVAL_SINGLE.out.versions.first() )
    ch_versions = ch_versions.mix( ADAPTERREMOVAL_PAIRED.out.versions.first() )
    ch_multiqc_files = ch_multiqc_files.mix(
-        ADAPTERREMOVAL_PAIRED.out.log.collect{it[1]},
+        ADAPTERREMOVAL_PAIRED.out.settings,
-        ADAPTERREMOVAL_SINGLE.out.log.collect{it[1]}
+        ADAPTERREMOVAL_SINGLE.out.settings
    )
    emit:
--- a/subworkflows/local/shortread_complexityfiltering.nf
+++ b/subworkflows/local/shortread_complexityfiltering.nf
@ -0,0 +1,32 @@
 //
 // Perform complexity filtering of short reads
 //
 include { BBMAP_BBDUK     } from '../../modules/nf-core/modules/bbmap/bbduk/main'
 include { PRINSEQPLUSPLUS } from '../../modules/nf-core/modules/prinseqplusplus/main'
 //
 // Route short reads through the tool selected with
 // params.shortread_complexityfilter_tool.
 //
 // take:
 //   reads    - channel: [ val(meta), [ reads ] ]
 // emit:
 //   reads    - channel: [ val(meta), [ reads ] ] output of the selected tool,
 //              or the untouched input when no known tool is selected
 //   versions - channel: [ versions.yml ]
 //   mqc      - channel of log outputs to hand to MultiQC (only BBDuk adds to it)
 //
 workflow SHORTREAD_COMPLEXITYFILTERING {
    take:
    reads // [ [ meta ], [ reads ] ]
    main:
    ch_versions       = Channel.empty()
    ch_multiqc_files  = Channel.empty()
    if ( params.shortread_complexityfilter_tool == 'bbduk' ) {
        // Second input is passed empty here.
        // NOTE(review): presumably the optional contaminants file of the module - confirm
        ch_filtered_reads = BBMAP_BBDUK ( reads, [] ).reads
        ch_versions       = ch_versions.mix( BBMAP_BBDUK.out.versions.first() )
        ch_multiqc_files  = ch_multiqc_files.mix( BBMAP_BBDUK.out.log )
    } else if ( params.shortread_complexityfilter_tool in [ 'prinseq', 'prinseqplusplus' ] ) {
        // Accept the short spelling 'prinseq' as well: it is the value used in the
        // CI test matrix and would otherwise silently skip filtering.
        ch_filtered_reads = PRINSEQPLUSPLUS ( reads ).good_reads
        ch_versions       = ch_versions.mix( PRINSEQPLUSPLUS.out.versions.first() )
    } else {
        // Keep the pass-through behaviour, but make it visible instead of silent.
        log.warn "[nf-core/taxprofiler] warning: unrecognised --shortread_complexityfilter_tool '${params.shortread_complexityfilter_tool}'. Short reads will not be complexity filtered."
        ch_filtered_reads = reads
    }
    emit:
    reads    = ch_filtered_reads    // channel: [ val(meta), [ reads ] ]
    versions = ch_versions          // channel: [ versions.yml ]
    mqc      = ch_multiqc_files
 }
--- a/subworkflows/local/shortread_fastp.nf
+++ b/subworkflows/local/shortread_fastp.nf
@ -1,6 +1,6 @@
-/*
+//
-Process short raw reads with FastP
+// Process short raw reads with FastP
-*/
+//
 include { FASTP as FASTP_SINGLE       } from '../../modules/nf-core/modules/fastp/main'
 include { FASTP as FASTP_PAIRED       } from '../../modules/nf-core/modules/fastp/main'
@ -44,8 +44,8 @@ workflow SHORTREAD_FASTP {
    ch_processed_reads = ch_fastp_reads_prepped
-    ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json.collect{it[1]} )
+    ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json )
-    ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json.collect{it[1]} )
+    ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json )
    emit:
    reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
--- a/subworkflows/local/shortread_preprocessing.nf
+++ b/subworkflows/local/shortread_preprocessing.nf
@ -1,5 +1,5 @@
 //
-// Check input samplesheet and get read channels
+// Perform read trimming and merging
 //
@ -9,7 +9,7 @@ include { FASTQC as FASTQC_PROCESSED       } from '../../modules/nf-core/modules
 workflow SHORTREAD_PREPROCESSING {
    take:
-    reads // file: /path/to/samplesheet.csv
+    reads //  [ [ meta ], [ reads ] ]
    main:
    ch_versions       = Channel.empty()
@ -29,7 +29,7 @@ workflow SHORTREAD_PREPROCESSING {
    FASTQC_PROCESSED ( ch_processed_reads )
    ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions )
-    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip.collect{it[1]} )
+    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip )
    emit:
    reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@ -18,8 +18,8 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true
 // Check mandatory parameters
 if (params.input    ) { ch_input     = file(params.input)     } else { exit 1, 'Input samplesheet not specified!' }
-if (params.databases                  ) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
+if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
-if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] warning: MALT does not except uncollapsed paired-reads. Pairs will be profiled as separate files."
+if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] warning: MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files."
 // Excluding unmerged reads is only meaningful when pair merging is enabled.
 if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "[nf-core/taxprofiler] error: cannot exclude unmerged reads when merging is not turned on. Please specify --shortread_clipmerge_mergepairs"
 // TODO Add check if index but no reference exit 1
@ -53,6 +53,7 @@ include { DB_CHECK                } from '../subworkflows/local/db_check'
 include { SHORTREAD_PREPROCESSING       } from '../subworkflows/local/shortread_preprocessing'
 include { LONGREAD_PREPROCESSING        } from '../subworkflows/local/longread_preprocessing'
 include { SHORTREAD_HOSTREMOVAL         } from '../subworkflows/local/shortread_hostremoval'
 include { SHORTREAD_COMPLEXITYFILTERING } from '../subworkflows/local/shortread_complexityfiltering'
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -100,7 +101,7 @@ workflow TAXPROFILER {
    /*
        MODULE: Run FastQC
    */
-    ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore ).dump(tag: "input_to_fastq")
+    ch_input_for_fastqc = INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore )
    FASTQC (
        ch_input_for_fastqc
@ -108,10 +109,6 @@ workflow TAXPROFILER {
    ch_versions = ch_versions.mix(FASTQC.out.versions.first())
    CUSTOM_DUMPSOFTWAREVERSIONS (
        ch_versions.unique().collectFile(name: 'collated_versions.yml')
    )
    /*
        SUBWORKFLOW: PERFORM PREPROCESSING
    */
@ -125,15 +122,29 @@ workflow TAXPROFILER {
        ch_longreads_preprocessed = LONGREAD_PREPROCESSING ( INPUT_CHECK.out.nanopore ).reads
                                        .map { it -> [ it[0], [it[1]] ] }
        ch_versions = ch_versions.mix(LONGREAD_PREPROCESSING.out.versions.first())
-    } else {SHORTREAD_HOSTREMOVAL
+    } else {
        ch_longreads_preprocessed = INPUT_CHECK.out.nanopore
    }
    /*
        SUBWORKFLOW: COMPLEXITY FILTERING
    */
    if ( params.shortread_complexityfilter ) {
        ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads
    } else {
        ch_shortreads_filtered = ch_shortreads_preprocessed
    }
    /*
        SUBWORKFLOW: HOST REMOVAL
    */
    if ( params.shortread_hostremoval ) {
-        ch_shortreads_hostremoved = SHORTREAD_HOSTREMOVAL ( ch_shortreads_preprocessed, ch_reference, ch_reference_index ).reads
+        ch_shortreads_hostremoved = SHORTREAD_HOSTREMOVAL ( ch_shortreads_filtered, ch_reference, ch_reference_index ).reads
        ch_versions = ch_versions.mix(SHORTREAD_HOSTREMOVAL.out.versions.first())
    } else {
-        ch_shortreads_hostremoved = ch_shortreads_preprocessed
+        ch_shortreads_hostremoved = ch_shortreads_filtered
    }
    /*
@ -182,7 +193,6 @@ workflow TAXPROFILER {
                            }
    ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3
                            .dump(tag: "input_metaphlan3")
                            .multiMap {
                                it ->
                                    reads: [it[0] + it[2], it[1]]
@ -207,6 +217,12 @@ workflow TAXPROFILER {
    /*
        MODULE: MultiQC
    */
    CUSTOM_DUMPSOFTWAREVERSIONS (
        ch_versions.unique().collectFile(name: 'collated_versions.yml')
    )
    workflow_summary    = WorkflowTaxprofiler.paramsSummaryMultiqc(workflow, summary_params)
    ch_workflow_summary = Channel.value(workflow_summary)
@ -218,27 +234,34 @@ workflow TAXPROFILER {
    ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([]))
    if (params.shortread_clipmerge) {
-        ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_PREPROCESSING.out.mqc)
+        ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) )
        ch_versions = ch_versions.mix( SHORTREAD_PREPROCESSING.out.versions )
    }
    if (params.longread_clip) {
        ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) )
        ch_versions = ch_versions.mix( LONGREAD_PREPROCESSING.out.versions )
    }
    if (params.shortread_complexityfilter){
        ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) )
        ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions )
    }
    if (params.shortread_hostremoval) {
        ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_HOSTREMOVAL.out.mqc.collect{it[1]}.ifEmpty([]))
    }
    if (params.longread_clip) {
        ch_multiqc_files = ch_multiqc_files.mix(LONGREAD_PREPROCESSING.out.mqc)
    }
    if (params.run_kraken2) {
-        ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]))
+        ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([])  )
-        ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions.first())
+        ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() )
-    }
+    }
-    if (params.run_malt) {
+
-        ch_multiqc_files = ch_multiqc_files.mix(MALT_RUN.out.log.collect{it[1]}.ifEmpty([]))
+    if (params.run_malt) {
-        ch_versions = ch_versions.mix(MALT_RUN.out.versions.first())
+        ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([])  )
        ch_versions = ch_versions.mix( MALT_RUN.out.versions.first() )
    }
    // TODO MALT results overwriting per database?
    // TODO Versions for Kraken/MALT not reported?
    // TODO create multiQC module for metaphlan
    MULTIQC (