
Get skeleton read processing to input for profiling

James Fellows Yates 2022-02-18 16:51:01 +01:00
parent 1b893cb039
commit cf55cc592c
13 changed files with 407 additions and 27 deletions


@@ -42,9 +42,9 @@ On release, automated continuous integration tests run the pipeline on a full-si
- Centrifuge
- Kaiju
- mOTUs
4. Perform optional post-processing with:
- bracken
5. Standardises output tables
6. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))
## Quick Start


@@ -28,8 +28,47 @@ process {
withName: FASTQC {
ext.args = '--quiet'
ext.prefix = { "${meta.id}_${meta.run_accession}_raw" }
publishDir = [
path: { "${params.outdir}/fastqc/raw" },
mode: 'copy',
pattern: '*.html'
]
}
withName: FASTP {
ext.prefix = { "${meta.id}_${meta.run_accession}" }
// TODO also include option to NOT merge
ext.args = [
{ ${meta.single_end} } == 0 ? "-m" : '',
params.fastp_exclude_unmerged ? '' : "--include_unmerged"
].join(' ').trim()
publishDir = [
path: { "${params.outdir}/fastp" },
mode: 'copy',
pattern: '*.fastq.gz'
]
}
withName: FASTQC_POST {
ext.args = '--quiet'
ext.prefix = { "${meta.id}_${meta.run_accession}_processed" }
publishDir = [
path: { "${params.outdir}/fastqc/processed" },
mode: 'copy',
pattern: '*.html'
]
}
withName: CAT_FASTQ {
publishDir = [
path: { "${params.outdir}/prepared_sequences" },
mode: 'copy',
pattern: '*.fastq.gz'
]
}
withName: CUSTOM_DUMPSOFTWAREVERSIONS {
publishDir = [
path: { "${params.outdir}/pipeline_info" },


@@ -22,8 +22,6 @@ params {
// Input data
// TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
// TODO nf-core: Give any required params for the test so that command line flags are not needed
- input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv'
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
- // Genome references
- genome = 'R64-1-1'
}


@@ -10,10 +10,11 @@ class WorkflowTaxprofiler {
public static void initialise(params, log) {
genomeExistsError(params, log)
- if (!params.fasta) {
- log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file."
- System.exit(1)
- }
// TODO update as necessary
//if (!params.fasta) {
// log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file."
// System.exit(1)
//}
}
//


@@ -3,9 +3,15 @@
"homePage": "https://github.com/nf-core/taxprofiler",
"repos": {
"nf-core/modules": {
"cat/fastq": {
"git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
},
"custom/dumpsoftwareversions": { "custom/dumpsoftwareversions": {
"git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41" "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41"
}, },
"fastp": {
"git_sha": "d0a1cbb703a130c19f6796c3fce24fbe7dfce789"
},
"fastqc": { "fastqc": {
"git_sha": "9d0cad583b9a71a6509b754fdf589cbfbed08961" "git_sha": "9d0cad583b9a71a6509b754fdf589cbfbed08961"
}, },


@@ -0,0 +1,51 @@
process CAT_FASTQ {
tag "$meta.id"
label 'process_low'
conda (params.enable_conda ? "conda-forge::sed=4.7" : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img' :
'biocontainers/biocontainers:v1.2.0_cv1' }"
input:
tuple val(meta), path(reads, stageAs: "input*/*")
output:
tuple val(meta), path("*.merged.fastq.gz"), emit: reads
path "versions.yml" , emit: versions
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def readList = reads.collect{ it.toString() }
if (meta.single_end) {
if (readList.size > 1) {
"""
cat ${readList.join(' ')} > ${prefix}.merged.fastq.gz
cat <<-END_VERSIONS > versions.yml
"${task.process}":
cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//')
END_VERSIONS
"""
}
} else {
if (readList.size > 2) {
def read1 = []
def read2 = []
readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v }
"""
cat ${read1.join(' ')} > ${prefix}_1.merged.fastq.gz
cat ${read2.join(' ')} > ${prefix}_2.merged.fastq.gz
cat <<-END_VERSIONS > versions.yml
"${task.process}":
cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//')
END_VERSIONS
"""
}
}
}
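The eachWithIndex split above assumes the incoming read list is ordered R1, R2, R1, R2, and so on; a small plain-Groovy illustration (file names are hypothetical):

    // Illustrative sketch: how the even/odd index split separates interleaved mates.
    def readList = ['runA_1.fastq.gz', 'runA_2.fastq.gz', 'runB_1.fastq.gz', 'runB_2.fastq.gz']
    def read1 = []
    def read2 = []
    readList.eachWithIndex { v, ix -> ( ix & 1 ? read2 : read1 ) << v }   // even index -> read1, odd -> read2
    assert read1 == ['runA_1.fastq.gz', 'runB_1.fastq.gz']
    assert read2 == ['runA_2.fastq.gz', 'runB_2.fastq.gz']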


@@ -0,0 +1,39 @@
name: cat_fastq
description: Concatenates fastq files
keywords:
- fastq
- concatenate
tools:
- cat:
description: |
The cat utility reads files sequentially, writing them to the standard output.
documentation: https://www.gnu.org/software/coreutils/manual/html_node/cat-invocation.html
licence: ["GPL-3.0-or-later"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- reads:
type: list
description: |
List of input FastQ files to be concatenated.
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- reads:
type: file
description: Merged fastq file
pattern: "*.{merged.fastq.gz}"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
authors:
- "@joseespinosa"
- "@drpatelh"

modules/nf-core/modules/fastp/main.nf (new file, 75 lines)

@@ -0,0 +1,75 @@
process FASTP {
tag "$meta.id"
label 'process_medium'
conda (params.enable_conda ? 'bioconda::fastp=0.23.2' : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/fastp:0.23.2--h79da9fb_0' :
'quay.io/biocontainers/fastp:0.23.2--h79da9fb_0' }"
input:
tuple val(meta), path(reads)
val save_trimmed_fail
val save_merged
output:
tuple val(meta), path('*.trim.fastq.gz') , optional:true, emit: reads
tuple val(meta), path('*.json') , emit: json
tuple val(meta), path('*.html') , emit: html
tuple val(meta), path('*.log') , emit: log
path "versions.yml" , emit: versions
tuple val(meta), path('*.fail.fastq.gz') , optional:true, emit: reads_fail
tuple val(meta), path('*.merged.fastq.gz'), optional:true, emit: reads_merged
when:
task.ext.when == null || task.ext.when
script:
def args = task.ext.args ?: ''
// Added soft-links to original fastqs for consistent naming in MultiQC
def prefix = task.ext.prefix ?: "${meta.id}"
if (meta.single_end) {
def fail_fastq = save_trimmed_fail ? "--failed_out ${prefix}.fail.fastq.gz" : ''
"""
[ ! -f ${prefix}.fastq.gz ] && ln -s $reads ${prefix}.fastq.gz
fastp \\
--in1 ${prefix}.fastq.gz \\
--out1 ${prefix}.trim.fastq.gz \\
--thread $task.cpus \\
--json ${prefix}.fastp.json \\
--html ${prefix}.fastp.html \\
$fail_fastq \\
$args \\
2> ${prefix}.fastp.log
cat <<-END_VERSIONS > versions.yml
"${task.process}":
fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
END_VERSIONS
"""
} else {
def fail_fastq = save_trimmed_fail ? "--unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : ''
def merge_fastq = save_merged ? "-m --merged_out ${prefix}.merged.fastq.gz" : ''
"""
[ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz
[ ! -f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz
fastp \\
--in1 ${prefix}_1.fastq.gz \\
--in2 ${prefix}_2.fastq.gz \\
--out1 ${prefix}_1.trim.fastq.gz \\
--out2 ${prefix}_2.trim.fastq.gz \\
--json ${prefix}.fastp.json \\
--html ${prefix}.fastp.html \\
$fail_fastq \\
$merge_fastq \\
--thread $task.cpus \\
--detect_adapter_for_pe \\
$args \\
2> ${prefix}.fastp.log
cat <<-END_VERSIONS > versions.yml
"${task.process}":
fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
END_VERSIONS
"""
}
}
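The two value inputs, save_trimmed_fail and save_merged, are what the workflow further down toggles when it imports this module twice; a minimal call sketch (channel names are hypothetical):

    // Illustrative sketch only: trim single-end reads without merging, and
    // trim plus merge paired-end reads (no failed-read output in either case).
    FASTP_SINGLE ( ch_reads_single_end, false, false )   // consume FASTP_SINGLE.out.reads
    FASTP_PAIRED ( ch_reads_paired_end, false, true  )   // consume FASTP_PAIRED.out.reads_merged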

modules/nf-core/modules/fastp/meta.yml (new file, 68 lines)

@@ -0,0 +1,68 @@
name: fastp
description: Perform adapter/quality trimming on sequencing reads
keywords:
- trimming
- quality control
- fastq
tools:
- fastp:
description: |
A tool designed to provide fast all-in-one preprocessing for FastQ files. This tool is developed in C++ with multithreading supported to afford high performance.
documentation: https://github.com/OpenGene/fastp
doi: https://doi.org/10.1093/bioinformatics/bty560
licence: ["MIT"]
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- reads:
type: file
description: |
List of input FastQ files of size 1 and 2 for single-end and paired-end data,
respectively.
- save_trimmed_fail:
type: boolean
description: Specify true to save files that failed to pass trimming thresholds ending in `*.fail.fastq.gz`
- save_merged:
type: boolean
description: Specify true to save all merged reads to a file ending in `*.merged.fastq.gz`
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- reads:
type: file
description: The trimmed/modified/unmerged fastq reads
pattern: "*trim.fastq.gz"
- json:
type: file
description: Results in JSON format
pattern: "*.json"
- html:
type: file
description: Results in HTML format
pattern: "*.html"
- log:
type: file
description: fastp log file
pattern: "*.log"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- reads_fail:
type: file
description: Reads that failed the preprocessing
pattern: "*fail.fastq.gz"
- reads_merged:
type: file
description: Reads that were successfully merged
pattern: "*.{merged.fastq.gz}"
authors:
- "@drpatelh"
- "@kevinmenden"


@@ -33,7 +33,7 @@ params {
help = false
validate_params = true
show_hidden_params = false
- schema_ignore_params = 'genomes'
schema_ignore_params = 'genomes,fasta'
enable_conda = false
// Config options
@@ -50,6 +50,9 @@ params {
max_cpus = 16
max_time = '240.h'
// FASTQ preprocessing
fastp_clip_merge = false
fastp_exclude_unmerged = true
}
// Load base.config by default for all pipelines
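The two new parameters above gate the preprocessing added in this commit: fastp_clip_merge switches the fastp/FastQC block on in the workflow, and fastp_exclude_unmerged controls whether --include_unmerged is passed to fastp. A hedged example of overriding them in a user config supplied with -c:

    // Illustrative override only: turn clipping/merging on and keep unmerged
    // read pairs in the fastp output (i.e. pass --include_unmerged).
    params {
        fastp_clip_merge       = true
        fastp_exclude_unmerged = false
    }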


@@ -56,15 +56,6 @@
"fa_icon": "fas fa-book",
"help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details."
},
"fasta": {
"type": "string",
"format": "file-path",
"mimetype": "text/plain",
"pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$",
"description": "Path to FASTA genome file.",
"help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.",
"fa_icon": "far fa-file-code"
},
"igenomes_base": { "igenomes_base": {
"type": "string", "type": "string",
"format": "directory-path", "format": "directory-path",


@@ -9,22 +9,38 @@ workflow INPUT_CHECK {
samplesheet // file: /path/to/samplesheet.csv
main:
- SAMPLESHEET_CHECK ( samplesheet )
parsed_samplesheet = SAMPLESHEET_CHECK ( samplesheet )
.csv
.splitCsv ( header:true, sep:',' )
.dump(tag: "split_csv_out")
.branch {
fasta: it['fasta'] != ''
fastq: true
}
parsed_samplesheet.fastq
.map { create_fastq_channels(it) }
- .set { reads }
.dump(tag: "fastq_channel_init")
.set { fastq }
parsed_samplesheet.fasta
.map { create_fasta_channels(it) }
.dump(tag: "fasta_channel_init")
.set { fasta }
emit:
- reads // channel: [ val(meta), [ reads ] ]
fastq // channel: [ val(meta), [ reads ] ]
fasta // channel: [ val(meta), fasta ]
versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
}
// Function to get list of [ meta, [ fastq_1, fastq_2 ] ]
def create_fastq_channels(LinkedHashMap row) {
def meta = [:]
meta.id = row.sample
- meta.single_end = row.single_end.toBoolean()
meta.run_accession = row.run_accession
meta.instrument_platform = row.instrument_platform
meta.single_end = row.single_end.toBoolean()
def array = []
if (!file(row.fastq_1).exists()) {
@@ -40,3 +56,20 @@ def create_fastq_channels(LinkedHashMap row) {
}
return array
}
// Function to get list of [ meta, fasta ]
def create_fasta_channels(LinkedHashMap row) {
def meta = [:]
meta.id = row.sample
meta.run_accession = row.run_accession
meta.instrument_platform = row.instrument_platform
meta.single_end = true
def array = []
if (!file(row.fasta).exists()) {
exit 1, "ERROR: Please check input samplesheet -> FastA file does not exist!\n${row.fasta}"
}
array = [ meta, [ file(row.fasta) ] ]
return array
}
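For reference, a sketch of the tuples these helpers emit (identifiers, platforms and paths are made up for illustration):

    // Illustrative examples only (not real samples).
    // One element of the fastq channel (paired-end row):
    def fastq_example = [
        [ id:'sample1', run_accession:'run1', instrument_platform:'ILLUMINA', single_end:false ],
        [ file('sample1_run1_R1.fastq.gz'), file('sample1_run1_R2.fastq.gz') ]
    ]
    // One element of the fasta channel (single_end is always set to true):
    def fasta_example = [
        [ id:'sample2', run_accession:'run1', instrument_platform:'OXFORD_NANOPORE', single_end:true ],
        [ file('sample2.fasta') ]
    ]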


@@ -11,7 +11,7 @@ WorkflowTaxprofiler.initialise(params, log)
// TODO nf-core: Add all file path parameters for the pipeline to the list below
// Check input path parameters to see if they exist
- def checkPathParamList = [ params.input, params.multiqc_config, params.fasta ]
def checkPathParamList = [ params.input, params.multiqc_config ]
for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
// Check mandatory parameters
@@ -50,6 +50,11 @@ include { FASTQC } from '../modules/nf-core/modules/fastqc/
include { MULTIQC } from '../modules/nf-core/modules/multiqc/main'
include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main'
include { FASTP as FASTP_SINGLE } from '../modules/nf-core/modules/fastp/main'
include { FASTP as FASTP_PAIRED } from '../modules/nf-core/modules/fastp/main'
include { FASTQC as FASTQC_POST } from '../modules/nf-core/modules/fastqc/main'
include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main'
/*
========================================================================================
RUN MAIN WORKFLOW
@@ -75,7 +80,7 @@ workflow TAXPROFILER {
// MODULE: Run FastQC
//
FASTQC (
- INPUT_CHECK.out.reads
INPUT_CHECK.out.fastq
)
ch_versions = ch_versions.mix(FASTQC.out.versions.first())
@@ -83,6 +88,71 @@
ch_versions.unique().collectFile(name: 'collated_versions.yml')
)
//
// MODULE: Run Clip/Merge/Complexity
//
// TODO give option to clip only and retain pairs
// TODO give option to retain singletons (probably fastp option likely)
// TODO move to subworkflow
if ( params.fastp_clip_merge ) {
ch_input_for_fastp = INPUT_CHECK.out.fastq
.dump(tag: "pre-fastp_branch")
.branch{
single: it[0]['single_end'] == true
paired: it[0]['single_end'] == false
}
ch_input_for_fastp.single.dump(tag: "input_fastp_single")
ch_input_for_fastp.paired.dump(tag: "input_fastp_paired")
FASTP_SINGLE ( ch_input_for_fastp.single, false, false )
FASTP_PAIRED ( ch_input_for_fastp.paired, false, true )
ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged
.mix( FASTP_SINGLE.out.reads )
.map {
meta, reads ->
def meta_new = meta.clone()
meta_new['single_end'] = 1
[ meta_new, reads ]
}
FASTQC_POST ( ch_fastp_reads_prepped )
ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first())
ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first())
ch_processed_reads = ch_fastp_reads_prepped
} else {
ch_processed_reads = INPUT_CHECK.out.fastq
}
// MODULE: Cat merge runs of same sample
ch_processed_for_combine = ch_processed_reads
.dump(tag: "prep_for_combine_grouping")
.map {
meta, reads ->
def meta_new = meta.clone()
meta_new['run_accession'] = 'combined'
[ meta_new, reads ]
}
.groupTuple ( by: 0 )
.branch{
combine: it[1].size() >= 2
skip: it[1].size() < 2
}
CAT_FASTQ ( ch_processed_for_combine.combine )
// Ready for profiling!
ch_reads_for_profiling = ch_processed_for_combine.skip
.dump(tag: "skip_combine")
.mix( CAT_FASTQ.out.reads )
.dump(tag: "files_for_profiling")
//
// MODULE: MultiQC
//
@@ -95,6 +165,12 @@ workflow TAXPROFILER {
ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([]))
if (params.fastp_clip_merge) {
ch_multiqc_files = ch_multiqc_files.mix(FASTP_SINGLE.out.json.collect{it[1]}.ifEmpty([]))
ch_multiqc_files = ch_multiqc_files.mix(FASTP_PAIRED.out.json.collect{it[1]}.ifEmpty([]))
ch_multiqc_files = ch_multiqc_files.mix(FASTQC_POST.out.zip.collect{it[1]}.ifEmpty([]))
}
MULTIQC (
ch_multiqc_files.collect()
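To make the run-combining step earlier in this workflow concrete, a small hedged sketch (hypothetical samples, not part of this commit): once run_accession is rewritten to 'combined', groupTuple gathers all runs of a sample and the branch decides whether CAT_FASTQ is needed:

    // Illustrative sketch only (hypothetical data, not part of this commit).
    ch_demo = Channel
        .of(
            [ [ id:'sample1', run_accession:'combined', single_end:true ], 'sample1_run1.fastq.gz' ],
            [ [ id:'sample1', run_accession:'combined', single_end:true ], 'sample1_run2.fastq.gz' ],
            [ [ id:'sample2', run_accession:'combined', single_end:true ], 'sample2_run1.fastq.gz' ]
        )
        .groupTuple ( by: 0 )
        .branch {
            combine: it[1].size() >= 2   // sample1: two runs   -> concatenated by CAT_FASTQ
            skip:    it[1].size() <  2   // sample2: single run -> goes straight to profiling
        }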