Merge pull request #27 from genomic-medicine-sweden/add_nanopore

Add nanopore reads preprocessing with Porechop
2024-11-22 09:19:54 +00:00 · 2022-03-21 19:41:58 +01:00 · 2022-03-21 19:41:58 +01:00 · 51a0acd05b
commit 51a0acd05b
parent b23fb927a9 5b1b48e59e
10 changed files with 153 additions and 8 deletions
--- a/CITATIONS.md
+++ b/CITATIONS.md
@ -15,6 +15,8 @@
 * [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/)
    > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.
 * [Porechop](https://github.com/rrwick/Porechop)
 ## Software packaging/containerisation tools
 * [Anaconda](https://anaconda.com)
--- a/README.md
+++ b/README.md
@ -30,7 +30,7 @@ On release, automated continuous integration tests run the pipeline on a full-si
 1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
 2. Performs optional read pre-processing
-   - Adapter clipping and merging
+   - Adapter clipping and merging (short reads) and adapter removal (nanopore reads)
   - Low complexity filtering
   - Host read removal
   - Run merging
--- a/conf/modules.config
+++ b/conf/modules.config
@ -50,6 +50,15 @@ process {
        ]
    }
    // Settings for the PORECHOP process: name outputs <id>_<run_accession>
    // (via ext.prefix) and copy the resulting *.fastq.gz files to
    // <outdir>/porechop. Files not matching the pattern are not published.
    withName: PORECHOP {
        ext.prefix = { "${meta.id}_${meta.run_accession}" }
        publishDir = [
            path: { "${params.outdir}/porechop" },
            mode: 'copy',
            pattern: '*.fastq.gz'
        ]
    }
    withName: FASTQC_POST {
        ext.args = '--quiet'
        ext.prefix = { "${meta.id}_${meta.run_accession}_processed" }
--- a/modules.json
+++ b/modules.json
@ -23,6 +23,9 @@
            },
            "multiqc": {
                "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41"
            },
            "porechop": {
                "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046"
            }
        }
    }
--- a/modules/nf-core/modules/porechop/main.nf
+++ b/modules/nf-core/modules/porechop/main.nf
@ -0,0 +1,35 @@
 // Runs Porechop on the input reads and writes the result as a gzipped FASTQ
 // named <prefix>.fastq.gz, together with a versions.yml file recording the
 // Porechop version that was used.
 process PORECHOP {
    tag "$meta.id"
    label 'process_medium'
    // Conda environment is only requested when params.enable_conda is set.
    conda (params.enable_conda ? "bioconda::porechop=0.2.4" : null)
    // Use the Galaxy depot Singularity image when running under Singularity
    // (unless ext.singularity_pull_docker_container is set); otherwise use the
    // quay.io Biocontainers image.
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/porechop:0.2.4--py39h7cff6ad_2' :
        'quay.io/biocontainers/porechop:0.2.4--py39h7cff6ad_2' }"
    input:
    // meta: sample information map; reads: FASTQ input passed to `porechop -i`
    tuple val(meta), path(reads)
    output:
    // Any *.fastq.gz produced in the work directory is emitted with the input meta
    tuple val(meta), path("*.fastq.gz"), emit: reads
    path "versions.yml"                , emit: versions
    when:
    // Run unless ext.when is explicitly configured to a falsy value
    task.ext.when == null || task.ext.when
    script:
    // Extra command-line arguments and the output file prefix are taken from
    // the process ext configuration; the prefix falls back to the sample id.
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    """
    porechop \\
        -i $reads \\
        -t $task.cpus \\
        $args \\
        -o ${prefix}.fastq.gz
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        porechop: \$( porechop --version )
    END_VERSIONS
    """
 }
--- a/modules/nf-core/modules/porechop/meta.yml
+++ b/modules/nf-core/modules/porechop/meta.yml
@ -0,0 +1,50 @@
 name: porechop
 description: Adapter removal and demultiplexing of Oxford Nanopore reads
 keywords:
  - adapter
  - nanopore
  - demultiplexing
 tools:
  - porechop:
      description: Adapter removal and demultiplexing of Oxford Nanopore reads
      homepage: "https://github.com/rrwick/Porechop"
      documentation: "https://github.com/rrwick/Porechop"
      tool_dev_url: "https://github.com/rrwick/Porechop"
      doi: "10.1099/mgen.0.000132"
      licence: ["GPL v3"]
 input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - reads:
      type: file
      description: fastq/fastq.gz file
      pattern: "*.{fastq,fastq.gz,fq,fq.gz}"
 output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
  - reads:
      type: file
      description: Demultiplexed and/or adapter-trimmed fastq.gz file
      pattern: "*.{fastq.gz}"
 authors:
  - "@ggabernet"
  - "@jasmezz"
  - "@d4straub"
  - "@LaurenceKuhl"
  - "@SusiJo"
  - "@jonasscheid"
  - "@jonoave"
  - "@GokceOGUZ"
--- a/nextflow.config
+++ b/nextflow.config
@ -57,6 +57,7 @@ params {
    // FASTQ preprocessing
    fastp_clip_merge           = false
    fastp_exclude_unmerged     = true
    remove_adapters            = false
    // MALT
    run_malt                   = false
--- a/subworkflows/local/longread_preprocessing.nf
+++ b/subworkflows/local/longread_preprocessing.nf
@ -0,0 +1,34 @@
 include { FASTQC as FASTQC_POST } from '../../modules/nf-core/modules/fastqc/main'
 include { PORECHOP              } from '../../modules/nf-core/modules/porechop/main'
 //
 // Trim adapters from long reads with Porechop, then run FastQC on the
 // trimmed reads.
 //
 // take:
 //   reads    - channel of (meta, reads) tuples as accepted by PORECHOP
 // emit:
 //   reads    - trimmed reads, with meta['single_end'] set on a copy of meta
 //   versions - versions.yml from PORECHOP
 //   mqc      - collected FastQC zip reports for MultiQC
 //
 workflow LONGREAD_PREPROCESSING {
    take:
    reads
    main:
    ch_versions      = Channel.empty()
    ch_multiqc_files = Channel.empty()
    PORECHOP ( reads )
    // Flag the trimmed reads as single-end. The meta map is cloned so the
    // map instance that went through PORECHOP is not modified in place.
    // The closure parameter is deliberately not called `reads`, to avoid
    // shadowing the workflow input of the same name.
    ch_processed_reads = PORECHOP.out.reads
                                .dump(tag: "pre_fastqc_check")
                                .map {
                                        meta, trimmed_reads ->
                                        def meta_new = meta.clone()
                                        meta_new['single_end'] = 1
                                        [ meta_new, trimmed_reads ]
                                    }
    // Run FastQC on the channel prepared above (the one dumped as
    // "pre_fastqc_check"), so that the single_end flag set there is the one
    // FastQC actually sees, rather than the raw PORECHOP output meta.
    FASTQC_POST ( ch_processed_reads )
    ch_versions = ch_versions.mix(PORECHOP.out.versions.first())
    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} )
    emit:
    reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
    versions = ch_versions          // channel: [ versions.yml ]
    mqc      = ch_multiqc_files
 }
--- a/subworkflows/local/shortread_preprocessing.nf
+++ b/subworkflows/local/shortread_preprocessing.nf
@ -7,7 +7,7 @@ include { FASTP as FASTP_SINGLE       } from '../../modules/nf-core/modules/fast
 include { FASTP as FASTP_PAIRED       } from '../../modules/nf-core/modules/fastp/main'
 include { FASTQC as FASTQC_POST       } from '../../modules/nf-core/modules/fastqc/main'
-workflow FASTQ_PREPROCESSING {
+workflow SHORTREAD_PREPROCESSING {
    take:
    reads // channel: [ val(meta), [ reads ] ]
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@ -39,8 +39,8 @@ ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multi
 include { INPUT_CHECK         } from '../subworkflows/local/input_check'
 include { DB_CHECK            } from '../subworkflows/local/db_check'
-include { FASTQ_PREPROCESSING } from '../subworkflows/local/preprocessing'
+include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing'
-
+include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing'
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -101,13 +101,24 @@ workflow TAXPROFILER {
    // PERFORM PREPROCESSING
    //
    if ( params.fastp_clip_merge ) {
-        FASTQ_PREPROCESSING ( INPUT_CHECK.out.fastq )
+        SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq )
    }
    ch_multiqc_files = Channel.empty()
    if ( params.remove_adapters ) {
        ch_longreads_preprocessed = LONGREAD_PREPROCESSING ( INPUT_CHECK.out.nanopore ).reads
                                        .map { it -> [ it[0], [it[1]] ] }
    ch_versions = ch_versions.mix(LONGREAD_PREPROCESSING.out.versions.first())
        ch_multiqc_files = ch_multiqc_files.mix(LONGREAD_PREPROCESSING.out.mqc)
    } else {
        ch_longreads_preprocessed = INPUT_CHECK.out.nanopore
    }
    //
    // PERFORM RUN MERGING
    //
-    ch_processed_for_combine = FASTQ_PREPROCESSING.out.reads
+    ch_processed_for_combine = SHORTREAD_PREPROCESSING.out.reads
        .dump(tag: "prep_for_combine_grouping")
        .map {
            meta, reads ->
@ -134,6 +145,7 @@ workflow TAXPROFILER {
    // output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], <reads_path>/2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], <db_path>/malt90]
    ch_input_for_profiling = ch_reads_for_profiling
            .mix( ch_longreads_preprocessed )
            .combine(DB_CHECK.out.dbs)
            .dump(tag: "reads_plus_db")
            .branch {
@ -185,14 +197,13 @@ workflow TAXPROFILER {
    workflow_summary    = WorkflowTaxprofiler.paramsSummaryMultiqc(workflow, summary_params)
    ch_workflow_summary = Channel.value(workflow_summary)
    ch_multiqc_files = Channel.empty()
    ch_multiqc_files = ch_multiqc_files.mix(Channel.from(ch_multiqc_config))
    ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_custom_config.collect().ifEmpty([]))
    ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
    ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
    ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([]))
    if (params.fastp_clip_merge) {
-        ch_multiqc_files = ch_multiqc_files.mix(FASTQ_PREPROCESSING.out.mqc)
+        ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_PREPROCESSING.out.mqc)
    }
    if (params.run_kraken2) {
        ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]))