diff --git a/CITATIONS.md b/CITATIONS.md
index 192b2f4..53c53c3 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -15,6 +15,8 @@
 * [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/)
   > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.

+* [Porechop](https://github.com/rrwick/Porechop)
+
 ## Software packaging/containerisation tools

 * [Anaconda](https://anaconda.com)
diff --git a/README.md b/README.md
index 5d0c74b..d454a9b 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ On release, automated continuous integration tests run the pipeline on a full-si
 1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
 2. Performs optional read pre-processing
-   - Adapter clipping and merging
+   - Adapter clipping and merging (short and Nanopore reads)
    - Low complexity filtering
    - Host read removal
    - Run merging
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 16e668b..d10ee90 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -173,7 +173,7 @@ def check_samplesheet(file_in, file_out):
             ## Auto-detect paired-end/single-end
             if sample and fastq_1 and fastq_2:  ## Paired-end short reads
                 sample_info.extend(["0", fastq_1, fastq_2, fasta])
-            elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
+            elif sample and fastq_1 and not fastq_2:  ## Single-end short/long fastq reads
                 sample_info.extend(["1", fastq_1, fastq_2, fasta])
             elif (
                 sample and fasta and not fastq_1 and not fastq_2
diff --git a/conf/modules.config b/conf/modules.config
index 620ae1d..ab8f021 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -41,7 +41,7 @@ process {
         // TODO also include option to NOT merge
         ext.args = [
             { ${meta.single_end} } == 0 ? "-m" : '',
-            params.fastp_exclude_unmerged ? '' : "--include_unmerged"
+            params.shortread_excludeunmerged ? '' : "--include_unmerged"
         ].join(' ').trim()
         publishDir = [
             path: { "${params.outdir}/fastp" },
@@ -50,6 +50,15 @@
         ]
     }

+    withName: PORECHOP {
+        ext.prefix = { "${meta.id}_${meta.run_accession}" }
+        publishDir = [
+            path: { "${params.outdir}/porechop" },
+            mode: 'copy',
+            pattern: '*.fastq.gz'
+        ]
+    }
+
     withName: FASTQC_POST {
         ext.args = '--quiet'
         ext.prefix = { "${meta.id}_${meta.run_accession}_processed" }
@@ -75,7 +84,7 @@
             pattern: '*.{rma6,tab,text,sam,log}'
         ]
         ext.args = { "${meta.db_params}" }
-        ext.when = params.run_malt
+        ext.prefix = { "${meta.id}-${meta.db_name}" }
     }

     withName: KRAKEN2_KRAKEN2 {
@@ -85,7 +94,6 @@
             pattern: '*.{fastq.gz,txt}'
         ]
         ext.args = { "${meta.db_params}" }
-        ext.when = params.run_kraken2
         ext.prefix = { "${meta.id}-${meta.db_name}" }
     }
diff --git a/docs/usage.md b/docs/usage.md
index a8b0448..38c063e 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -44,11 +44,11 @@ TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz,
 TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz,
 ```

-| Column         | Description                                                                                                                                                                             |
-|----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `sample`       | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
-| `fastq_1`      | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                                             |
-| `fastq_2`      | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                                             |
+| Column    | Description                                                                                                                                                                             |
+| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `sample`  | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
+| `fastq_1` | Full path to FastQ file for Illumina short reads 1 or Nanopore reads. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                            |
+| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                                              |

 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
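For orientation, a Nanopore entry fills `fastq_1` only and is routed through the new `nanopore` branch of `INPUT_CHECK` (see the `subworkflows/local/input_check.nf` changes further down). A minimal sketch of the `[ meta, [ reads ] ]` channel element such a row is expected to become; IDs and file names are invented:

```nextflow
// Illustration only: one Nanopore samplesheet row after INPUT_CHECK (values made up).
workflow {
    Channel
        .of( [ [ id:'2613', run_accession:'ERR0000001', instrument_platform:'OXFORD_NANOPORE', single_end:true ], [ file('2613_nanopore.fastq.gz') ] ] )
        .view()
}
```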
diff --git a/modules.json b/modules.json
index 96a43d8..673a69b 100644
--- a/modules.json
+++ b/modules.json
@@ -26,6 +26,9 @@
             },
             "untar": {
                 "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918"
+            },
+            "porechop": {
+                "git_sha": "e20e57f90b6787ac9a010a980cf6ea98bd990046"
             }
         }
     }
diff --git a/modules/nf-core/modules/porechop/main.nf b/modules/nf-core/modules/porechop/main.nf
new file mode 100644
index 0000000..65982b8
--- /dev/null
+++ b/modules/nf-core/modules/porechop/main.nf
@@ -0,0 +1,35 @@
+process PORECHOP {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda (params.enable_conda ? "bioconda::porechop=0.2.4" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/porechop:0.2.4--py39h7cff6ad_2' :
+        'quay.io/biocontainers/porechop:0.2.4--py39h7cff6ad_2' }"
+
+    input:
+    tuple val(meta), path(reads)
+
+    output:
+    tuple val(meta), path("*.fastq.gz"), emit: reads
+    path "versions.yml"                , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    porechop \\
+        -i $reads \\
+        -t $task.cpus \\
+        $args \\
+        -o ${prefix}.fastq.gz
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        porechop: \$( porechop --version )
+    END_VERSIONS
+    """
+}
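A minimal sketch of driving the new module on its own, assuming it is run from the pipeline root (so the include path and `params.enable_conda` resolve); the meta fields mirror those used by `ext.prefix` in `conf/modules.config`, and the input file name is invented:

```nextflow
// Hypothetical test harness for the PORECHOP module added above.
include { PORECHOP } from './modules/nf-core/modules/porechop/main'

workflow TEST_PORECHOP {
    // [ meta, reads ] pair matching the module's `tuple val(meta), path(reads)` input
    ch_reads = Channel.of( [ [ id:'2613', run_accession:'ERR0000001', single_end:true ], file('2613_nanopore.fastq.gz') ] )

    PORECHOP ( ch_reads )

    PORECHOP.out.reads.view()      // adapter-trimmed *.fastq.gz
    PORECHOP.out.versions.view()   // versions.yml
}
```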
diff --git a/modules/nf-core/modules/porechop/meta.yml b/modules/nf-core/modules/porechop/meta.yml
new file mode 100644
index 0000000..81399d2
--- /dev/null
+++ b/modules/nf-core/modules/porechop/meta.yml
@@ -0,0 +1,50 @@
+name: porechop
+description: Adapter removal and demultiplexing of Oxford Nanopore reads
+keywords:
+  - adapter
+  - nanopore
+  - demultiplexing
+tools:
+  - porechop:
+      description: Adapter removal and demultiplexing of Oxford Nanopore reads
+      homepage: "https://github.com/rrwick/Porechop"
+      documentation: "https://github.com/rrwick/Porechop"
+      tool_dev_url: "https://github.com/rrwick/Porechop"
+      doi: "10.1099/mgen.0.000132"
+      licence: ["GPL v3"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: fastq/fastq.gz file
+      pattern: "*.{fastq,fastq.gz,fq,fq.gz}"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - reads:
+      type: file
+      description: Demultiplexed and/or adapter-trimmed fastq.gz file
+      pattern: "*.{fastq.gz}"
+
+authors:
+  - "@ggabernet"
+  - "@jasmezz"
+  - "@d4straub"
+  - "@LaurenceKuhl"
+  - "@SusiJo"
+  - "@jonasscheid"
+  - "@jonoave"
+  - "@GokceOGUZ"
diff --git a/nextflow.config b/nextflow.config
index 7b897ab..5f7aec6 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -55,8 +55,9 @@ params {
     databases     = null

     // FASTQ preprocessing
-    fastp_clip_merge          = false
-    fastp_exclude_unmerged    = true
+    shortread_clipmerge       = false
+    shortread_excludeunmerged = true
+    longread_clip             = false

     // MALT
     run_malt = false
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 938c87f..67dadc2 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -15,14 +15,20 @@ workflow INPUT_CHECK {
         .dump(tag: "input_split_csv_out")
         .branch {
             fasta: it['fasta'] != ''
+            nanopore: it['instrument_platform'] == 'OXFORD_NANOPORE'
             fastq: true
         }

     parsed_samplesheet.fastq
-        .map { create_fastq_channels(it) }
+        .map { create_fastq_channel(it) }
         .dump(tag: "fastq_channel_init")
         .set { fastq }

+    parsed_samplesheet.nanopore
+        .map { create_fastq_channel(it) }
+        .dump(tag: "fastq_nanopore_channel_init")
+        .set { nanopore }
+
     parsed_samplesheet.fasta
         .map { create_fasta_channels(it) }
         .dump(tag: "fasta_channel_init")
@@ -30,6 +36,7 @@
     emit:
     fastq                                     // channel: [ val(meta), [ reads ] ]
+    nanopore                                  // channel: [ val(meta), [ reads ] ]
     fasta                                     // channel: [ val(meta), fasta ]
     versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
 }

@@ -51,10 +58,17 @@ def create_fastq_channels(LinkedHashMap row) {
     if (meta.single_end) {
         fastq_meta = [ meta, [ file(row.fastq_1) ] ]
     } else {
-        if (!file(row.fastq_2).exists()) {
-            exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
+        if (meta.instrument_platform == 'OXFORD_NANOPORE') {
+            if (row.fastq_2 != '') {
+                exit 1, "ERROR: Please check input samplesheet -> For Oxford Nanopore reads Read 2 FastQ should be empty!\n${row.fastq_2}"
+            }
+            fastq_meta = [ meta, [ file(row.fastq_1) ] ]
+        } else {
+            if (!file(row.fastq_2).exists()) {
+                exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
+            }
+            fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
         }
-        fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
     }
     return fastq_meta
 }
diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf
new file mode 100644
index 0000000..da1049a
--- /dev/null
+++ b/subworkflows/local/longread_preprocessing.nf
@@ -0,0 +1,34 @@
+
+include { FASTQC as FASTQC_POST } from '../../modules/nf-core/modules/fastqc/main'
+include { PORECHOP } from '../../modules/nf-core/modules/porechop/main'
+
+workflow LONGREAD_PREPROCESSING {
+    take:
+    reads
+
+    main:
+    ch_versions = Channel.empty()
+    ch_multiqc_files = Channel.empty()
+
+    PORECHOP ( reads )
+
+    ch_processed_reads = PORECHOP.out.reads
+        .dump(tag: "pre_fastqc_check")
+        .map {
+            meta, reads ->
+            def meta_new = meta.clone()
+            meta_new['single_end'] = 1
+            [ meta_new, reads ]
+        }
+
+    FASTQC_POST ( PORECHOP.out.reads )
+    ch_versions = ch_versions.mix(PORECHOP.out.versions.first())
+    ch_multiqc_files = ch_multiqc_files.mix( FASTQC_POST.out.zip.collect{it[1]} )
+
+
+    emit:
+    reads    = ch_processed_reads   // channel: [ val(meta), [ reads ] ]
+    versions = ch_versions          // channel: [ versions.yml ]
+    mqc      = ch_multiqc_files
+}
+
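The `.map` in `LONGREAD_PREPROCESSING` clones the meta map before overwriting `single_end`; a plain Groovy sketch of why the `clone()` matters (the element emitted by `PORECHOP` itself is not mutated in place):

```groovy
// Standalone Groovy illustration of the meta handling used above.
def meta     = [ id:'2613', run_accession:'ERR0000001', single_end:false ]
def meta_new = meta.clone()      // shallow copy of the map
meta_new['single_end'] = 1       // force single-end downstream of Porechop

assert meta['single_end'] == false   // original map left untouched
assert meta_new['single_end'] == 1
```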
diff --git a/subworkflows/local/preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf
similarity index 97%
rename from subworkflows/local/preprocessing.nf
rename to subworkflows/local/shortread_preprocessing.nf
index 5832824..d996a76 100644
--- a/subworkflows/local/preprocessing.nf
+++ b/subworkflows/local/shortread_preprocessing.nf
@@ -7,7 +7,7 @@ include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/modules/fast
 include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/modules/fastp/main'
 include { FASTQC as FASTQC_POST } from '../../modules/nf-core/modules/fastqc/main'

-workflow FASTQ_PREPROCESSING {
+workflow SHORTREAD_PREPROCESSING {
     take:
     reads // file: /path/to/samplesheet.csv

@@ -23,7 +23,7 @@ workflow SHORTREAD_PREPROCESSING {
     // TODO move to subworkflow

-    if ( params.fastp_clip_merge ) {
+    if ( params.shortread_clipmerge ) {

         ch_input_for_fastp = reads
             .dump(tag: "pre-fastp_branch")
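A sketch of a user-supplied `-c` config exercising the renamed and new preprocessing flags (the defaults set in `nextflow.config` above are `false`, `true` and `false` respectively):

```groovy
// Hypothetical custom.config; parameter names are the ones introduced in this diff.
params {
    shortread_clipmerge       = true    // run the fastp clip/merge subworkflow on Illumina reads
    shortread_excludeunmerged = false   // i.e. pass --include_unmerged to fastp
    longread_clip             = true    // run Porechop on Nanopore reads
}
```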
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index f740324..4aa0684 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -39,8 +39,8 @@ ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multi
 include { INPUT_CHECK } from '../subworkflows/local/input_check'
 include { DB_CHECK    } from '../subworkflows/local/db_check'

-include { FASTQ_PREPROCESSING } from '../subworkflows/local/preprocessing'
-
+include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing'
+include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing'

 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -89,7 +89,7 @@ workflow TAXPROFILER {
     //
     // MODULE: Run FastQC
     //
     FASTQC (
-        INPUT_CHECK.out.fastq
+        INPUT_CHECK.out.fastq.mix( INPUT_CHECK.out.nanopore )
     )
     ch_versions = ch_versions.mix(FASTQC.out.versions.first())
@@ -100,14 +100,22 @@ workflow TAXPROFILER {
     //
     // PERFORM PREPROCESSING
     //
-    if ( params.fastp_clip_merge ) {
-        FASTQ_PREPROCESSING ( INPUT_CHECK.out.fastq )
+    if ( params.shortread_clipmerge ) {
+        SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq )
+    }
+
+    if ( params.longread_clip ) {
+        ch_longreads_preprocessed = LONGREAD_PREPROCESSING ( INPUT_CHECK.out.nanopore ).reads
+            .map { it -> [ it[0], [it[1]] ] }
+        ch_versions = ch_versions.mix(LONGREAD_PREPROCESSING.out.versions.first())
+    } else {
+        ch_longreads_preprocessed = INPUT_CHECK.out.nanopore
     }

     //
     // PERFORM RUN MERGING
     //
-    ch_processed_for_combine = FASTQ_PREPROCESSING.out.reads
+    ch_processed_for_combine = SHORTREAD_PREPROCESSING.out.reads
         .dump(tag: "prep_for_combine_grouping")
         .map {
             meta, reads ->
@@ -134,6 +142,7 @@ workflow TAXPROFILER {

     // output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90]
     ch_input_for_profiling = ch_reads_for_profiling
+        .mix( ch_longreads_preprocessed )
         .combine(DB_CHECK.out.dbs)
         .dump(tag: "reads_plus_db")
         .branch {
@@ -175,9 +184,13 @@ workflow TAXPROFILER {
     //
     // RUN PROFILING
     //
-    MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db )
-    KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db )
+    if ( params.run_malt ) {
+        MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db )
+    }
+    if ( params.run_kraken2 ) {
+        KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db )
+    }

     //
     // MODULE: MultiQC
     //
@@ -191,8 +204,12 @@ workflow TAXPROFILER {
     ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
     ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
     ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([]))
-    if (params.fastp_clip_merge) {
-        ch_multiqc_files = ch_multiqc_files.mix(FASTQ_PREPROCESSING.out.mqc)
+
+    if (params.shortread_clipmerge) {
+        ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_PREPROCESSING.out.mqc)
+    }
+    if (params.longread_clip) {
+        ch_multiqc_files = ch_multiqc_files.mix(LONGREAD_PREPROCESSING.out.mqc)
     }
     if (params.run_kraken2) {
         ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]))