diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5e57889..b18e601 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -29,8 +29,8 @@ jobs:
           - NXF_VER: ""
             NXF_EDGE: "1"
         parameters:
-          - "--longread_clip false"
-          - "--shortread_clip false"
+          - "--perform_longread_clip false"
+          - "--perform_shortread_clipmerge false"
           - "--shortread_clipmerge_tool fastp"
           - "--shortread_clipmerge_tool fastp --shortread_clipmerge_mergepairs --shortread_clipmerge_excludeunmerged"
           - "--shortread_clipmerge_tool fastp --shortread_clipmerge_mergepairs"
@@ -39,6 +39,8 @@ jobs:
           - "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs"
           - "--shortread_complexityfilter_tool bbduk"
           - "--shortread_complexityfilter_tool prinseq"
+          - "--perform_runmerging"
+          - "--perform_runmerging --shortread_clipmerge_mergepairs"
           - "--shortread_complexityfilter false --shortread_hostremoval"
 
     steps:
diff --git a/conf/modules.config b/conf/modules.config
index e3b662a..ccd1748 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -179,9 +179,19 @@ process {
         ]
     }
 
+    withName: CAT_FASTQ {
+        ext.prefix = { "${meta.id}" }
+        publishDir = [
+            path: { "${params.outdir}/run_merging/" },
+            mode: params.publish_dir_mode,
+            pattern: '*.fastq.gz',
+            enabled: params.save_runmerged_reads
+        ]
+    }
+
     withName: MALT_RUN {
         ext.args = { "${meta.db_params}" }
-        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
+        ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
         publishDir = [
             path: { "${params.outdir}/malt/${meta.db_name}" },
             mode: params.publish_dir_mode,
@@ -191,7 +201,7 @@
 
     withName: KRAKEN2_KRAKEN2 {
         ext.args = { "${meta.db_params}" }
-        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
+        ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
         publishDir = [
             path: { "${params.outdir}/kraken2/${meta.db_name}" },
             mode: params.publish_dir_mode,
@@ -200,12 +210,13 @@
     }
 
     withName: METAPHLAN3 {
+        ext.args = { "${meta.db_params}" }
+        ext.prefix = params.perform_runmerging ? { "${meta.id}-${meta.db_name}" } : { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
         publishDir = [
             path: { "${params.outdir}/metaphlan3/${meta.db_name}" },
             mode: params.publish_dir_mode,
             pattern: '*.{biom,txt}'
         ]
-        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
     }
 
     withName: CENTRIFUGE_CENTRIFUGE {
diff --git a/conf/test.config b/conf/test.config
index 616f82e..1d08d91 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -22,16 +22,15 @@ params {
     // Input data
     // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
     // TODO nf-core: Give any required params for the test so that command line flags are not needed
-
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
-    databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
-    run_kraken2 = true
-    run_malt = true
-    run_metaphlan3 = true
-    run_centrifuge = true
-    shortread_clipmerge = true
-    longread_clip = false
-    shortread_complexityfilter = true
+    input                              = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv'
+    databases                          = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
+    run_kraken2                        = true
+    run_malt                           = true
+    run_metaphlan3                     = true
+    run_centrifuge                     = true
+    perform_shortread_clipmerge        = true
+    perform_longread_clip              = false
+    perform_shortread_complexityfilter = true
     shortread_hostremoval = true
     shortread_hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
 }
diff --git a/nextflow.config b/nextflow.config
index 0c20e7d..6b0a79d 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -55,7 +55,7 @@ params {
     databases = null
 
     // FASTQ preprocessing
-    shortread_clipmerge = false
+    perform_shortread_clipmerge = false
     shortread_clipmerge_tool = 'fastp'
     shortread_clipmerge_skipadaptertrim = false
     shortread_clipmerge_mergepairs = false
@@ -63,11 +63,11 @@
     shortread_clipmerge_adapter1 = null
     shortread_clipmerge_adapter2 = null
     shortread_clipmerge_minlength = 15
-    longread_clip = false
+    perform_longread_clip = false
     save_preprocessed_reads = false
 
     // Complexity filtering
-    shortread_complexityfilter = false
+    perform_shortread_complexityfilter = false
     shortread_complexityfilter_tool = 'bbduk'
     shortread_complexityfilter_entropy = 0.3
    shortread_complexityfilter_bbduk_windowsize = 50
@@ -76,6 +76,9 @@
     shortread_complexityfilter_prinseqplusplus_dustscore = 0.5
     save_complexityfiltered_reads = false
 
+    // run merging
+    perform_runmerging = false
+    save_runmerged_reads = false
 
     // Host Removal
     shortread_hostremoval = false
@@ -94,6 +97,7 @@
     centrifuge_save_unaligned = false
     centrifuge_save_aligned = false
     centrifuge_sam_format = false
+
     // metaphlan3
     run_metaphlan3 = false
 }
- "type": "boolean" - }, "shortread_complexityfilter_entropy": { "type": "number", "default": 0.3 @@ -352,8 +342,22 @@ "default": 0.5 }, "save_complexityfiltered_reads": { - "type": "boolean", - "default": false + "type": "boolean" + }, + "save_runmerged_reads": { + "type": "boolean" + }, + "perform_shortread_clipmerge": { + "type": "boolean" + }, + "perform_longread_clip": { + "type": "boolean" + }, + "perform_shortread_complexityfilter": { + "type": "boolean" + }, + "perform_runmerging": { + "type": "boolean" }, "shortread_hostremoval": { "type": "boolean" diff --git a/nf-core/modules/centrifuge/centrifuge/main.nf b/nf-core/modules/centrifuge/centrifuge/main.nf deleted file mode 100644 index 3d23fc9..0000000 --- a/nf-core/modules/centrifuge/centrifuge/main.nf +++ /dev/null @@ -1,61 +0,0 @@ -process CENTRIFUGE_CENTRIFUGE { - tag "$meta.id" - label 'process_high' - - conda (params.enable_conda ? "bioconda::centrifuge=1.0.4_beta" : null) - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4_beta--h9a82719_6' : - 'quay.io/biocontainers/centrifuge:1.0.4_beta--h9a82719_6' }" - - input: - tuple val(meta), path(reads) - path db - val save_unaligned - val save_aligned - val sam_format - - output: - tuple val(meta), path('*report.txt') , emit: report - tuple val(meta), path('*results.txt') , emit: results - tuple val(meta), path('*.sam') , optional: true, emit: sam - tuple val(meta), path('*.mapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_mapped - tuple val(meta), path('*.unmapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_unmapped - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def paired = meta.single_end ? "-U ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}" - def unaligned = '' - def aligned = '' - if (meta.single_end) { - unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : '' - aligned = save_aligned ? "--al-gz ${prefix}.mapped.fastq.gz" : '' - } else { - unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : '' - aligned = save_aligned ? "--al-conc-gz ${prefix}.mapped.fastq.gz" : '' - } - def sam_output = sam_format ? "--out-fmt 'sam'" : '' - """ - ## we add "-no-name ._" to ensure silly Mac OSX metafiles files aren't included - db_name=`find -L ${db} -name "*.1.cf" -not -name "._*" | sed 's/.1.cf//'` - centrifuge \\ - -x \$db_name \\ - -p $task.cpus \\ - $paired \\ - --report-file ${prefix}.report.txt \\ - -S ${prefix}.results.txt \\ - $unaligned \\ - $aligned \\ - $sam_output \\ - $args - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - centrifuge: \$( centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //') - END_VERSIONS - """ -} diff --git a/nf-core/modules/centrifuge/centrifuge/meta.yml b/nf-core/modules/centrifuge/centrifuge/meta.yml deleted file mode 100644 index a252c00..0000000 --- a/nf-core/modules/centrifuge/centrifuge/meta.yml +++ /dev/null @@ -1,66 +0,0 @@ -name: centrifuge_centrifuge -description: Classifies metagenomic sequence data -keywords: - - classify - - metagenomics - - fastq - - db -tools: - - centrifuge: - description: Centrifuge is a classifier for metagenomic sequences. 
diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf
index c74c583..18de739 100644
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@@ -9,8 +9,7 @@ include { METAPHLAN3 } from '../../modules/nf-core/modules/meta
 workflow PROFILING {
     take:
-    shortreads // [ [ meta ], [ reads ] ]
-    longreads // [ [ meta ], [ reads ] ]
+    reads // [ [ meta ], [ reads ] ]
     databases // [ [ meta ], path ]
 
     main:
 
@@ -22,8 +21,14 @@ workflow PROFILING {
     */
 
    // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90]
-    ch_input_for_profiling = shortreads
-        .mix( longreads )
+    ch_input_for_profiling = reads
+        .map {
+            meta, reads ->
+                def meta_new = meta.clone()
+                pairtype = meta_new['single_end'] ? '_se' : '_pe'
+                meta_new['id'] = meta_new['id'] + pairtype
+                [meta_new, reads]
+        }
         .combine(databases)
         .branch {
             malt: it[2]['tool'] == 'malt'
diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf
index 9fb9425..6fed2ae 100644
--- a/subworkflows/local/shortread_fastp.nf
+++ b/subworkflows/local/shortread_fastp.nf
@@ -28,8 +28,8 @@ workflow SHORTREAD_FASTP {
         .map {
             meta, reads ->
                 def meta_new = meta.clone()
-                meta_new['single_end'] = 1
-                [ meta_new, reads ]
+                meta_new['single_end'] = true
+                [ meta_new, [ reads ].flatten() ]
         }
 
     ch_fastp_reads_prepped = ch_fastp_reads_prepped_pe.mix( FASTP_SINGLE.out.reads )
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index 95e3588..894a1e1 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -111,14 +111,14 @@ workflow TAXPROFILER {
     /*
         SUBWORKFLOW: PERFORM PREPROCESSING
     */
 
-    if ( params.shortread_clipmerge ) {
+    if ( params.perform_shortread_clipmerge ) {
         ch_shortreads_preprocessed = SHORTREAD_PREPROCESSING ( INPUT_CHECK.out.fastq ).reads
     } else {
         ch_shortreads_preprocessed = INPUT_CHECK.out.fastq
     }
 
-    if ( params.longread_clip ) {
+    if ( params.perform_longread_clip ) {
         ch_longreads_preprocessed = LONGREAD_PREPROCESSING ( INPUT_CHECK.out.nanopore ).reads
             .map { it -> [ it[0], [it[1]] ] }
         ch_versions = ch_versions.mix(LONGREAD_PREPROCESSING.out.versions.first())
@@ -130,7 +130,7 @@
         SUBWORKFLOW: COMPLEXITY FILTERING
     */
 
-    if ( params.shortread_complexityfilter ) {
+    if ( params.perform_shortread_complexityfilter ) {
         ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads
     } else {
         ch_shortreads_filtered = ch_shortreads_preprocessed
@@ -147,11 +147,48 @@ workflow TAXPROFILER {
         ch_shortreads_hostremoved = ch_shortreads_filtered
     }
 
+    */
+
+    if ( params.perform_runmerging ) {
+
+        ch_reads_for_cat_branch = ch_shortreads_hostremoved
+            .mix( ch_longreads_preprocessed )
+            .map {
+                meta, reads ->
+                    def meta_new = meta.clone()
+                    meta_new.remove('run_accession')
+                    [ meta_new, reads ]
+            }
+            .groupTuple()
+            .map {
+                meta, reads ->
+                    [ meta, reads.flatten() ]
+            }
+            .branch {
+                meta, reads ->
+                    // we can't concatenate files if there is not a second run, we branch
+                    // here to separate them out, and mix back in after for efficiency
+                    cat: ( meta.single_end && reads.size() > 1 ) || ( !meta.single_end && reads.size() > 2 )
+                    skip: true
+            }
+
+        ch_reads_runmerged = CAT_FASTQ ( ch_reads_for_cat_branch.cat ).reads
+            .mix( ch_reads_for_cat_branch.skip )
+            .map {
+                meta, reads ->
+                    [ meta, [ reads ].flatten() ]
+            }
+
+    } else {
+        ch_reads_runmerged = ch_shortreads_hostremoved
+            .mix( ch_longreads_preprocessed )
+    }
+
     /*
         SUBWORKFLOW: PROFILING
     */
 
-    PROFILING ( ch_shortreads_hostremoved, ch_longreads_preprocessed, DB_CHECK.out.dbs )
+    PROFILING ( ch_reads_runmerged, DB_CHECK.out.dbs )
     ch_versions = ch_versions.mix( PROFILING.out.versions )
 
     /*
@@ -173,23 +210,28 @@
     ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
     ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([]))
 
-    if (params.shortread_clipmerge) {
+    if (params.perform_shortread_clipmerge) {
         ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) )
         ch_versions = ch_versions.mix( SHORTREAD_PREPROCESSING.out.versions )
     }
 
-    if (params.longread_clip) {
+    if (params.perform_longread_clip) {
         ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) )
         ch_versions = ch_versions.mix( LONGREAD_PREPROCESSING.out.versions )
     }
 
-    if (params.shortread_complexityfilter){
+    if (params.perform_shortread_complexityfilter){
         ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) )
         ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions )
     }
 
     if (params.shortread_hostremoval) {
         ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_HOSTREMOVAL.out.mqc.collect{it[1]}.ifEmpty([]))
+        ch_versions = ch_versions.mix(SHORTREAD_HOSTREMOVAL.out.versions)
+    }
+
+    if (params.perform_runmerging){
+        ch_versions = ch_versions.mix(CAT_FASTQ.out.versions)
     }
 
     ch_multiqc_files = ch_multiqc_files.mix( PROFILING.out.mqc )
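The cat/skip rule at the heart of the new run-merging block can be exercised on its own — a minimal, runnable sketch with hypothetical sample names: a single-end sample with two runs lands in `cat` (and would be concatenated by CAT_FASTQ), while a paired-end sample with a single run (two files) lands in `skip`:

```nextflow
// Self-contained sketch of the branching rule (inputs are hypothetical).
// single-end: more than 1 file means multiple runs; paired-end: more than
// 2 files means multiple runs, since one run already contributes R1 + R2.
workflow {
    Channel.of(
        [ [ id:'sampleA', single_end:true  ], [ 'A_run1.fq.gz', 'A_run2.fq.gz' ] ],
        [ [ id:'sampleB', single_end:false ], [ 'B_1.fq.gz', 'B_2.fq.gz' ] ]
    )
        .branch { meta, reads ->
            cat:  ( meta.single_end && reads.size() > 1 ) || ( !meta.single_end && reads.size() > 2 )
            skip: true
        }
        .set { ch_branched }

    ch_branched.cat.view  { meta, reads -> "cat:  ${meta.id} (${reads.size()} files)" }  // sampleA
    ch_branched.skip.view { meta, reads -> "skip: ${meta.id} (${reads.size()} files)" }  // sampleB
}
```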