From 409470642141460daace239396d7fdc95b0580b5 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 1 May 2022 07:24:58 +0200 Subject: [PATCH 01/25] Only create profiler input channels when profiler activate --- subworkflows/local/profiling.nf | 142 +++++++++++++++++--------------- 1 file changed, 74 insertions(+), 68 deletions(-) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 9389e19..d8e9c84 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -48,7 +48,7 @@ workflow PROFILING { } /* - PREPARE PROFILER INPUT CHANNELS + PREPARE PROFILER INPUT CHANNELS & RUN PROFILING */ // Each tool as a slightly different input structure and generally separate @@ -56,74 +56,26 @@ workflow PROFILING { // for each tool and make liberal use of multiMap to keep reads/databases // channel element order in sync with each other - // MALT: We groupTuple to have all samples in one channel for MALT as database - // loading takes a long time, so we only want to run it once per database - // TODO document somewhere we only accept illumina short reads for MALT? - ch_input_for_malt = ch_input_for_profiling.malt - .filter { it[0]['instrument_platform'] == 'ILLUMINA' } - .map { - it -> - def temp_meta = [ id: it[2]['db_name']] + it[2] - def db = it[3] - [ temp_meta, it[1], db ] - } - .groupTuple(by: [0,2]) - .multiMap { - it -> - reads: [ it[0], it[1].flatten() ] - db: it[2] - } - - // All subsequent tools can easily run on a per-sample basis - - ch_input_for_kraken2 = ch_input_for_profiling.kraken2 - .multiMap { - it -> - reads: [ it[0] + it[2], it[1] ] - db: it[3] - } - - ch_input_for_centrifuge = ch_input_for_profiling.centrifuge - .filter{ - if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] Centrifuge currently does not accept FASTA files as input. Skipping Centrifuge for sample ${it[0].id}." - !it[0].is_fasta - } - .multiMap { - it -> - reads: [ it[0] + it[2], it[1] ] - db: it[3] - } - - ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3 - .filter{ - if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] MetaPhlAn3 currently does not accept FASTA files as input. Skipping MetaPhlAn3 for sample ${it[0].id}." - !it[0].is_fasta - } - .multiMap { - it -> - reads: [it[0] + it[2], it[1]] - db: it[3] - } - - ch_input_for_kaiju = ch_input_for_profiling.kaiju - .multiMap { - it -> - reads: [it[0] + it[2], it[1]] - db: it[3] - } - - ch_input_for_diamond = ch_input_for_profiling.diamond - .multiMap { - it -> - reads: [it[0] + it[2], it[1]] - db: it[3] - } - - /* - RUN PROFILING - */ - if ( params.run_malt ) { + + + // MALT: We groupTuple to have all samples in one channel for MALT as database + // loading takes a long time, so we only want to run it once per database + // TODO document somewhere we only accept illumina short reads for MALT? 
+ ch_input_for_malt = ch_input_for_profiling.malt + .filter { it[0]['instrument_platform'] == 'ILLUMINA' } + .map { + it -> + def temp_meta = [ id: it[2]['db_name']] + it[2] + def db = it[3] + [ temp_meta, it[1], db ] + } + .groupTuple(by: [0,2]) + .multiMap { + it -> + reads: [ it[0], it[1].flatten() ] + db: it[2] + MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) ch_maltrun_for_megan = MALT_RUN.out.rma6 @@ -143,40 +95,94 @@ workflow PROFILING { ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( MALT_RUN.out.versions.first(), MEGAN_RMA2INFO.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( MEGAN_RMA2INFO.out.txt ) + } if ( params.run_kraken2 ) { + + ch_input_for_kraken2 = ch_input_for_profiling.kraken2 + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db ) ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.txt.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( KRAKEN2_KRAKEN2.out.txt ) + } if ( params.run_centrifuge ) { + + ch_input_for_centrifuge = ch_input_for_profiling.centrifuge + .filter{ + if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] Centrifuge currently does not accept FASTA files as input. Skipping Centrifuge for sample ${it[0].id}." + !it[0].is_fasta + } + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + CENTRIFUGE_CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_unaligned, params.centrifuge_save_aligned, params.centrifuge_sam_format ) CENTRIFUGE_KREPORT (CENTRIFUGE_CENTRIFUGE.out.results, ch_input_for_centrifuge.db) ch_versions = ch_versions.mix( CENTRIFUGE_CENTRIFUGE.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( CENTRIFUGE_KREPORT.out.kreport ) + } if ( params.run_metaphlan3 ) { + + ch_input_for_metaphlan3 = ch_input_for_profiling.metaphlan3 + .filter{ + if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] MetaPhlAn3 currently does not accept FASTA files as input. Skipping MetaPhlAn3 for sample ${it[0].id}." 
+ !it[0].is_fasta + } + .multiMap { + it -> + reads: [it[0] + it[2], it[1]] + db: it[3] + } + METAPHLAN3 ( ch_input_for_metaphlan3.reads, ch_input_for_metaphlan3.db ) ch_versions = ch_versions.mix( METAPHLAN3.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( METAPHLAN3.out.biom ) + } if ( params.run_kaiju ) { + + ch_input_for_kaiju = ch_input_for_profiling.kaiju + .multiMap { + it -> + reads: [it[0] + it[2], it[1]] + db: it[3] + } + KAIJU_KAIJU ( ch_input_for_kaiju.reads, ch_input_for_kaiju.db) KAIJU_KAIJU2TABLE (KAIJU_KAIJU.out.results, ch_input_for_kaiju.db, params.kaiju_taxon_name) ch_multiqc_files = ch_multiqc_files.mix( KAIJU_KAIJU2TABLE.out.summary.collect{it[1]}.ifEmpty([]) ) ch_versions = ch_versions.mix( KAIJU_KAIJU.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( KAIJU_KAIJU2TABLE.out.summary ) + } if ( params.run_diamond ) { + + ch_input_for_diamond = ch_input_for_profiling.diamond + .multiMap { + it -> + reads: [it[0] + it[2], it[1]] + db: it[3] + } + DIAMOND_BLASTX ( ch_input_for_diamond.reads, ch_input_for_diamond.db, params.diamond_output_format ) ch_versions = ch_versions.mix( DIAMOND_BLASTX.out.versions.first() ) ch_raw_profiles = ch_raw_profiles.mix( DIAMOND_BLASTX.out.output ) + } emit: From d5049a34e49c2a093039b83b183e48a81c6e9d60 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sun, 1 May 2022 07:28:29 +0200 Subject: [PATCH 02/25] Add missing close bracket --- subworkflows/local/profiling.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index d8e9c84..7fb3ce9 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -75,6 +75,7 @@ workflow PROFILING { it -> reads: [ it[0], it[1].flatten() ] db: it[2] + } MALT_RUN ( ch_input_for_malt.reads, params.malt_mode, ch_input_for_malt.db ) From a821f748ee89ade6e229edab28fa7f3399969596 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Tue, 3 May 2022 09:28:46 +0200 Subject: [PATCH 03/25] Add modules --- conf/modules.config | 41 ++++++++++++ modules.json | 14 +++- modules/nf-core/modules/metaphlan3/main.nf | 2 +- .../nf-core/modules/minimap2/align/main.nf | 48 ++++++++++++++ .../nf-core/modules/minimap2/align/meta.yml | 65 +++++++++++++++++++ .../nf-core/modules/minimap2/index/main.nf | 33 ++++++++++ .../nf-core/modules/minimap2/index/meta.yml | 30 +++++++++ .../nf-core/modules/samtools/bam2fq/main.nf | 56 ++++++++++++++++ .../nf-core/modules/samtools/bam2fq/meta.yml | 55 ++++++++++++++++ modules/nf-core/modules/samtools/view/main.nf | 44 +++++++++++++ .../nf-core/modules/samtools/view/meta.yml | 57 ++++++++++++++++ nf-core/modules/samtools/bam2fq/main.nf | 56 ++++++++++++++++ nf-core/modules/samtools/bam2fq/meta.yml | 55 ++++++++++++++++ 13 files changed, 554 insertions(+), 2 deletions(-) create mode 100644 modules/nf-core/modules/minimap2/align/main.nf create mode 100644 modules/nf-core/modules/minimap2/align/meta.yml create mode 100644 modules/nf-core/modules/minimap2/index/main.nf create mode 100644 modules/nf-core/modules/minimap2/index/meta.yml create mode 100644 modules/nf-core/modules/samtools/bam2fq/main.nf create mode 100644 modules/nf-core/modules/samtools/bam2fq/meta.yml create mode 100644 modules/nf-core/modules/samtools/view/main.nf create mode 100644 modules/nf-core/modules/samtools/view/meta.yml create mode 100644 nf-core/modules/samtools/bam2fq/main.nf create mode 100644 nf-core/modules/samtools/bam2fq/meta.yml diff --git a/conf/modules.config 
b/conf/modules.config index d8fb382..b707954 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -164,6 +164,47 @@ process { ] } + withName: MINIMAP2_INDEX { + ext.args = '-x map-ont' + publishDir = [ + path: { "${params.outdir}/minimap2/index" }, + mode: params.publish_dir_mode, + enabled: params.save_minimap2_hostremoval_index, + pattern: 'minimap2' + ] + } + + withName: MINIMAP2_ALIGN { + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/minimap2/align" }, + mode: params.publish_dir_mode, + enabled: params.save_minimap2_hostremoval_mapped, + pattern: '*.bam' + ] + } + + withName: SAMTOOLS_VIEW { + ext.args = '-f 4' + ext.prefix = { "${meta.id}.mapped.sorted" } + publishDir = [ + path: { "${params.outdir}/samtools/view" }, + mode: params.publish_dir_mode, + enabled: params.save_samtools_unmapped_bam, + pattern: '*.bam' + ] + } + + withName: SAMTOOLS_BAM2FQ { + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/samtools/bam2fq" }, + mode: params.publish_dir_mode, + enabled: params.save_minimap2_unmapped_fq, + pattern: '*.fq.gz' + ] + } + withName: BBMAP_BBDUK { ext.args = [ "entropy=${params.shortread_complexityfilter_entropy}", diff --git a/modules.json b/modules.json index a65926c..071234d 100644 --- a/modules.json +++ b/modules.json @@ -52,6 +52,12 @@ "git_sha": "2d38566eca4cc15142b2ffa7c11837569b39aece" }, "metaphlan3": { + "git_sha": "ed4dd1a928ebf4308efb720de878045f7773f8e2" + }, + "minimap2/align": { + "git_sha": "1a5a9e7b4009dcf34e6867dd1a5a1d9a718b027b" + }, + "minimap2/index": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, "multiqc": { @@ -63,9 +69,15 @@ "prinseqplusplus": { "git_sha": "f1c5384c31e985591716afdd732cf8c2ae29d05b" }, + "samtools/bam2fq": { + "git_sha": "897c33d5da084b61109500ee44c01da2d3e4e773" + }, + "samtools/view": { + "git_sha": "12afb6b0faf3cabf769c9a2a7dd477e3f066eac0" + }, "untar": { "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918" } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/metaphlan3/main.nf b/modules/nf-core/modules/metaphlan3/main.nf index 3fc6b27..bff0eb9 100644 --- a/modules/nf-core/modules/metaphlan3/main.nf +++ b/modules/nf-core/modules/metaphlan3/main.nf @@ -23,7 +23,7 @@ process METAPHLAN3 { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def input_type = ("$input".endsWith(".fastq.gz")) ? "--input_type fastq" : ("$input".contains(".fasta")) ? "--input_type fasta" : ("$input".endsWith(".bowtie2out.txt")) ? "--input_type bowtie2out" : "--input_type sam" + def input_type = ("$input".endsWith(".fastq.gz") || "$input".endsWith(".fq.gz")) ? "--input_type fastq" : ("$input".contains(".fasta")) ? "--input_type fasta" : ("$input".endsWith(".bowtie2out.txt")) ? "--input_type bowtie2out" : "--input_type sam" def input_data = ("$input_type".contains("fastq")) && !meta.single_end ? "${input[0]},${input[1]}" : "$input" def bowtie2_out = "$input_type" == "--input_type bowtie2out" || "$input_type" == "--input_type sam" ? '' : "--bowtie2out ${prefix}.bowtie2out.txt" diff --git a/modules/nf-core/modules/minimap2/align/main.nf b/modules/nf-core/modules/minimap2/align/main.nf new file mode 100644 index 0000000..08ac6ee --- /dev/null +++ b/modules/nf-core/modules/minimap2/align/main.nf @@ -0,0 +1,48 @@ +process MINIMAP2_ALIGN { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? 
'bioconda::minimap2=2.21 bioconda::samtools=1.12' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' : + 'quay.io/biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' }" + + input: + tuple val(meta), path(reads) + path reference + val bam_format + val cigar_paf_format + val cigar_bam + + output: + tuple val(meta), path("*.paf"), optional: true, emit: paf + tuple val(meta), path("*.bam"), optional: true, emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input_reads = meta.single_end ? "$reads" : "${reads[0]} ${reads[1]}" + def bam_output = bam_format ? "-a | samtools sort | samtools view -@ ${task.cpus} -b -h -o ${prefix}.bam" : "-o ${prefix}.paf" + def cigar_paf = cigar_paf_format && !bam_format ? "-c" : '' + def set_cigar_bam = cigar_bam && bam_format ? "-L" : '' + """ + minimap2 \\ + $args \\ + -t $task.cpus \\ + $reference \\ + $input_reads \\ + $cigar_paf \\ + $set_cigar_bam \\ + $bam_output + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + minimap2: \$(minimap2 --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/minimap2/align/meta.yml b/modules/nf-core/modules/minimap2/align/meta.yml new file mode 100644 index 0000000..991b39a --- /dev/null +++ b/modules/nf-core/modules/minimap2/align/meta.yml @@ -0,0 +1,65 @@ +name: minimap2_align +description: A versatile pairwise aligner for genomic and spliced nucleotide sequences +keywords: + - align + - fasta + - fastq + - genome + - paf + - reference +tools: + - minimap2: + description: | + A versatile pairwise aligner for genomic and spliced nucleotide sequences. + homepage: https://github.com/lh3/minimap2 + documentation: https://github.com/lh3/minimap2#uguide + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FASTA or FASTQ files of size 1 and 2 for single-end + and paired-end data, respectively. + - reference: + type: file + description: | + Reference database in FASTA format. + - bam_format: + type: boolean + description: Specify that output should be in BAM format + - cigar_paf_format: + type: boolean + description: Specify that output CIGAR should be in PAF format + - cigar_bam: + type: boolean + description: | + Write CIGAR with >65535 ops at the CG tag. This is recommended when + doing XYZ (https://github.com/lh3/minimap2#working-with-65535-cigar-operations) +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - paf: + type: file + description: Alignment in PAF format + pattern: "*.paf" + - bam: + type: file + description: Alignment in BAM format + pattern: "*.bam" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@heuermh" + - "@sofstam" + - "@sateeshperi" + - "@jfy133" diff --git a/modules/nf-core/modules/minimap2/index/main.nf b/modules/nf-core/modules/minimap2/index/main.nf new file mode 100644 index 0000000..3dfeb86 --- /dev/null +++ b/modules/nf-core/modules/minimap2/index/main.nf @@ -0,0 +1,33 @@ +process MINIMAP2_INDEX { + label 'process_medium' + + conda (params.enable_conda ? 'bioconda::minimap2=2.21' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/minimap2:2.21--h5bf99c6_0' : + 'quay.io/biocontainers/minimap2:2.21--h5bf99c6_0' }" + + input: + path fasta + + output: + path "*.mmi" , emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + minimap2 \\ + -t $task.cpus \\ + -d ${fasta.baseName}.mmi \\ + $args \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + minimap2: \$(minimap2 --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/minimap2/index/meta.yml b/modules/nf-core/modules/minimap2/index/meta.yml new file mode 100644 index 0000000..3bf9f04 --- /dev/null +++ b/modules/nf-core/modules/minimap2/index/meta.yml @@ -0,0 +1,30 @@ +name: minimap2_index +description: Provides fasta index required by minimap2 alignment. +keywords: + - index + - fasta + - reference +tools: + - minimap2: + description: | + A versatile pairwise aligner for genomic and spliced nucleotide sequences. + homepage: https://github.com/lh3/minimap2 + documentation: https://github.com/lh3/minimap2#uguide + licence: ["MIT"] +input: + - fasta: + type: file + description: | + Reference database in FASTA format. +output: + - mmi: + type: file + description: Minimap2 fasta index. + pattern: "*.mmi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@yuukiiwa" + - "@drpatelh" diff --git a/modules/nf-core/modules/samtools/bam2fq/main.nf b/modules/nf-core/modules/samtools/bam2fq/main.nf new file mode 100644 index 0000000..554af48 --- /dev/null +++ b/modules/nf-core/modules/samtools/bam2fq/main.nf @@ -0,0 +1,56 @@ +process SAMTOOLS_BAM2FQ { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::samtools=1.15.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.15.1--h1170115_0' : + 'quay.io/biocontainers/samtools:1.15.1--h1170115_0' }" + + input: + tuple val(meta), path(inputbam) + val split + + output: + tuple val(meta), path("*.fq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + if (split){ + """ + samtools \\ + bam2fq \\ + $args \\ + -@ $task.cpus \\ + -1 ${prefix}_1.fq.gz \\ + -2 ${prefix}_2.fq.gz \\ + -0 ${prefix}_other.fq.gz \\ + -s ${prefix}_singleton.fq.gz \\ + $inputbam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + } else { + """ + samtools \\ + bam2fq \\ + $args \\ + -@ $task.cpus \\ + $inputbam | gzip > ${prefix}_interleaved.fq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/modules/samtools/bam2fq/meta.yml b/modules/nf-core/modules/samtools/bam2fq/meta.yml new file mode 100644 index 0000000..319a60c --- /dev/null +++ b/modules/nf-core/modules/samtools/bam2fq/meta.yml @@ -0,0 +1,55 @@ +name: samtools_bam2fq +description: | + The module uses bam2fq method from samtools to + convert a SAM, BAM or CRAM file to FASTQ format +keywords: + - bam2fq + - samtools + - fastq +tools: + - samtools: + description: Tools for dealing with SAM, BAM and CRAM files + homepage: None + documentation: http://www.htslib.org/doc/1.1/samtools.html + tool_dev_url: None + doi: "" + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - inputbam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - split: + type: boolean + description: | + TRUE/FALSE value to indicate if reads should be separated into + /1, /2 and if present other, or singleton. + Note: choosing TRUE will generate 4 different files. + Choosing FALSE will produce a single file, which will be interleaved in case + the input contains paired reads. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads: + type: file + description: | + FASTQ files, which will be either a group of 4 files (read_1, read_2, other and singleton) + or a single interleaved .fq.gz file if the user chooses not to split the reads. + pattern: "*.fq.gz" + +authors: + - "@lescai" diff --git a/modules/nf-core/modules/samtools/view/main.nf b/modules/nf-core/modules/samtools/view/main.nf new file mode 100644 index 0000000..11cfb74 --- /dev/null +++ b/modules/nf-core/modules/samtools/view/main.nf @@ -0,0 +1,44 @@ +process SAMTOOLS_VIEW { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? "bioconda::samtools=1.15.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.15.1--h1170115_0' : + 'quay.io/biocontainers/samtools:1.15.1--h1170115_0' }" + + input: + tuple val(meta), path(input), path(index) + path fasta + + output: + tuple val(meta), path("*.bam") , emit: bam , optional: true + tuple val(meta), path("*.cram"), emit: cram, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta} -C" : "" + def file_type = input.getExtension() + if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + samtools \\ + view \\ + --threads ${task.cpus-1} \\ + ${reference} \\ + $args \\ + $input \\ + $args2 \\ + > ${prefix}.${file_type} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/samtools/view/meta.yml b/modules/nf-core/modules/samtools/view/meta.yml new file mode 100644 index 0000000..a8b43ec --- /dev/null +++ b/modules/nf-core/modules/samtools/view/meta.yml @@ -0,0 +1,57 @@ +name: samtools_view +description: filter/convert SAM/BAM/CRAM file +keywords: + - view + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: hhttp://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - index: + type: optional file + description: BAM.BAI/CRAM.CRAI file + pattern: "*.{.bai,.crai}" + - fasta: + type: optional file + description: Reference file the CRAM was created with + pattern: "*.{fasta,fa}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: filtered/converted BAM/SAM file + pattern: "*.{bam,sam}" + - cram: + type: file + description: filtered/converted CRAM file + pattern: "*.cram" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@joseespinosa" + - "@FriederikeHanssen" diff --git a/nf-core/modules/samtools/bam2fq/main.nf b/nf-core/modules/samtools/bam2fq/main.nf new file mode 100644 index 0000000..5d6aa79 --- /dev/null +++ b/nf-core/modules/samtools/bam2fq/main.nf @@ -0,0 +1,56 @@ +process SAMTOOLS_BAM2FQ { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::samtools=1.15.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.15.1--h1170115_0' : + 'quay.io/biocontainers/samtools:1.15.1--h1170115_0' }" + + input: + tuple val(meta), path(inputbam) + val split + + output: + tuple val(meta), path("*.fq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + if (split){ + """ + samtools \\ + bam2fq \\ + $args \\ + -@ $task.cpus \\ + -1 ${prefix}_1.fq.gz \\ + -2 ${prefix}_2.fq.gz \\ + -0 ${prefix}_other.fq.gz \\ + -s ${prefix}_singleton.fq.gz \\ + $inputbam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + } else { + """ + samtools \\ + bam2fq \\ + $args \\ + -@ $task.cpus \\ + $inputbam >${prefix}_interleaved.fq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + } +} diff --git a/nf-core/modules/samtools/bam2fq/meta.yml b/nf-core/modules/samtools/bam2fq/meta.yml new file mode 100644 index 0000000..319a60c --- /dev/null +++ b/nf-core/modules/samtools/bam2fq/meta.yml @@ -0,0 +1,55 @@ +name: samtools_bam2fq +description: | + The module uses bam2fq method from samtools to + convert a SAM, BAM or CRAM file to FASTQ format +keywords: + - bam2fq + - samtools + - fastq +tools: + - samtools: + description: Tools for dealing with SAM, BAM and CRAM files + homepage: None + documentation: http://www.htslib.org/doc/1.1/samtools.html + tool_dev_url: None + doi: "" + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - inputbam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - split: + type: boolean + description: | + TRUE/FALSE value to indicate if reads should be separated into + /1, /2 and if present other, or singleton. + Note: choosing TRUE will generate 4 different files. + Choosing FALSE will produce a single file, which will be interleaved in case + the input contains paired reads. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads: + type: file + description: | + FASTQ files, which will be either a group of 4 files (read_1, read_2, other and singleton) + or a single interleaved .fq.gz file if the user chooses not to split the reads. 
+ pattern: "*.fq.gz" + +authors: + - "@lescai" From 5e6be52fab87d6b6a7f14ce254f63b02ad682a63 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Tue, 3 May 2022 09:34:27 +0200 Subject: [PATCH 04/25] Implement hostremoval --- nextflow.config | 18 ++++++--- subworkflows/local/longread_hostremoval.nf | 47 ++++++++++++++++++++++ workflows/taxprofiler.nf | 17 ++++++-- 3 files changed, 72 insertions(+), 10 deletions(-) create mode 100644 subworkflows/local/longread_hostremoval.nf diff --git a/nextflow.config b/nextflow.config index 5644786..04273f1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -81,12 +81,18 @@ params { save_runmerged_reads = false // Host Removal - perform_shortread_hostremoval = false - shortread_hostremoval_reference = null - shortread_hostremoval_index = null - save_hostremoval_index = false - save_hostremoval_mapped = false - save_hostremoval_unmapped = false + perform_shortread_hostremoval = false + shortread_hostremoval_reference = null + shortread_hostremoval_index = null + longread_hostremoval_index = null + save_hostremoval_index = false + save_hostremoval_mapped = false + save_hostremoval_unmapped = false + save_minimap2_hostremoval_index = false + save_minimap2_hostremoval_mapped = false + save_minimap2_hostremoval_unmapped = false + save_samtools_unmapped_bam = false + save_minimap2_unmapped_fq = false // MALT run_malt = false diff --git a/subworkflows/local/longread_hostremoval.nf b/subworkflows/local/longread_hostremoval.nf new file mode 100644 index 0000000..7db020b --- /dev/null +++ b/subworkflows/local/longread_hostremoval.nf @@ -0,0 +1,47 @@ +// +// Remove host reads via alignment and export off-target reads +// + +include { MINIMAP2_INDEX } from '../../modules/nf-core/modules/minimap2/index/main' +include { MINIMAP2_ALIGN } from '../../modules/nf-core/modules/minimap2/align/main' +include { SAMTOOLS_VIEW } from '../../modules/nf-core/modules/samtools/view/main' +include { SAMTOOLS_BAM2FQ } from '../../modules/nf-core/modules/samtools/bam2fq/main' + +workflow LONGREAD_HOSTREMOVAL { + take: + reads // [ [ meta ], [ reads ] ] + reference // /path/to/fasta + index // /path/to/index + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + if ( !params.longread_hostremoval_index ) { + ch_minimap2_index = MINIMAP2_INDEX ( reference ).index + ch_versions = ch_versions.mix( MINIMAP2_INDEX.out.versions ) + } else { + ch_minimap2_index = index + } + + MINIMAP2_ALIGN ( reads, ch_minimap2_index, true, false, false ) + ch_versions = ch_versions.mix( MINIMAP2_ALIGN.out.versions.first() ) + ch_minimap2_mapped = MINIMAP2_ALIGN.out.bam + .map { + meta, reads -> + [ meta, reads, [] ] + } + + + SAMTOOLS_VIEW ( ch_minimap2_mapped , [] ) + ch_versions = ch_versions.mix( SAMTOOLS_VIEW.out.versions.first() ) + + SAMTOOLS_BAM2FQ ( SAMTOOLS_VIEW.out.bam, false ) + ch_versions = ch_versions.mix( SAMTOOLS_BAM2FQ.out.versions.first() ) + + + emit: + reads = SAMTOOLS_BAM2FQ.out.reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] +} + diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index a0046b2..9eb53a3 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -26,7 +26,8 @@ if (params.perform_shortread_hostremoval && !params.shortread_hostremoval_refere if (!params.shortread_hostremoval_reference && params.shortread_hostremoval_reference_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no 
--shortread_hostremoval_reference FASTA supplied. Check input." } if (params.shortread_hostremoval_reference ) { ch_reference = file(params.shortread_hostremoval_reference) } -if (params.shortread_hostremoval_index ) { ch_reference_index = file(params.shortread_hostremoval_index ) } else { ch_reference_index = [] } +if (params.shortread_hostremoval_index ) { ch_shortread_reference_index = file(params.shortread_hostremoval_index ) } else { ch_shortread_reference_index = [] } +if (params.longread_hostremoval_index ) { ch_longread_reference_index = file(params.longread_hostremoval_index ) } else { ch_longread_reference_index = [] } /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -52,6 +53,7 @@ include { DB_CHECK } from '../subworkflows/local/db_check' include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing' include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing' include { SHORTREAD_HOSTREMOVAL } from '../subworkflows/local/shortread_hostremoval' +include { LONGREAD_HOSTREMOVAL } from '../subworkflows/local/longread_hostremoval' include { SHORTREAD_COMPLEXITYFILTERING } from '../subworkflows/local/shortread_complexityfiltering' include { PROFILING } from '../subworkflows/local/profiling' @@ -141,16 +143,23 @@ workflow TAXPROFILER { */ if ( params.perform_shortread_hostremoval ) { - ch_shortreads_hostremoved = SHORTREAD_HOSTREMOVAL ( ch_shortreads_filtered, ch_reference, ch_reference_index ).reads + ch_shortreads_hostremoved = SHORTREAD_HOSTREMOVAL ( ch_shortreads_filtered, ch_reference, ch_shortread_reference_index ).reads ch_versions = ch_versions.mix(SHORTREAD_HOSTREMOVAL.out.versions) } else { ch_shortreads_hostremoved = ch_shortreads_filtered } + if ( params.perform_longread_hostremoval ) { + ch_longreads_hostremoved = LONGREAD_HOSTREMOVAL ( ch_longreads_preprocessed, ch_reference, ch_longread_reference_index ).reads + ch_versions = ch_versions.mix(LONGREAD_HOSTREMOVAL.out.versions) + } else { + ch_longreads_hostremoved = ch_longreads_preprocessed + } + if ( params.perform_runmerging ) { ch_reads_for_cat_branch = ch_shortreads_hostremoved - .mix( ch_longreads_preprocessed ) + .mix( ch_longreads_hostremoved ) .map { meta, reads -> def meta_new = meta.clone() @@ -182,7 +191,7 @@ workflow TAXPROFILER { } else { ch_reads_runmerged = ch_shortreads_hostremoved - .mix( ch_longreads_preprocessed, INPUT_CHECK.out.fasta ) + .mix( ch_longreads_hostremoved, INPUT_CHECK.out.fasta ) } /* From afdebaf09a41cfc6fd3f3d3455f3f69b1e1dc608 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Tue, 3 May 2022 10:15:11 +0200 Subject: [PATCH 05/25] Remove own modification to a module --- modules/nf-core/modules/samtools/bam2fq/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/nf-core/modules/samtools/bam2fq/main.nf b/modules/nf-core/modules/samtools/bam2fq/main.nf index 554af48..0dd1da9 100644 --- a/modules/nf-core/modules/samtools/bam2fq/main.nf +++ b/modules/nf-core/modules/samtools/bam2fq/main.nf @@ -45,7 +45,7 @@ process SAMTOOLS_BAM2FQ { bam2fq \\ $args \\ -@ $task.cpus \\ - $inputbam | gzip > ${prefix}_interleaved.fq.gz + $inputbam > ${prefix}_interleaved.fq.gz cat <<-END_VERSIONS > versions.yml "${task.process}": From fa1c13263573e5cda53fb2468e8c339ec0ad0ecf Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Tue, 3 May 2022 11:03:25 +0200 Subject: [PATCH 06/25] Update nextflow_schema --- 
nextflow_schema.json | 67 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 61 insertions(+), 6 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index f429d1b..d3878c1 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -173,7 +176,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -294,7 +304,10 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": ["fastp", "adapterremoval"] + "enum": [ + "fastp", + "adapterremoval" + ] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -335,7 +348,10 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": ["entropy", "dust"] + "enum": [ + "entropy", + "dust" + ] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -388,7 +404,14 @@ "kaiju_taxon_name": { "type": "string", "default": "species", - "enum": ["phylum", "class", "order", "family", "genus", "species"] + "enum": [ + "phylum", + "class", + "order", + "family", + "genus", + "species" + ] }, "run_diamond": { "type": "boolean" @@ -396,7 +419,39 @@ "diamond_output_format": { "type": "string", "default": "tsv", - "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"] + "enum": [ + "blast", + "xml", + "txt", + "daa", + "sam", + "tsv", + "paf" + ] + }, + "longread_hostremoval_index": { + "type": "string", + "default": null + }, + "save_minimap2_hostremoval_index": { + "type": "string", + "default": "false", + "description": "Flag for publishing minimap2 host removal index" + }, + "save_minimap2_hostremoval_mapped": { + "type": "string", + "default": "false", + "description": "Flag for publishinig bam file with all long reads mapped to a reference" + }, + "save_samtools_unmapped_bam": { + "type": "string", + "default": "false", + "description": "Flag for publishing bam for reads that did not map to the host reference" + }, + "save_minimap2_unmapped_fq": { + "type": "string", + "default": "false", + "description": "Flag for publishing fastq files for reads that did not map to the host reference" } } } From 4b2a3789cd7ba15ac41f32f6179bcc1665cb8e47 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Tue, 3 May 2022 11:03:48 +0200 Subject: [PATCH 07/25] Update default values for host removal parameters --- nextflow.config | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/nextflow.config b/nextflow.config index 04273f1..95daba9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -81,18 +81,18 @@ params { save_runmerged_reads = false // Host Removal - perform_shortread_hostremoval = false - shortread_hostremoval_reference = null - 
shortread_hostremoval_index = null - longread_hostremoval_index = null - save_hostremoval_index = false - save_hostremoval_mapped = false - save_hostremoval_unmapped = false - save_minimap2_hostremoval_index = false - save_minimap2_hostremoval_mapped = false - save_minimap2_hostremoval_unmapped = false - save_samtools_unmapped_bam = false - save_minimap2_unmapped_fq = false + perform_shortread_hostremoval = false + shortread_hostremoval_reference = null + shortread_hostremoval_index = null + save_hostremoval_index = false + save_hostremoval_mapped = false + save_hostremoval_unmapped = false + longread_hostremoval_index = null + save_minimap2_hostremoval_index = false + save_minimap2_hostremoval_mapped = false + save_samtools_unmapped_bam = false + save_minimap2_unmapped_fq = false + // MALT run_malt = false From 17039ebc6b143870331c4d2f8976a0fe63de0736 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Tue, 3 May 2022 11:07:23 +0200 Subject: [PATCH 08/25] Fix type to boolean for flags --- nextflow_schema.json | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index d3878c1..257f5cb 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -431,27 +431,23 @@ }, "longread_hostremoval_index": { "type": "string", - "default": null + "default": "None" }, "save_minimap2_hostremoval_index": { - "type": "string", - "default": "false", + "type": "boolean", "description": "Flag for publishing minimap2 host removal index" }, "save_minimap2_hostremoval_mapped": { - "type": "string", - "default": "false", + "type": "boolean", "description": "Flag for publishinig bam file with all long reads mapped to a reference" }, "save_samtools_unmapped_bam": { - "type": "string", - "default": "false", + "type": "boolean", "description": "Flag for publishing bam for reads that did not map to the host reference" }, "save_minimap2_unmapped_fq": { - "type": "string", - "default": "false", + "type": "boolean", "description": "Flag for publishing fastq files for reads that did not map to the host reference" } } -} +} \ No newline at end of file From 58d4dec70be5ac5eb2356aac351593c27f77502a Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Tue, 3 May 2022 11:09:54 +0200 Subject: [PATCH 09/25] Update samtools view --- modules.json | 2 +- modules/nf-core/modules/samtools/view/main.nf | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/modules.json b/modules.json index 071234d..b568d9c 100644 --- a/modules.json +++ b/modules.json @@ -73,7 +73,7 @@ "git_sha": "897c33d5da084b61109500ee44c01da2d3e4e773" }, "samtools/view": { - "git_sha": "12afb6b0faf3cabf769c9a2a7dd477e3f066eac0" + "git_sha": "6b64f9cb6c3dd3577931cc3cd032d6fb730000ce" }, "untar": { "git_sha": "e080f4c8acf5760039ed12ec1f206170f3f9a918" diff --git a/modules/nf-core/modules/samtools/view/main.nf b/modules/nf-core/modules/samtools/view/main.nf index 11cfb74..55194e8 100644 --- a/modules/nf-core/modules/samtools/view/main.nf +++ b/modules/nf-core/modules/samtools/view/main.nf @@ -41,4 +41,16 @@ process SAMTOOLS_VIEW { samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') END_VERSIONS """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + touch ${prefix}.cram + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + 
END_VERSIONS + """ } From c833d4fd9ae037e297eda9dea69045715fc4a064 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Tue, 3 May 2022 11:42:05 +0200 Subject: [PATCH 10/25] Fix issues found with prettier --- modules.json | 2 +- nextflow_schema.json | 45 +++++++------------------------------------- 2 files changed, 8 insertions(+), 39 deletions(-) diff --git a/modules.json b/modules.json index b568d9c..ce1fbc5 100644 --- a/modules.json +++ b/modules.json @@ -80,4 +80,4 @@ } } } -} \ No newline at end of file +} diff --git a/nextflow_schema.json b/nextflow_schema.json index 257f5cb..1902e27 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -304,10 +294,7 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ] + "enum": ["fastp", "adapterremoval"] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -348,10 +335,7 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": [ - "entropy", - "dust" - ] + "enum": ["entropy", "dust"] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -404,14 +388,7 @@ "kaiju_taxon_name": { "type": "string", "default": "species", - "enum": [ - "phylum", - "class", - "order", - "family", - "genus", - "species" - ] + "enum": ["phylum", "class", "order", "family", "genus", "species"] }, "run_diamond": { "type": "boolean" @@ -419,15 +396,7 @@ "diamond_output_format": { "type": "string", "default": "tsv", - "enum": [ - "blast", - "xml", - "txt", - "daa", - "sam", - "tsv", - "paf" - ] + "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"] }, "longread_hostremoval_index": { "type": "string", @@ -450,4 +419,4 @@ "description": "Flag for publishing fastq files for reads that did not map to the host reference" } } -} \ No newline at end of file +} From f4abbe280ab6497a220fde116f606d588b49bf73 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Tue, 3 May 2022 12:56:49 +0200 Subject: [PATCH 11/25] Fix back to nf-core modules version --- .../nf-core/modules/samtools/bam2fq/main.nf | 2 +- nf-core/modules/samtools/bam2fq/main.nf | 56 ------------------- nf-core/modules/samtools/bam2fq/meta.yml | 55 ------------------ 3 files changed, 1 insertion(+), 112 deletions(-) delete mode 100644 nf-core/modules/samtools/bam2fq/main.nf delete mode 100644 nf-core/modules/samtools/bam2fq/meta.yml diff --git a/modules/nf-core/modules/samtools/bam2fq/main.nf b/modules/nf-core/modules/samtools/bam2fq/main.nf index 0dd1da9..5d6aa79 100644 --- 
a/modules/nf-core/modules/samtools/bam2fq/main.nf +++ b/modules/nf-core/modules/samtools/bam2fq/main.nf @@ -45,7 +45,7 @@ process SAMTOOLS_BAM2FQ { bam2fq \\ $args \\ -@ $task.cpus \\ - $inputbam > ${prefix}_interleaved.fq.gz + $inputbam >${prefix}_interleaved.fq.gz cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nf-core/modules/samtools/bam2fq/main.nf b/nf-core/modules/samtools/bam2fq/main.nf deleted file mode 100644 index 5d6aa79..0000000 --- a/nf-core/modules/samtools/bam2fq/main.nf +++ /dev/null @@ -1,56 +0,0 @@ -process SAMTOOLS_BAM2FQ { - tag "$meta.id" - label 'process_low' - - conda (params.enable_conda ? "bioconda::samtools=1.15.1" : null) - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.15.1--h1170115_0' : - 'quay.io/biocontainers/samtools:1.15.1--h1170115_0' }" - - input: - tuple val(meta), path(inputbam) - val split - - output: - tuple val(meta), path("*.fq.gz"), emit: reads - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - - if (split){ - """ - samtools \\ - bam2fq \\ - $args \\ - -@ $task.cpus \\ - -1 ${prefix}_1.fq.gz \\ - -2 ${prefix}_2.fq.gz \\ - -0 ${prefix}_other.fq.gz \\ - -s ${prefix}_singleton.fq.gz \\ - $inputbam - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') - END_VERSIONS - """ - } else { - """ - samtools \\ - bam2fq \\ - $args \\ - -@ $task.cpus \\ - $inputbam >${prefix}_interleaved.fq.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') - END_VERSIONS - """ - } -} diff --git a/nf-core/modules/samtools/bam2fq/meta.yml b/nf-core/modules/samtools/bam2fq/meta.yml deleted file mode 100644 index 319a60c..0000000 --- a/nf-core/modules/samtools/bam2fq/meta.yml +++ /dev/null @@ -1,55 +0,0 @@ -name: samtools_bam2fq -description: | - The module uses bam2fq method from samtools to - convert a SAM, BAM or CRAM file to FASTQ format -keywords: - - bam2fq - - samtools - - fastq -tools: - - samtools: - description: Tools for dealing with SAM, BAM and CRAM files - homepage: None - documentation: http://www.htslib.org/doc/1.1/samtools.html - tool_dev_url: None - doi: "" - licence: ["MIT"] - -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - inputbam: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - - split: - type: boolean - description: | - TRUE/FALSE value to indicate if reads should be separated into - /1, /2 and if present other, or singleton. - Note: choosing TRUE will generate 4 different files. - Choosing FALSE will produce a single file, which will be interleaved in case - the input contains paired reads. - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - reads: - type: file - description: | - FASTQ files, which will be either a group of 4 files (read_1, read_2, other and singleton) - or a single interleaved .fq.gz file if the user chooses not to split the reads. 
- pattern: "*.fq.gz" - -authors: - - "@lescai" From 59c7f5a5b1b6a3e144a9b1c151e6f2a4c315342f Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Thu, 5 May 2022 08:38:41 +0200 Subject: [PATCH 12/25] Update samtools/bam2fq --- modules.json | 4 ++-- modules/nf-core/modules/samtools/bam2fq/main.nf | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules.json b/modules.json index ce1fbc5..043e4ad 100644 --- a/modules.json +++ b/modules.json @@ -70,7 +70,7 @@ "git_sha": "f1c5384c31e985591716afdd732cf8c2ae29d05b" }, "samtools/bam2fq": { - "git_sha": "897c33d5da084b61109500ee44c01da2d3e4e773" + "git_sha": "5510ea39fe638594bc26ac34cadf4a84bf27d159" }, "samtools/view": { "git_sha": "6b64f9cb6c3dd3577931cc3cd032d6fb730000ce" @@ -80,4 +80,4 @@ } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/modules/samtools/bam2fq/main.nf b/modules/nf-core/modules/samtools/bam2fq/main.nf index 5d6aa79..9301d1d 100644 --- a/modules/nf-core/modules/samtools/bam2fq/main.nf +++ b/modules/nf-core/modules/samtools/bam2fq/main.nf @@ -45,7 +45,7 @@ process SAMTOOLS_BAM2FQ { bam2fq \\ $args \\ -@ $task.cpus \\ - $inputbam >${prefix}_interleaved.fq.gz + $inputbam | gzip --no-name > ${prefix}_interleaved.fq.gz cat <<-END_VERSIONS > versions.yml "${task.process}": From 5b7b21415633c3df64cb2d88734d5863b4162a9c Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Thu, 5 May 2022 08:40:53 +0200 Subject: [PATCH 13/25] Fix formatting with prettier --- modules.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules.json b/modules.json index 043e4ad..a55c88b 100644 --- a/modules.json +++ b/modules.json @@ -80,4 +80,4 @@ } } } -} \ No newline at end of file +} From d94534e8acee01575b589e88549ed64ab4fc4411 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Thu, 5 May 2022 09:07:33 +0200 Subject: [PATCH 14/25] Simplify params which control host removal publish --- conf/modules.config | 8 ++++---- nextflow.config | 6 +----- nextflow_schema.json | 16 ---------------- 3 files changed, 5 insertions(+), 25 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index b707954..cd0fb04 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -169,7 +169,7 @@ process { publishDir = [ path: { "${params.outdir}/minimap2/index" }, mode: params.publish_dir_mode, - enabled: params.save_minimap2_hostremoval_index, + enabled: params.save_hostremoval_index, pattern: 'minimap2' ] } @@ -179,7 +179,7 @@ process { publishDir = [ path: { "${params.outdir}/minimap2/align" }, mode: params.publish_dir_mode, - enabled: params.save_minimap2_hostremoval_mapped, + enabled: params.save_hostremoval_mapped, pattern: '*.bam' ] } @@ -190,7 +190,7 @@ process { publishDir = [ path: { "${params.outdir}/samtools/view" }, mode: params.publish_dir_mode, - enabled: params.save_samtools_unmapped_bam, + enabled: params.save_hostremoval_unmapped, pattern: '*.bam' ] } @@ -200,7 +200,7 @@ process { publishDir = [ path: { "${params.outdir}/samtools/bam2fq" }, mode: params.publish_dir_mode, - enabled: params.save_minimap2_unmapped_fq, + enabled: params.save_hostremoval_unmapped, pattern: '*.fq.gz' ] } diff --git a/nextflow.config b/nextflow.config index 95daba9..8c99af2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -84,14 +84,10 @@ params { perform_shortread_hostremoval = false shortread_hostremoval_reference = null shortread_hostremoval_index = null + longread_hostremoval_index = null 
save_hostremoval_index = false save_hostremoval_mapped = false save_hostremoval_unmapped = false - longread_hostremoval_index = null - save_minimap2_hostremoval_index = false - save_minimap2_hostremoval_mapped = false - save_samtools_unmapped_bam = false - save_minimap2_unmapped_fq = false // MALT diff --git a/nextflow_schema.json b/nextflow_schema.json index 1902e27..9e4cc6c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -401,22 +401,6 @@ "longread_hostremoval_index": { "type": "string", "default": "None" - }, - "save_minimap2_hostremoval_index": { - "type": "boolean", - "description": "Flag for publishing minimap2 host removal index" - }, - "save_minimap2_hostremoval_mapped": { - "type": "boolean", - "description": "Flag for publishinig bam file with all long reads mapped to a reference" - }, - "save_samtools_unmapped_bam": { - "type": "boolean", - "description": "Flag for publishing bam for reads that did not map to the host reference" - }, - "save_minimap2_unmapped_fq": { - "type": "boolean", - "description": "Flag for publishing fastq files for reads that did not map to the host reference" } } } From 557d31dfd2fecbac2a415ddab780e6a3abc87168 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Thu, 5 May 2022 13:19:10 +0200 Subject: [PATCH 15/25] Add parameter for turning on longread host removal --- conf/test.config | 2 +- docs/usage.md | 2 +- nextflow.config | 2 +- nextflow_schema.json | 4 +++- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/conf/test.config b/conf/test.config index a2464b2..6d04f60 100644 --- a/conf/test.config +++ b/conf/test.config @@ -28,7 +28,7 @@ params { perform_longread_clip = false perform_shortread_complexityfilter = true perform_shortread_hostremoval = true - shortread_hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + perform_longread_hostremoval = true run_kaiju = true run_kraken2 = true run_malt = true diff --git a/docs/usage.md b/docs/usage.md index cee2bb6..537b94a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -191,7 +191,7 @@ You can optionally save the FASTQ output of the run merging with the `--save_com #### Host Removal -Removal of possible-host reads from FASTQ files prior profiling can be activated with `--perform_shortread_hostremoval` +Removal of possible-host reads from FASTQ files prior profiling can be activated with `--perform_shortread_hostremoval` or `--perform_longread_hostremoval`. Similarly to complexity filtering, host-removal can be useful for runtime optimisation and reduction in misclassified reads. It is not always necessary to report classification of reads from a host when you already know the host of the sample, therefore you can gain a run-time and computational advantage by removing these prior typically resource-heavy profiling with more efficient methods. Furthermore, particularly with human samples, you can reduce the number of false positives during profiling that occur due to host-sequence contamination in reference genomes on public databases. 
diff --git a/nextflow.config b/nextflow.config index 8c99af2..4ac0c44 100644 --- a/nextflow.config +++ b/nextflow.config @@ -82,7 +82,7 @@ params { // Host Removal perform_shortread_hostremoval = false - shortread_hostremoval_reference = null + perform_longread_hostremoval = false shortread_hostremoval_index = null longread_hostremoval_index = null save_hostremoval_index = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 9e4cc6c..d2eee95 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -362,7 +362,9 @@ "perform_shortread_hostremoval": { "type": "boolean" }, - "shortread_hostremoval_reference": { + "perform_longread_hostremoval": { + "type": "boolean" + }, "type": "string", "default": "None" }, From 6618e8cac65fe43d40d4952863698801f9b4507b Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Thu, 5 May 2022 13:20:34 +0200 Subject: [PATCH 16/25] Change param to a more generic name --- conf/test.config | 1 + docs/usage.md | 2 +- nextflow.config | 1 + nextflow_schema.json | 1 + workflows/taxprofiler.nf | 8 ++++---- 5 files changed, 8 insertions(+), 5 deletions(-) diff --git a/conf/test.config b/conf/test.config index 6d04f60..a5244f9 100644 --- a/conf/test.config +++ b/conf/test.config @@ -29,6 +29,7 @@ params { perform_shortread_complexityfilter = true perform_shortread_hostremoval = true perform_longread_hostremoval = true + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' run_kaiju = true run_kraken2 = true run_malt = true diff --git a/docs/usage.md b/docs/usage.md index 537b94a..1143da3 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -197,7 +197,7 @@ Similarly to complexity filtering, host-removal can be useful for runtime optimi nf-core/taxprofiler currently offers host-removal via alignment against a reference genome with Bowtie2, and the use of the unaligned reads for downstream profiling. -You can supply your reference genome in FASTA format with `--shortread_hostremoval_reference`. You can also optionally supply a directory containing pre-indexed Bowtie2 index files with `--shortread_hostremoval_index`, however nf-core/taxprofiler will generate this for you if necessary. Pre-supplying the directory of index files can greatly speed up the process, and these can be re-used. +You can supply your reference genome in FASTA format with `--hostremoval_reference`. You can also optionally supply a directory containing pre-indexed Bowtie2 index files with `--shortread_hostremoval_index`, however nf-core/taxprofiler will generate this for you if necessary. Pre-supplying the directory of index files can greatly speed up the process, and these can be re-used. > 💡 If you have multiple taxa or sequences you wish to remove (e.g., the host genome and then also PhiX - common quality-control reagent during sequencing) you can simply concatenate the FASTAs of each taxa or sequences into a single reference file. 
diff --git a/nextflow.config b/nextflow.config index 4ac0c44..ca9e280 100644 --- a/nextflow.config +++ b/nextflow.config @@ -83,6 +83,7 @@ params { // Host Removal perform_shortread_hostremoval = false perform_longread_hostremoval = false + hostremoval_reference = null shortread_hostremoval_index = null longread_hostremoval_index = null save_hostremoval_index = false diff --git a/nextflow_schema.json b/nextflow_schema.json index d2eee95..ab2108e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -365,6 +365,7 @@ "perform_longread_hostremoval": { "type": "boolean" }, + "hostremoval_reference": { "type": "string", "default": "None" }, diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 9eb53a3..81ea5d9 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -11,7 +11,7 @@ WorkflowTaxprofiler.initialise(params, log) // TODO nf-core: Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.databases, params.shortread_hostremoval_reference, +def checkPathParamList = [ params.input, params.databases, params.hostremoval_reference, params.shortread_hostremoval_index, params.multiqc_config ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } @@ -22,10 +22,10 @@ if (params.databases) { ch_databases = file(params.databases) } else { exit 1, ' if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "ERROR: [nf-core/taxprofiler] cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs" -if (params.perform_shortread_hostremoval && !params.shortread_hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval requested but no --shortread_hostremoval_reference FASTA supplied. Check input." } -if (!params.shortread_hostremoval_reference && params.shortread_hostremoval_reference_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --shortread_hostremoval_reference FASTA supplied. Check input." } +if (params.perform_shortread_hostremoval && !params.hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval requested but no --hostremoval_reference FASTA supplied. Check input." } +if (!params.hostremoval_reference && params.hostremoval_reference_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --hostremoval_reference FASTA supplied. Check input." 
} -if (params.shortread_hostremoval_reference ) { ch_reference = file(params.shortread_hostremoval_reference) } +if (params.hostremoval_reference ) { ch_reference = file(params.hostremoval_reference) } if (params.shortread_hostremoval_index ) { ch_shortread_reference_index = file(params.shortread_hostremoval_index ) } else { ch_shortread_reference_index = [] } if (params.longread_hostremoval_index ) { ch_longread_reference_index = file(params.longread_hostremoval_index ) } else { ch_longread_reference_index = [] } From e3e55f57c26ba462ceb6795be55b312e615ca550 Mon Sep 17 00:00:00 2001 From: ljmesi <37740329+ljmesi@users.noreply.github.com> Date: Thu, 5 May 2022 14:02:24 +0200 Subject: [PATCH 17/25] Fix alignment Just made this change to see if the CI tests would fail again or if the failure was just a one of thing. --- workflows/taxprofiler.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 81ea5d9..1f2e9d1 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -47,7 +47,7 @@ ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multi // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { INPUT_CHECK } from '../subworkflows/local/input_check' include { DB_CHECK } from '../subworkflows/local/db_check' include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing' From b0faeedc60c654fe445909ceed569ee010c84837 Mon Sep 17 00:00:00 2001 From: Lauri Mesilaakso Date: Fri, 6 May 2022 08:20:31 +0200 Subject: [PATCH 18/25] Update workflows/taxprofiler.nf Co-authored-by: James A. Fellows Yates --- workflows/taxprofiler.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 1f2e9d1..7a6cd09 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -25,7 +25,7 @@ if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_me if (params.perform_shortread_hostremoval && !params.hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval requested but no --hostremoval_reference FASTA supplied. Check input." } if (!params.hostremoval_reference && params.hostremoval_reference_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --hostremoval_reference FASTA supplied. Check input." } -if (params.hostremoval_reference ) { ch_reference = file(params.hostremoval_reference) } +if (params.hostremoval_reference ) { ch_reference = file(params.hostremoval_reference) } if (params.shortread_hostremoval_index ) { ch_shortread_reference_index = file(params.shortread_hostremoval_index ) } else { ch_shortread_reference_index = [] } if (params.longread_hostremoval_index ) { ch_longread_reference_index = file(params.longread_hostremoval_index ) } else { ch_longread_reference_index = [] } From 02517733aa2227d1bc6f4decdb607918de047b2d Mon Sep 17 00:00:00 2001 From: Lauri Mesilaakso Date: Fri, 6 May 2022 10:40:59 +0200 Subject: [PATCH 19/25] Update docs/usage.md Co-authored-by: James A. 
Fellows Yates --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 1143da3..4aa1d09 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -197,7 +197,7 @@ Similarly to complexity filtering, host-removal can be useful for runtime optimi nf-core/taxprofiler currently offers host-removal via alignment against a reference genome with Bowtie2, and the use of the unaligned reads for downstream profiling. -You can supply your reference genome in FASTA format with `--hostremoval_reference`. You can also optionally supply a directory containing pre-indexed Bowtie2 index files with `--shortread_hostremoval_index`, however nf-core/taxprofiler will generate this for you if necessary. Pre-supplying the directory of index files can greatly speed up the process, and these can be re-used. +You can supply your reference genome in FASTA format with `--hostremoval_reference`. You can also optionally supply a directory containing pre-indexed Bowtie2 index files with `--shortread_hostremoval_index` or `--longread_hostremoval_index`, however nf-core/taxprofiler will generate this for you if necessary. Pre-supplying the directory of index files can greatly speed up the process, and these can be re-used. > 💡 If you have multiple taxa or sequences you wish to remove (e.g., the host genome and then also PhiX - common quality-control reagent during sequencing) you can simply concatenate the FASTAs of each taxa or sequences into a single reference file. From 47a5ae0cff4060c12451beba47502dae5dbb9d17 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 7 May 2022 06:09:05 +0200 Subject: [PATCH 20/25] Add FASTP complexity option --- conf/modules.config | 6 ++- conf/test.config | 1 + conf/test_nopreprocessing.config | 46 ++++++++++++++++ conf/test_noprofiling.config | 46 ++++++++++++++++ docs/usage.md | 4 +- nextflow.config | 3 ++ nextflow_schema.json | 52 ++++++++++++++++--- .../local/shortread_complexityfiltering.nf | 1 + workflows/taxprofiler.nf | 8 ++- 9 files changed, 153 insertions(+), 14 deletions(-) create mode 100644 conf/test_nopreprocessing.config create mode 100644 conf/test_noprofiling.config diff --git a/conf/modules.config b/conf/modules.config index cd0fb04..c834f4e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -54,7 +54,8 @@ process { params.shortread_clipmerge_skipadaptertrim ? "--disable_adapter_trimming" : "", params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "", // filtering options - "--length_required ${params.shortread_clipmerge_minlength}" + "--length_required ${params.shortread_clipmerge_minlength}", + params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' ].join(' ').trim() ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ @@ -74,7 +75,8 @@ process { params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "", params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : "--detect_adapter_for_pe", // filtering options - "--length_required ${params.shortread_clipmerge_minlength}" + "--length_required ${params.shortread_clipmerge_minlength}", + params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? 
"--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' ].join(' ').trim() ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ diff --git a/conf/test.config b/conf/test.config index a5244f9..c687a86 100644 --- a/conf/test.config +++ b/conf/test.config @@ -29,6 +29,7 @@ params { perform_shortread_complexityfilter = true perform_shortread_hostremoval = true perform_longread_hostremoval = true + perform_runmerging = true hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' run_kaiju = true run_kraken2 = true diff --git a/conf/test_nopreprocessing.config b/conf/test_nopreprocessing.config new file mode 100644 index 0000000..e8d4ed9 --- /dev/null +++ b/conf/test_nopreprocessing.config @@ -0,0 +1,46 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/taxprofiler -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset skipping all preprocessing to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets + // TODO nf-core: Give any required params for the test so that command line flags are not needed + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' + perform_shortread_clipmerge = false + perform_longread_clip = false + perform_shortread_complexityfilter = false + perform_shortread_hostremoval = false + perform_longread_hostremoval = false + perform_runmerging = false + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = true + run_kraken2 = true + run_malt = true + run_metaphlan3 = true + run_centrifuge = true + run_diamond = true +} + +process { + withName: MALT_RUN { + maxForks = 1 + } +} diff --git a/conf/test_noprofiling.config b/conf/test_noprofiling.config new file mode 100644 index 0000000..f908651 --- /dev/null +++ b/conf/test_noprofiling.config @@ -0,0 +1,46 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/taxprofiler -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset without performing any profiling to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets + // TODO nf-core: Give any required params for the test so that command line flags are not needed + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv' + perform_shortread_clipmerge = true + perform_longread_clip = true + perform_shortread_complexityfilter = true + perform_shortread_hostremoval = true + perform_longread_hostremoval = true + perform_runmerging = true + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = false + run_kraken2 = false + run_malt = false + run_metaphlan3 = false + run_centrifuge = false + run_diamond = false +} + +process { + withName: MALT_RUN { + maxForks = 1 + } +} diff --git a/docs/usage.md b/docs/usage.md index 4aa1d09..47ac952 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -183,11 +183,11 @@ Complexity filtering can be activated via the `--perform_shortread_complexityfil Complexity filtering is primarily a run-time optimisation step. It is not necessary for accurate taxonomic profiling, however it can speed up run-time of each tool by removing reads with low-diversity of nucleotides (e.g. with mono-nucleotide - `AAAAAAAA`, or di-nucleotide repeats `GAGAGAGAGAGAGAG`) that have a low-chance of giving an informative taxonomic ID as they can be associated with many different taxa. Removing these reads therefore saves computational time and resources. -There are currently two options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) and [`prinseq++`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/). +There are currently three options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/), [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus), and [`fastp`](https://github.com/OpenGene/fastp#low-complexity-filter). The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of both tools (see links above) to decide on optimal methods and parameters for your dataset. -You can optionally save the FASTQ output of the run merging with the `--save_complexityfiltered_reads`. +You can optionally save the FASTQ output of the run merging with the `--save_complexityfiltered_reads`. If running with `fastp`, complexity filtering happens inclusively within the earlier shortread preprocessing step. Therefore there will not be an independent pipeline step for complexity filtering, and no independent FASTQ file (i.e. 
`--save_complexityfiltered_reads` will be ignored) - your complexity filtered reads will also be in the `fastp/` folder in the same file(s) as the preprocessed read. #### Host Removal diff --git a/nextflow.config b/nextflow.config index ca9e280..411e7a6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -74,6 +74,7 @@ params { shortread_complexityfilter_bbduk_mask = false shortread_complexityfilter_prinseqplusplus_mode = 'entropy' shortread_complexityfilter_prinseqplusplus_dustscore = 0.5 + shortread_complexityfilter_fastp_threshold = 30 save_complexityfiltered_reads = false // run merging @@ -185,6 +186,8 @@ profiles { } test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } + test_noprofiling { includeConfig 'conf/test_noprofiling.config' } + test_nopreprocessing { includeConfig 'conf/test_preprocessing.config' } } // Load igenomes.config if required diff --git a/nextflow_schema.json b/nextflow_schema.json index ab2108e..a0a830c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -173,7 +176,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -294,7 +304,10 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": ["fastp", "adapterremoval"] + "enum": [ + "fastp", + "adapterremoval" + ] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -319,7 +332,12 @@ }, "shortread_complexityfilter_tool": { "type": "string", - "default": "bbduk" + "default": "bbduk", + "enum": [ + "bbduk", + "prinseqplusplus", + "fastp" + ] }, "shortread_complexityfilter_bbduk_windowsize": { "type": "integer", @@ -335,7 +353,10 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": ["entropy", "dust"] + "enum": [ + "entropy", + "dust" + ] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -391,7 +412,14 @@ "kaiju_taxon_name": { "type": "string", "default": "species", - "enum": ["phylum", "class", "order", "family", "genus", "species"] + "enum": [ + "phylum", + "class", + "order", + "family", + "genus", + "species" + ] }, "run_diamond": { "type": "boolean" @@ -399,11 +427,19 @@ "diamond_output_format": { "type": "string", "default": "tsv", - "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"] + "enum": [ + "blast", + "xml", + "txt", + "daa", + "sam", + "tsv", + "paf" + ] }, "longread_hostremoval_index": { "type": "string", "default": "None" } } -} +} \ No newline at end of file diff --git a/subworkflows/local/shortread_complexityfiltering.nf b/subworkflows/local/shortread_complexityfiltering.nf index 12686d7..a34440d 100644 --- a/subworkflows/local/shortread_complexityfiltering.nf +++ 
b/subworkflows/local/shortread_complexityfiltering.nf @@ -13,6 +13,7 @@ workflow SHORTREAD_COMPLEXITYFILTERING { ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() + // fastp complexity filtering is activated via modules.conf in shortread_preprocessing if ( params.shortread_complexityfilter_tool == 'bbduk' ) { ch_filtered_reads = BBMAP_BBDUK ( reads, [] ).reads ch_versions = ch_versions.mix( BBMAP_BBDUK.out.versions.first() ) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 7a6cd09..b8b953b 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -19,9 +19,12 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true // Check mandatory parameters if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' } + if (params.shortread_clipmerge_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." if (params.shortread_clipmerge_excludeunmerged && !params.shortread_clipmerge_mergepairs) exit 1, "ERROR: [nf-core/taxprofiler] cannot include unmerged reads when merging not turned on. Please specify --shortread_clipmerge_mergepairs" +if (params.shortread_complexityfilter_tool == 'fastp' && ( params.perform_shortread_clipmerge == false || params.shortread_clipmerge_tool != 'fastp' )) exit 1, "ERROR: [nf-core/taxprofiler] cannot use fastp complexity filtering if preprocessing not turned on and/or tool is not fastp. Please specify --perform_shortread_clipmerge and/or --shortread_clipmerge_tool 'fastp'" + if (params.perform_shortread_hostremoval && !params.hostremoval_reference) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval requested but no --hostremoval_reference FASTA supplied. Check input." } if (!params.hostremoval_reference && params.hostremoval_reference_index) { exit 1, "ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --hostremoval_reference FASTA supplied. Check input." 
} @@ -131,7 +134,8 @@ workflow TAXPROFILER { SUBWORKFLOW: COMPLEXITY FILTERING */ - if ( params.perform_shortread_complexityfilter ) { + // fastp complexity filtering is activated via modules.conf in shortread_preprocessing + if ( params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp' ) { ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) } else { @@ -228,7 +232,7 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) } - if (params.perform_shortread_complexityfilter){ + if (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp'){ ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ) } From 0f651873dd711c2223a8f844a091deb1316f6979 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 7 May 2022 06:12:24 +0200 Subject: [PATCH 21/25] Linting --- conf/modules.config | 4 ++-- nextflow_schema.json | 55 ++++++++++---------------------------------- 2 files changed, 14 insertions(+), 45 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index c834f4e..0abff92 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -55,7 +55,7 @@ process { params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "", // filtering options "--length_required ${params.shortread_clipmerge_minlength}", - params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' + params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' ].join(' ').trim() ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ @@ -76,7 +76,7 @@ process { params.shortread_clipmerge_adapter2 ? "--adapter_sequence_r2 ${params.shortread_clipmerge_adapter2}" : "--detect_adapter_for_pe", // filtering options "--length_required ${params.shortread_clipmerge_minlength}", - params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' + params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' ].join(' ').trim() ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ diff --git a/nextflow_schema.json b/nextflow_schema.json index a0a830c..74fab27 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -176,14 +173,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -304,10 +294,7 @@ "shortread_clipmerge_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ] + "enum": ["fastp", "adapterremoval"] }, "shortread_clipmerge_skipadaptertrim": { "type": "boolean" @@ -333,11 +320,7 @@ "shortread_complexityfilter_tool": { "type": "string", "default": "bbduk", - "enum": [ - "bbduk", - "prinseqplusplus", - "fastp" - ] + "enum": ["bbduk", "prinseqplusplus", "fastp"] }, "shortread_complexityfilter_bbduk_windowsize": { "type": "integer", @@ -353,10 +336,7 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": [ - "entropy", - "dust" - ] + "enum": ["entropy", "dust"] }, "shortread_complexityfilter_prinseqplusplus_dustscore": { "type": "number", @@ -412,14 +392,7 @@ "kaiju_taxon_name": { "type": "string", "default": "species", - "enum": [ - "phylum", - "class", - "order", - "family", - "genus", - "species" - ] + "enum": ["phylum", "class", "order", "family", "genus", "species"] }, "run_diamond": { "type": "boolean" @@ -427,19 +400,15 @@ "diamond_output_format": { "type": "string", "default": "tsv", - "enum": [ - "blast", - "xml", - "txt", - "daa", - "sam", - "tsv", - "paf" - ] + "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"] }, "longread_hostremoval_index": { "type": "string", "default": "None" + }, + "shortread_complexityfilter_fastp_threshold": { + "type": "integer", + "default": 30 } } -} \ No newline at end of file +} From 3fa2181f498f5e1c994efb06be3a63f459dcf9cc Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Sat, 7 May 2022 06:15:15 +0200 Subject: [PATCH 22/25] Fix prinseq tool selection --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a1ece72..7bb2076 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,7 +38,7 @@ jobs: - "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs --shortread_clipmerge_excludeunmerged" - "--shortread_clipmerge_tool adapterremoval --shortread_clipmerge_mergepairs" - "--shortread_complexityfilter_tool bbduk" - - "--shortread_complexityfilter_tool prinseq" + - "--shortread_complexityfilter_tool prinseqplusplus" - "--perform_runmerging" - "--perform_runmerging --shortread_clipmerge_mergepairs" - "--shortread_complexityfilter false --perform_shortread_hostremoval" From d67543503b927f6af5bce9574ad8b428c53c0c46 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Sat, 7 May 2022 13:15:21 +0200 Subject: [PATCH 23/25] Apply suggestions from code review Co-authored-by: Moritz E. Beber --- conf/modules.config | 2 +- docs/usage.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 0abff92..5d8398e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -55,7 +55,7 @@ process { params.shortread_clipmerge_adapter1 ? "--adapter_sequence ${params.shortread_clipmerge_adapter1}" : "", // filtering options "--length_required ${params.shortread_clipmerge_minlength}", - params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? 
"--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' + (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp') ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '' ].join(' ').trim() ext.prefix = { "${meta.id}_${meta.run_accession}" } publishDir = [ diff --git a/docs/usage.md b/docs/usage.md index 47ac952..54ffce0 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -185,7 +185,7 @@ Complexity filtering is primarily a run-time optimisation step. It is not necess There are currently three options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/), [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus), and [`fastp`](https://github.com/OpenGene/fastp#low-complexity-filter). -The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of both tools (see links above) to decide on optimal methods and parameters for your dataset. +The tools offer different algorithms and parameters for removing low complexity reads. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of the tools (see links above) to decide on optimal methods and parameters for your dataset. You can optionally save the FASTQ output of the run merging with the `--save_complexityfiltered_reads`. If running with `fastp`, complexity filtering happens inclusively within the earlier shortread preprocessing step. Therefore there will not be an independent pipeline step for complexity filtering, and no independent FASTQ file (i.e. `--save_complexityfiltered_reads` will be ignored) - your complexity filtered reads will also be in the `fastp/` folder in the same file(s) as the preprocessed read. From aca7bc439dac189b8f4b33e7014859f7528d80bf Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Thu, 12 May 2022 08:08:14 +0100 Subject: [PATCH 24/25] Update nextflow.config --- nextflow.config | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/nextflow.config b/nextflow.config index 411e7a6..4bcb1b1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -130,11 +130,11 @@ try { // Load nf-core/taxprofiler custom profiles from different institutions. // Warning: Uncomment only if a pipeline-specific instititutional config already exists on nf-core/configs! 
-// try { -// includeConfig "${params.custom_config_base}/pipeline/taxprofiler.config" -// } catch (Exception e) { -// System.err.println("WARNING: Could not load nf-core/config/taxprofiler profiles: ${params.custom_config_base}/pipeline/taxprofiler.config") -// } +try { + includeConfig "${params.custom_config_base}/pipeline/taxprofiler.config" +} catch (Exception e) { + System.err.println("WARNING: Could not load nf-core/config/taxprofiler profiles: ${params.custom_config_base}/pipeline/taxprofiler.config") +} profiles { From a560492353eacda36beadc896360d61ad4b53192 Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Mon, 16 May 2022 10:22:38 +0000 Subject: [PATCH 25/25] Template update for nf-core/tools version 2.4 --- .github/workflows/awsfulltest.yml | 3 - .github/workflows/awstest.yml | 3 - .github/workflows/branch.yml | 3 +- .github/workflows/ci.yml | 2 - .github/workflows/fix-linting.yml | 55 ++++++++++ .github/workflows/linting.yml | 4 +- .github/workflows/linting_comment.yml | 1 - .prettierignore | 9 ++ README.md | 21 ++-- assets/email_template.html | 148 ++++++++------------------ bin/check_samplesheet.py | 16 ++- nextflow.config | 2 +- 12 files changed, 136 insertions(+), 131 deletions(-) create mode 100644 .github/workflows/fix-linting.yml create mode 100644 .prettierignore diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index 41f5c11..bcbb910 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -28,6 +28,3 @@ jobs: "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/taxprofiler/results-${{ github.sha }}" } profiles: test_full,aws_tower - nextflow_config: | - process.errorStrategy = 'retry' - process.maxRetries = 3 diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index ebffec6..5d35eea 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -23,6 +23,3 @@ jobs: "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/taxprofiler/results-test-${{ github.sha }}" } profiles: test,aws_tower - nextflow_config: | - process.errorStrategy = 'retry' - process.maxRetries = 3 diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index b8a412e..b5b9ae2 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -13,7 +13,7 @@ jobs: - name: Check PRs if: github.repository == 'nf-core/taxprofiler' run: | - "{ [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/taxprofiler ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]]" + { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/taxprofiler ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] # If the above check failed, post a comment on the PR explaining the failure # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets @@ -42,4 +42,3 @@ jobs: Thanks again for your contribution! 
repo-token: ${{ secrets.GITHUB_TOKEN }} allow-repeats: false -# diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fb6de77..69dc097 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -48,5 +48,3 @@ jobs: # Remember that you can parallelise this by using strategy.matrix run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results - -# diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix-linting.yml new file mode 100644 index 0000000..5a5f775 --- /dev/null +++ b/.github/workflows/fix-linting.yml @@ -0,0 +1,55 @@ +name: Fix linting from a comment +on: + issue_comment: + types: [created] + +jobs: + deploy: + # Only run if comment is on a PR with the main repo, and if it contains the magic keywords + if: > + contains(github.event.comment.html_url, '/pull/') && + contains(github.event.comment.body, '@nf-core-bot fix linting') && + github.repository == 'nf-core/taxprofiler' + runs-on: ubuntu-latest + steps: + # Use the @nf-core-bot token to check out so we can push later + - uses: actions/checkout@v3 + with: + token: ${{ secrets.nf_core_bot_auth_token }} + + # Action runs on the issue comment, so we don't get the PR by default + # Use the gh cli to check out the PR + - name: Checkout Pull Request + run: gh pr checkout ${{ github.event.issue.number }} + env: + GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} + + - uses: actions/setup-node@v2 + + - name: Install Prettier + run: npm install -g prettier @prettier/plugin-php + + # Check that we actually need to fix something + - name: Run 'prettier --check' + id: prettier_status + run: | + if prettier --check ${GITHUB_WORKSPACE}; then + echo "::set-output name=result::pass" + else + echo "::set-output name=result::fail" + fi + + - name: Run 'prettier --write' + if: steps.prettier_status.outputs.result == 'fail' + run: prettier --write ${GITHUB_WORKSPACE} + + - name: Commit & push changes + if: steps.prettier_status.outputs.result == 'fail' + run: | + git config user.email "core@nf-co.re" + git config user.name "nf-core-bot" + git config push.default upstream + git add . 
+ git status + git commit -m "[automated] Fix linting with Prettier" + git push diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index e9cf5de..77358de 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -48,7 +48,7 @@ jobs: wget -qO- get.nextflow.io | bash sudo mv nextflow /usr/local/bin/ - - uses: actions/setup-python@v1 + - uses: actions/setup-python@v3 with: python-version: "3.6" architecture: "x64" @@ -78,5 +78,3 @@ jobs: lint_log.txt lint_results.md PR_number.txt - -# diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index 91c487a..04758f6 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -26,4 +26,3 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} number: ${{ steps.pr_number.outputs.pr_number }} path: linting-logs/lint_results.md -# diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 0000000..d0e7ae5 --- /dev/null +++ b/.prettierignore @@ -0,0 +1,9 @@ +email_template.html +.nextflow* +work/ +data/ +results/ +.DS_Store +testing/ +testing* +*.pyc diff --git a/README.md b/README.md index fe8c049..6ce3a45 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,19 @@ -# ![nf-core/taxprofiler](docs/images/nf-core/taxprofiler_logo_light.png#gh-light-mode-only) ![nf-core/taxprofiler](docs/images/nf-core/taxprofiler_logo_dark.png#gh-dark-mode-only) +# ![nf-core/taxprofiler](docs/images/nf-core-taxprofiler_logo_light.png#gh-light-mode-only) ![nf-core/taxprofiler](docs/images/nf-core-taxprofiler_logo_dark.png#gh-dark-mode-only) [![GitHub Actions CI Status](https://github.com/nf-core/taxprofiler/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/taxprofiler/actions?query=workflow%3A%22nf-core+CI%22) [![GitHub Actions Linting Status](https://github.com/nf-core/taxprofiler/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/taxprofiler/actions?query=workflow%3A%22nf-core+linting%22) -[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/taxprofiler/results) -[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) +[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?logo=Amazon%20AWS)](https://nf-co.re/taxprofiler/results) +[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8)](https://doi.org/10.5281/zenodo.XXXXXXX) -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.10.3-23aa62.svg?labelColor=000000)](https://www.nextflow.io/) -[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) -[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) -[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.10.3-23aa62.svg)](https://www.nextflow.io/) +[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?logo=anaconda)](https://docs.conda.io/en/latest/) +[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?logo=docker)](https://www.docker.com/) +[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg)](https://sylabs.io/docs/) 
+[![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/nf-core/taxprofiler) -[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23taxprofiler-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/taxprofiler) -[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core) -[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) +[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23taxprofiler-4A154B?logo=slack)](https://nfcore.slack.com/channels/taxprofiler) +[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?logo=twitter)](https://twitter.com/nf_core) +[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?logo=youtube)](https://www.youtube.com/c/nf-core) ## Introduction diff --git a/assets/email_template.html b/assets/email_template.html index c8dce51..f40addd 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -1,111 +1,53 @@ - - - - + + + + - - - nf-core/taxprofiler Pipeline Report - - -
[The body of the assets/email_template.html hunk is not recoverable here: its HTML markup was stripped in this rendering. The hunk reformats the HTML e-mail report template (title "nf-core/taxprofiler Pipeline Report", pipeline version, run name, the success/failure message with `$exitStatus` and `${errorReport}`, completion date and duration, `$commandLine`, the pipeline-configuration summary table, and the closing project links); the visible text content appears unchanged between the removed and added versions.]
+ + diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 5473b62..3652c63 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -98,7 +98,7 @@ class RowChecker: if row[self._first_col] and row[self._second_col]: row[self._single_col] = False assert ( - Path(row[self._first_col]).suffixes == Path(row[self._second_col]).suffixes + Path(row[self._first_col]).suffixes[-2:] == Path(row[self._second_col]).suffixes[-2:] ), "FASTQ pairs must have the same file extensions." else: row[self._single_col] = True @@ -129,6 +129,16 @@ class RowChecker: row[self._sample_col] = f"{sample}_T{seen[sample]}" +def read_head(handle, num_lines=10): + """Read the specified number of lines from the current position in the file.""" + lines = [] + for idx, line in enumerate(handle): + if idx == num_lines: + break + lines.append(line) + return "".join(lines) + + def sniff_format(handle): """ Detect the tabular format. @@ -144,13 +154,13 @@ def sniff_format(handle): https://docs.python.org/3/glossary.html#term-text-file """ - peek = handle.read(2048) + peek = read_head(handle) + handle.seek(0) sniffer = csv.Sniffer() if not sniffer.has_header(peek): logger.critical(f"The given sample sheet does not appear to contain a header.") sys.exit(1) dialect = sniffer.sniff(peek) - handle.seek(0) return dialect diff --git a/nextflow.config b/nextflow.config index 97ce0a8..87a1aa7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -159,7 +159,7 @@ trace { } dag { enabled = true - file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.svg" + file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.html" } manifest {