Add PRINSEQPLUSPLUS (#1481)

* fix: remove left-over unnecessary code * Add prinseq++ * Remove last todo * Fix tests due to variability of output FASTQs (reads can be ordered differently between runs) * Apply suggestions from code review
2024-12-21 10:48:18 +00:00 · 2022-04-03 16:06:22 +02:00 · 2022-04-03 16:06:22 +02:00 · f1c5384c31
commit f1c5384c31
parent 67c1bc9568
6 changed files with 185 additions and 0 deletions
--- a/modules/prinseqplusplus/main.nf
+++ b/modules/prinseqplusplus/main.nf
@ -0,0 +1,61 @@
+process PRINSEQPLUSPLUS {
+    // Runs prinseq++ on single-end or paired-end FASTQ input and collects the
+    // gzipped read files it produces, a log of its STDOUT and a versions.yml.
+    tag "$meta.id"
+    label 'process_low'
+
+    conda (params.enable_conda ? "bioconda::prinseq-plus-plus=1.2.3" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/prinseq-plus-plus:1.2.3--hc90279e_1':
+        'quay.io/biocontainers/prinseq-plus-plus:1.2.3--hc90279e_1' }"
+
+    input:
+    // meta.id names the task (and is the default output prefix); meta.single_end
+    // selects the read layout. reads holds one file (single-end) or two (paired-end).
+    tuple val(meta), path(reads)
+
+    output:
+    // Only good_reads and the log are mandatory; the single/bad read files are
+    // declared optional, so a run that does not create them still succeeds.
+    tuple val(meta), path("*_good_out*.fastq.gz")                  , emit: good_reads
+    tuple val(meta), path("*_single_out*.fastq.gz"), optional: true, emit: single_reads
+    tuple val(meta), path("*_bad_out*.fastq.gz")   , optional: true, emit: bad_reads
+    tuple val(meta), path("*.log")                                 , emit: log
+    path "versions.yml"                                            , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    // The two read layouts differ only in how the input files are passed, so the
+    // flags are built once here and a single command template is shared.
+    def input_reads = meta.single_end ? "-fastq ${reads}" : "-fastq ${reads[0]} -fastq2 ${reads[1]}"
+    """
+    prinseq++ \\
+        -threads $task.cpus \\
+        $input_reads \\
+        -out_name ${prefix} \\
+        -out_gz \\
+        -VERBOSE 1 \\
+        $args \\
+        | tee ${prefix}.log
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        prinseqplusplus: \$(echo \$(prinseq++ --version | cut -f 2 -d ' ' ))
+    END_VERSIONS
+    """
+}
--- a/modules/prinseqplusplus/meta.yml
+++ b/modules/prinseqplusplus/meta.yml
@ -0,0 +1,60 @@
+name: "prinseqplusplus"
+description: PRINSEQ++ is a C++ implementation of the prinseq-lite.pl program. It can be used to filter, reformat or trim genomic and metagenomic sequence data
+keywords:
+  - fastq
+  - fasta
+  - filter
+  - trim
+tools:
+  - "prinseqplusplus":
+      description: "PRINSEQ++ - Multi-threaded C++ sequence cleaning"
+      homepage: "https://github.com/Adrian-Cantu/PRINSEQ-plus-plus"
+      documentation: "https://github.com/Adrian-Cantu/PRINSEQ-plus-plus"
+      tool_dev_url: "https://github.com/Adrian-Cantu/PRINSEQ-plus-plus"
+      doi: "10.7287/peerj.preprints.27553v1"
+      licence: ["GPL v2"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: |
+        List of input FastQ files of size 1 and 2 for single-end and paired-end
+        data, respectively.
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - good_reads:
+      type: file
+      description: Reads passing filter(s) in gzipped FASTQ format
+      pattern: "*_good_out_{R1,R2}.fastq.gz"
+  - single_reads:
+      type: file
+      description: |
+        Single reads without the pair passing filter(s) in gzipped FASTQ format
+      pattern: "*_single_out_{R1,R2}.fastq.gz"
+  - bad_reads:
+      type: file
+      description: |
+        Reads not passing filter(s) in gzipped FASTQ format
+      pattern: "*_bad_out_{R1,R2}.fastq.gz"
+  - log:
+      type: file
+      description: |
+        Verbose level 1 STDOUT information in a log file
+      pattern: "*.log"
+
+authors:
+  - "@jfy133"
--- a/tests/config/pytest_modules.yml
+++ b/tests/config/pytest_modules.yml
@ -1407,6 +1407,10 @@ preseq/lcextrap:
  - modules/preseq/lcextrap/**
  - tests/modules/preseq/lcextrap/**

+prinseqplusplus:
+  - modules/prinseqplusplus/**
+  - tests/modules/prinseqplusplus/**
+
 prodigal:
  - modules/prodigal/**
  - tests/modules/prodigal/**
--- a/tests/modules/prinseqplusplus/main.nf
+++ b/tests/modules/prinseqplusplus/main.nf
@ -0,0 +1,24 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+include { PRINSEQPLUSPLUS } from '../../../modules/prinseqplusplus/main.nf'
+
+workflow test_prinseqplusplus_single_end {
+
+    // Single-end case: meta map flagged single_end:true with a one-element read list.
+    def meta  = [ id:'test', single_end:true ]
+    def reads = [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ]
+
+    PRINSEQPLUSPLUS ( [ meta, reads ] )
+}
+
+workflow test_prinseqplusplus_paired_end {
+
+    // Paired-end case: meta map flagged single_end:false with both mates, R1 first.
+    def meta  = [ id:'test', single_end:false ]
+    def reads = [
+        file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true),
+        file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true)
+    ]
+
+    PRINSEQPLUSPLUS ( [ meta, reads ] )
+}
--- a/tests/modules/prinseqplusplus/nextflow.config
+++ b/tests/modules/prinseqplusplus/nextflow.config
@ -0,0 +1,9 @@
+process {
+
+    publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } // Sub-directory of params.outdir named after the last ':'-separated part of the process name, cut at its first '_' and lower-cased
+
+    withName: PRINSEQPLUSPLUS { // Settings below apply only to the PRINSEQPLUSPLUS process
+        ext.args = "-lc_entropy=0.8" // Extra prinseq++ option for the test run; the test manifest checks the log for "reads removed by -lc_entropy"
+    }
+
+}
--- a/tests/modules/prinseqplusplus/test.yml
+++ b/tests/modules/prinseqplusplus/test.yml
@ -0,0 +1,27 @@
+- name: prinseqplusplus test_prinseqplusplus_single_end
+  command: nextflow run tests/modules/prinseqplusplus -entry test_prinseqplusplus_single_end -c tests/config/nextflow.config
+  tags:
+    - prinseqplusplus
+  files:
+    - path: output/prinseqplusplus/test.log
+      contains:
+        - "reads removed by -lc_entropy"
+    - path: output/prinseqplusplus/test_bad_out.fastq.gz
+    - path: output/prinseqplusplus/test_good_out.fastq.gz
+    - path: output/prinseqplusplus/versions.yml
+
+- name: prinseqplusplus test_prinseqplusplus_paired_end
+  command: nextflow run tests/modules/prinseqplusplus -entry test_prinseqplusplus_paired_end -c tests/config/nextflow.config
+  tags:
+    - prinseqplusplus
+  files:
+    - path: output/prinseqplusplus/test.log
+      contains:
+        - "reads removed by -lc_entropy"
+    - path: output/prinseqplusplus/test_bad_out_R1.fastq.gz
+    - path: output/prinseqplusplus/test_bad_out_R2.fastq.gz
+    - path: output/prinseqplusplus/test_good_out_R1.fastq.gz
+    - path: output/prinseqplusplus/test_good_out_R2.fastq.gz
+    - path: output/prinseqplusplus/test_single_out_R1.fastq.gz
+    - path: output/prinseqplusplus/test_single_out_R2.fastq.gz
+    - path: output/prinseqplusplus/versions.yml