module FASTP: Support for interleaved FASTQ (#1891)

* 1882 FASTP now supports interleaved FASTQ files

Changes:
 - single_end FASTP pipes the FASTQ file
 - Interleaved input can be enabled by passing `--interleaved_in` via args
 - Output is automatically interleaved if the input is paired-end.
 - Removed md5sum checks for FASTQ files as compression seemed to cause
 differences
 - Instead, we check inside the FASTQ files for content.

Relates to #1882

Co-authored-by: Matthias De Smet <11850640+matthdsm@users.noreply.github.com>
This commit is contained in:
Adam Talbot 2022-07-19 16:27:15 +01:00 committed by GitHub
parent 8d4373b4e8
commit 7e8ad56688
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 125 additions and 21 deletions

View file

@ -28,19 +28,23 @@ process FASTP {
def args = task.ext.args ?: ''
// Added soft-links to original fastqs for consistent naming in MultiQC
def prefix = task.ext.prefix ?: "${meta.id}"
// Use single ended for interleaved. Add --interleaved_in in config.
if (meta.single_end) {
def fail_fastq = save_trimmed_fail ? "--failed_out ${prefix}.fail.fastq.gz" : ''
"""
[ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz
fastp \\
cat ${prefix}.fastq.gz \\
| fastp \\
--stdin \\
--stdout \\
--in1 ${prefix}.fastq.gz \\
--out1 ${prefix}.fastp.fastq.gz \\
--thread $task.cpus \\
--json ${prefix}.fastp.json \\
--html ${prefix}.fastp.html \\
$fail_fastq \\
$args \\
2> ${prefix}.fastp.log
2> ${prefix}.fastp.log \\
| gzip -c > ${prefix}.fastp.fastq.gz
cat <<-END_VERSIONS > versions.yml
"${task.process}":
fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")

View file

@ -15,7 +15,7 @@ input:
- meta:
type: map
description: |
Groovy Map containing sample information
Groovy Map containing sample information. Use 'single_end: true' to specify single-end or interleaved FASTQs. Use 'single_end: false' for paired-end reads.
e.g. [ id:'test', single_end:false ]
- reads:
type: file

View file

@ -61,6 +61,7 @@ params {
test_1_fastq_gz = "${test_data_dir}/genomics/sarscov2/illumina/fastq/test_1.fastq.gz"
test_2_fastq_gz = "${test_data_dir}/genomics/sarscov2/illumina/fastq/test_2.fastq.gz"
test_interleaved_fastq_gz = "${test_data_dir}/genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz"
test2_1_fastq_gz = "${test_data_dir}/genomics/sarscov2/illumina/fastq/test2_1.fastq.gz"
test2_2_fastq_gz = "${test_data_dir}/genomics/sarscov2/illumina/fastq/test2_2.fastq.gz"
test_methylated_1_fastq_gz = "${test_data_dir}/genomics/sarscov2/illumina/fastq/test.methylated_1.fastq.gz"

View file

@ -31,6 +31,19 @@ workflow test_fastp_paired_end {
FASTP ( input, save_trimmed_fail, save_merged )
}
//
// Test with interleaved data
//
workflow test_fastp_interleaved {
// The interleaved FASTQ is passed as a single file with single_end:true, so the
// FASTP process takes its `meta.single_end` branch. The `--interleaved_in` flag
// itself is supplied through ext.args in the test nextflow.config.
input = [ [ id:'test', single_end:true ], // meta map
[ file(params.test_data['sarscov2']['illumina']['test_interleaved_fastq_gz'], checkIfExists: true) ]
]
// Both save flags are disabled for this test.
save_trimmed_fail = false
save_merged = false
FASTP ( input, save_trimmed_fail, save_merged )
}
//
// Test with single-end data with saving trimming fails
//

View file

@ -2,4 +2,7 @@ process {
publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
withName: '.*test_fastp_interleaved:FASTP' {
ext.args = "--interleaved_in"
}
}

View file

@ -4,13 +4,19 @@
- fastp
files:
- path: output/fastp/test.fastp.fastq.gz
md5sum: 4ce5c2b4db68a743cb0635ce7da3b9a4
contains:
- "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1"
- "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT"
- "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE<EEAAAEEEEEEEEEAAAAEAEEEAEEEEEE<AAAA"
- "@ERR5069949.576388 NS500628:121:HK3MMAFX2:4:11501:11167:14939/1"
- "ACTGTTTTCTTTGTAGAAAACATCCGTAATAGGACCTTTGTATTCTGAGGACTTTGTAAGTAAAGCACCGTCTATGC"
- "AAA6AEEEEEEEEEAEEE/6EEAEEEAEEEEEAEEEEEEEEEEEEEEEEEEEEE<AAEEEEEEEEEEE</EEEA/AE"
- path: output/fastp/test.fastp.html
contains:
- "Q20 bases:</td><td class='col2'>12.922000 K (92.984097%)"
- "single end (151 cycles)"
- path: output/fastp/test.fastp.json
md5sum: 7ee735cefb67f549dc857eefb9e7f123
md5sum: 803a024342be986f76486f6ffea15909
- path: output/fastp/test.fastp.log
contains:
- "Q20 bases: 12922(92.9841%)"
@ -33,9 +39,45 @@
- "No adapter detected for read1"
- "Q30 bases: 12281(88.3716%)"
- path: output/fastp/test_1.fastp.fastq.gz
md5sum: 4ce5c2b4db68a743cb0635ce7da3b9a4
contains:
- "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1"
- "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT"
- "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE<EEAAAEEEEEEEEEAAAAEAEEEAEEEEEE<AAAA"
- "@ERR5069949.576388 NS500628:121:HK3MMAFX2:4:11501:11167:14939/1"
- "ACTGTTTTCTTTGTAGAAAACATCCGTAATAGGACCTTTGTATTCTGAGGACTTTGTAAGTAAAGCACCGTCTATGC"
- "AAA6AEEEEEEEEEAEEE/6EEAEEEAEEEEEAEEEEEEEEEEEEEEEEEEEEE<AAEEEEEEEEEEE</EEEA/AE"
- path: output/fastp/test_2.fastp.fastq.gz
md5sum: 532b190fb4dc7b2277ee5cf1464e598c
contains:
- "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/2"
- "ATGTGTACATTGGCGACCCTGCTCAATTACCTGCACCACGCACATTGCTAACTAAGGGCACACTAGAACCAGAATATTTCAATTCAGTGTGTAGACTTATGAAAACTATAGGTCCAGACATGTTCCTCGGAACTTGTCGGCGTTGTCCTG"
- "AAAAAEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEEEEAEEEEEAAEEEEEEEEEAAEAAA<<EAAEEEEEEEAAA<<<AE"
- "@ERR5069949.576388 NS500628:121:HK3MMAFX2:4:11501:11167:14939/2"
- "GCATAGACGGTGCTTTACTTACAAAGTCCTCAGAATACAAAGGTCCTATTACGGATGTTTTCTACAAAGAAAACAGT"
- "AAAAA6EEAEEEEEAEEAEEAEEEEEEA6EEEEAEEAEEEEE6EEEEEEAEEEEA///A<<EEEEEEEEEAEEEEEE"
- name: fastp test_fastp_interleaved
command: nextflow run ./tests/modules/fastp -entry test_fastp_interleaved -c ./tests/config/nextflow.config -c ./tests/modules/fastp/nextflow.config
tags:
- fastp
files:
- path: output/fastp/test.fastp.fastq.gz
contains:
- "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1"
- "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT"
- "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE<EEAAAEEEEEEEEEAAAAEAEEEAEEEEEE<AAAA"
- "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/2"
- "ATGTGTACATTGGCGACCCTGCTCAATTACCTGCACCACGCACATTGCTAACTAAGGGCACACTAGAACCAGAATATTTCAATTCAGTGTGTAGACTTATGAAAACTATAGGTCCAGACATGTTCCTCGGAACTTGTCGGCGTTGTCCTG"
- "AAAAAEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEEEEAEEEEEAAEEEEEEEEEAAEAAA<<EAAEEEEEEEAAA<<<AE"
- path: output/fastp/test.fastp.html
contains:
- "Q20 bases:</td><td class='col2'>25.719000 K (93.033098%)"
- "paired end (151 cycles + 151 cycles)"
- path: output/fastp/test.fastp.json
md5sum: 5b70f43f33778d278a84b3e9270fa114
- path: output/fastp/test.fastp.log
contains:
- "Q20 bases: 12922(92.9841%)"
- "reads passed filter: 198"
- name: fastp test_fastp_single_end_trim_fail
command: nextflow run ./tests/modules/fastp -entry test_fastp_single_end_trim_fail -c ./tests/config/nextflow.config -c ./tests/modules/fastp/nextflow.config
@ -43,15 +85,24 @@
- fastp
files:
- path: output/fastp/test.fail.fastq.gz
md5sum: b57f2026eb259a0b0c0b3960c270258d
contains:
- "@ERR5069949.885966 NS500628:121:HK3MMAFX2:4:11610:19682:20132/1 failed_quality_filter"
- "GTCTAATCATAATTTCTTGGTACAGGCTGGTATTGTTCATCTCAGGGTTATTGGACATTCTATGCAAAATTGTGTACTT"
- "AAA//E/EAA/E//E//E//E/E//AE/A/E//EAEA///AE//E///E/EEE6EEEAEEA///E/AEE/EAEE/E//E"
- path: output/fastp/test.fastp.fastq.gz
md5sum: 4ce5c2b4db68a743cb0635ce7da3b9a4
contains:
- "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1"
- "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT"
- "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE<EEAAAEEEEEEEEEAAAAEAEEEAEEEEEE<AAAA"
- "@ERR5069949.576388 NS500628:121:HK3MMAFX2:4:11501:11167:14939/1"
- "ACTGTTTTCTTTGTAGAAAACATCCGTAATAGGACCTTTGTATTCTGAGGACTTTGTAAGTAAAGCACCGTCTATGC"
- "AAA6AEEEEEEEEEAEEE/6EEAEEEAEEEEEAEEEEEEEEEEEEEEEEEEEEE<AAEEEEEEEEEEE</EEEA/AE"
- path: output/fastp/test.fastp.html
contains:
- "Q20 bases:</td><td class='col2'>12.922000 K (92.984097%)"
- "single end (151 cycles)"
- path: output/fastp/test.fastp.json
md5sum: feafc4181a2a61b4b52d9c2b59b419ad
md5sum: b647fa752d3fe7956d17429bfe27d72c
- path: output/fastp/test.fastp.log
contains:
- "Q20 bases: 12922(92.9841%)"
@ -73,14 +124,28 @@
- path: output/fastp/test.fastp.json
contains:
- '"passed_filter_reads": 198'
- path: output/fastp/test_1.fail.fastq.gz
md5sum: d41d8cd98f00b204e9800998ecf8427e
- path: output/fastp/test_1.fastp.fastq.gz
md5sum: 4ce5c2b4db68a743cb0635ce7da3b9a4
- path: output/fastp/test_2.fail.fastq.gz
md5sum: 72d0002841967676ac936d08746a9128
contains:
- "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1"
- "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT"
- "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE<EEAAAEEEEEEEEEAAAAEAEEEAEEEEEE<AAAA"
- "@ERR5069949.576388 NS500628:121:HK3MMAFX2:4:11501:11167:14939/1"
- "ACTGTTTTCTTTGTAGAAAACATCCGTAATAGGACCTTTGTATTCTGAGGACTTTGTAAGTAAAGCACCGTCTATGC"
- "AAA6AEEEEEEEEEAEEE/6EEAEEEAEEEEEAEEEEEEEEEEEEEEEEEEEEE<AAEEEEEEEEEEE</EEEA/AE"
- path: output/fastp/test_2.fastp.fastq.gz
md5sum: 532b190fb4dc7b2277ee5cf1464e598c
contains:
- "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/2"
- "ATGTGTACATTGGCGACCCTGCTCAATTACCTGCACCACGCACATTGCTAACTAAGGGCACACTAGAACCAGAATATTTCAATTCAGTGTGTAGACTTATGAAAACTATAGGTCCAGACATGTTCCTCGGAACTTGTCGGCGTTGTCCTG"
- "AAAAAEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEEEEAEEEEEAAEEEEEEEEEAAEAAA<<EAAEEEEEEEAAA<<<AE"
- "@ERR5069949.576388 NS500628:121:HK3MMAFX2:4:11501:11167:14939/2"
- "GCATAGACGGTGCTTTACTTACAAAGTCCTCAGAATACAAAGGTCCTATTACGGATGTTTTCTACAAAGAAAACAGT"
- "AAAAA6EEAEEEEEAEEAEEAEEEEEEA6EEEEAEEAEEEEE6EEEEEEAEEEEA///A<<EEEEEEEEEAEEEEEE"
- path: output/fastp/test_1.fail.fastq.gz
- path: output/fastp/test_2.fail.fastq.gz
contains:
- "@ERR5069949.885966 NS500628:121:HK3MMAFX2:4:11610:19682:20132/2"
- "CTTAGGTCTTAGGATTGGCTGTATCAACCTTAAGCTTAAGTACACAATTTTGCATAGAATGTCCAATAA"
- "A//AA6EEAEEEEE6EEE/EEA/EA///AAE/EAEEEAE6AE/E/E/EEAAE/EAA/E/E/<EA//E/6"
- name: fastp test_fastp_paired_end_merged
command: nextflow run ./tests/modules/fastp -entry test_fastp_paired_end_merged -c ./tests/config/nextflow.config -c ./tests/modules/fastp/nextflow.config
@ -100,9 +165,27 @@
- "Merged and filtered:"
- "total reads: 75"
- "total bases: 13683"
- path: output/fastp/test.merged.fastq.gz
md5sum: 4955ca2c899729b17bd526d2626a8d73
- path: output/fastp/test_1.fastp.fastq.gz
md5sum: 4a03721ee252b7c6e81e007550e6ab63
contains:
- "@ERR5069949.1066259 NS500628:121:HK3MMAFX2:1:11312:18369:8333/1"
- "CCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTGACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCTCTGTTACTTC"
- "AAAAAEAEEAEEEEEEEEEEEEEEEEAEEEEAEEEEEEEEAEEEEEEEEEEEEEEEEE/EAEEEEEE/6EEEEEEEEEEAEEAEEE/EE/AEEAEEEEEAEEEA/EEAAEAE<AEEAEEEAEAEEEAEAEEAE/AEEEEAEEEEAEA"
- "@ERR5069949.324865 NS500628:121:HK3MMAFX2:1:11102:17526:14721/1"
- "CACAAACTCTAAAAGAATGTATAGGGTCAGCACCAAAAATACCAGCAGATAATAATGTTGCAAGTAGAACTTCGTGCAGATTAAAATTTTCATAAGCACTCTAAAGAAGTTGAATGTCTTCAAATTTCTTAACATTAGGGCCCACAACAAG"
- "AAAAAEA/A<EAA/AE/EE/EE//////EA/EEE/E/EEEE//E/6//EA//<AA/A/EEEAA/EEEE/EEEA/E/</AEE////AEEEE//<E//EAE/A///<EEE//<E<<EEE<///A//E/E/EEEAA/<A////<A/AEAAA//E"
- path: output/fastp/test_2.fastp.fastq.gz
md5sum: 7a4ddf8485c147cd7aaf0d4f6cd57ace
contains:
- "@ERR5069949.1066259 NS500628:121:HK3MMAFX2:1:11312:18369:8333/2"
- "GTACAAAAATAGCCTAAGAAACAATAAACTAGCATTATACACTGAAGTGTATTACCAGTTATGAAGAAAATAGGGCAATACTCAACACACATAAAAACAATACCTCTGGCCAAAAACATGACAGTTGTAACTACACCTGAGTAGTTAGAAG"
- "AAAAAEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEAEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEA/EAA</EEEEEEEEEE/AE//A/E<AE<AA<AEEE/AAEAAAEEAEEA<A6AEEA<EEAEEEEEEEAA//EE"
- "@ERR5069949.324865 NS500628:121:HK3MMAFX2:1:11102:17526:14721/2"
- "ATGAATCTGATGAATACATAGCTACTAATGGACCTCTTAAAGTGCGTGGTAGTTGTGATTAAAGCGGACACATACTTGCTAAACACTCTCTTCATGATGTC"
- "A/AAAEEEEA6AA6EE//EEA/EEEAE/EA/A////E</EEAA//EEA////EAE<///E/AEA</AAE/EA//E<EAAAE/AA//AEE//A/AE//</EE"
- path: output/fastp/test.merged.fastq.gz
contains:
- "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1 merged_150_37"
- "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGTGGTGCAGGTAATTGAGCAGGGTCGCCAATGTACACAT"
- "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE<EEAAAEEEEEEEEEAAAAEAEEEAEEEEEE<AAAAEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEAAAAA"
- "@ERR5069949.576388 NS500628:121:HK3MMAFX2:4:11501:11167:14939/1 merged_77_0"
- "ACTGTTTTCTTTGTAGAAAACATCCGTAATAGGACCTTTGTATTCTGAGGACTTTGTAAGTAAAGCACCGTCTATGC"
- "AAA6AEEEEEEEEEAEEE/6EEAEEEAEEEEEAEEEEEEEEEEEEEEEEEEEEE<AAEEEEEEEEEEE</EEEA/AE"