new module: pbccs (#688)

* 📦 NEW: First commit of pbccs module

* 👌 IMPROVE: Remove option from command + rename output (ccs -> bam)

* 👌 IMPROVE: Move .pbi output  into report channel

* 🐛FIX: Correct code after --rq option removal from command line module

- module main.nf: Remove remaining rq input channel
- Test main.nf: Transfer rq into addParams
- Test test.yml: Update md5sums

* 🐛 FIX: Repair additional option usage

* 👌 IMPROVE: Add some pacbio test files

* 🐛 FIX: Add Pacbio index to test_data.config

* 👌 IMPROVE: CCS is run in parallel with --chunk option

* 👌 IMPROVE: Add Pbindex in bam output channel

* 👌 IMPROVE: Change label to process_low

* 👌 IMPROVE: Define reports files names + add json version of txt report

* 🐛 FIX: Add missing backslashes

* 🐛 FIX: Add missing gz extension

* 🐛 FIX: update output channel

* 🐛 FIX: output file name

* 👌 IMPROVE: .gitignore

* 👌 IMPROVE: Update function.nf to last version

* 👌 IMPROVE: Update saveAs in main.nf

* 👌 IMPROVE: Add pbccs module

* 🐛 FIX: Fix Broken test

* 👌 IMPROVE: Update test_data.config

* 🐛 FIX: Fix test

* 👌 IMPROVE: Update path of test dataset files

* 👌 IMPROVE: Remove useless index + Fix Typos

* 📦 NEW: First commit of pbccs module

* 👌 IMPROVE: Remove option from command + rename output (ccs -> bam)

* 👌 IMPROVE: Move .pbi output  into report channel

* 🐛FIX: Correct code after --rq option removal from command line module

- module main.nf: Remove remaining rq input channel
- Test main.nf: Transfer rq into addParams
- Test test.yml: Update md5sums

* 🐛 FIX: Repair additional option usage

* 👌 IMPROVE: Add some pacbio test files

* 🐛 FIX: Add Pacbio index to test_data.config

* 👌 IMPROVE: CCS is run in parallel with --chunk option

* 👌 IMPROVE: Add Pbindex in bam output channel

* 👌 IMPROVE: Change label to process_low

* 👌 IMPROVE: Define reports files names + add json version of txt report

* 🐛 FIX: Add missing backslashes

* 🐛 FIX: Add missing gz extension

* 🐛 FIX: update output channel

* 🐛 FIX: output file name

* 👌 IMPROVE: .gitignore

* 👌 IMPROVE: Update function.nf to last version

* 👌 IMPROVE: Update saveAs in main.nf

* 👌 IMPROVE: Add pbccs module

* 🐛 FIX: Fix Broken test

* 👌 IMPROVE: Update test_data.config

* 🐛 FIX: Fix test

* 👌 IMPROVE: Update path of test dataset files

* 👌 IMPROVE: Remove useless index + Fix Typos

* 🐛 FIX: fill contains args

* 👌 IMPROVE: One output => One Channel

* 👌 IMPROVE: One input => One channel

* 🐛 FIX: Update tests

* 🐛 FIX: Remove TODOs from test.yaml

* 👌 IMPROVE: Revert and keep bam and pbi together

* 🐛 FIX: Remove old rq input from meta.yml

* 👌 IMPROVE: Update test to match input channels

Co-authored-by: James A. Fellows Yates <jfy133@gmail.com>
This commit is contained in:
Sébastien Guizard 2021-09-16 11:48:18 +01:00 committed by GitHub
parent 1840289068
commit bbf268c5d3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 225 additions and 1 deletions

1
.gitignore vendored
View file

@ -7,3 +7,4 @@ output/
*.code-workspace *.code-workspace
.screenrc .screenrc
.*.sw? .*.sw?
tests/data/

View file

@ -0,0 +1,68 @@
//
// Utility functions used in nf-core DSL2 module files
//
//
// Derive the lower-case software tool name from a process name ($task.process):
// keep the text after the last ':' and, within it, the text before the first '_'
//
def getSoftwareName(task_process) {
    def scope_parts  = task_process.tokenize(':')
    def process_name = scope_parts[-1]
    def tool_name    = process_name.tokenize('_')[0]
    return tool_name.toLowerCase()
}
//
// Build the Groovy Map of options available to nf-core modules, filling in defaults.
// Every key falls back to an empty string (or an empty list for publish_by_meta) when
// unset or falsy, except publish_files, which is copied through unchanged (null included).
//
def initOptions(Map args) {
    return [
        args            : args.args            ?: '',
        args2           : args.args2           ?: '',
        args3           : args.args3           ?: '',
        publish_by_meta : args.publish_by_meta ?: [],
        publish_dir     : args.publish_dir     ?: '',
        publish_files   : args.publish_files,
        suffix          : args.suffix          ?: ''
    ]
}
//
// Tidy up and join elements of a list to return a path string.
// Null, empty and whitespace-only entries are dropped; each remaining entry is
// trimmed and stripped of leading and trailing slashes before joining with '/'.
//
def getPathFromList(path_list) {
    // Groovy truth covers null and '' in one test. The previous
    // `!item?.trim().isEmpty()` threw a NullPointerException on a null entry,
    // because `?.` does not short-circuit the rest of the call chain.
    def paths = path_list.findAll { item -> item?.trim() }                        // Remove null/empty entries
    paths     = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") }        // Trim whitespace and leading/trailing slashes
    return paths.join('/')
}
//
// Compute the publish path for a module result file (used as a publishDir saveAs target).
//
// Returns null (nothing is returned for the file) when:
//   - the file name ends with '.version.txt'
//   - options.publish_files is a Map and no key is a suffix of the file name
//   - options.publish_files is neither a Map nor null
// Otherwise returns '<publish dir>[/<meta-derived parts>][/<publish_files value>]/<filename>'.
//
def saveFiles(Map args) {
    if (args.filename.endsWith('.version.txt')) {
        return null
    }

    def ioptions  = initOptions(args.options)
    // options.publish_dir takes precedence over the publish_dir supplied by the caller
    def path_list = [ ioptions.publish_dir ?: args.publish_dir ]

    if (ioptions.publish_by_meta) {
        // A List given through options is used as-is; any other truthy value
        // falls back to the key list supplied by the caller
        def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta
        for (key in key_list) {
            if (!(args.meta && key instanceof String)) {
                continue
            }
            // Keys absent from meta contribute their own name as a path element
            def path = key
            if (args.meta.containsKey(key)) {
                def meta_value = args.meta[key]
                path = meta_value instanceof Boolean ? "${key}_${meta_value}".toString() : meta_value
            }
            // Non-String values contribute an empty element
            path_list.add(path instanceof String ? path : '')
        }
    }

    if (ioptions.publish_files instanceof Map) {
        // The first entry whose key is a suffix of the file name decides the sub-directory
        def match = ioptions.publish_files.find { ext -> args.filename.endsWith(ext.key) }
        if (match == null) {
            return null
        }
        return "${getPathFromList(path_list + [match.value])}/$args.filename"
    }

    if (ioptions.publish_files == null) {
        return "${getPathFromList(path_list)}/$args.filename"
    }

    return null
}

54
modules/pbccs/main.nf Normal file
View file

@ -0,0 +1,54 @@
// Import generic module functions
include { initOptions; saveFiles; getSoftwareName } from './functions'
// Module options: defaults are filled in by initOptions
params.options = [:]
options        = initOptions(params.options)

//
// Run PacBio `ccs` on one chunk (chunk_num out of chunk_on) of a subreads BAM
//
process PBCCS {
    tag "$meta.id"
    label 'process_low'
    publishDir "${params.outdir}",
        mode: params.publish_dir_mode,
        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) }

    conda (params.enable_conda ? "bioconda::pbccs=6.0.0" : null)
    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
        container "https://depot.galaxyproject.org/singularity/pbccs:6.0.0--h9ee0642_2"
    } else {
        container "quay.io/biocontainers/pbccs:6.0.0--h9ee0642_2"
    }

    input:
    tuple val(meta), path(bam), path(pbi)
    val chunk_num
    val chunk_on

    output:
    tuple val(meta), path("*.ccs.bam")            , emit: bam
    tuple val(meta), path("*.ccs.bam.pbi")        , emit: pbi
    tuple val(meta), path("*.ccs_report.txt" )    , emit: ccs_report_txt
    tuple val(meta), path("*.ccs_report.json" )   , emit: ccs_report_json
    tuple val(meta), path("*.zmw_metrics.json.gz"), emit: zmw_metrics
    tuple val(meta), path("*.version.txt" )       , emit: version

    script:
    def software    = getSoftwareName(task.process)
    // Shared stem of every output name: the input file name with its trailing
    // 'bam' removed (the dot is kept), followed by the chunk number
    def out_stem    = bam.toString().replaceAll(/bam$/, '') + chunk_num
    def ccs         = out_stem + '.ccs.bam'
    def report_txt  = out_stem + '.ccs_report.txt'
    def report_json = out_stem + '.ccs_report.json'
    def zmw_metrics = out_stem + '.zmw_metrics.json.gz'
    """
    ccs \\
        ${bam} \\
        ${ccs} \\
        --report-file ${report_txt} \\
        --report-json ${report_json} \\
        --metrics-json ${zmw_metrics} \\
        --chunk ${chunk_num}/${chunk_on} \\
        -j ${task.cpus} \\
        ${options.args}

    echo \$(ccs --version 2>&1) | grep -e 'commit' > ${software}.version.txt
    """
}

51
modules/pbccs/meta.yml Normal file
View file

@ -0,0 +1,51 @@
# Module description for modules/pbccs/main.nf.
# Entries under `output` are named after the `emit:` labels of the PBCCS process.
name: pbccs
description: Pacbio ccs - Generate Highly Accurate Single-Molecule Consensus Reads
keywords:
  - ccs
tools:
  - pbccs:
      description: pbccs - Generate Highly Accurate Single-Molecule Consensus Reads (HiFi Reads)
      homepage: https://github.com/PacificBiosciences/pbbioconda
      documentation: https://ccs.how/
      tool_dev_url: https://github.com/PacificBiosciences/ccs
      doi: ""
      licence: ['BSD-3-clause-Clear']

input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - bam:
      type: file
      description: Raw subreads bam
      pattern: "*.subreads.bam"
  - pbi:
      type: file
      description: Pacbio BAM Index
      pattern: "*.pbi"
  - chunk_num:
      type: integer
      description: BAM part to process
  - chunk_on:
      type: integer
      description: Total number of bam parts to process

output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - version:
      type: file
      description: File containing software version
      pattern: "*.{version.txt}"
  - bam:
      type: file
      description: Consensus sequences
      pattern: "*.ccs.bam"
  - pbi:
      type: file
      description: Pacbio BAM Index of the consensus sequences
      pattern: "*.ccs.bam.pbi"
  - ccs_report_txt:
      type: file
      description: ccs report in text format (written via --report-file)
      pattern: "*.ccs_report.txt"
  - ccs_report_json:
      type: file
      description: ccs report in JSON format (written via --report-json)
      pattern: "*.ccs_report.json"
  - zmw_metrics:
      type: file
      description: Gzipped JSON metrics file (written via --metrics-json)
      pattern: "*.zmw_metrics.json.gz"

authors:
  - "@sguizard"

View file

@ -651,6 +651,10 @@ pangolin:
- modules/pangolin/** - modules/pangolin/**
- tests/modules/pangolin/** - tests/modules/pangolin/**
pbccs:
- modules/pbccs/**
- tests/modules/pbccs/**
picard/collectmultiplemetrics: picard/collectmultiplemetrics:
- modules/picard/collectmultiplemetrics/** - modules/picard/collectmultiplemetrics/**
- tests/modules/picard/collectmultiplemetrics/** - tests/modules/picard/collectmultiplemetrics/**

View file

@ -100,6 +100,7 @@ params {
genome_sizes = "${test_data_dir}/genomics/homo_sapiens/genome/genome.sizes" genome_sizes = "${test_data_dir}/genomics/homo_sapiens/genome/genome.sizes"
genome_bed = "${test_data_dir}/genomics/homo_sapiens/genome/genome.bed" genome_bed = "${test_data_dir}/genomics/homo_sapiens/genome/genome.bed"
transcriptome_fasta = "${test_data_dir}/genomics/homo_sapiens/genome/transcriptome.fasta" transcriptome_fasta = "${test_data_dir}/genomics/homo_sapiens/genome/transcriptome.fasta"
genome2_fasta = "${test_data_dir}/genomics/homo_sapiens/genome/genome2.fasta"
dbsnp_146_hg38_vcf_gz = "${test_data_dir}/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz" dbsnp_146_hg38_vcf_gz = "${test_data_dir}/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz"
dbsnp_146_hg38_vcf_gz_tbi = "${test_data_dir}/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz.tbi" dbsnp_146_hg38_vcf_gz_tbi = "${test_data_dir}/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz.tbi"
@ -168,7 +169,18 @@ params {
test2_yak = "${test_data_dir}/genomics/homo_sapiens/illumina/yak/test2.yak" test2_yak = "${test_data_dir}/genomics/homo_sapiens/illumina/yak/test2.yak"
} }
'pacbio' { 'pacbio' {
test_hifi_fastq_gz = "${test_data_dir}/genomics/homo_sapiens/pacbio/fastq/test_hifi.fastq.gz" primers = "${test_data_dir}/genomics/homo_sapiens/pacbio/fasta/primers.fasta"
alz = "${test_data_dir}/genomics/homo_sapiens/pacbio/bam/alz.bam"
alzpbi = "${test_data_dir}/genomics/homo_sapiens/pacbio/bam/alz.bam.pbi"
ccs = "${test_data_dir}/genomics/homo_sapiens/pacbio/bam/alz.ccs.bam"
lima = "${test_data_dir}/genomics/homo_sapiens/pacbio/bam/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.bam"
refine = "${test_data_dir}/genomics/homo_sapiens/pacbio/bam/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.bam"
cluster = "${test_data_dir}/genomics/homo_sapiens/pacbio/bam/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.clustered.bam"
singletons = "${test_data_dir}/genomics/homo_sapiens/pacbio/bam/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.clustered.singletons.bam"
aligned = "${test_data_dir}/genomics/homo_sapiens/pacbio/bam/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.clustered.singletons.merged.aligned.bam"
alignedbai = "${test_data_dir}/genomics/homo_sapiens/pacbio/bam/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.clustered.singletons.merged.aligned.bam.bai"
genemodel1 = "${test_data_dir}/genomics/homo_sapiens/pacbio/bam/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.clustered.singletons.merged.aligned_tc.bed"
genemodel2 = "${test_data_dir}/genomics/homo_sapiens/pacbio/bam/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.clustered.singletons.merged.aligned_tc.2.bed"
} }
} }
} }

View file

@ -0,0 +1,19 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { PBCCS } from '../../../modules/pbccs/main.nf' addParams( options: [args:'--min-rq 0.9'] )
//
// Run PBCCS on chunk 2 of 3 of the 'alz' PacBio test BAM
//
workflow test_pbccs {

    // [ meta map, BAM, BAM index ] tuple expected by the first PBCCS input
    ch_subreads = [
        [ id:'test' ],
        file(params.test_data['homo_sapiens']['pacbio']['alz'],    checkIfExists: true),
        file(params.test_data['homo_sapiens']['pacbio']['alzpbi'], checkIfExists: true)
    ]

    // Chunk to process and total number of chunks (passed to `ccs --chunk`)
    chunk_index = 2
    chunk_total = 3

    PBCCS ( ch_subreads, chunk_index, chunk_total )
}

View file

@ -0,0 +1,15 @@
# Expected outputs of the test_pbccs workflow.
# File names follow <input BAM name without 'bam'><chunk_num>.<suffix>; the test
# workflow passes the 'alz' BAM with chunk_num = 2, hence the 'alz.2.' prefix.
- name: pbccs test_pbccs
command: nextflow run tests/modules/pbccs -entry test_pbccs -c tests/config/nextflow.config
tags:
- pbccs
files:
- path: output/pbccs/alz.2.ccs.bam
md5sum: b9c8093b362a07b575d52592b19fc909
- path: output/pbccs/alz.2.ccs.bam.pbi
md5sum: 78d015230a8c957a24338581efda4e55
# Checked by content instead of md5sum
# NOTE(review): presumably because the file is not byte-stable between runs — confirm
- path: output/pbccs/alz.2.ccs_report.json
contains: ['Created by pbcopper v1.8.0']
- path: output/pbccs/alz.2.ccs_report.txt
md5sum: db379e9299295679f4ca7eeb37011f08
# Checked by content instead of md5sum
# NOTE(review): presumably because gzip output is not byte-stable — confirm
- path: output/pbccs/alz.2.zmw_metrics.json.gz
contains: ['zmws']