fix: reduce number of required input files for damage profiler (#612)

* Reduce number of required input files for damage profiler

* Remove debugging

* Add optional species list file.

* Working, pending test-dataset update

* Add genome header to config
This commit is contained in:
James A. Fellows Yates 2021-10-20 10:02:30 +02:00 committed by GitHub
parent 4e9e732b76
commit 97fe899f79
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 144 additions and 35 deletions

View file

@ -22,25 +22,30 @@ process DAMAGEPROFILER {
tuple val(meta), path(bam) tuple val(meta), path(bam)
path fasta path fasta
path fai path fai
path specieslist
output: output:
tuple val(meta), path("${prefix}"), emit: results tuple val(meta), path("${prefix}"), emit: results
path "versions.yml" , emit: versions path "versions.yml" , emit: versions
script: script:
prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}" def software = getSoftwareName(task.process)
prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}"
def reference = fasta ? "-r $fasta" : ""
def species_list = specieslist ? "-sf $specieslist" : ""
""" """
damageprofiler \\ damageprofiler \\
-i $bam \\ -i $bam \\
-r $fasta \\ -o $prefix/ \\
-o $prefix/ \\ $options.args \\
$options.args $reference \\
$species_list
cat <<-END_VERSIONS > versions.yml cat <<-END_VERSIONS > versions.yml
${getProcessName(task.process)}: ${getProcessName(task.process)}:
${getSoftwareName(task.process)}: \$(damageprofiler -v | sed 's/^DamageProfiler v//') ${getSoftwareName(task.process)}: \$(damageprofiler -v | sed 's/^DamageProfiler v//')
END_VERSIONS END_VERSIONS
""" """
} }

View file

@ -32,12 +32,16 @@ input:
pattern: "*.{bam,cram,sam}" pattern: "*.{bam,cram,sam}"
- fasta: - fasta:
type: file type: file
description: FASTA reference file description: OPTIONAL FASTA reference file
pattern: "*.{fasta,fna,fa}" pattern: "*.{fasta,fna,fa}"
- fai: - fai:
type: file type: file
description: FASTA index file from samtools faidx description: OPTIONAL FASTA index file from samtools faidx
pattern: "*.{fai}" pattern: "*.{fai}"
- specieslist:
type: file
description: OPTIONAL text file with list of target reference headers
pattern: "*.{txt}"
output: output:
- versions: - versions:

View file

@ -104,6 +104,7 @@ params {
genome_gtf = "${test_data_dir}/genomics/homo_sapiens/genome/genome.gtf" genome_gtf = "${test_data_dir}/genomics/homo_sapiens/genome/genome.gtf"
genome_sizes = "${test_data_dir}/genomics/homo_sapiens/genome/genome.sizes" genome_sizes = "${test_data_dir}/genomics/homo_sapiens/genome/genome.sizes"
genome_bed = "${test_data_dir}/genomics/homo_sapiens/genome/genome.bed" genome_bed = "${test_data_dir}/genomics/homo_sapiens/genome/genome.bed"
genome_header = "${test_data_dir}/genomics/homo_sapiens/genome/genome.header"
genome_bed_gz = "${test_data_dir}/genomics/homo_sapiens/genome/genome.bed.gz" genome_bed_gz = "${test_data_dir}/genomics/homo_sapiens/genome/genome.bed.gz"
genome_bed_gz_tbi = "${test_data_dir}/genomics/homo_sapiens/genome/genome.bed.gz.tbi" genome_bed_gz_tbi = "${test_data_dir}/genomics/homo_sapiens/genome/genome.bed.gz.tbi"
transcriptome_fasta = "${test_data_dir}/genomics/homo_sapiens/genome/transcriptome.fasta" transcriptome_fasta = "${test_data_dir}/genomics/homo_sapiens/genome/transcriptome.fasta"
@ -119,18 +120,19 @@ params {
repeat_expansions = "${test_data_dir}/genomics/homo_sapiens/genome/loci/repeat_expansions.json" repeat_expansions = "${test_data_dir}/genomics/homo_sapiens/genome/loci/repeat_expansions.json"
} }
'illumina' { 'illumina' {
test_paired_end_sorted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam" test_paired_end_sorted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam"
test_paired_end_sorted_bam_bai = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam.bai" test_paired_end_sorted_bam_bai = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam.bai"
test_paired_end_markduplicates_sorted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.markduplicates.sorted.bam" test_paired_end_markduplicates_sorted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.markduplicates.sorted.bam"
test_paired_end_markduplicates_sorted_bam_bai = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.markduplicates.sorted.bam.bai" test_paired_end_markduplicates_sorted_bam_bai = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.markduplicates.sorted.bam.bai"
test_paired_end_recalibrated_sorted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.recalibrated.sorted.bam" test_paired_end_markduplicates_sorted_referencesn_txt = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.markduplicates.sorted.referencesn.txt"
test_paired_end_recalibrated_sorted_bam_bai = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.recalibrated.sorted.bam.bai" test_paired_end_recalibrated_sorted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.recalibrated.sorted.bam"
test_paired_end_umi_consensus_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_consensus.bam" test_paired_end_recalibrated_sorted_bam_bai = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.recalibrated.sorted.bam.bai"
test_paired_end_umi_converted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_converted.bam" test_paired_end_umi_consensus_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_consensus.bam"
test_paired_end_umi_grouped_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_grouped.bam" test_paired_end_umi_converted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_converted.bam"
test_paired_end_umi_histogram_txt = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_histogram.txt" test_paired_end_umi_grouped_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_grouped.bam"
test_paired_end_umi_unsorted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_unsorted.bam" test_paired_end_umi_histogram_txt = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_histogram.txt"
test_paired_end_umi_unsorted_tagged_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.unsorted_tagged.bam" test_paired_end_umi_unsorted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_unsorted.bam"
test_paired_end_umi_unsorted_tagged_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.unsorted_tagged.bam"
test2_paired_end_sorted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test2.paired_end.sorted.bam" test2_paired_end_sorted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test2.paired_end.sorted.bam"
test2_paired_end_sorted_bam_bai = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test2.paired_end.sorted.bam.bai" test2_paired_end_sorted_bam_bai = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test2.paired_end.sorted.bam.bai"

View file

@ -6,10 +6,34 @@ include { DAMAGEPROFILER } from '../../../modules/damageprofiler/main.nf' addPar
workflow test_damageprofiler { workflow test_damageprofiler {
input = [ [ id:'test', single_end:false ], // meta map input = [ [ id:'test', single_end:false ], // meta map
[ file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) ] ] [ file(params.test_data['homo_sapiens']['illumina']['test_paired_end_markduplicates_sorted_bam'], checkIfExists: true) ] ]
fasta = file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) fasta = []
fai = file(params.test_data['homo_sapiens']['genome']['genome_fasta_fai'], checkIfExists: true) fai = []
species_list = []
DAMAGEPROFILER ( input, fasta, fai )
DAMAGEPROFILER ( input, fasta, fai, species_list )
}
// Test entry workflow: runs DAMAGEPROFILER on the mark-duplicates sorted BAM with a
// reference FASTA and its .fai index supplied, and with no species list.
workflow test_damageprofiler_reference {
input = [ [ id:'test', single_end:false ], // meta map
[ file(params.test_data['homo_sapiens']['illumina']['test_paired_end_markduplicates_sorted_bam'], checkIfExists: true) ] ]
// Reference genome and index are looked up in the shared test-data map; checkIfExists is set on both lookups.
fasta = file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)
fai = file(params.test_data['homo_sapiens']['genome']['genome_fasta_fai'], checkIfExists: true)
// An empty list is passed in the species-list position because no such file is used in this test.
species_list = []
// Argument order: BAM tuple, FASTA, FASTA index, species list.
DAMAGEPROFILER ( input, fasta, fai, species_list )
}
workflow test_damageprofiler_specieslist {
input = [ [ id:'test', single_end:false ], // meta map
[ file(params.test_data['homo_sapiens']['illumina']['test_paired_end_markduplicates_sorted_bam'], checkIfExists: true) ] ]
fasta = []
fai = []
species_list = file(params.test_data['homo_sapiens']['genome']['genome_header'], checkIfExists: true)
DAMAGEPROFILER ( input, fasta, fai, species_list )
} }

View file

@ -4,13 +4,13 @@
- damageprofiler - damageprofiler
files: files:
- path: output/damageprofiler/test/3p_freq_misincorporations.txt - path: output/damageprofiler/test/3p_freq_misincorporations.txt
md5sum: da4cac90c78899a7cb6d72d415392b49 md5sum: de3b84d946a6b63cdcfadf82bf6854c0
- path: output/damageprofiler/test/3pGtoA_freq.txt - path: output/damageprofiler/test/3pGtoA_freq.txt
md5sum: 8dab75d51a4b943b501d0995169c767f md5sum: 61c903b1504ed7d7182570dfc75e4498
- path: output/damageprofiler/test/5pCtoT_freq.txt - path: output/damageprofiler/test/5pCtoT_freq.txt
md5sum: fcc48ee5f72edff930d627c8bfdd8a5b md5sum: 15a75b60ee519b61ce04a83fe3afe855
- path: output/damageprofiler/test/5p_freq_misincorporations.txt - path: output/damageprofiler/test/5p_freq_misincorporations.txt
md5sum: 54665474f5ef17dcc268567e5eaa7d86 md5sum: 3b3240d6c1a3491e461b39199a9fcfe3
- path: output/damageprofiler/test/DamagePlot_five_prime.svg - path: output/damageprofiler/test/DamagePlot_five_prime.svg
- path: output/damageprofiler/test/DamagePlot.pdf - path: output/damageprofiler/test/DamagePlot.pdf
- path: output/damageprofiler/test/DamagePlot_three_prime.svg - path: output/damageprofiler/test/DamagePlot_three_prime.svg
@ -18,19 +18,93 @@
contains: contains:
- "FINISHED SUCCESSFULLY" - "FINISHED SUCCESSFULLY"
- path: output/damageprofiler/test/dmgprof.json - path: output/damageprofiler/test/dmgprof.json
md5sum: 98499024c7e937896e481f2d3cfbdd3e md5sum: 2e54e712d2ae9e32c4c298e5fd8f60fe
- path: output/damageprofiler/test/DNA_comp_genome.txt - path: output/damageprofiler/test/DNA_comp_genome.txt
md5sum: f91e70760d91a1193a27e360aaddf2fd md5sum: fea48af1ecf491b439d36d4a919473df
- path: output/damageprofiler/test/DNA_composition_sample.txt - path: output/damageprofiler/test/DNA_composition_sample.txt
md5sum: 1257eb3eb42484647bfba2151f9ef04f md5sum: 9e17a0b1e5ad4eb13201cd24ad8507dd
- path: output/damageprofiler/test/edit_distance.pdf - path: output/damageprofiler/test/edit_distance.pdf
- path: output/damageprofiler/test/edit_distance.svg - path: output/damageprofiler/test/edit_distance.svg
- path: output/damageprofiler/test/editDistance.txt - path: output/damageprofiler/test/editDistance.txt
md5sum: af2d2f4a99058ec56eae88ec27779e38 md5sum: 04d14b449a5afa8b5dbff0dfa762356b
- path: output/damageprofiler/test/Length_plot_combined_data.svg - path: output/damageprofiler/test/Length_plot_combined_data.svg
- path: output/damageprofiler/test/Length_plot_forward_reverse_separated.svg - path: output/damageprofiler/test/Length_plot_forward_reverse_separated.svg
- path: output/damageprofiler/test/Length_plot.pdf - path: output/damageprofiler/test/Length_plot.pdf
- path: output/damageprofiler/test/lgdistribution.txt - path: output/damageprofiler/test/lgdistribution.txt
md5sum: c5d029bf3a92b613310ee23f47d94981 md5sum: df2e19195185ea9ee05e8e84b2948f36
- path: output/damageprofiler/test/misincorporation.txt - path: output/damageprofiler/test/misincorporation.txt
md5sum: 3aa6dd749010a492d92a815a83c196a8 md5sum: bec0c5fc2fa9c82b04949e2d8b6e979c
# Test case for the test_damageprofiler_reference entry workflow.
# Result files are expected directly under output/damageprofiler/test/.
# The .txt and .json outputs are pinned by md5sum; .svg and .pdf plots are listed by path only;
# DamageProfiler.log is checked for the string "FINISHED SUCCESSFULLY".
- name: damageprofiler_reference
command: nextflow run ./tests/modules/damageprofiler -entry test_damageprofiler_reference -c tests/config/nextflow.config -dump-channels
tags:
- damageprofiler
files:
- path: output/damageprofiler/test/3p_freq_misincorporations.txt
md5sum: de3b84d946a6b63cdcfadf82bf6854c0
- path: output/damageprofiler/test/3pGtoA_freq.txt
md5sum: 61c903b1504ed7d7182570dfc75e4498
- path: output/damageprofiler/test/5pCtoT_freq.txt
md5sum: 15a75b60ee519b61ce04a83fe3afe855
- path: output/damageprofiler/test/5p_freq_misincorporations.txt
md5sum: 3b3240d6c1a3491e461b39199a9fcfe3
- path: output/damageprofiler/test/DamagePlot_five_prime.svg
- path: output/damageprofiler/test/DamagePlot.pdf
- path: output/damageprofiler/test/DamagePlot_three_prime.svg
- path: output/damageprofiler/test/DamageProfiler.log
contains:
- "FINISHED SUCCESSFULLY"
- path: output/damageprofiler/test/dmgprof.json
md5sum: 2e54e712d2ae9e32c4c298e5fd8f60fe
- path: output/damageprofiler/test/DNA_comp_genome.txt
md5sum: fea48af1ecf491b439d36d4a919473df
- path: output/damageprofiler/test/DNA_composition_sample.txt
md5sum: 9e17a0b1e5ad4eb13201cd24ad8507dd
- path: output/damageprofiler/test/edit_distance.pdf
- path: output/damageprofiler/test/edit_distance.svg
- path: output/damageprofiler/test/editDistance.txt
md5sum: 04d14b449a5afa8b5dbff0dfa762356b
- path: output/damageprofiler/test/Length_plot_combined_data.svg
- path: output/damageprofiler/test/Length_plot_forward_reverse_separated.svg
- path: output/damageprofiler/test/Length_plot.pdf
- path: output/damageprofiler/test/lgdistribution.txt
md5sum: df2e19195185ea9ee05e8e84b2948f36
- path: output/damageprofiler/test/misincorporation.txt
md5sum: bec0c5fc2fa9c82b04949e2d8b6e979c
# Test case for the test_damageprofiler_specieslist entry workflow.
# Result files are expected in the chr22/ subdirectory of output/damageprofiler/test/,
# except DamageProfiler.log, which is expected at output/damageprofiler/test/ itself.
# The .txt and .json outputs are pinned by md5sum; .svg and .pdf plots are listed by path only;
# DamageProfiler.log is checked for the string "FINISHED SUCCESSFULLY".
- name: damageprofiler_specieslist
command: nextflow run ./tests/modules/damageprofiler -entry test_damageprofiler_specieslist -c tests/config/nextflow.config -dump-channels
tags:
- damageprofiler
files:
- path: output/damageprofiler/test/chr22/3p_freq_misincorporations.txt
md5sum: de3b84d946a6b63cdcfadf82bf6854c0
- path: output/damageprofiler/test/chr22/3pGtoA_freq.txt
md5sum: 61c903b1504ed7d7182570dfc75e4498
- path: output/damageprofiler/test/chr22/5pCtoT_freq.txt
md5sum: 15a75b60ee519b61ce04a83fe3afe855
- path: output/damageprofiler/test/chr22/5p_freq_misincorporations.txt
md5sum: 3b3240d6c1a3491e461b39199a9fcfe3
- path: output/damageprofiler/test/chr22/DamagePlot_five_prime.svg
- path: output/damageprofiler/test/chr22/DamagePlot.pdf
- path: output/damageprofiler/test/chr22/DamagePlot_three_prime.svg
- path: output/damageprofiler/test/DamageProfiler.log
contains:
- "FINISHED SUCCESSFULLY"
- path: output/damageprofiler/test/chr22/dmgprof.json
md5sum: 2e54e712d2ae9e32c4c298e5fd8f60fe
- path: output/damageprofiler/test/chr22/DNA_comp_genome.txt
md5sum: fea48af1ecf491b439d36d4a919473df
- path: output/damageprofiler/test/chr22/DNA_composition_sample.txt
md5sum: 9e17a0b1e5ad4eb13201cd24ad8507dd
- path: output/damageprofiler/test/chr22/edit_distance.pdf
- path: output/damageprofiler/test/chr22/edit_distance.svg
- path: output/damageprofiler/test/chr22/editDistance.txt
md5sum: 04d14b449a5afa8b5dbff0dfa762356b
- path: output/damageprofiler/test/chr22/Length_plot_combined_data.svg
- path: output/damageprofiler/test/chr22/Length_plot_forward_reverse_separated.svg
- path: output/damageprofiler/test/chr22/Length_plot.pdf
- path: output/damageprofiler/test/chr22/lgdistribution.txt
md5sum: df2e19195185ea9ee05e8e84b2948f36
- path: output/damageprofiler/test/chr22/misincorporation.txt
md5sum: bec0c5fc2fa9c82b04949e2d8b6e979c