From 97fe899f792b2188737cb0f0075219feb22c2b2c Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 20 Oct 2021 10:02:30 +0200 Subject: [PATCH] fix: reduce number of required input files for damage profiler (#612) * Reduce number of required input files for damage profiler * Remove rebugging * Add optional species list file. * Working pending updated test-dataset update * Add genome header to config --- modules/damageprofiler/main.nf | 17 +++-- modules/damageprofiler/meta.yml | 8 ++- tests/config/test_data.config | 26 ++++---- tests/modules/damageprofiler/main.nf | 34 ++++++++-- tests/modules/damageprofiler/test.yml | 94 ++++++++++++++++++++++++--- 5 files changed, 144 insertions(+), 35 deletions(-) diff --git a/modules/damageprofiler/main.nf b/modules/damageprofiler/main.nf index 1537b019..3800a305 100644 --- a/modules/damageprofiler/main.nf +++ b/modules/damageprofiler/main.nf @@ -22,25 +22,30 @@ process DAMAGEPROFILER { tuple val(meta), path(bam) path fasta path fai + path specieslist output: tuple val(meta), path("${prefix}"), emit: results path "versions.yml" , emit: versions script: - prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}" + def software = getSoftwareName(task.process) + prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}" + def reference = fasta ? "-r $fasta" : "" + def species_list = specieslist ? "-sf $specieslist" : "" """ damageprofiler \\ - -i $bam \\ - -r $fasta \\ - -o $prefix/ \\ - $options.args - + -i $bam \\ + -o $prefix/ \\ + $options.args \\ + $reference \\ + $species_list cat <<-END_VERSIONS > versions.yml ${getProcessName(task.process)}: ${getSoftwareName(task.process)}: \$(damageprofiler -v | sed 's/^DamageProfiler v//') END_VERSIONS """ + } diff --git a/modules/damageprofiler/meta.yml b/modules/damageprofiler/meta.yml index ff82ba09..19ba908f 100644 --- a/modules/damageprofiler/meta.yml +++ b/modules/damageprofiler/meta.yml @@ -32,12 +32,16 @@ input: pattern: "*.{bam,cram,sam}" - fasta: type: file - description: FASTA reference file + description: OPTIONAL FASTA reference file pattern: "*.{fasta,fna,fa}" - fai: type: file - description: FASTA index file from samtools faidx + description: OPTIONAL FASTA index file from samtools faidx pattern: "*.{fai}" + - specieslist: + type: file + description: OPTIONAL text file with list of target reference headers + pattern: "*.{txt}" output: - versions: diff --git a/tests/config/test_data.config b/tests/config/test_data.config index 5381a311..6abfa4f8 100644 --- a/tests/config/test_data.config +++ b/tests/config/test_data.config @@ -104,6 +104,7 @@ params { genome_gtf = "${test_data_dir}/genomics/homo_sapiens/genome/genome.gtf" genome_sizes = "${test_data_dir}/genomics/homo_sapiens/genome/genome.sizes" genome_bed = "${test_data_dir}/genomics/homo_sapiens/genome/genome.bed" + genome_header = "${test_data_dir}/genomics/homo_sapiens/genome/genome.header" genome_bed_gz = "${test_data_dir}/genomics/homo_sapiens/genome/genome.bed.gz" genome_bed_gz_tbi = "${test_data_dir}/genomics/homo_sapiens/genome/genome.bed.gz.tbi" transcriptome_fasta = "${test_data_dir}/genomics/homo_sapiens/genome/transcriptome.fasta" @@ -119,18 +120,19 @@ params { repeat_expansions = "${test_data_dir}/genomics/homo_sapiens/genome/loci/repeat_expansions.json" } 'illumina' { - test_paired_end_sorted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam" - test_paired_end_sorted_bam_bai = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam.bai" - test_paired_end_markduplicates_sorted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.markduplicates.sorted.bam" - test_paired_end_markduplicates_sorted_bam_bai = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.markduplicates.sorted.bam.bai" - test_paired_end_recalibrated_sorted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.recalibrated.sorted.bam" - test_paired_end_recalibrated_sorted_bam_bai = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.recalibrated.sorted.bam.bai" - test_paired_end_umi_consensus_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_consensus.bam" - test_paired_end_umi_converted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_converted.bam" - test_paired_end_umi_grouped_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_grouped.bam" - test_paired_end_umi_histogram_txt = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_histogram.txt" - test_paired_end_umi_unsorted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_unsorted.bam" - test_paired_end_umi_unsorted_tagged_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.unsorted_tagged.bam" + test_paired_end_sorted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam" + test_paired_end_sorted_bam_bai = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam.bai" + test_paired_end_markduplicates_sorted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.markduplicates.sorted.bam" + test_paired_end_markduplicates_sorted_bam_bai = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.markduplicates.sorted.bam.bai" + test_paired_end_markduplicates_sorted_referencesn_txt = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.markduplicates.sorted.referencesn.txt" + test_paired_end_recalibrated_sorted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.recalibrated.sorted.bam" + test_paired_end_recalibrated_sorted_bam_bai = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test.paired_end.recalibrated.sorted.bam.bai" + test_paired_end_umi_consensus_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_consensus.bam" + test_paired_end_umi_converted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_converted.bam" + test_paired_end_umi_grouped_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_grouped.bam" + test_paired_end_umi_histogram_txt = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_histogram.txt" + test_paired_end_umi_unsorted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_unsorted.bam" + test_paired_end_umi_unsorted_tagged_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.unsorted_tagged.bam" test2_paired_end_sorted_bam = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test2.paired_end.sorted.bam" test2_paired_end_sorted_bam_bai = "${test_data_dir}/genomics/homo_sapiens/illumina/bam/test2.paired_end.sorted.bam.bai" diff --git a/tests/modules/damageprofiler/main.nf b/tests/modules/damageprofiler/main.nf index 5b128770..36ae7b24 100644 --- a/tests/modules/damageprofiler/main.nf +++ b/tests/modules/damageprofiler/main.nf @@ -6,10 +6,34 @@ include { DAMAGEPROFILER } from '../../../modules/damageprofiler/main.nf' addPar workflow test_damageprofiler { - input = [ [ id:'test', single_end:false ], // meta map - [ file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) ] ] - fasta = file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) - fai = file(params.test_data['homo_sapiens']['genome']['genome_fasta_fai'], checkIfExists: true) + input = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['homo_sapiens']['illumina']['test_paired_end_markduplicates_sorted_bam'], checkIfExists: true) ] ] + fasta = [] + fai = [] + species_list = [] - DAMAGEPROFILER ( input, fasta, fai ) + + DAMAGEPROFILER ( input, fasta, fai, species_list ) +} + +workflow test_damageprofiler_reference { + + input = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['homo_sapiens']['illumina']['test_paired_end_markduplicates_sorted_bam'], checkIfExists: true) ] ] + fasta = file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + fai = file(params.test_data['homo_sapiens']['genome']['genome_fasta_fai'], checkIfExists: true) + species_list = [] + + DAMAGEPROFILER ( input, fasta, fai, species_list ) +} + +workflow test_damageprofiler_specieslist { + + input = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['homo_sapiens']['illumina']['test_paired_end_markduplicates_sorted_bam'], checkIfExists: true) ] ] + fasta = [] + fai = [] + species_list = file(params.test_data['homo_sapiens']['genome']['genome_header'], checkIfExists: true) + + DAMAGEPROFILER ( input, fasta, fai, species_list ) } diff --git a/tests/modules/damageprofiler/test.yml b/tests/modules/damageprofiler/test.yml index 357647be..9ef964dc 100644 --- a/tests/modules/damageprofiler/test.yml +++ b/tests/modules/damageprofiler/test.yml @@ -4,13 +4,13 @@ - damageprofiler files: - path: output/damageprofiler/test/3p_freq_misincorporations.txt - md5sum: da4cac90c78899a7cb6d72d415392b49 + md5sum: de3b84d946a6b63cdcfadf82bf6854c0 - path: output/damageprofiler/test/3pGtoA_freq.txt - md5sum: 8dab75d51a4b943b501d0995169c767f + md5sum: 61c903b1504ed7d7182570dfc75e4498 - path: output/damageprofiler/test/5pCtoT_freq.txt - md5sum: fcc48ee5f72edff930d627c8bfdd8a5b + md5sum: 15a75b60ee519b61ce04a83fe3afe855 - path: output/damageprofiler/test/5p_freq_misincorporations.txt - md5sum: 54665474f5ef17dcc268567e5eaa7d86 + md5sum: 3b3240d6c1a3491e461b39199a9fcfe3 - path: output/damageprofiler/test/DamagePlot_five_prime.svg - path: output/damageprofiler/test/DamagePlot.pdf - path: output/damageprofiler/test/DamagePlot_three_prime.svg @@ -18,19 +18,93 @@ contains: - "FINISHED SUCCESSFULLY" - path: output/damageprofiler/test/dmgprof.json - md5sum: 98499024c7e937896e481f2d3cfbdd3e + md5sum: 2e54e712d2ae9e32c4c298e5fd8f60fe - path: output/damageprofiler/test/DNA_comp_genome.txt - md5sum: f91e70760d91a1193a27e360aaddf2fd + md5sum: fea48af1ecf491b439d36d4a919473df - path: output/damageprofiler/test/DNA_composition_sample.txt - md5sum: 1257eb3eb42484647bfba2151f9ef04f + md5sum: 9e17a0b1e5ad4eb13201cd24ad8507dd - path: output/damageprofiler/test/edit_distance.pdf - path: output/damageprofiler/test/edit_distance.svg - path: output/damageprofiler/test/editDistance.txt - md5sum: af2d2f4a99058ec56eae88ec27779e38 + md5sum: 04d14b449a5afa8b5dbff0dfa762356b - path: output/damageprofiler/test/Length_plot_combined_data.svg - path: output/damageprofiler/test/Length_plot_forward_reverse_separated.svg - path: output/damageprofiler/test/Length_plot.pdf - path: output/damageprofiler/test/lgdistribution.txt - md5sum: c5d029bf3a92b613310ee23f47d94981 + md5sum: df2e19195185ea9ee05e8e84b2948f36 - path: output/damageprofiler/test/misincorporation.txt - md5sum: 3aa6dd749010a492d92a815a83c196a8 + md5sum: bec0c5fc2fa9c82b04949e2d8b6e979c + +- name: damageprofiler_reference + command: nextflow run ./tests/modules/damageprofiler -entry test_damageprofiler_reference -c tests/config/nextflow.config -dump-channels + tags: + - damageprofiler + files: + - path: output/damageprofiler/test/3p_freq_misincorporations.txt + md5sum: de3b84d946a6b63cdcfadf82bf6854c0 + - path: output/damageprofiler/test/3pGtoA_freq.txt + md5sum: 61c903b1504ed7d7182570dfc75e4498 + - path: output/damageprofiler/test/5pCtoT_freq.txt + md5sum: 15a75b60ee519b61ce04a83fe3afe855 + - path: output/damageprofiler/test/5p_freq_misincorporations.txt + md5sum: 3b3240d6c1a3491e461b39199a9fcfe3 + - path: output/damageprofiler/test/DamagePlot_five_prime.svg + - path: output/damageprofiler/test/DamagePlot.pdf + - path: output/damageprofiler/test/DamagePlot_three_prime.svg + - path: output/damageprofiler/test/DamageProfiler.log + contains: + - "FINISHED SUCCESSFULLY" + - path: output/damageprofiler/test/dmgprof.json + md5sum: 2e54e712d2ae9e32c4c298e5fd8f60fe + - path: output/damageprofiler/test/DNA_comp_genome.txt + md5sum: fea48af1ecf491b439d36d4a919473df + - path: output/damageprofiler/test/DNA_composition_sample.txt + md5sum: 9e17a0b1e5ad4eb13201cd24ad8507dd + - path: output/damageprofiler/test/edit_distance.pdf + - path: output/damageprofiler/test/edit_distance.svg + - path: output/damageprofiler/test/editDistance.txt + md5sum: 04d14b449a5afa8b5dbff0dfa762356b + - path: output/damageprofiler/test/Length_plot_combined_data.svg + - path: output/damageprofiler/test/Length_plot_forward_reverse_separated.svg + - path: output/damageprofiler/test/Length_plot.pdf + - path: output/damageprofiler/test/lgdistribution.txt + md5sum: df2e19195185ea9ee05e8e84b2948f36 + - path: output/damageprofiler/test/misincorporation.txt + md5sum: bec0c5fc2fa9c82b04949e2d8b6e979c + +- name: damageprofiler_specieslist + command: nextflow run ./tests/modules/damageprofiler -entry test_damageprofiler_specieslist -c tests/config/nextflow.config -dump-channels + tags: + - damageprofiler + files: + - path: output/damageprofiler/test/chr22/3p_freq_misincorporations.txt + md5sum: de3b84d946a6b63cdcfadf82bf6854c0 + - path: output/damageprofiler/test/chr22/3pGtoA_freq.txt + md5sum: 61c903b1504ed7d7182570dfc75e4498 + - path: output/damageprofiler/test/chr22/5pCtoT_freq.txt + md5sum: 15a75b60ee519b61ce04a83fe3afe855 + - path: output/damageprofiler/test/chr22/5p_freq_misincorporations.txt + md5sum: 3b3240d6c1a3491e461b39199a9fcfe3 + - path: output/damageprofiler/test/chr22/DamagePlot_five_prime.svg + - path: output/damageprofiler/test/chr22/DamagePlot.pdf + - path: output/damageprofiler/test/chr22/DamagePlot_three_prime.svg + - path: output/damageprofiler/test/DamageProfiler.log + contains: + - "FINISHED SUCCESSFULLY" + - path: output/damageprofiler/test/chr22/dmgprof.json + md5sum: 2e54e712d2ae9e32c4c298e5fd8f60fe + - path: output/damageprofiler/test/chr22/DNA_comp_genome.txt + md5sum: fea48af1ecf491b439d36d4a919473df + - path: output/damageprofiler/test/chr22/DNA_composition_sample.txt + md5sum: 9e17a0b1e5ad4eb13201cd24ad8507dd + - path: output/damageprofiler/test/chr22/edit_distance.pdf + - path: output/damageprofiler/test/chr22/edit_distance.svg + - path: output/damageprofiler/test/chr22/editDistance.txt + md5sum: 04d14b449a5afa8b5dbff0dfa762356b + - path: output/damageprofiler/test/chr22/Length_plot_combined_data.svg + - path: output/damageprofiler/test/chr22/Length_plot_forward_reverse_separated.svg + - path: output/damageprofiler/test/chr22/Length_plot.pdf + - path: output/damageprofiler/test/chr22/lgdistribution.txt + md5sum: df2e19195185ea9ee05e8e84b2948f36 + - path: output/damageprofiler/test/chr22/misincorporation.txt + md5sum: bec0c5fc2fa9c82b04949e2d8b6e979c