From 77b5986463042de584c823976299d4b498212988 Mon Sep 17 00:00:00 2001 From: jasmezz Date: Wed, 25 May 2022 13:27:10 +0200 Subject: [PATCH 1/6] Create ampir module --- modules/ampir/main.nf | 43 ++++++++++++++++++++++ modules/ampir/meta.yml | 55 +++++++++++++++++++++++++++++ tests/config/pytest_modules.yml | 4 +++ tests/modules/ampir/main.nf | 20 +++++++++++ tests/modules/ampir/nextflow.config | 5 +++ tests/modules/ampir/test.yml | 9 +++++ 6 files changed, 136 insertions(+) create mode 100644 modules/ampir/main.nf create mode 100644 modules/ampir/meta.yml create mode 100644 tests/modules/ampir/main.nf create mode 100644 tests/modules/ampir/nextflow.config create mode 100644 tests/modules/ampir/test.yml diff --git a/modules/ampir/main.nf b/modules/ampir/main.nf new file mode 100644 index 00000000..e1720b83 --- /dev/null +++ b/modules/ampir/main.nf @@ -0,0 +1,43 @@ +process AMPIR { + tag "$meta.id" + label 'process_low' + + conda (params.enable_conda ? "conda-forge::r-ampir=1.1.0" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/r-ampir:1.1.0': + 'quay.io/biocontainers/r-ampir:1.1.0' }" + + input: + tuple val(meta), path(faa) + val cut_off + val model + val output_name + + output: + tuple val(meta), path(output_name) , emit: amps + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + #!/usr/bin/env Rscript + library(ampir) + + protein_seqs <- read_faa("${faa}") + prediction <- predict_amps(protein_seqs, model = '$model') + prediction <- protein_seqs[which(prediction\$prob_AMP >= as.integer($cut_off)), ] + df_to_faa(protein_seqs, '$output_name') + + version_file_path <- "versions.yml" + version_ampir <- paste(unlist(packageVersion("ampir")), collapse = ".") + f <- file(version_file_path, "w") + writeLines('"${task.process}":', f) + writeLines(" ampir: ", f, sep = "") + writeLines(version_ampir, f) + close(f) + """ +} diff --git a/modules/ampir/meta.yml b/modules/ampir/meta.yml new file mode 100644 index 00000000..b37ace09 --- /dev/null +++ b/modules/ampir/meta.yml @@ -0,0 +1,55 @@ +name: "ampir" +description: A fast and user-friendly method to predict antimicrobial peptides (AMPs) from any given size protein dataset. ampir uses a supervised statistical machine learning approach to predict AMPs. +keywords: + - ampir + - amp + - antimicrobial peptide prediction +tools: + - "ampir": + description: "A toolkit to predict antimicrobial peptides from protein sequences on a genome-wide scale." + homepage: "https://github.com/Legana/ampir" + documentation: "https://github.com/Legana/ampir" + tool_dev_url: "https://github.com/Legana/ampir" + doi: "10.1093/bioinformatics/btaa653" + licence: ["GPL v2"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - faa: + type: file + description: FASTA file containing amino acid sequences + pattern: "*.{faa,fasta}" + - model: + type: value + description: Model for AMP prediction + pattern: "{precursor,mature}" + - cut_off: + type: value + description: Cut-off for AMP prediction + pattern: "[0-9][0-9]" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - amps: + type: file + description: File containing AMP predictions + pattern: "prediction.fasta" + - output_name: + type: value + description: File name of the FASTA output file + pattern: "*.{faa,fasta}" + +authors: + - "@jasmezz" diff --git a/tests/config/pytest_modules.yml b/tests/config/pytest_modules.yml index 57f1a8a8..442a3cd4 100644 --- a/tests/config/pytest_modules.yml +++ b/tests/config/pytest_modules.yml @@ -26,6 +26,10 @@ allelecounter: - modules/allelecounter/** - tests/modules/allelecounter/** +ampir: + - modules/ampir/** + - tests/modules/ampir/** + amplify/predict: - modules/amplify/predict/** - tests/modules/amplify/predict/** diff --git a/tests/modules/ampir/main.nf b/tests/modules/ampir/main.nf new file mode 100644 index 00000000..769ebb3f --- /dev/null +++ b/tests/modules/ampir/main.nf @@ -0,0 +1,20 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +include { AMPIR } from '../../../modules/ampir/main.nf' + +workflow test_ampir { + + fasta = [ [ id:'test', single_end:false ], // meta map + file(params.test_data['candidatus_portiera_aleyrodidarum']['genome']['proteome_fasta'], checkIfExists: true), + ] + + cut_off = "80" + + model = "precursor" + + output_name = "prediction.fasta" + + AMPIR ( fasta, cut_off, model, output_name ) +} diff --git a/tests/modules/ampir/nextflow.config b/tests/modules/ampir/nextflow.config new file mode 100644 index 00000000..50f50a7a --- /dev/null +++ b/tests/modules/ampir/nextflow.config @@ -0,0 +1,5 @@ +process { + + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + +} \ No newline at end of file diff --git a/tests/modules/ampir/test.yml b/tests/modules/ampir/test.yml new file mode 100644 index 00000000..663ba675 --- /dev/null +++ b/tests/modules/ampir/test.yml @@ -0,0 +1,9 @@ +- name: "ampir" + command: nextflow run ./tests/modules/ampir -entry test_ampir -c ./tests/config/nextflow.config -c ./tests/modules/ampir/nextflow.config + tags: + - "ampir" + files: + - path: output/ampir/prediction.fasta + md5sum: e605d38752fd90261c924d51f7007189 + - path: output/ampir/versions.yml + md5sum: 4a11d25b8a904a7ffb34ae88f6826888 From ba53e09b6416305e12b071e16e604e9a96f94d59 Mon Sep 17 00:00:00 2001 From: jasmezz Date: Thu, 9 Jun 2022 15:37:35 +0200 Subject: [PATCH 2/6] Implement suggestions from code review --- modules/ampir/main.nf | 15 ++++++++------- modules/ampir/meta.yml | 14 +++++++------- tests/modules/ampir/main.nf | 4 +--- tests/modules/ampir/test.yml | 10 ++++++---- 4 files changed, 22 insertions(+), 21 deletions(-) diff --git a/modules/ampir/main.nf b/modules/ampir/main.nf index e1720b83..412317b7 100644 --- a/modules/ampir/main.nf +++ b/modules/ampir/main.nf @@ -11,11 +11,11 @@ process AMPIR { tuple val(meta), path(faa) val cut_off val model - val output_name output: - tuple val(meta), path(output_name) , emit: amps - path "versions.yml" , emit: versions + tuple val(meta), path("*.faa"), emit: amps_faa + tuple val(meta), path("*.csv"), emit: amps_csv + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -27,10 +27,11 @@ process AMPIR { #!/usr/bin/env Rscript library(ampir) - protein_seqs <- read_faa("${faa}") - prediction <- predict_amps(protein_seqs, model = '$model') - prediction <- protein_seqs[which(prediction\$prob_AMP >= as.integer($cut_off)), ] - df_to_faa(protein_seqs, '$output_name') + protein_seqs <- read_faa('${faa}') + prediction <- predict_amps(protein_seqs, model = '${model}') + prediction <- protein_seqs[which(prediction\$prob_AMP >= as.integer(${cut_off})), ] + df_to_faa(protein_seqs, "${prefix}.faa") + write.table(prediction, file = "${prefix}.csv", row.names = FALSE, quote = FALSE, dec = '.') version_file_path <- "versions.yml" version_ampir <- paste(unlist(packageVersion("ampir")), collapse = ".") diff --git a/modules/ampir/meta.yml b/modules/ampir/meta.yml index b37ace09..30e55e99 100644 --- a/modules/ampir/meta.yml +++ b/modules/ampir/meta.yml @@ -8,7 +8,7 @@ tools: - "ampir": description: "A toolkit to predict antimicrobial peptides from protein sequences on a genome-wide scale." homepage: "https://github.com/Legana/ampir" - documentation: "https://github.com/Legana/ampir" + documentation: "https://cran.r-project.org/web/packages/ampir/index.html" tool_dev_url: "https://github.com/Legana/ampir" doi: "10.1093/bioinformatics/btaa653" licence: ["GPL v2"] @@ -42,14 +42,14 @@ output: type: file description: File containing software versions pattern: "versions.yml" - - amps: + - amps_faa: type: file - description: File containing AMP predictions - pattern: "prediction.fasta" - - output_name: - type: value - description: File name of the FASTA output file + description: File containing AMP predictions in amino acid FASTA format pattern: "*.{faa,fasta}" + - amps_csv: + type: file + description: File containing AMP predictions in CSV format + pattern: "*.csv" authors: - "@jasmezz" diff --git a/tests/modules/ampir/main.nf b/tests/modules/ampir/main.nf index 769ebb3f..cc7b07c8 100644 --- a/tests/modules/ampir/main.nf +++ b/tests/modules/ampir/main.nf @@ -14,7 +14,5 @@ workflow test_ampir { model = "precursor" - output_name = "prediction.fasta" - - AMPIR ( fasta, cut_off, model, output_name ) + AMPIR ( fasta, cut_off, model ) } diff --git a/tests/modules/ampir/test.yml b/tests/modules/ampir/test.yml index 663ba675..40bedd37 100644 --- a/tests/modules/ampir/test.yml +++ b/tests/modules/ampir/test.yml @@ -1,9 +1,11 @@ -- name: "ampir" - command: nextflow run ./tests/modules/ampir -entry test_ampir -c ./tests/config/nextflow.config -c ./tests/modules/ampir/nextflow.config +- name: ampir test_ampir + command: nextflow run ./tests/modules/ampir -entry test_ampir -c ./tests/config/nextflow.config -c ./tests/modules/ampir/nextflow.config tags: - - "ampir" + - ampir files: - - path: output/ampir/prediction.fasta + - path: output/ampir/test.csv + md5sum: 063fd8866f6a669457d7a49404fb449f + - path: output/ampir/test.faa md5sum: e605d38752fd90261c924d51f7007189 - path: output/ampir/versions.yml md5sum: 4a11d25b8a904a7ffb34ae88f6826888 From ecf1d146ab3c3a60fdc5daf0fee18a26c9105c24 Mon Sep 17 00:00:00 2001 From: jasmezz Date: Thu, 9 Jun 2022 16:59:00 +0200 Subject: [PATCH 3/6] Add full user options for AMPir --- modules/ampir/main.nf | 15 +++++++++------ modules/ampir/meta.yml | 8 ++++++-- tests/modules/ampir/main.nf | 8 +++++--- tests/modules/ampir/test.yml | 4 ++-- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/modules/ampir/main.nf b/modules/ampir/main.nf index 412317b7..11fc88cf 100644 --- a/modules/ampir/main.nf +++ b/modules/ampir/main.nf @@ -9,8 +9,9 @@ process AMPIR { input: tuple val(meta), path(faa) - val cut_off val model + val min_length + val min_probability output: tuple val(meta), path("*.faa"), emit: amps_faa @@ -23,15 +24,17 @@ process AMPIR { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" + min_length = ("${min_length}" == "[]") ? "": " min_len = as.integer(${min_length})," // Fall back to AMPir default value if none specified """ #!/usr/bin/env Rscript library(ampir) - protein_seqs <- read_faa('${faa}') - prediction <- predict_amps(protein_seqs, model = '${model}') - prediction <- protein_seqs[which(prediction\$prob_AMP >= as.integer(${cut_off})), ] - df_to_faa(protein_seqs, "${prefix}.faa") - write.table(prediction, file = "${prefix}.csv", row.names = FALSE, quote = FALSE, dec = '.') + input_seqs <- read_faa('${faa}') + prediction <- predict_amps(input_seqs,${min_length} model = '${model}') + prediction <- prediction[which(prediction\$prob_AMP >= as.numeric(${min_probability})), ] + output_seqs <- input_seqs[row.names(prediction), ] + write.table(prediction, file = "${prefix}.csv", row.names = FALSE, sep = ";", quote = FALSE, dec = '.') + df_to_faa(output_seqs, "${prefix}.faa") version_file_path <- "versions.yml" version_ampir <- paste(unlist(packageVersion("ampir")), collapse = ".") diff --git a/modules/ampir/meta.yml b/modules/ampir/meta.yml index 30e55e99..0b3850c0 100644 --- a/modules/ampir/meta.yml +++ b/modules/ampir/meta.yml @@ -25,9 +25,13 @@ input: pattern: "*.{faa,fasta}" - model: type: value - description: Model for AMP prediction + description: Built-in model for AMP prediction pattern: "{precursor,mature}" - - cut_off: + - min_length: + type: value + description: Minimum protein length for which predictions will be generated + pattern: "[0-9]+" + - min_probability: type: value description: Cut-off for AMP prediction pattern: "[0-9][0-9]" diff --git a/tests/modules/ampir/main.nf b/tests/modules/ampir/main.nf index cc7b07c8..bee2ff90 100644 --- a/tests/modules/ampir/main.nf +++ b/tests/modules/ampir/main.nf @@ -10,9 +10,11 @@ workflow test_ampir { file(params.test_data['candidatus_portiera_aleyrodidarum']['genome']['proteome_fasta'], checkIfExists: true), ] - cut_off = "80" - model = "precursor" - AMPIR ( fasta, cut_off, model ) + min_length = [] + + min_probability = "0.7" + + AMPIR ( fasta, model, min_length, min_probability ) } diff --git a/tests/modules/ampir/test.yml b/tests/modules/ampir/test.yml index 40bedd37..4077ef14 100644 --- a/tests/modules/ampir/test.yml +++ b/tests/modules/ampir/test.yml @@ -4,8 +4,8 @@ - ampir files: - path: output/ampir/test.csv - md5sum: 063fd8866f6a669457d7a49404fb449f + md5sum: a3b3152373a8add3613ace8c71073e68 - path: output/ampir/test.faa - md5sum: e605d38752fd90261c924d51f7007189 + md5sum: 0435609144022c55ac196db053f0df89 - path: output/ampir/versions.yml md5sum: 4a11d25b8a904a7ffb34ae88f6826888 From 5841f11edbbae29f4904a2994e53d1e81233efc4 Mon Sep 17 00:00:00 2001 From: jasmezz Date: Thu, 9 Jun 2022 18:15:27 +0200 Subject: [PATCH 4/6] Replace MD5 with contains check --- tests/modules/ampir/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/modules/ampir/test.yml b/tests/modules/ampir/test.yml index 4077ef14..3c6a2f64 100644 --- a/tests/modules/ampir/test.yml +++ b/tests/modules/ampir/test.yml @@ -4,7 +4,7 @@ - ampir files: - path: output/ampir/test.csv - md5sum: a3b3152373a8add3613ace8c71073e68 + contains: ["seq_name;seq_aa;prob_AMP", "WP_014895017.1"] - path: output/ampir/test.faa md5sum: 0435609144022c55ac196db053f0df89 - path: output/ampir/versions.yml From 39388abc0473ca5c7a282ec0ea7f63ba7c439053 Mon Sep 17 00:00:00 2001 From: jasmezz Date: Fri, 10 Jun 2022 12:54:39 +0200 Subject: [PATCH 5/6] Add file name collision check + TSV output --- modules/ampir/main.nf | 5 +++-- modules/ampir/meta.yml | 6 +++--- tests/modules/ampir/test.yml | 4 ++-- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/modules/ampir/main.nf b/modules/ampir/main.nf index 11fc88cf..97ea0bbb 100644 --- a/modules/ampir/main.nf +++ b/modules/ampir/main.nf @@ -15,7 +15,7 @@ process AMPIR { output: tuple val(meta), path("*.faa"), emit: amps_faa - tuple val(meta), path("*.csv"), emit: amps_csv + tuple val(meta), path("*.tsv"), emit: amps_csv path "versions.yml" , emit: versions when: @@ -25,6 +25,7 @@ process AMPIR { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" min_length = ("${min_length}" == "[]") ? "": " min_len = as.integer(${min_length})," // Fall back to AMPir default value if none specified + if ("$faa" == "${prefix}.faa") error "Input and output names are the same, set prefix in module configuration to disambiguate!" """ #!/usr/bin/env Rscript library(ampir) @@ -33,7 +34,7 @@ process AMPIR { prediction <- predict_amps(input_seqs,${min_length} model = '${model}') prediction <- prediction[which(prediction\$prob_AMP >= as.numeric(${min_probability})), ] output_seqs <- input_seqs[row.names(prediction), ] - write.table(prediction, file = "${prefix}.csv", row.names = FALSE, sep = ";", quote = FALSE, dec = '.') + write.table(prediction, file = "${prefix}.tsv", row.names = FALSE, sep = "\t", quote = FALSE, dec = '.') df_to_faa(output_seqs, "${prefix}.faa") version_file_path <- "versions.yml" diff --git a/modules/ampir/meta.yml b/modules/ampir/meta.yml index 0b3850c0..33189d66 100644 --- a/modules/ampir/meta.yml +++ b/modules/ampir/meta.yml @@ -49,11 +49,11 @@ output: - amps_faa: type: file description: File containing AMP predictions in amino acid FASTA format - pattern: "*.{faa,fasta}" + pattern: "*.{faa}" - amps_csv: type: file - description: File containing AMP predictions in CSV format - pattern: "*.csv" + description: File containing AMP predictions in TSV format + pattern: "*.tsv" authors: - "@jasmezz" diff --git a/tests/modules/ampir/test.yml b/tests/modules/ampir/test.yml index 3c6a2f64..54921e9a 100644 --- a/tests/modules/ampir/test.yml +++ b/tests/modules/ampir/test.yml @@ -3,8 +3,8 @@ tags: - ampir files: - - path: output/ampir/test.csv - contains: ["seq_name;seq_aa;prob_AMP", "WP_014895017.1"] + - path: output/ampir/test.tsv + contains: ["seq_name\tseq_aa\tprob_AMP", "WP_014895017.1"] - path: output/ampir/test.faa md5sum: 0435609144022c55ac196db053f0df89 - path: output/ampir/versions.yml From 01d4a1fa3a0f6c87b02a50f450f5e85464eaa7e4 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Fri, 10 Jun 2022 14:46:58 +0200 Subject: [PATCH 6/6] Apply suggestions from code review --- modules/ampir/main.nf | 2 +- modules/ampir/meta.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/ampir/main.nf b/modules/ampir/main.nf index 97ea0bbb..57e20902 100644 --- a/modules/ampir/main.nf +++ b/modules/ampir/main.nf @@ -15,7 +15,7 @@ process AMPIR { output: tuple val(meta), path("*.faa"), emit: amps_faa - tuple val(meta), path("*.tsv"), emit: amps_csv + tuple val(meta), path("*.tsv"), emit: amps_tsv path "versions.yml" , emit: versions when: diff --git a/modules/ampir/meta.yml b/modules/ampir/meta.yml index 33189d66..7569ca69 100644 --- a/modules/ampir/meta.yml +++ b/modules/ampir/meta.yml @@ -50,7 +50,7 @@ output: type: file description: File containing AMP predictions in amino acid FASTA format pattern: "*.{faa}" - - amps_csv: + - amps_tsv: type: file description: File containing AMP predictions in TSV format pattern: "*.tsv"