From 07bd989bc6f8e663c9e10b998101d8ed86dc5d5f Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 12 Dec 2022 10:02:22 +0100
Subject: [PATCH 1/3] Start work on Groovy-native database checks

---
 conf/modules.config             |  8 --------
 modules/local/database_check.nf | 29 ---------------------------
 subworkflows/local/db_check.nf  | 35 +++++++++++++++++++++++++++------
 workflows/taxprofiler.nf        |  2 +-
 4 files changed, 30 insertions(+), 44 deletions(-)
 delete mode 100644 modules/local/database_check.nf

diff --git a/conf/modules.config b/conf/modules.config
index dd85c0c..017e0c9 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -12,14 +12,6 @@
 
 process {
 
-    withName: DATABASE_CHECK {
-        publishDir = [
-            path: { "${params.outdir}/pipeline_info" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
-    }
-
     withName: FASTQC {
         ext.args = '--quiet'
         ext.prefix = { "${meta.id}_${meta.run_accession}_raw" }
diff --git a/modules/local/database_check.nf b/modules/local/database_check.nf
deleted file mode 100644
index ccf611c..0000000
--- a/modules/local/database_check.nf
+++ /dev/null
@@ -1,29 +0,0 @@
-process DATABASE_CHECK {
-    tag "$databasesheet"
-    label 'process_single'
-
-    conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/python:3.8.3' :
-        'quay.io/biocontainers/python:3.8.3' }"
-
-    input:
-    path databasesheet
-
-    output:
-    path '*.csv'       , emit: csv
-    path "versions.yml", emit: versions
-
-    when:
-    task.ext.when == null || task.ext.when
-
-    script: // This script is bundled with the pipeline, in nf-core/taxprofiler/bin/
-    """
-    cat $databasesheet >> database_sheet.valid.csv
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        python: \$(python --version | sed 's/Python //g')
-    END_VERSIONS
-    """
-}
diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf
index 5d0c4eb..ce26905 100644
--- a/subworkflows/local/db_check.nf
+++ b/subworkflows/local/db_check.nf
@@ -2,7 +2,6 @@
 // Check input samplesheet and get read channels
 //
 
-include { DATABASE_CHECK } from '../../modules/local/database_check'
 include { UNTAR          } from '../../modules/nf-core/untar/main'
 
 workflow DB_CHECK {
@@ -10,15 +9,18 @@ workflow DB_CHECK {
     dbsheet // file: /path/to/dbsheet.csv
 
     main:
+    ch_versions = Channel.empty()
 
     // TODO: make database sheet check
    // Checks:
     // 1) no duplicates,
-    // 2) args do not have quotes, e.g. just `,,` and NOT `,"",`
-    parsed_samplesheet = DATABASE_CHECK ( dbsheet )
-        .csv
+
+    parsed_samplesheet = Channel.fromPath(dbsheet)
         .splitCsv ( header:true, sep:',' )
-        .map { create_db_channels(it) }
+        .map {
+            validate_db_sheet(it)
+            create_db_channels(it)
+        }
 
     ch_dbs_for_untar = parsed_samplesheet
         .branch {
@@ -29,12 +31,31 @@
 
     // TODO Filter to only run UNTAR on DBs of tools actually used?
     // TODO make optional whether to save
     UNTAR ( ch_dbs_for_untar.untar )
+    ch_versions = ch_versions.mix(UNTAR.out.versions.first())
 
     ch_final_dbs = ch_dbs_for_untar.skip.mix( UNTAR.out.untar )
 
     emit:
     dbs = ch_final_dbs // channel: [ val(meta), [ db ] ]
-    versions = DATABASE_CHECK.out.versions.mix(UNTAR.out.versions.first()) // channel: [ versions.yml ]
+    versions = ch_versions // channel: [ versions.yml ]
+}
+
+def validate_db_sheet(LinkedHashMap row){
+
+    // check minimum number of columns
+    if (row.size() < 4) exit 1, "[nf-core/taxprofiler] error: Invalid database input sheet - malformed row (e.g. missing column). See documentation for more information. Error in: ${row}"
+
+    // all columns there
+    def expected_headers = ['tool', 'db_name', 'db_params', 'db_path']
+    if ( !row.keySet().containsAll(expected_headers) ) exit 1, "[nf-core/taxprofiler] error: Invalid database input sheet - malformed column names. Please check input CSV. Column names should be: ${expected_headers.join(", ")}"
+
+    // valid tools specified // TODO finish list
+    def expected_tools = [ "bracken", "centrifuge", "diamond", "kaiju", "kraken2", "malt", "metaphlan3" ]
+
+    // detect quotes in params
+    if ( row.db_params.contains('"') ) exit 1, "[nf-core/taxprofiler] error: Invalid database db_params entry. No quotes allowed. Error in: ${row}"
+    if ( row.db_params.contains("'") ) exit 1, "[nf-core/taxprofiler] error: Invalid database db_params entry. No quotes allowed. Error in: ${row}"
+
 }
 
 def create_db_channels(LinkedHashMap row) {
@@ -51,3 +72,5 @@
 
     return array
 }
+
+
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index c9f002c..7bcfc3a 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -25,7 +25,7 @@ if ( params.input ) {
     exit 1, "Input samplesheet, or PEP config and base directory not specified"
 }
 
-if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
+if (params.databases) { ch_databases = file(params.databases, checkIfExists: true) } else { exit 1, 'Input database sheet not specified!' }
 
 if (params.shortread_qc_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files."
 if (params.shortread_qc_includeunmerged && !params.shortread_qc_mergepairs) exit 1, "ERROR: [nf-core/taxprofiler] cannot include unmerged reads when merging is not turned on. Please specify --shortread_qc_mergepairs"
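The within-row checks introduced above are plain Groovy over the maps that splitCsv(header:true) emits, so they can be exercised outside Nextflow. A minimal standalone sketch — not part of the patches; the sample rows are invented, and plain asserts stand in for Nextflow's `exit 1`:

// row_check_sketch.groovy — run with `groovy row_check_sketch.groovy`
// Each map mirrors one row as emitted by splitCsv(header:true, sep:',').
def expected_headers = ['tool', 'db_name', 'db_params', 'db_path']

def validateRow = { Map row ->
    // the same three within-row checks as validate_db_sheet above
    assert row.size() >= 4 : "malformed row (e.g. missing column): ${row}"
    assert row.keySet().containsAll(expected_headers) : "column names should be: ${expected_headers.join(', ')}"
    assert !row.db_params.contains('"') && !row.db_params.contains("'") : "no quotes allowed in db_params: ${row}"
    row
}

def good = [tool: 'kraken2', db_name: 'k2_standard', db_params: '', db_path: '/dbs/k2_standard']
def bad  = [tool: 'kraken2', db_name: 'k2_standard', db_params: '"--quick"', db_path: '/dbs/k2_standard']

println validateRow(good)  // a well-formed row passes through unchanged
try {
    validateRow(bad)       // a quoted db_params entry trips the assertion
} catch (AssertionError e) {
    println "rejected as expected: ${e.message}"
}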
From 460f3ca157bd618dde7dbc20f5c99bdd2bda731b Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 12 Dec 2022 11:03:40 +0100
Subject: [PATCH 2/3] Add database sheet validation

---
 subworkflows/local/db_check.nf  | 31 ++++++++++++++++++++-----------
 subworkflows/local/profiling.nf | 10 +++++-----
 2 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf
index ce26905..4ed786d 100644
--- a/subworkflows/local/db_check.nf
+++ b/subworkflows/local/db_check.nf
@@ -11,14 +11,22 @@ workflow DB_CHECK {
     main:
     ch_versions = Channel.empty()
 
-    // TODO: make database sheet check
-    // Checks:
-    // 1) no duplicates,
+    // special check for validity _between_ rows, for which we must group rows together
+    Channel.fromPath(dbsheet)
+        .splitCsv ( header:true, sep:',' )
+        .map { [it.tool, it.db_name] }
+        .groupTuple()
+        .map {
+            tool, db_name ->
+                def unique_names = db_name.unique(false)
+                if ( unique_names.size() < db_name.size() ) exit 1, "[nf-core/taxprofiler] ERROR: Each database for a tool must have a unique name, duplicates detected. Tool: ${tool}, Database names: ${unique_names}"
+        }
 
+    // normal checks for within-row validity, so these can live in separate functions
     parsed_samplesheet = Channel.fromPath(dbsheet)
         .splitCsv ( header:true, sep:',' )
         .map {
-            validate_db_sheet(it)
+            validate_db_rows(it)
             create_db_channels(it)
         }
@@ -40,21 +48,22 @@ workflow DB_CHECK {
     versions = ch_versions // channel: [ versions.yml ]
 }
 
-def validate_db_sheet(LinkedHashMap row){
+def validate_db_rows(LinkedHashMap row){
 
     // check minimum number of columns
-    if (row.size() < 4) exit 1, "[nf-core/taxprofiler] error: Invalid database input sheet - malformed row (e.g. missing column). See documentation for more information. Error in: ${row}"
+    if (row.size() < 4) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database input sheet - malformed row (e.g. missing column). See documentation for more information. Error in: ${row}"
 
     // all columns there
     def expected_headers = ['tool', 'db_name', 'db_params', 'db_path']
-    if ( !row.keySet().containsAll(expected_headers) ) exit 1, "[nf-core/taxprofiler] error: Invalid database input sheet - malformed column names. Please check input CSV. Column names should be: ${expected_headers.join(", ")}"
+    if ( !row.keySet().containsAll(expected_headers) ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database input sheet - malformed column names. Please check input CSV. Column names should be: ${expected_headers.join(", ")}"
 
     // valid tools specified // TODO finish list
-    def expected_tools = [ "bracken", "centrifuge", "diamond", "kaiju", "kraken2", "malt", "metaphlan3" ]
+    def expected_tools = [ "bracken", "centrifuge", "diamond", "kaiju", "kraken2", "krakenuniq", "malt", "metaphlan3", "motus" ]
+    if ( !expected_tools.contains(row.tool) ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid tool name. Please see documentation for all supported profilers. Error in: ${row}"
 
     // detect quotes in params
-    if ( row.db_params.contains('"') ) exit 1, "[nf-core/taxprofiler] error: Invalid database db_params entry. No quotes allowed. Error in: ${row}"
-    if ( row.db_params.contains("'") ) exit 1, "[nf-core/taxprofiler] error: Invalid database db_params entry. No quotes allowed. Error in: ${row}"
+    if ( row.db_params.contains('"') ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database db_params entry. No quotes allowed. Error in: ${row}"
+    if ( row.db_params.contains("'") ) exit 1, "[nf-core/taxprofiler] ERROR: Invalid database db_params entry. No quotes allowed. Error in: ${row}"
 
 }
 
@@ -66,7 +75,7 @@ def create_db_channels(LinkedHashMap row) {
     def array = []
 
     if (!file(row.db_path, type: 'dir').exists()) {
-        exit 1, "ERROR: Please check input samplesheet -> database could not be found!\n${row.db_path}"
+        exit 1, "ERROR: Please check input samplesheet -> database path could not be found!\n${row.db_path}"
     }
 
     array = [ meta, file(row.db_path) ]
diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf
index f5c970c..6613648 100644
--- a/subworkflows/local/profiling.nf
+++ b/subworkflows/local/profiling.nf
@@ -41,14 +41,14 @@ workflow PROFILING {
             }
         .combine(databases)
         .branch {
-            malt: it[2]['tool'] == 'malt'
-            kraken2: it[2]['tool'] == 'kraken2' || it[2]['tool'] == 'bracken' // to reuse the kraken module to produce the input data for bracken
-            metaphlan3: it[2]['tool'] == 'metaphlan3'
             centrifuge: it[2]['tool'] == 'centrifuge'
-            kaiju: it[2]['tool'] == 'kaiju'
             diamond: it[2]['tool'] == 'diamond'
-            motus: it[2]['tool'] == 'motus'
+            kaiju: it[2]['tool'] == 'kaiju'
+            kraken2: it[2]['tool'] == 'kraken2' || it[2]['tool'] == 'bracken' // to reuse the kraken module to produce the input data for bracken
             krakenuniq: it[2]['tool'] == 'krakenuniq'
+            malt: it[2]['tool'] == 'malt'
+            metaphlan3: it[2]['tool'] == 'metaphlan3'
+            motus: it[2]['tool'] == 'motus'
             unknown: true
         }
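The between-row duplicate check above is the less obvious half of the validation: groupTuple() collects every db_name per tool, and unique(false) makes a de-duplicated copy whose size is compared against the original. The same logic in a standalone Groovy sketch, with invented rows:

// duplicate_check_sketch.groovy — run with `groovy duplicate_check_sketch.groovy`
def rows = [
    [tool: 'kraken2', db_name: 'k2_standard'],
    [tool: 'kraken2', db_name: 'k2_standard'],  // duplicate name for the same tool
    [tool: 'malt',    db_name: 'malt_nt'],
]

// groupBy plays the role of groupTuple() on the channel
rows.groupBy { it.tool }.each { tool, toolRows ->
    def names        = toolRows*.db_name
    def unique_names = names.unique(false)  // false = return a copy, leave `names` untouched
    if ( unique_names.size() < names.size() )
        System.err.println "ERROR: duplicate database name for tool ${tool}: ${names}"
}

The `false` argument to unique() is load-bearing: called without it, unique() de-duplicates the list in place and returns the same object, so the two sizes would always be equal and the check could never fire.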
From 7c66968fef7ec7bd2ca44e5cb1571814817acf16 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 12 Dec 2022 11:04:21 +0100
Subject: [PATCH 3/3] More comments

---
 subworkflows/local/db_check.nf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/subworkflows/local/db_check.nf b/subworkflows/local/db_check.nf
index 4ed786d..14078d9 100644
--- a/subworkflows/local/db_check.nf
+++ b/subworkflows/local/db_check.nf
@@ -12,6 +12,7 @@ workflow DB_CHECK {
     ch_versions = Channel.empty()
 
     // special check for validity _between_ rows, for which we must group rows together
+    // note: this runs in parallel with the within-row checks; we can assume it is faster and will therefore fail first
     Channel.fromPath(dbsheet)
         .splitCsv ( header:true, sep:',' )
         .map { [it.tool, it.db_name] }
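With DATABASE_CHECK gone, validating the sheet no longer needs a container, and the subworkflow can be smoke-tested in isolation. A hypothetical harness, assuming the repository layout above — the entrypoint filename and the test sheet are invented for illustration:

// db_check_test.nf — hypothetical entrypoint, run with:
//   nextflow run db_check_test.nf --databases test_dbs.csv
include { DB_CHECK } from './subworkflows/local/db_check'

workflow {
    // checkIfExists mirrors the guard added to workflows/taxprofiler.nf in patch 1
    DB_CHECK ( file(params.databases, checkIfExists: true) )

    // each emission is [ meta, db ], where meta carries tool/db_name/db_params from the sheet
    DB_CHECK.out.dbs.view()
}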