diff --git a/README.md b/README.md
index ddbaddf..11eb9a3 100644
--- a/README.md
+++ b/README.md
@@ -75,6 +75,8 @@ On release, automated continuous integration tests run the pipeline on a full-si
 nextflow run nf-core/taxprofiler --input samplesheet.csv --databases database.csv --outdir <OUTDIR> --run_<TOOL1> --run_<TOOL2> -profile <PROFILE>
 ```

+Note that the pipeline supports both CSV and PEP input sample sheets. Find out more about PEP [here](http://pep.databio.org/en/2.1.0/specification/).
+
 ## Documentation

 The nf-core/taxprofiler pipeline comes with documentation about the pipeline [usage](https://nf-co.re/taxprofiler/usage), [parameters](https://nf-co.re/taxprofiler/parameters) and [output](https://nf-co.re/taxprofiler/output).
diff --git a/assets/samplesheet_schema.yaml b/assets/samplesheet_schema.yaml
new file mode 100644
index 0000000..88ff451
--- /dev/null
+++ b/assets/samplesheet_schema.yaml
@@ -0,0 +1,55 @@
+description: A schema for validation of the samplesheet.csv for the taxprofiler pipeline.
+imports:
+  - https://schema.databio.org/pep/2.1.0.yaml
+properties:
+  samples:
+    type: array
+    items:
+      type: object
+      properties:
+        sample:
+          type: string
+          description: "Sample identifier."
+          pattern: "^\\S*$"
+        run_accession:
+          type: string
+          description: "Run accession number."
+        instrument_platform:
+          type: string
+          description: "Name of the platform that sequenced the samples."
+          enum:
+            [
+              "ABI_SOLID",
+              "BGISEQ",
+              "CAPILLARY",
+              "COMPLETE_GENOMICS",
+              "DNBSEQ",
+              "HELICOS",
+              "ILLUMINA",
+              "ION_TORRENT",
+              "LS454",
+              "OXFORD_NANOPORE",
+              "PACBIO_SMRT",
+            ]
+        fastq1:
+          type: ["string", "null"]
+          description: "Optional FASTQ file for read 1 of paired-end sequenced libraries."
+          pattern: "^[\\S]+\\.(fq\\.gz|fastq\\.gz)$"
+        fastq2:
+          type: ["string", "null"]
+          description: "Optional FASTQ file for read 2 of paired-end sequenced libraries."
+          pattern: "^[\\S]+\\.(fq\\.gz|fastq\\.gz)$"
+        fasta:
+          type: ["string", "null"]
+          description: "Optional FASTA file."
+          pattern: "^[\\S]+\\.(fa\\.gz|fasta\\.gz)$"
+      required:
+        - sample
+        - run_accession
+        - instrument_platform
+      files:
+        - fastq1
+        - fastq2
+        - fasta
+required:
+  - samples
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
deleted file mode 100755
index ca54ed9..0000000
--- a/bin/check_samplesheet.py
+++ /dev/null
@@ -1,236 +0,0 @@
-#!/usr/bin/env python
-
-from distutils import extension
-import os
-import sys
-import errno
-import argparse
-
-
-def parse_args(args=None):
-    Description = "Reformat nf-core/taxprofiler samplesheet file and check its contents."
- - Epilog = "Example usage: python check_samplesheet.py " - - parser = argparse.ArgumentParser(description=Description, epilog=Epilog) - parser.add_argument("FILE_IN", help="Input samplesheet file.") - parser.add_argument("FILE_OUT", help="Output file.") - return parser.parse_args(args) - - -def make_dir(path): - if len(path) > 0: - try: - os.makedirs(path) - except OSError as exception: - if exception.errno != errno.EEXIST: - raise exception - - -def print_error(error, context="Line", context_str=""): - error_str = "ERROR: Please check samplesheet -> {}".format(error) - if context != "" and context_str != "": - error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format( - error, context.strip(), context_str.strip() - ) - print(error_str) - sys.exit(1) - - -def check_samplesheet(file_in, file_out): - """ - This function checks that the samplesheet follows the following structure: - - sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta - 2611,ERR5766174,ILLUMINA,,,ERX5474930_ERR5766174_1.fa.gz - 2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz, - 2612,ERR5766174,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,, - 2613,ERR5766181,ILLUMINA,ERX5474937_ERR5766181_1.fastq.gz,ERX5474937_ERR5766181_2.fastq.gz, - """ - - FQ_EXTENSIONS = (".fq.gz", ".fastq.gz") - FA_EXTENSIONS = ( - ".fa", - ".fa.gz", - ".fasta", - ".fasta.gz", - ".fna", - ".fna.gz", - ".fas", - ".fas.gz", - ) - INSTRUMENT_PLATFORMS = [ - "ABI_SOLID", - "BGISEQ", - "CAPILLARY", - "COMPLETE_GENOMICS", - "DNBSEQ", - "HELICOS", - "ILLUMINA", - "ION_TORRENT", - "LS454", - "OXFORD_NANOPORE", - "PACBIO_SMRT", - ] - - sample_mapping_dict = {} - with open(file_in, "r") as fin: - - ## Check header - MIN_COLS = 4 - HEADER = [ - "sample", - "run_accession", - "instrument_platform", - "fastq_1", - "fastq_2", - "fasta", - ] - header = [x.strip('"') for x in fin.readline().strip().split(",")] - - ## Check for missing mandatory columns - missing_columns = list(set(HEADER) - set(header)) - if len(missing_columns) > 0: - print( - "ERROR: Missing required column header -> {}. Note some columns can otherwise be empty. 
See pipeline documentation (https://nf-co.re/taxprofiler/usage).".format( - ",".join(missing_columns) - ) - ) - sys.exit(1) - - ## Find locations of mandatory columns - header_locs = {} - for i in HEADER: - header_locs[i] = header.index(i) - - ## Check sample entries - for line in fin: - - ## Pull out only relevant columns for downstream checking - line_parsed = [x.strip().strip('"') for x in line.strip().split(",")] - lspl = [line_parsed[i] for i in header_locs.values()] - - # Check valid number of columns per row - if len(lspl) < len(HEADER): - print_error( - "Invalid number of columns (minimum = {})!".format(len(HEADER)), - "Line", - line, - ) - num_cols = len([x for x in lspl if x]) - if num_cols < MIN_COLS: - print_error( - "Invalid number of populated columns (minimum = {})!".format(MIN_COLS), - "Line", - line, - ) - - ## Check sample name entries - - ( - sample, - run_accession, - instrument_platform, - fastq_1, - fastq_2, - fasta, - ) = lspl[: len(HEADER)] - sample = sample.replace(" ", "_") - if not sample: - print_error("Sample entry has not been specified!", "Line", line) - - ## Check FastQ file extension - for fastq in [fastq_1, fastq_2]: - if fastq: - if fastq.find(" ") != -1: - print_error("FastQ file contains spaces!", "Line", line) - if not fastq.endswith(FQ_EXTENSIONS): - print_error( - f"FastQ file does not have extension {' or '.join(list(FQ_EXTENSIONS))} !", - "Line", - line, - ) - if fasta: - if fasta.find(" ") != -1: - print_error("FastA file contains spaces!", "Line", line) - if not fasta.endswith(FA_EXTENSIONS): - print_error( - f"FastA file does not have extension {' or '.join(list(FA_EXTENSIONS))}!", - "Line", - line, - ) - sample_info = [] - - # Check run_accession - if not run_accession: - print_error("Run accession has not been specified!", "Line", line) - else: - sample_info.append(run_accession) - - # Check instrument_platform - if not instrument_platform: - print_error("Instrument platform has not been specified!", "Line", line) - else: - if instrument_platform not in INSTRUMENT_PLATFORMS: - print_error( - f"Instrument platform {instrument_platform} is not supported!", - f"List of supported platforms {', '.join(INSTRUMENT_PLATFORMS)}", - "Line", - line, - ) - sample_info.append(instrument_platform) - - ## Auto-detect paired-end/single-end - if sample and fastq_1 and fastq_2: ## Paired-end short reads - sample_info.extend(["0", fastq_1, fastq_2, fasta]) - elif sample and fastq_1 and not fastq_2: ## Single-end short/long fastq reads - sample_info.extend(["1", fastq_1, fastq_2, fasta]) - elif sample and fasta and not fastq_1 and not fastq_2: ## Single-end long reads - sample_info.extend(["1", fastq_1, fastq_2, fasta]) - elif fasta and (fastq_1 or fastq_2): - print_error( - "FastQ and FastA files cannot be specified together in the same library!", - "Line", - line, - ) - else: - print_error("Invalid combination of columns provided!", "Line", line) - - ## Create sample mapping dictionary = { sample: [ run_accession, instrument_platform, single_end, fastq_1, fastq_2 , fasta ] } - if sample not in sample_mapping_dict: - sample_mapping_dict[sample] = [sample_info] - else: - if sample_info in sample_mapping_dict[sample]: - print_error("Samplesheet contains duplicate rows!", "Line", line) - else: - sample_mapping_dict[sample].append(sample_info) - - ## Write validated samplesheet with appropriate columns - HEADER_OUT = [ - "sample", - "run_accession", - "instrument_platform", - "single_end", - "fastq_1", - "fastq_2", - "fasta", - ] - if len(sample_mapping_dict) > 0: - 
out_dir = os.path.dirname(file_out)
-        make_dir(out_dir)
-        with open(file_out, "w") as fout:
-            fout.write(",".join(HEADER_OUT) + "\n")
-            for sample in sorted(sample_mapping_dict.keys()):
-                for idx, val in enumerate(sample_mapping_dict[sample]):
-                    fout.write(f"{sample},{','.join(val)}\n")
-    else:
-        print_error("No entries to process!", "Samplesheet: {}".format(file_in))
-
-
-def main(args=None):
-    args = parse_args(args)
-    check_samplesheet(args.FILE_IN, args.FILE_OUT)
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/conf/modules.config b/conf/modules.config
index 6099422..d2a0051 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -12,14 +12,6 @@

 process {

-    withName: SAMPLESHEET_CHECK {
-        publishDir = [
-            path: { "${params.outdir}/pipeline_info" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
-    }
-
     withName: DATABASE_CHECK {
         publishDir = [
             path: { "${params.outdir}/pipeline_info" },
@@ -450,4 +442,12 @@ process {
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
     }
+
+    withName: 'EIDO_VALIDATE' {
+        ext.args = '--st-index sample'
+    }
+
+    withName: 'EIDO_CONVERT' {
+        ext.args = '--st-index sample'
+    }
 }
diff --git a/conf/test.config b/conf/test.config
index a39a107..d5dcd67 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -57,4 +57,10 @@ process {
     withName: MEGAN_RMA2INFO_KRONA {
         maxForks = 1
     }
+    withName: 'EIDO_VALIDATE' {
+        ext.args = '--st-index sample'
+    }
+    withName: 'EIDO_CONVERT' {
+        ext.args = '--st-index sample'
+    }
 }
diff --git a/conf/test_pep.config b/conf/test_pep.config
new file mode 100644
index 0000000..7f8c95d
--- /dev/null
+++ b/conf/test_pep.config
@@ -0,0 +1,43 @@
+params {
+    config_profile_name        = 'Test PEP profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function with a PEP file as input.'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    input                              = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/pep/test_pep_format_files/config.yaml'
+    databases                          = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database.csv'
+    perform_shortread_qc               = true
+    perform_longread_qc                = true
+    perform_shortread_complexityfilter = true
+    perform_shortread_hostremoval      = true
+    perform_longread_hostremoval       = true
+    perform_runmerging                 = true
+    hostremoval_reference              = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta'
+    run_kaiju                          = true
+    run_kraken2                        = true
+    run_malt                           = true
+    run_metaphlan3                     = true
+    run_centrifuge                     = true
+    run_diamond                        = true
+    run_motus                          = false
+    run_krona                          = true
+    krona_taxonomy_directory           = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/metagenome/krona_taxonomy.tab'
+    malt_save_reads                    = true
+    kraken2_save_reads                 = true
+    centrifuge_save_reads              = true
+    diamond_save_reads                 = true
+}
+
+
+process {
+    withName: MALT_RUN {
+        maxForks = 1
+    }
+    withName: MEGAN_RMA2INFO {
+        maxForks = 1
+    }
+}
diff --git a/docs/usage.md b/docs/usage.md
index 8ae5257..252c84d 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -22,6 +22,10 @@ This samplesheet is then specified on the command line as follows:
 --input '[path to samplesheet file]' --databases '[path to database sheet file]'
 ```

+Note that the pipeline supports both CSV and PEP input sample sheets. Find out more about PEP [here](http://pep.databio.org/en/2.1.0/specification/).
+When using a PEP as input, the `samplesheet.csv` must be placed in the same folder as the `config.yaml` file, and the path to `samplesheet.csv` within the config must be absolute.
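+
+For illustration, a minimal PEP `config.yaml` pointing at the standard samplesheet might look like the sketch below. The two keys come from the PEP 2.1.0 specification; the path is a placeholder to adapt:
+
+```yaml
+pep_version: 2.1.0
+sample_table: /absolute/path/to/samplesheet.csv
+```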
+
 ### Multiple runs of the same sample

 The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate different runs FASTQ files of the same sample before performing profiling, when `--perform_runmerging` is supplied. Below is an example for the same sample sequenced across 3 lanes:
@@ -277,6 +281,9 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof
 - `test`
   - A profile with a complete configuration for automated testing
   - Includes links to test data so needs no other parameters
+- `test_pep`
+  - A profile with a complete configuration for running the pipeline with a PEP as input
+  - Includes links to test data so needs no other parameters

 ### `-resume`
diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy
index c90cf49..7883d70 100755
--- a/lib/WorkflowMain.groovy
+++ b/lib/WorkflowMain.groovy
@@ -75,7 +75,7 @@ class WorkflowMain {

         // Check input has been provided
         if (!params.input) {
-            log.error "Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'"
+            log.error "Please provide an input samplesheet or PEP to the pipeline e.g. '--input samplesheet.csv'"
             System.exit(1)
         }
     }
diff --git a/modules.json b/modules.json
index f174b6f..ee96e49 100644
--- a/modules.json
+++ b/modules.json
@@ -41,6 +41,14 @@
             "branch": "master",
             "git_sha": "3531824af826c16cd252bc5aa82ae169b244ebaa"
         },
+        "eido/convert": {
+            "branch": "master",
+            "git_sha": "9764eef361ded86e9242075bda64c2662421386a"
+        },
+        "eido/validate": {
+            "branch": "master",
+            "git_sha": "38383cfaefc06cd35e25de99989a3e6ab9ed2980"
+        },
         "fastp": {
             "branch": "master",
             "git_sha": "2c70c1c1951aaf884d2e8d8d9c871db79f7b35aa"
diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf
deleted file mode 100644
index dea4362..0000000
--- a/modules/local/samplesheet_check.nf
+++ /dev/null
@@ -1,27 +0,0 @@
-process SAMPLESHEET_CHECK {
-    tag "$samplesheet"
-
-    conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/python:3.8.3' :
-        'quay.io/biocontainers/python:3.8.3' }"
-
-    input:
-    path samplesheet
-
-    output:
-    path '*.csv'       , emit: csv
-    path "versions.yml", emit: versions
-
-    script: // This script is bundled with the pipeline, in nf-core/taxprofiler/bin/
-    """
-    check_samplesheet.py \\
-        $samplesheet \\
-        samplesheet.valid.csv
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        python: \$(python --version | sed 's/Python //g')
-    END_VERSIONS
-    """
-}
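For reference, the samplesheet layout checked by the deleted script is unchanged and is now validated by `assets/samplesheet_schema.yaml` above; the example rows below are taken from the deleted script's docstring:

```csv
sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
2611,ERR5766174,ILLUMINA,,,ERX5474930_ERR5766174_1.fa.gz
2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz,
2612,ERR5766174,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,,
2613,ERR5766181,ILLUMINA,ERX5474937_ERR5766181_1.fastq.gz,ERX5474937_ERR5766181_2.fastq.gz,
```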
diff --git a/modules/nf-core/modules/eido/convert/main.nf b/modules/nf-core/modules/eido/convert/main.nf
new file mode 100644
index 0000000..be4c02f
--- /dev/null
+++ b/modules/nf-core/modules/eido/convert/main.nf
@@ -0,0 +1,38 @@
+process EIDO_CONVERT {
+    tag "$samplesheet"
+    label 'process_single'
+
+    conda (params.enable_conda ? "conda-forge::eido=0.1.9" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://containers.biocontainers.pro/s3/SingImgsRepo/eido/0.1.9_cv1/eido_0.1.9_cv1.sif' :
+        'biocontainers/eido:0.1.9_cv1' }"
+
+    input:
+    path samplesheet
+    val  format
+    path pep_input_base_dir
+
+    output:
+    path "versions.yml"        , emit: versions
+    path "${prefix}.${format}" , emit: samplesheet_converted
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    prefix   = task.ext.prefix ?: "samplesheet_converted"
+    """
+    eido \\
+        convert \\
+        -f $format \\
+        $samplesheet \\
+        $args \\
+        -p samples=${prefix}.${format}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        eido: \$(echo \$(eido --version 2>&1) | sed 's/^.*eido //;s/ .*//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/modules/eido/convert/meta.yml b/modules/nf-core/modules/eido/convert/meta.yml
new file mode 100644
index 0000000..bd12e03
--- /dev/null
+++ b/modules/nf-core/modules/eido/convert/meta.yml
@@ -0,0 +1,39 @@
+name: "eido_convert"
+description: Convert any PEP project or Nextflow samplesheet to any format
+keywords:
+  - eido
+  - convert
+  - PEP
+  - format
+  - samplesheet
+tools:
+  - "eido":
+      description: "Convert any PEP project or Nextflow samplesheet to any format"
+      homepage: "http://eido.databio.org/en/latest/"
+      documentation: "http://eido.databio.org/en/latest/"
+      doi: "10.1093/gigascience/giab077"
+      licence: "BSD-2-Clause"
+
+input:
+  - samplesheet:
+      type: file
+      description: Nextflow samplesheet or PEP project
+      pattern: "*.{yaml,yml,csv}"
+  - format:
+      type: value
+      description: Extension of the output file
+  - pep_input_base_dir:
+      type: file
+      description: Optional path to the directory where files specified in a PEP config file are stored. Any paths specified in the config will need to be relative to this base directory.
+
+output:
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - samplesheet_converted:
+      type: file
+      description: PEP project or samplesheet converted to a CSV file
+
+authors:
+  - "@rafalstepien"
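Outside Nextflow, the conversion wrapped by `EIDO_CONVERT` above amounts to a single eido call. A sketch with placeholder file names, using the `--st-index sample` argument that `conf/modules.config` passes via `ext.args`:

```bash
# Convert a PEP config (or a CSV samplesheet) into a plain CSV sample table,
# mirroring the command assembled in the module's script block.
eido convert -f csv config.yaml --st-index sample -p samples=samplesheet_converted.csv
```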
diff --git a/modules/nf-core/modules/eido/validate/main.nf b/modules/nf-core/modules/eido/validate/main.nf
new file mode 100644
index 0000000..e564e83
--- /dev/null
+++ b/modules/nf-core/modules/eido/validate/main.nf
@@ -0,0 +1,33 @@
+process EIDO_VALIDATE {
+    tag "$samplesheet"
+    label 'process_single'
+
+    conda (params.enable_conda ? "conda-forge::eido=0.1.9" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://containers.biocontainers.pro/s3/SingImgsRepo/eido/0.1.9_cv2/eido_0.1.9_cv2.sif' :
+        'biocontainers/eido:0.1.9_cv2' }"
+
+    input:
+    path samplesheet
+    path schema
+    path pep_input_base_dir
+
+    output:
+    path "versions.yml", emit: versions
+    path "*.log"       , emit: log
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args   = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "validation"
+    """
+    eido validate $args $samplesheet -s $schema -e > ${prefix}.log
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        eido: \$(echo \$(eido --version 2>&1) | sed 's/^.*eido //;s/ .*//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/modules/eido/validate/meta.yml b/modules/nf-core/modules/eido/validate/meta.yml
new file mode 100644
index 0000000..eb7b295
--- /dev/null
+++ b/modules/nf-core/modules/eido/validate/meta.yml
@@ -0,0 +1,41 @@
+name: "eido_validate"
+description: Validate a samplesheet or PEP config against a schema
+keywords:
+  - eido
+  - validate
+  - schema
+  - format
+  - pep
+tools:
+  - "validate":
+      description: "Validate a samplesheet or PEP config against a schema."
+      homepage: "http://eido.databio.org/en/latest/"
+      documentation: "http://eido.databio.org/en/latest/"
+      doi: "10.1093/gigascience/giab077"
+      licence: "BSD-2-Clause"
+
+input:
+  - samplesheet:
+      type: file
+      description: Samplesheet or PEP file to be validated
+      pattern: "*.{yaml,yml,csv}"
+  - schema:
+      type: file
+      description: Schema that the samplesheet will be validated against
+      pattern: "*.{yaml,yml}"
+  - pep_input_base_dir:
+      type: file
+      description: Optional path to the directory where files specified in a PEP config file are stored. Any paths specified in the config will need to be relative to this base directory.
+
+output:
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - log:
+      type: file
+      description: File containing the validation log.
+      pattern: "*.log"
+
+authors:
+  - "@rafalstepien"
diff --git a/nextflow.config b/nextflow.config
index df5b90b..80fce40 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -227,6 +227,7 @@ profiles {
     test_nopreprocessing { includeConfig 'conf/test_nopreprocessing.config' }
     test_nothing         { includeConfig 'conf/test_nothing.config' }
     test_motus           { includeConfig 'conf/test_motus.config' }
+    test_pep             { includeConfig 'conf/test_pep.config' }
 }
diff --git a/nextflow_schema.json b/nextflow_schema.json
index eb839ec..4a9237d 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -10,13 +10,13 @@
             "type": "object",
             "fa_icon": "fas fa-terminal",
             "description": "Define where the pipeline should find input data and save output data.",
-            "required": ["input", "databases", "outdir"],
+            "required": ["input", "outdir", "databases"],
             "properties": {
                 "input": {
                     "type": "string",
                     "format": "file-path",
                     "mimetype": "text/csv",
-                    "pattern": "^\\S+\\.csv$",
+                    "pattern": "^\\S+\\.(csv|yaml|yml)$",
                     "schema": "assets/schema_input.json",
                     "description": "Path to comma-separated file containing information about the samples and libraries/runs.",
                     "help_text": "You will need to create a design file with information about the samples and libraries/runs you want to run in your pipeline run. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/usage#samplesheet-input).",
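Likewise, the validation wrapped by `EIDO_VALIDATE` above corresponds roughly to the following call (placeholder paths; the `-e` flag is carried over from the module script):

```bash
# Validate the input against the pipeline's PEP schema and keep the log,
# as the module does into validation.log.
eido validate --st-index sample samplesheet.csv -s assets/samplesheet_schema.yaml -e > validation.log
```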
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index eb21b9d..e8d5e7a 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -2,31 +2,41 @@
 // Check input samplesheet and get read channels
 //

-include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check'
+include { EIDO_VALIDATE } from '../../modules/nf-core/modules/eido/validate/main'
+include { EIDO_CONVERT  } from '../../modules/nf-core/modules/eido/convert/main'

 workflow INPUT_CHECK {
     take:
-    samplesheet // file: /path/to/samplesheet.csv
+    samplesheet_or_pep_config // file: /path/to/samplesheet.csv or /path/to/pep/config.yaml
+    pep_input_base_dir

     main:
-    parsed_samplesheet = SAMPLESHEET_CHECK ( samplesheet )
-        .csv
+    ch_versions = Channel.empty()
+
+    EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), pep_input_base_dir )
+    ch_versions = ch_versions.mix(EIDO_VALIDATE.out.versions)
+
+    EIDO_CONVERT ( samplesheet_or_pep_config, "csv", pep_input_base_dir )
+    ch_versions = ch_versions.mix(EIDO_CONVERT.out.versions)
+
+    ch_parsed_samplesheet = EIDO_CONVERT.out.samplesheet_converted
         .splitCsv ( header:true, sep:',' )
+        .map { check_missing_and_singleend_autodetect(it) }
         .branch {
             fasta: it['fasta'] != ''
             nanopore: it['instrument_platform'] == 'OXFORD_NANOPORE'
             fastq: true
         }

-    parsed_samplesheet.fastq
+    ch_parsed_samplesheet.fastq
         .map { create_fastq_channel(it) }
         .set { fastq }

-    parsed_samplesheet.nanopore
+    ch_parsed_samplesheet.nanopore
         .map { create_fastq_channel(it) }
         .set { nanopore }

-    parsed_samplesheet.fasta
+    ch_parsed_samplesheet.fasta
         .map { create_fasta_channel(it) }
         .set { fasta }

@@ -34,7 +44,20 @@ workflow INPUT_CHECK {
     fastq = fastq ?: []          // channel: [ val(meta), [ reads ] ]
     nanopore = nanopore ?: []    // channel: [ val(meta), [ reads ] ]
     fasta = fasta ?: []          // channel: [ val(meta), fasta ]
-    versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
+    versions = ch_versions       // channel: [ versions.yml ]
+}
+
+// Function to validate the input sheet and auto-detect single-end/paired-end libraries
+def check_missing_and_singleend_autodetect(LinkedHashMap row) {
+
+    // Checks that are not currently expressible in the eido/PEP schema
+    if ( ( row['fastq_1'] != "" || row['fastq_2'] != "" ) && row['fasta'] != "" ) { exit 1, "[nf-core/taxprofiler] ERROR: FastQ and FastA files cannot be specified together in the same library. Check input samplesheet! Check sample: ${row['sample']}" }
+    if ( row['fastq_1'] == "" && row['fastq_2'] != "" ) { exit 1, "[nf-core/taxprofiler] ERROR: Input samplesheet has a missing fastq_1 when fastq_2 is specified. Check sample: ${row['sample']}" }
+
+    // A library without a fastq_2 entry is single-end
+    row['single_end'] = row['fastq_2'] == ""
+
+    return row
 }

 // Function to get list of [ meta, [ fastq_1, fastq_2 ] ]
@@ -64,11 +87,12 @@ def create_fastq_channel(LinkedHashMap row) {
             if (!file(row.fastq_2).exists()) {
                 exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
             }
-            fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
+            fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
         }
     }
     return fastq_meta
+
 }

 // Function to get list of [ meta, fasta ]
 def create_fasta_channel(LinkedHashMap row) {
     def meta = [:]
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index 223ba15..6f7becf 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -17,7 +17,13 @@ def checkPathParamList = [ params.input, params.databases, params.hostremoval_re
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }

 // Check mandatory parameters
-if (params.input ) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
+if ( params.input ) {
+    ch_input           = file(params.input, checkIfExists: true)
+    pep_input_base_dir = file(params.input).extension.matches("yaml|yml") ? file(file(params.input).getParent(), checkIfExists: true) : []
+} else {
+    exit 1, "Input samplesheet or PEP config not specified!"
+}
+
 if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }

 if (params.shortread_qc_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files."
@@ -98,7 +104,7 @@ workflow TAXPROFILER {
         SUBWORKFLOW: Read in samplesheet, validate and stage input files
     */
     INPUT_CHECK (
-        ch_input
+        ch_input, pep_input_base_dir
     )
     ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
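With the `test_pep` profile registered in `nextflow.config` above, the new PEP input path can be exercised end-to-end with something like the following; the container profile and output directory are placeholders:

```bash
# Smoke-test the PEP input path using the bundled test profile.
nextflow run nf-core/taxprofiler -profile test_pep,docker --outdir ./results
```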