From 1584d6fc517a34acf19bfd29352b0645480f95a7 Mon Sep 17 00:00:00 2001
From: Rafal Stepien <43926522+rafalstepien@users.noreply.github.com>
Date: Mon, 19 Sep 2022 09:51:25 -0400
Subject: [PATCH 1/3] Update workflows/taxprofiler.nf

Co-authored-by: James A. Fellows Yates
---
 nextflow_schema.json              |  4 ++--
 subworkflows/local/input_check.nf |  6 +++---
 workflows/taxprofiler.nf          | 22 +++++-----------------
 3 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index d5309ed..4a9237d 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -10,13 +10,13 @@
             "type": "object",
             "fa_icon": "fas fa-terminal",
             "description": "Define where the pipeline should find input data and save output data.",
-            "required": [ "input", "outdir", "databases"],
+            "required": ["input", "outdir", "databases"],
             "properties": {
                 "input": {
                     "type": "string",
                     "format": "file-path",
                     "mimetype": "text/csv",
-                    "pattern": "^\\S+\\.(csv|yaml)$",
+                    "pattern": "^\\S+\\.(csv|yaml|yml)$",
                     "schema": "assets/schema_input.json",
                     "description": "Path to comma-separated file containing information about the samples and libraries/runs.",
                     "help_text": "You will need to create a design file with information about the samples and libraries/runs you want to running in your pipeline run. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/usage#samplesheet-input).",
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 0a07538..447eb15 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -9,11 +9,11 @@ include { EIDO_CONVERT } from '../../modules/nf-core/modules/eido/convert/main'
 workflow INPUT_CHECK {
     take:
     samplesheet_or_pep_config // file: /path/to/samplesheet.csv or /path/to/pep/config.yaml
-    ch_pep_input_base_dir
+    pep_input_base_dir

     main:
-    EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), ch_pep_input_base_dir )
-    converted_samplesheet = EIDO_CONVERT ( samplesheet_or_pep_config, "csv", ch_pep_input_base_dir )
+    EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), pep_input_base_dir )
+    converted_samplesheet = EIDO_CONVERT ( samplesheet_or_pep_config, "csv", pep_input_base_dir )
     parsed_samplesheet = SAMPLESHEET_CHECK ( converted_samplesheet.samplesheet_converted )
         .csv
         .splitCsv ( header:true, sep:',' )
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index b88b286..6f7becf 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -17,23 +17,11 @@ def checkPathParamList = [ params.input, params.databases, params.hostremoval_re
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }

 // Check mandatory parameters
-if ( params.input.endsWith(".yaml") ) {
-
-    if ( params.input.startsWith("http://") || params.input.startsWith("https://") ) {
-        ch_input = file(params.input)
-        ch_pep_input_base_dir = []
-    }
-    else {
-        ch_input = file(params.input)
-        ch_pep_input_base_dir = new File(params.input).getParent()
-    }
-
-} else if ( params.input.endsWith(".csv") ) {
-    ch_input = file(params.input)
-    ch_pep_input_base_dir = []
-
+if ( params.input ) {
+    ch_input = file(params.input, checkIfExists: true)
+    pep_input_base_dir = file(params.input).extension.matches("yaml|yml") ? file(file(params.input).getParent(), checkIfExists: true) : []
 } else {
-    exit 1, 'Input samplesheet or PEP config not specified!'
+    exit 1, "Input samplesheet, or PEP config and base directory not specified"
 }

 if (params.databases) { ch_databases = file(params.databases) } else { exit 1, 'Input database sheet not specified!' }
@@ -116,7 +104,7 @@ workflow TAXPROFILER {
         SUBWORKFLOW: Read in samplesheet, validate and stage input files
     */
     INPUT_CHECK (
-        ch_input, ch_pep_input_base_dir
+        ch_input, pep_input_base_dir
     )
     ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)

From 43a8aa4405e67e0616d63f71ef217ecb7b2e350b Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Tue, 27 Sep 2022 15:32:51 +0200
Subject: [PATCH 2/3] Remove detect_reads.py and replace remaining checks with
 nextflow code instead

---
 assets/samplesheet_schema.yaml     |   2 +
 bin/detect_reads.py                | 125 -----------------------------
 conf/modules.config                |   8 --
 modules/local/samplesheet_check.nf |  27 -------
 subworkflows/local/input_check.nf  |  34 +++++---
 5 files changed, 27 insertions(+), 169 deletions(-)
 delete mode 100644 bin/detect_reads.py
 delete mode 100644 modules/local/samplesheet_check.nf

diff --git a/assets/samplesheet_schema.yaml b/assets/samplesheet_schema.yaml
index 88ff451..366ee93 100644
--- a/assets/samplesheet_schema.yaml
+++ b/assets/samplesheet_schema.yaml
@@ -53,3 +53,5 @@ properties:
       - fasta
 required:
   - samples
+  - run_accession
+  - instrument_platform
diff --git a/bin/detect_reads.py b/bin/detect_reads.py
deleted file mode 100644
index 8a1430e..0000000
--- a/bin/detect_reads.py
+++ /dev/null
@@ -1,125 +0,0 @@
-#!/usr/bin/env python
-
-import argparse
-import csv
-import sys
-from typing import List, NoReturn
-
-
-def parse_args(args=None) -> argparse.Namespace:
-    """
-    Reformatting is based on detecting whether the reads are paired or single end.
-    Script appends appropriate column to samplesheet.csv file.
-    """
-    Description = "Reformat nf-core/taxprofiler samplesheet file."
- Epilog = "Example usage: python detect_reads.py " - - parser = argparse.ArgumentParser(description=Description, epilog=Epilog) - parser.add_argument("FILE_IN", help="Input samplesheet file.") - parser.add_argument("FILE_OUT", help="Output file.") - return parser.parse_args(args) - - -class ReadsModifier: - def __init__(self): - self.headers = None - self.sample_index = None - self.fastq_1_index = None - self.fastq_2_index = None - self.fasta_index = None - - def detect_reads_and_reformat(self, input_file_path: str, output_file_path: str) -> NoReturn: - NEW_COLUMN_NAME = "single_end" - new_file_rows = [] - - with open(input_file_path, "r") as input_file: - csv_reader = csv.reader(input_file, delimiter=",") - self.headers = next(csv_reader) - self.headers.append(NEW_COLUMN_NAME) - - self._infer_column_indexes() - - for samplesheet_row in csv_reader: - - if self._is_paired_end_short_read(samplesheet_row): - new_file_rows.append([*samplesheet_row, "0"]) - - elif self._is_single_end_short_long_read(samplesheet_row): - new_file_rows.append([*samplesheet_row, "1"]) - - elif self._is_single_end_long_read(samplesheet_row): - new_file_rows.append([*samplesheet_row, "1"]) - - elif self._is_error_row(samplesheet_row): - self.print_error( - "FastQ and FastA files cannot be specified together in the same library!", - "Line", - ",".join(samplesheet_row), - ) - else: - self.print_error("Invalid combination of columns provided!", "Line", ",".join(samplesheet_row)) - - self.save_reformatted_samplesheet([self.headers] + new_file_rows, output_file_path) - - def _get_row_values(self, samplesheet_row): - """ - This method extracts data from the columns for given row of samplesheet table, based on - previously infered column indexes. - """ - sample = samplesheet_row[self.sample_index] - fastq_1 = samplesheet_row[self.fastq_1_index] if self.fastq_1_index else None - fastq_2 = samplesheet_row[self.fastq_2_index] if self.fastq_2_index else None - fasta = samplesheet_row[self.fasta_index] if self.fasta_index else None - return sample, fastq_1, fastq_2, fasta - - def _infer_column_indexes(self): - """ - This method infers indexes of necessary columns from samplesheet table - """ - self.sample_index = self.headers.index("sample") - self.fastq_1_index = self.headers.index("fastq_1") if "fastq_1" in self.headers else None - self.fastq_2_index = self.headers.index("fastq_2") if "fastq_2" in self.headers else None - self.fasta_index = self.headers.index("fasta") if "fasta" in self.headers else None - - def _is_paired_end_short_read(self, samplesheet_row: List) -> bool: - sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row) - return sample and fastq_1 and fastq_2 - - def _is_single_end_short_long_read(self, samplesheet_row: List) -> bool: - sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row) - return sample and fastq_1 and not fastq_2 - - def _is_single_end_long_read(self, samplesheet_row: List) -> bool: - sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row) - return sample and fasta and not fastq_1 and not fastq_2 - - def _is_error_row(self, samplesheet_row: List) -> bool: - sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row) - return fasta and (fastq_1 or fastq_2) - - @staticmethod - def print_error(error: str, context: str = "Line", context_str: str = ""): - error_str = "ERROR: Please check samplesheet -> {}".format(error) - if context != "" and context_str != "": - error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format( - error, 
context.strip(), context_str.strip() - ) - print(error_str) - sys.exit(1) - - @staticmethod - def save_reformatted_samplesheet(new_file_rows: List[List], output_file_path: str) -> NoReturn: - """ - Write new samplesheet. - """ - with open(output_file_path, "w") as output_file: - csv.writer(output_file).writerows(new_file_rows) - - -def main(args=None): - args = parse_args(args) - ReadsModifier().detect_reads_and_reformat(args.FILE_IN, args.FILE_OUT) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/conf/modules.config b/conf/modules.config index 0efd251..73cc042 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -12,14 +12,6 @@ process { - withName: SAMPLESHEET_CHECK { - publishDir = [ - path: { "${params.outdir}/pipeline_info" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: DATABASE_CHECK { publishDir = [ path: { "${params.outdir}/pipeline_info" }, diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf deleted file mode 100644 index 91e0f04..0000000 --- a/modules/local/samplesheet_check.nf +++ /dev/null @@ -1,27 +0,0 @@ -process SAMPLESHEET_CHECK { - tag "$samplesheet" - - conda (params.enable_conda ? "conda-forge::python=3.8.3" : null) - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'quay.io/biocontainers/python:3.8.3' }" - - input: - path samplesheet - - output: - path '*.csv' , emit: csv - path "versions.yml", emit: versions - - script: // detect_reads.py script is bundled with the pipeline, in nf-core/taxprofiler/bin/ - """ - python3 $projectDir/bin/detect_reads.py \\ - $samplesheet \\ - samplesheet_validated.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 447eb15..d54d2ad 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -2,7 +2,6 @@ // Check input samplesheet and get read channels // -include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' include { EIDO_VALIDATE } from '../../modules/nf-core/modules/eido/validate/main' include { EIDO_CONVERT } from '../../modules/nf-core/modules/eido/convert/main' @@ -12,26 +11,43 @@ workflow INPUT_CHECK { pep_input_base_dir main: + ch_versions = Channel.empty() + EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), pep_input_base_dir ) - converted_samplesheet = EIDO_CONVERT ( samplesheet_or_pep_config, "csv", pep_input_base_dir ) - parsed_samplesheet = SAMPLESHEET_CHECK ( converted_samplesheet.samplesheet_converted ) - .csv + ch_versions = ch_versions.mix(EIDO_VALIDATE.out.versions) + + EIDO_CONVERT ( samplesheet_or_pep_config, "csv", pep_input_base_dir ) + ch_versions = ch_versions.mix(EIDO_CONVERT.out.versions) + + ch_parsed_samplesheet = EIDO_CONVERT.out.samplesheet_converted .splitCsv ( header:true, sep:',' ) + .map{ + + // Checks not supported by EIDO(?) + if ( ( it['fastq_1'] != "" || it['fastq_2'] != "" ) && it['fasta'] != "" ) { exit 1, "[nf-core/taxprofiler] ERROR: FastQ and FastA files cannot be specified together in the same library. Check input samplesheet! 
Check sample: ${it['sample']}" } + if ( it['fastq_1'] == "" && it['fastq_2'] != "" ) { exit 1, "[nf-core/taxprofiler] ERROR: Input samplesheet has a missing fastq_1 when fastq_2 is specified. Check sample: ${it['sample']}" } + + single_end = it['fastq_2'] == "" ? true : false + it['single_end'] = single_end + + [ it ] + } + .flatten() .branch { fasta: it['fasta'] != '' nanopore: it['instrument_platform'] == 'OXFORD_NANOPORE' fastq: true } - parsed_samplesheet.fastq + ch_parsed_samplesheet.fastq .map { create_fastq_channel(it) } .set { fastq } - parsed_samplesheet.nanopore + ch_parsed_samplesheet.nanopore .map { create_fastq_channel(it) } .set { nanopore } - parsed_samplesheet.fasta + ch_parsed_samplesheet.fasta .map { create_fasta_channel(it) } .set { fasta } @@ -39,7 +55,7 @@ workflow INPUT_CHECK { fastq = fastq ?: [] // channel: [ val(meta), [ reads ] ] nanopore = nanopore ?: [] // channel: [ val(meta), [ reads ] ] fasta = fasta ?: [] // channel: [ val(meta), fasta ] - versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] + versions = ch_versions // channel: [ versions.yml ] } // Function to get list of [ meta, [ fastq_1, fastq_2 ] ] @@ -69,7 +85,7 @@ def create_fastq_channel(LinkedHashMap row) { if (!file(row.fastq_2).exists()) { exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" } - fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] + fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] } } From 87edc4569cef3290531ac0dbe56b1c407de26381 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 27 Sep 2022 15:46:17 +0200 Subject: [PATCH 3/3] Move to a function --- subworkflows/local/input_check.nf | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index d54d2ad..e8d5e7a 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -21,18 +21,7 @@ workflow INPUT_CHECK { ch_parsed_samplesheet = EIDO_CONVERT.out.samplesheet_converted .splitCsv ( header:true, sep:',' ) - .map{ - - // Checks not supported by EIDO(?) - if ( ( it['fastq_1'] != "" || it['fastq_2'] != "" ) && it['fasta'] != "" ) { exit 1, "[nf-core/taxprofiler] ERROR: FastQ and FastA files cannot be specified together in the same library. Check input samplesheet! Check sample: ${it['sample']}" } - if ( it['fastq_1'] == "" && it['fastq_2'] != "" ) { exit 1, "[nf-core/taxprofiler] ERROR: Input samplesheet has a missing fastq_1 when fastq_2 is specified. Check sample: ${it['sample']}" } - - single_end = it['fastq_2'] == "" ? true : false - it['single_end'] = single_end - - [ it ] - } - .flatten() + .map { check_missing_and_singleend_autodetect(it) } .branch { fasta: it['fasta'] != '' nanopore: it['instrument_platform'] == 'OXFORD_NANOPORE' @@ -58,6 +47,19 @@ workflow INPUT_CHECK { versions = ch_versions // channel: [ versions.yml ] } +// Function to validate input sheet and auto-detect R1/R2 +def check_missing_and_singleend_autodetect(LinkedHashMap row) { + + // Checks not supported by EIDO(?) + if ( ( row['fastq_1'] != "" || row['fastq_2'] != "" ) && row['fasta'] != "" ) { exit 1, "[nf-core/taxprofiler] ERROR: FastQ and FastA files cannot be specified together in the same library. Check input samplesheet! Check sample: ${row['sample']}" } + if ( row['fastq_1'] == "" && row['fastq_2'] != "" ) { exit 1, "[nf-core/taxprofiler] ERROR: Input samplesheet has a missing fastq_1 when fastq_2 is specified. 
Check sample: ${row['sample']}" } + + single_end = row['fastq_2'] == "" ? true : false + row['single_end'] = single_end + + return row +} + // Function to get list of [ meta, [ fastq_1, fastq_2 ] ] def create_fastq_channel(LinkedHashMap row) { // create meta map @@ -90,6 +92,7 @@ def create_fastq_channel(LinkedHashMap row) { } return fastq_meta + }// Function to get list of [ meta, fasta ] def create_fasta_channel(LinkedHashMap row) { def meta = [:]
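
Note for reviewers: the row-level logic that replaces detect_reads.py can be exercised outside the pipeline. Below is a minimal, hypothetical sketch (a standalone test.nf; the sample names, platforms, and file paths are invented, and it assumes Nextflow DSL2) of the same single_end auto-detection and branch routing that INPUT_CHECK now performs natively:

#!/usr/bin/env nextflow
// Hypothetical standalone sketch, not part of the patches above: it mimics
// the splitCsv -> map -> branch chain on three invented samplesheet rows.
nextflow.enable.dsl = 2

workflow {
    Channel
        .of(
            [ sample: 'sampleA', instrument_platform: 'ILLUMINA',        fastq_1: 'a_R1.fastq.gz', fastq_2: 'a_R2.fastq.gz', fasta: '' ],
            [ sample: 'sampleB', instrument_platform: 'OXFORD_NANOPORE', fastq_1: 'b.fastq.gz',    fastq_2: '',              fasta: '' ],
            [ sample: 'sampleC', instrument_platform: 'ILLUMINA',        fastq_1: '',              fastq_2: '',              fasta: 'c.fasta' ]
        )
        .map { row ->
            // same rule as check_missing_and_singleend_autodetect():
            // an empty fastq_2 column means the library is single-end
            row['single_end'] = row['fastq_2'] == ''
            row
        }
        .branch {
            fasta:    it['fasta'] != ''
            nanopore: it['instrument_platform'] == 'OXFORD_NANOPORE'
            fastq:    true
        }
        .set { rows }

    // a row is claimed by the first branch whose condition is true
    rows.fasta.view    { "fasta:    ${it.sample}" }
    rows.nanopore.view { "nanopore: ${it.sample}" }
    rows.fastq.view    { "fastq:    ${it.sample} (single_end=${it.single_end})" }
}

Because the branch conditions are evaluated top to bottom, running this with `nextflow run test.nf` should report sampleC under fasta, sampleB under nanopore, and sampleA falling through to fastq with single_end=false, mirroring how the subworkflow hands libraries to the downstream profiling steps.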