
Merge branch 'add-pep-support' of github.com:nf-core/taxprofiler into nf-core-add-pep-support

commit 00aa94a403
Author: Rafal Stepien
Date:   2022-09-28 14:51:13 -04:00
5 changed files with 30 additions and 161 deletions

@@ -53,3 +53,5 @@ properties:
       - fasta
   required:
     - samples
+    - run_accession
+    - instrument_platform
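With run_accession and instrument_platform now required for every sample, a converted samplesheet has to populate both columns. The rows below are purely illustrative (sample names, accessions and file paths are invented, assuming the usual sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta column order):

    sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
    sampleA,run01,ILLUMINA,sampleA_run01_R1.fastq.gz,sampleA_run01_R2.fastq.gz,
    sampleB,run02,OXFORD_NANOPORE,sampleB_run02.fastq.gz,,
    sampleC,run03,ILLUMINA,,,sampleC_run03.fa.gz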

@@ -1,117 +0,0 @@
#!/usr/bin/env python

import argparse
import csv
import logging
import sys
from enum import Enum
from typing import List, NoReturn, Optional


class ColumnNames(str, Enum):
    SAMPLE = "sample"
    FASTQ_1 = "fastq_1"
    FASTQ_2 = "fastq_2"
    FASTA = "fasta"
    SINGLE_END = "single_end"


def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
    """
    Reformatting is based on detecting whether the reads are paired or single end.
    Script appends appropriate column to samplesheet.csv file.
    """
    parser = argparse.ArgumentParser(
        description="Reformat nf-core/taxprofiler samplesheet file.",
        epilog="Example usage: python detect_reads.py <FILE_IN> <FILE_OUT>",
    )
    parser.add_argument("FILE_IN", help="Input samplesheet file.")
    parser.add_argument("FILE_OUT", help="Output file.")
    return parser.parse_args(args)


class ReadsModifier:
    def __init__(self):
        self.headers = None
        self.sample_index = None
        self.fastq_1_index = None
        self.fastq_2_index = None
        self.fasta_index = None

    def detect_reads_and_reformat(self, input_file_path: str, output_file_path: str) -> NoReturn:
        new_file_rows = []

        with open(input_file_path, "r", newline="") as input_file:
            csv_reader = csv.DictReader(input_file, delimiter=",")
            self.headers = csv_reader.fieldnames
            self.headers.append("single_end")

            for samplesheet_row in csv_reader:
                if self._is_paired_end_short_read(samplesheet_row):
                    samplesheet_row[ColumnNames.SINGLE_END] = "0"
                    new_file_rows.append(samplesheet_row.values())

                elif self._is_single_end_short_long_read(samplesheet_row):
                    samplesheet_row[ColumnNames.SINGLE_END] = "1"
                    new_file_rows.append(samplesheet_row.values())

                elif self._is_single_end_long_read(samplesheet_row):
                    samplesheet_row[ColumnNames.SINGLE_END] = "1"
                    new_file_rows.append(samplesheet_row.values())

                elif self._is_error_row(samplesheet_row):
                    logging.error(
                        "FastQ and FastA files cannot be specified together in the same library!",
                        "Line",
                        ",".join(samplesheet_row.values()),
                    )
                else:
                    logging.error(
                        "Invalid combination of columns provided!", "Line", ",".join(samplesheet_row.values())
                    )

        ReadsModifier.save_reformatted_samplesheet([self.headers] + new_file_rows, output_file_path)

    def _get_row_values(self, samplesheet_row: dict):
        """
        This method extracts data from the columns for given row of samplesheet table.
        """
        return (
            samplesheet_row.get(ColumnNames.SAMPLE),
            samplesheet_row.get(ColumnNames.FASTQ_1),
            samplesheet_row.get(ColumnNames.FASTQ_2),
            samplesheet_row.get(ColumnNames.FASTA),
        )

    def _is_paired_end_short_read(self, samplesheet_row: dict) -> bool:
        sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row)
        return sample and fastq_1 and fastq_2

    def _is_single_end_short_long_read(self, samplesheet_row: dict) -> bool:
        sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row)
        return sample and fastq_1 and not fastq_2

    def _is_single_end_long_read(self, samplesheet_row: dict) -> bool:
        sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row)
        return sample and fasta and not fastq_1 and not fastq_2

    def _is_error_row(self, samplesheet_row: dict) -> bool:
        sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row)
        return fasta and (fastq_1 or fastq_2)

    @classmethod
    def save_reformatted_samplesheet(cls, new_file_rows: List[List], output_file_path: str) -> NoReturn:
        """
        Write new samplesheet.
        """
        with open(output_file_path, "w") as output_file:
            csv.writer(output_file).writerows(new_file_rows)


def main(args=None):
    args = parse_args(args)
    ReadsModifier().detect_reads_and_reformat(args.FILE_IN, args.FILE_OUT)


if __name__ == "__main__":
    sys.exit(main())
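For context, this deleted script only inferred a single_end flag from which read columns were filled and appended it as an extra column. The rows below are invented, purely to show the transformation (paired-end FASTQ gets 0, FASTQ-only or FASTA-only gets 1):

    input:  sampleA,run01,ILLUMINA,sampleA_R1.fastq.gz,sampleA_R2.fastq.gz,
    output: sampleA,run01,ILLUMINA,sampleA_R1.fastq.gz,sampleA_R2.fastq.gz,,0
    input:  sampleC,run03,ILLUMINA,,,sampleC.fa.gz
    output: sampleC,run03,ILLUMINA,,,sampleC.fa.gz,1

The same detection now happens in Groovy inside the INPUT_CHECK subworkflow (see check_missing_and_singleend_autodetect further down).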

@@ -12,14 +12,6 @@
 process {
-    withName: SAMPLESHEET_CHECK {
-        publishDir = [
-            path: { "${params.outdir}/pipeline_info" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
-    }
-
     withName: DATABASE_CHECK {
         publishDir = [
             path: { "${params.outdir}/pipeline_info" },

@@ -1,27 +0,0 @@
process SAMPLESHEET_CHECK {
    tag "$samplesheet"

    conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/python:3.8.3' :
        'quay.io/biocontainers/python:3.8.3' }"

    input:
    path samplesheet

    output:
    path '*.csv'       , emit: csv
    path "versions.yml", emit: versions

    script: // detect_reads.py script is bundled with the pipeline, in nf-core/taxprofiler/bin/
    """
    python3 $projectDir/bin/detect_reads.py \\
        $samplesheet \\
        samplesheet_validated.csv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version | sed 's/Python //g')
    END_VERSIONS
    """
}

@@ -2,7 +2,6 @@
 // Check input samplesheet and get read channels
 //

-include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check'
 include { EIDO_VALIDATE } from '../../modules/nf-core/modules/eido/validate/main'
 include { EIDO_CONVERT  } from '../../modules/nf-core/modules/eido/convert/main'

@@ -12,26 +11,32 @@ workflow INPUT_CHECK {
     pep_input_base_dir

     main:
+    ch_versions = Channel.empty()
+
     EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), pep_input_base_dir )
-    converted_samplesheet = EIDO_CONVERT ( samplesheet_or_pep_config, "csv", pep_input_base_dir )
-    parsed_samplesheet = SAMPLESHEET_CHECK ( converted_samplesheet.samplesheet_converted )
-        .csv
+    ch_versions = ch_versions.mix(EIDO_VALIDATE.out.versions)
+
+    EIDO_CONVERT ( samplesheet_or_pep_config, "csv", pep_input_base_dir )
+    ch_versions = ch_versions.mix(EIDO_CONVERT.out.versions)
+
+    ch_parsed_samplesheet = EIDO_CONVERT.out.samplesheet_converted
         .splitCsv ( header:true, sep:',' )
+        .map { check_missing_and_singleend_autodetect(it) }
         .branch {
             fasta: it['fasta'] != ''
             nanopore: it['instrument_platform'] == 'OXFORD_NANOPORE'
             fastq: true
         }

-    parsed_samplesheet.fastq
+    ch_parsed_samplesheet.fastq
         .map { create_fastq_channel(it) }
         .set { fastq }

-    parsed_samplesheet.nanopore
+    ch_parsed_samplesheet.nanopore
         .map { create_fastq_channel(it) }
         .set { nanopore }

-    parsed_samplesheet.fasta
+    ch_parsed_samplesheet.fasta
         .map { create_fasta_channel(it) }
         .set { fasta }

@@ -39,7 +44,20 @@ workflow INPUT_CHECK {
     fastq = fastq ?: []          // channel: [ val(meta), [ reads ] ]
     nanopore = nanopore ?: []    // channel: [ val(meta), [ reads ] ]
     fasta = fasta ?: []          // channel: [ val(meta), fasta ]
-    versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
+    versions = ch_versions                    // channel: [ versions.yml ]
 }

+// Function to validate input sheet and auto-detect R1/R2
+def check_missing_and_singleend_autodetect(LinkedHashMap row) {
+    // Checks not supported by EIDO(?)
+    if ( ( row['fastq_1'] != "" || row['fastq_2'] != "" ) && row['fasta'] != "" ) { exit 1, "[nf-core/taxprofiler] ERROR: FastQ and FastA files cannot be specified together in the same library. Check input samplesheet! Check sample: ${row['sample']}" }
+    if ( row['fastq_1'] == "" && row['fastq_2'] != "" ) { exit 1, "[nf-core/taxprofiler] ERROR: Input samplesheet has a missing fastq_1 when fastq_2 is specified. Check sample: ${row['sample']}" }
+
+    single_end = row['fastq_2'] == "" ? true : false
+    row['single_end'] = single_end
+
+    return row
+}

 // Function to get list of [ meta, [ fastq_1, fastq_2 ] ]

@@ -74,6 +92,7 @@ def create_fastq_channel(LinkedHashMap row) {
     }
     return fastq_meta
 }

 // Function to get list of [ meta, fasta ]
 def create_fasta_channel(LinkedHashMap row) {
     def meta = [:]
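To make the new row handling concrete, here is a minimal sketch of what check_missing_and_singleend_autodetect does to a single parsed row. The field values are invented, and in the pipeline the function is only called from the .map above rather than standalone:

    // Hypothetical row, shaped like the output of .splitCsv(header:true) on the converted samplesheet
    def row = [sample:'sampleA', run_accession:'run01', instrument_platform:'ILLUMINA',
               fastq_1:'sampleA_R1.fastq.gz', fastq_2:'', fasta:'']

    def checked = check_missing_and_singleend_autodetect(row)
    assert checked['single_end'] == true   // fastq_2 is empty, so the library is flagged single-end

    // A row combining fasta with any fastq, or giving fastq_2 without fastq_1, would instead
    // stop the run via exit 1 with the corresponding error message.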