1
0
Fork 0
mirror of https://github.com/MillironX/taxprofiler.git synced 2024-11-22 11:09:55 +00:00

Merge branch 'nf-core-add-pep-support' into add-pep-support

This commit is contained in:
Rafal Stepien 2022-09-28 14:51:38 -04:00
commit 04d28e6520
5 changed files with 30 additions and 161 deletions

View file

@ -53,3 +53,5 @@ properties:
- fasta
required:
- samples
- run_accession
- instrument_platform

View file

@ -1,117 +0,0 @@
#!/usr/bin/env python
import argparse
import csv
import logging
import sys
from enum import Enum
from typing import List, NoReturn, Optional
class ColumnNames(str, Enum):
    """Samplesheet column names used by this script.

    Subclassing ``str`` lets members be used directly as keys into the
    row dicts produced by ``csv.DictReader``.
    """

    SAMPLE = "sample"
    FASTQ_1 = "fastq_1"
    FASTQ_2 = "fastq_2"
    FASTA = "fasta"
    # Appended by this script: "0" = paired end, "1" = single end.
    SINGLE_END = "single_end"
def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
    """Build the CLI parser and parse *args* (defaults to ``sys.argv[1:]``).

    Reformatting is based on detecting whether the reads are paired or single
    end; the script appends the appropriate column to the samplesheet file.
    """
    parser = argparse.ArgumentParser(
        description="Reformat nf-core/taxprofiler samplesheet file.",
        epilog="Example usage: python detect_reads.py <FILE_IN> <FILE_OUT>",
    )
    # Two positional arguments: input samplesheet, output path.
    for arg_name, arg_help in (("FILE_IN", "Input samplesheet file."), ("FILE_OUT", "Output file.")):
        parser.add_argument(arg_name, help=arg_help)
    return parser.parse_args(args)
class ReadsModifier:
    """Classify each samplesheet row as paired/single end and append a ``single_end`` column.

    Rows with an invalid column combination are reported via :mod:`logging`
    and dropped from the output (matching the original behaviour).
    """

    def __init__(self):
        # Header row of the input samplesheet (with "single_end" appended);
        # populated by detect_reads_and_reformat().
        self.headers = None

    def detect_reads_and_reformat(self, input_file_path: str, output_file_path: str) -> None:
        """Read *input_file_path*, classify every row and write the result to *output_file_path*.

        Raises:
            ValueError: if the input samplesheet has no header row.
        """
        new_file_rows = []
        with open(input_file_path, "r", newline="") as input_file:
            csv_reader = csv.DictReader(input_file, delimiter=",")
            if csv_reader.fieldnames is None:
                # An empty file would otherwise crash with AttributeError below.
                raise ValueError(f"Input samplesheet has no header row: {input_file_path}")
            # Copy rather than mutate the reader's fieldnames list in place.
            self.headers = list(csv_reader.fieldnames) + [ColumnNames.SINGLE_END.value]
            for samplesheet_row in csv_reader:
                if self._is_paired_end_short_read(samplesheet_row):
                    samplesheet_row[ColumnNames.SINGLE_END] = "0"
                    new_file_rows.append(list(samplesheet_row.values()))
                elif self._is_single_end_short_long_read(samplesheet_row) or self._is_single_end_long_read(
                    samplesheet_row
                ):
                    samplesheet_row[ColumnNames.SINGLE_END] = "1"
                    new_file_rows.append(list(samplesheet_row.values()))
                elif self._is_error_row(samplesheet_row):
                    # Lazy %-formatting: the original passed extra positional
                    # args without placeholders, which broke log formatting.
                    logging.error(
                        "FastQ and FastA files cannot be specified together in the same library! Line: %s",
                        ",".join(map(str, samplesheet_row.values())),
                    )
                else:
                    logging.error(
                        "Invalid combination of columns provided! Line: %s",
                        ",".join(map(str, samplesheet_row.values())),
                    )
        ReadsModifier.save_reformatted_samplesheet([self.headers] + new_file_rows, output_file_path)

    def _get_row_values(self, samplesheet_row: dict):
        """Return the (sample, fastq_1, fastq_2, fasta) values of a samplesheet row."""
        return (
            samplesheet_row.get(ColumnNames.SAMPLE),
            samplesheet_row.get(ColumnNames.FASTQ_1),
            samplesheet_row.get(ColumnNames.FASTQ_2),
            samplesheet_row.get(ColumnNames.FASTA),
        )

    def _is_paired_end_short_read(self, samplesheet_row: dict) -> bool:
        # sample + both fastq files -> paired end.
        sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row)
        return bool(sample and fastq_1 and fastq_2)

    def _is_single_end_short_long_read(self, samplesheet_row: dict) -> bool:
        # sample + fastq_1 only -> single end.
        sample, fastq_1, fastq_2, _ = self._get_row_values(samplesheet_row)
        return bool(sample and fastq_1 and not fastq_2)

    def _is_single_end_long_read(self, samplesheet_row: dict) -> bool:
        # sample + fasta only -> single-end long read assembly input.
        sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row)
        return bool(sample and fasta and not fastq_1 and not fastq_2)

    def _is_error_row(self, samplesheet_row: dict) -> bool:
        # fasta combined with any fastq is invalid.
        sample, fastq_1, fastq_2, fasta = self._get_row_values(samplesheet_row)
        return bool(fasta and (fastq_1 or fastq_2))

    @classmethod
    def save_reformatted_samplesheet(cls, new_file_rows: List[List], output_file_path: str) -> None:
        """Write the reformatted samplesheet rows to *output_file_path*."""
        # newline="" stops the csv module emitting extra blank lines on Windows.
        with open(output_file_path, "w", newline="") as output_file:
            csv.writer(output_file).writerows(new_file_rows)
def main(args=None):
    """Script entry point: parse CLI arguments and reformat the samplesheet."""
    namespace = parse_args(args)
    modifier = ReadsModifier()
    modifier.detect_reads_and_reformat(namespace.FILE_IN, namespace.FILE_OUT)


if __name__ == "__main__":
    sys.exit(main())

View file

@ -12,14 +12,6 @@
process {
withName: SAMPLESHEET_CHECK {
publishDir = [
path: { "${params.outdir}/pipeline_info" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}
withName: DATABASE_CHECK {
publishDir = [
path: { "${params.outdir}/pipeline_info" },

View file

@ -1,27 +0,0 @@
// Validate the samplesheet and auto-detect single- vs paired-end reads by
// delegating to the bundled detect_reads.py script.
process SAMPLESHEET_CHECK {
    tag "$samplesheet"

    // Plain python container; conda is used only when explicitly enabled.
    conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/python:3.8.3' :
        'quay.io/biocontainers/python:3.8.3' }"

    input:
    path samplesheet

    output:
    path '*.csv'       , emit: csv      // reformatted samplesheet
    path "versions.yml", emit: versions // tool versions for pipeline provenance

    script: // detect_reads.py script is bundled with the pipeline, in nf-core/taxprofiler/bin/
    """
    python3 $projectDir/bin/detect_reads.py \\
        $samplesheet \\
        samplesheet_validated.csv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version | sed 's/Python //g')
    END_VERSIONS
    """
}

View file

@ -2,7 +2,6 @@
// Check input samplesheet and get read channels
//
include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check'
include { EIDO_VALIDATE } from '../../modules/nf-core/modules/eido/validate/main'
include { EIDO_CONVERT } from '../../modules/nf-core/modules/eido/convert/main'
@ -12,26 +11,32 @@ workflow INPUT_CHECK {
pep_input_base_dir
main:
ch_versions = Channel.empty()
EIDO_VALIDATE ( samplesheet_or_pep_config, file("$projectDir/assets/samplesheet_schema.yaml"), pep_input_base_dir )
converted_samplesheet = EIDO_CONVERT ( samplesheet_or_pep_config, "csv", pep_input_base_dir )
parsed_samplesheet = SAMPLESHEET_CHECK ( converted_samplesheet.samplesheet_converted )
.csv
ch_versions = ch_versions.mix(EIDO_VALIDATE.out.versions)
EIDO_CONVERT ( samplesheet_or_pep_config, "csv", pep_input_base_dir )
ch_versions = ch_versions.mix(EIDO_CONVERT.out.versions)
ch_parsed_samplesheet = EIDO_CONVERT.out.samplesheet_converted
.splitCsv ( header:true, sep:',' )
.map { check_missing_and_singleend_autodetect(it) }
.branch {
fasta: it['fasta'] != ''
nanopore: it['instrument_platform'] == 'OXFORD_NANOPORE'
fastq: true
}
parsed_samplesheet.fastq
ch_parsed_samplesheet.fastq
.map { create_fastq_channel(it) }
.set { fastq }
parsed_samplesheet.nanopore
ch_parsed_samplesheet.nanopore
.map { create_fastq_channel(it) }
.set { nanopore }
parsed_samplesheet.fasta
ch_parsed_samplesheet.fasta
.map { create_fasta_channel(it) }
.set { fasta }
@ -39,7 +44,20 @@ workflow INPUT_CHECK {
fastq = fastq ?: [] // channel: [ val(meta), [ reads ] ]
nanopore = nanopore ?: [] // channel: [ val(meta), [ reads ] ]
fasta = fasta ?: [] // channel: [ val(meta), fasta ]
versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
versions = ch_versions // channel: [ versions.yml ]
}
// Function to validate input sheet and auto-detect R1/R2
def check_missing_and_singleend_autodetect(LinkedHashMap row) {
    // Validation not covered by the EIDO schema check(?)
    def has_fastq = row['fastq_1'] != "" || row['fastq_2'] != ""
    if ( has_fastq && row['fasta'] != "" ) {
        exit 1, "[nf-core/taxprofiler] ERROR: FastQ and FastA files cannot be specified together in the same library. Check input samplesheet! Check sample: ${row['sample']}"
    }
    if ( row['fastq_1'] == "" && row['fastq_2'] != "" ) {
        exit 1, "[nf-core/taxprofiler] ERROR: Input samplesheet has a missing fastq_1 when fastq_2 is specified. Check sample: ${row['sample']}"
    }
    // A row without fastq_2 is single ended
    row['single_end'] = row['fastq_2'] == ""
    return row
}
// Function to get list of [ meta, [ fastq_1, fastq_2 ] ]
@ -69,11 +87,12 @@ def create_fastq_channel(LinkedHashMap row) {
if (!file(row.fastq_2).exists()) {
exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
}
fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
}
}
return fastq_meta
}// Function to get list of [ meta, fasta ]
def create_fasta_channel(LinkedHashMap row) {
def meta = [:]