2022-02-18 06:55:14 +00:00
#!/usr/bin/env python
# TODO nf-core: Update the script to check the samplesheet
# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
2022-02-18 10:53:13 +00:00
from distutils import extension
2022-02-18 06:55:14 +00:00
import os
import sys
import errno
import argparse
def parse_args(args=None):
    """Parse the command-line arguments of the samplesheet checker.

    Args:
        args: Optional list of argument strings. When ``None`` argparse
            falls back to ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` exposing the two positional arguments as
        the attributes ``FILE_IN`` and ``FILE_OUT``.
    """
    description = (
        "Reformat nf-core/taxprofiler samplesheet file and check its contents."
    )
    epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"

    parser = argparse.ArgumentParser(description=description, epilog=epilog)
    # Positional names double as attribute names on the returned namespace,
    # so they must be plain identifiers without surrounding whitespace.
    parser.add_argument("FILE_IN", help="Input samplesheet file.")
    parser.add_argument("FILE_OUT", help="Output file.")
    return parser.parse_args(args)
def make_dir(path):
    """Create directory ``path`` (and any missing parents) if needed.

    Args:
        path: Directory to create. An empty value is a no-op, which is what
            ``os.path.dirname`` yields for a bare file name.

    Raises:
        OSError: If the directory cannot be created. This includes
            ``FileExistsError`` when ``path`` already exists but is not a
            directory; an already existing directory is not an error.
    """
    if not path:
        return
    # exist_ok=True tolerates an existing directory while still failing
    # loudly if something that is not a directory occupies the path.
    os.makedirs(path, exist_ok=True)
def print_error(error, context="Line", context_str=""):
    """Print a samplesheet error message to stdout and exit with status 1.

    Args:
        error: Description of the problem.
        context: Label for the offending item (for example ``"Line"``).
        context_str: The offending item itself. The context is appended on a
            second output line only when both ``context`` and
            ``context_str`` are non-empty.

    Raises:
        SystemExit: Always, with exit code 1.
    """
    error_str = f"ERROR: Please check samplesheet -> {error}"
    if context and context_str:
        # Strip so that a raw samplesheet line (with its trailing newline)
        # is shown cleanly between the quotes.
        error_str = f"{error_str}\n{context.strip()}: '{context_str.strip()}'"
    print(error_str)
    sys.exit(1)
# TODO nf-core: Update the check_samplesheet function
def check_samplesheet(file_in, file_out):
    """Validate a samplesheet and write a reformatted copy.

    The input is a comma-separated file whose header must start with::

        sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta

    Values that do not apply to a row are left empty, for example::

        2611,ERR5766174,ILLUMINA,,,ERX5474930_ERR5766174_1.fa.gz
        2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz,
        2613,ERR5766180,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,,

    For an example of the general samplesheet layout see:
    https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv

    The output file has the columns::

        sample,run_accession,instrument_platform,single_end,fastq_1,fastq_2,fasta

    where ``sample`` is suffixed with ``_T<n>`` (n = 1-based index of the run
    within that sample) and ``single_end`` is ``"0"`` for rows providing both
    ``fastq_1`` and ``fastq_2`` and ``"1"`` otherwise.

    Args:
        file_in: Path of the samplesheet to validate.
        file_out: Path of the validated samplesheet to write. Its parent
            directory is created when missing.

    Raises:
        SystemExit: With exit code 1 (after printing an error message) on the
            first validation failure, or when the samplesheet has no entries.
    """
    FQ_EXTENSIONS = (".fq", ".fq.gz", ".fastq", ".fastq.gz")
    FA_EXTENSIONS = (
        ".fa",
        ".fa.gz",
        ".fasta",
        ".fasta.gz",
        ".fna",
        ".fna.gz",
        ".fas",
        ".fas.gz",
    )
    INSTRUMENT_PLATFORMS = [
        "ABI_SOLID",
        "BGISEQ",
        "CAPILLARY",
        "COMPLETE_GENOMICS",
        "DNBSEQ",
        "HELICOS",
        "ILLUMINA",
        "ION_TORRENT",
        "LS454",
        "OXFORD_NANOPORE",
        "PACBIO_SMRT",
    ]
    # Minimum number of non-empty columns a row needs: sample, run_accession,
    # instrument_platform plus at least one file.
    MIN_COLS = 4
    # TODO nf-core: Update the column names for the input samplesheet
    HEADER = [
        "sample",
        "run_accession",
        "instrument_platform",
        "fastq_1",
        "fastq_2",
        "fasta",
    ]
    HEADER_OUT = [
        "sample",
        "run_accession",
        "instrument_platform",
        "single_end",
        "fastq_1",
        "fastq_2",
        "fasta",
    ]
    # Position of the single_end flag inside each stored run (see below).
    SINGLE_END_IDX = 2

    # sample -> list of runs, each run being
    # [run_accession, instrument_platform, single_end, fastq_1, fastq_2, fasta]
    sample_mapping_dict = {}
    with open(file_in, "r") as fin:
        ## Check header
        header = [x.strip('"') for x in fin.readline().strip().split(",")]
        if header[: len(HEADER)] != HEADER:
            print(
                "ERROR: Please check samplesheet header -> {} != {}".format(
                    ",".join(header), ",".join(HEADER)
                )
            )
            sys.exit(1)

        ## Check sample entries
        for line in fin:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            lspl = [x.strip().strip('"') for x in line.strip().split(",")]

            # Check valid number of columns per row
            if len(lspl) < len(HEADER):
                print_error(
                    "Invalid number of columns (minimum = {})!".format(len(HEADER)),
                    "Line",
                    line,
                )
            num_cols = len([x for x in lspl if x])
            if num_cols < MIN_COLS:
                print_error(
                    "Invalid number of populated columns (minimum = {})!".format(
                        MIN_COLS
                    ),
                    "Line",
                    line,
                )

            ## Check sample name entries
            (
                sample,
                run_accession,
                instrument_platform,
                fastq_1,
                fastq_2,
                fasta,
            ) = lspl[: len(HEADER)]
            sample = sample.replace(" ", "_")
            if not sample:
                print_error("Sample entry has not been specified!", "Line", line)

            ## Check FastQ file extension
            for fastq in [fastq_1, fastq_2]:
                if fastq:
                    if fastq.find(" ") != -1:
                        print_error("FastQ file contains spaces!", "Line", line)
                    if not fastq.endswith(FQ_EXTENSIONS):
                        print_error(
                            f"FastQ file does not have extension {' or '.join(FQ_EXTENSIONS)}!",
                            "Line",
                            line,
                        )

            ## Check FastA file extension
            if fasta:
                if fasta.find(" ") != -1:
                    print_error("FastA file contains spaces!", "Line", line)
                if not fasta.endswith(FA_EXTENSIONS):
                    print_error(
                        f"FastA file does not have extension {' or '.join(FA_EXTENSIONS)}!",
                        "Line",
                        line,
                    )

            # Check run_accession
            if not run_accession:
                print_error("Run accession has not been specified!", "Line", line)

            # Check instrument_platform
            if not instrument_platform:
                print_error("Instrument platform has not been specified!", "Line", line)
            elif instrument_platform not in INSTRUMENT_PLATFORMS:
                # print_error() takes (error, context, context_str), so the
                # list of supported platforms is folded into the message.
                print_error(
                    f"Instrument platform {instrument_platform} is not supported! "
                    f"List of supported platforms {', '.join(INSTRUMENT_PLATFORMS)}",
                    "Line",
                    line,
                )

            ## Auto-detect paired-end/single-end
            if fastq_1:
                ## Paired-end short reads ("0") or single-end short reads ("1")
                single_end = "0" if fastq_2 else "1"
            elif fasta and not fastq_2:
                ## Single-end long reads
                single_end = "1"
            else:
                print_error("Invalid combination of columns provided!", "Line", line)

            sample_info = [
                run_accession,
                instrument_platform,
                single_end,
                fastq_1,
                fastq_2,
                fasta,
            ]

            ## Extend the sample mapping dictionary, rejecting duplicate rows
            if sample not in sample_mapping_dict:
                sample_mapping_dict[sample] = [sample_info]
            elif sample_info in sample_mapping_dict[sample]:
                print_error("Samplesheet contains duplicate rows!", "Line", line)
            else:
                sample_mapping_dict[sample].append(sample_info)

    if not sample_mapping_dict:
        print_error("No entries to process!", "Samplesheet", file_in)

    ## Check that multiple runs of the same sample are of the same datatype.
    ## Done before anything is written so a failure leaves no partial output.
    for sample in sorted(sample_mapping_dict):
        runs = sample_mapping_dict[sample]
        if any(run[SINGLE_END_IDX] != runs[0][SINGLE_END_IDX] for run in runs):
            print_error(
                "Multiple runs of a sample must be of the same datatype!",
                "Sample",
                sample,
            )

    ## Write validated samplesheet with appropriate columns
    make_dir(os.path.dirname(file_out))
    with open(file_out, "w") as fout:
        fout.write(",".join(HEADER_OUT) + "\n")
        for sample in sorted(sample_mapping_dict):
            for idx, run in enumerate(sample_mapping_dict[sample], start=1):
                fout.write(",".join(["{}_T{}".format(sample, idx)] + run) + "\n")
def main(args=None):
    """Parse the command line and run the samplesheet check.

    Args:
        args: Optional list of argument strings handed to ``parse_args``;
            ``None`` is passed through unchanged.
    """
    options = parse_args(args)
    check_samplesheet(options.FILE_IN, options.FILE_OUT)
# Run the checker only when this file is executed as a script. main() has no
# return statement, so sys.exit() receives None here.
if __name__ == "__main__":
    sys.exit(main())