taxprofiler/bin/check_samplesheet.py

#!/usr/bin/env python

from distutils import extension
import os
import sys
import errno
import argparse


def parse_args(args=None):
    """Parse the command-line arguments of the samplesheet checker.

    Args:
        args: Optional list of argument strings, forwarded unchanged to
            ``ArgumentParser.parse_args``.

    Returns:
        The parsed namespace, exposing the two positional arguments
        ``FILE_IN`` and ``FILE_OUT``.
    """
    cli = argparse.ArgumentParser(
        description="Reformat nf-core/taxprofiler samplesheet file and check its contents.",
        epilog="Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>",
    )
    # Both arguments are mandatory positionals, registered in call order.
    for positional, help_text in (
        ("FILE_IN", "Input samplesheet file."),
        ("FILE_OUT", "Output file."),
    ):
        cli.add_argument(positional, help=help_text)
    return cli.parse_args(args)


def make_dir(path):
    """Create directory ``path`` including any missing parents.

    An empty ``path`` is a no-op. An ``OSError`` whose errno is ``EEXIST`` is
    swallowed; every other ``OSError`` is re-raised to the caller.
    """
    if len(path) == 0:
        return
    try:
        os.makedirs(path)
    except OSError as err:
        # Only "already exists" is tolerated; anything else is a real failure.
        if err.errno == errno.EEXIST:
            return
        raise err


def print_error(error, context="Line", context_str=""):
    """Print a samplesheet error message and terminate with exit status 1.

    Args:
        error: Description of the problem; interpolated into the message.
        context: Label for the offending item (for example ``"Line"``).
        context_str: The offending text itself. The context line is only
            printed when both ``context`` and ``context_str`` are non-empty;
            both are stripped of surrounding whitespace first.

    Raises:
        SystemExit: Always, via ``sys.exit(1)``.
    """
    if context == "" or context_str == "":
        message = "ERROR: Please check samplesheet -> {}".format(error)
    else:
        message = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
            error, context.strip(), context_str.strip()
        )
    print(message)
    sys.exit(1)


def check_samplesheet(file_in, file_out):
    """
    Validate a taxprofiler samplesheet and write a normalised copy of it.

    The input must contain the following columns (in any order; additional
    columns are ignored):

    sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
    2611,ERR5766174,ILLUMINA,,,ERX5474930_ERR5766174_1.fa.gz
    2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz,
    2612,ERR5766174,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,,
    2613,ERR5766181,ILLUMINA,ERX5474937_ERR5766181_1.fastq.gz,ERX5474937_ERR5766181_2.fastq.gz,

    Every row must provide a sample name, a run accession, a supported
    instrument platform and either FastQ file(s) or a FastA file (never both).
    The output repeats these columns, adds a ``single_end`` column ("0" for
    rows with both FastQ files, "1" otherwise) and is sorted by sample name.

    Args:
        file_in: Path of the samplesheet to validate.
        file_out: Path of the validated samplesheet to write. Its parent
            directory is created when it does not exist yet.

    Any validation failure prints an error message and terminates the process
    with exit status 1 (through ``print_error`` or a direct ``sys.exit``).
    """

    FQ_EXTENSIONS = (".fq.gz", ".fastq.gz")
    FA_EXTENSIONS = (
        ".fa",
        ".fa.gz",
        ".fasta",
        ".fasta.gz",
        ".fna",
        ".fna.gz",
        ".fas",
        ".fas.gz",
    )
    INSTRUMENT_PLATFORMS = [
        "ABI_SOLID",
        "BGISEQ",
        "CAPILLARY",
        "COMPLETE_GENOMICS",
        "DNBSEQ",
        "HELICOS",
        "ILLUMINA",
        "ION_TORRENT",
        "LS454",
        "OXFORD_NANOPORE",
        "PACBIO_SMRT",
    ]

    sample_mapping_dict = {}
    with open(file_in, "r") as fin:

        ## Check header
        ## MIN_COLS = sample + run_accession + instrument_platform + one file
        MIN_COLS = 4
        HEADER = [
            "sample",
            "run_accession",
            "instrument_platform",
            "fastq_1",
            "fastq_2",
            "fasta",
        ]
        header = [x.strip('"') for x in fin.readline().strip().split(",")]

        ## Check for missing mandatory columns (listed in HEADER order so the
        ## message is deterministic)
        missing_columns = [col for col in HEADER if col not in header]
        if missing_columns:
            print(
                "ERROR: Missing required column header -> {}. Note some columns can otherwise be empty. See pipeline documentation (https://nf-co.re/taxprofiler/usage).".format(
                    ",".join(missing_columns)
                )
            )
            sys.exit(1)

        ## Find locations of mandatory columns, in HEADER order
        header_locs = [header.index(col) for col in HEADER]
        ## A row needs at least this many fields for every mandatory column
        ## to be addressable
        required_cols = max(header_locs) + 1

        ## Check sample entries
        for line in fin:

            line_parsed = [x.strip().strip('"') for x in line.strip().split(",")]

            # Check valid number of columns per row. This must happen before
            # the mandatory columns are picked out by index, otherwise a short
            # (or blank) row fails with an IndexError instead of a clean error.
            if len(line_parsed) < required_cols:
                print_error(
                    "Invalid number of columns (minimum = {})!".format(required_cols),
                    "Line",
                    line,
                )

            ## Pull out only relevant columns for downstream checking
            lspl = [line_parsed[i] for i in header_locs]

            num_cols = len([x for x in lspl if x])
            if num_cols < MIN_COLS:
                print_error(
                    "Invalid number of populated columns (minimum = {})!".format(MIN_COLS),
                    "Line",
                    line,
                )

            ## Check sample name entries
            (
                sample,
                run_accession,
                instrument_platform,
                fastq_1,
                fastq_2,
                fasta,
            ) = lspl
            sample = sample.replace(" ", "_")
            if not sample:
                print_error("Sample entry has not been specified!", "Line", line)

            ## Check FastQ file extension
            for fastq in [fastq_1, fastq_2]:
                if fastq:
                    if fastq.find(" ") != -1:
                        print_error("FastQ file contains spaces!", "Line", line)
                    if not fastq.endswith(FQ_EXTENSIONS):
                        print_error(
                            f"FastQ file does not have extension {' or '.join(list(FQ_EXTENSIONS))} !",
                            "Line",
                            line,
                        )

            ## Check FastA file extension
            if fasta:
                if fasta.find(" ") != -1:
                    print_error("FastA file contains spaces!", "Line", line)
                if not fasta.endswith(FA_EXTENSIONS):
                    print_error(
                        f"FastA file does not have extension {' or '.join(list(FA_EXTENSIONS))}!",
                        "Line",
                        line,
                    )
            sample_info = []

            # Check run_accession
            if not run_accession:
                print_error("Run accession has not been specified!", "Line", line)
            else:
                sample_info.append(run_accession)

            # Check instrument_platform
            if not instrument_platform:
                print_error("Instrument platform has not been specified!", "Line", line)
            else:
                if instrument_platform not in INSTRUMENT_PLATFORMS:
                    # print_error accepts (error, context, context_str) only,
                    # so the list of supported platforms is part of the error
                    # text rather than a separate argument.
                    print_error(
                        f"Instrument platform {instrument_platform} is not supported! "
                        f"List of supported platforms {', '.join(INSTRUMENT_PLATFORMS)}",
                        "Line",
                        line,
                    )
                sample_info.append(instrument_platform)

            ## Auto-detect paired-end/single-end
            if sample and fastq_1 and fastq_2:  ## Paired-end short reads
                sample_info.extend(["0", fastq_1, fastq_2, fasta])
            elif sample and fastq_1 and not fastq_2:  ## Single-end short/long fastq reads
                sample_info.extend(["1", fastq_1, fastq_2, fasta])
            elif sample and fasta and not fastq_1 and not fastq_2:  ## Single-end long reads
                sample_info.extend(["1", fastq_1, fastq_2, fasta])
            elif fasta and (fastq_1 or fastq_2):
                print_error(
                    "FastQ and FastA files cannot be specified together in the same library!",
                    "Line",
                    line,
                )
            else:
                print_error("Invalid combination of columns provided!", "Line", line)

            ## Create sample mapping dictionary = { sample: [ [ run_accession, instrument_platform, single_end, fastq_1, fastq_2 , fasta ], ... ] }
            if sample not in sample_mapping_dict:
                sample_mapping_dict[sample] = [sample_info]
            else:
                if sample_info in sample_mapping_dict[sample]:
                    print_error("Samplesheet contains duplicate rows!", "Line", line)
                else:
                    sample_mapping_dict[sample].append(sample_info)

    ## Write validated samplesheet with appropriate columns
    HEADER_OUT = [
        "sample",
        "run_accession",
        "instrument_platform",
        "single_end",
        "fastq_1",
        "fastq_2",
        "fasta",
    ]
    if len(sample_mapping_dict) > 0:
        out_dir = os.path.dirname(file_out)
        make_dir(out_dir)
        with open(file_out, "w") as fout:
            fout.write(",".join(HEADER_OUT) + "\n")
            for sample in sorted(sample_mapping_dict.keys()):
                for val in sample_mapping_dict[sample]:
                    fout.write(f"{sample},{','.join(val)}\n")
    else:
        # Pass the file name as context_str: print_error drops the context
        # line entirely when context_str is empty.
        print_error("No entries to process!", "Samplesheet", file_in)


def main(args=None):
    """Command-line entry point: parse the arguments and check the samplesheet.

    Args:
        args: Optional list of argument strings handed to ``parse_args``.
    """
    parsed = parse_args(args)
    samplesheet_in, samplesheet_out = parsed.FILE_IN, parsed.FILE_OUT
    check_samplesheet(samplesheet_in, samplesheet_out)


# Run the checker only when this file is executed as a script. main() has no
# return statement, so sys.exit() receives None here.
if __name__ == "__main__":
    sys.exit(main())
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`#!/usr/bin/env python`

add more columns to samplesheet 2022-02-18 10:53:13 +00:00			`from distutils import extension`
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`import os`
Template update for nf-core/tools version 2.3 2022-03-15 21:05:37 +00:00			`import sys`
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`import errno`
			`import argparse`

Black linting I hope 2022-09-02 09:55:19 +00:00
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`def parse_args(args=None):`
Black linting I hope 2022-09-02 09:55:19 +00:00			`Description = "Reformat nf-core/taxprofiler samplesheet file and check its contents."`
Template update for nf-core/tools version 2.3 2022-03-15 21:05:37 +00:00
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`Epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"`
Template update for nf-core/tools version 2.3 2022-03-15 21:05:37 +00:00
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`parser = argparse.ArgumentParser(description=Description, epilog=Epilog)`
			`parser.add_argument("FILE_IN", help="Input samplesheet file.")`
			`parser.add_argument("FILE_OUT", help="Output file.")`
			`return parser.parse_args(args)`
Template update for nf-core/tools version 2.3 2022-03-15 21:05:37 +00:00
Black linting I hope 2022-09-02 09:55:19 +00:00
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`def make_dir(path):`
			`if len(path) > 0:`
			`try:`
			`os.makedirs(path)`
			`except OSError as exception:`
			`if exception.errno != errno.EEXIST:`
			`raise exception`
Template update for nf-core/tools version 2.3 2022-03-15 21:05:37 +00:00
Black linting I hope 2022-09-02 09:55:19 +00:00
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`def print_error(error, context="Line", context_str=""):`
			`error_str = "ERROR: Please check samplesheet -> {}".format(error)`
			`if context != "" and context_str != "":`
			`error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(`
			`error, context.strip(), context_str.strip()`
			`)`
			`print(error_str)`
			`sys.exit(1)`

Black linting I hope 2022-09-02 09:55:19 +00:00
Template update for nf-core/tools version 2.3 2022-03-15 21:05:37 +00:00			`def check_samplesheet(file_in, file_out):`
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`"""`
			`This function checks that the samplesheet follows the following structure:`
Template update for nf-core/tools version 2.3 2022-03-15 21:05:37 +00:00
add more columns to samplesheet 2022-02-18 10:53:13 +00:00			`sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta`
update samplesheet specs 2022-02-18 12:11:18 +00:00			`2611,ERR5766174,ILLUMINA,,,ERX5474930_ERR5766174_1.fa.gz`
			`2612,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz,`
			`2612,ERR5766174,ILLUMINA,ERX5474936_ERR5766180_1.fastq.gz,,`
			`2613,ERR5766181,ILLUMINA,ERX5474937_ERR5766181_1.fastq.gz,ERX5474937_ERR5766181_2.fastq.gz,`
Template update for nf-core/tools version 2.3 2022-03-15 21:05:37 +00:00			`"""`

Remove support for uncompressed FASTQ files 2022-06-09 06:21:48 +00:00			`FQ_EXTENSIONS = (".fq.gz", ".fastq.gz")`
add more columns to samplesheet 2022-02-18 10:53:13 +00:00			`FA_EXTENSIONS = (`
			`".fa",`
			`".fa.gz",`
			`".fasta",`
			`".fasta.gz",`
			`".fna",`
			`".fna.gz",`
			`".fas",`
			`".fas.gz",`
Template update for nf-core/tools version 2.3 2022-03-15 21:05:37 +00:00			`)`
add more columns to samplesheet 2022-02-18 10:53:13 +00:00			`INSTRUMENT_PLATFORMS = [`
			`"ABI_SOLID",`
			`"BGISEQ",`
			`"CAPILLARY",`
			`"COMPLETE_GENOMICS",`
			`"DNBSEQ",`
			`"HELICOS",`
			`"ILLUMINA",`
			`"ION_TORRENT",`
			`"LS454",`
			`"OXFORD_NANOPORE",`
			`"PACBIO_SMRT",`
			`]`

initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`sample_mapping_dict = {}`
			`with open(file_in, "r") as fin:`

			`## Check header`
add more columns to samplesheet 2022-02-18 10:53:13 +00:00			`MIN_COLS = 4`
			`HEADER = [`
			`"sample",`
			`"run_accession",`
			`"instrument_platform",`
			`"fastq_1",`
			`"fastq_2",`
			`"fasta",`
			`]`
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`header = [x.strip('"') for x in fin.readline().strip().split(",")]`
Do not require fixed column order 2022-06-13 19:02:12 +00:00
			`## Check for missing mandatory columns`
			`missing_columns = list(set(HEADER) - set(header))`
			`if len(missing_columns) > 0:`
add more columns to samplesheet 2022-02-18 10:53:13 +00:00			`print(`
Do not require fixed column order 2022-06-13 19:02:12 +00:00			`"ERROR: Missing required column header -> {}. Note some columns can otherwise be empty. See pipeline documentation (https://nf-co.re/taxprofiler/usage).".format(`
			`",".join(missing_columns)`
add more columns to samplesheet 2022-02-18 10:53:13 +00:00			`)`
			`)`
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`sys.exit(1)`
Template update for nf-core/tools version 2.3 2022-03-15 21:05:37 +00:00
Replace header tuple with sub-set list of only relevant columns for sample checking 2022-06-14 20:09:48 +00:00			`## Find locations of mandatory columns`
Update bin/check_samplesheet.py Co-authored-by: Moritz E. Beber <midnighter@posteo.net> 2022-08-23 13:44:53 +00:00			`header_locs = {}`
Replace header tuple with sub-set list of only relevant columns for sample checking 2022-06-14 20:09:48 +00:00			`for i in HEADER:`
			`header_locs[i] = header.index(i)`

initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`## Check sample entries`
			`for line in fin:`
Replace header tuple with sub-set list of only relevant columns for sample checking 2022-06-14 20:09:48 +00:00
			`## Pull out only relevant columns for downstream checking`
			`line_parsed = [x.strip().strip('"') for x in line.strip().split(",")]`
			`lspl = [line_parsed[i] for i in header_locs.values()]`
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00
			`# Check valid number of columns per row`
			`if len(lspl) < len(HEADER):`
			`print_error(`
			`"Invalid number of columns (minimum = {})!".format(len(HEADER)),`
			`"Line",`
			`line,`
			`)`
			`num_cols = len([x for x in lspl if x])`
			`if num_cols < MIN_COLS:`
			`print_error(`
Black linting I hope 2022-09-02 09:55:19 +00:00			`"Invalid number of populated columns (minimum = {})!".format(MIN_COLS),`
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`"Line",`
			`line,`
			`)`

			`## Check sample name entries`
Replace header tuple with sub-set list of only relevant columns for sample checking 2022-06-14 20:09:48 +00:00
add more columns to samplesheet 2022-02-18 10:53:13 +00:00			`(`
			`sample,`
			`run_accession,`
			`instrument_platform,`
			`fastq_1,`
			`fastq_2,`
			`fasta,`
			`) = lspl[: len(HEADER)]`
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`sample = sample.replace(" ", "_")`
			`if not sample:`
			`print_error("Sample entry has not been specified!", "Line", line)`

			`## Check FastQ file extension`
			`for fastq in [fastq_1, fastq_2]:`
			`if fastq:`
			`if fastq.find(" ") != -1:`
			`print_error("FastQ file contains spaces!", "Line", line)`
add more columns to samplesheet 2022-02-18 10:53:13 +00:00			`if not fastq.endswith(FQ_EXTENSIONS):`
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`print_error(`
add more columns to samplesheet 2022-02-18 10:53:13 +00:00			`f"FastQ file does not have extension {' or '.join(list(FQ_EXTENSIONS))} !",`
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`"Line",`
			`line,`
			`)`
add more columns to samplesheet 2022-02-18 10:53:13 +00:00			`if fasta:`
			`if fasta.find(" ") != -1:`
			`print_error("FastA file contains spaces!", "Line", line)`
			`if not fasta.endswith(FA_EXTENSIONS):`
			`print_error(`
			`f"FastA file does not have extension {' or '.join(list(FA_EXTENSIONS))}!",`
			`"Line",`
			`line,`
			`)`
			`sample_info = []`

			`# Check run_accession`
			`if not run_accession:`
			`print_error("Run accession has not been specified!", "Line", line)`
			`else:`
			`sample_info.append(run_accession)`

			`# Check instrument_platform`
			`if not instrument_platform:`
			`print_error("Instrument platform has not been specified!", "Line", line)`
			`else:`
			`if instrument_platform not in INSTRUMENT_PLATFORMS:`
			`print_error(`
			`f"Instrument platform {instrument_platform} is not supported!",`
			`f"List of supported platforms {', '.join(INSTRUMENT_PLATFORMS)}",`
			`"Line",`
			`line,`
			`)`
			`sample_info.append(instrument_platform)`
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00
			`## Auto-detect paired-end/single-end`
			`if sample and fastq_1 and fastq_2: ## Paired-end short reads`
add more columns to samplesheet 2022-02-18 10:53:13 +00:00			`sample_info.extend(["0", fastq_1, fastq_2, fasta])`
Black linting I hope 2022-09-02 09:55:19 +00:00			`elif sample and fastq_1 and not fastq_2: ## Single-end short/long fastq reads`
add more columns to samplesheet 2022-02-18 10:53:13 +00:00			`sample_info.extend(["1", fastq_1, fastq_2, fasta])`
Black linting I hope 2022-09-02 09:55:19 +00:00			`elif sample and fasta and not fastq_1 and not fastq_2: ## Single-end long reads`
add more columns to samplesheet 2022-02-18 10:53:13 +00:00			`sample_info.extend(["1", fastq_1, fastq_2, fasta])`
add check for fastq with fasta 2022-02-18 12:27:10 +00:00			`elif fasta and (fastq_1 or fastq_2):`
			`print_error(`
			`"FastQ and FastA files cannot be specified together in the same library!",`
			`"Line",`
			`line,`
			`)`
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`else:`
			`print_error("Invalid combination of columns provided!", "Line", line)`

cleanup 2022-02-18 12:15:30 +00:00			`## Create sample mapping dictionary = { sample: [ run_accession, instrument_platform, single_end, fastq_1, fastq_2 , fasta ] }`
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`if sample not in sample_mapping_dict:`
			`sample_mapping_dict[sample] = [sample_info]`
			`else:`
			`if sample_info in sample_mapping_dict[sample]:`
			`print_error("Samplesheet contains duplicate rows!", "Line", line)`
			`else:`
			`sample_mapping_dict[sample].append(sample_info)`

			`## Write validated samplesheet with appropriate columns`
add more columns to samplesheet 2022-02-18 10:53:13 +00:00			`HEADER_OUT = [`
			`"sample",`
			`"run_accession",`
			`"instrument_platform",`
			`"single_end",`
			`"fastq_1",`
			`"fastq_2",`
			`"fasta",`
			`]`
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`if len(sample_mapping_dict) > 0:`
			`out_dir = os.path.dirname(file_out)`
			`make_dir(out_dir)`
			`with open(file_out, "w") as fout:`
add more columns to samplesheet 2022-02-18 10:53:13 +00:00			`fout.write(",".join(HEADER_OUT) + "\n")`
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`for sample in sorted(sample_mapping_dict.keys()):`
			`for idx, val in enumerate(sample_mapping_dict[sample]):`
update samplesheet specs 2022-02-18 12:11:18 +00:00			`fout.write(f"{sample},{','.join(val)}\n")`
initial template build from nf-core/tools, version 2.2 2022-02-18 06:55:14 +00:00			`else:`
			`print_error("No entries to process!", "Samplesheet: {}".format(file_in))`


			`def main(args=None):`
			`args = parse_args(args)`
			`check_samplesheet(args.FILE_IN, args.FILE_OUT)`


			`if __name__ == "__main__":`
			`sys.exit(main())`