From c4e366d47dd4ded53248611e61f86155c6860261 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Thu, 2 Feb 2023 11:59:26 +0100
Subject: [PATCH 1/2] Add full test data and documentation of the test data

---
 CITATIONS.md          | 10 +++++++
 README.md             |  6 ++---
 conf/test_full.config | 62 +++++++++++++++++++++++++++++++++++++------
 3 files changed, 66 insertions(+), 12 deletions(-)

diff --git a/CITATIONS.md b/CITATIONS.md
index daf9022..825a2f9 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -92,3 +92,13 @@
 - [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/)
 
   > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675.
+
+## Data
+
+- [Maixner (2021)](https://doi.org/10.1016/j.cub.2021.09.031) (CI Test Data)
+
+  > Maixner, Frank, Mohamed S. Sarhan, Kun D. Huang, Adrian Tett, Alexander Schoenafinger, Stefania Zingale, Aitor Blanco-Míguez, et al. 2021. “Hallstatt Miners Consumed Blue Cheese and Beer during the Iron Age and Retained a Non-Westernized Gut Microbiome until the Baroque Period.” Current Biology: CB 31 (23): 5149–62.e6. doi: 10.1016/j.cub.2021.09.031.
+
+- [Meslier (2022)](https://doi.org/10.1038/s41597-022-01762-z) (AWS Full Test data)
+
+  > Meslier, Victoria, Benoit Quinquis, Kévin Da Silva, Florian Plaza Oñate, Nicolas Pons, Hugo Roume, Mircea Podar, and Mathieu Almeida. 2022. “Benchmarking Second and Third-Generation Sequencing Platforms for Microbial Metagenomics.” Scientific Data 9 (1): 694. doi: 10.1038/s41597-022-01762-z.
\ No newline at end of file
diff --git a/README.md b/README.md
index 080a75b..9c38e9b 100644
--- a/README.md
+++ b/README.md
@@ -14,16 +14,14 @@
 
 > ⚠️ This pipeline is still under development! While the pipeline is usable, not all functionality will be available!
 
-
-
 **nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic classification and profiling of shotgun metagenomic data. It allows for in-parallel taxonomic identification of reads or taxonomic abundance estimation with multiple classification and profiling tools against multiple databases, and produces standardised output tables.
 
 The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!
 
-
-
 On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/taxprofiler/results).
 
+The nf-core/taxprofiler CI test dataset uses sequencing data from [Maixner et al. (2021) _Curr. Biol._](https://doi.org/10.1016/j.cub.2021.09.031). The AWS full test dataset uses sequencing data and reference genomes from [Meslier et al. (2022) _Sci. Data_](https://doi.org/10.1038/s41597-022-01762-z).
+
 ## Pipeline summary
diff --git a/conf/test_full.config b/conf/test_full.config
index 49a10a0..cf8d873 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -1,12 +1,10 @@
 /*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     Nextflow config file for running full-size tests
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     Defines input files and everything required to run a full size pipeline test.
-
     Use as follows:
        nextflow run nf-core/taxprofiler -profile test_full,<docker/singularity> --outdir <OUTDIR>
-
 ----------------------------------------------------------------------------------------
 */

@@ -15,10 +13,58 @@ params {
     config_profile_description = 'Full test dataset to check pipeline function'
 
     // Input data for full size test
-    // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. SRA)
-    // TODO nf-core: Give any required params for the test so that command line flags are not needed
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv'
+    input     = 'https://github.com/nf-core/test-datasets/raw/taxprofiler/samplesheet_full.csv'
+    databases = 'https://github.com/nf-core/test-datasets/raw/taxprofiler/database_full.csv'
 
     // Genome references
-    genome = 'R64-1-1'
+    hostremoval_reference = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/819/615/GCA_000819615.1_ViralProj14015/GCA_000819615.1_ViralProj14015_genomic.fna.gz'
+
+    save_preprocessed_reads = true
+
+    perform_shortread_qc = true
+    shortread_qc_mergepairs = true
+    perform_shortread_complexityfilter = true
+    save_complexityfiltered_reads = true
+
+    perform_longread_qc = true
+    perform_shortread_hostremoval = true
+    perform_longread_hostremoval = true
+    save_hostremoval_index = true
+    save_hostremoval_mapped = true
+    save_hostremoval_unmapped = true
+
+    perform_runmerging = true
+    save_runmerged_reads = true
+
+    run_centrifuge = true
+    centrifuge_save_reads = true
+
+    run_diamond = true
+
+    run_kaiju = true
+
+    run_kraken2 = true
+    kraken2_save_reads = true
+    kraken2_save_readclassification = true
+    kraken2_save_minimizers = true
+
+    run_krakenuniq = true
+    krakenuniq_save_reads = true
+    krakenuniq_save_readclassifications = true
+
+    run_bracken = true
+
+    run_malt = true
+    malt_save_reads = true
+    malt_generate_megansummary = true
+
+    run_metaphlan3 = true
+
+    run_motus = true
+    motus_save_mgc_read_counts = true
+
+    run_profile_standardisation = true
+    run_krona = true
 }
+
+cleanup = true
\ No newline at end of file
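As an illustration of how the full-size profile added above is intended to be used, the usage note in the config header translates into an invocation along these lines; the container engine ("docker") and the output directory are illustrative placeholders, not values taken from the patch:

    # Hypothetical launch of the full-size AWS test profile defined in conf/test_full.config
    nextflow run nf-core/taxprofiler -profile test_full,docker --outdir ./taxprofiler_full_test_results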
From 7ce55a99e813d1890cf9b28a3b620433eba45ead Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Thu, 2 Feb 2023 13:08:16 +0100
Subject: [PATCH 2/2] Fix linting

---
 CITATIONS.md             |  2 +-
 bin/check_samplesheet.py |  2 --
 conf/test_full.config    | 16 ++++++++--------
 3 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/CITATIONS.md b/CITATIONS.md
index 825a2f9..2f75fdb 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -101,4 +101,4 @@
 
 - [Meslier (2022)](https://doi.org/10.1038/s41597-022-01762-z) (AWS Full Test data)
 
-  > Meslier, Victoria, Benoit Quinquis, Kévin Da Silva, Florian Plaza Oñate, Nicolas Pons, Hugo Roume, Mircea Podar, and Mathieu Almeida. 2022. “Benchmarking Second and Third-Generation Sequencing Platforms for Microbial Metagenomics.” Scientific Data 9 (1): 694. doi: 10.1038/s41597-022-01762-z.
\ No newline at end of file
+  > Meslier, Victoria, Benoit Quinquis, Kévin Da Silva, Florian Plaza Oñate, Nicolas Pons, Hugo Roume, Mircea Podar, and Mathieu Almeida. 2022. “Benchmarking Second and Third-Generation Sequencing Platforms for Microbial Metagenomics.” Scientific Data 9 (1): 694. doi: 10.1038/s41597-022-01762-z.
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index f5b0e6a..9f0f7a6 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -71,7 +71,6 @@ def check_samplesheet(file_in, file_out):
 
     sample_mapping_dict = {}
     with open(file_in, "r") as fin:
-
         ## Check header
         MIN_COLS = 4
         HEADER = [
@@ -101,7 +100,6 @@
 
         ## Check sample entries
         for line in fin:
-
             ## Pull out only relevant columns for downstream checking
             line_parsed = [x.strip().strip('"') for x in line.strip().split(",")]
 
diff --git a/conf/test_full.config b/conf/test_full.config
index cf8d873..07099d0 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -20,12 +20,12 @@ params {
     hostremoval_reference = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/819/615/GCA_000819615.1_ViralProj14015/GCA_000819615.1_ViralProj14015_genomic.fna.gz'
 
     save_preprocessed_reads = true
-
+
     perform_shortread_qc = true
     shortread_qc_mergepairs = true
     perform_shortread_complexityfilter = true
     save_complexityfiltered_reads = true
-
+
     perform_longread_qc = true
     perform_shortread_hostremoval = true
     perform_longread_hostremoval = true
     save_hostremoval_index = true
     save_hostremoval_mapped = true
     save_hostremoval_unmapped = true
@@ -46,20 +46,20 @@ params {
     run_kraken2 = true
     kraken2_save_reads = true
     kraken2_save_readclassification = true
-    kraken2_save_minimizers = true
-
-    run_krakenuniq = true
+    kraken2_save_minimizers = true
+
+    run_krakenuniq = true
     krakenuniq_save_reads = true
     krakenuniq_save_readclassifications = true
 
     run_bracken = true
-
+
     run_malt = true
     malt_save_reads = true
     malt_generate_megansummary = true
 
     run_metaphlan3 = true
-
+
     run_motus = true
     motus_save_mgc_read_counts = true
@@ -67,4 +67,4 @@
     run_krona = true
 }
 
-cleanup = true
\ No newline at end of file
+cleanup = true
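Since the full-size profile switches on every classifier and profiler at once, a scaled-down rerun can turn individual tools back off at launch time. A minimal sketch, assuming the usual nf-core behaviour that each params entry in conf/test_full.config (e.g. run_malt, run_krakenuniq) can also be overridden with a matching --<param> command-line flag; the choice of tools to disable and the output path are illustrative:

    # Hypothetical reduced rerun of the full-size test: same samplesheet and databases,
    # but with the most resource-hungry classifiers from conf/test_full.config disabled
    nextflow run nf-core/taxprofiler -profile test_full,docker --outdir ./taxprofiler_reduced_results \
        --run_malt false --run_krakenuniq false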