From c4e366d47dd4ded53248611e61f86155c6860261 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Thu, 2 Feb 2023 11:59:26 +0100 Subject: [PATCH] Add full test data and documentation of the test data --- CITATIONS.md | 10 +++++++ README.md | 6 ++--- conf/test_full.config | 62 +++++++++++++++++++++++++++++++++++++------ 3 files changed, 66 insertions(+), 12 deletions(-) diff --git a/CITATIONS.md b/CITATIONS.md index daf9022..825a2f9 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -92,3 +92,13 @@ - [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/) > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. + +## Data + +- [Maixner (2021)](https://doi.org/10.1016/j.cub.2021.09.031) (CI Test Data) + + > Maixner, Frank, Mohamed S. Sarhan, Kun D. Huang, Adrian Tett, Alexander Schoenafinger, Stefania Zingale, Aitor Blanco-Míguez, et al. 2021. “Hallstatt Miners Consumed Blue Cheese and Beer during the Iron Age and Retained a Non-Westernized Gut Microbiome until the Baroque Period.” Current Biology: CB 31 (23): 5149–62.e6. doi: 10.1016/j.cub.2021.09.031. + +- [Meslier (2022)](https://doi.org/10.1038/s41597-022-01762-z) (AWS Full Test data) + + > Meslier, Victoria, Benoit Quinquis, Kévin Da Silva, Florian Plaza Oñate, Nicolas Pons, Hugo Roume, Mircea Podar, and Mathieu Almeida. 2022. “Benchmarking Second and Third-Generation Sequencing Platforms for Microbial Metagenomics.” Scientific Data 9 (1): 694. doi: 10.1038/s41597-022-01762-z. \ No newline at end of file diff --git a/README.md b/README.md index 080a75b..9c38e9b 100644 --- a/README.md +++ b/README.md @@ -14,16 +14,14 @@ > ⚠️ This pipeline is still under development! While the pipeline is usable, not all functionality will be available! 
- - **nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic classification and profiling of shotgun metagenomic data. It allows for in-parallel taxonomic identification of reads or taxonomic abundance estimation with multiple classification and profiling tools against multiple databases, produces standardised output tables. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! - - On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources.The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/taxprofiler/results). +The nf-core/taxprofiler CI test dataset uses sequencing data from [Maixner et al. (2021) Curr. Bio.](https://doi.org/10.1016/j.cub.2021.09.031). The AWS full test dataset uses sequencing data and reference genomes from [Meslier (2022) _Sci. 
Data_](https://doi.org/10.1038/s41597-022-01762-z) + ## Pipeline summary diff --git a/conf/test_full.config b/conf/test_full.config index 49a10a0..cf8d873 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -1,12 +1,10 @@ /* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Nextflow config file for running full-size tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Defines input files and everything required to run a full size pipeline test. - Use as follows: nextflow run nf-core/taxprofiler -profile test_full, --outdir - ---------------------------------------------------------------------------------------- */ @@ -15,10 +13,58 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. 
SRA) - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv' + input = 'https://github.com/nf-core/test-datasets/raw/taxprofiler/samplesheet_full.csv' + databases = 'https://github.com/nf-core/test-datasets/raw/taxprofiler/database_full.csv' // Genome references - genome = 'R64-1-1' + hostremoval_reference = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/819/615/GCA_000819615.1_ViralProj14015/GCA_000819615.1_ViralProj14015_genomic.fna.gz' + + save_preprocessed_reads = true + + perform_shortread_qc = true + shortread_qc_mergepairs = true + perform_shortread_complexityfilter = true + save_complexityfiltered_reads = true + + perform_longread_qc = true + perform_shortread_hostremoval = true + perform_longread_hostremoval = true + save_hostremoval_index = true + save_hostremoval_mapped = true + save_hostremoval_unmapped = true + + perform_runmerging = true + save_runmerged_reads = true + + run_centrifuge = true + centrifuge_save_reads = true + + run_diamond = true + + run_kaiju = true + + run_kraken2 = true + kraken2_save_reads = true + kraken2_save_readclassification = true + kraken2_save_minimizers = true + + run_krakenuniq = true + krakenuniq_save_reads = true + krakenuniq_save_readclassifications = true + + run_bracken = true + + run_malt = true + malt_save_reads = true + malt_generate_megansummary = true + + run_metaphlan3 = true + + run_motus = true + motus_save_mgc_read_counts = true + + run_profile_standardisation = true + run_krona = true } + +cleanup = true \ No newline at end of file