From 1f9512b70f665d82e3237f03452c7fa67b1d21f2 Mon Sep 17 00:00:00 2001
From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com>
Date: Mon, 13 May 2019 10:09:48 -0600
Subject: [PATCH] Initial commit

---
 fastq-to-taxonomy.sh     |  26 ++++++
 fetchmetadata.R          |   8 ++
 main.sh                  | 169 +++++++++++++++++++++++++++++++++++++++
 manipulatefeaturetable.R |  48 +++++++++++
 4 files changed, 251 insertions(+)
 create mode 100644 fastq-to-taxonomy.sh
 create mode 100644 fetchmetadata.R
 create mode 100644 main.sh
 create mode 100644 manipulatefeaturetable.R

diff --git a/fastq-to-taxonomy.sh b/fastq-to-taxonomy.sh
new file mode 100644
index 0000000..4ecc1f7
--- /dev/null
+++ b/fastq-to-taxonomy.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+#SBATCH --account=cowusda2016
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=16000
+#SBATCH --time="2-00:00:00"
+
+module restore system
+module load metaxa2
+module load blast-legacy
+
+R1filename=${1%}
+R2filename="${R1filename/R1/R2}"
+Samplename="${R1filename/_R1_001.fastq.gz/}"
+Samplename="${Samplename/\.\//}"
+	
+# Extract rRNA to files
+metaxa2 -1 "$R1filename" -2 "$R2filename" -o "$Samplename" -f q -cpu 4 \
+ --summary F --graphical F --fasta F --taxonomy T
+
+# Switch modes to extract taxonomy
+INPUT2="${Samplename}.taxonomy.txt"
+
+# Compile the taxonomy
+metaxa2_ttt -i "$INPUT2" -o "$Samplename" -cpu4 -m 7 -n 7 --summary F
\ No newline at end of file
diff --git a/fetchmetadata.R b/fetchmetadata.R
new file mode 100644
index 0000000..c9bfdaf
--- /dev/null
+++ b/fetchmetadata.R
@@ -0,0 +1,8 @@
+metadata <- read.delim("metadata.tsv", header=FALSE)
+metadata_columns <- head(metadata, 2)
+metadata_columns <- as.data.frame(t(metadata_columns))
+colnames(metadata_columns) <- c("Name", "DataType")
+numcols <- subset(metadata_columns, DataType == "numeric")[,1]
+catcols <- subset(metadata_columns, DataType == "categorical")[,1]
+write.table(numcols,"numcols.txt",row.names = FALSE, col.names = FALSE, quote = FALSE)
+write.table(catcols,"catcols.txt",row.names = FALSE, col.names = FALSE, quote = FALSE)
\ No newline at end of file
diff --git a/main.sh b/main.sh
new file mode 100644
index 0000000..3bc9166
--- /dev/null
+++ b/main.sh
@@ -0,0 +1,169 @@
+#!/bin/bash
+#SBATCH --account=cowusda2016
+#SBATCH --cpus-per-task=16
+#SBATCH --mem=32G
+#SBATCH --time="3-00:00:00"
+
+# DEPENDENCIES:
+# fastq-to-taxonomy.sh
+# manipulatefeaturetable.R
+# fetchmetadata.R
+
+# Modules to load
+module load swset
+module load gcc
+module load parallel
+module load miniconda3
+module load metaxa2
+module load r
+
+# Generate Level-7 taxonomy summaries for all samples using paired-end
+# read FASTQ files in Metaxa2
+# This step can be executed in parallel for all the files, but since
+# Metaxa2 uses 4 cpus, we need to make sure that each instance has
+# enough cpus to run
+echo "--^-- X: Reading FASTQ sequences..."
+find . -maxdepth 1 -name "*R1_001.fastq.gz" | \
+parallel -j 4 ./fastq-to-taxonomy.sh
+echo "--^-- X: Reading FASTQ sequences...Done!"
+
+# Compile those pesky individual taxonomic tables into a single
+# OTU feature table
+echo "--^-- X: Compiling feature table..."
+metaxa2_dc -i *.level_7.txt -o metaxa-feature-table.tsv
+echo "--^-- X: Compiling feature table...Done!"
+
+
+# Rearrange the feature table to something QIIME likes a little bit better
+echo "--^-- X: Rearranging feature table..."
+Rscript ./manipulatefeaturetable.R
+echo "--^-- X: Rearranging feature table...Done!"
+
+
+# Pull the column names from the metadata table
+echo "--^-- X: Finding metadata columns..."
+Rscript ./fetchmetadata.R
+echo "--^-- X: Finding metadata columns...Done!"
+
+# Start up QIIME
+source activate qiime2
+
+# Our minimum taxa count is 11123 - this will be needed for rarefaction
+RAREFACTION=11123
+
+# Convert the feature table into BIOM format
+echo "--^-- X: Importing data..."
+biom convert \
+-i feature-table.tsv \
+-o feature-table.hdf5.biom \
+--table-type="OTU table" \
+--to-hdf5 \
+--process-obs-metadata taxonomy
+
+# Now convert the BIOM table into QIIME format (good grief!)
+qiime tools import \
+ --input-path feature-table.hdf5.biom \
+ --type 'FeatureTable[Frequency]' \
+ --input-format 'BIOMV210Format' \
+ --output-path feature-table.qza
+ 
+qiime tools import \
+ --input-path feature-table.hdf5.biom \
+ --output-path taxonomy.qza \
+ --type 'FeatureData[Taxonomy]' \
+ --input-format 'BIOMV210Format' 
+echo "--^-- X: Importing data...Done!"
+
+# We will need to run core-metrics to generate information further down
+echo "--^-- X: Running core-metrics..."
+rm -r "core-metrics-results"
+# This is one of the few QIIME commands that can use multithreading
+qiime diversity core-metrics \
+ --i-table feature-table.qza \
+ --p-sampling-depth "$RAREFACTION" \
+ --m-metadata-file metadata.tsv \
+ --p-n-jobs 16 \
+ --output-dir core-metrics-results \
+ --verbose
+echo "--^-- X: Running core-metrics...Done!"
+
+# Clean out the visualizations, or else QIIME will throw a fit
+rm -r "visualizations"
+mkdir visualizations
+
+# Create a pretty barplot as a reward for all that effort
+echo "--^-- X: Generating barplot..."
+qiime taxa barplot \
+ --i-table feature-table.qza \
+ --i-taxonomy taxonomy.qza \
+ --m-metadata-file metadata.tsv \
+ --o-visualization visualizations/barplot.qzv
+echo "--^-- X: Generating barplot...Done!"
+
+ # Run alpha-diversity group significance: this will automatically include all the columns
+ # Evenness first
+echo "--^-- X: Finding alpha-group-significance..."
+qiime diversity alpha-group-significance \
+ --i-alpha-diversity core-metrics-results/evenness_vector.qza \
+ --m-metadata-file metadata.tsv \
+ --o-visualization visualizations/evenness-group-significance.qzv \
+ --verbose
+  
+# Now richness
+qiime diversity alpha-group-significance \
+ --i-alpha-diversity core-metrics-results/shannon_vector.qza \
+ --m-metadata-file metadata.tsv \
+ --o-visualization visualizations/shannon-group-significance.qzv \
+ --verbose
+echo "--^-- X: Finding alpha-group-significance...Done!"
+  
+# Now let's find the correlation between alpha-diversity and the numeric traits
+echo "--^-- X: Finding alpha-correlations..."
+qiime diversity alpha-correlation \
+ --i-alpha-diversity core-metrics-results/evenness_vector.qza \
+ --m-metadata-file metadata.tsv \
+ --p-method pearson \
+ --o-visualization visualizations/evenness-correlation.qzv \
+ --verbose
+ 
+qiime diversity alpha-correlation \
+ --i-alpha-diversity core-metrics-results/shannon_vector.qza \
+ --m-metadata-file metadata.tsv \
+ --p-method pearson \
+ --o-visualization visualizations/shannon-correlation.qzv \
+ --verbose
+echo "--^-- X: Finding alpha-correlations...Done!"
+
+# Now for the tricky part: beta-diversity
+echo "--^-- X: Checking entries for beta-significance..."
+# QIIME only uses one processor for these, so we can parallelize this step
+parallel -i -a catcols.txt \
+    qiime diversity beta-group-significance \
+	  --i-distance-matrix core-metrics-results/bray_curtis_distance_matrix.qza \
+	  --m-metadata-file metadata.tsv \
+	  --m-metadata-column {} \
+	  --p-pairwise \
+	  --o-visualization "visualizations/bray-curtis-{}-significance.qzv" \
+	  --verbose
+echo "--^-- X: Checking entries for beta-significance...Done!"
+
+echo "--^-- X: Performing ANCOM..."
+# We will try to use ancom on the full dataset, although it might kill us
+# Extract pseudocount
+qiime composition add-pseudocount \
+ --i-table feature-table.qza \
+ --o-composition-table composition-table.qza
+
+# Run ancom for CowID, Age, TrmtGroup
+# Once again, QIIME only uses one processor (even though this
+# is a HUGE task), so we should parallelize it for speed
+parallel -i -a catcols.txt \
+	qiime composition ancom \
+     --i-table composition-table.qza \
+	 --m-metadata-file metadata.tsv \
+     --m-metadata-column {} \
+	 --o-visualization "visualizations/ancom-{}.qzv" \
+	 --verbose
+echo "--^-- X: Performing ANCOM...Done!"	 
+
+echo "All Done!"
\ No newline at end of file
diff --git a/manipulatefeaturetable.R b/manipulatefeaturetable.R
new file mode 100644
index 0000000..64c09c0
--- /dev/null
+++ b/manipulatefeaturetable.R
@@ -0,0 +1,48 @@
+# Read the inital feature table in
+feature_table <- read.table("metaxa-feature-table.tsv",
+							header = TRUE,
+                            sep = "\t",
+                            quote = "",
+                            strip.white = TRUE,
+							check.names = FALSE)
+
+# Get the dimensions of the table
+numSamples <- ncol(feature_table) - 1
+numFeatures <- nrow(feature_table)
+
+# Rearrange taxonomy so biom and QIIME like it
+feature_table$taxonomy <- feature_table$Taxa
+feature_table$Taxa <- NULL
+
+# Add unique SampleIds for QIIME to work with
+ids <- vector(length = numFeatures)
+for (i in 1:numFeatures){
+  # ARCC won't let us install packages, so we have to deal
+  # with generating UUIDs ourselves using the code from:
+  # https://stackoverflow.com/a/10493590/3922521
+  baseuuid <- paste(sample(c(letters[1:6],0:9),30,replace=TRUE),collapse="")
+  ids[i] <- paste(
+    substr(baseuuid,1,8),
+    "-",
+    substr(baseuuid,9,12),
+    "-",
+    "4",
+    substr(baseuuid,13,15),
+    "-",
+    sample(c("8","9","a","b"),1),
+    substr(baseuuid,16,18),
+    "-",
+    substr(baseuuid,19,30),
+    sep="",
+    collapse=""
+  )
+}
+feature_table$'#SampleId' <- ids
+feature_table <- feature_table[c(numSamples+2, 1:(numSamples+1))]
+
+# Write the file out
+write.table(feature_table,
+            file="feature-table.tsv",
+            sep = "\t",
+            quote = FALSE,
+            row.names = FALSE)
\ No newline at end of file