Mirror of https://github.com/MillironX/cowcalf-rumen-metagenomic-pipeline.git
Synced 2024-12-03 10:29:55 +00:00
Initial commit
Commit 1f9512b70f
4 changed files with 251 additions and 0 deletions
fastq-to-taxonomy.sh (new file, 26 lines)
@@ -0,0 +1,26 @@
#!/bin/bash
#SBATCH --account=cowusda2016
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=4
#SBATCH --mem=16000
#SBATCH --time="2-00:00:00"

module restore system
module load metaxa2
module load blast-legacy

# Derive the R2 filename and the sample name from the R1 filename
R1filename="$1"
R2filename="${R1filename/R1/R2}"
Samplename="${R1filename/_R1_001.fastq.gz/}"
Samplename="${Samplename/\.\//}"

# Extract rRNA to files
metaxa2 -1 "$R1filename" -2 "$R2filename" -o "$Samplename" -f q -cpu 4 \
    --summary F --graphical F --fasta F --taxonomy T

# Switch modes to extract taxonomy
INPUT2="${Samplename}.taxonomy.txt"

# Compile the taxonomy
metaxa2_ttt -i "$INPUT2" -o "$Samplename" -cpu 4 -m 7 -n 7 --summary F
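For reference, a minimal sketch of how the parameter expansions above resolve the script's names; the sample filename here is hypothetical, not from the repository:

# Hypothetical invocation with an Illumina-style paired FASTQ name
./fastq-to-taxonomy.sh ./Sample-01_S1_R1_001.fastq.gz
# The expansions then yield:
#   R2filename = ./Sample-01_S1_R2_001.fastq.gz
#   Samplename = Sample-01_S1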
fetchmetadata.R (new file, 8 lines)
@@ -0,0 +1,8 @@
# The first two rows of the metadata file hold the column names and their
# declared types (numeric/categorical), so read without treating any row
# as a header
metadata <- read.delim("metadata.tsv", header = FALSE)
metadata_columns <- head(metadata, 2)

# Transpose so each metadata column becomes a row of (Name, DataType)
metadata_columns <- as.data.frame(t(metadata_columns))
colnames(metadata_columns) <- c("Name", "DataType")

# Split the column names by type into plain-text lists for the shell pipeline
numcols <- subset(metadata_columns, DataType == "numeric")[, 1]
catcols <- subset(metadata_columns, DataType == "categorical")[, 1]
write.table(numcols, "numcols.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
write.table(catcols, "catcols.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
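To make the two header rows concrete, here is a hypothetical metadata.tsv in the layout this script assumes (tab-separated, with a QIIME 2-style "#q2:types" row; the column names are borrowed from a comment in main.sh, the values are invented):

# Sketch only: writes a tiny example metadata.tsv
cat > metadata.tsv <<'EOF'
#SampleID	CowID	Age	TrmtGroup
#q2:types	categorical	numeric	categorical
Sample-01	C001	14	Control
Sample-02	C002	21	Treatment
EOF
# fetchmetadata.R would then write "Age" to numcols.txt
# and "CowID" and "TrmtGroup" to catcols.txt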
main.sh (new file, 169 lines)
@@ -0,0 +1,169 @@
#!/bin/bash
#SBATCH --account=cowusda2016
#SBATCH --cpus-per-task=16
#SBATCH --mem=32G
#SBATCH --time="3-00:00:00"

# DEPENDENCIES:
# fastq-to-taxonomy.sh
# manipulatefeaturetable.R
# fetchmetadata.R

# Modules to load
module load swset
module load gcc
module load parallel
module load miniconda3
module load metaxa2
module load r

# Generate Level-7 taxonomy summaries for all samples using paired-end
# read FASTQ files in Metaxa2
# This step can be executed in parallel for all the files, but since
# Metaxa2 uses 4 CPUs, we need to make sure that each instance has
# enough CPUs to run (4 jobs x 4 CPUs matches the 16 CPUs requested above)
echo "--^-- X: Reading FASTQ sequences..."
find . -maxdepth 1 -name "*R1_001.fastq.gz" | \
    parallel -j 4 ./fastq-to-taxonomy.sh
echo "--^-- X: Reading FASTQ sequences...Done!"

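To illustrate the fan-out, with two hypothetical input files the pipeline above behaves like this (filenames invented):

# find emits:  ./A_R1_001.fastq.gz  ./B_R1_001.fastq.gz
# parallel then launches, up to 4 at a time:
#   ./fastq-to-taxonomy.sh ./A_R1_001.fastq.gz
#   ./fastq-to-taxonomy.sh ./B_R1_001.fastq.gz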
# Compile those pesky individual taxonomic tables into a single
# OTU feature table
echo "--^-- X: Compiling feature table..."
metaxa2_dc -i *.level_7.txt -o metaxa-feature-table.tsv
echo "--^-- X: Compiling feature table...Done!"

# Rearrange the feature table to something QIIME likes a little bit better
echo "--^-- X: Rearranging feature table..."
Rscript ./manipulatefeaturetable.R
echo "--^-- X: Rearranging feature table...Done!"

# Pull the column names from the metadata table
echo "--^-- X: Finding metadata columns..."
Rscript ./fetchmetadata.R
echo "--^-- X: Finding metadata columns...Done!"

# Start up QIIME
source activate qiime2

# Our minimum taxa count is 11123 - this will be needed for rarefaction
RAREFACTION=11123

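The 11123 figure is presumably the smallest per-sample read total in the compiled table; a sketch (not part of the pipeline) of how it could be recomputed with awk, assuming feature-table.tsv has the ID column first, per-sample counts in between, and taxonomy last:

# Sum each sample column over all feature rows, then print the minimum
awk -F'\t' 'NR > 1 { for (i = 2; i < NF; i++) sum[i] += $i }
    END { min = -1; for (i in sum) if (min < 0 || sum[i] < min) min = sum[i]; print min }' \
    feature-table.tsv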
# Convert the feature table into BIOM format
echo "--^-- X: Importing data..."
biom convert \
    -i feature-table.tsv \
    -o feature-table.hdf5.biom \
    --table-type="OTU table" \
    --to-hdf5 \
    --process-obs-metadata taxonomy

# Now convert the BIOM table into QIIME format (good grief!)
qiime tools import \
    --input-path feature-table.hdf5.biom \
    --type 'FeatureTable[Frequency]' \
    --input-format 'BIOMV210Format' \
    --output-path feature-table.qza

qiime tools import \
    --input-path feature-table.hdf5.biom \
    --output-path taxonomy.qza \
    --type 'FeatureData[Taxonomy]' \
    --input-format 'BIOMV210Format'
echo "--^-- X: Importing data...Done!"

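If the double import ever misbehaves, a quick way to check what actually got imported (an optional aside, not in the original script):

# Peek at the artifacts' semantic types before the diversity steps
qiime tools peek feature-table.qza   # expect: FeatureTable[Frequency]
qiime tools peek taxonomy.qza        # expect: FeatureData[Taxonomy]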
# We will need to run core-metrics to generate information further down
echo "--^-- X: Running core-metrics..."
rm -rf "core-metrics-results"  # -f so a missing directory on a fresh run isn't an error
# This is one of the few QIIME commands that can use multithreading
qiime diversity core-metrics \
    --i-table feature-table.qza \
    --p-sampling-depth "$RAREFACTION" \
    --m-metadata-file metadata.tsv \
    --p-n-jobs 16 \
    --output-dir core-metrics-results \
    --verbose
echo "--^-- X: Running core-metrics...Done!"

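For orientation, the steps below consume these core-metrics outputs (paths as referenced later in the script):

# Artifacts read by the commands that follow:
#   core-metrics-results/evenness_vector.qza
#   core-metrics-results/shannon_vector.qza
#   core-metrics-results/bray_curtis_distance_matrix.qza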
# Clean out the visualizations, or else QIIME will throw a fit
rm -rf "visualizations"  # -f so a fresh run doesn't error out
mkdir visualizations

# Create a pretty barplot as a reward for all that effort
echo "--^-- X: Generating barplot..."
qiime taxa barplot \
    --i-table feature-table.qza \
    --i-taxonomy taxonomy.qza \
    --m-metadata-file metadata.tsv \
    --o-visualization visualizations/barplot.qzv
echo "--^-- X: Generating barplot...Done!"

# Run alpha-diversity group significance: this will automatically include
# all the metadata columns
# Evenness first
echo "--^-- X: Finding alpha-group-significance..."
qiime diversity alpha-group-significance \
    --i-alpha-diversity core-metrics-results/evenness_vector.qza \
    --m-metadata-file metadata.tsv \
    --o-visualization visualizations/evenness-group-significance.qzv \
    --verbose

# Now Shannon diversity (which folds in richness)
qiime diversity alpha-group-significance \
    --i-alpha-diversity core-metrics-results/shannon_vector.qza \
    --m-metadata-file metadata.tsv \
    --o-visualization visualizations/shannon-group-significance.qzv \
    --verbose
echo "--^-- X: Finding alpha-group-significance...Done!"

# Now let's find the correlation between alpha-diversity and the numeric traits
echo "--^-- X: Finding alpha-correlations..."
qiime diversity alpha-correlation \
    --i-alpha-diversity core-metrics-results/evenness_vector.qza \
    --m-metadata-file metadata.tsv \
    --p-method pearson \
    --o-visualization visualizations/evenness-correlation.qzv \
    --verbose

qiime diversity alpha-correlation \
    --i-alpha-diversity core-metrics-results/shannon_vector.qza \
    --m-metadata-file metadata.tsv \
    --p-method pearson \
    --o-visualization visualizations/shannon-correlation.qzv \
    --verbose
echo "--^-- X: Finding alpha-correlations...Done!"

# Now for the tricky part: beta-diversity
echo "--^-- X: Checking entries for beta-significance..."
# QIIME only uses one processor for these, so we can parallelize this step;
# parallel substitutes each categorical column name from catcols.txt for {}
parallel -i -a catcols.txt \
    qiime diversity beta-group-significance \
    --i-distance-matrix core-metrics-results/bray_curtis_distance_matrix.qza \
    --m-metadata-file metadata.tsv \
    --m-metadata-column {} \
    --p-pairwise \
    --o-visualization "visualizations/bray-curtis-{}-significance.qzv" \
    --verbose
echo "--^-- X: Checking entries for beta-significance...Done!"

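Concretely, one of those parallel jobs expands to roughly this (taking TrmtGroup, a column name mentioned in the ANCOM comment below, as the example):

qiime diversity beta-group-significance \
    --i-distance-matrix core-metrics-results/bray_curtis_distance_matrix.qza \
    --m-metadata-file metadata.tsv \
    --m-metadata-column TrmtGroup \
    --p-pairwise \
    --o-visualization "visualizations/bray-curtis-TrmtGroup-significance.qzv" \
    --verbose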
echo "--^-- X: Performing ANCOM..."
|
||||
# We will try to use ancom on the full dataset, although it might kill us
|
||||
# Extract pseudocount
|
||||
qiime composition add-pseudocount \
|
||||
--i-table feature-table.qza \
|
||||
--o-composition-table composition-table.qza
|
||||
|
||||
# Run ancom for CowID, Age, TrmtGroup
# Once again, QIIME only uses one processor (even though this
# is a HUGE task), so we should parallelize it for speed
parallel -i -a catcols.txt \
    qiime composition ancom \
    --i-table composition-table.qza \
    --m-metadata-file metadata.tsv \
    --m-metadata-column {} \
    --o-visualization "visualizations/ancom-{}.qzv" \
    --verbose
echo "--^-- X: Performing ANCOM...Done!"

echo "All Done!"

manipulatefeaturetable.R (new file, 48 lines)
@@ -0,0 +1,48 @@
# Read the initial feature table in
feature_table <- read.table("metaxa-feature-table.tsv",
                            header = TRUE,
                            sep = "\t",
                            quote = "",
                            strip.white = TRUE,
                            check.names = FALSE)

# Get the dimensions of the table
numSamples <- ncol(feature_table) - 1
numFeatures <- nrow(feature_table)

# Rearrange taxonomy so biom and QIIME like it: move the taxonomy string
# into a column named "taxonomy" at the end of the table
feature_table$taxonomy <- feature_table$Taxa
feature_table$Taxa <- NULL

# Add unique SampleIds for QIIME to work with
ids <- vector(length = numFeatures)
for (i in 1:numFeatures){
  # ARCC won't let us install packages, so we have to deal
  # with generating UUIDs ourselves using the code from:
  # https://stackoverflow.com/a/10493590/3922521
  baseuuid <- paste(sample(c(letters[1:6], 0:9), 30, replace = TRUE), collapse = "")
  ids[i] <- paste(
    substr(baseuuid, 1, 8),
    "-",
    substr(baseuuid, 9, 12),
    "-",
    "4",
    substr(baseuuid, 13, 15),
    "-",
    sample(c("8", "9", "a", "b"), 1),
    substr(baseuuid, 16, 18),
    "-",
    substr(baseuuid, 19, 30),
    sep = "",
    collapse = ""
  )
}
feature_table$'#SampleId' <- ids

# Move the new ID column to the front: column (numSamples + 2) is the ID,
# columns 1:(numSamples + 1) are the sample counts plus taxonomy
feature_table <- feature_table[c(numSamples + 2, 1:(numSamples + 1))]

# Write the file out
write.table(feature_table,
            file = "feature-table.tsv",
            sep = "\t",
            quote = FALSE,
            row.names = FALSE)
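For context, a sketch of the column rearrangement this script performs (hypothetical two-sample table; names invented):

# metaxa-feature-table.tsv (from metaxa2_dc):
#   Taxa <TAB> Sample-01 <TAB> Sample-02
# feature-table.tsv (after this script):
#   #SampleId <TAB> Sample-01 <TAB> Sample-02 <TAB> taxonomy
# where #SampleId holds a freshly generated v4-style UUID per row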