From 9a91fed82f2e92be2eaf69b2cad73f252c4e3b3f Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 11 Apr 2025 09:27:43 -0500 Subject: [PATCH 1/4] feat: Bootstrap basic package --- .gitignore | 28 ++++++++++++++++++++++- Project.toml | 17 ++++++++++++++ src/Cowcalf_rumen_metagenomic_pipeline.jl | 8 +++++++ 3 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 Project.toml create mode 100644 src/Cowcalf_rumen_metagenomic_pipeline.jl diff --git a/.gitignore b/.gitignore index 3b1733d..f9c1b1b 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,30 @@ !.vscode/settings.json !.vscode/tasks.json !.vscode/launch.json -!.vscode/extensions.json \ No newline at end of file +!.vscode/extensions.json + +### Julia gitignore ### +## Files generated by invoking Julia with --code-coverage +*.jl.cov +*.jl.*.cov + +# Files generated by invoking Julia with --track-allocation +*.jl.mem + +# System-specific files and directories generated by the BinaryProvider and BinDeps packages +# They contain absolute paths specific to the host computer, and so should not be committed +deps/deps.jl +deps/build.log +deps/downloads/ +deps/usr/ +deps/src/ + +# Build artifacts for creating documentation generated by the Documenter package +docs/build/ +docs/site/ + +# File generated by Pkg, the package manager, based on a corresponding Project.toml +# It records a fixed state of all packages used by the project. As such, it should not be +# committed for packages, but should be committed for applications that require a static +# environment. +Manifest.toml diff --git a/Project.toml b/Project.toml new file mode 100644 index 0000000..d25e064 --- /dev/null +++ b/Project.toml @@ -0,0 +1,17 @@ +name = "Cowcalf_rumen_metagenomic_pipeline" +uuid = "7d0d08ee-6932-474e-8e49-cd3f4679ce2d" +authors = ["Thomas A. 
Christensen II <25492070+MillironX@users.noreply.github.com> and contributors"] +version = "0.1.0" + +[deps] +Cowsay = "b6370f49-8ad1-4651-ad9e-3639b35da0e9" + +[compat] +Cowsay = "1.0.0" +julia = "1.11" + +[extras] +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[targets] +test = ["Test"] diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl new file mode 100644 index 0000000..2aac62f --- /dev/null +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -0,0 +1,8 @@ +module Cowcalf_rumen_metagenomic_pipeline + using Cowsay: cowsay + + function main() + cowsay("Hello from Cowcalf_rumen_metagenomic_pipeline") + return 0 + end #function +end #module From 81f49a8ad8fa9d125a7231eca54794c7ebd67198 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 11 Apr 2025 21:01:45 -0500 Subject: [PATCH 2/4] feat: Add Conda environment installer --- Project.toml | 8 ++++++++ src/Cowcalf_rumen_metagenomic_pipeline.jl | 24 +++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/Project.toml b/Project.toml index d25e064..9803d24 100644 --- a/Project.toml +++ b/Project.toml @@ -4,10 +4,18 @@ authors = ["Thomas A. 
Christensen II <25492070+MillironX@users.noreply.github.co version = "0.1.0" [deps] +Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d" Cowsay = "b6370f49-8ad1-4651-ad9e-3639b35da0e9" +HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" +URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" +YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6" [compat] +Conda = "1.10.2" Cowsay = "1.0.0" +HTTP = "1.10.16" +URIs = "1.5.2" +YAML = "0.4.13" julia = "1.11" [extras] diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index 2aac62f..6dc43fa 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -1,5 +1,29 @@ module Cowcalf_rumen_metagenomic_pipeline + using Conda: Conda, runconda using Cowsay: cowsay + using HTTP: HTTP + using URIs: URI + using YAML: YAML + + _fetch_yaml_contents(yaml::AbstractString) = YAML.load_file(yaml) + _fetch_yaml_contents(yaml::URI) = YAML.load(String(HTTP.get(yaml).body)) + + function setup_remote_conda_environment(yaml::Union{AbstractString,URI}, env_name::Symbol) + ENV["CONDA_JL_USE_MINIFORGE"] = "1" + + # Install x86 packages via Rosetta2 on MacOS + if Sys.isapple() + ENV["CONDA_SUBDIR"] = "osx-64" + runconda(`config --env --set subdir osx-64`, env_name) + end #if + + conda_definition = _fetch_yaml_contents(yaml) + + map(c -> Conda.add_channel(c, env_name), conda_definition["channels"]) + Conda.add(conda_definition["dependencies"], env_name) + + return env_name + end #function function main() cowsay("Hello from Cowcalf_rumen_metagenomic_pipeline") From 9df8163123e79c39a23ef78571fa6226de961244 Mon Sep 17 00:00:00 2001 From: "Thomas A. 
Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 11 Apr 2025 21:05:13 -0500 Subject: [PATCH 3/4] style: Add formatter requirements --- .JuliaFormatter.toml | 1 + src/Cowcalf_rumen_metagenomic_pipeline.jl | 48 ++++++++++++----------- 2 files changed, 26 insertions(+), 23 deletions(-) create mode 100644 .JuliaFormatter.toml diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 0000000..1e72b50 --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1 @@ +style="blue" diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index 6dc43fa..7218120 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -1,32 +1,34 @@ module Cowcalf_rumen_metagenomic_pipeline - using Conda: Conda, runconda - using Cowsay: cowsay - using HTTP: HTTP - using URIs: URI - using YAML: YAML - _fetch_yaml_contents(yaml::AbstractString) = YAML.load_file(yaml) - _fetch_yaml_contents(yaml::URI) = YAML.load(String(HTTP.get(yaml).body)) +using Conda: Conda, runconda +using Cowsay: cowsay +using HTTP: HTTP +using URIs: URI +using YAML: YAML - function setup_remote_conda_environment(yaml::Union{AbstractString,URI}, env_name::Symbol) - ENV["CONDA_JL_USE_MINIFORGE"] = "1" +_fetch_yaml_contents(yaml::AbstractString) = YAML.load_file(yaml) +_fetch_yaml_contents(yaml::URI) = YAML.load(String(HTTP.get(yaml).body)) - # Install x86 packages via Rosetta2 on MacOS - if Sys.isapple() - ENV["CONDA_SUBDIR"] = "osx-64" - runconda(`config --env --set subdir osx-64`, env_name) - end #if +function setup_remote_conda_environment(yaml::Union{AbstractString,URI}, env_name::Symbol) + ENV["CONDA_JL_USE_MINIFORGE"] = "1" - conda_definition = _fetch_yaml_contents(yaml) + # Install x86 packages via Rosetta2 on MacOS + if Sys.isapple() + ENV["CONDA_SUBDIR"] = "osx-64" + runconda(`config --env --set subdir osx-64`, env_name) + end #if - map(c -> Conda.add_channel(c, env_name), 
conda_definition["channels"]) - Conda.add(conda_definition["dependencies"], env_name) + conda_definition = _fetch_yaml_contents(yaml) - return env_name - end #function + map(c -> Conda.add_channel(c, env_name), conda_definition["channels"]) + Conda.add(conda_definition["dependencies"], env_name) + + return env_name +end #function + +function main() + cowsay("Hello from Cowcalf_rumen_metagenomic_pipeline") + return 0 +end #function - function main() - cowsay("Hello from Cowcalf_rumen_metagenomic_pipeline") - return 0 - end #function end #module From 3130e92395724b9115854b3019678dad20615d4d Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Mon, 14 Apr 2025 11:12:41 -0500 Subject: [PATCH 4/4] feat: Add metaxa steps --- Project.toml | 10 +++ conda_envs/metaxa2.yml | 8 +++ src/Cowcalf_rumen_metagenomic_pipeline.jl | 58 ++++++++++++++++- src/Metaxa.jl | 77 +++++++++++++++++++++++ src/ProcessHelper.jl | 36 +++++++++++ 5 files changed, 188 insertions(+), 1 deletion(-) create mode 100644 conda_envs/metaxa2.yml create mode 100644 src/Metaxa.jl create mode 100644 src/ProcessHelper.jl diff --git a/Project.toml b/Project.toml index 9803d24..ed6c11c 100644 --- a/Project.toml +++ b/Project.toml @@ -4,15 +4,25 @@ authors = ["Thomas A. 
Christensen II <25492070+MillironX@users.noreply.github.co version = "0.1.0" [deps] +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d" Cowsay = "b6370f49-8ad1-4651-ad9e-3639b35da0e9" +Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +Glob = "c27321d9-0574-5035-807b-f59d2c89b15c" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6" [compat] +CSV = "0.10.15" Conda = "1.10.2" Cowsay = "1.0.0" +Dagger = "0.18.14" +DataFrames = "1.7.0" +Distributed = "1.11.0" +Glob = "1.3.1" HTTP = "1.10.16" URIs = "1.5.2" YAML = "0.4.13" diff --git a/conda_envs/metaxa2.yml b/conda_envs/metaxa2.yml new file mode 100644 index 0000000..4a27d02 --- /dev/null +++ b/conda_envs/metaxa2.yml @@ -0,0 +1,8 @@ +channels: + - bioconda + - conda-forge +dependencies: + - metaxa=2.2 + - blast-legacy=2.2.26 + - hmmer=3.1 + - mafft=7.525 diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index 7218120..eba501a 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -2,10 +2,19 @@ module Cowcalf_rumen_metagenomic_pipeline using Conda: Conda, runconda using Cowsay: cowsay +using CSV: CSV +using DataFrames: DataFrame, rename! 
+using Distributed: pmap, @everywhere
+using Glob: GlobMatch, glob
 using HTTP: HTTP
 using URIs: URI
 using YAML: YAML
 
+export main
+
+include("Metaxa.jl")
+using .Metaxa: Metaxa
+
 _fetch_yaml_contents(yaml::AbstractString) = YAML.load_file(yaml)
 _fetch_yaml_contents(yaml::URI) = YAML.load(String(HTTP.get(yaml).body))
 
@@ -26,7 +35,86 @@ function setup_remote_conda_environment(yaml::Union{AbstractString,URI}, env_nam
     return env_name
 end #function
 
-function main()
+"""
+    import_metadata(metadata_tsv::AbstractString) -> DataFrame
+
+Read the sample metadata table at `metadata_tsv` and rename its first column to
+`:sample_name`.
+"""
+function import_metadata(metadata_tsv::AbstractString)
+    df = DataFrame(CSV.File(metadata_tsv))
+    rename!(df, 1 => :sample_name)
+    return df
+end #function
+
+"""
+    sample_files(samplenames) -> Vector of (samplename, (fastq1, fastq2))
+
+Find the paired FASTQ files of each sample, searching from the current working directory.
+
+For every sample, the first match of `<samplename>*1*.fastq.gz` and the first match of
+`<samplename>*2*.fastq.gz` are returned as absolute paths. Throws an `ArgumentError` when
+one of the patterns matches no file.
+"""
+function sample_files(samplenames::AbstractVector{<:AbstractString})
+    function _first_match(pattern::AbstractString)
+        # Use explicit GlobMatch constructor b/c we need to interpolate values
+        matches = glob(GlobMatch(pattern))
+        isempty(matches) &&
+            throw(ArgumentError("no file matching '$pattern' found in $(pwd())"))
+        return abspath(first(matches))
+    end #function
+
+    function _s(samplename::AbstractString)
+        return (
+            samplename,
+            (
+                _first_match("$(samplename)*1*.fastq.gz"),
+                _first_match("$(samplename)*2*.fastq.gz"),
+            ),
+        )
+    end #function
+
+    return map(_s, samplenames)
+end #function
+
+# Command-line entry point: `ARGS` must end with the path of the sample metadata table.
+# Returns the process exit code (0 on success, 1 when that argument is missing).
+function (@main)(ARGS)
+    if isempty(ARGS)
+        @error "Missing argument: path to the sample metadata table"
+        return 1
+    end #if
+    # Read the last argument without mutating the caller's ARGS
+    metadata_file = last(ARGS)
+
+    setup_remote_conda_environment(
+        URI(
+            "https://data.qiime2.org/distro/metagenome/qiime2-metagenome-2024.10-py310-osx-conda.yml",
+        ),
+        :qiime2,
+    )
+    setup_remote_conda_environment(
+        joinpath(@__DIR__, "..", "conda_envs", "metaxa2.yml"),
+        :metaxa2,
+    )
+
+    metadata = import_metadata(metadata_file)
+    fastq_files = sample_files(metadata[!, :sample_name])
+
+    # Load Metaxa.jl on every process before the `pmap` below
+    @eval begin
+        @everywhere begin
+            include(joinpath(@__DIR__, "Metaxa.jl"))
+            using .Metaxa: Metaxa
+        end #@everywhere
+    end #@eval
+    taxonomy_files = pmap(x -> Metaxa.taxonomy(first(x), last(x)), fastq_files)
+    feature_table = Metaxa.data_collector(taxonomy_files...)
+    # `cp` needs the destination *file* path; the bare directory `pwd()` already exists,
+    # which makes `cp` throw
+    cp(feature_table, joinpath(pwd(), basename(feature_table)); force = true)
+
     cowsay("Hello from Cowcalf_rumen_metagenomic_pipeline")
     return 0
 end #function
diff --git a/src/Metaxa.jl b/src/Metaxa.jl
new file mode 100644
index 0000000..4b93d19
--- /dev/null
+++ b/src/Metaxa.jl
@@ -0,0 +1,101 @@
+module Metaxa
+
+using Conda: runconda
+
+include("ProcessHelper.jl")
+
+using .ProcessHelper: exec_files_in_temp_dir, exec_in_temp_dir
+
+export taxonomy
+
+# Run `metaxa2` on a read pair in the current directory and return the absolute path of
+# the `<samplename>.taxonomy.txt` file it is expected to write.
+function _classifier(
+    samplename::AbstractString,
+    fastq1::AbstractString,
+    fastq2::AbstractString,
+)
+    runconda(
+        `run metaxa2 \
+            -1 $fastq1 \
+            -2 $fastq2 \
+            -o $samplename \
+            --format fastq \
+            --cpu 4 \
+            --summary F \
+            --graphical F \
+            --fasta F \
+            --taxonomy T
+        `,
+        :metaxa2,
+    )
+    ispath("$samplename.taxonomy.txt") ||
+        error("metaxa2 ran, but $samplename.taxonomy.txt was not found!")
+    return abspath("$samplename.taxonomy.txt")
+end #function
+
+# Run `metaxa2_ttt` on a taxonomy file in the current directory and return the absolute
+# path of the `<samplename>.level_7.txt` file it is expected to write.
+function _taxonomy_traversal(samplename::AbstractString, taxonomy::AbstractString)
+    runconda(
+        `run metaxa2_ttt \
+            -i $taxonomy \
+            -o $samplename \
+            -m 7 \
+            -n 7 \
+            --summary F
+        `,
+        :metaxa2,
+    )
+    ispath("$samplename.level_7.txt") ||
+        error("metaxa2 ran, but $samplename.level_7.txt was not found!")
+    return abspath("$samplename.level_7.txt")
+end #function
+
+"""
+    taxonomy(samplename, fastq) -> String
+
+Classify the read pair `fastq` of sample `samplename` with `metaxa2`, pass the result
+through `metaxa2_ttt`, and return the absolute path of the resulting
+`<samplename>.level_7.txt` file. Each step runs in its own temporary directory.
+"""
+function taxonomy(
+    samplename::AbstractString,
+    fastq::Tuple{<:AbstractString,<:AbstractString},
+)
+    taxonomy_file = exec_in_temp_dir(_classifier, samplename, fastq...)
+    level_7_taxonomy_file = exec_in_temp_dir(_taxonomy_traversal, samplename, taxonomy_file)
+    return level_7_taxonomy_file
+end #function
+
+# Run `metaxa2_dc` on the taxonomy files in the current directory and return the absolute
+# path of the `feature-table.tsv` file it is expected to write.
+function _dc(taxonomies::AbstractString...)
+    isempty(taxonomies) && throw(ArgumentError("at least one taxonomy file is required"))
+    # Interpolate the collection itself so that every path becomes its own argument; a
+    # `join`ed string would be passed to the program as one argument containing spaces.
+    # The `run` subcommand is used here exactly as in the other steps; without it the
+    # tool name itself would be handed to conda as a subcommand.
+    runconda(
+        `run metaxa2_dc \
+            -o feature-table.tsv \
+            $(collect(taxonomies))
+        `,
+        :metaxa2,
+    )
+    ispath("feature-table.tsv") ||
+        error("metaxa2 ran, but feature-table.tsv was not found!")
+    return abspath("feature-table.tsv")
+end #function
+
+"""
+    data_collector(taxonomies::AbstractString...) -> String
+
+Combine the taxonomy files `taxonomies` with `metaxa2_dc` in a temporary directory and
+return the absolute path of the resulting `feature-table.tsv`.
+"""
+function data_collector(taxonomies::AbstractString...)
+    return exec_files_in_temp_dir(_dc, taxonomies...)
+end #function
+
+end #module
diff --git a/src/ProcessHelper.jl b/src/ProcessHelper.jl
new file mode 100644
index 0000000..ca01370
--- /dev/null
+++ b/src/ProcessHelper.jl
@@ -0,0 +1,59 @@
+module ProcessHelper
+
+export sym_temp
+
+"""
+    sym_temp(files::AbstractString...) -> (tmp_dir, links)
+
+Create a new temporary directory `tmp_dir` and symbolically link each of `files` inside of
+that directory under its own basename. Returns a tuple of the directory path and a tuple
+of the paths to the links.
+
+The directory is created with `cleanup = false`, so it is not deleted when Julia exits.
+"""
+function sym_temp(files::AbstractString...)
+    tmp_dir = mktempdir(; cleanup = false)
+    @info "Creating temporary directory $tmp_dir"
+
+    function _symlink(file::AbstractString)
+        symlink_path = joinpath(tmp_dir, basename(file))
+        symlink(realpath(file), symlink_path)
+        @info "Symlinked $file to $symlink_path"
+        return symlink_path
+    end #function
+
+    return (tmp_dir, map(_symlink, files))
+end #function
+
+"""
+    exec_files_in_temp_dir(f::Function, files::AbstractString...)
+
+Symlink `files` into a new temporary directory, then call `f(links...)` with that
+directory as the working directory. Returns the value returned by `f`.
+"""
+function exec_files_in_temp_dir(f::Function, files::AbstractString...)
+    tmp_dir, tmp_files = sym_temp(files...)
+    return cd(() -> f(tmp_files...), tmp_dir)
+end #function
+
+"""
+    exec_in_temp_dir(f::Function, samplename::AbstractString, files::AbstractString...)
+
+Symlink `files` into a new temporary directory, then call `f(samplename, links...)` with
+that directory as the working directory. Returns the value returned by `f`.
+
+The first string argument is always taken as `samplename` and is never linked. Use
+`exec_files_in_temp_dir` when every string argument is a file.
+"""
+function exec_in_temp_dir(f::Function, samplename::AbstractString, files::AbstractString...)
+    tmp_dir, tmp_files = sym_temp(files...)
+    return cd(() -> f(samplename, tmp_files...), tmp_dir)
+end #function
+
+# Only reachable when no strings are passed: with one or more, dispatch prefers the more
+# specific `samplename` method above.
+function exec_in_temp_dir(f::Function, files::AbstractString...)
+    return exec_files_in_temp_dir(f, files...)
+end #function
+
+end #module