From 9a91fed82f2e92be2eaf69b2cad73f252c4e3b3f Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 11 Apr 2025 09:27:43 -0500 Subject: [PATCH 01/21] feat: Bootstrap basic package --- .gitignore | 28 ++++++++++++++++++++++- Project.toml | 17 ++++++++++++++ src/Cowcalf_rumen_metagenomic_pipeline.jl | 8 +++++++ 3 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 Project.toml create mode 100644 src/Cowcalf_rumen_metagenomic_pipeline.jl diff --git a/.gitignore b/.gitignore index 3b1733d..f9c1b1b 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,30 @@ !.vscode/settings.json !.vscode/tasks.json !.vscode/launch.json -!.vscode/extensions.json \ No newline at end of file +!.vscode/extensions.json + +### Julia gitignore ### +## Files generated by invoking Julia with --code-coverage +*.jl.cov +*.jl.*.cov + +# Files generated by invoking Julia with --track-allocation +*.jl.mem + +# System-specific files and directories generated by the BinaryProvider and BinDeps packages +# They contain absolute paths specific to the host computer, and so should not be committed +deps/deps.jl +deps/build.log +deps/downloads/ +deps/usr/ +deps/src/ + +# Build artifacts for creating documentation generated by the Documenter package +docs/build/ +docs/site/ + +# File generated by Pkg, the package manager, based on a corresponding Project.toml +# It records a fixed state of all packages used by the project. As such, it should not be +# committed for packages, but should be committed for applications that require a static +# environment. +Manifest.toml diff --git a/Project.toml b/Project.toml new file mode 100644 index 0000000..d25e064 --- /dev/null +++ b/Project.toml @@ -0,0 +1,17 @@ +name = "Cowcalf_rumen_metagenomic_pipeline" +uuid = "7d0d08ee-6932-474e-8e49-cd3f4679ce2d" +authors = ["Thomas A. Christensen II <25492070+MillironX@users.noreply.github.com> and contributors"] +version = "0.1.0" + +[deps] +Cowsay = "b6370f49-8ad1-4651-ad9e-3639b35da0e9" + +[compat] +Cowsay = "1.0.0" +julia = "1.11" + +[extras] +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[targets] +test = ["Test"] diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl new file mode 100644 index 0000000..2aac62f --- /dev/null +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -0,0 +1,8 @@ +module Cowcalf_rumen_metagenomic_pipeline + using Cowsay: cowsay + + function main() + cowsay("Hello from Cowcalf_rumen_metagenomic_pipeline") + return 0 + end #function +end #module From 81f49a8ad8fa9d125a7231eca54794c7ebd67198 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 11 Apr 2025 21:01:45 -0500 Subject: [PATCH 02/21] feat: Add Conda environment installer --- Project.toml | 8 ++++++++ src/Cowcalf_rumen_metagenomic_pipeline.jl | 24 +++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/Project.toml b/Project.toml index d25e064..9803d24 100644 --- a/Project.toml +++ b/Project.toml @@ -4,10 +4,18 @@ authors = ["Thomas A. Christensen II <25492070+MillironX@users.noreply.github.co version = "0.1.0" [deps] +Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d" Cowsay = "b6370f49-8ad1-4651-ad9e-3639b35da0e9" +HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" +URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" +YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6" [compat] +Conda = "1.10.2" Cowsay = "1.0.0" +HTTP = "1.10.16" +URIs = "1.5.2" +YAML = "0.4.13" julia = "1.11" [extras] diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index 2aac62f..6dc43fa 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -1,5 +1,29 @@ module Cowcalf_rumen_metagenomic_pipeline + using Conda: Conda, runconda using Cowsay: cowsay + using HTTP: HTTP + using URIs: URI + using YAML: YAML + + _fetch_yaml_contents(yaml::AbstractString) = YAML.load_file(yaml) + _fetch_yaml_contents(yaml::URI) = YAML.load(String(HTTP.get(yaml).body)) + + function setup_remote_conda_environment(yaml::Union{AbstractString,URI}, env_name::Symbol) + ENV["CONDA_JL_USE_MINIFORGE"] = "1" + + # Install x86 packages via Rosetta2 on MacOS + if Sys.isapple() + ENV["CONDA_SUBDIR"] = "osx-64" + runconda(`config --env --set subdir osx-64`, env_name) + end #if + + conda_definition = _fetch_yaml_contents(yaml) + + map(c -> Conda.add_channel(c, env_name), conda_definition["channels"]) + Conda.add(conda_definition["dependencies"], env_name) + + return env_name + end #function function main() cowsay("Hello from Cowcalf_rumen_metagenomic_pipeline") From 9df8163123e79c39a23ef78571fa6226de961244 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 11 Apr 2025 21:05:13 -0500 Subject: [PATCH 03/21] style: Add formatter requirements --- .JuliaFormatter.toml | 1 + src/Cowcalf_rumen_metagenomic_pipeline.jl | 48 ++++++++++++----------- 2 files changed, 26 insertions(+), 23 deletions(-) create mode 100644 .JuliaFormatter.toml diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 0000000..1e72b50 --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1 @@ +style="blue" diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index 6dc43fa..7218120 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -1,32 +1,34 @@ module Cowcalf_rumen_metagenomic_pipeline - using Conda: Conda, runconda - using Cowsay: cowsay - using HTTP: HTTP - using URIs: URI - using YAML: YAML - _fetch_yaml_contents(yaml::AbstractString) = YAML.load_file(yaml) - _fetch_yaml_contents(yaml::URI) = YAML.load(String(HTTP.get(yaml).body)) +using Conda: Conda, runconda +using Cowsay: cowsay +using HTTP: HTTP +using URIs: URI +using YAML: YAML - function setup_remote_conda_environment(yaml::Union{AbstractString,URI}, env_name::Symbol) - ENV["CONDA_JL_USE_MINIFORGE"] = "1" +_fetch_yaml_contents(yaml::AbstractString) = YAML.load_file(yaml) +_fetch_yaml_contents(yaml::URI) = YAML.load(String(HTTP.get(yaml).body)) - # Install x86 packages via Rosetta2 on MacOS - if Sys.isapple() - ENV["CONDA_SUBDIR"] = "osx-64" - runconda(`config --env --set subdir osx-64`, env_name) - end #if +function setup_remote_conda_environment(yaml::Union{AbstractString,URI}, env_name::Symbol) + ENV["CONDA_JL_USE_MINIFORGE"] = "1" - conda_definition = _fetch_yaml_contents(yaml) + # Install x86 packages via Rosetta2 on MacOS + if Sys.isapple() + ENV["CONDA_SUBDIR"] = "osx-64" + runconda(`config --env --set subdir osx-64`, env_name) + end #if - map(c -> Conda.add_channel(c, env_name), conda_definition["channels"]) - Conda.add(conda_definition["dependencies"], env_name) + conda_definition = _fetch_yaml_contents(yaml) - return env_name - end #function + map(c -> Conda.add_channel(c, env_name), conda_definition["channels"]) + Conda.add(conda_definition["dependencies"], env_name) + + return env_name +end #function + +function main() + cowsay("Hello from Cowcalf_rumen_metagenomic_pipeline") + return 0 +end #function - function main() - cowsay("Hello from Cowcalf_rumen_metagenomic_pipeline") - return 0 - end #function end #module From 3130e92395724b9115854b3019678dad20615d4d Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Mon, 14 Apr 2025 11:12:41 -0500 Subject: [PATCH 04/21] feat: Add metaxa steps --- Project.toml | 10 +++ conda_envs/metaxa2.yml | 8 +++ src/Cowcalf_rumen_metagenomic_pipeline.jl | 58 ++++++++++++++++- src/Metaxa.jl | 77 +++++++++++++++++++++++ src/ProcessHelper.jl | 36 +++++++++++ 5 files changed, 188 insertions(+), 1 deletion(-) create mode 100644 conda_envs/metaxa2.yml create mode 100644 src/Metaxa.jl create mode 100644 src/ProcessHelper.jl diff --git a/Project.toml b/Project.toml index 9803d24..ed6c11c 100644 --- a/Project.toml +++ b/Project.toml @@ -4,15 +4,25 @@ authors = ["Thomas A. Christensen II <25492070+MillironX@users.noreply.github.co version = "0.1.0" [deps] +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d" Cowsay = "b6370f49-8ad1-4651-ad9e-3639b35da0e9" +Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +Glob = "c27321d9-0574-5035-807b-f59d2c89b15c" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6" [compat] +CSV = "0.10.15" Conda = "1.10.2" Cowsay = "1.0.0" +Dagger = "0.18.14" +DataFrames = "1.7.0" +Distributed = "1.11.0" +Glob = "1.3.1" HTTP = "1.10.16" URIs = "1.5.2" YAML = "0.4.13" diff --git a/conda_envs/metaxa2.yml b/conda_envs/metaxa2.yml new file mode 100644 index 0000000..4a27d02 --- /dev/null +++ b/conda_envs/metaxa2.yml @@ -0,0 +1,8 @@ +channels: + - bioconda + - conda-forge +dependencies: + - metaxa=2.2 + - blast-legacy=2.2.26 + - hmmer=3.1 + - mafft=7.525 diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index 7218120..eba501a 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -2,10 +2,19 @@ module Cowcalf_rumen_metagenomic_pipeline using Conda: Conda, runconda using Cowsay: cowsay +using CSV: CSV +using DataFrames: DataFrame, rename! +using Distributed: pmap, @everywhere +using Glob: GlobMatch, glob using HTTP: HTTP using URIs: URI using YAML: YAML +export main + +include("Metaxa.jl") +using .Metaxa: Metaxa + _fetch_yaml_contents(yaml::AbstractString) = YAML.load_file(yaml) _fetch_yaml_contents(yaml::URI) = YAML.load(String(HTTP.get(yaml).body)) @@ -26,7 +35,54 @@ function setup_remote_conda_environment(yaml::Union{AbstractString,URI}, env_nam return env_name end #function -function main() +function import_metadata(metadata_tsv::AbstractString) + df = DataFrame(CSV.File(metadata_tsv)) + rename!(df, 1 => :sample_name) + return df +end #function + +function sample_files(samplenames::Vector{<:AbstractString}) + function _s(samplename::AbstractString) + # Use explicit GlobMatch constructor b/c we need to interpolate values + return ( + samplename, + ( + abspath(first(glob(GlobMatch("$(samplename)*1*.fastq.gz")))), + abspath(first(glob(GlobMatch("$(samplename)*2*.fastq.gz")))), + ), + ) + end #function + + return map(_s, samplenames) +end #function + +function (@main)(ARGS) + metadata_file = pop!(ARGS) + + setup_remote_conda_environment( + URI( + "https://data.qiime2.org/distro/metagenome/qiime2-metagenome-2024.10-py310-osx-conda.yml", + ), + :qiime2, + ) + setup_remote_conda_environment( + joinpath(@__DIR__, "..", "conda_envs", "metaxa2.yml"), + :metaxa2, + ) + + metadata = import_metadata(metadata_file) + fastq_files = sample_files(metadata[!, :sample_name]) + + @eval begin + @everywhere begin + include(joinpath(@__DIR__, "Metaxa.jl")) + using .Metaxa: Metaxa + end #@everywhere + end #@eval + taxonomy_files = pmap(x -> Metaxa.taxonomy(first(x), last(x)), fastq_files) + feature_table = Metaxa.data_collector(taxonomy_files...) + cp(feature_table, pwd()) + cowsay("Hello from Cowcalf_rumen_metagenomic_pipeline") return 0 end #function diff --git a/src/Metaxa.jl b/src/Metaxa.jl new file mode 100644 index 0000000..4b93d19 --- /dev/null +++ b/src/Metaxa.jl @@ -0,0 +1,77 @@ +module Metaxa + +using Conda: runconda + +include("ProcessHelper.jl") + +using .ProcessHelper: exec_in_temp_dir + +export taxonomy + +function _classifier( + samplename::AbstractString, + fastq1::AbstractString, + fastq2::AbstractString, +) + runconda( + `run metaxa2 \ + -1 $fastq1 \ + -2 $fastq2 \ + -o $samplename \ + --format fastq \ + --cpu 4 \ + --summary F \ + --graphical F \ + --fasta F \ + --taxonomy T + `, + :metaxa2, + ) + ispath("$samplename.taxonomy.txt") || + error("metaxa2 ran, but $samplename.taxonomy.txt was not found!") + return abspath("$samplename.taxonomy.txt") +end #function + +function _taxonomy_traversal(samplename::AbstractString, taxonomy::AbstractString) + runconda( + `run metaxa2_ttt \ + -i $taxonomy \ + -o $samplename \ + -m 7 \ + -n 7 \ + --summary F + `, + :metaxa2, + ) + ispath("$samplename.level_7.txt") || + error("metaxa2 ran, but $samplename.level_7.txt was not found!") + return abspath("$samplename.level_7.txt") +end #function + +function taxonomy( + samplename::AbstractString, + fastq::Tuple{<:AbstractString,<:AbstractString}, +) + taxonomy_file = exec_in_temp_dir(_classifier, samplename, fastq...) + level_7_taxonomy_file = exec_in_temp_dir(_taxonomy_traversal, samplename, taxonomy_file) + return level_7_taxonomy_file +end #function + +function _dc(taxonomies::AbstractString...) + runconda( + `metaxa2_dc \ + -o feature-table.tsv \ + $(join(taxonomies, ' ')) + `, + :metaxa2, + ) + ispath("feature-table.tsv") || + error("metaxa2 ran, but feature-table.tsv was not found!") + return abspath("feature-table.tsv") +end #function + +function data_collector(taxonomies::AbstractString...) + return exec_in_temp_dir(_dc, taxonomies...) +end #function + +end #module diff --git a/src/ProcessHelper.jl b/src/ProcessHelper.jl new file mode 100644 index 0000000..ca01370 --- /dev/null +++ b/src/ProcessHelper.jl @@ -0,0 +1,36 @@ +module ProcessHelper + +export sym_temp + +""" + sym_temp(file::AbstractString) -> (tmp_dir, link) + sym_temp(files::Tuple{<:AbstractString,<:AbstractString}) -> (tmp_dir, links) + +Copies `file(s)` to a new temporary directory named `tmp_dir` and symbolically links them +inside of that directory. Returns a tuple of the directory path and the path to the links. +""" +function sym_temp(files::AbstractString...) + tmp_dir = mktempdir(; cleanup = false) + @info "Creating temporary directory $tmp_dir" + + function _symlink(file::AbstractString) + symlink_path = joinpath(tmp_dir, basename(file)) + symlink(realpath(file), symlink_path) + @info "Symlinked $file to $symlink_path" + return symlink_path + end #function + + return (tmp_dir, map(_symlink, files)) +end #function + +function exec_in_temp_dir(f::Function, samplename::AbstractString, files::AbstractString...) + tmp_dir, tmp_files = sym_temp(files...) + return cd(() -> f(samplename, tmp_files...), tmp_dir) +end #function + +function exec_in_temp_dir(f::Function, files::AbstractString...) + tmp_dir, tmp_files = sym_temp(files...) + return cd(() -> f(tmp_files...), tmp_dir) +end #function + +end #module From 8e68ea3028ba4e4cb1471ff33864ef6ff799af51 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Mon, 14 Apr 2025 15:49:03 -0500 Subject: [PATCH 05/21] fix: add `run` to metaxa_dc command --- src/Metaxa.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Metaxa.jl b/src/Metaxa.jl index 4b93d19..fd40bee 100644 --- a/src/Metaxa.jl +++ b/src/Metaxa.jl @@ -59,7 +59,7 @@ end #function function _dc(taxonomies::AbstractString...) runconda( - `metaxa2_dc \ + `run metaxa2_dc \ -o feature-table.tsv \ $(join(taxonomies, ' ')) `, From 418c7f10884401258485133e98e06f8e5e81285b Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Mon, 14 Apr 2025 16:00:52 -0500 Subject: [PATCH 06/21] fix: Don't copy over a directory with a file --- src/Cowcalf_rumen_metagenomic_pipeline.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index eba501a..c08e683 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -81,7 +81,7 @@ function (@main)(ARGS) end #@eval taxonomy_files = pmap(x -> Metaxa.taxonomy(first(x), last(x)), fastq_files) feature_table = Metaxa.data_collector(taxonomy_files...) - cp(feature_table, pwd()) + cp(feature_table, joinpath(pwd(), "feature-table.tsv")) cowsay("Hello from Cowcalf_rumen_metagenomic_pipeline") return 0 From 10fdcd7b3f9677b8ac2d52eac1c7d2daf446d1e4 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Mon, 14 Apr 2025 16:50:32 -0500 Subject: [PATCH 07/21] perf: Make conda not re-init an existing env --- src/Cowcalf_rumen_metagenomic_pipeline.jl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index c08e683..2a6008e 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -19,6 +19,11 @@ _fetch_yaml_contents(yaml::AbstractString) = YAML.load_file(yaml) _fetch_yaml_contents(yaml::URI) = YAML.load(String(HTTP.get(yaml).body)) function setup_remote_conda_environment(yaml::Union{AbstractString,URI}, env_name::Symbol) + # Check for the existence of the environment with the same name before creating one + # We assume that the existence of the environment means that it has been properly setup + # TODO: Provide a way to force re-initialization of a conda environment + isdir(Conda.prefix(env_name)) && return env_name + ENV["CONDA_JL_USE_MINIFORGE"] = "1" # Install x86 packages via Rosetta2 on MacOS From 0f7102b9de2bb79b5362884ef76e7a23bf238b83 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Tue, 15 Apr 2025 15:20:47 -0500 Subject: [PATCH 08/21] fix: Make metaxa2_dc use relative paths instead of absolute --- src/Metaxa.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Metaxa.jl b/src/Metaxa.jl index fd40bee..3635977 100644 --- a/src/Metaxa.jl +++ b/src/Metaxa.jl @@ -61,7 +61,7 @@ function _dc(taxonomies::AbstractString...) runconda( `run metaxa2_dc \ -o feature-table.tsv \ - $(join(taxonomies, ' ')) + $(join(map(basename, taxonomies), ' ')) `, :metaxa2, ) From adfef5b9b309134bdce407383d9b22129d79b5c9 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Tue, 15 Apr 2025 16:19:09 -0500 Subject: [PATCH 09/21] refactor: Use FilePathsBase for all file paths --- Project.toml | 2 ++ src/Cowcalf_rumen_metagenomic_pipeline.jl | 19 +++++++------- src/Metaxa.jl | 32 +++++++++-------------- src/ProcessHelper.jl | 14 +++++----- 4 files changed, 33 insertions(+), 34 deletions(-) diff --git a/Project.toml b/Project.toml index ed6c11c..f89bd4e 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,7 @@ Cowsay = "b6370f49-8ad1-4651-ad9e-3639b35da0e9" Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f" Glob = "c27321d9-0574-5035-807b-f59d2c89b15c" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" @@ -22,6 +23,7 @@ Cowsay = "1.0.0" Dagger = "0.18.14" DataFrames = "1.7.0" Distributed = "1.11.0" +FilePathsBase = "0.9.24" Glob = "1.3.1" HTTP = "1.10.16" URIs = "1.5.2" diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index 2a6008e..fc915b8 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -5,6 +5,7 @@ using Cowsay: cowsay using CSV: CSV using DataFrames: DataFrame, rename! using Distributed: pmap, @everywhere +using FilePathsBase: AbstractPath, Path, absolute, cwd, @__PATH__, @p_str using Glob: GlobMatch, glob using HTTP: HTTP using URIs: URI @@ -15,10 +16,10 @@ export main include("Metaxa.jl") using .Metaxa: Metaxa -_fetch_yaml_contents(yaml::AbstractString) = YAML.load_file(yaml) +_fetch_yaml_contents(yaml::AbstractPath) = YAML.load_file(string(yaml)) _fetch_yaml_contents(yaml::URI) = YAML.load(String(HTTP.get(yaml).body)) -function setup_remote_conda_environment(yaml::Union{AbstractString,URI}, env_name::Symbol) +function setup_remote_conda_environment(yaml::Union{AbstractPath,URI}, env_name::Symbol) # Check for the existence of the environment with the same name before creating one # We assume that the existence of the environment means that it has been properly setup # TODO: Provide a way to force re-initialization of a conda environment @@ -40,7 +41,7 @@ function setup_remote_conda_environment(yaml::Union{AbstractString,URI}, env_nam return env_name end #function -function import_metadata(metadata_tsv::AbstractString) +function import_metadata(metadata_tsv::AbstractPath) df = DataFrame(CSV.File(metadata_tsv)) rename!(df, 1 => :sample_name) return df @@ -52,8 +53,8 @@ function sample_files(samplenames::Vector{<:AbstractString}) return ( samplename, ( - abspath(first(glob(GlobMatch("$(samplename)*1*.fastq.gz")))), - abspath(first(glob(GlobMatch("$(samplename)*2*.fastq.gz")))), + absolute(Path(first(glob(GlobMatch("$(samplename)*1*.fastq.gz"))))), + absolute(Path(first(glob(GlobMatch("$(samplename)*2*.fastq.gz"))))), ), ) end #function @@ -62,7 +63,7 @@ function sample_files(samplenames::Vector{<:AbstractString}) end #function function (@main)(ARGS) - metadata_file = pop!(ARGS) + metadata_file = Path(pop!(ARGS)) setup_remote_conda_environment( URI( @@ -71,7 +72,7 @@ function (@main)(ARGS) :qiime2, ) setup_remote_conda_environment( - joinpath(@__DIR__, "..", "conda_envs", "metaxa2.yml"), + joinpath(@__PATH__, p"..", p"conda_envs", p"metaxa2.yml"), :metaxa2, ) @@ -84,9 +85,9 @@ function (@main)(ARGS) using .Metaxa: Metaxa end #@everywhere end #@eval - taxonomy_files = pmap(x -> Metaxa.taxonomy(first(x), last(x)), fastq_files) + taxonomy_files = pmap(x -> Metaxa.taxonomy(first(x), last(x)...), fastq_files) feature_table = Metaxa.data_collector(taxonomy_files...) - cp(feature_table, joinpath(pwd(), "feature-table.tsv")) + cp(feature_table, joinpath(cwd(), p"feature-table.tsv")) cowsay("Hello from Cowcalf_rumen_metagenomic_pipeline") return 0 diff --git a/src/Metaxa.jl b/src/Metaxa.jl index 3635977..7b0e0ac 100644 --- a/src/Metaxa.jl +++ b/src/Metaxa.jl @@ -1,6 +1,7 @@ module Metaxa using Conda: runconda +using FilePathsBase: AbstractPath, Path, absolute, exists include("ProcessHelper.jl") @@ -8,11 +9,7 @@ using .ProcessHelper: exec_in_temp_dir export taxonomy -function _classifier( - samplename::AbstractString, - fastq1::AbstractString, - fastq2::AbstractString, -) +function _classifier(samplename::AbstractString, fastq1::AbstractPath, fastq2::AbstractPath) runconda( `run metaxa2 \ -1 $fastq1 \ @@ -27,12 +24,12 @@ function _classifier( `, :metaxa2, ) - ispath("$samplename.taxonomy.txt") || + exists(Path("$samplename.taxonomy.txt")) || error("metaxa2 ran, but $samplename.taxonomy.txt was not found!") - return abspath("$samplename.taxonomy.txt") + return absolute(Path("$samplename.taxonomy.txt")) end #function -function _taxonomy_traversal(samplename::AbstractString, taxonomy::AbstractString) +function _taxonomy_traversal(samplename::AbstractString, taxonomy::AbstractPath) runconda( `run metaxa2_ttt \ -i $taxonomy \ @@ -43,34 +40,31 @@ function _taxonomy_traversal(samplename::AbstractString, taxonomy::AbstractStrin `, :metaxa2, ) - ispath("$samplename.level_7.txt") || + exists(Path("$samplename.level_7.txt")) || error("metaxa2 ran, but $samplename.level_7.txt was not found!") - return abspath("$samplename.level_7.txt") + return absolute(Path("$samplename.level_7.txt")) end #function -function taxonomy( - samplename::AbstractString, - fastq::Tuple{<:AbstractString,<:AbstractString}, -) +function taxonomy(samplename::AbstractString, fastq::AbstractPath...) taxonomy_file = exec_in_temp_dir(_classifier, samplename, fastq...) level_7_taxonomy_file = exec_in_temp_dir(_taxonomy_traversal, samplename, taxonomy_file) return level_7_taxonomy_file end #function -function _dc(taxonomies::AbstractString...) +function _dc(taxonomies::AbstractPath...) runconda( `run metaxa2_dc \ -o feature-table.tsv \ - $(join(map(basename, taxonomies), ' ')) + $(map(basename, taxonomies)) `, :metaxa2, ) - ispath("feature-table.tsv") || + exists(Path("feature-table.tsv")) || error("metaxa2 ran, but feature-table.tsv was not found!") - return abspath("feature-table.tsv") + return absolute(Path("feature-table.tsv")) end #function -function data_collector(taxonomies::AbstractString...) +function data_collector(taxonomies::AbstractPath...) return exec_in_temp_dir(_dc, taxonomies...) end #function diff --git a/src/ProcessHelper.jl b/src/ProcessHelper.jl index ca01370..2592af9 100644 --- a/src/ProcessHelper.jl +++ b/src/ProcessHelper.jl @@ -1,5 +1,7 @@ module ProcessHelper +using FilePathsBase: AbstractPath, Path, canonicalize + export sym_temp """ @@ -9,13 +11,13 @@ export sym_temp Copies `file(s)` to a new temporary directory named `tmp_dir` and symbolically links them inside of that directory. Returns a tuple of the directory path and the path to the links. """ -function sym_temp(files::AbstractString...) - tmp_dir = mktempdir(; cleanup = false) +function sym_temp(files::AbstractPath...) + tmp_dir = Path(mktempdir(; cleanup = false)) @info "Creating temporary directory $tmp_dir" - function _symlink(file::AbstractString) + function _symlink(file::AbstractPath) symlink_path = joinpath(tmp_dir, basename(file)) - symlink(realpath(file), symlink_path) + symlink(canonicalize(file), symlink_path) @info "Symlinked $file to $symlink_path" return symlink_path end #function @@ -23,12 +25,12 @@ function sym_temp(files::AbstractString...) return (tmp_dir, map(_symlink, files)) end #function -function exec_in_temp_dir(f::Function, samplename::AbstractString, files::AbstractString...) +function exec_in_temp_dir(f::Function, samplename::AbstractString, files::AbstractPath...) tmp_dir, tmp_files = sym_temp(files...) return cd(() -> f(samplename, tmp_files...), tmp_dir) end #function -function exec_in_temp_dir(f::Function, files::AbstractString...) +function exec_in_temp_dir(f::Function, files::AbstractPath...) tmp_dir, tmp_files = sym_temp(files...) return cd(() -> f(tmp_files...), tmp_dir) end #function From 7b3cb740d9fdee5268a542463f49be1a4f5aa797 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Tue, 15 Apr 2025 16:20:38 -0500 Subject: [PATCH 10/21] meta: Add input files to gitignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index f9c1b1b..0acd465 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,7 @@ docs/site/ # committed for packages, but should be committed for applications that require a static # environment. Manifest.toml + +### Input files ### +*.fastq.gz +metadata*.tsv From 516cc4985c38eb798e3f9dd5248477bc2d5917c2 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 08:35:53 -0500 Subject: [PATCH 11/21] refactor: Make exec_in_temp_dir more generic Instead of having the function have multiple methods with different signatures, we can slurp args and then use reflection to symlink any Path objects while passing the values unmodified. So do that. --- src/ProcessHelper.jl | 43 ++++++++++++++++++------------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/src/ProcessHelper.jl b/src/ProcessHelper.jl index 2592af9..d1a2f60 100644 --- a/src/ProcessHelper.jl +++ b/src/ProcessHelper.jl @@ -2,37 +2,30 @@ module ProcessHelper using FilePathsBase: AbstractPath, Path, canonicalize -export sym_temp +export exec_in_temp_dir -""" - sym_temp(file::AbstractString) -> (tmp_dir, link) - sym_temp(files::Tuple{<:AbstractString,<:AbstractString}) -> (tmp_dir, links) - -Copies `file(s)` to a new temporary directory named `tmp_dir` and symbolically links them -inside of that directory. Returns a tuple of the directory path and the path to the links. -""" -function sym_temp(files::AbstractPath...) +function exec_in_temp_dir(f::Function, args...) tmp_dir = Path(mktempdir(; cleanup = false)) @info "Creating temporary directory $tmp_dir" - function _symlink(file::AbstractPath) - symlink_path = joinpath(tmp_dir, basename(file)) - symlink(canonicalize(file), symlink_path) - @info "Symlinked $file to $symlink_path" - return symlink_path - end #function + if any(a -> a isa AbstractPath, args) + newargs = [] + for arg in args + if arg isa AbstractPath + symlink_path = joinpath(tmp_dir, basename(arg)) + symlink(canonicalize(arg), symlink_path) + @info "Symlinked $arg to $symlink_path" + push!(newargs, symlink_path) + else + push!(newargs, arg) + end #if + end #for - return (tmp_dir, map(_symlink, files)) -end #function + return cd(() -> f(newargs...), tmp_dir) + else + return cd(() -> f(args...), tmp_dir) + end #if -function exec_in_temp_dir(f::Function, samplename::AbstractString, files::AbstractPath...) - tmp_dir, tmp_files = sym_temp(files...) - return cd(() -> f(samplename, tmp_files...), tmp_dir) -end #function - -function exec_in_temp_dir(f::Function, files::AbstractPath...) - tmp_dir, tmp_files = sym_temp(files...) - return cd(() -> f(tmp_files...), tmp_dir) end #function end #module From d97c0898bc66bddb4be772d71cdcc651cfe5f0d2 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 16:06:56 -0500 Subject: [PATCH 12/21] feat: Add caching function --- Project.toml | 8 +++-- src/ProcessHelper.jl | 73 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 78 insertions(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index f89bd4e..32981f2 100644 --- a/Project.toml +++ b/Project.toml @@ -4,28 +4,32 @@ authors = ["Thomas A. Christensen II <25492070+MillironX@users.noreply.github.co version = "0.1.0" [deps] +BaseDirs = "18cc8868-cbac-4acf-b575-c8ff214dc66f" CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d" Cowsay = "b6370f49-8ad1-4651-ad9e-3639b35da0e9" -Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f" Glob = "c27321d9-0574-5035-807b-f59d2c89b15c" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" +SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce" URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6" [compat] +BaseDirs = "1.2.4" CSV = "0.10.15" +CodecZlib = "0.7.8" Conda = "1.10.2" Cowsay = "1.0.0" -Dagger = "0.18.14" DataFrames = "1.7.0" Distributed = "1.11.0" FilePathsBase = "0.9.24" Glob = "1.3.1" HTTP = "1.10.16" +SHA = "0.7.0" URIs = "1.5.2" YAML = "0.4.13" julia = "1.11" diff --git a/src/ProcessHelper.jl b/src/ProcessHelper.jl index d1a2f60..af513f2 100644 --- a/src/ProcessHelper.jl +++ b/src/ProcessHelper.jl @@ -1,9 +1,21 @@ module ProcessHelper -using FilePathsBase: AbstractPath, Path, canonicalize +using BaseDirs: BaseDirs +using CodecZlib: GzipCompressorStream, GzipDecompressorStream +using FilePathsBase: AbstractPath, Path, canonicalize, exists +using MacroTools: splitdef +using SHA: SHA1_CTX, update!, digest! +export cache_function_value export exec_in_temp_dir +const CACHE_DIR = Path( + BaseDirs.User.cache( + BaseDirs.Project("Cowcalf_rumen_metagenomic_pipeline"); + create = true, + ), +) + function exec_in_temp_dir(f::Function, args...) tmp_dir = Path(mktempdir(; cleanup = false)) @info "Creating temporary directory $tmp_dir" @@ -28,4 +40,63 @@ function exec_in_temp_dir(f::Function, args...) end #function +function cache_function_value(f::Function, args...) + ctx = SHA1_CTX() + update!(ctx, Vector{UInt8}(String(Symbol(f)))) + for arg in args + if arg isa AbstractPath + open(string(arg), "r") do a + update!(ctx, read(a)) + end #do + else + update!(ctx, Vector{UInt8}(String(arg))) + end #if + end #for + input_hash = bytes2hex(digest!(ctx)) + + cache_file = joinpath(CACHE_DIR, input_hash) + + if exists(cache_file) + @info "Found cached file for function `$(String(Symbol(f)))` with input hash $input_hash" + tmp_dir = Path(mktempdir(; cleanup = false)) + @info "Creating temporary directory $tmp_dir" + + filename = readchomp(string(cache_file, ".filename")) + uncached_file = joinpath(tmp_dir, filename) + + open(GzipDecompressorStream, string(cache_file), "r") do compressed + open(string(uncached_file), "w") do uncompressed + write(uncompressed, read(compressed)) + end #do + end #do + + @info "Copied cached file for $input_hash to $uncached_file" + return canonicalize(uncached_file) + + end #if + + @info "Input hash $input_hash not found for function `$(String(Symbol(f)))`" + + result_file = exec_in_temp_dir(f, args...) + + result_file isa AbstractPath || error( + "`cache_function_value` can only work on functions that return an AbstractPath", + ) + + filename = basename(result_file) + open(string(cache_file, ".filename"), "w") do fnf + write(fnf, filename) + end #do + + open(string(result_file), "r") do uncompressed + open(GzipCompressorStream, string(cache_file), "w") do compressed + write(compressed, uncompressed) + end #do + end #do + @info "Copied result file $result_file into cache under hash $input_hash" + + return result_file + +end #function + end #module From e0b6a056ff7ed813688e638570c94628b02bb9c8 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 16:07:59 -0500 Subject: [PATCH 13/21] refactor: Switch to using cached file processes in Metaxa --- src/Metaxa.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/Metaxa.jl b/src/Metaxa.jl index 7b0e0ac..57f5fbb 100644 --- a/src/Metaxa.jl +++ b/src/Metaxa.jl @@ -5,7 +5,7 @@ using FilePathsBase: AbstractPath, Path, absolute, exists include("ProcessHelper.jl") -using .ProcessHelper: exec_in_temp_dir +using .ProcessHelper: cache_function_value export taxonomy @@ -46,8 +46,9 @@ function _taxonomy_traversal(samplename::AbstractString, taxonomy::AbstractPath) end #function function taxonomy(samplename::AbstractString, fastq::AbstractPath...) - taxonomy_file = exec_in_temp_dir(_classifier, samplename, fastq...) - level_7_taxonomy_file = exec_in_temp_dir(_taxonomy_traversal, samplename, taxonomy_file) + taxonomy_file = cache_function_value(_classifier, samplename, fastq...) + level_7_taxonomy_file = + cache_function_value(_taxonomy_traversal, samplename, taxonomy_file) return level_7_taxonomy_file end #function @@ -65,7 +66,7 @@ function _dc(taxonomies::AbstractPath...) end #function function data_collector(taxonomies::AbstractPath...) - return exec_in_temp_dir(_dc, taxonomies...) + return cache_function_value(_dc, taxonomies...) end #function end #module From 5c74bf3d69ff466324952a57382e328d7adaceaf Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 16:08:48 -0500 Subject: [PATCH 14/21] fixup! feat: Add caching function --- src/ProcessHelper.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ProcessHelper.jl b/src/ProcessHelper.jl index af513f2..2ea2bf6 100644 --- a/src/ProcessHelper.jl +++ b/src/ProcessHelper.jl @@ -3,7 +3,6 @@ module ProcessHelper using BaseDirs: BaseDirs using CodecZlib: GzipCompressorStream, GzipDecompressorStream using FilePathsBase: AbstractPath, Path, canonicalize, exists -using MacroTools: splitdef using SHA: SHA1_CTX, update!, digest! export cache_function_value From 30600e5e18d59304b5a5ce27a8aa919eacf86fb6 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 17:07:53 -0500 Subject: [PATCH 15/21] feat: Add feature table manipulation --- Project.toml | 2 ++ src/Cowcalf_rumen_metagenomic_pipeline.jl | 34 +++++++++++++++++++++-- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index 32981f2..3272bdb 100644 --- a/Project.toml +++ b/Project.toml @@ -16,6 +16,7 @@ Glob = "c27321d9-0574-5035-807b-f59d2c89b15c" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce" URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" +UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6" [compat] @@ -31,6 +32,7 @@ Glob = "1.3.1" HTTP = "1.10.16" SHA = "0.7.0" URIs = "1.5.2" +UUIDs = "1.11.0" YAML = "0.4.13" julia = "1.11" diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index fc915b8..fdc58a9 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -3,12 +3,13 @@ module Cowcalf_rumen_metagenomic_pipeline using Conda: Conda, runconda using Cowsay: cowsay using CSV: CSV -using DataFrames: DataFrame, rename! +using DataFrames: DataFrame, combine, eachrow, nrow, rename!, select! using Distributed: pmap, @everywhere using FilePathsBase: AbstractPath, Path, absolute, cwd, @__PATH__, @p_str using Glob: GlobMatch, glob using HTTP: HTTP using URIs: URI +using UUIDs: uuid4 using YAML: YAML export main @@ -62,6 +63,14 @@ function sample_files(samplenames::Vector{<:AbstractString}) return map(_s, samplenames) end #function +function qiime_feature_table(table_file::AbstractPath) + df = DataFrame(CSV.File(table_file)) + rename!(df, 1 => :taxonomy) + df[!, Symbol("#SampleId")] = [uuid4() for i = 1:nrow(df)] + select!(df, Symbol("#SampleId"), :) + return df +end #function + function (@main)(ARGS) metadata_file = Path(pop!(ARGS)) @@ -86,8 +95,27 @@ function (@main)(ARGS) end #@everywhere end #@eval taxonomy_files = pmap(x -> Metaxa.taxonomy(first(x), last(x)...), fastq_files) - feature_table = Metaxa.data_collector(taxonomy_files...) - cp(feature_table, joinpath(cwd(), p"feature-table.tsv")) + feature_table = qiime_feature_table(Metaxa.data_collector(taxonomy_files...)) + rarefaction_min = minimum( + eachrow( + combine( + feature_table, + names(feature_table, Not(:taxonomy, Symbol("#SampleId"))) .=> sum, + renamecols = false, + ), + )..., + ) + rarefaction_max = maximum( + eachrow( + combine( + feature_table, + names(feature_table, Not(:taxonomy, Symbol("#SampleId"))) .=> sum, + renamecols = false, + ), + )..., + ) + @show rarefaction_min + @show rarefaction_max cowsay("Hello from Cowcalf_rumen_metagenomic_pipeline") return 0 From 0c09cd9e33e117892a49829a12949f2e29549bf6 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 18:25:21 -0500 Subject: [PATCH 16/21] fixup! feat: Add feature table manipulation --- src/Cowcalf_rumen_metagenomic_pipeline.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index fdc58a9..c917470 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -65,6 +65,8 @@ end #function function qiime_feature_table(table_file::AbstractPath) df = DataFrame(CSV.File(table_file)) + + # Biom format requires that taxonomy hierarchy be named exactly "taxonomy" rename!(df, 1 => :taxonomy) df[!, Symbol("#SampleId")] = [uuid4() for i = 1:nrow(df)] select!(df, Symbol("#SampleId"), :) From 4462ddab6619fe070d71db7c6d5d4188a2794056 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 18:25:21 -0500 Subject: [PATCH 17/21] fixup! feat: Add feature table manipulation --- src/Cowcalf_rumen_metagenomic_pipeline.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index c917470..c89cfe7 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -68,6 +68,8 @@ function qiime_feature_table(table_file::AbstractPath) # Biom format requires that taxonomy hierarchy be named exactly "taxonomy" rename!(df, 1 => :taxonomy) + + # Every OTU in a feature table must have a universally unique id df[!, Symbol("#SampleId")] = [uuid4() for i = 1:nrow(df)] select!(df, Symbol("#SampleId"), :) return df From 9d124cb796b49a4f191addf5a213aff8b8f55355 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 18:25:21 -0500 Subject: [PATCH 18/21] fixup! feat: Add feature table manipulation --- src/Cowcalf_rumen_metagenomic_pipeline.jl | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index c89cfe7..938375d 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -71,7 +71,16 @@ function qiime_feature_table(table_file::AbstractPath) # Every OTU in a feature table must have a universally unique id df[!, Symbol("#SampleId")] = [uuid4() for i = 1:nrow(df)] - select!(df, Symbol("#SampleId"), :) + + # Biom tables are sensitive to the position of the columns: + # arrange them so that #SampleId is first, taxonomy is last, + # and all the observed counts are in the middle + select!( + feature_table, + Symbol("#SampleId"), + Not([Symbol("#SampleId"), :taxonomy]), + :taxonomy, + ) return df end #function From 630385829fae70f55b9ce620ee126b5b40aae63d Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 18:25:48 -0500 Subject: [PATCH 19/21] fixup! feat: Add feature table manipulation --- src/Cowcalf_rumen_metagenomic_pipeline.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index 938375d..30ae53b 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -3,7 +3,7 @@ module Cowcalf_rumen_metagenomic_pipeline using Conda: Conda, runconda using Cowsay: cowsay using CSV: CSV -using DataFrames: DataFrame, combine, eachrow, nrow, rename!, select! +using DataFrames: DataFrame, Not, combine, eachrow, nrow, rename!, select! using Distributed: pmap, @everywhere using FilePathsBase: AbstractPath, Path, absolute, cwd, @__PATH__, @p_str using Glob: GlobMatch, glob From e5db18513379e3244e7c675cbb5f0f6612716a73 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 18:26:43 -0500 Subject: [PATCH 20/21] feat: Add DataFrame compat to cache --- src/ProcessHelper.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/ProcessHelper.jl b/src/ProcessHelper.jl index 2ea2bf6..e87ee8e 100644 --- a/src/ProcessHelper.jl +++ b/src/ProcessHelper.jl @@ -2,6 +2,8 @@ module ProcessHelper using BaseDirs: BaseDirs using CodecZlib: GzipCompressorStream, GzipDecompressorStream +using CSV: CSV +using DataFrames: DataFrame using FilePathsBase: AbstractPath, Path, canonicalize, exists using SHA: SHA1_CTX, update!, digest! @@ -47,6 +49,10 @@ function cache_function_value(f::Function, args...) open(string(arg), "r") do a update!(ctx, read(a)) end #do + elseif arg isa DataFrame + for row in CSV.RowWriter(arg) + update!(ctx, Vector{UInt8}(row)) + end #for else update!(ctx, Vector{UInt8}(String(arg))) end #if From f1072a4bb752abb4df2f244a390b2449cdfeee90 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 18:27:14 -0500 Subject: [PATCH 21/21] feat: Add biom conversion step --- src/Cowcalf_rumen_metagenomic_pipeline.jl | 6 +++++ src/Qiime.jl | 33 +++++++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 src/Qiime.jl diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index 30ae53b..8de0d43 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -15,7 +15,9 @@ using YAML: YAML export main include("Metaxa.jl") +include("Qiime.jl") using .Metaxa: Metaxa +using .Qiime: Qiime _fetch_yaml_contents(yaml::AbstractPath) = YAML.load_file(string(yaml)) _fetch_yaml_contents(yaml::URI) = YAML.load(String(HTTP.get(yaml).body)) @@ -104,7 +106,9 @@ function (@main)(ARGS) @eval begin @everywhere begin include(joinpath(@__DIR__, "Metaxa.jl")) + include(joinpath(@__DIR__, "Qiime.jl")) using .Metaxa: Metaxa + using .Qiime: Qiime end #@everywhere end #@eval taxonomy_files = pmap(x -> Metaxa.taxonomy(first(x), last(x)...), fastq_files) @@ -130,6 +134,8 @@ function (@main)(ARGS) @show rarefaction_min @show rarefaction_max + cp(Qiime.biom_feature_table(feature_table), cwd()) + cowsay("Hello from Cowcalf_rumen_metagenomic_pipeline") return 0 end #function diff --git a/src/Qiime.jl b/src/Qiime.jl new file mode 100644 index 0000000..89b8378 --- /dev/null +++ b/src/Qiime.jl @@ -0,0 +1,33 @@ +module Qiime + +using Base: cache_file_entry +using Conda: runconda +using CSV: CSV +using DataFrames: DataFrame +using FilePathsBase: AbstractPath, Path, absolute, exists + +include("ProcessHelper.jl") + +using .ProcessHelper: cache_function_value + +function _write_biom_feature_table(df_feature_table::DataFrame) + CSV.write("feature-table.tsv", df_feature_table; delim = '\t') + runconda( + `run biom convert \ + -i feature-table.tsv \ + -o feature-table.hdf5.biom \ + --table-type="OTU table" \ + --to-hdf5 \ + --process-obs-metadata taxonomy`, + :qiime2, + ) + exists(Path("feature-table.hdf5.biom")) || + error("qiime2 ran, but feature-table.hdf5.biom was not found!") + return absolute(Path("feature-table.hdf5.biom")) +end #function + +function biom_feature_table(df_feature_table::DataFrame) + return cache_function_value(_write_biom_feature_table, df_feature_table) +end #function + +end #module