From 10fdcd7b3f9677b8ac2d52eac1c7d2daf446d1e4 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Mon, 14 Apr 2025 16:50:32 -0500 Subject: [PATCH 01/15] perf: Make conda not re-init an existing env --- src/Cowcalf_rumen_metagenomic_pipeline.jl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index c08e683..2a6008e 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -19,6 +19,11 @@ _fetch_yaml_contents(yaml::AbstractString) = YAML.load_file(yaml) _fetch_yaml_contents(yaml::URI) = YAML.load(String(HTTP.get(yaml).body)) function setup_remote_conda_environment(yaml::Union{AbstractString,URI}, env_name::Symbol) + # Check for the existence of the environment with the same name before creating one + # We assume that the existence of the environment means that it has been properly setup + # TODO: Provide a way to force re-initialization of a conda environment + isdir(Conda.prefix(env_name)) && return env_name + ENV["CONDA_JL_USE_MINIFORGE"] = "1" # Install x86 packages via Rosetta2 on MacOS From 0f7102b9de2bb79b5362884ef76e7a23bf238b83 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Tue, 15 Apr 2025 15:20:47 -0500 Subject: [PATCH 02/15] fix: Make metaxa2_dc use relative paths instead of absolute --- src/Metaxa.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Metaxa.jl b/src/Metaxa.jl index fd40bee..3635977 100644 --- a/src/Metaxa.jl +++ b/src/Metaxa.jl @@ -61,7 +61,7 @@ function _dc(taxonomies::AbstractString...) runconda( `run metaxa2_dc \ -o feature-table.tsv \ - $(join(taxonomies, ' ')) + $(join(map(basename, taxonomies), ' ')) `, :metaxa2, ) From adfef5b9b309134bdce407383d9b22129d79b5c9 Mon Sep 17 00:00:00 2001 From: "Thomas A. 
Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Tue, 15 Apr 2025 16:19:09 -0500 Subject: [PATCH 03/15] refactor: Use FilePathsBase for all file paths --- Project.toml | 2 ++ src/Cowcalf_rumen_metagenomic_pipeline.jl | 19 +++++++------- src/Metaxa.jl | 32 +++++++++-------------- src/ProcessHelper.jl | 14 +++++----- 4 files changed, 33 insertions(+), 34 deletions(-) diff --git a/Project.toml b/Project.toml index ed6c11c..f89bd4e 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,7 @@ Cowsay = "b6370f49-8ad1-4651-ad9e-3639b35da0e9" Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f" Glob = "c27321d9-0574-5035-807b-f59d2c89b15c" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" @@ -22,6 +23,7 @@ Cowsay = "1.0.0" Dagger = "0.18.14" DataFrames = "1.7.0" Distributed = "1.11.0" +FilePathsBase = "0.9.24" Glob = "1.3.1" HTTP = "1.10.16" URIs = "1.5.2" diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index 2a6008e..fc915b8 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -5,6 +5,7 @@ using Cowsay: cowsay using CSV: CSV using DataFrames: DataFrame, rename! 
using Distributed: pmap, @everywhere +using FilePathsBase: AbstractPath, Path, absolute, cwd, @__PATH__, @p_str using Glob: GlobMatch, glob using HTTP: HTTP using URIs: URI @@ -15,10 +16,10 @@ export main include("Metaxa.jl") using .Metaxa: Metaxa -_fetch_yaml_contents(yaml::AbstractString) = YAML.load_file(yaml) +_fetch_yaml_contents(yaml::AbstractPath) = YAML.load_file(string(yaml)) _fetch_yaml_contents(yaml::URI) = YAML.load(String(HTTP.get(yaml).body)) -function setup_remote_conda_environment(yaml::Union{AbstractString,URI}, env_name::Symbol) +function setup_remote_conda_environment(yaml::Union{AbstractPath,URI}, env_name::Symbol) # Check for the existence of the environment with the same name before creating one # We assume that the existence of the environment means that it has been properly setup # TODO: Provide a way to force re-initialization of a conda environment @@ -40,7 +41,7 @@ function setup_remote_conda_environment(yaml::Union{AbstractString,URI}, env_nam return env_name end #function -function import_metadata(metadata_tsv::AbstractString) +function import_metadata(metadata_tsv::AbstractPath) df = DataFrame(CSV.File(metadata_tsv)) rename!(df, 1 => :sample_name) return df @@ -52,8 +53,8 @@ function sample_files(samplenames::Vector{<:AbstractString}) return ( samplename, ( - abspath(first(glob(GlobMatch("$(samplename)*1*.fastq.gz")))), - abspath(first(glob(GlobMatch("$(samplename)*2*.fastq.gz")))), + absolute(Path(first(glob(GlobMatch("$(samplename)*1*.fastq.gz"))))), + absolute(Path(first(glob(GlobMatch("$(samplename)*2*.fastq.gz"))))), ), ) end #function @@ -62,7 +63,7 @@ function sample_files(samplenames::Vector{<:AbstractString}) end #function function (@main)(ARGS) - metadata_file = pop!(ARGS) + metadata_file = Path(pop!(ARGS)) setup_remote_conda_environment( URI( @@ -71,7 +72,7 @@ function (@main)(ARGS) :qiime2, ) setup_remote_conda_environment( - joinpath(@__DIR__, "..", "conda_envs", "metaxa2.yml"), + joinpath(@__PATH__, p"..", p"conda_envs", 
p"metaxa2.yml"), :metaxa2, ) @@ -84,9 +85,9 @@ function (@main)(ARGS) using .Metaxa: Metaxa end #@everywhere end #@eval - taxonomy_files = pmap(x -> Metaxa.taxonomy(first(x), last(x)), fastq_files) + taxonomy_files = pmap(x -> Metaxa.taxonomy(first(x), last(x)...), fastq_files) feature_table = Metaxa.data_collector(taxonomy_files...) - cp(feature_table, joinpath(pwd(), "feature-table.tsv")) + cp(feature_table, joinpath(cwd(), p"feature-table.tsv")) cowsay("Hello from Cowcalf_rumen_metagenomic_pipeline") return 0 diff --git a/src/Metaxa.jl b/src/Metaxa.jl index 3635977..7b0e0ac 100644 --- a/src/Metaxa.jl +++ b/src/Metaxa.jl @@ -1,6 +1,7 @@ module Metaxa using Conda: runconda +using FilePathsBase: AbstractPath, Path, absolute, exists include("ProcessHelper.jl") @@ -8,11 +9,7 @@ using .ProcessHelper: exec_in_temp_dir export taxonomy -function _classifier( - samplename::AbstractString, - fastq1::AbstractString, - fastq2::AbstractString, -) +function _classifier(samplename::AbstractString, fastq1::AbstractPath, fastq2::AbstractPath) runconda( `run metaxa2 \ -1 $fastq1 \ @@ -27,12 +24,12 @@ function _classifier( `, :metaxa2, ) - ispath("$samplename.taxonomy.txt") || + exists(Path("$samplename.taxonomy.txt")) || error("metaxa2 ran, but $samplename.taxonomy.txt was not found!") - return abspath("$samplename.taxonomy.txt") + return absolute(Path("$samplename.taxonomy.txt")) end #function -function _taxonomy_traversal(samplename::AbstractString, taxonomy::AbstractString) +function _taxonomy_traversal(samplename::AbstractString, taxonomy::AbstractPath) runconda( `run metaxa2_ttt \ -i $taxonomy \ @@ -43,34 +40,31 @@ function _taxonomy_traversal(samplename::AbstractString, taxonomy::AbstractStrin `, :metaxa2, ) - ispath("$samplename.level_7.txt") || + exists(Path("$samplename.level_7.txt")) || error("metaxa2 ran, but $samplename.level_7.txt was not found!") - return abspath("$samplename.level_7.txt") + return absolute(Path("$samplename.level_7.txt")) end #function -function 
taxonomy( - samplename::AbstractString, - fastq::Tuple{<:AbstractString,<:AbstractString}, -) +function taxonomy(samplename::AbstractString, fastq::AbstractPath...) taxonomy_file = exec_in_temp_dir(_classifier, samplename, fastq...) level_7_taxonomy_file = exec_in_temp_dir(_taxonomy_traversal, samplename, taxonomy_file) return level_7_taxonomy_file end #function -function _dc(taxonomies::AbstractString...) +function _dc(taxonomies::AbstractPath...) runconda( `run metaxa2_dc \ -o feature-table.tsv \ - $(join(map(basename, taxonomies), ' ')) + $(map(basename, taxonomies)) `, :metaxa2, ) - ispath("feature-table.tsv") || + exists(Path("feature-table.tsv")) || error("metaxa2 ran, but feature-table.tsv was not found!") - return abspath("feature-table.tsv") + return absolute(Path("feature-table.tsv")) end #function -function data_collector(taxonomies::AbstractString...) +function data_collector(taxonomies::AbstractPath...) return exec_in_temp_dir(_dc, taxonomies...) end #function diff --git a/src/ProcessHelper.jl b/src/ProcessHelper.jl index ca01370..2592af9 100644 --- a/src/ProcessHelper.jl +++ b/src/ProcessHelper.jl @@ -1,5 +1,7 @@ module ProcessHelper +using FilePathsBase: AbstractPath, Path, canonicalize + export sym_temp """ @@ -9,13 +11,13 @@ export sym_temp Copies `file(s)` to a new temporary directory named `tmp_dir` and symbolically links them inside of that directory. Returns a tuple of the directory path and the path to the links. """ -function sym_temp(files::AbstractString...) - tmp_dir = mktempdir(; cleanup = false) +function sym_temp(files::AbstractPath...) 
+ tmp_dir = Path(mktempdir(; cleanup = false)) @info "Creating temporary directory $tmp_dir" - function _symlink(file::AbstractString) + function _symlink(file::AbstractPath) symlink_path = joinpath(tmp_dir, basename(file)) - symlink(realpath(file), symlink_path) + symlink(canonicalize(file), symlink_path) @info "Symlinked $file to $symlink_path" return symlink_path end #function @@ -23,12 +25,12 @@ function sym_temp(files::AbstractString...) return (tmp_dir, map(_symlink, files)) end #function -function exec_in_temp_dir(f::Function, samplename::AbstractString, files::AbstractString...) +function exec_in_temp_dir(f::Function, samplename::AbstractString, files::AbstractPath...) tmp_dir, tmp_files = sym_temp(files...) return cd(() -> f(samplename, tmp_files...), tmp_dir) end #function -function exec_in_temp_dir(f::Function, files::AbstractString...) +function exec_in_temp_dir(f::Function, files::AbstractPath...) tmp_dir, tmp_files = sym_temp(files...) return cd(() -> f(tmp_files...), tmp_dir) end #function From 7b3cb740d9fdee5268a542463f49be1a4f5aa797 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Tue, 15 Apr 2025 16:20:38 -0500 Subject: [PATCH 04/15] meta: Add input files to gitignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index f9c1b1b..0acd465 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,7 @@ docs/site/ # committed for packages, but should be committed for applications that require a static # environment. Manifest.toml + +### Input files ### +*.fastq.gz +metadata*.tsv From 516cc4985c38eb798e3f9dd5248477bc2d5917c2 Mon Sep 17 00:00:00 2001 From: "Thomas A. 
Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 08:35:53 -0500 Subject: [PATCH 05/15] refactor: Make exec_in_temp_dir more generic Instead of having the function have multiple methods with different signatures, we can slurp args and then use reflection to symlink any Path objects while passing the values unmodified. So do that. --- src/ProcessHelper.jl | 43 ++++++++++++++++++------------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/src/ProcessHelper.jl b/src/ProcessHelper.jl index 2592af9..d1a2f60 100644 --- a/src/ProcessHelper.jl +++ b/src/ProcessHelper.jl @@ -2,37 +2,30 @@ module ProcessHelper using FilePathsBase: AbstractPath, Path, canonicalize -export sym_temp +export exec_in_temp_dir -""" - sym_temp(file::AbstractString) -> (tmp_dir, link) - sym_temp(files::Tuple{<:AbstractString,<:AbstractString}) -> (tmp_dir, links) - -Copies `file(s)` to a new temporary directory named `tmp_dir` and symbolically links them -inside of that directory. Returns a tuple of the directory path and the path to the links. -""" -function sym_temp(files::AbstractPath...) +function exec_in_temp_dir(f::Function, args...) 
tmp_dir = Path(mktempdir(; cleanup = false)) @info "Creating temporary directory $tmp_dir" - function _symlink(file::AbstractPath) - symlink_path = joinpath(tmp_dir, basename(file)) - symlink(canonicalize(file), symlink_path) - @info "Symlinked $file to $symlink_path" - return symlink_path - end #function + if any(a -> a isa AbstractPath, args) + newargs = [] + for arg in args + if arg isa AbstractPath + symlink_path = joinpath(tmp_dir, basename(arg)) + symlink(canonicalize(arg), symlink_path) + @info "Symlinked $arg to $symlink_path" + push!(newargs, symlink_path) + else + push!(newargs, arg) + end #if + end #for - return (tmp_dir, map(_symlink, files)) -end #function + return cd(() -> f(newargs...), tmp_dir) + else + return cd(() -> f(args...), tmp_dir) + end #if -function exec_in_temp_dir(f::Function, samplename::AbstractString, files::AbstractPath...) - tmp_dir, tmp_files = sym_temp(files...) - return cd(() -> f(samplename, tmp_files...), tmp_dir) -end #function - -function exec_in_temp_dir(f::Function, files::AbstractPath...) - tmp_dir, tmp_files = sym_temp(files...) - return cd(() -> f(tmp_files...), tmp_dir) end #function end #module From d97c0898bc66bddb4be772d71cdcc651cfe5f0d2 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 16:06:56 -0500 Subject: [PATCH 06/15] feat: Add caching function --- Project.toml | 8 +++-- src/ProcessHelper.jl | 73 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 78 insertions(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index f89bd4e..32981f2 100644 --- a/Project.toml +++ b/Project.toml @@ -4,28 +4,32 @@ authors = ["Thomas A. 
Christensen II <25492070+MillironX@users.noreply.github.co version = "0.1.0" [deps] +BaseDirs = "18cc8868-cbac-4acf-b575-c8ff214dc66f" CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d" Cowsay = "b6370f49-8ad1-4651-ad9e-3639b35da0e9" -Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f" Glob = "c27321d9-0574-5035-807b-f59d2c89b15c" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" +SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce" URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6" [compat] +BaseDirs = "1.2.4" CSV = "0.10.15" +CodecZlib = "0.7.8" Conda = "1.10.2" Cowsay = "1.0.0" -Dagger = "0.18.14" DataFrames = "1.7.0" Distributed = "1.11.0" FilePathsBase = "0.9.24" Glob = "1.3.1" HTTP = "1.10.16" +SHA = "0.7.0" URIs = "1.5.2" YAML = "0.4.13" julia = "1.11" diff --git a/src/ProcessHelper.jl b/src/ProcessHelper.jl index d1a2f60..af513f2 100644 --- a/src/ProcessHelper.jl +++ b/src/ProcessHelper.jl @@ -1,9 +1,21 @@ module ProcessHelper -using FilePathsBase: AbstractPath, Path, canonicalize +using BaseDirs: BaseDirs +using CodecZlib: GzipCompressorStream, GzipDecompressorStream +using FilePathsBase: AbstractPath, Path, canonicalize, exists +using MacroTools: splitdef +using SHA: SHA1_CTX, update!, digest! +export cache_function_value export exec_in_temp_dir +const CACHE_DIR = Path( + BaseDirs.User.cache( + BaseDirs.Project("Cowcalf_rumen_metagenomic_pipeline"); + create = true, + ), +) + function exec_in_temp_dir(f::Function, args...) tmp_dir = Path(mktempdir(; cleanup = false)) @info "Creating temporary directory $tmp_dir" @@ -28,4 +40,63 @@ function exec_in_temp_dir(f::Function, args...) end #function +function cache_function_value(f::Function, args...) 
+ ctx = SHA1_CTX() + update!(ctx, Vector{UInt8}(String(Symbol(f)))) + for arg in args + if arg isa AbstractPath + open(string(arg), "r") do a + update!(ctx, read(a)) + end #do + else + update!(ctx, Vector{UInt8}(String(arg))) + end #if + end #for + input_hash = bytes2hex(digest!(ctx)) + + cache_file = joinpath(CACHE_DIR, input_hash) + + if exists(cache_file) + @info "Found cached file for function `$(String(Symbol(f)))` with input hash $input_hash" + tmp_dir = Path(mktempdir(; cleanup = false)) + @info "Creating temporary directory $tmp_dir" + + filename = readchomp(string(cache_file, ".filename")) + uncached_file = joinpath(tmp_dir, filename) + + open(GzipDecompressorStream, string(cache_file), "r") do compressed + open(string(uncached_file), "w") do uncompressed + write(uncompressed, read(compressed)) + end #do + end #do + + @info "Copied cached file for $input_hash to $uncached_file" + return canonicalize(uncached_file) + + end #if + + @info "Input hash $input_hash not found for function `$(String(Symbol(f)))`" + + result_file = exec_in_temp_dir(f, args...) + + result_file isa AbstractPath || error( + "`cache_function_value` can only work on functions that return an AbstractPath", + ) + + filename = basename(result_file) + open(string(cache_file, ".filename"), "w") do fnf + write(fnf, filename) + end #do + + open(string(result_file), "r") do uncompressed + open(GzipCompressorStream, string(cache_file), "w") do compressed + write(compressed, uncompressed) + end #do + end #do + @info "Copied result file $result_file into cache under hash $input_hash" + + return result_file + +end #function + end #module From e0b6a056ff7ed813688e638570c94628b02bb9c8 Mon Sep 17 00:00:00 2001 From: "Thomas A. 
Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 16:07:59 -0500 Subject: [PATCH 07/15] refactor: Switch to using cached file processes in Metaxa --- src/Metaxa.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/Metaxa.jl b/src/Metaxa.jl index 7b0e0ac..57f5fbb 100644 --- a/src/Metaxa.jl +++ b/src/Metaxa.jl @@ -5,7 +5,7 @@ using FilePathsBase: AbstractPath, Path, absolute, exists include("ProcessHelper.jl") -using .ProcessHelper: exec_in_temp_dir +using .ProcessHelper: cache_function_value export taxonomy @@ -46,8 +46,9 @@ function _taxonomy_traversal(samplename::AbstractString, taxonomy::AbstractPath) end #function function taxonomy(samplename::AbstractString, fastq::AbstractPath...) - taxonomy_file = exec_in_temp_dir(_classifier, samplename, fastq...) - level_7_taxonomy_file = exec_in_temp_dir(_taxonomy_traversal, samplename, taxonomy_file) + taxonomy_file = cache_function_value(_classifier, samplename, fastq...) + level_7_taxonomy_file = + cache_function_value(_taxonomy_traversal, samplename, taxonomy_file) return level_7_taxonomy_file end #function @@ -65,7 +66,7 @@ function _dc(taxonomies::AbstractPath...) end #function function data_collector(taxonomies::AbstractPath...) - return exec_in_temp_dir(_dc, taxonomies...) + return cache_function_value(_dc, taxonomies...) end #function end #module From 5c74bf3d69ff466324952a57382e328d7adaceaf Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 16:08:48 -0500 Subject: [PATCH 08/15] fixup! 
feat: Add caching function --- src/ProcessHelper.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ProcessHelper.jl b/src/ProcessHelper.jl index af513f2..2ea2bf6 100644 --- a/src/ProcessHelper.jl +++ b/src/ProcessHelper.jl @@ -3,7 +3,6 @@ module ProcessHelper using BaseDirs: BaseDirs using CodecZlib: GzipCompressorStream, GzipDecompressorStream using FilePathsBase: AbstractPath, Path, canonicalize, exists -using MacroTools: splitdef using SHA: SHA1_CTX, update!, digest! export cache_function_value From 30600e5e18d59304b5a5ce27a8aa919eacf86fb6 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 17:07:53 -0500 Subject: [PATCH 09/15] feat: Add feature table manipulation --- Project.toml | 2 ++ src/Cowcalf_rumen_metagenomic_pipeline.jl | 34 +++++++++++++++++++++-- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index 32981f2..3272bdb 100644 --- a/Project.toml +++ b/Project.toml @@ -16,6 +16,7 @@ Glob = "c27321d9-0574-5035-807b-f59d2c89b15c" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce" URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" +UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6" [compat] @@ -31,6 +32,7 @@ Glob = "1.3.1" HTTP = "1.10.16" SHA = "0.7.0" URIs = "1.5.2" +UUIDs = "1.11.0" YAML = "0.4.13" julia = "1.11" diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index fc915b8..fdc58a9 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -3,12 +3,13 @@ module Cowcalf_rumen_metagenomic_pipeline using Conda: Conda, runconda using Cowsay: cowsay using CSV: CSV -using DataFrames: DataFrame, rename! +using DataFrames: DataFrame, combine, eachrow, nrow, rename!, select! 
using Distributed: pmap, @everywhere using FilePathsBase: AbstractPath, Path, absolute, cwd, @__PATH__, @p_str using Glob: GlobMatch, glob using HTTP: HTTP using URIs: URI +using UUIDs: uuid4 using YAML: YAML export main @@ -62,6 +63,14 @@ function sample_files(samplenames::Vector{<:AbstractString}) return map(_s, samplenames) end #function +function qiime_feature_table(table_file::AbstractPath) + df = DataFrame(CSV.File(table_file)) + rename!(df, 1 => :taxonomy) + df[!, Symbol("#SampleId")] = [uuid4() for i = 1:nrow(df)] + select!(df, Symbol("#SampleId"), :) + return df +end #function + function (@main)(ARGS) metadata_file = Path(pop!(ARGS)) @@ -86,8 +95,27 @@ function (@main)(ARGS) end #@everywhere end #@eval taxonomy_files = pmap(x -> Metaxa.taxonomy(first(x), last(x)...), fastq_files) - feature_table = Metaxa.data_collector(taxonomy_files...) - cp(feature_table, joinpath(cwd(), p"feature-table.tsv")) + feature_table = qiime_feature_table(Metaxa.data_collector(taxonomy_files...)) + rarefaction_min = minimum( + eachrow( + combine( + feature_table, + names(feature_table, Not(:taxonomy, Symbol("#SampleId"))) .=> sum, + renamecols = false, + ), + )..., + ) + rarefaction_max = maximum( + eachrow( + combine( + feature_table, + names(feature_table, Not(:taxonomy, Symbol("#SampleId"))) .=> sum, + renamecols = false, + ), + )..., + ) + @show rarefaction_min + @show rarefaction_max cowsay("Hello from Cowcalf_rumen_metagenomic_pipeline") return 0 From 0c09cd9e33e117892a49829a12949f2e29549bf6 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 18:25:21 -0500 Subject: [PATCH 10/15] fixup! 
feat: Add feature table manipulation --- src/Cowcalf_rumen_metagenomic_pipeline.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index fdc58a9..c917470 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -65,6 +65,8 @@ end #function function qiime_feature_table(table_file::AbstractPath) df = DataFrame(CSV.File(table_file)) + + # Biom format requires that taxonomy hierarchy be named exactly "taxonomy" rename!(df, 1 => :taxonomy) df[!, Symbol("#SampleId")] = [uuid4() for i = 1:nrow(df)] select!(df, Symbol("#SampleId"), :) From 4462ddab6619fe070d71db7c6d5d4188a2794056 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 18:25:21 -0500 Subject: [PATCH 11/15] fixup! feat: Add feature table manipulation --- src/Cowcalf_rumen_metagenomic_pipeline.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index c917470..c89cfe7 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -68,6 +68,8 @@ function qiime_feature_table(table_file::AbstractPath) # Biom format requires that taxonomy hierarchy be named exactly "taxonomy" rename!(df, 1 => :taxonomy) + + # Every OTU in a feature table must have a universally unique id df[!, Symbol("#SampleId")] = [uuid4() for i = 1:nrow(df)] select!(df, Symbol("#SampleId"), :) return df From 9d124cb796b49a4f191addf5a213aff8b8f55355 Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 18:25:21 -0500 Subject: [PATCH 12/15] fixup! 
feat: Add feature table manipulation --- src/Cowcalf_rumen_metagenomic_pipeline.jl | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index c89cfe7..938375d 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -71,7 +71,16 @@ function qiime_feature_table(table_file::AbstractPath) # Every OTU in a feature table must have a universally unique id df[!, Symbol("#SampleId")] = [uuid4() for i = 1:nrow(df)] - select!(df, Symbol("#SampleId"), :) + + # Biom tables are sensitive to the position of the columns: + # arrange them so that #SampleId is first, taxonomy is last, + # and all the observed counts are in the middle + select!( + df, + Symbol("#SampleId"), + Not([Symbol("#SampleId"), :taxonomy]), + :taxonomy, + ) return df end #function From 630385829fae70f55b9ce620ee126b5b40aae63d Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 18:25:48 -0500 Subject: [PATCH 13/15] fixup! feat: Add feature table manipulation --- src/Cowcalf_rumen_metagenomic_pipeline.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index 938375d..30ae53b 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -3,7 +3,7 @@ module Cowcalf_rumen_metagenomic_pipeline using Conda: Conda, runconda using Cowsay: cowsay using CSV: CSV -using DataFrames: DataFrame, combine, eachrow, nrow, rename!, select! +using DataFrames: DataFrame, Not, combine, eachrow, nrow, rename!, select! using Distributed: pmap, @everywhere using FilePathsBase: AbstractPath, Path, absolute, cwd, @__PATH__, @p_str using Glob: GlobMatch, glob using HTTP: HTTP using URIs: URI From e5db18513379e3244e7c675cbb5f0f6612716a73 Mon Sep 17 00:00:00 2001 From: "Thomas A. 
Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 18:26:43 -0500 Subject: [PATCH 14/15] feat: Add DataFrame compat to cache --- src/ProcessHelper.jl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/ProcessHelper.jl b/src/ProcessHelper.jl index 2ea2bf6..e87ee8e 100644 --- a/src/ProcessHelper.jl +++ b/src/ProcessHelper.jl @@ -2,6 +2,8 @@ module ProcessHelper using BaseDirs: BaseDirs using CodecZlib: GzipCompressorStream, GzipDecompressorStream +using CSV: CSV +using DataFrames: DataFrame using FilePathsBase: AbstractPath, Path, canonicalize, exists using SHA: SHA1_CTX, update!, digest! @@ -47,6 +49,10 @@ function cache_function_value(f::Function, args...) open(string(arg), "r") do a update!(ctx, read(a)) end #do + elseif arg isa DataFrame + for row in CSV.RowWriter(arg) + update!(ctx, Vector{UInt8}(row)) + end #for else update!(ctx, Vector{UInt8}(String(arg))) end #if From f1072a4bb752abb4df2f244a390b2449cdfeee90 Mon Sep 17 00:00:00 2001 From: "Thomas A. 
Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 18 Apr 2025 18:27:14 -0500 Subject: [PATCH 15/15] feat: Add biom conversion step --- src/Cowcalf_rumen_metagenomic_pipeline.jl | 6 +++++ src/Qiime.jl | 33 +++++++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 src/Qiime.jl diff --git a/src/Cowcalf_rumen_metagenomic_pipeline.jl b/src/Cowcalf_rumen_metagenomic_pipeline.jl index 30ae53b..8de0d43 100644 --- a/src/Cowcalf_rumen_metagenomic_pipeline.jl +++ b/src/Cowcalf_rumen_metagenomic_pipeline.jl @@ -15,7 +15,9 @@ using YAML: YAML export main include("Metaxa.jl") +include("Qiime.jl") using .Metaxa: Metaxa +using .Qiime: Qiime _fetch_yaml_contents(yaml::AbstractPath) = YAML.load_file(string(yaml)) _fetch_yaml_contents(yaml::URI) = YAML.load(String(HTTP.get(yaml).body)) @@ -104,7 +106,9 @@ function (@main)(ARGS) @eval begin @everywhere begin include(joinpath(@__DIR__, "Metaxa.jl")) + include(joinpath(@__DIR__, "Qiime.jl")) using .Metaxa: Metaxa + using .Qiime: Qiime end #@everywhere end #@eval taxonomy_files = pmap(x -> Metaxa.taxonomy(first(x), last(x)...), fastq_files) @@ -130,6 +134,8 @@ function (@main)(ARGS) @show rarefaction_min @show rarefaction_max + cp(Qiime.biom_feature_table(feature_table), cwd()) + cowsay("Hello from Cowcalf_rumen_metagenomic_pipeline") return 0 end #function diff --git a/src/Qiime.jl b/src/Qiime.jl new file mode 100644 index 0000000..89b8378 --- /dev/null +++ b/src/Qiime.jl @@ -0,0 +1,33 @@ +module Qiime + +using Base: cache_file_entry +using Conda: runconda +using CSV: CSV +using DataFrames: DataFrame +using FilePathsBase: AbstractPath, Path, absolute, exists + +include("ProcessHelper.jl") + +using .ProcessHelper: cache_function_value + +function _write_biom_feature_table(df_feature_table::DataFrame) + CSV.write("feature-table.tsv", df_feature_table; delim = '\t') + runconda( + `run biom convert \ + -i feature-table.tsv \ + -o feature-table.hdf5.biom \ + --table-type="OTU table" \ + --to-hdf5 \ + 
--process-obs-metadata taxonomy`, + :qiime2, + ) + exists(Path("feature-table.hdf5.biom")) || + error("qiime2 ran, but feature-table.hdf5.biom was not found!") + return absolute(Path("feature-table.hdf5.biom")) +end #function + +function biom_feature_table(df_feature_table::DataFrame) + return cache_function_value(_write_biom_feature_table, df_feature_table) +end #function + +end #module