Compare commits

..

No commits in common. "feat/julia-rewrite" and "master" have entirely different histories.

8 changed files with 1 addition and 438 deletions

View file

@@ -1 +0,0 @@
style="blue"

32
.gitignore vendored
View file

@@ -2,34 +2,4 @@
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
### Julia gitignore ###
## Files generated by invoking Julia with --code-coverage
*.jl.cov
*.jl.*.cov
# Files generated by invoking Julia with --track-allocation
*.jl.mem
# System-specific files and directories generated by the BinaryProvider and BinDeps packages
# They contain absolute paths specific to the host computer, and so should not be committed
deps/deps.jl
deps/build.log
deps/downloads/
deps/usr/
deps/src/
# Build artifacts for creating documentation generated by the Documenter package
docs/build/
docs/site/
# File generated by Pkg, the package manager, based on a corresponding Project.toml
# It records a fixed state of all packages used by the project. As such, it should not be
# committed for packages, but should be committed for applications that require a static
# environment.
Manifest.toml
### Input files ###
*.fastq.gz
metadata*.tsv
!.vscode/extensions.json

View file

@@ -1,43 +0,0 @@
name = "Cowcalf_rumen_metagenomic_pipeline"
uuid = "7d0d08ee-6932-474e-8e49-cd3f4679ce2d"
authors = ["Thomas A. Christensen II <25492070+MillironX@users.noreply.github.com> and contributors"]
version = "0.1.0"
[deps]
BaseDirs = "18cc8868-cbac-4acf-b575-c8ff214dc66f"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d"
Cowsay = "b6370f49-8ad1-4651-ad9e-3639b35da0e9"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f"
Glob = "c27321d9-0574-5035-807b-f59d2c89b15c"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"
[compat]
BaseDirs = "1.2.4"
CSV = "0.10.15"
CodecZlib = "0.7.8"
Conda = "1.10.2"
Cowsay = "1.0.0"
DataFrames = "1.7.0"
Distributed = "1.11.0"
FilePathsBase = "0.9.24"
Glob = "1.3.1"
HTTP = "1.10.16"
SHA = "0.7.0"
URIs = "1.5.2"
UUIDs = "1.11.0"
YAML = "0.4.13"
julia = "1.11"
[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[targets]
test = ["Test"]

View file

@@ -1,8 +0,0 @@
channels:
- bioconda
- conda-forge
dependencies:
- metaxa=2.2
- blast-legacy=2.2.26
- hmmer=3.1
- mafft=7.525

View file

@@ -1,143 +0,0 @@
module Cowcalf_rumen_metagenomic_pipeline

using Conda: Conda, runconda
using Cowsay: cowsay
using CSV: CSV
using DataFrames: DataFrame, Not, combine, eachrow, nrow, rename!, select!
using Distributed: pmap, @everywhere
using FilePathsBase: AbstractPath, Path, absolute, cwd, @__PATH__, @p_str
using Glob: GlobMatch, glob
using HTTP: HTTP
using URIs: URI
using UUIDs: uuid4
using YAML: YAML

export main

include("Metaxa.jl")
include("Qiime.jl")
using .Metaxa: Metaxa
using .Qiime: Qiime

# Load a conda environment definition (channels + dependencies) from either a
# local YAML file or a remote URL, dispatching on the argument type.
_fetch_yaml_contents(yaml::AbstractPath) = YAML.load_file(string(yaml))
_fetch_yaml_contents(yaml::URI) = YAML.load(String(HTTP.get(yaml).body))

"""
    setup_remote_conda_environment(yaml::Union{AbstractPath,URI}, env_name::Symbol)

Create a Conda.jl-managed environment named `env_name` from the `channels` and
`dependencies` lists in `yaml` (a local path or remote URI), unless an
environment of that name already exists. Returns `env_name`.
"""
function setup_remote_conda_environment(yaml::Union{AbstractPath,URI}, env_name::Symbol)
    # Check for the existence of the environment with the same name before creating one
    # We assume that the existence of the environment means that it has been properly setup
    # TODO: Provide a way to force re-initialization of a conda environment
    isdir(Conda.prefix(env_name)) && return env_name

    ENV["CONDA_JL_USE_MINIFORGE"] = "1"

    # Install x86 packages via Rosetta2 on MacOS
    if Sys.isapple()
        ENV["CONDA_SUBDIR"] = "osx-64"
        runconda(`config --env --set subdir osx-64`, env_name)
    end #if

    conda_definition = _fetch_yaml_contents(yaml)
    # foreach (not map): add_channel is called only for its side effect
    foreach(c -> Conda.add_channel(c, env_name), conda_definition["channels"])
    Conda.add(conda_definition["dependencies"], env_name)
    return env_name
end #function

"""
    import_metadata(metadata_tsv::AbstractPath)

Read a tab-separated metadata file into a `DataFrame`, renaming the first
column to `:sample_name` (its original header is ignored).
"""
function import_metadata(metadata_tsv::AbstractPath)
    df = DataFrame(CSV.File(metadata_tsv))
    rename!(df, 1 => :sample_name)
    return df
end #function

"""
    sample_files(samplenames::Vector{<:AbstractString})

For each sample name, glob the current directory for its forward ("1") and
reverse ("2") gzipped FASTQ files. Returns a vector of
`(samplename, (fastq1, fastq2))` tuples containing absolute paths.
Throws if a pattern matches no file (via `first` on an empty glob result).
"""
function sample_files(samplenames::Vector{<:AbstractString})
    function _s(samplename::AbstractString)
        # Use explicit GlobMatch constructor b/c we need to interpolate values
        return (
            samplename,
            (
                absolute(Path(first(glob(GlobMatch("$(samplename)*1*.fastq.gz"))))),
                absolute(Path(first(glob(GlobMatch("$(samplename)*2*.fastq.gz"))))),
            ),
        )
    end #function
    return map(_s, samplenames)
end #function

"""
    qiime_feature_table(table_file::AbstractPath)

Read a metaxa2 feature table and massage it into the column layout the biom
format expects: a `#SampleId` column of fresh UUIDs first, observed counts in
the middle, and the `taxonomy` column last. Returns the `DataFrame`.
"""
function qiime_feature_table(table_file::AbstractPath)
    df = DataFrame(CSV.File(table_file))
    # Biom format requires that taxonomy hierarchy be named exactly "taxonomy"
    rename!(df, 1 => :taxonomy)
    # Every OTU in a feature table must have a universally unique id
    df[!, Symbol("#SampleId")] = [uuid4() for _ = 1:nrow(df)]
    # Biom tables are sensitive to the position of the columns:
    # arrange them so that #SampleId is first, taxonomy is last,
    # and all the observed counts are in the middle
    # BUG FIX: this previously operated on `feature_table`, a variable that is
    # undefined in this function (it is a local of `main`), which raised
    # UndefVarError at runtime; the local `df` is the intended target
    select!(
        df,
        Symbol("#SampleId"),
        Not([Symbol("#SampleId"), :taxonomy]),
        :taxonomy,
    )
    return df
end #function

"""
    main(ARGS)

Pipeline entry point. The last command-line argument must be the path to a
tab-separated metadata file whose first column holds sample names. Sets up the
qiime2 and metaxa2 conda environments, classifies each sample's paired FASTQ
reads in parallel, merges the per-sample taxonomies into a feature table,
reports rarefaction bounds, and copies the resulting biom table to the current
directory. Returns 0.
"""
function (@main)(ARGS)
    metadata_file = Path(pop!(ARGS))

    # Provision the conda tool environments used by the worker modules
    setup_remote_conda_environment(
        URI(
            "https://data.qiime2.org/distro/metagenome/qiime2-metagenome-2024.10-py310-osx-conda.yml",
        ),
        :qiime2,
    )
    setup_remote_conda_environment(
        joinpath(@__PATH__, p"..", p"conda_envs", p"metaxa2.yml"),
        :metaxa2,
    )

    metadata = import_metadata(metadata_file)
    fastq_files = sample_files(metadata[!, :sample_name])

    # Worker processes do not inherit this module's includes, so the helper
    # modules must be loaded on every process before `pmap` dispatches to them
    @eval begin
        @everywhere begin
            include(joinpath(@__DIR__, "Metaxa.jl"))
            include(joinpath(@__DIR__, "Qiime.jl"))
            using .Metaxa: Metaxa
            using .Qiime: Qiime
        end #@everywhere
    end #@eval

    # Classify each sample in parallel, then merge into a single feature table
    taxonomy_files = pmap(x -> Metaxa.taxonomy(first(x), last(x)...), fastq_files)
    feature_table = qiime_feature_table(Metaxa.data_collector(taxonomy_files...))

    # Compute the per-sample count totals once (previously this `combine` was
    # evaluated twice, once for the min and once for the max); the single
    # result row is splatted so minimum/maximum reduce over its values
    sample_totals = combine(
        feature_table,
        names(feature_table, Not(:taxonomy, Symbol("#SampleId"))) .=> sum,
        renamecols = false,
    )
    rarefaction_min = minimum(eachrow(sample_totals)...)
    rarefaction_max = maximum(eachrow(sample_totals)...)
    @show rarefaction_min
    @show rarefaction_max

    cp(Qiime.biom_feature_table(feature_table), cwd())
    cowsay("Hello from Cowcalf_rumen_metagenomic_pipeline")
    return 0
end #function

end #module

View file

@@ -1,72 +0,0 @@
module Metaxa

using Conda: runconda
using FilePathsBase: AbstractPath, Path, absolute, exists

include("ProcessHelper.jl")
using .ProcessHelper: cache_function_value

export taxonomy

"""
    _classifier(samplename, fastq1, fastq2)

Run `metaxa2` on one sample's paired-end reads inside the `:metaxa2` conda
environment. Returns the absolute path to `<samplename>.taxonomy.txt`, or
errors if metaxa2 finished without producing that file.
"""
function _classifier(samplename::AbstractString, fastq1::AbstractPath, fastq2::AbstractPath)
    runconda(
        `run metaxa2 \
        -1 $fastq1 \
        -2 $fastq2 \
        -o $samplename \
        --format fastq \
        --cpu 4 \
        --summary F \
        --graphical F \
        --fasta F \
        --taxonomy T
        `,
        :metaxa2,
    )
    taxonomy_output = Path("$samplename.taxonomy.txt")
    if !exists(taxonomy_output)
        error("metaxa2 ran, but $samplename.taxonomy.txt was not found!")
    end #if
    return absolute(taxonomy_output)
end #function

"""
    _taxonomy_traversal(samplename, taxonomy)

Run `metaxa2_ttt` on a taxonomy file to expand it to seven taxonomic levels.
Returns the absolute path to `<samplename>.level_7.txt`, or errors if the
tool finished without producing that file.
"""
function _taxonomy_traversal(samplename::AbstractString, taxonomy::AbstractPath)
    runconda(
        `run metaxa2_ttt \
        -i $taxonomy \
        -o $samplename \
        -m 7 \
        -n 7 \
        --summary F
        `,
        :metaxa2,
    )
    level7_output = Path("$samplename.level_7.txt")
    if !exists(level7_output)
        error("metaxa2 ran, but $samplename.level_7.txt was not found!")
    end #if
    return absolute(level7_output)
end #function

"""
    taxonomy(samplename, fastq...)

Classify a sample's reads and expand the result to level-7 taxonomy, with both
steps memoized on disk via `cache_function_value`. Returns the path to the
level-7 taxonomy file.
"""
function taxonomy(samplename::AbstractString, fastq::AbstractPath...)
    classified = cache_function_value(_classifier, samplename, fastq...)
    return cache_function_value(_taxonomy_traversal, samplename, classified)
end #function

"""
    _dc(taxonomies...)

Run `metaxa2_dc` to merge per-sample taxonomy files into one feature table.
Returns the absolute path to `feature-table.tsv`, or errors if the tool
finished without producing it.
"""
function _dc(taxonomies::AbstractPath...)
    runconda(
        `run metaxa2_dc \
        -o feature-table.tsv \
        $(map(basename, taxonomies))
        `,
        :metaxa2,
    )
    merged_table = Path("feature-table.tsv")
    if !exists(merged_table)
        error("metaxa2 ran, but feature-table.tsv was not found!")
    end #if
    return absolute(merged_table)
end #function

"""
    data_collector(taxonomies...)

Cached wrapper around `_dc`: reuses a previously merged feature table when the
same taxonomy files have been combined before.
"""
function data_collector(taxonomies::AbstractPath...)
    return cache_function_value(_dc, taxonomies...)
end #function

end #module

View file

@@ -1,107 +0,0 @@
module ProcessHelper

using BaseDirs: BaseDirs
using CodecZlib: GzipCompressorStream, GzipDecompressorStream
using CSV: CSV
using DataFrames: DataFrame
using FilePathsBase: AbstractPath, Path, canonicalize, exists
using SHA: SHA1_CTX, update!, digest!

export cache_function_value
export exec_in_temp_dir

# Per-user cache directory for this project; created on module load if absent
const CACHE_DIR = Path(
    BaseDirs.User.cache(
        BaseDirs.Project("Cowcalf_rumen_metagenomic_pipeline");
        create = true,
    ),
)

"""
    exec_in_temp_dir(f::Function, args...)

Run `f(args...)` with the working directory set to a fresh temporary
directory. Any `AbstractPath` arguments are symlinked into that directory and
replaced with the symlink paths, so `f` can address its inputs by basename.
Returns whatever `f` returns.

The temporary directory is deliberately created with `cleanup = false` so that
files `f` produces there survive past process exit (callers may return paths
into it).
"""
function exec_in_temp_dir(f::Function, args...)
    tmp_dir = Path(mktempdir(; cleanup = false))
    @info "Creating temporary directory $tmp_dir"
    if any(a -> a isa AbstractPath, args)
        # Untyped vector on purpose: it holds a mix of paths and non-path args
        newargs = []
        for arg in args
            if arg isa AbstractPath
                # Symlink the input into the temp dir under its basename
                symlink_path = joinpath(tmp_dir, basename(arg))
                symlink(canonicalize(arg), symlink_path)
                @info "Symlinked $arg to $symlink_path"
                push!(newargs, symlink_path)
            else
                push!(newargs, arg)
            end #if
        end #for
        return cd(() -> f(newargs...), tmp_dir)
    else
        # No path arguments: call through unchanged
        return cd(() -> f(args...), tmp_dir)
    end #if
end #function

"""
    cache_function_value(f::Function, args...)

Disk-memoize a file-producing function. A SHA-1 key is computed from `f`'s
name plus the *contents* of each argument; on a cache hit the stored result is
decompressed into a fresh temporary directory (restoring its original
filename from a `.filename` sidecar file) and that path is returned. On a
miss, `f` is run via [`exec_in_temp_dir`](@ref), its returned file is
gzip-compressed into `CACHE_DIR` under the key, and the original result path
is returned.

`f` must return an `AbstractPath`; otherwise an error is thrown (after `f`
has already run).
"""
function cache_function_value(f::Function, args...)
    # Build the cache key: hash the function name, then every argument's
    # contents, so the key changes whenever any input file or table changes
    ctx = SHA1_CTX()
    update!(ctx, Vector{UInt8}(String(Symbol(f))))
    for arg in args
        if arg isa AbstractPath
            # Hash the file's bytes, not its path string
            open(string(arg), "r") do a
                update!(ctx, read(a))
            end #do
        elseif arg isa DataFrame
            # Hash the table via its CSV serialization, row by row
            for row in CSV.RowWriter(arg)
                update!(ctx, Vector{UInt8}(row))
            end #for
        else
            # assumes `arg` converts via String(arg) (e.g. AbstractString or
            # Symbol) — TODO confirm no other argument types reach this branch
            update!(ctx, Vector{UInt8}(String(arg)))
        end #if
    end #for
    input_hash = bytes2hex(digest!(ctx))
    cache_file = joinpath(CACHE_DIR, input_hash)
    if exists(cache_file)
        # Cache hit: restore the compressed result into a fresh temp dir under
        # the filename recorded in the ".filename" sidecar
        @info "Found cached file for function `$(String(Symbol(f)))` with input hash $input_hash"
        tmp_dir = Path(mktempdir(; cleanup = false))
        @info "Creating temporary directory $tmp_dir"
        filename = readchomp(string(cache_file, ".filename"))
        uncached_file = joinpath(tmp_dir, filename)
        open(GzipDecompressorStream, string(cache_file), "r") do compressed
            open(string(uncached_file), "w") do uncompressed
                write(uncompressed, read(compressed))
            end #do
        end #do
        @info "Copied cached file for $input_hash to $uncached_file"
        return canonicalize(uncached_file)
    end #if
    # Cache miss: run the real computation
    @info "Input hash $input_hash not found for function `$(String(Symbol(f)))`"
    result_file = exec_in_temp_dir(f, args...)
    result_file isa AbstractPath || error(
        "`cache_function_value` can only work on functions that return an AbstractPath",
    )
    filename = basename(result_file)
    # Record the original filename so a later cache hit can restore it
    open(string(cache_file, ".filename"), "w") do fnf
        write(fnf, filename)
    end #do
    # Gzip the result into the cache under its content hash
    open(string(result_file), "r") do uncompressed
        open(GzipCompressorStream, string(cache_file), "w") do compressed
            write(compressed, uncompressed)
        end #do
    end #do
    @info "Copied result file $result_file into cache under hash $input_hash"
    return result_file
end #function

end #module

View file

@@ -1,33 +0,0 @@
module Qiime

# NOTE(review): removed the stray `using Base: cache_file_entry` that was here —
# it imported an internal, undocumented Base function that this module never
# used (most likely an accidental editor auto-import).
using Conda: runconda
using CSV: CSV
using DataFrames: DataFrame
using FilePathsBase: AbstractPath, Path, absolute, exists

include("ProcessHelper.jl")
using .ProcessHelper: cache_function_value

"""
    _write_biom_feature_table(df_feature_table::DataFrame)

Write `df_feature_table` as `feature-table.tsv` in the current directory, then
convert it to an HDF5 BIOM table with `biom convert` from the `:qiime2` conda
environment (observation metadata taken from the `taxonomy` column). Returns
the absolute path of `feature-table.hdf5.biom`, or errors if the conversion
did not produce that file.
"""
function _write_biom_feature_table(df_feature_table::DataFrame)
    CSV.write("feature-table.tsv", df_feature_table; delim = '\t')
    runconda(
        `run biom convert \
        -i feature-table.tsv \
        -o feature-table.hdf5.biom \
        --table-type="OTU table" \
        --to-hdf5 \
        --process-obs-metadata taxonomy`,
        :qiime2,
    )
    exists(Path("feature-table.hdf5.biom")) ||
        error("qiime2 ran, but feature-table.hdf5.biom was not found!")
    return absolute(Path("feature-table.hdf5.biom"))
end #function

"""
    biom_feature_table(df_feature_table::DataFrame)

Cached wrapper around `_write_biom_feature_table`: reuses a previously
generated BIOM file when the same table contents have been converted before.
"""
function biom_feature_table(df_feature_table::DataFrame)
    return cache_function_value(_write_biom_feature_table, df_feature_table)
end #function

end #module