diff --git a/.actrc b/.actrc deleted file mode 100644 index 027d95f..0000000 --- a/.actrc +++ /dev/null @@ -1 +0,0 @@ --P ubuntu-latest=ghcr.io/catthehacker/ubuntu:act-latest diff --git a/.cspell/bioinformatics.txt b/.cspell/bioinformatics.txt deleted file mode 100644 index 676f294..0000000 --- a/.cspell/bioinformatics.txt +++ /dev/null @@ -1,4 +0,0 @@ -bam -haplotype -haplotypes -vcf diff --git a/.cspell/julia.txt b/.cspell/julia.txt deleted file mode 100644 index f159bc2..0000000 --- a/.cspell/julia.txt +++ /dev/null @@ -1,2 +0,0 @@ -trunc -jldoctest diff --git a/.envrc b/.envrc deleted file mode 100644 index 1d953f4..0000000 --- a/.envrc +++ /dev/null @@ -1 +0,0 @@ -use nix diff --git a/.markdownlint.yaml b/.markdownlint.yaml deleted file mode 100644 index 877dd93..0000000 --- a/.markdownlint.yaml +++ /dev/null @@ -1,9 +0,0 @@ -MD007: - indent: 2 -MD024: - allow_different_nesting: true -MD026: - punctuation: ",;:。,;:" -MD030: - ul_single: 1 - ul_multi: 1 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index 5e5d52f..0000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,18 +0,0 @@ -repos: - - repo: https://github.com/domluna/JuliaFormatter.jl - rev: v1.0.22 - hooks: - - id: julia-formatter - - repo: https://github.com/DavidAnson/markdownlint-cli2 - rev: v0.6.0 - hooks: - - id: markdownlint-cli2-fix - exclude: "docs" - - repo: https://github.com/pre-commit/mirrors-prettier - rev: v2.7.1 - hooks: - - id: prettier - additional_dependencies: - - prettier@2.7.1 - - prettier-plugin-sh@0.11.0 - - prettier-plugin-toml@0.3.1 diff --git a/.prettierignore b/.prettierignore deleted file mode 100644 index 9c4a2fa..0000000 --- a/.prettierignore +++ /dev/null @@ -1,40 +0,0 @@ -### Julia gitignore ### -# Files generated by invoking Julia with --code-coverage -*.jl.cov -*.jl.*.cov - -# Files generated by invoking Julia with --track-allocation -*.jl.mem - -# System-specific files and directories generated by the BinaryProvider and 
BinDeps packages -# They contain absolute paths specific to the host computer, and so should not be committed -deps/deps.jl -deps/build.log -deps/downloads/ -deps/usr/ -deps/src/ - -# Build artifacts for creating documentation generated by the Documenter package -docs/build/ -docs/site/ -docs/Manifest.toml - -# File generated by Pkg, the package manager, based on a corresponding Project.toml -# It records a fixed state of all packages used by the project. As such, it should not be -# committed for packages, but should be committed for applications that require a static -# environment. -Manifest.toml - -# Files generated during compilation/testing -build -example/output.* -example/reference.fasta.fai - -### Julia prettierignore ### -# Project file managed and formatted by Pkg.jl -Project.toml - -# Prettier doesn't understand the nuance of Julia markdown, so ignore docs -# entirely -docs/src/*.md -docs/src/*/*.md diff --git a/.prettierrc.toml b/.prettierrc.toml deleted file mode 100644 index 5a09f1f..0000000 --- a/.prettierrc.toml +++ /dev/null @@ -1,12 +0,0 @@ -tabWidth = 4 -proseWrap = "always" - -[[overrides]] -files = "*.{md,yml,yaml,json,Dockerfile,sh,svg}" - -[overrides.options] -tabWidth = 2 - -[[overrides]] -files = "*.md" -proseWrap = "always" diff --git a/CHANGELOG.md b/CHANGELOG.md index 9166e55..1d26aeb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,17 @@ and this project adheres to ## [Unreleased] +## [1.1.0] - 2023-12-22 + +### Changed + +- Documentation improved ([#62](https://github.com/ksumngs/HapLink.jl/pull/62)) +- Haplotype calling uses smallest possible bit depth + ([#61](https://github.com/ksumngs/HapLink.jl/pull/61)) +- Haplotype calling now uses sparse matrices + ([#60](https://github.com/ksumngs/HapLink.jl/pull/60)) - FASTX.jl downgraded to v1 - ([#56](https://github.com/ksumng/HapLink.jl/pull/56)) + ([#56](https://github.com/ksumngs/HapLink.jl/pull/56)) ## [1.0.0] - 2023-06-04 @@ -209,7 +218,8 @@ and this project adheres to - `Haplotype` - `Variant` 
-[unreleased]: https://github.com/ksumngs/HapLink.jl/compare/v1.0.0...HEAD +[unreleased]: https://github.com/ksumngs/HapLink.jl/compare/v1.1.0...HEAD +[1.1.0]: https://github.com/ksumngs/HapLink.jl/compare/v1.0.0...v1.1.0 [1.0.0]: https://github.com/ksumngs/HapLink.jl/compare/v0.7.1...v1.0.0 [0.7.1]: https://github.com/ksumngs/HapLink.jl/compare/v0.7.0...v0.7.1 [0.7.0]: https://github.com/ksumngs/HapLink.jl/compare/v0.6.0...v0.7.0 diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 768bbe1..0000000 --- a/Dockerfile +++ /dev/null @@ -1,41 +0,0 @@ -FROM ubuntu:focal - -ENV JULIA_VERSION 1.6.5 -ENV JULIA_DEPOT_PATH /.julia - -# Install the build dependencies -RUN \ - apt-get update \ - && apt-get install -y --no-install-recommends \ - curl \ - git \ - ca-certificates \ - build-essential - -# Install Julia -RUN \ - cd / \ - && curl -L "https://julialang-s3.julialang.org/bin/linux/x64/1.6/julia-${JULIA_VERSION}-linux-x86_64.tar.gz" | tar xvz \ - && ln -s /julia-${JULIA_VERSION}/bin/julia /usr/bin/julia - -# Install PackageCompiler.jl -RUN \ - julia -e 'using Pkg; Pkg.add("PackageCompiler")' - -# Copy HapLink.jl -COPY . /HapLink.jl - -# Clone and build HapLink.jl -RUN \ - cd /HapLink.jl \ - && git clean -dfx \ - && julia -e 'using Pkg; Pkg.activate("."); Pkg.instantiate()' \ - && julia -e 'using PackageCompiler; create_app(".", "build", precompile_execution_file="precompile_app.jl", executables=["haplink" => "haplink"], cpu_target="x86-64")' - -FROM ubuntu:focal - -COPY --from=0 /HapLink.jl/build/bin /usr/bin -COPY --from=0 /HapLink.jl/build/lib /usr/lib -COPY --from=0 /HapLink.jl/build/share /usr/share - -ENTRYPOINT ["/usr/bin/haplink"] diff --git a/Earthfile b/Earthfile deleted file mode 100644 index d93b6e5..0000000 --- a/Earthfile +++ /dev/null @@ -1,85 +0,0 @@ -VERSION 0.7 -FROM alpine:3.17 - -docs: - FROM julia:alpine3.18 - COPY --dir src . - COPY --dir docs . - COPY Project.toml . 
- RUN apk add --update --no-cache git - RUN julia --color=yes --project=docs -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()' - RUN julia --color=yes --project=docs -e 'using Documenter: DocMeta, doctest; using HapLink; DocMeta.setdocmeta!(HapLink, :DocTestSetup, :(using HapLink); recursive=true); doctest(HapLink)' - RUN julia --color=yes --project=docs docs/make.jl - SAVE ARTIFACT docs/build AS LOCAL local-output/site - -test-all: - FOR JULIA_VERSION IN 1 rc 1.6 - BUILD +test --version $JULIA_VERSION - END - -test: - ARG version=latest - ARG project=@. - ARG precompile=no - ARG check_bounds=yes - ARG coverage=true - ARG depwarn=yes - ARG force_latest_compatible_version=auto - ARG inline=yes - ARG prefix='' - ARG annotate=false - ARG runtest_version=1.9.3 - FROM julia:$version - COPY --dir src . - COPY --dir docs . - COPY --dir test . - COPY Project.toml . - ENV JULIA_PKG_PRECOMPILE_AUTO=$precompile - ENV ANNOTATE=$annotate - ENV COVERAGE=$coverage - ENV FORCE_LATEST_COMPATIBLE_VERSION=$force_latest_compatible_version - ENV CHECK_BOUNDS=$check_bounds - ENV INPUT_DIRECTORIES='src,ext' - RUN julia --color=yes --project=$project -e ' \ - import Pkg; \ - VERSION >= v"1.5-" && Pkg.Registry.add("General"); \ - VERSION >= v"1.1.0-rc1" ? Pkg.build(verbose=true) : Pkg.build()' - RUN julia --color=yes -e '\ - if v"1.8pre" < VERSION < v"1.9.0-beta3"; \ - using Pkg; \ - Pkg.activate("tests-logger-env"; shared=true); \ - Pkg.add(Pkg.PackageSpec(name="GitHubActions", version="0.1")); \ - end' - RUN curl -L https://github.com/julia-actions/julia-runtest/archive/refs/tags/v1.9.3.tar.gz | tar xvz - RUN julia --color=yes \ - --depwarn=yes \ - --inline=yes \ - --project=@. 
\ - -e 'include(joinpath("julia-runtest-1.9.3", "test_harness.jl"))' - RUN curl -L https://github.com/julia-actions/julia-processcoverage/archive/refs/tags/v1.2.2.tar.gz | tar xvz - RUN julia --color=yes julia-processcoverage-1.2.2/main.jl - SAVE ARTIFACT lcov.info AS LOCAL local-output/lcov.$version.info - -compiler: - FROM julia:latest - RUN apt-get update && apt-get install -y --no-install-recommends curl git ca-certificates build-essential - -build: - FROM +compiler - COPY --dir src . - COPY --dir deps . - COPY --dir example . - COPY Project.toml . - COPY Comonicon.toml . - RUN julia --project -e 'using Pkg; Pkg.instantiate()' - RUN julia --project deps/build.jl app - SAVE ARTIFACT build AS LOCAL build - -docker: - FROM ubuntu:focal - ARG TAG=latest - COPY +build/build/haplink/bin /usr/bin - COPY +build/build/haplink/lib /usr/lib - COPY +build/build/haplink/share /usr/share - ENTRYPOINT ["/usr/bin/haplink"] - SAVE IMAGE --push millironx/haplink:${TAG} diff --git a/Project.toml b/Project.toml index d76edb9..8480f3c 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "HapLink" uuid = "8ca39d33-de0d-4205-9b21-13a80f2b7eed" authors = ["Thomas A. 
Christensen II, Kansas State University, and contributors"] -version = "1.0.0" +version = "1.1.0" [deps] ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" @@ -38,14 +38,19 @@ BioSequences = "3" BioSymbols = "5" Combinatorics = "1" Comonicon = "1" +Dates = "1.6" Distributions = "0.25" FASTX = "1.3" FilePaths = "0.8" GenomicFeatures = "2" HypothesisTests = "0.10" OrderedCollections = "1.4" +Pkg = "1.6" +Random = "1.6" +SHA = "0.7, 1" SequenceVariation = "0.2.2" SparseArrayKit = "0.2.1" +Statistics = "1.6" VariantCallFormat = "0.5" XAM = "0.3.1" YAML = "0.4" diff --git a/README.md b/README.md index c4f9a31..4673e60 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ # HapLink -[![Project Status: WIP – Initial development is in progress, but there has not yet been a stable, usable release suitable for the public.](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip) +[![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) [![GitHub tag (latest by date)](https://img.shields.io/github/v/tag/ksumngs/HapLink.jl?label=version)](https://github.com/ksumngs/HapLink.jl/blob/master/CHANGELOG.md) [![License](https://img.shields.io/github/license/ksumngs/HapLink.jl)](https://github.com/ksumngs/HapLink.jl/) [![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://ksumngs.github.io/HapLink.jl/stable) @@ -10,6 +10,8 @@ [![Build Status](https://github.com/ksumngs/HapLink.jl/workflows/CI/badge.svg)](https://github.com/ksumngs/HapLink.jl/actions) [![Coverage](https://codecov.io/gh/ksumngs/HapLink.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/ksumngs/HapLink.jl) [![Code Style: Blue](https://img.shields.io/badge/code%20style-blue-4495d1.svg)](https://github.com/invenia/BlueStyle) +[![Conda (channel 
only)](https://img.shields.io/conda/vn/bioconda/haplink?color=green)](https://anaconda.org/bioconda/haplink) +[![Genie Downloads](https://shields.io/endpoint?url=https://pkgs.genieframework.com/api/v1/badge/HapLink)](https://pkgs.genieframework.com?packages=HapLink) @@ -24,43 +26,46 @@ the entire genome. Comes with its own variant caller. ## Installation -To run HapLink, your system must meet the following requirements +### :snake: Via Bioconda -- Linux OS -- glibc -- x86-64 CPU +**:warning::penguin: Linux-only!** -These restrictions apply even when using the package from within Julia. - -If you need to use HapLink somewhere else, everything needed is available in a -[Docker image over on Quay]. - -### Prebuilt Binaries - -To install our Hot-and-Ready binaries, run the following command: - - +_Recommended for running HapLink on the **command line**_ ```bash -mkdir -p ~/.local/opt/HapLink-0.7.1 -curl -L https://github.com/ksumngs/HapLink.jl/releases/download/v0.7.1/HapLink-v0.7.1_linux.x86_64.tar.gz | tar xzv -C ~/.local/opt/HapLink-0.7.1 -ln -s ~/.local/opt/HapLink-0.7.1/bin/haplink ~/.local/bin +conda create -n haplink -c bioconda -c conda-forge haplink -y +conda activate haplink ``` - +### ∴ Via Julia REPL -### Julia Package +_Recommended for running HapLink within a **Julia session**_ -HapLink is not in the General Registry (yet!), so install using the `URL#tag` -syntax to use in the REPL. +```julia-repl +julia> ] +(@v1.6) pkg> add HapLink +``` -```julia -using Pkg; Pkg.add("https://github.com/ksumngs/HapLink#v0.7.1") +To use this install of HapLink from the command line, you will need to add +`$HOME/.julia/bin` to your `$PATH`. 
+ +### :package: Via Apptainer + +_Recommended for running HapLink on an **HPC**_ + +```bash +apptainer pull docker://ghcr.io/ksumngs/haplink.jl +``` + +### :whale: Via Docker + +```bash +docker pull ghcr.io/ksumngs/haplink.jl:latest ``` ## Usage -Please check [the docs] for actually useful instructions on how to use HapLink, +Please check [the docs] for more detailed instructions on how to use HapLink, both on the command line and in the REPL. The basic flow of HapLink is @@ -79,19 +84,20 @@ You can see how this works using the files in the [example directory]: ```bash haplink variants \ - --bam example/sample.bam \ - --reference example/reference.fasta ---output sample.vcf + example/reference.fasta \ + example/sample.bam \ + > sample.vcf haplink haplotypes \ - --bam example/sample.bam \ - --variants sample.vcf \ - --output sample.yaml + example/reference.fasta \ + sample.vcf \ + example/sample.bam \ + > sample.yaml haplink sequences \ - --haplotypes sample.yaml \ - --reference example/reference.fasta \ - --output sample.fasta + example/reference.fasta \ + sample.yaml \ + sample.fasta ``` ## Development @@ -100,12 +106,23 @@ HapLink is written in [Julia]. While the focus of the program is the command line interface (CLI), it also exposes a nearly identical API in the form of a Julia Package, which is described in [the docs]. +### Development environment + +For consistency, the recommended version of Julia as well as all the recommended +formatters and commit hooks are listed in a Nix file. If you have [direnv] and +[Nix] installed, then simply run + +```bash +direnv allow . +pre-commit install +``` + +to set up Julia and the commit hook tools. + ### Editing the package HapLink.jl is a self-contained Julia package, and its development process is identical to any other package as discussed in the [Pkg documentation]. -Personally, I tend to avoid the `dev` mode, and work straight from the cloned -package directory. 
```shellsession $ git clone https://github.com/ksumngs/HapLink.jl.git @@ -117,75 +134,22 @@ julia> using HapLink julia> ... ``` -### Creating the CLI application +To test your changes on the command line application, ensure that +`$HOME/.julia/bin` is on your `$PATH`, then from the Julia REPL -To work with the CLI directly, you can do one (or both) of the following - -#### 1. Create a shim - -> - _Fast to implement_ -> - _Changes are reflected immediately_ -> - _Slow execution time ([TTFP])_ - -In my `~/bin` directory, I have an executable file named `haplink` with the -following contents: - -```bash -#!/bin/sh -julia --project=$HOME/src/HapLink.jl -e 'using HapLink.haplink()' "$@" +```julia-repl +julia> ] +(@v1.6) pkg> activate . +(HapLink) pkg> build ``` -#### 2. Compile the binary - -> - _More involved implementation_ -> - _Updates must be recompiled_ -> - _Fast execution time_ - -Binaries are compiled using [PackageCompiler.jl], using the recipe in -[.github/workflows/build.yml]. - -1. Get the [official Julia release] (disto packages generally don't work) -2. Install PackageCompiler into that Julia depot - - ```shellsession - (@v1.6) pkg> install PackageCompiler - ``` - -3. Run `PackageCompiler.create_app()` with the following options - - ```julia - using PackageCompiler - create_app( - "/path/to/HapLink.jl", - "/path/to/output", - precompile_execution_file="precompile_app.jl", - executables=["haplink" => "haplink"], - cpu_target="x86-64", - ) - ``` - -Compilation can take over 15 minutes to complete, so be patient! - -## Contributors - -It's pretty lonely here: HapLink was solely made by Thomas Christensen while -working at Kansas State University. Why don't you [open a pull request] and fix -that? +This will update the application shim to include your changes. 
[semver]: https://semver.org [oneflow]: https://www.endoflineblog.com/oneflow-a-git-branching-model-and-workflow -[docker image over on quay]: - https://quay.io/repository/millironx/julia_bam-readcounts [the docs]: https://ksumngs.github.io/HapLink.jl/stable [example directory]: https://github.com/ksumngs/HapLink.jl/tree/master/example [julia]: https://julialang.org [pkg documentation]: https://pkgdocs.julialang.org/v1/managing-packages/#developing -[ttfp]: https://viralinstruction.com/posts/badjulia/#compile_time_latency -[packagecompiler.jl]: - https://julialang.github.io/PackageCompiler.jl/stable/apps.html -[.github/workflows/build.yml]: - https://github.com/ksumngs/HapLink.jl/blob/master/.github/workflows/build.yml -[official julia release]: https://julialang.org/downloads/ -[open a pull request]: https://github.com/ksumngs/HapLink.jl/compare diff --git a/cspell.json b/cspell.json deleted file mode 100644 index 9ed43b2..0000000 --- a/cspell.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "version": "0.2", - "language": "en", - "allowCompoundWords": true, - "dictionaryDefinitions": [ - { "name": "bioinformatics", "path": "./.cspell/bioinformatics.txt" }, - { "name": "julia", "path": "./.cspell/julia.txt" } - ], - "dictionaries": ["bioinformatics", "julia"], - "includeRegExpList": ["#.*", "string"] -} diff --git a/default.nix b/default.nix deleted file mode 100644 index e331e9f..0000000 --- a/default.nix +++ /dev/null @@ -1,10 +0,0 @@ -{ pkgs ? import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/2f82431c7fdfa641f9816011286a2fa2c489eedb.tar.gz") {} }: - -pkgs.mkShell { - buildInputs = [ - pkgs.julia - pkgs.pre-commit - pkgs.nodejs - ]; - -} diff --git a/docs/src/index.md b/docs/src/index.md index c99f81f..ddd0cc8 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -14,22 +14,16 @@ HapLink ## Welcome Howdy! 🤠 And welcome to HapLink! 
👋 HapLink is a command-line suite of tools to -enable the exploration of viral quasispecies within a single metagenomic sample. -Every piece eventually builds up to our viral haplotype caller, which uses -linkage disequilibrium on long sequencing reads (💡 think +enable the exploration of viral quasispecies within a single sample. +Our viral haplotype caller uses +linkage disequilibrium on long sequencing reads (think [Oxford Nanopore](https://nanoporetech.com/) or [PacBio HiFi](https://www.pacb.com/)) to identify genetic mutations that are -conserved within a single virus particle. +likely conserved within a single virus particle. This manual will cover the different ways of using HapLink, starting with a few tutorials before diving into the details of our reference section. -### Contents - -```@contents - -``` - ## Getting started Ready to dive in? 🤿 Here's a 30,000-foot view @@ -45,9 +39,9 @@ julia \ echo 'export PATH=$HOME/.julia/bin:$PATH' >> $HOME/.bashrc source ~/.bashrc -wget https://github.com/ksumngs/HapLink.jl/raw/v1.0.0-rc1/example/reference.fasta -wget https://github.com/ksumngs/HapLink.jl/raw/v1.0.0-rc1/example/sample.bam -wget https://github.com/ksumngs/HapLink.jl/raw/v1.0.0-rc1/example/sample.bam.bai +wget https://github.com/ksumngs/HapLink.jl/raw/v1.0.0/example/reference.fasta +wget https://github.com/ksumngs/HapLink.jl/raw/v1.0.0/example/sample.bam +wget https://github.com/ksumngs/HapLink.jl/raw/v1.0.0/example/sample.bam.bai haplink variants \ reference.fasta \ diff --git a/docs/src/tutorial/1-install.md b/docs/src/tutorial/1-install.md index 2ddd320..a5fe8c1 100644 --- a/docs/src/tutorial/1-install.md +++ b/docs/src/tutorial/1-install.md @@ -1,10 +1,7 @@ # [In the beginning](@id install-tutorial) -There are many different ways to install HapLink. Here we walk you through two -of the most common. If you're one of the 0.01% who needs a different method, -then we trust you can extrapolate from these instructions. 
Note that all of -these tutorials assume you have a Unix-type system (MacOS, BSD, Linux). Windows -command-line support is basically non-existant! +There are many different ways to install HapLink. Note that some of these +install methods are platform-specific. ```@contents Pages = ["1-install.md"] @@ -18,6 +15,10 @@ here to judge. 👩‍⚖️ It's easy and portable and is bundled on most HPCs. already have conda (or [mamba](https://mamba.readthedocs.io/en/latest/)), then this route is probably for you. +!!! warning + + Bioconda install is only supported on Linux + ### Install HapLink inside a conda environment We'll make a new environment with the totally original name "haplink," to house @@ -43,14 +44,83 @@ Next, cross your fingers 🤞 and run the following command: haplink --help ``` -Check for error messages, but otherwise you're done. You can reuse your +The most common error is + +```shellsession +The following package could not be installed +└─ haplink does not exist (perhaps a typo or a missing channel). +``` + +If this happens, + + 1. Check your spelling in the install command + 2. Check that you are using an x86-64 version of conda on Linux + +Another common error is + +```shellsession +bash: haplink: command not found +bash: /bin/julia: No such file or directory +``` + +If this happens, check that `CONDA_PREFIX` is set correctly by running +`echo "$CONDA_PREFIX"`, and/or rerun `conda activate haplink`. + +If there are no error messages, you're done. You can reuse your `haplink` environment for the [next tutorial](@ref cli-tutorial). -## Comonicon +## Container install + +One option for installing HapLink is don't install HapLink. Or rather, pull a +[container](https://apptainer.org/docs/user/1.2/introduction.html#why-use-containers) +that already has HapLink installed, and process files inside of it. HapLink +provides a Docker container that has been tested on [Apptainer](https://apptainer.org). 
+You should be able to use nearly any container software to run HapLink, but we +recommend Apptainer, due to its ubiquity on HPCs, simple file permissions, and +increased security. + +### Download the container + +With Apptainer installed, run + +```bash +apptainer pull docker://ghcr.io/ksumngs/haplink.jl +``` + +!!! info "Output" + + - haplink.jl_latest.sif + +### Run the container as a one-off + +You can check to see if the container downloaded correctly by using the +`apptainer exec` command. + +```bash +apptainer exec haplink.jl_latest.sif haplink --version +``` + +### Enter the container to run multiple commands + +For more complex commands, it is often better to enter the container's shell +environment and execute commands within the container. Apptainer will include +all files in your working directory as part of the container when doing this. + +```shellsession +$ apptainer shell haplink.jl_latest.sif +Apptainer> haplink --version +``` + +## Julia dependent-install HapLink is unashamedly a Julia program. If you already have Julia installed, -then you can leverage that existing Julia install to install HapLink thanks to -the power of [Comonicon.jl](https://comonicon.org/). +then you can leverage that existing Julia install to install HapLink. + +!!! tip + + Under the hood, HapLink can self-install thanks to the power of + [Comonicon.jl](https://comonicon.org/). Check out their docs if you want to + learn more, or want to troubleshoot a direct install. ### Check your Julia version @@ -63,9 +133,9 @@ julia --version ### Add HapLink to a temporary environment and install -Using the magic 🪄 of Julia's environments, we can do a "temp install" of the +Using the magic of Julia's environments, we can do a "temp install" of the HapLink package to a temporary directory environment. 
Because this is a fresh -install, though, it will trigger Comonicon to install the application to a +install, though, it will trigger an installation of the application to a brand-new isolated environment. ```bash diff --git a/docs/src/tutorial/2-examples.md b/docs/src/tutorial/2-examples.md index 7f3c1a9..153f338 100644 --- a/docs/src/tutorial/2-examples.md +++ b/docs/src/tutorial/2-examples.md @@ -1,7 +1,7 @@ -# [Kicking the tires](@id cli-tutorial) +# [Starting at the command line](@id cli-tutorial) At this point, we'll play with the example sequences included _gratis_ 💰 with -HapLink. No, they don't represent anything ☁️, and they aren't particularly +HapLink. No, they don't represent anything, and they aren't particularly interesting 🥱, but they **do** run fast 🏇, so we can get a handle on how the interface and workflow operate. @@ -9,14 +9,14 @@ interface and workflow operate. Pages = ["2-examples.md"] ``` -## Getting the goods +## Getting the goods: extracting example files Let's get the example files from the code repository. In your terminal, run ```bash -wget https://github.com/ksumngs/HapLink.jl/raw/v1.0.0-rc1/example/reference.fasta -wget https://github.com/ksumngs/HapLink.jl/raw/v1.0.0-rc1/example/sample.bam -wget https://github.com/ksumngs/HapLink.jl/raw/v1.0.0-rc1/example/sample.bam.bai +wget https://github.com/ksumngs/HapLink.jl/raw/v1.0.0/example/reference.fasta +wget https://github.com/ksumngs/HapLink.jl/raw/v1.0.0/example/sample.bam +wget https://github.com/ksumngs/HapLink.jl/raw/v1.0.0/example/sample.bam.bai ``` !!! info "Output" @@ -25,7 +25,7 @@ wget https://github.com/ksumngs/HapLink.jl/raw/v1.0.0-rc1/example/sample.bam.bai - sample.bam - sample.bam.bai -## Spot the difference +## Spot the difference: differentiating between sequence error and mutations In order for HapLink to call haplotypes, it needs to know which sequence differences are due to sequencing errors, and which are due to genetic mutation. 
@@ -43,11 +43,11 @@ haplink variants reference.fasta sample.bam _None_ HapLink by default outputs to standard output, so the variant calls were printed -on your screen instead of saved 😡. That's okay, though 😌. It's often good to -visually check your variant calls, and it this case we absolutely needed to. +on your screen instead of saved. That's okay, though. It's often good to +visually check your variant calls, and in this case, we absolutely needed to. Notice that none of the variants got a `PASS` filter. In fact, all of them were -weeded out by too high of thresholds for depth (remember we only have 10 -sequences) and significance. Let's readjust (and save our results this time). +weeded out by the depth threshold (remember we only have 10 sequences) and +significance. Let's readjust (and save our results this time). ```bash haplink \ @@ -65,7 +65,7 @@ haplink \ These settings seemed to work out well. Let's stick with them and move on. -## The general lay of the land +## The general lay of the land: generating consensus sequences from variant calls At this point, we're going to take a break from haplotype calling and convert those variant calls into a useful summary: the consensus sequence. HapLink can @@ -79,10 +79,10 @@ haplink consensus reference.fasta sample.vcf | tee sample.consensus.fasta - sample.consensus.fasta -## The star attraction +## The star attraction: calling haplotypes from sequence data And now it's time for haplotype calling. Before you get your hopes up, there are -no _true_ haplotypes in this file. If 10 reads could yield subconsenus +no _true_ haplotypes in this file. If 10 reads could manifest subconsensus mysteries, then bioinformatics would be a super easy job. Alas, we live in the real world, and we'll have to stretch mathematical constructs to get anything out of these reads. @@ -106,7 +106,7 @@ You can see that HapLink found only one haplotype in this alignment, but formatted in HapLink's haplotype scheme. 
The first haplotype in any output file is always the consensus sequence. -## Haplotypes in the Matrix +## Haplotypes in the Matrix: simulating additional reads during haplotype calling If you have reads that don't span the entire genome (like we have here), you can use HapLink's maximum likelihood simulator to "create" full-length reads by @@ -136,7 +136,7 @@ Still nothing, huh? Like I said, no haplotypes here, and simulation can't change that. Note that simulating full-length reads used _a lot_ more computational power, so you should try to stick with full-length reads when you can! -## But, what does it mean? +## But, what does it mean? Reformatting into fasta format HapLink's haplotype YAML files contain everything needed to recreate the haplotype computation, but they can't really be used by any other programs. diff --git a/docs/src/tutorial/3-other.md b/docs/src/tutorial/3-other.md index a39e352..847abde 100644 --- a/docs/src/tutorial/3-other.md +++ b/docs/src/tutorial/3-other.md @@ -1,6 +1,6 @@ -# [Playing well with others](@id integration-tutorial) +# [Playing well with others: Combining HapLink with external tools](@id integration-tutorial) -HapLink is not a one-man show: it definitely knows how to cooperate with other +HapLink is not a one-man show: it knows how to cooperate with other tools! In this tutorial, we'll let HapLink do the haplotype calling, but use other tools to go from reads to variant calls, and from haplotypes to phylogenies. @@ -53,7 +53,7 @@ esearch \ Next, we'll download one of the pools from the validation set from SRA. ```bash -fasterq-dump "SUB13489216" +fasterq-dump --concatenate-reads -X "SRR24796010" | gzip > "IDV-Aug2022-P2.fastq.gz" ``` !!! info "Output" @@ -66,7 +66,7 @@ fasterq-dump "SUB13489216" We have a set of Nanopore reads and a reference genome to go with them. We'll use [minimap2](https://doi.org/10.1093/bioinformatics/bty191) to align the reads to reference. 
minimap2 requires the `-a` flag to output in SAM format, and uses -the `-x` flag to tweak the settings for optimal Nanoore alignment. We then run +the `-x` flag to tweak the settings for optimal Nanopore alignment. We then run those reads through `samtools sort` and `samtools index` to reduce the computational load needed to find reads by our downstream tools, and `samtools view -b` to convert the SAM file into a compressed BAM file. diff --git a/docs/src/tutorial/4-repl.md b/docs/src/tutorial/4-repl.md index 0cb24f4..8a5e7af 100644 --- a/docs/src/tutorial/4-repl.md +++ b/docs/src/tutorial/4-repl.md @@ -4,9 +4,16 @@ Julia is an ahead-of-time compiled language. Practically, that means that every time you restart Julia, you have to recompile all the code you were running. Using HapLink on the command line involves up to four different commands. Translation: up to four cases where you lose time to recompiling code that was -just running. Surely there's a better way, right? Yep, you can stay within a +just running. Surely there's a better way, right? Well, you can stay within a single Julia session by using HapLink's REPL mode. +!!! tip + + Julia's latency (aka, Time-to-first-plot or TTFP) is a big deal among Julia + programmers. Although there's no "definitive" place to learn about TTFP, + [Jakob Nissen's blog](https://viralinstruction.com/posts/badjulia/#compile_time_latency) + provides some great explanations and actionable advice for reducing latency. + ```@contents Pages = ["4-repl.md"] ``` @@ -137,7 +144,7 @@ map!( Now that we have a consensus sequence, we can properly import the reads for haplotype calling into HapLink's specialized [`Pseudoread`](@ref) class. -There is a convient [`pseudoreads`](@ref) function that can directly convert a +There is a convenient [`pseudoreads`](@ref) function that can directly convert a BAM file for us. 
```@repl main diff --git a/src/haplotypecalling.jl b/src/haplotypecalling.jl index 1534fdc..4698d86 100644 --- a/src/haplotypecalling.jl +++ b/src/haplotypecalling.jl @@ -253,7 +253,16 @@ dimensional matrix. function occurrence_matrix( haplotype::AbstractArray{Variation{S,T}}, reads::AbstractArray{Haplotype{S,T}} ) where {S<:BioSequence,T<:BioSymbol} - hapcounts = SparseArray{UInt}(undef, Tuple(repeat([2], length(haplotype)))) + Q = UInt + for int_type in [UInt8, UInt16, UInt32, UInt64, UInt128] + if length(reads) < typemax(int_type) + Q = int_type + break + end #if + error("Too many reads to represent in memory") + end #for + + hapcounts = SparseArray{Q}(undef, Tuple(repeat([2], length(haplotype)))) for read in reads coordinates = zeros(Int, size(haplotype))