From 07c0f925e624ede0068951e6e915a55aabf01581 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Fri, 17 Jan 2020 16:34:47 +1100 Subject: [PATCH 01/14] Generated files --- Project.toml | 4 ++++ src/XAM.jl | 5 +++++ 2 files changed, 9 insertions(+) create mode 100644 Project.toml create mode 100644 src/XAM.jl diff --git a/Project.toml b/Project.toml new file mode 100644 index 0000000..caa5811 --- /dev/null +++ b/Project.toml @@ -0,0 +1,4 @@ +name = "XAM" +uuid = "d759349c-bcba-11e9-07c2-5b90f8f05f7c" +authors = ["Ciarán O'Mara "] +version = "0.1.0" diff --git a/src/XAM.jl b/src/XAM.jl new file mode 100644 index 0000000..5305716 --- /dev/null +++ b/src/XAM.jl @@ -0,0 +1,5 @@ +module XAM + +greet() = print("Hello World!") + +end # module From 17ea000940ce70bdd05785b48e29674a7727371c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Mon, 12 Aug 2019 17:50:38 +1000 Subject: [PATCH 02/14] Add original authors --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index caa5811..0156054 100644 --- a/Project.toml +++ b/Project.toml @@ -1,4 +1,4 @@ name = "XAM" uuid = "d759349c-bcba-11e9-07c2-5b90f8f05f7c" -authors = ["Ciarán O'Mara "] +authors = ["Kenta Sato ", "Ben J. Ward ", "Ciarán O'Mara "] version = "0.1.0" From 78d2329f708c2b1917aa9d393b27a31b11f65aee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Sat, 18 Jan 2020 03:04:40 +1100 Subject: [PATCH 03/14] Add .github templates --- .github/ISSUE_TEMPLATE.md | 40 +++++++++++++++++++++++++++ .github/PULL_REQUEST_TEMPLATE.md | 47 ++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE.md create mode 100644 .github/PULL_REQUEST_TEMPLATE.md diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md new file mode 100644 index 0000000..f39b24c --- /dev/null +++ b/.github/ISSUE_TEMPLATE.md @@ -0,0 +1,40 @@ + + +> _This template is rather extensive. Fill out all that you can, if are a new contributor or you're unsure about any section, leave it unchanged and a reviewer will help you_ :smile:. _This template is simply a tool to help everyone remember the BioJulia guidelines, if you feel anything in this template is not relevant, simply delete it._ + +## Expected Behavior + + + +## Current Behavior + + + +## Possible Solution / Implementation + + + +## Steps to Reproduce (for bugs) + +1. +2. +3. +4. + + + + + + +## Context + + + +## Your Environment + +- Package Version used: +- Julia Version used: +- Operating System and version (desktop or mobile): +- Link to your project: + + diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..3575d00 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,47 @@ +# A clear and descriptive title (No issue numbers please) + +> _This template is rather extensive. Fill out all that you can, if are a new contributor or you're unsure about any section, leave it unchanged and a reviewer will help you_ :smile:. _This template is simply a tool to help everyone remember the BioJulia guidelines, if you feel anything in this template is not relevant, simply delete it._ + +## Types of changes + +This PR implements the following changes: +_(Please tick any or all of the following that are applicable)_ + +* [ ] :sparkles: New feature (A non-breaking change which adds functionality). +* [ ] :bug: Bug fix (A non-breaking change, which fixes an issue). +* [ ] :boom: Breaking change (fix or feature that would cause existing functionality to change). + +## :clipboard: Additional detail + +- If you have implemented new features or behaviour + - **Provide a description of the addition** in as many details as possible. + + - **Provide justification of the addition**. + + - **Provide a runnable example of use of your addition**. This lets reviewers + and others try out the feature before it is merged or makes it's way to release. + +- If you have changed current behaviour... + - **Describe the behaviour prior to you changes** + + - **Describe the behaviour after your changes** and justify why you have made the changes, + Please describe any breakages you anticipate as a result of these changes. + + - **Does your change alter APIs or existing exposed methods/types?** + If so, this may cause dependency issues and breakages, so the maintainer + will need to consider this when versioning the next release. + + - If you are implementing changes that are intended to increase performance, you + should provide the results of a simple performance benchmark exercise + demonstrating the improvement. Especially if the changes make code less legible. + +## :ballot_box_with_check: Checklist + +- [ ] :art: The changes implemented is consistent with the [julia style guide](https://docs.julialang.org/en/stable/manual/style-guide/). +- [ ] :blue_book: I have updated and added relevant docstrings, in a manner consistent with the [documentation styleguide](https://docs.julialang.org/en/stable/manual/documentation/). +- [ ] :blue_book: I have added or updated relevant user and developer manuals/documentation in `docs/src/`. +- [ ] :ok: There are unit tests that cover the code changes I have made. +- [ ] :ok: The unit tests cover my code changes AND they pass. +- [ ] :pencil: I have added an entry to the `[UNRELEASED]` section of the manually curated `CHANGELOG.md` file for this repository. +- [ ] :ok: All changes should be compatible with the latest stable version of Julia. +- [ ] :thought_balloon: I have commented liberally for any complex pieces of internal code. From b2ec34047414eca35d131dd506ba8e1ca33c750c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Sun, 19 Jan 2020 12:26:57 +1100 Subject: [PATCH 04/14] CompatHelper workflow --- .github/workflows/CompatHelper.yml | 38 ++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 .github/workflows/CompatHelper.yml diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml new file mode 100644 index 0000000..36f1665 --- /dev/null +++ b/.github/workflows/CompatHelper.yml @@ -0,0 +1,38 @@ +name: CompatHelper + +on: + schedule: + - cron: '00 * * * *' + +jobs: + CompatHelper: + runs-on: ${{ matrix.os }} + strategy: + matrix: + julia-version: [1.3.0] + julia-arch: [x86] + os: [ubuntu-latest] + steps: + - uses: julia-actions/setup-julia@latest + with: + version: ${{ matrix.julia-version }} + - name: Add CompatHelper + run: julia -e 'using Pkg; Pkg.add("CompatHelper")' + - name: CompatHelper.main + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: julia -e ' + using CompatHelper, Pkg; + my_registries = [ + Pkg.RegistrySpec( + name = "BioJuliaRegistry", + uuid = "ccbd2cc2-2954-11e9-1ccf-f3e7900901ca", + url = "https://github.com/BioJulia/BioJuliaRegistry.git" + ), + Pkg.RegistrySpec( + name = "General", + uuid = "23338594-aafe-5451-b93e-139f81909106", + url = "https://github.com/JuliaRegistries/General.git" + ) + ]; + CompatHelper.main(; registries = my_registries, master_branch = "master");' From 282d80a758d2178579fd822c631816aea7eca86c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Fri, 17 Jan 2020 16:38:16 +1100 Subject: [PATCH 05/14] Continuous Integration --- .codecov.yml | 1 + .github/workflows/UnitTests.yml | 22 ++++++++++++++++++++++ ci_prep.jl | 3 +++ coverage/Project.toml | 2 ++ coverage/coverage.jl | 11 +++++++++++ 5 files changed, 39 insertions(+) create mode 100644 .codecov.yml create mode 100644 .github/workflows/UnitTests.yml create mode 100644 ci_prep.jl create mode 100644 coverage/Project.toml create mode 100644 coverage/coverage.jl diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 0000000..69cb760 --- /dev/null +++ b/.codecov.yml @@ -0,0 +1 @@ +comment: false diff --git a/.github/workflows/UnitTests.yml b/.github/workflows/UnitTests.yml new file mode 100644 index 0000000..cbaf083 --- /dev/null +++ b/.github/workflows/UnitTests.yml @@ -0,0 +1,22 @@ +name: Unit tests + +on: [push, pull_request] + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + matrix: + julia-version: ['1.1', '1.2', '1.3'] + julia-arch: [x64] + os: [ubuntu-latest, windows-latest, macOS-latest] + + steps: + - uses: actions/checkout@v1.0.0 + - uses: julia-actions/setup-julia@v1 + with: + version: ${{ matrix.julia-version }} + arch: ${{ matrix.julia-arch }} + - name: Install dependencies + run: julia ci_prep.jl + - uses: julia-actions/julia-runtest@master diff --git a/ci_prep.jl b/ci_prep.jl new file mode 100644 index 0000000..f3a7535 --- /dev/null +++ b/ci_prep.jl @@ -0,0 +1,3 @@ +using Pkg.Registry +Registry.add(Registry.RegistrySpec(url = "https://github.com/BioJulia/BioJuliaRegistry.git")) +Registry.add(Registry.RegistrySpec(url = "https://github.com/JuliaRegistries/General.git")) diff --git a/coverage/Project.toml b/coverage/Project.toml new file mode 100644 index 0000000..4fbdc47 --- /dev/null +++ b/coverage/Project.toml @@ -0,0 +1,2 @@ +[deps] +Coverage = "a2441757-f6aa-5fb2-8edb-039e3f45d037" diff --git a/coverage/coverage.jl b/coverage/coverage.jl new file mode 100644 index 0000000..3d33ed9 --- /dev/null +++ b/coverage/coverage.jl @@ -0,0 +1,11 @@ +get(ENV, "TRAVIS_OS_NAME", "") == "linux" || exit() +get(ENV, "TRAVIS_JULIA_VERSION", "") == "1.3" || exit() + +using Pkg +Pkg.instantiate() + +using Coverage + +cd(joinpath(@__DIR__, "..")) do + Codecov.submit(Codecov.process_folder()) +end From 7a56931b90a17986631e4b6c9373845a9a02109d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Sat, 18 Jan 2020 05:24:09 +1100 Subject: [PATCH 06/14] Naive code roundup --- src/XAM.jl | 7 +- src/bam/auxdata.jl | 153 +++++++++++ src/bam/bai.jl | 55 ++++ src/bam/bam.jl | 19 ++ src/bam/overlap.jl | 95 +++++++ src/bam/reader.jl | 145 ++++++++++ src/bam/record.jl | 634 ++++++++++++++++++++++++++++++++++++++++++++ src/bam/writer.jl | 62 +++++ src/sam/flags.jl | 26 ++ src/sam/header.jl | 53 ++++ src/sam/metainfo.jl | 235 ++++++++++++++++ src/sam/reader.jl | 265 ++++++++++++++++++ src/sam/record.jl | 624 +++++++++++++++++++++++++++++++++++++++++++ src/sam/sam.jl | 23 ++ src/sam/writer.jl | 43 +++ test/runtests.jl | 426 +++++++++++++++++++++++++++++ 16 files changed, 2864 insertions(+), 1 deletion(-) create mode 100644 src/bam/auxdata.jl create mode 100644 src/bam/bai.jl create mode 100644 src/bam/bam.jl create mode 100644 src/bam/overlap.jl create mode 100644 src/bam/reader.jl create mode 100644 src/bam/record.jl create mode 100644 src/bam/writer.jl create mode 100644 src/sam/flags.jl create mode 100644 src/sam/header.jl create mode 100644 src/sam/metainfo.jl create mode 100644 src/sam/reader.jl create mode 100644 src/sam/record.jl create mode 100644 src/sam/sam.jl create mode 100644 src/sam/writer.jl create mode 100644 test/runtests.jl diff --git a/src/XAM.jl b/src/XAM.jl index 5305716..7fc5234 100644 --- a/src/XAM.jl +++ b/src/XAM.jl @@ -1,5 +1,10 @@ module XAM -greet() = print("Hello World!") +export + BAM, + SAM + +include("sam/sam.jl") +include("bam/bam.jl") end # module diff --git a/src/bam/auxdata.jl b/src/bam/auxdata.jl new file mode 100644 index 0000000..2bcfac2 --- /dev/null +++ b/src/bam/auxdata.jl @@ -0,0 +1,153 @@ +# BAM Auxiliary Data +# ================== + +struct AuxData <: AbstractDict{String,Any} + data::Vector{UInt8} +end + +function Base.getindex(aux::AuxData, tag::AbstractString) + checkauxtag(tag) + return getauxvalue(aux.data, 1, length(aux.data), UInt8(tag[1]), UInt8(tag[2])) +end + +function Base.length(aux::AuxData) + data = aux.data + p = 1 + len = 0 + while p ≤ length(data) + len += 1 + p = next_tag_position(data, p) + end + return len +end + +function Base.iterate(aux::AuxData, pos=1) + if pos > length(aux.data) + return nothing + end + + data = aux.data + @label doit + t1 = data[pos] + t2 = data[pos+1] + pos, typ = loadauxtype(data, pos + 2) + pos, value = loadauxvalue(data, pos, typ) + if t1 == t2 == 0xff + @goto doit + end + return Pair{String,Any}(String([t1, t2]), value), pos +end + + +# Internals +# --------- + +function checkauxtag(tag::AbstractString) + if sizeof(tag) != 2 + throw(ArgumentError("tag length must be 2")) + end +end + +function getauxvalue(data::Vector{UInt8}, start::Int, stop::Int, t1::UInt8, t2::UInt8) + pos = findauxtag(data, start, stop, t1, t2) + if pos == 0 + throw(KeyError(String([t1, t2]))) + end + pos, T = loadauxtype(data, pos + 2) + _, val = loadauxvalue(data, pos, T) + return val +end + +function loadauxtype(data::Vector{UInt8}, p::Int) + function auxtype(b) + return ( + b == UInt8('A') ? Char : + b == UInt8('c') ? Int8 : + b == UInt8('C') ? UInt8 : + b == UInt8('s') ? Int16 : + b == UInt8('S') ? UInt16 : + b == UInt8('i') ? Int32 : + b == UInt8('I') ? UInt32 : + b == UInt8('f') ? Float32 : + b == UInt8('Z') ? String : + error("invalid type tag: '$(Char(b))'")) + end + t = data[p] + if t == UInt8('B') + return p + 2, Vector{auxtype(data[p+1])} + else + return p + 1, auxtype(t) + end +end + +function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{T}) where T + return p + sizeof(T), unsafe_load(Ptr{T}(pointer(data, p))) +end + +function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{Char}) + return p + 1, Char(unsafe_load(pointer(data, p))) +end + +function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{Vector{T}}) where T + n = unsafe_load(Ptr{Int32}(pointer(data, p))) + p += 4 + xs = Vector{T}(undef, n) + unsafe_copyto!(pointer(xs), Ptr{T}(pointer(data, p)), n) + return p + n * sizeof(T), xs +end + +function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{String}) + dataptr = pointer(data, p) + endptr = ccall(:memchr, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), dataptr, '\0', length(data) - p + 1) + q::Int = p + (endptr - dataptr) - 1 + return q + 2, String(data[p:q]) +end + +function findauxtag(data::Vector{UInt8}, start::Int, stop::Int, t1::UInt8, t2::UInt8) + pos = start + while pos ≤ stop && !(data[pos] == t1 && data[pos+1] == t2) + pos = next_tag_position(data, pos) + end + if pos > stop + return 0 + else + return pos + end +end + +# Find the starting position of a next tag in `data` after `p`. +# `(data[p], data[p+1])` is supposed to be a current tag. +function next_tag_position(data::Vector{UInt8}, p::Int) + typ = Char(data[p+2]) + p += 3 + if typ == 'A' + p += 1 + elseif typ == 'c' || typ == 'C' + p += 1 + elseif typ == 's' || typ == 'S' + p += 2 + elseif typ == 'i' || typ == 'I' + p += 4 + elseif typ == 'f' + p += 4 + elseif typ == 'd' + p += 8 + elseif typ == 'Z' || typ == 'H' + while data[p] != 0x00 # NULL-terminalted string + p += 1 + end + p += 1 + elseif typ == 'B' + eltyp = Char(data[p]) + elsize = eltyp == 'c' || eltyp == 'C' ? 1 : + eltyp == 's' || eltyp == 'S' ? 2 : + eltyp == 'i' || eltye == 'I' || eltyp == 'f' ? 4 : + error("invalid type tag: '$(Char(eltyp))'") + p += 1 + n = unsafe_load(Ptr{Int32}(pointer(data, p))) + p += 4 + elsize * n + else + error("invalid type tag: '$(Char(typ))'") + end + return p +end diff --git a/src/bam/bai.jl b/src/bam/bai.jl new file mode 100644 index 0000000..e5caf0a --- /dev/null +++ b/src/bam/bai.jl @@ -0,0 +1,55 @@ +# BAI +# === +# +# Index for BAM files. + + +# An index type for the BAM file format. +struct BAI + # BGZF file index + index::GenomicFeatures.Indexes.BGZFIndex + + # number of unmapped reads + n_no_coors::Union{Nothing, Int} +end + +""" + BAI(filename::AbstractString) + +Load a BAI index from `filename`. +""" +function BAI(filename::AbstractString) + return open(read_bai, filename) +end + +""" + BAI(input::IO) + +Load a BAI index from `input`. +""" +function BAI(input::IO) + return read_bai(input) +end + +# Read a BAI object from `input`. +function read_bai(input::IO) + # check magic bytes + B = read(input, UInt8) + A = read(input, UInt8) + I = read(input, UInt8) + x = read(input, UInt8) + if B != UInt8('B') || A != UInt8('A') || I != UInt8('I') || x != 0x01 + error("input is not a valid BAI file") + end + + # read contents + n_refs = read(input, Int32) + index = GenomicFeatures.Indexes.read_bgzfindex(input, n_refs) + if !eof(input) + n_no_coors = read(input, UInt64) + else + n_no_coors = nothing + end + + return BAI(index, n_no_coors) +end diff --git a/src/bam/bam.jl b/src/bam/bam.jl new file mode 100644 index 0000000..f3317f6 --- /dev/null +++ b/src/bam/bam.jl @@ -0,0 +1,19 @@ +# BAM File Format +# =============== + +module BAM + +import BGZFStreams +import BioAlignments: BioAlignments, SAM +import GenomicFeatures: GenomicFeatures, Interval +import BioSequences +import BioCore: BioCore, isfilled + +include("bai.jl") +include("auxdata.jl") +include("reader.jl") +include("record.jl") +include("writer.jl") +include("overlap.jl") + +end diff --git a/src/bam/overlap.jl b/src/bam/overlap.jl new file mode 100644 index 0000000..10641d3 --- /dev/null +++ b/src/bam/overlap.jl @@ -0,0 +1,95 @@ +# BAM Overlap +# =========== + +struct OverlapIterator{T} + reader::Reader{T} + refname::String + interval::UnitRange{Int} +end + +function Base.IteratorSize(::Type{OverlapIterator{T}}) where T + return Base.SizeUnknown() +end + +function Base.eltype(::Type{OverlapIterator{T}}) where T + return Record +end + +function GenomicFeatures.eachoverlap(reader::Reader, interval::Interval) + return GenomicFeatures.eachoverlap(reader, interval.seqname, interval.first:interval.last) +end + +function GenomicFeatures.eachoverlap(reader::Reader, interval) + return GenomicFeatures.eachoverlap(reader, convert(Interval, interval)) +end + +function GenomicFeatures.eachoverlap(reader::Reader, refname::AbstractString, interval::UnitRange) + return OverlapIterator(reader, String(refname), interval) +end + + +# Iterator +# -------- + +mutable struct OverlapIteratorState + # reference index + refindex::Int + + # possibly overlapping chunks + chunks::Vector{GenomicFeatures.Indexes.Chunk} + + # current chunk index + chunkid::Int + + # pre-allocated record + record::Record +end + +function Base.iterate(iter::OverlapIterator) + refindex = findfirst(isequal(iter.refname), iter.reader.refseqnames) + if refindex == 0 + throw(ArgumentError("sequence name $(iter.refname) is not found in the header")) + end + @assert iter.reader.index !== nothing + chunks = GenomicFeatures.Indexes.overlapchunks(iter.reader.index.index, refindex, iter.interval) + if !isempty(chunks) + seek(iter.reader, first(chunks).start) + end + state = OverlapIteratorState(refindex, chunks, 1, Record()) + return iterate(iter, state) +end + +function Base.iterate(iter::OverlapIterator, state) + while state.chunkid ≤ lastindex(state.chunks) + chunk = state.chunks[state.chunkid] + while BGZFStreams.virtualoffset(iter.reader.stream) < chunk.stop + read!(iter.reader, state.record) + c = compare_intervals(state.record, (state.refindex, iter.interval)) + if c == 0 + return copy(state.record), state + elseif c > 0 + # no more overlapping records in this chunk since records are sorted + break + end + end + state.chunkid += 1 + if state.chunkid ≤ lastindex(state.chunks) + seek(iter.reader, state.chunks[state.chunkid].start) + end + end + return nothing +end + +function compare_intervals(record::Record, interval::Tuple{Int,UnitRange{Int}}) + rid = refid(record) + if rid < interval[1] || (rid == interval[1] && rightposition(record) < first(interval[2])) + # strictly left + return -1 + elseif rid > interval[1] || (rid == interval[1] && position(record) > last(interval[2])) + # strictly right + return +1 + else + # overlapping + return 0 + end +end diff --git a/src/bam/reader.jl b/src/bam/reader.jl new file mode 100644 index 0000000..5e92b8e --- /dev/null +++ b/src/bam/reader.jl @@ -0,0 +1,145 @@ +# BAM Reader +# ========== + +""" + BAM.Reader(input::IO; index=nothing) + +Create a data reader of the BAM file format. + +# Arguments +* `input`: data source +* `index=nothing`: filepath to a random access index (currently *bai* is supported) +""" +mutable struct Reader{T} <: BioCore.IO.AbstractReader + stream::BGZFStreams.BGZFStream{T} + header::SAM.Header + start_offset::BGZFStreams.VirtualOffset + refseqnames::Vector{String} + refseqlens::Vector{Int} + index::Union{Nothing, BAI} +end + +function Base.eltype(::Type{Reader{T}}) where T + return Record +end + +function BioCore.IO.stream(reader::Reader) + return reader.stream +end + +function Reader(input::IO; index=nothing) + if isa(index, AbstractString) + index = BAI(index) + else + if index != nothing + error("unrecognizable index argument") + end + end + reader = init_bam_reader(input) + reader.index = index + return reader +end + +function Base.show(io::IO, reader::Reader) + println(io, summary(reader), ":") + print(io, " number of contigs: ", length(reader.refseqnames)) +end + +""" + header(reader::Reader; fillSQ::Bool=false)::SAM.Header + +Get the header of `reader`. + +If `fillSQ` is `true`, this function fills missing "SQ" metainfo in the header. +""" +function header(reader::Reader; fillSQ::Bool=false)::SAM.Header + header = reader.header + if fillSQ + if !isempty(findall(reader.header, "SQ")) + throw(ArgumentError("SAM header already has SQ records")) + end + header = copy(header) + for (name, len) in zip(reader.refseqnames, reader.refseqlens) + push!(header, SAM.MetaInfo("SQ", ["SN" => name, "LN" => len])) + end + end + return header +end + +function BioCore.header(reader::Reader) + return header(reader) +end + +function Base.seek(reader::Reader, voffset::BGZFStreams.VirtualOffset) + seek(reader.stream, voffset) +end + +function Base.seekstart(reader::Reader) + seek(reader.stream, reader.start_offset) +end + +function Base.iterate(reader::Reader, rec=Record()) + if eof(reader) + return nothing + end + read!(reader, rec) + return copy(rec), rec +end + +# Initialize a BAM reader by reading the header section. +function init_bam_reader(input::BGZFStreams.BGZFStream) + # magic bytes + B = read(input, UInt8) + A = read(input, UInt8) + M = read(input, UInt8) + x = read(input, UInt8) + if B != UInt8('B') || A != UInt8('A') || M != UInt8('M') || x != 0x01 + error("input was not a valid BAM file") + end + + # SAM header + textlen = read(input, Int32) + samreader = SAM.Reader(IOBuffer(read(input, textlen))) + + # reference sequences + refseqnames = String[] + refseqlens = Int[] + n_refs = read(input, Int32) + for _ in 1:n_refs + namelen = read(input, Int32) + data = read(input, namelen) + seqname = unsafe_string(pointer(data)) + seqlen = read(input, Int32) + push!(refseqnames, seqname) + push!(refseqlens, seqlen) + end + + voffset = isa(input.io, Base.AbstractPipe) ? + BGZFStreams.VirtualOffset(0, 0) : + BGZFStreams.virtualoffset(input) + return Reader( + input, + samreader.header, + voffset, + refseqnames, + refseqlens, + nothing) +end + +function init_bam_reader(input::IO) + return init_bam_reader(BGZFStreams.BGZFStream(input)) +end + +function _read!(reader::Reader, record) + unsafe_read( + reader.stream, + pointer_from_objref(record), + FIXED_FIELDS_BYTES) + dsize = data_size(record) + if length(record.data) < dsize + resize!(record.data, dsize) + end + unsafe_read(reader.stream, pointer(record.data), dsize) + record.reader = reader + return record +end diff --git a/src/bam/record.jl b/src/bam/record.jl new file mode 100644 index 0000000..31226d2 --- /dev/null +++ b/src/bam/record.jl @@ -0,0 +1,634 @@ +# BAM Record +# ========== + +""" + BAM.Record() + +Create an unfilled BAM record. +""" +mutable struct Record + # fixed-length fields (see BMA specs for the details) + block_size::Int32 + refid::Int32 + pos::Int32 + bin_mq_nl::UInt32 + flag_nc::UInt32 + l_seq::Int32 + next_refid::Int32 + next_pos::Int32 + tlen::Int32 + # variable length data + data::Vector{UInt8} + reader::Reader + + function Record() + return new(0, 0, 0, 0, 0, 0, 0, 0, 0, UInt8[]) + end +end + +# the data size of fixed-length fields (block_size-tlen) +const FIXED_FIELDS_BYTES = 36 + +function Record(data::Vector{UInt8}) + return convert(Record, data) +end + +function Base.convert(::Type{Record}, data::Vector{UInt8}) + length(data) < FIXED_FIELDS_BYTES && throw(ArgumentError("data too short")) + record = Record() + dst_pointer = Ptr{UInt8}(pointer_from_objref(record)) + unsafe_copyto!(dst_pointer, pointer(data), FIXED_FIELDS_BYTES) + dsize = data_size(record) + resize!(record.data, dsize) + length(data) < dsize + FIXED_FIELDS_BYTES && throw(ArgumentError("data too short")) + unsafe_copyto!(record.data, 1, data, FIXED_FIELDS_BYTES + 1, dsize) + return record +end + +function Base.copy(record::Record) + copy = Record() + copy.block_size = record.block_size + copy.refid = record.refid + copy.pos = record.pos + copy.bin_mq_nl = record.bin_mq_nl + copy.flag_nc = record.flag_nc + copy.l_seq = record.l_seq + copy.next_refid = record.next_refid + copy.next_pos = record.next_pos + copy.tlen = record.tlen + copy.data = record.data[1:data_size(record)] + if isdefined(record, :reader) + copy.reader = record.reader + end + return copy +end + +function Base.show(io::IO, record::Record) + print(io, summary(record), ':') + if isfilled(record) + println(io) + println(io, " template name: ", tempname(record)) + println(io, " flag: ", flag(record)) + println(io, " reference ID: ", refid(record)) + println(io, " position: ", position(record)) + println(io, " mapping quality: ", mappingquality(record)) + println(io, " CIGAR: ", cigar(record)) + println(io, " next reference ID: ", nextrefid(record)) + println(io, " next position: ", nextposition(record)) + println(io, " template length: ", templength(record)) + println(io, " sequence: ", sequence(record)) + # TODO: pretty print base quality + println(io, " base quality: ", quality(record)) + print(io, " auxiliary data:") + for field in keys(auxdata(record)) + print(io, ' ', field, '=', record[field]) + end + else + print(io, " ") + end +end + +function Base.read!(reader::Reader, record::Record) + return _read!(reader, record) +end + + +# Accessor Fuctions +# ----------------- + +""" + flag(record::Record)::UInt16 + +Get the bitwise flag of `record`. +""" +function flag(record::Record)::UInt16 + checkfilled(record) + return UInt16(record.flag_nc >> 16) +end + +function hasflag(record::Record) + return isfilled(record) +end + +""" + ismapped(record::Record)::Bool + +Test if `record` is mapped. +""" +function ismapped(record::Record)::Bool + return flag(record) & SAM.FLAG_UNMAP == 0 +end + +""" + isprimary(record::Record)::Bool + +Test if `record` is a primary line of the read. + +This is equivalent to `flag(record) & 0x900 == 0`. +""" +function isprimary(record::Record)::Bool + return flag(record) & 0x900 == 0 +end + +""" + ispositivestrand(record::Record)::Bool + +Test if `record` is aligned to the positive strand. + +This is equivalent to `flag(record) & 0x10 == 0`. +""" +function ispositivestrand(record::Record)::Bool + flag(record) & 0x10 == 0 +end + +""" + refid(record::Record)::Int + +Get the reference sequence ID of `record`. + +The ID is 1-based (i.e. the first sequence is 1) and is 0 for a record without a mapping position. + +See also: `BAM.rname` +""" +function refid(record::Record)::Int + checkfilled(record) + return record.refid + 1 +end + +function hasrefid(record::Record) + return isfilled(record) +end + +function checked_refid(record::Record) + id = refid(record) + if id == 0 + throw(ArgumentError("record is not mapped")) + elseif !isdefined(record, :reader) + throw(ArgumentError("reader is not defined")) + end + return id +end + +""" + refname(record::Record)::String + +Get the reference sequence name of `record`. + +See also: `BAM.refid` +""" +function refname(record::Record)::String + checkfilled(record) + id = checked_refid(record) + return record.reader.refseqnames[id] +end + +""" + reflen(record::Record)::Int + +Get the length of the reference sequence this record applies to. +""" +function reflen(record::Record)::Int + checkfilled(record) + id = checked_refid(record) + return record.reader.refseqlens[id] +end + +function hasrefname(record::Record) + return hasrefid(record) +end + +""" + position(record::Record)::Int + +Get the 1-based leftmost mapping position of `record`. +""" +function position(record::Record)::Int + checkfilled(record) + return record.pos + 1 +end + +function hasposition(record::Record) + return isfilled(record) +end + +""" + rightposition(record::Record)::Int + +Get the 1-based rightmost mapping position of `record`. +""" +function rightposition(record::Record)::Int + checkfilled(record) + return Int32(position(record) + alignlength(record) - 1) +end + +function hasrightposition(record::Record) + return isfilled(record) && ismapped(record) +end + +""" + isnextmapped(record::Record)::Bool + +Test if the mate/next read of `record` is mapped. +""" +function isnextmapped(record::Record)::Bool + return isfilled(record) && (flag(record) & FLAG_MUNMAP == 0) +end + +""" + nextrefid(record::Record)::Int + +Get the next/mate reference sequence ID of `record`. +""" +function nextrefid(record::Record)::Int + checkfilled(record) + return record.next_refid + 1 +end + +function hasnextrefid(record::Record) + return isfilled(record) +end + +""" + nextrefname(record::Record)::String + +Get the reference name of the mate/next read of `record`. +""" +function nextrefname(record::Record)::String + checkfilled(record) + id = nextrefid(record) + if id == 0 + throw(ArgumentError("next record is not mapped")) + elseif !isdefined(record, :reader) + throw(ArgumentError("reader is not defined")) + end + return record.reader.refseqnames[id] +end + +function hasnextrefname(record::Record) + return isfilled(record) && isnextmapped(record) +end + +""" + nextposition(record::Record)::Int + +Get the 1-based leftmost mapping position of the next/mate read of `record`. +""" +function nextposition(record::Record)::Int + checkfilled(record) + return record.next_pos + 1 +end + +function hasnextposition(record::Record) + return isfilled(record) +end + +""" + mappingquality(record::Record)::UInt8 + +Get the mapping quality of `record`. +""" +function mappingquality(record::Record)::UInt8 + return UInt8((record.bin_mq_nl >> 8) & 0xff) +end + +function hasmappingquality(record::Record) + return isfilled(record) +end + +""" + n_cigar_op(record::Record, checkCG::Bool = true) + +Return the number of operations in the CIGAR string of `record`. + +Note that in the BAM specification, the field called `cigar` typically stores the cigar string of the record. +However, this is not always true, sometimes the true cigar is very long, and due to some constraints of the BAM format, the actual cigar string is stored in an extra tag: `CG:B,I`, and the `cigar` field stores a pseudo-cigar string. + +Calling this method with `checkCG` set to `true` (default) this method will always yield the number of operations in the true cigar string, because this is probably what you want, the vast majority of the time. + +If you have a record that stores the true cigar in a `CG:B,I` tag, but you still want to get the number of operations in the `cigar` field of the BAM record, then set `checkCG` to `false`. +""" +function n_cigar_op(record::Record, checkCG::Bool = true) + return cigar_position(record, checkCG)[2] +end + +""" + cigar(record::Record)::String + +Get the CIGAR string of `record`. + +Note that in the BAM specification, the field called `cigar` typically stores the cigar string of the record. +However, this is not always true, sometimes the true cigar is very long, and due to some constraints of the BAM format, the actual cigar string is stored in an extra tag: `CG:B,I`, and the `cigar` field stores a pseudo-cigar string. + +Calling this method with `checkCG` set to `true` (default) this method will always yield the true cigar string, because this is probably what you want the vast majority of the time. + +If you have a record that stores the true cigar in a `CG:B,I` tag, but you still want to access the pseudo-cigar that is stored in the `cigar` field of the BAM record, then you can set checkCG to `false`. + +See also `BAM.cigar_rle`. +""" +function cigar(record::Record, checkCG::Bool = true)::String + buf = IOBuffer() + for (op, len) in zip(cigar_rle(record, checkCG)...) + print(buf, len, convert(Char, op)) + end + return String(take!(buf)) +end + +""" + cigar_rle(record::Record, checkCG::Bool = true)::Tuple{Vector{BioAlignments.Operation},Vector{Int}} + +Get a run-length encoded tuple `(ops, lens)` of the CIGAR string in `record`. + +Note that in the BAM specification, the field called `cigar` typically stores the cigar string of the record. +However, this is not always true, sometimes the true cigar is very long, and due to some constraints of the BAM format, the actual cigar string is stored in an extra tag: `CG:B,I`, and the `cigar` field stores a pseudo-cigar string. + +Calling this method with `checkCG` set to `true` (default) this method will always yield the true cigar string, because this is probably what you want the vast majority of the time. + +If you have a record that stores the true cigar in a `CG:B,I` tag, but you still want to access the pseudo-cigar that is stored in the `cigar` field of the BAM record, then you can set checkCG to `false`. + +See also `BAM.cigar`. +""" +function cigar_rle(record::Record, checkCG::Bool = true)::Tuple{Vector{BioAlignments.Operation},Vector{Int}} + checkfilled(record) + idx, nops = cigar_position(record, checkCG) + ops, lens = extract_cigar_rle(record.data, idx, nops) + return ops, lens +end + +function extract_cigar_rle(data::Vector{UInt8}, offset, n) + ops = Vector{BioAlignments.Operation}() + lens = Vector{Int}() + for i in offset:4:offset + (n - 1) * 4 + x = unsafe_load(Ptr{UInt32}(pointer(data, i))) + op = BioAlignments.Operation(x & 0x0F) + push!(ops, op) + push!(lens, x >> 4) + end + return ops, lens +end + +function cigar_position(record::Record, checkCG::Bool = true)::Tuple{Int, Int} + cigaridx, nops = seqname_length(record) + 1, record.flag_nc & 0xFFFF + if !checkCG + return cigaridx, nops + end + if nops != 2 + return cigaridx, nops + end + x = unsafe_load(Ptr{UInt32}(pointer(record.data, cigaridx))) + if x != UInt32(seqlength(record) << 4 | 4) + return cigaridx, nops + end + start = auxdata_position(record) + stop = data_size(record) + tagidx = findauxtag(record.data, start, stop, UInt8('C'), UInt8('G')) + if tagidx == 0 + return cigaridx, nops + end + # Tag exists, validate type is BI. + typ = unsafe_load(Ptr{UInt16}(pointer(record.data, tagidx += 2))) + if typ != (UInt16('I') << 8 | UInt16('B')) + return cigaridx, nops + end + # If got this far, the CG tag is valid and contains the cigar. + # Get the true n_cigar_ops, and return it and the idx of the first + nops = UInt32(unsafe_load(Ptr{Int32}(pointer(record.data, tagidx += 2)))) + tagidx += 4 + return tagidx, nops +end + +""" + alignment(record::Record)::BioAlignments.Alignment + +Get the alignment of `record`. +""" +function alignment(record::Record)::BioAlignments.Alignment + checkfilled(record) + if !ismapped(record) + return BioAlignments.Alignment(BioAlignments.AlignmentAnchor[]) + end + seqpos = 0 + refpos = position(record) - 1 + anchors = [BioAlignments.AlignmentAnchor(seqpos, refpos, BioAlignments.OP_START)] + for (op, len) in zip(cigar_rle(record)...) + if BioAlignments.ismatchop(op) + seqpos += len + refpos += len + elseif BioAlignments.isinsertop(op) + seqpos += len + elseif BioAlignments.isdeleteop(op) + refpos += len + else + error("operation $(op) is not supported") + end + push!(anchors, BioAlignments.AlignmentAnchor(seqpos, refpos, op)) + end + return BioAlignments.Alignment(anchors) +end + +function hasalignment(record::Record) + return ismapped(record) +end + +""" + alignlength(record::Record)::Int + +Get the alignment length of `record`. +""" +function alignlength(record::Record)::Int + offset = seqname_length(record) + length::Int = 0 + for i in offset + 1:4:offset + n_cigar_op(record, false) * 4 + x = unsafe_load(Ptr{UInt32}(pointer(record.data, i))) + op = BioAlignments.Operation(x & 0x0F) + if BioAlignments.ismatchop(op) || BioAlignments.isdeleteop(op) + length += x >> 4 + end + end + return length +end + +""" + tempname(record::Record)::String + +Get the query template name of `record`. +""" +function tempname(record::Record)::String + checkfilled(record) + # drop the last NUL character + return unsafe_string(pointer(record.data), max(seqname_length(record) - 1, 0)) +end + +function hastempname(record::Record) + return isfilled(record) +end + +""" + templength(record::Record)::Int + +Get the template length of `record`. +""" +function templength(record::Record)::Int + checkfilled(record) + return record.tlen +end + +function hastemplength(record::Record) + return isfilled(record) +end + +""" + sequence(record::Record)::BioSequences.DNASequence + +Get the segment sequence of `record`. +""" +function sequence(record::Record)::BioSequences.DNASequence + checkfilled(record) + seqlen = seqlength(record) + data = Vector{UInt64}(undef, cld(seqlen, 16)) + src::Ptr{UInt64} = pointer(record.data, seqname_length(record) + n_cigar_op(record, false) * 4 + 1) + for i in 1:lastindex(data) + # copy data flipping high and low nybble + x = unsafe_load(src, i) + data[i] = (x & 0x0f0f0f0f0f0f0f0f) << 4 | (x & 0xf0f0f0f0f0f0f0f0) >> 4 + end + return BioSequences.DNASequence(data, 1:seqlen, false) +end + +function hassequence(record::Record) + return isfilled(record) +end + +""" + seqlength(record::Record)::Int + +Get the sequence length of `record`. +""" +function seqlength(record::Record)::Int + checkfilled(record) + return record.l_seq % Int +end + +function hasseqlength(record::Record) + return isfilled(record) +end + +""" + quality(record::Record)::Vector{UInt8} + +Get the base quality of `record`. +""" +function quality(record::Record)::Vector{UInt8} + checkfilled(record) + seqlen = seqlength(record) + offset = seqname_length(record) + n_cigar_op(record, false) * 4 + cld(seqlen, 2) + return [reinterpret(Int8, record.data[i+offset]) for i in 1:seqlen] +end + +function hasquality(record::Record) + return isfilled(record) +end + +""" + auxdata(record::Record)::BAM.AuxData + +Get the auxiliary data of `record`. +""" +function auxdata(record::Record) + checkfilled(record) + return AuxData(record.data[auxdata_position(record):data_size(record)]) +end + +function hasauxdata(record::Record) + return isfilled(record) +end + +function Base.getindex(record::Record, tag::AbstractString) + checkauxtag(tag) + start = auxdata_position(record) + stop = data_size(record) + return getauxvalue(record.data, start, stop, UInt8(tag[1]), UInt8(tag[2])) +end + +function Base.haskey(record::Record, tag::AbstractString) + checkauxtag(tag) + start = auxdata_position(record) + stop = data_size(record) + return findauxtag(record.data, start, stop, UInt8(tag[1]), UInt8(tag[2])) > 0 +end + +function Base.keys(record::Record) + return collect(keys(auxdata(record))) +end + +function Base.values(record::Record) + return [record[key] for key in keys(record)] +end + + +# BioCore Methods +# ----------- + +function BioCore.isfilled(record::Record) + return record.block_size != 0 +end + +function BioCore.seqname(record::Record) + return tempname(record) +end + +function BioCore.hasseqname(record::Record) + return hastempname(record) +end + +function BioCore.sequence(record::Record) + return sequence(record) +end + +function BioCore.hassequence(record::Record) + return hassequence(record) +end + +function BioCore.leftposition(record::Record) + return position(record) +end + +function BioCore.hasleftposition(record::Record) + return hasposition(record) +end + +function BioCore.rightposition(record::Record) + return rightposition(record) +end + +function BioCore.hasrightposition(record::Record) + return hasrightposition(record) +end + + +# Helper Functions +# ---------------- + +# Return the size of the `.data` field. +function data_size(record::Record) + if isfilled(record) + return record.block_size - FIXED_FIELDS_BYTES + sizeof(record.block_size) + else + return 0 + end +end + +function checkfilled(record::Record) + if !isfilled(record) + throw(ArgumentError("unfilled BAM record")) + end +end + +function auxdata_position(record::Record) + seqlen = seqlength(record) + return seqname_length(record) + n_cigar_op(record, false) * 4 + cld(seqlen, 2) + seqlen + 1 +end + +# Return the length of the read name. +function seqname_length(record::Record) + return record.bin_mq_nl & 0xff +end diff --git a/src/bam/writer.jl b/src/bam/writer.jl new file mode 100644 index 0000000..f017424 --- /dev/null +++ b/src/bam/writer.jl @@ -0,0 +1,62 @@ +# BAM Writer +# ========== + +""" + BAM.Writer(output::BGZFStream, header::SAM.Header) + +Create a data writer of the BAM file format. + +# Arguments +* `output`: data sink +* `header`: SAM header object +""" +mutable struct Writer <: BioCore.IO.AbstractWriter + stream::BGZFStreams.BGZFStream +end + +function Writer(stream::BGZFStreams.BGZFStream, header::SAM.Header) + refseqnames = String[] + refseqlens = Int[] + for metainfo in findall(header, "SQ") + push!(refseqnames, metainfo["SN"]) + push!(refseqlens, parse(Int, metainfo["LN"])) + end + write_header(stream, header, refseqnames, refseqlens) + return Writer(stream) +end + +function BioCore.IO.stream(writer::Writer) + return writer.stream +end + +function Base.write(writer::Writer, record::Record) + n = 0 + n += unsafe_write(writer.stream, pointer_from_objref(record), FIXED_FIELDS_BYTES) + n += unsafe_write(writer.stream, pointer(record.data), data_size(record)) + return n +end + +function write_header(stream, header, refseqnames, refseqlens) + @assert length(refseqnames) == length(refseqlens) + n = 0 + + # magic bytes + n += write(stream, "BAM\1") + + # SAM header + buf = IOBuffer() + l = write(SAM.Writer(buf), header) + n += write(stream, Int32(l)) + n += write(stream, take!(buf)) + + # reference sequences + n += write(stream, Int32(length(refseqnames))) + for (seqname, seqlen) in zip(refseqnames, refseqlens) + namelen = length(seqname) + n += write(stream, Int32(namelen + 1)) + n += write(stream, seqname, '\0') + n += write(stream, Int32(seqlen)) + end + + return n +end diff --git a/src/sam/flags.jl b/src/sam/flags.jl new file mode 100644 index 0000000..0b629d1 --- /dev/null +++ b/src/sam/flags.jl @@ -0,0 +1,26 @@ +# SAM Flags +# ========= +# +# Bitwise flags (or FLAG). + +for (name, bits, doc) in [ + (:PAIRED, UInt16(0x001), "the read is paired in sequencing, no matter whether it is mapped in a pair"), + (:PROPER_PAIR, UInt16(0x002), "the read is mapped in a proper pair" ), + (:UNMAP, UInt16(0x004), "the read itself is unmapped; conflictive with SAM.FLAG_PROPER_PAIR" ), + (:MUNMAP, UInt16(0x008), "the mate is unmapped" ), + (:REVERSE, UInt16(0x010), "the read is mapped to the reverse strand" ), + (:MREVERSE, UInt16(0x020), "the mate is mapped to the reverse strand" ), + (:READ1, UInt16(0x040), "this is read1" ), + (:READ2, UInt16(0x080), "this is read2" ), + (:SECONDARY, UInt16(0x100), "not primary alignment" ), + (:QCFAIL, UInt16(0x200), "QC failure" ), + (:DUP, UInt16(0x400), "optical or PCR duplicate" ), + (:SUPPLEMENTARY, UInt16(0x800), "supplementary alignment" ),] + @assert bits isa UInt16 + sym = Symbol("FLAG_", name) + doc = string(@sprintf("0x%04x: ", bits), doc) + @eval begin + @doc $(doc) const $(sym) = $(bits) + end +end + diff --git a/src/sam/header.jl b/src/sam/header.jl new file mode 100644 index 0000000..1369e37 --- /dev/null +++ b/src/sam/header.jl @@ -0,0 +1,53 @@ +# SAM Header +# ========== + +struct Header + metainfo::Vector{MetaInfo} +end + +""" + SAM.Header() + +Create an empty header. +""" +function Header() + return Header(MetaInfo[]) +end + +function Base.copy(header::Header) + return Header(header.metainfo) +end + +function Base.eltype(::Type{Header}) + return MetaInfo +end + +function Base.length(header::Header) + return length(header.metainfo) +end + +function Base.iterate(header::Header, i=1) + if i > length(header.metainfo) + return nothing + end + return header.metainfo[i], i + 1 +end + +""" + find(header::Header, key::AbstractString)::Vector{MetaInfo} + +Find metainfo objects satisfying `SAM.tag(metainfo) == key`. +""" +function Base.findall(header::Header, key::AbstractString)::Vector{MetaInfo} + return filter(m -> isequalkey(m, key), header.metainfo) +end + +function Base.pushfirst!(header::Header, metainfo::MetaInfo) + pushfirst!(header.metainfo, metainfo) + return header +end + +function Base.push!(header::Header, metainfo::MetaInfo) + push!(header.metainfo, metainfo) + return header +end diff --git a/src/sam/metainfo.jl b/src/sam/metainfo.jl new file mode 100644 index 0000000..f40e6ae --- /dev/null +++ b/src/sam/metainfo.jl @@ -0,0 +1,235 @@ +# SAM Meta-Information +# ==================== + +mutable struct MetaInfo + # data and filled range + data::Vector{UInt8} + filled::UnitRange{Int} + # indexes + tag::UnitRange{Int} + val::UnitRange{Int} + dictkey::Vector{UnitRange{Int}} + dictval::Vector{UnitRange{Int}} +end + +function MetaInfo(data::Vector{UInt8}=UInt8[]) + metainfo = MetaInfo(data, 1:0, 1:0, 1:0, UnitRange{Int}[], UnitRange{Int}[]) + if !isempty(data) + index!(metainfo) + end + return metainfo +end + +""" + MetaInfo(str::AbstractString) + +Create a SAM metainfo from `str`. + +# Examples + + julia> SAM.MetaInfo("@CO\tsome comment") + BioAlignments.SAM.MetaInfo: + tag: CO + value: some comment + + julia> SAM.MetaInfo("@SQ\tSN:chr1\tLN:12345") + BioAlignments.SAM.MetaInfo: + tag: SQ + value: SN=chr1 LN=12345 + +""" +function MetaInfo(str::AbstractString) + return MetaInfo(Vector{UInt8}(str)) +end + +""" + MetaInfo(tag::AbstractString, value) + +Create a SAM metainfo with `tag` and `value`. + +`tag` is a two-byte ASCII string. If `tag` is `"CO"`, `value` must be a string; otherwise, `value` is an iterable object with key and value pairs. + +# Examples + + julia> SAM.MetaInfo("CO", "some comment") + BioAlignments.SAM.MetaInfo: + tag: CO + value: some comment + + julia> string(ans) + "@CO\tsome comment" + + julia> SAM.MetaInfo("SQ", ["SN" => "chr1", "LN" => 12345]) + BioAlignments.SAM.MetaInfo: + tag: SQ + value: SN=chr1 LN=12345 + + julia> string(ans) + "@SQ\tSN:chr1\tLN:12345" + +""" +function MetaInfo(tag::AbstractString, value) + buf = IOBuffer() + if tag == "CO" # comment + if !isa(value, AbstractString) + throw(ArgumentError("value must be a string")) + end + write(buf, "@CO\t", value) + elseif occursin(r"[A-Z][A-Z]", tag) + print(buf, '@', tag) + for (key, val) in value + print(buf, '\t', key, ':', val) + end + else + throw(ArgumentError("tag must match r\"[A-Z][A-Z]\"")) + end + return MetaInfo(take!(buf)) +end + +function initialize!(metainfo::MetaInfo) + metainfo.filled = 1:0 + metainfo.tag = 1:0 + metainfo.val = 1:0 + empty!(metainfo.dictkey) + empty!(metainfo.dictval) + return metainfo +end + +function isfilled(metainfo::MetaInfo) + return !isempty(metainfo.filled) +end + +function datarange(metainfo::MetaInfo) + return metainfo.filled +end + +function checkfilled(metainfo::MetaInfo) + if !isfilled(metainfo) + throw(ArgumentError("unfilled SAM metainfo")) + end +end + +function isequalkey(metainfo::MetaInfo, key::AbstractString) + if !isfilled(metainfo) || sizeof(key) != 2 + return false + end + k1, k2 = UInt8(key[1]), UInt8(key[2]) + return metainfo.data[metainfo.tag[1]] == k1 && metainfo.data[metainfo.tag[2]] == k2 +end + +function Base.show(io::IO, metainfo::MetaInfo) + print(io, summary(metainfo), ':') + if isfilled(metainfo) + println(io) + println(io, " tag: ", tag(metainfo)) + print(io, " value:") + if !iscomment(metainfo) + for (key, val) in zip(keys(metainfo), values(metainfo)) + print(io, ' ', key, '=', val) + end + else + print(io, ' ', value(metainfo)) + end + else + print(io, " ") + end +end + +function Base.print(io::IO, metainfo::MetaInfo) + write(io, metainfo) + return nothing +end + +function Base.write(io::IO, metainfo::MetaInfo) + checkfilled(metainfo) + r = datarange(metainfo) + return unsafe_write(io, pointer(metainfo.data, first(r)), length(r)) +end + + +# Accessor Functions +# ------------------ + +""" + iscomment(metainfo::MetaInfo)::Bool + +Test if `metainfo` is a comment (i.e. its tag is "CO"). +""" +function iscomment(metainfo::MetaInfo)::Bool + return isequalkey(metainfo, "CO") +end + +""" + tag(metainfo::MetaInfo)::String + +Get the tag of `metainfo`. +""" +function tag(metainfo::MetaInfo)::String + checkfilled(metainfo) + return String(metainfo.data[metainfo.tag]) +end + +""" + value(metainfo::MetaInfo)::String + +Get the value of `metainfo` as a string. +""" +function value(metainfo::MetaInfo)::String + checkfilled(metainfo) + return String(metainfo.data[metainfo.val]) +end + +""" + keyvalues(metainfo::MetaInfo)::Vector{Pair{String,String}} + +Get the values of `metainfo` as string pairs. +""" +function keyvalues(metainfo::MetaInfo)::Vector{Pair{String,String}} + checkfilled(metainfo) + if iscomment(metainfo) + throw(ArgumentError("not a dictionary")) + end + return Pair{String,String}[key => val for (key, val) in zip(keys(metainfo), values(metainfo))] +end + +function Base.keys(metainfo::MetaInfo) + checkfilled(metainfo) + if iscomment(metainfo) + throw(ArgumentError("not a dictionary")) + end + return [String(metainfo.data[r]) for r in metainfo.dictkey] +end + +function Base.values(metainfo::MetaInfo) + checkfilled(metainfo) + if iscomment(metainfo) + throw(ArgumentError("not a dictionary")) + end + return [String(metainfo.data[r]) for r in metainfo.dictval] +end + +function Base.haskey(metainfo::MetaInfo, key::AbstractString) + return findkey(metainfo, key) > 0 +end + +function Base.getindex(metainfo::MetaInfo, key::AbstractString) + i = findkey(metainfo, key) + if i == 0 + throw(KeyError(key)) + end + return String(metainfo.data[metainfo.dictval[i]]) +end + +function findkey(metainfo::MetaInfo, key::AbstractString) + checkfilled(metainfo) + if sizeof(key) != 2 + return 0 + end + t1, t2 = UInt8(key[1]), UInt8(key[2]) + for (i, k) in enumerate(metainfo.dictkey) + if metainfo.data[first(k)] == t1 && metainfo.data[first(k)+1] == t2 + return i + end + end + return 0 +end diff --git a/src/sam/reader.jl b/src/sam/reader.jl new file mode 100644 index 0000000..ee016e4 --- /dev/null +++ b/src/sam/reader.jl @@ -0,0 +1,265 @@ +# SAM Reader +# ========= + +mutable struct Reader <: BioCore.IO.AbstractReader + state::BioCore.Ragel.State + header::Header + + function Reader(input::BufferedStreams.BufferedInputStream) + reader = new(BioCore.Ragel.State(sam_header_machine.start_state, input), Header()) + readheader!(reader) + reader.state.cs = sam_body_machine.start_state + return reader + end +end + +""" + SAM.Reader(input::IO) + +Create a data reader of the SAM file format. + +# Arguments +* `input`: data source +""" +function Reader(input::IO) + return Reader(BufferedStreams.BufferedInputStream(input)) +end + +function BioCore.IO.stream(reader::Reader) + return reader.state.stream +end + +""" + header(reader::Reader)::Header + +Get the header of `reader`. +""" +function header(reader::Reader)::Header + return reader.header +end + +function BioCore.header(reader::Reader) + return header(reader) +end + +function Base.eltype(::Type{Reader}) + return Record +end + +# file = header . body +# header = metainfo* +# body = record* +isinteractive() && info("compiling SAM") +const sam_metainfo_machine, sam_record_machine, sam_header_machine, sam_body_machine = (function () + cat = Automa.RegExp.cat + rep = Automa.RegExp.rep + alt = Automa.RegExp.alt + opt = Automa.RegExp.opt + any = Automa.RegExp.any + + metainfo = let + tag = re"[A-Z][A-Z]" \ cat("CO") + tag.actions[:enter] = [:mark1] + tag.actions[:exit] = [:metainfo_tag] + + dict = let + key = re"[A-Za-z][A-Za-z0-9]" + key.actions[:enter] = [:mark2] + key.actions[:exit] = [:metainfo_dict_key] + val = re"[ -~]+" + val.actions[:enter] = [:mark2] + val.actions[:exit] = [:metainfo_dict_val] + keyval = cat(key, ':', val) + + cat(keyval, rep(cat('\t', keyval))) + end + dict.actions[:enter] = [:mark1] + dict.actions[:exit] = [:metainfo_val] + + co = cat("CO") + co.actions[:enter] = [:mark1] + co.actions[:exit] = [:metainfo_tag] + + comment = re"[^\r\n]*" + comment.actions[:enter] = [:mark1] + comment.actions[:exit] = [:metainfo_val] + + cat('@', alt(cat(tag, '\t', dict), cat(co, '\t', comment))) + end + metainfo.actions[:enter] = [:anchor] + metainfo.actions[:exit] = [:metainfo] + + record = let + qname = re"[!-?A-~]+" + qname.actions[:enter] = [:mark] + qname.actions[:exit] = [:record_qname] + + flag = re"[0-9]+" + flag.actions[:enter] = [:mark] + flag.actions[:exit] = [:record_flag] + + rname = re"\*|[!-()+-<>-~][!-~]*" + rname.actions[:enter] = [:mark] + rname.actions[:exit] = [:record_rname] + + pos = re"[0-9]+" + pos.actions[:enter] = [:mark] + pos.actions[:exit] = [:record_pos] + + mapq = re"[0-9]+" + mapq.actions[:enter] = [:mark] + mapq.actions[:exit] = [:record_mapq] + + cigar = re"\*|([0-9]+[MIDNSHPX=])+" + cigar.actions[:enter] = [:mark] + cigar.actions[:exit] = [:record_cigar] + + rnext = re"\*|=|[!-()+-<>-~][!-~]*" + rnext.actions[:enter] = [:mark] + rnext.actions[:exit] = [:record_rnext] + + pnext = re"[0-9]+" + pnext.actions[:enter] = [:mark] + pnext.actions[:exit] = [:record_pnext] + + tlen = re"[-+]?[0-9]+" + tlen.actions[:enter] = [:mark] + tlen.actions[:exit] = [:record_tlen] + + seq = re"\*|[A-Za-z=.]+" + seq.actions[:enter] = [:mark] + seq.actions[:exit] = [:record_seq] + + qual = re"[!-~]+" + qual.actions[:enter] = [:mark] + qual.actions[:exit] = [:record_qual] + + field = let + tag = re"[A-Za-z][A-Za-z0-9]" + val = alt( + re"A:[!-~]", + re"i:[-+]?[0-9]+", + re"f:[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?", + re"Z:[ !-~]*", + re"H:([0-9A-F][0-9A-F])*", + re"B:[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+") + + cat(tag, ':', val) + end + field.actions[:enter] = [:mark] + field.actions[:exit] = [:record_field] + + cat( + qname, '\t', + flag, '\t', + rname, '\t', + pos, '\t', + mapq, '\t', + cigar, '\t', + rnext, '\t', + pnext, '\t', + tlen, '\t', + seq, '\t', + qual, + rep(cat('\t', field))) + end + record.actions[:enter] = [:anchor] + record.actions[:exit] = [:record] + + newline = let + lf = re"\n" + lf.actions[:enter] = [:countline] + + cat(re"\r?", lf) + end + + header′ = rep(cat(metainfo, newline)) + header′.actions[:exit] = [:header] + header = cat(header′, opt(any() \ cat('@'))) # look ahead + + body = rep(cat(record, newline)) + + return map(Automa.compile, (metainfo, record, header, body)) +end)() + +const sam_metainfo_actions = Dict( + :metainfo_tag => :(record.tag = (mark1:p-1) .- offset), + :metainfo_val => :(record.val = (mark1:p-1) .- offset), + :metainfo_dict_key => :(push!(record.dictkey, (mark2:p-1) .- offset)), + :metainfo_dict_val => :(push!(record.dictval, (mark2:p-1) .- offset)), + :metainfo => quote + BioCore.ReaderHelper.resize_and_copy!(record.data, data, offset+1:p-1) + record.filled = (offset+1:p-1) .- offset + end, + :anchor => :(), + :mark1 => :(mark1 = p), + :mark2 => :(mark2 = p)) +eval( + BioCore.ReaderHelper.generate_index_function( + MetaInfo, + sam_metainfo_machine, + :(mark1 = mark2 = offset = 0), + sam_metainfo_actions)) +eval( + BioCore.ReaderHelper.generate_readheader_function( + Reader, + MetaInfo, + sam_header_machine, + :(mark1 = mark2 = offset = 0), + merge(sam_metainfo_actions, Dict( + :metainfo => quote + BioCore.ReaderHelper.resize_and_copy!(record.data, data, BioCore.ReaderHelper.upanchor!(stream):p-1) + record.filled = (offset+1:p-1) .- offset + @assert isfilled(record) + push!(reader.header.metainfo, record) + BioCore.ReaderHelper.ensure_margin!(stream) + record = MetaInfo() + end, + :header => :(finish_header = true; @escape), + :countline => :(linenum += 1), + :anchor => :(BioCore.ReaderHelper.anchor!(stream, p); offset = p - 1))), + quote + if !eof(stream) + stream.position -= 1 # cancel look-ahead + end + end)) + +const sam_record_actions = Dict( + :record_qname => :(record.qname = (mark:p-1) .- offset), + :record_flag => :(record.flag = (mark:p-1) .- offset), + :record_rname => :(record.rname = (mark:p-1) .- offset), + :record_pos => :(record.pos = (mark:p-1) .- offset), + :record_mapq => :(record.mapq = (mark:p-1) .- offset), + :record_cigar => :(record.cigar = (mark:p-1) .- offset), + :record_rnext => :(record.rnext = (mark:p-1) .- offset), + :record_pnext => :(record.pnext = (mark:p-1) .- offset), + :record_tlen => :(record.tlen = (mark:p-1) .- offset), + :record_seq => :(record.seq = (mark:p-1) .- offset), + :record_qual => :(record.qual = (mark:p-1) .- offset), + :record_field => :(push!(record.fields, (mark:p-1) .- offset)), + :record => quote + BioCore.ReaderHelper.resize_and_copy!(record.data, data, 1:p-1) + record.filled = (offset+1:p-1) .- offset + end, + :anchor => :(), + :mark => :(mark = p)) +eval( + BioCore.ReaderHelper.generate_index_function( + Record, + sam_record_machine, + :(mark = offset = 0), + sam_record_actions)) +eval( + BioCore.ReaderHelper.generate_read_function( + Reader, + sam_body_machine, + :(mark = offset = 0), + merge(sam_record_actions, Dict( + :record => quote + BioCore.ReaderHelper.resize_and_copy!(record.data, data, BioCore.ReaderHelper.upanchor!(stream):p-1) + record.filled = (offset+1:p-1) .- offset + found_record = true + @escape + end, + :countline => :(linenum += 1), + :anchor => :(BioCore.ReaderHelper.anchor!(stream, p); offset = p - 1))))) diff --git a/src/sam/record.jl b/src/sam/record.jl new file mode 100644 index 0000000..25d39d8 --- /dev/null +++ b/src/sam/record.jl @@ -0,0 +1,624 @@ +# SAM Record +# ========== + +mutable struct Record + # data and filled range + data::Vector{UInt8} + filled::UnitRange{Int} + # indexes + qname::UnitRange{Int} + flag::UnitRange{Int} + rname::UnitRange{Int} + pos::UnitRange{Int} + mapq::UnitRange{Int} + cigar::UnitRange{Int} + rnext::UnitRange{Int} + pnext::UnitRange{Int} + tlen::UnitRange{Int} + seq::UnitRange{Int} + qual::UnitRange{Int} + fields::Vector{UnitRange{Int}} +end + +""" + SAM.Record() + +Create an unfilled SAM record. +""" +function Record() + return Record( + UInt8[], 1:0, + # qname-mapq + 1:0, 1:0, 1:0, 1:0, 1:0, + # cigar-seq + 1:0, 1:0, 1:0, 1:0, 1:0, + # qual and fields + 1:0, UnitRange{Int}[]) +end + +""" + SAM.Record(data::Vector{UInt8}) + +Create a SAM record from `data`. +This function verifies the format and indexes fields for accessors. +Note that the ownership of `data` is transferred to a new record object. +""" +function Record(data::Vector{UInt8}) + return convert(Record, data) +end + +function Base.convert(::Type{Record}, data::Vector{UInt8}) + record = Record( + data, 1:0, + # qname-mapq + 1:0, 1:0, 1:0, 1:0, 1:0, + # cigar-seq + 1:0, 1:0, 1:0, 1:0, 1:0, + # qual and fields + 1:0, UnitRange{Int}[]) + index!(record) + return record +end + +""" + SAM.Record(str::AbstractString) + +Create a SAM record from `str`. +This function verifies the format and indexes fields for accessors. +""" +function Record(str::AbstractString) + return convert(Record, str) +end + +function Base.convert(::Type{Record}, str::AbstractString) + return Record(Vector{UInt8}(str)) +end + +function Base.show(io::IO, record::Record) + print(io, summary(record), ':') + if isfilled(record) + println(io) + println(io, " template name: ", hastempname(record) ? tempname(record) : "") + println(io, " flag: ", hasflag(record) ? flag(record) : "") + println(io, " reference: ", hasrefname(record) ? refname(record) : "") + println(io, " position: ", hasposition(record) ? position(record) : "") + println(io, " mapping quality: ", hasmappingquality(record) ? mappingquality(record) : "") + println(io, " CIGAR: ", hascigar(record) ? cigar(record) : "") + println(io, " next reference: ", hasnextrefname(record) ? nextrefname(record) : "") + println(io, " next position: ", hasnextposition(record) ? nextposition(record) : "") + println(io, " template length: ", hastemplength(record) ? templength(record) : "") + println(io, " sequence: ", hassequence(record) ? sequence(String, record) : "") + println(io, " base quality: ", hasquality(record) ? quality(String, record) : "") + print(io, " auxiliary data:") + for field in record.fields + print(io, ' ', String(record.data[field])) + end + else + print(io, " ") + end +end + +function Base.print(io::IO, record::Record) + write(io, record) + return nothing +end + +function Base.write(io::IO, record::Record) + checkfilled(record) + return unsafe_write(io, pointer(record.data, first(record.filled)), length(record.filled)) +end + +function Base.copy(record::Record) + return Record( + copy(record.data), + record.filled, + record.qname, + record.flag, + record.rname, + record.pos, + record.mapq, + record.cigar, + record.rnext, + record.pnext, + record.tlen, + record.seq, + record.qual, + copy(record.fields)) +end + + +# Accessor Functions +# ------------------ + +""" + flag(record::Record)::UInt16 + +Get the bitwise flag of `record`. +""" +function flag(record::Record)::UInt16 + checkfilled(record) + return unsafe_parse_decimal(UInt16, record.data, record.flag) +end + +function hasflag(record::Record) + return isfilled(record) +end + +""" + ismapped(record::Record)::Bool + +Test if `record` is mapped. +""" +function ismapped(record::Record)::Bool + return isfilled(record) && (flag(record) & FLAG_UNMAP == 0) +end + +""" + isprimary(record::Record)::Bool + +Test if `record` is a primary line of the read. + +This is equivalent to `flag(record) & 0x900 == 0`. +""" +function isprimary(record::Record)::Bool + return flag(record) & 0x900 == 0 +end + +""" + refname(record::Record)::String + +Get the reference sequence name of `record`. +""" +function refname(record::Record) + checkfilled(record) + if ismissing(record, record.rname) + missingerror(:refname) + end + return String(record.data[record.rname]) +end + +function hasrefname(record::Record) + return isfilled(record) && !ismissing(record, record.rname) +end + +""" + position(record::Record)::Int + +Get the 1-based leftmost mapping position of `record`. +""" +function position(record::Record)::Int + checkfilled(record) + pos = unsafe_parse_decimal(Int, record.data, record.pos) + if pos == 0 + missingerror(:position) + end + return pos +end + +function hasposition(record::Record) + return isfilled(record) && (length(record.pos) != 1 || record.data[first(record.pos)] != UInt8('0')) +end + +""" + rightposition(record::Record)::Int + +Get the 1-based rightmost mapping position of `record`. +""" +function rightposition(record::Record) + return position(record) + alignlength(record) - 1 +end + +function hasrightposition(record::Record) + return hasposition(record) && hasalignment(record) +end + +""" + isnextmapped(record::Record)::Bool + +Test if the mate/next read of `record` is mapped. +""" +function isnextmapped(record::Record)::Bool + return isfilled(record) && (flag(record) & FLAG_MUNMAP == 0) +end + +""" + nextrefname(record::Record)::String + +Get the reference name of the mate/next read of `record`. +""" +function nextrefname(record::Record)::String + checkfilled(record) + if ismissing(record, record.rnext) + missingerror(:nextrefname) + end + return String(record.data[record.rnext]) +end + +function hasnextrefname(record::Record) + return isfilled(record) && !ismissing(record, record.rnext) +end + +""" + nextposition(record::Record)::Int + +Get the position of the mate/next read of `record`. +""" +function nextposition(record::Record)::Int + checkfilled(record) + pos = unsafe_parse_decimal(Int, record.data, record.pnext) + if pos == 0 + missingerror(:nextposition) + end + return pos +end + +function hasnextposition(record::Record) + return isfilled(record) && (length(record.pnext) != 1 || record.data[first(record.pnext)] != UInt8('0')) +end + +""" + mappingquality(record::Record)::UInt8 + +Get the mapping quality of `record`. +""" +function mappingquality(record::Record)::UInt8 + checkfilled(record) + qual = unsafe_parse_decimal(UInt8, record.data, record.mapq) + if qual == 0xff + missingerror(:mappingquality) + end + return qual +end + +function hasmappingquality(record::Record) + return isfilled(record) && unsafe_parse_decimal(UInt8, record.data, record.mapq) != 0xff +end + +""" + cigar(record::Record)::String + +Get the CIGAR string of `record`. +""" +function cigar(record::Record)::String + checkfilled(record) + if ismissing(record, record.cigar) + missingerror(:cigar) + end + return String(record.data[record.cigar]) +end + +function hascigar(record::Record) + return isfilled(record) && !ismissing(record, record.cigar) +end + +""" + alignment(record::Record)::BioAlignments.Alignment + +Get the alignment of `record`. +""" +function alignment(record::Record)::BioAlignments.Alignment + if ismapped(record) + return BioAlignments.Alignment(cigar(record), 1, position(record)) + else + return BioAlignments.Alignment(BioAlignments.AlignmentAnchor[]) + end +end + +function hasalignment(record::Record) + return isfilled(record) && hascigar(record) +end + +""" + alignlength(record::Record)::Int + +Get the alignment length of `record`. +""" +function alignlength(record::Record)::Int + if length(record.cigar) == 1 && record.data[first(record.cigar)] == UInt8('*') + return 0 + end + ret::Int = 0 + len = 0 # operation length + for i in record.cigar + c = record.data[i] + if c ∈ UInt8('0'):UInt8('9') + len = len * 10 + (c - UInt8('0')) + else + op = convert(BioAlignments.Operation, Char(c)) + if BioAlignments.ismatchop(op) || BioAlignments.isdeleteop(op) + ret += len + len = 0 + end + end + end + return ret +end + +""" + tempname(record::Record)::String + +Get the query template name of `record`. +""" +function tempname(record::Record)::String + checkfilled(record) + if ismissing(record, record.qname) + missingerror(:tempname) + end + return String(record.data[record.qname]) +end + +function hastempname(record::Record) + return isfilled(record) && !ismissing(record, record.qname) +end + +""" + templength(record::Record)::Int + +Get the template length of `record`. +""" +function templength(record::Record)::Int + checkfilled(record) + len = unsafe_parse_decimal(Int, record.data, record.tlen) + if len == 0 + missingerror(:tlen) + end + return len +end + +function hastemplength(record::Record) + return isfilled(record) && (length(record.tlen) != 1 || record.data[first(record.tlen)] != UInt8('0')) +end + +""" + sequence(record::Record)::BioSequences.DNASequence + +Get the segment sequence of `record`. +""" +function sequence(record::Record)::BioSequences.DNASequence + checkfilled(record) + if ismissing(record, record.seq) + missingerror(:sequence) + end + seqlen = length(record.seq) + ret = BioSequences.DNASequence(seqlen) + BioSequences.encode_copy!(ret, 1, record.data, first(record.seq), seqlen) + return ret +end + +function hassequence(record::Record) + return isfilled(record) && !ismissing(record, record.seq) +end +""" + sequence(::Type{String}, record::Record)::String + +Get the segment sequence of `record` as `String`. +""" +function sequence(::Type{String}, record::Record)::String + checkfilled(record) + return String(record.data[record.seq]) +end + +""" + seqlength(record::Record)::Int + +Get the sequence length of `record`. +""" +function seqlength(record::Record)::Int + checkfilled(record) + if ismissing(record, record.seq) + missingerror(:seq) + end + return length(record.seq) +end + +function hasseqlength(record::Record) + return isfilled(record) && !ismissing(record, record.seq) +end + +""" + quality(record::Record)::Vector{UInt8} + +Get the Phred-scaled base quality of `record`. +""" +function quality(record::Record)::Vector{UInt8} + checkfilled(record) + if ismissing(record, record.qual) + missingerror(:quality) + end + qual = record.data[record.qual] + for i in 1:lastindex(qual) + @inbounds qual[i] -= 33 + end + return qual +end + +function hasquality(record::Record) + return isfilled(record) && !ismissing(record, record.qual) +end + +""" + quality(::Type{String}, record::Record)::String + +Get the ASCII-encoded base quality of `record`. +""" +function quality(::Type{String}, record::Record)::String + checkfilled(record) + return String(record.data[record.qual]) +end + +""" + auxdata(record::Record)::Dict{String,Any} + +Get the auxiliary data (optional fields) of `record`. +""" +function auxdata(record::Record)::Dict{String,Any} + checkfilled(record) + return Dict(k => record[k] for k in keys(record)) +end + +function Base.haskey(record::Record, tag::AbstractString) + return findauxtag(record, tag) > 0 +end + +function Base.getindex(record::Record, tag::AbstractString) + i = findauxtag(record, tag) + if i == 0 + throw(KeyError(tag)) + end + field = record.fields[i] + # data type + typ = record.data[first(field)+3] + lo = first(field) + 5 + if i == lastindex(record.fields) + hi = last(field) + else + hi = first(record.fields[i+1]) - 2 + end + if typ == UInt8('A') + @assert lo == hi + return Char(record.data[lo]) + elseif typ == UInt8('i') + return unsafe_parse_decimal(Int, record.data, lo:hi) + elseif typ == UInt8('f') + # TODO: Call a C function directly for speed? + return parse(Float32, SubString(record.data[lo:hi])) + elseif typ == UInt8('Z') + return String(record.data[lo:hi]) + elseif typ == UInt8('H') + return parse_hexarray(record.data, lo:hi) + elseif typ == UInt8('B') + return parse_typedarray(record.data, lo:hi) + else + throw(ArgumentError("type code '$(Char(typ))' is not defined")) + end +end + +function Base.keys(record::Record) + checkfilled(record) + return [String(record.data[first(f):first(f)+1]) for f in record.fields] +end + +function Base.values(record::Record) + return [record[k] for k in keys(record)] +end + + +# Bio Methods +# ----------- + +function BioCore.isfilled(record::Record) + return !isempty(record.filled) +end + +function BioCore.seqname(record::Record) + return tempname(record) +end + +function BioCore.hasseqname(record::Record) + return hastempname(record) +end + +function BioCore.sequence(record::Record) + return sequence(record) +end + +function BioCore.hassequence(record::Record) + return hassequence(record) +end + +function BioCore.rightposition(record::Record) + return rightposition(record) +end + +function BioCore.hasrightposition(record::Record) + return hasrightposition(record) +end + +function BioCore.leftposition(record::Record) + return position(record) +end + +function BioCore.hasleftposition(record::Record) + return hasposition(record) +end + + +# Helper Functions +# ---------------- + +function initialize!(record::Record) + record.filled = 1:0 + record.qname = 1:0 + record.flag = 1:0 + record.rname = 1:0 + record.pos = 1:0 + record.mapq = 1:0 + record.cigar = 1:0 + record.rnext = 1:0 + record.pnext = 1:0 + record.tlen = 1:0 + record.seq = 1:0 + record.qual = 1:0 + empty!(record.fields) + return record +end + +function checkfilled(record::Record) + if !isfilled(record) + throw(ArgumentError("unfilled SAM record")) + end +end + +function findauxtag(record::Record, tag::AbstractString) + checkfilled(record) + if sizeof(tag) != 2 + return 0 + end + t1, t2 = UInt8(tag[1]), UInt8(tag[2]) + for (i, field) in enumerate(record.fields) + p = first(field) + if record.data[p] == t1 && record.data[p+1] == t2 + return i + end + end + return 0 +end + +function parse_hexarray(data::Vector{UInt8}, range::UnitRange{Int}) + @assert iseven(length(range)) + ret = Vector{UInt8}(length(range) >> 1) + byte2hex(b) = b ∈ 0x30:0x39 ? (b - 0x30) : b ∈ 0x41:0x46 ? (b - 0x41 + 0x0A) : error("not in [0-9A-F]") + j = 1 + for i in first(range):2:last(range)-1 + ret[j] = (byte2hex(data[range[i]]) << 4) | byte2hex(data[range[i+1]]) + j += 1 + end + return ret +end + +function parse_typedarray(data::Vector{UInt8}, range::UnitRange{Int}) + # format: [cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+ + t = data[first(range)] + xs = split(String(data[first(range)+2:last(range)])) + if t == UInt8('c') + return [parse(Int8, x) for x in xs] + elseif t == UInt8('C') + return [parse(UInt8, x) for x in xs] + elseif t == UInt8('s') + return [parse(Int16, x) for x in xs] + elseif t == UInt8('S') + return [parse(UInt16, x) for x in xs] + elseif t == UInt8('i') + return [parse(Int32, x) for x in xs] + elseif t == UInt8('I') + return [parse(UInt32, x) for x in xs] + elseif t == UInt8('f') + return [parse(Float32, x) for x in xs] + else + throw(ArgumentError("type code '$(Char(t))' is not defined")) + end +end + +function ismissing(record::Record, range::UnitRange{Int}) + return length(range) == 1 && record.data[first(range)] == UInt8('*') +end diff --git a/src/sam/sam.jl b/src/sam/sam.jl new file mode 100644 index 0000000..2b46653 --- /dev/null +++ b/src/sam/sam.jl @@ -0,0 +1,23 @@ +# SAM File Format +# =============== + +module SAM + +import Automa +import Automa.RegExp: @re_str +import BioAlignments +import BioCore.Exceptions: missingerror +import BioCore.RecordHelper: unsafe_parse_decimal +import BioCore: BioCore, isfilled +import BioSequences +import BufferedStreams +using Printf: @sprintf + +include("flags.jl") +include("metainfo.jl") +include("record.jl") +include("header.jl") +include("reader.jl") +include("writer.jl") + +end diff --git a/src/sam/writer.jl b/src/sam/writer.jl new file mode 100644 index 0000000..e7a5ddd --- /dev/null +++ b/src/sam/writer.jl @@ -0,0 +1,43 @@ +# SAM Writer +# ========== + +""" + Writer(output::IO, header::Header=Header()) + +Create a data writer of the SAM file format. + +# Arguments +* `output`: data sink +* `header=Header()`: SAM header object +""" +mutable struct Writer <: BioCore.IO.AbstractWriter + stream::IO + + function Writer(output::IO, header::Header=Header()) + writer = new(output) + write(writer, header) + return writer + end +end + +function BioCore.IO.stream(writer::Writer) + return writer.stream +end + +function Base.write(writer::Writer, header::Header) + n = 0 + for metainfo in header + n += write(writer, metainfo) + end + return n +end + +function Base.write(writer::Writer, metainfo::MetaInfo) + checkfilled(metainfo) + return write(writer.stream, metainfo, '\n') +end + +function Base.write(writer::Writer, record::Record) + checkfilled(record) + return write(writer.stream, record, '\n') +end diff --git a/test/runtests.jl b/test/runtests.jl new file mode 100644 index 0000000..1182fb5 --- /dev/null +++ b/test/runtests.jl @@ -0,0 +1,426 @@ +using Test +using BioAlignments +using BioSymbols +import BGZFStreams: BGZFStream +import BioCore.Exceptions: MissingFieldException +import BioCore.Testing.get_bio_fmt_specimens +import BioSequences: @dna_str, @aa_str +import GenomicFeatures +import YAML + +# Generate a random range within `range`. +function randrange(range) + x = rand(range) + y = rand(range) + if x < y + return x:y + else + return y:x + end +end + +@testset "SAM" begin + samdir = joinpath(get_bio_fmt_specimens("master", false, true), "SAM") + + @testset "MetaInfo" begin + metainfo = SAM.MetaInfo() + @test !isfilled(metainfo) + @test occursin("not filled", repr(metainfo)) + + metainfo = SAM.MetaInfo("CO", "some comment (parens)") + @test isfilled(metainfo) + @test string(metainfo) == "@CO\tsome comment (parens)" + @test occursin("CO", repr(metainfo)) + @test SAM.tag(metainfo) == "CO" + @test SAM.value(metainfo) == "some comment (parens)" + @test_throws ArgumentError keys(metainfo) + @test_throws ArgumentError values(metainfo) + + metainfo = SAM.MetaInfo("HD", ["VN" => "1.0", "SO" => "coordinate"]) + @test isfilled(metainfo) + @test string(metainfo) == "@HD\tVN:1.0\tSO:coordinate" + @test occursin("HD", repr(metainfo)) + @test SAM.tag(metainfo) == "HD" + @test SAM.value(metainfo) == "VN:1.0\tSO:coordinate" + @test keys(metainfo) == ["VN", "SO"] + @test values(metainfo) == ["1.0", "coordinate"] + @test SAM.keyvalues(metainfo) == ["VN" => "1.0", "SO" => "coordinate"] + @test haskey(metainfo, "VN") + @test haskey(metainfo, "SO") + @test !haskey(metainfo, "GO") + @test metainfo["VN"] == "1.0" + @test metainfo["SO"] == "coordinate" + @test_throws KeyError metainfo["GO"] + end + + @testset "Header" begin + header = SAM.Header() + @test isempty(header) + push!(header, SAM.MetaInfo("@HD\tVN:1.0\tSO:coordinate")) + @test !isempty(header) + @test length(header) == 1 + push!(header, SAM.MetaInfo("@CO\tsome comment")) + @test length(header) == 2 + @test isa(collect(header), Vector{SAM.MetaInfo}) + end + + @testset "Record" begin + record = SAM.Record() + @test !isfilled(record) + @test !SAM.ismapped(record) + @test repr(record) == "BioAlignments.SAM.Record: " + @test_throws ArgumentError SAM.flag(record) + + record = SAM.Record("r001\t99\tchr1\t7\t30\t8M2I4M1D3M\t=\t37\t39\tTTAGATAAAGGATACTG\t*") + @test isfilled(record) + @test occursin(r"^BioAlignments.SAM.Record:\n", repr(record)) + @test SAM.ismapped(record) + @test SAM.isprimary(record) + @test SAM.hastempname(record) + @test SAM.tempname(record) == "r001" + @test SAM.hasflag(record) + @test SAM.flag(record) === UInt16(99) + @test SAM.hasrefname(record) + @test SAM.refname(record) == "chr1" + @test SAM.hasposition(record) + @test SAM.position(record) === 7 + @test SAM.hasmappingquality(record) + @test SAM.mappingquality(record) === UInt8(30) + @test SAM.hascigar(record) + @test SAM.cigar(record) == "8M2I4M1D3M" + @test SAM.hasnextrefname(record) + @test SAM.nextrefname(record) == "=" + @test SAM.hasnextposition(record) + @test SAM.nextposition(record) === 37 + @test SAM.hastemplength(record) + @test SAM.templength(record) === 39 + @test SAM.hassequence(record) + @test SAM.sequence(record) == dna"TTAGATAAAGGATACTG" + @test !SAM.hasquality(record) + @test_throws MissingFieldException SAM.quality(record) + end + + @testset "Reader" begin + reader = open(SAM.Reader, joinpath(samdir, "ce#1.sam")) + @test isa(reader, SAM.Reader) + @test eltype(reader) === SAM.Record + + # header + h = header(reader) + @test string.(findall(header(reader), "SQ")) == ["@SQ\tSN:CHROMOSOME_I\tLN:1009800"] + + # first record + record = SAM.Record() + read!(reader, record) + @test SAM.ismapped(record) + @test SAM.refname(record) == "CHROMOSOME_I" + @test SAM.position(record) == leftposition(record) == 2 + @test SAM.rightposition(record) == rightposition(record) == 102 + @test SAM.tempname(record) == seqname(record) == "SRR065390.14978392" + @test SAM.sequence(record) == sequence(record) == dna"CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA" + @test SAM.sequence(String, record) == "CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA" + @test SAM.seqlength(record) == 100 + @test SAM.quality(record) == (b"#############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC" .- 33) + @test SAM.quality(String, record) == "#############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC" + @test SAM.flag(record) == 16 + @test SAM.cigar(record) == "27M1D73M" + @test SAM.alignment(record) == Alignment([ + AlignmentAnchor( 0, 1, OP_START), + AlignmentAnchor( 27, 28, OP_MATCH), + AlignmentAnchor( 27, 29, OP_DELETE), + AlignmentAnchor(100, 102, OP_MATCH)]) + @test record["XG"] == 1 + @test record["XM"] == 5 + @test record["XN"] == 0 + @test record["XO"] == 1 + @test record["AS"] == -18 + @test record["XS"] == -18 + @test record["YT"] == "UU" + @test eof(reader) + close(reader) + + # iterator + @test length(collect(open(SAM.Reader, joinpath(samdir, "ce#1.sam")))) == 1 + @test length(collect(open(SAM.Reader, joinpath(samdir, "ce#2.sam")))) == 2 + + # IOStream + @test length(collect(SAM.Reader(open(joinpath(samdir, "ce#1.sam"))))) == 1 + @test length(collect(SAM.Reader(open(joinpath(samdir, "ce#2.sam"))))) == 2 + end + + @testset "Round trip" begin + function compare_records(xs, ys) + if length(xs) != length(ys) + return false + end + for (x, y) in zip(xs, ys) + if x.data[x.filled] != y.data[y.filled] + return false + end + end + return true + end + for specimen in YAML.load_file(joinpath(samdir, "index.yml")) + filepath = joinpath(samdir, specimen["filename"]) + mktemp() do path, io + # copy + reader = open(SAM.Reader, filepath) + writer = SAM.Writer(io, header(reader)) + records = SAM.Record[] + for record in reader + push!(records, record) + write(writer, record) + end + close(reader) + close(writer) + @test compare_records(open(collect, SAM.Reader, path), records) + end + end + end +end + +@testset "BAM" begin + bamdir = joinpath(get_bio_fmt_specimens("master", false), "BAM") + + @testset "AuxData" begin + auxdata = BAM.AuxData(UInt8[]) + @test isempty(auxdata) + + buf = IOBuffer() + write(buf, "NM", UInt8('s'), Int16(1)) + auxdata = BAM.AuxData(take!(buf)) + @test length(auxdata) == 1 + @test auxdata["NM"] === Int16(1) + @test collect(auxdata) == ["NM" => Int16(1)] + + buf = IOBuffer() + write(buf, "AS", UInt8('c'), Int8(-18)) + write(buf, "NM", UInt8('s'), Int16(1)) + write(buf, "XA", UInt8('f'), Float32(3.14)) + write(buf, "XB", UInt8('Z'), "some text\0") + write(buf, "XC", UInt8('B'), UInt8('i'), Int32(3), Int32[10, -5, 8]) + auxdata = BAM.AuxData(take!(buf)) + @test length(auxdata) == 5 + @test auxdata["AS"] === Int8(-18) + @test auxdata["NM"] === Int16(1) + @test auxdata["XA"] === Float32(3.14) + @test auxdata["XB"] == "some text" + @test auxdata["XC"] == Int32[10, -5, 8] + @test convert(Dict{String,Any}, auxdata) == Dict( + "AS" => Int8(-18), + "NM" => Int16(1), + "XA" => Float32(3.14), + "XB" => "some text", + "XC" => Int32[10, -5, 8]) + end + + @testset "Record" begin + record = BAM.Record() + @test !isfilled(record) + @test repr(record) == "BioAlignments.BAM.Record: " + @test_throws ArgumentError BAM.flag(record) + end + + @testset "Reader" begin + reader = open(BAM.Reader, joinpath(bamdir, "ce#1.bam")) + @test isa(reader, BAM.Reader) + @test eltype(reader) === BAM.Record + @test startswith(repr(reader), "BioAlignments.BAM.Reader{IOStream}:") + + # header + h = header(reader) + @test isa(h, SAM.Header) + + # first record + record = BAM.Record() + read!(reader, record) + @test BAM.ismapped(record) + @test BAM.isprimary(record) + @test ! BAM.ispositivestrand(record) + @test BAM.refname(record) == "CHROMOSOME_I" + @test BAM.refid(record) === 1 + @test BAM.hasnextrefid(record) + @test BAM.nextrefid(record) === 0 + @test BAM.hasposition(record) === hasleftposition(record) === true + @test BAM.position(record) === leftposition(record) === 2 + @test BAM.hasnextposition(record) + @test BAM.nextposition(record) === 0 + @test rightposition(record) == 102 + @test BAM.hastempname(record) === hasseqname(record) === true + @test BAM.tempname(record) == seqname(record) == "SRR065390.14978392" + @test BAM.hassequence(record) === hassequence(record) === true + @test BAM.sequence(record) == sequence(record) == dna""" + CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCT + AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA + """ + @test BAM.seqlength(record) === 100 + @test BAM.hasquality(record) + @test eltype(BAM.quality(record)) == UInt8 + @test BAM.quality(record) == [Int(x) - 33 for x in "#############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"] + @test BAM.flag(record) === UInt16(16) + @test BAM.cigar(record) == "27M1D73M" + @test BAM.alignment(record) == Alignment([ + AlignmentAnchor( 0, 1, OP_START), + AlignmentAnchor( 27, 28, OP_MATCH), + AlignmentAnchor( 27, 29, OP_DELETE), + AlignmentAnchor(100, 102, OP_MATCH)]) + @test record["XG"] == 1 + @test record["XM"] == 5 + @test record["XN"] == 0 + @test record["XO"] == 1 + @test record["AS"] == -18 + @test record["XS"] == -18 + @test record["YT"] == "UU" + @test keys(record) == ["XG","XM","XN","XO","AS","XS","YT"] + @test values(record) == [1, 5, 0, 1, -18, -18, "UU"] + @test eof(reader) + close(reader) + + # Test conversion from byte array to record + dsize = BAM.data_size(record) + array = Vector{UInt8}(undef, BAM.FIXED_FIELDS_BYTES + dsize) + GC.@preserve array record begin + ptr = Ptr{UInt8}(pointer_from_objref(record)) + unsafe_copyto!(pointer(array), ptr, BAM.FIXED_FIELDS_BYTES) + unsafe_copyto!(array, BAM.FIXED_FIELDS_BYTES + 1, record.data, 1, dsize) + end + new_record = convert(BAM.Record, array) + @test record.bin_mq_nl == new_record.bin_mq_nl + @test record.block_size == new_record.block_size + @test record.flag_nc == new_record.flag_nc + @test record.l_seq == new_record.l_seq + @test record.next_refid == new_record.next_refid + @test record.next_pos == new_record.next_pos + @test record.refid == new_record.refid + @test record.pos == new_record.pos + @test record.tlen == new_record.tlen + @test record.data == new_record.data + + # iterator + @test length(collect(open(BAM.Reader, joinpath(bamdir, "ce#1.bam")))) == 1 + @test length(collect(open(BAM.Reader, joinpath(bamdir, "ce#2.bam")))) == 2 + + # IOStream + @test length(collect(BAM.Reader(open(joinpath(bamdir, "ce#1.bam"))))) == 1 + @test length(collect(BAM.Reader(open(joinpath(bamdir, "ce#2.bam"))))) == 2 + end + + @testset "Read long CIGARs" begin + function get_cigar_lens(rec::BAM.Record) + cigar_ops, cigar_n = BAM.cigar_rle(rec) + field_ops, field_n = BAM.cigar_rle(rec, false) + cigar_l = length(cigar_ops) + field_l = length(field_ops) + return cigar_l, field_l + end + + function check_cigar_vs_field(rec::BAM.Record) + cigar = BAM.cigar(rec) + field = BAM.cigar(rec, false) + cigar_l, field_l = get_cigar_lens(rec) + return cigar != field && cigar_l != field_l + end + + function check_cigar_lens(rec::BAM.Record, field_len, cigar_len) + cigar_l, field_l = get_cigar_lens(rec) + return cigar_l == cigar_len && field_l == field_len + end + + reader = open(BAM.Reader, joinpath(bamdir, "cigar-64k.bam")) + rec = BAM.Record() + read!(reader, rec) + @test !check_cigar_vs_field(rec) + read!(reader, rec) + @test check_cigar_vs_field(rec) + @test check_cigar_lens(rec, 2, 72091) + end + + function compare_records(xs, ys) + if length(xs) != length(ys) + return false + end + for (x, y) in zip(xs, ys) + if !( + x.block_size == y.block_size && + x.refid == y.refid && + x.pos == y.pos && + x.bin_mq_nl == y.bin_mq_nl && + x.flag_nc == y.flag_nc && + x.l_seq == y.l_seq && + x.next_refid == y.next_refid && + x.next_pos == y.next_pos && + x.tlen == y.tlen && + x.data[1:BAM.data_size(x)] == y.data[1:BAM.data_size(y)]) + return false + end + end + return true + end + + @testset "Round trip" begin + for specimen in YAML.load_file(joinpath(bamdir, "index.yml")) + filepath = joinpath(bamdir, specimen["filename"]) + mktemp() do path, _ + # copy + if occursin("bai", get(specimen, "tags", "")) + reader = open(BAM.Reader, filepath, index=filepath * ".bai") + else + reader = open(BAM.Reader, filepath) + end + writer = BAM.Writer( + BGZFStream(path, "w"), BAM.header(reader, fillSQ=isempty(findall(header(reader), "SQ")))) + records = BAM.Record[] + for record in reader + push!(records, record) + write(writer, record) + end + close(reader) + close(writer) + @test compare_records(open(collect, BAM.Reader, path), records) + end + end + end + + @testset "Random access" begin + filepath = joinpath(bamdir, "GSE25840_GSM424320_GM06985_gencode_spliced.head.bam") + reader = open(BAM.Reader, filepath, index=filepath * ".bai") + + @test isa(eachoverlap(reader, "chr1", 1:100), BAM.OverlapIterator) + @test isa(eachoverlap(reader, GenomicFeatures.Interval("chr1", 1, 100)), BAM.OverlapIterator) + + # expected values are counted using samtools + for (refname, interval, expected) in [ + ("chr1", 1_000:10000, 21), + ("chr1", 8_000:10000, 20), + ("chr1", 766_000:800_000, 142), + ("chr1", 786_000:800_000, 1), + ("chr1", 796_000:800_000, 0)] + intsect = eachoverlap(reader, refname, interval) + @test eltype(intsect) == BAM.Record + @test count(_ -> true, intsect) == expected + # check that the intersection iterator is stateless + @test count(_ -> true, intsect) == expected + end + + # randomized tests + for n in 1:50 + refindex = 1 + refname = "chr1" + range = randrange(1:1_000_000) + seekstart(reader) + # linear scan + expected = filter(collect(reader)) do record + BAM.compare_intervals(record, (refindex, range)) == 0 + end + # indexed scan + actual = collect(eachoverlap(reader, refname, range)) + @test compare_records(actual, expected) + end + close(reader) + + filepath = joinpath(bamdir, "R_12h_D06.uniq.q40.bam") + reader = open(BAM.Reader, filepath, index=filepath * ".bai") + @test isempty(collect(eachoverlap(reader, "chr19", 5823708:5846478))) + close(reader) + end +end From 1a3c986152c024111254acbd10dd60b19135d5b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Thu, 20 Feb 2020 21:19:07 +1100 Subject: [PATCH 07/14] Minimal code adjustments for working separation - Update to use GenomicFeatures v2. - BioAlignments v2. - BioSequences v2. - Indexes v0.1. --- Project.toml | 29 +++++++++++++++++++++++++++++ src/XAM.jl | 7 +++++-- src/bam/bai.jl | 4 ++-- src/bam/bam.jl | 13 ++++++++++--- src/bam/overlap.jl | 4 ++-- src/bam/reader.jl | 4 ---- src/bam/record.jl | 6 +++--- src/sam/reader.jl | 4 ---- src/sam/record.jl | 6 +++--- src/sam/sam.jl | 4 +++- test/runtests.jl | 26 +++++++++++++++++++------- 11 files changed, 76 insertions(+), 31 deletions(-) diff --git a/Project.toml b/Project.toml index 0156054..2f31d67 100644 --- a/Project.toml +++ b/Project.toml @@ -2,3 +2,32 @@ name = "XAM" uuid = "d759349c-bcba-11e9-07c2-5b90f8f05f7c" authors = ["Kenta Sato ", "Ben J. Ward ", "Ciarán O'Mara "] version = "0.1.0" + +[deps] +Automa = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b" +BGZFStreams = "28d598bf-9b8f-59f1-b38c-5a06b4a0f5e6" +BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e" +BioCore = "37cfa864-2cd6-5c12-ad9e-b6597d696c81" +BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" +BufferedStreams = "e1450e63-4bb3-523b-b2a4-4ffa8c0fd77d" +GenomicFeatures = "899a7d2d-5c61-547b-bef9-6698a8d05446" +Indexes = "4ffb77ac-cb80-11e8-1b35-4b78cc642f6d" +Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[compat] +Automa = "0.7, 0.8" +BGZFStreams = "0.3" +BioAlignments = "2" +BioCore = "2" +BioSequences = "2" +BufferedStreams = "1" +GenomicFeatures = "2" +Indexes = "0.1" +julia = "1.1" + +[extras] +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6" + +[targets] +test = ["Test", "YAML"] diff --git a/src/XAM.jl b/src/XAM.jl index 7fc5234..8609747 100644 --- a/src/XAM.jl +++ b/src/XAM.jl @@ -1,10 +1,13 @@ module XAM export - BAM, - SAM + SAM, + BAM include("sam/sam.jl") include("bam/bam.jl") +using .SAM +using .BAM + end # module diff --git a/src/bam/bai.jl b/src/bam/bai.jl index e5caf0a..035f17e 100644 --- a/src/bam/bai.jl +++ b/src/bam/bai.jl @@ -7,7 +7,7 @@ # An index type for the BAM file format. struct BAI # BGZF file index - index::GenomicFeatures.Indexes.BGZFIndex + index::Indexes.BGZFIndex # number of unmapped reads n_no_coors::Union{Nothing, Int} @@ -44,7 +44,7 @@ function read_bai(input::IO) # read contents n_refs = read(input, Int32) - index = GenomicFeatures.Indexes.read_bgzfindex(input, n_refs) + index = Indexes.read_bgzfindex(input, n_refs) if !eof(input) n_no_coors = read(input, UInt64) else diff --git a/src/bam/bam.jl b/src/bam/bam.jl index f3317f6..e055231 100644 --- a/src/bam/bam.jl +++ b/src/bam/bam.jl @@ -3,11 +3,18 @@ module BAM +using BioCore +using GenomicFeatures +using XAM.SAM + import BGZFStreams -import BioAlignments: BioAlignments, SAM -import GenomicFeatures: GenomicFeatures, Interval +import BioAlignments +import Indexes import BioSequences -import BioCore: BioCore, isfilled +import BioCore: isfilled, header + +import GenomicFeatures: eachoverlap + include("bai.jl") include("auxdata.jl") diff --git a/src/bam/overlap.jl b/src/bam/overlap.jl index 10641d3..c3475ba 100644 --- a/src/bam/overlap.jl +++ b/src/bam/overlap.jl @@ -36,7 +36,7 @@ mutable struct OverlapIteratorState refindex::Int # possibly overlapping chunks - chunks::Vector{GenomicFeatures.Indexes.Chunk} + chunks::Vector{Indexes.Chunk} # current chunk index chunkid::Int @@ -51,7 +51,7 @@ function Base.iterate(iter::OverlapIterator) throw(ArgumentError("sequence name $(iter.refname) is not found in the header")) end @assert iter.reader.index !== nothing - chunks = GenomicFeatures.Indexes.overlapchunks(iter.reader.index.index, refindex, iter.interval) + chunks = Indexes.overlapchunks(iter.reader.index.index, refindex, iter.interval) if !isempty(chunks) seek(iter.reader, first(chunks).start) end diff --git a/src/bam/reader.jl b/src/bam/reader.jl index 5e92b8e..906110a 100644 --- a/src/bam/reader.jl +++ b/src/bam/reader.jl @@ -66,10 +66,6 @@ function header(reader::Reader; fillSQ::Bool=false)::SAM.Header return header end -function BioCore.header(reader::Reader) - return header(reader) -end - function Base.seek(reader::Reader, voffset::BGZFStreams.VirtualOffset) seek(reader.stream, voffset) end diff --git a/src/bam/record.jl b/src/bam/record.jl index 31226d2..107f127 100644 --- a/src/bam/record.jl +++ b/src/bam/record.jl @@ -477,11 +477,11 @@ function hastemplength(record::Record) end """ - sequence(record::Record)::BioSequences.DNASequence + sequence(record::Record)::BioSequences.LongDNASeq Get the segment sequence of `record`. """ -function sequence(record::Record)::BioSequences.DNASequence +function sequence(record::Record)::BioSequences.LongDNASeq checkfilled(record) seqlen = seqlength(record) data = Vector{UInt64}(undef, cld(seqlen, 16)) @@ -491,7 +491,7 @@ function sequence(record::Record)::BioSequences.DNASequence x = unsafe_load(src, i) data[i] = (x & 0x0f0f0f0f0f0f0f0f) << 4 | (x & 0xf0f0f0f0f0f0f0f0) >> 4 end - return BioSequences.DNASequence(data, 1:seqlen, false) + return BioSequences.LongDNASeq(data, 1:seqlen, false) end function hassequence(record::Record) diff --git a/src/sam/reader.jl b/src/sam/reader.jl index ee016e4..d26a840 100644 --- a/src/sam/reader.jl +++ b/src/sam/reader.jl @@ -38,10 +38,6 @@ function header(reader::Reader)::Header return reader.header end -function BioCore.header(reader::Reader) - return header(reader) -end - function Base.eltype(::Type{Reader}) return Record end diff --git a/src/sam/record.jl b/src/sam/record.jl index 25d39d8..407fe2f 100644 --- a/src/sam/record.jl +++ b/src/sam/record.jl @@ -370,17 +370,17 @@ function hastemplength(record::Record) end """ - sequence(record::Record)::BioSequences.DNASequence + sequence(record::Record)::BioSequences.LongDNASeq Get the segment sequence of `record`. """ -function sequence(record::Record)::BioSequences.DNASequence +function sequence(record::Record)::BioSequences.LongDNASeq checkfilled(record) if ismissing(record, record.seq) missingerror(:sequence) end seqlen = length(record.seq) - ret = BioSequences.DNASequence(seqlen) + ret = BioSequences.LongDNASeq(seqlen) BioSequences.encode_copy!(ret, 1, record.data, first(record.seq), seqlen) return ret end diff --git a/src/sam/sam.jl b/src/sam/sam.jl index 2b46653..b917f37 100644 --- a/src/sam/sam.jl +++ b/src/sam/sam.jl @@ -3,12 +3,14 @@ module SAM +using BioCore + import Automa import Automa.RegExp: @re_str import BioAlignments import BioCore.Exceptions: missingerror import BioCore.RecordHelper: unsafe_parse_decimal -import BioCore: BioCore, isfilled +import BioCore: isfilled, header import BioSequences import BufferedStreams using Printf: @sprintf diff --git a/test/runtests.jl b/test/runtests.jl index 1182fb5..7594e1c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,13 +1,25 @@ using Test -using BioAlignments -using BioSymbols +using GenomicFeatures +using XAM +import BioAlignments: Alignment, AlignmentAnchor, OP_START, OP_MATCH, OP_DELETE import BGZFStreams: BGZFStream import BioCore.Exceptions: MissingFieldException import BioCore.Testing.get_bio_fmt_specimens import BioSequences: @dna_str, @aa_str -import GenomicFeatures import YAML +import BioCore: + header, + isfilled, + seqname, + hasseqname, + sequence, + hassequence, + leftposition, + rightposition, + hasleftposition, + hasrightposition + # Generate a random range within `range`. function randrange(range) x = rand(range) @@ -68,12 +80,12 @@ end record = SAM.Record() @test !isfilled(record) @test !SAM.ismapped(record) - @test repr(record) == "BioAlignments.SAM.Record: " + @test repr(record) == "XAM.SAM.Record: " @test_throws ArgumentError SAM.flag(record) record = SAM.Record("r001\t99\tchr1\t7\t30\t8M2I4M1D3M\t=\t37\t39\tTTAGATAAAGGATACTG\t*") @test isfilled(record) - @test occursin(r"^BioAlignments.SAM.Record:\n", repr(record)) + @test occursin(r"^XAM.SAM.Record:\n", repr(record)) @test SAM.ismapped(record) @test SAM.isprimary(record) @test SAM.hastempname(record) @@ -217,7 +229,7 @@ end @testset "Record" begin record = BAM.Record() @test !isfilled(record) - @test repr(record) == "BioAlignments.BAM.Record: " + @test repr(record) == "XAM.BAM.Record: " @test_throws ArgumentError BAM.flag(record) end @@ -225,7 +237,7 @@ end reader = open(BAM.Reader, joinpath(bamdir, "ce#1.bam")) @test isa(reader, BAM.Reader) @test eltype(reader) === BAM.Record - @test startswith(repr(reader), "BioAlignments.BAM.Reader{IOStream}:") + @test startswith(repr(reader), "XAM.BAM.Reader{IOStream}:") # header h = header(reader) From 2df146b8fc4578e9631def931bb6441575603b13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Sun, 2 Feb 2020 22:44:55 +1100 Subject: [PATCH 08/14] Use FormatSpecimens --- Project.toml | 4 ++-- test/runtests.jl | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Project.toml b/Project.toml index 2f31d67..18608d4 100644 --- a/Project.toml +++ b/Project.toml @@ -26,8 +26,8 @@ Indexes = "0.1" julia = "1.1" [extras] +FormatSpecimens = "3372ea36-2a1a-11e9-3eb7-996970b6ffbd" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6" [targets] -test = ["Test", "YAML"] +test = ["FormatSpecimens", "Test"] diff --git a/test/runtests.jl b/test/runtests.jl index 7594e1c..ffb2669 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,11 +2,11 @@ using Test using GenomicFeatures using XAM import BioAlignments: Alignment, AlignmentAnchor, OP_START, OP_MATCH, OP_DELETE +using FormatSpecimens import BGZFStreams: BGZFStream import BioCore.Exceptions: MissingFieldException -import BioCore.Testing.get_bio_fmt_specimens import BioSequences: @dna_str, @aa_str -import YAML + import BioCore: header, @@ -32,7 +32,7 @@ function randrange(range) end @testset "SAM" begin - samdir = joinpath(get_bio_fmt_specimens("master", false, true), "SAM") + samdir = path_of_format("SAM") @testset "MetaInfo" begin metainfo = SAM.MetaInfo() @@ -172,8 +172,8 @@ end end return true end - for specimen in YAML.load_file(joinpath(samdir, "index.yml")) - filepath = joinpath(samdir, specimen["filename"]) + for specimen in list_valid_specimens("SAM") + filepath = joinpath(samdir, filename(specimen)) mktemp() do path, io # copy reader = open(SAM.Reader, filepath) @@ -192,7 +192,7 @@ end end @testset "BAM" begin - bamdir = joinpath(get_bio_fmt_specimens("master", false), "BAM") + bamdir = path_of_format("BAM") @testset "AuxData" begin auxdata = BAM.AuxData(UInt8[]) @@ -370,11 +370,11 @@ end end @testset "Round trip" begin - for specimen in YAML.load_file(joinpath(bamdir, "index.yml")) - filepath = joinpath(bamdir, specimen["filename"]) + for specimen in list_valid_specimens("BAM") + filepath = joinpath(bamdir, filename(specimen)) mktemp() do path, _ # copy - if occursin("bai", get(specimen, "tags", "")) + if hastags(specimen) && in("bai", tags(specimen)) reader = open(BAM.Reader, filepath, index=filepath * ".bai") else reader = open(BAM.Reader, filepath) From 514e2c901d361f6b1815bf0777426a9c181a8c99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Tue, 18 Feb 2020 20:51:27 +1100 Subject: [PATCH 09/14] Add funding --- .github/FUNDING.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 .github/FUNDING.yml diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..53173f8 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1 @@ +open_collective: biojulia \ No newline at end of file From 4b0a8a6644bd456104b4e4d3e45a0e1f24940fdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Tue, 18 Feb 2020 20:52:16 +1100 Subject: [PATCH 10/14] Add TagBot workflow --- .github/workflows/TagBot.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 .github/workflows/TagBot.yml diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml new file mode 100644 index 0000000..e65374b --- /dev/null +++ b/.github/workflows/TagBot.yml @@ -0,0 +1,13 @@ +name: TagBot +on: + schedule: + - cron: '0 * * * *' +jobs: + TagBot: + runs-on: ubuntu-latest + steps: + - uses: JuliaRegistries/TagBot@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + ssh: ${{ secrets.TAGBOT_KEY }} + registry: BioJulia/BioJuliaRegistry \ No newline at end of file From bf835d9ff06609bdb554defc1c4f9115709598fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Sat, 18 Jan 2020 13:46:46 +1100 Subject: [PATCH 11/14] Add documentation workflow --- .github/workflows/Documentation.yml | 34 +++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 .github/workflows/Documentation.yml diff --git a/.github/workflows/Documentation.yml b/.github/workflows/Documentation.yml new file mode 100644 index 0000000..9610301 --- /dev/null +++ b/.github/workflows/Documentation.yml @@ -0,0 +1,34 @@ +name: Documentation + +on: + push: + branches: + - 'master' + - 'develop' + - 'release/.*' + tags: '*' + pull_request: + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + julia-version: [1.3.0] + julia-arch: [x86] + os: [ubuntu-latest] + steps: + - uses: actions/checkout@v1.0.0 + - uses: julia-actions/setup-julia@latest + with: + version: ${{ matrix.julia-version }} + - name: Install dependencies + run: | + julia ci_prep.jl; + julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()' + - name: Build and deploy + env: + # https://github.com/JuliaDocs/Documenter.jl/issues/1177 + # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # For authentication with GitHub Actions token + DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} # For authentication with SSH deploy key + run: julia --project=docs/ --color=yes docs/make.jl From 30a9911e58fc778509051472572a55671dc79ac7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Sun, 2 Feb 2020 20:55:31 +1100 Subject: [PATCH 12/14] Documentation roundup --- docs/src/man/hts-files.md | 313 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 313 insertions(+) create mode 100644 docs/src/man/hts-files.md diff --git a/docs/src/man/hts-files.md b/docs/src/man/hts-files.md new file mode 100644 index 0000000..81783a9 --- /dev/null +++ b/docs/src/man/hts-files.md @@ -0,0 +1,313 @@ +# SAM and BAM + + +## Introduction + +High-throughput sequencing (HTS) technologies generate a large amount of data in the form of a large number of nucleotide sequencing reads. +One of the most common tasks in bioinformatics is to align these reads against known reference genomes, chromosomes, or contigs. +BioAlignments provides several data formats commonly used for this kind of task. + +BioAlignments offers high-performance tools for SAM and BAM file formats, which are the most popular file formats. + +If you have questions about the SAM and BAM formats or any of the terminology used when discussing these formats, see the published [specification][samtools-spec], which is maintained by the [samtools group][samtools]. + +A very very simple SAM file looks like the following: + +``` +@HD VN:1.6 SO:coordinate +@SQ SN:ref LN:45 +r001 99 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTG * +r002 0 ref 9 30 3S6M1P1I4M * 0 0 AAAAGATAAGGATA * +r003 0 ref 9 30 5S6M * 0 0 GCCTAAGCTAA * SA:Z:ref,29,-,6H5M,17,0; +r004 0 ref 16 30 6M14N5M * 0 0 ATAGCTTCAGC * +r003 2064 ref 29 17 6H5M * 0 0 TAGGC * SA:Z:ref,9,+,5S6M,30,1; +r001 147 ref 37 30 9M = 7 -39 CAGCGGCAT * NM:i:1 +``` + +Where the first two lines are part of the "header", and the following lines are "records". +Each record describes how a read aligns to some reference sequence. +Sometimes one record describes one read, but there are other cases like chimeric reads and split alignments, where multiple records apply to one read. +In the example above, `r003` is a chimeric read, and `r004` is a split alignment, and `r001` are mate pair reads. +Again, we refer you to the official [specification][samtools-spec] for more details. + +A BAM file stores this same information but in a binary and compressible format that does not make for pretty printing here! + +## Reading SAM and BAM files + +A typical script iterating over all records in a file looks like below: + +```julia +using BioAlignments + +# Open a BAM file. +reader = open(BAM.Reader, "data.bam") + +# Iterate over BAM records. +for record in reader + # `record` is a BAM.Record object. + if BAM.ismapped(record) + # Print the mapped position. + println(BAM.refname(record), ':', BAM.position(record)) + end +end + +# Close the BAM file. +close(reader) +``` + +The size of a BAM file is often extremely large. +The iterator interface demonstrated above allocates an object for each record and that may be a bottleneck of reading data from a BAM file. +In-place reading reuses a pre-allocated object for every record and less memory allocation happens in reading: + +```julia +reader = open(BAM.Reader, "data.bam") +record = BAM.Record() +while !eof(reader) + read!(reader, record) + # do something +end +``` + +## SAM and BAM Headers + +Both `SAM.Reader` and `BAM.Reader` implement the `header` function, which returns a `SAM.Header` object. +To extract certain information out of the headers, you can use the `find` method on the header to extract information according to SAM/BAM tag. +Again we refer you to the [specification][samtools-spec] for full details of all the different tags that can occur in headers, and what they mean. + +Below is an example of extracting all the info about the reference sequences from the BAM header. +In SAM/BAM, any description of a reference sequence is stored in the header, under a tag denoted `SQ` (think `reference SeQuence`!). + +```jlcon +julia> reader = open(SAM.Reader, "data.sam"); + +julia> find(header(reader), "SQ") +7-element Array{Bio.Align.SAM.MetaInfo,1}: + Bio.Align.SAM.MetaInfo: + tag: SQ + value: SN=Chr1 LN=30427671 + Bio.Align.SAM.MetaInfo: + tag: SQ + value: SN=Chr2 LN=19698289 + Bio.Align.SAM.MetaInfo: + tag: SQ + value: SN=Chr3 LN=23459830 + Bio.Align.SAM.MetaInfo: + tag: SQ + value: SN=Chr4 LN=18585056 + Bio.Align.SAM.MetaInfo: + tag: SQ + value: SN=Chr5 LN=26975502 + Bio.Align.SAM.MetaInfo: + tag: SQ + value: SN=chloroplast LN=154478 + Bio.Align.SAM.MetaInfo: + tag: SQ + value: SN=mitochondria LN=366924 + +``` + +In the above we can see there were 7 sequences in the reference: 5 chromosomes, one chloroplast sequence, and one mitochondrial sequence. + +## SAM and BAM Records + +BioAlignments supports the following accessors for `SAM.Record` types. + +```@docs +XAM.SAM.flag +XAM.SAM.ismapped +XAM.SAM.isprimary +XAM.SAM.refname +XAM.SAM.position +XAM.SAM.rightposition +XAM.SAM.isnextmapped +XAM.SAM.nextrefname +XAM.SAM.nextposition +XAM.SAM.mappingquality +XAM.SAM.cigar +XAM.SAM.alignment +XAM.SAM.alignlength +XAM.SAM.tempname +XAM.SAM.templength +XAM.SAM.sequence +XAM.SAM.seqlength +XAM.SAM.quality +XAM.SAM.auxdata +``` + +BioAlignments supports the following accessors for `BAM.Record` types. + +```@docs +XAM.BAM.flag +XAM.BAM.ismapped +XAM.BAM.isprimary +XAM.BAM.refid +XAM.BAM.refname +XAM.BAM.reflen +XAM.BAM.position +XAM.BAM.rightposition +XAM.BAM.isnextmapped +XAM.BAM.nextrefid +XAM.BAM.nextrefname +XAM.BAM.nextposition +XAM.BAM.mappingquality +XAM.BAM.cigar +XAM.BAM.alignment +XAM.BAM.alignlength +XAM.BAM.tempname +XAM.BAM.templength +XAM.BAM.sequence +XAM.BAM.seqlength +XAM.BAM.quality +XAM.BAM.auxdata +``` + +## Accessing auxiliary data + +SAM and BAM records support the storing of optional data fields associated with tags. + +Tagged auxiliary data follows a format of `TAG:TYPE:VALUE`. +`TAG` is a two-letter string, and each tag can only appear once per record. +`TYPE` is a single case-sensetive letter which defined the format of `VALUE`. + +| Type | Description | +|------|-----------------------------------| +| 'A' | Printable character | +| 'i' | Signed integer | +| 'f' | Single-precision floating number | +| 'Z' | Printable string, including space | +| 'H' | Byte array in Hex format | +| 'B' | Integer of numeric array | + +For more information about these tags and their types we refer you to the [SAM/BAM specification][samtools-spec] and the additional [optional fields specification][samtags] document. + +There are some tags that are reserved, predefined standard tags, for specific uses. + +To access optional fields stored in tags, you use `getindex` indexing syntax on the record object. +Note that accessing optional tag fields will result in type instability in Julia. +This is because the type of the optional data is not known until run-time, as the tag is being read. +This can have a significant impact on performance. +To limit this, if the user knows the type of a value in advance, specifying it as a type annotation will alleviate the problem: + +Below is an example of looping over records in a bam file and using indexing syntax to get the data stored in the "NM" tag. +Note the `UInt8` type assertion to alleviate type instability. + +```julia +for record in open(BAM.Reader, "data.bam") + nm = record["NM"]::UInt8 + # do something +end +``` + +## Getting records in a range + +BioAlignments supports the BAI index to fetch records in a specific range from a BAM file. +[Samtools][samtools] provides `index` subcommand to create an index file (.bai) from a sorted BAM file. + +```console +$ samtools index -b SRR1238088.sort.bam +$ ls SRR1238088.sort.bam* +SRR1238088.sort.bam SRR1238088.sort.bam.bai +``` + +`eachoverlap(reader, chrom, range)` returns an iterator of BAM records overlapping the query interval: + +```julia +reader = open(BAM.Reader, "SRR1238088.sort.bam", index="SRR1238088.sort.bam.bai") +for record in eachoverlap(reader, "Chr2", 10000:11000) + # `record` is a BAM.Record object + # ... +end +close(reader) +``` + +## Getting records overlapping genomic features + +`eachoverlap` also accepts the `Interval` type defined in [GenomicFeatures.jl][genomicfeatures]. + +This allows you to do things like first read in the genomic features from a GFF3 file, and then for each feature, iterate over all the BAM records that overlap with that feature. + +```julia +# Load GFF3 module. +using GenomicFeatures +using BioAlignments + +# Load genomic features from a GFF3 file. +features = open(collect, GFF3.Reader, "TAIR10_GFF3_genes.gff") + +# Keep mRNA features. +filter!(x -> GFF3.featuretype(x) == "mRNA", features) + +# Open a BAM file and iterate over records overlapping mRNA transcripts. +reader = open(BAM.Reader, "SRR1238088.sort.bam", index = "SRR1238088.sort.bam.bai") +for feature in features + for record in eachoverlap(reader, feature) + # `record` overlaps `feature`. + # ... + end +end +close(reader) +``` + +## Writing files + +In order to write a BAM or SAM file, you must first create a `SAM.Header`. + +A `SAM.Header` is constructed from a vector of `SAM.MetaInfo` objects. + +For example, to create the following simple header: + +``` +@HD VN:1.6 SO:coordinate +@SQ SN:ref LN:45 +``` + +```julia +julia> a = SAM.MetaInfo("HD", ["VN" => 1.6, "SO" => "coordinate"]) +SAM.MetaInfo: + tag: HD + value: VN=1.6 SO=coordinate + +julia> b = SAM.MetaInfo("SQ", ["SN" => "ref", "LN" => 45]) +SAM.MetaInfo: + tag: SQ + value: SN=ref LN=45 + +julia> h = SAM.Header([a, b]) +SAM.Header(SAM.MetaInfo[SAM.MetaInfo: + tag: HD + value: VN=1.6 SO=coordinate, SAM.MetaInfo: + tag: SQ + value: SN=ref LN=45]) + +``` + +Then to create the writer for a SAM file, construct a `SAM.Writer` using the header and an `IO` type: + +```julia +julia> samw = SAM.Writer(open("my-data.sam", "w"), h) +SAM.Writer(IOStream()) + +``` + +To make a BAM Writer is slightly different, as you need to use a specific stream type from the [BGZFStreams][bgzfstreams] package: + +```julia +julia> using BGZFStreams + +julia> bamw = BAM.Writer(BGZFStream(open("my-data.bam", "w"), "w")) +BAM.Writer(BGZFStreams.BGZFStream{IOStream}()) + +``` + +Once you have a BAM or SAM writer, you can use the `write` method to write `BAM.Record`s or `SAM.Record`s to file: + +```julia +julia> write(bamw, rec) # Here rec is a `BAM.Record` +330780 +``` + +[samtools]: https://samtools.github.io/ +[samtools-spec]: https://samtools.github.io/hts-specs/SAMv1.pdf +[samtags]: https://samtools.github.io/hts-specs/SAMtags.pdf +[bgzfstreams]: https://github.com/BioJulia/BGZFStreams.jl +[genomicfeatures]: https://github.com/BioJulia/GenomicFeatures.jl From 1bcd7c344cfc5e031e11075e4ea81702d3c4fd56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Fri, 28 Feb 2020 14:29:53 +1100 Subject: [PATCH 13/14] Documentation - API autodocs. - Inline links for documenter. - Logo. - Badges. --- README.md | 77 ++++++++++++++++++++++++++++++++++++++- docs/Project.toml | 6 +++ docs/make.jl | 21 +++++++++++ docs/src/assets/logo.svg | 26 +++++++++++++ docs/src/index.md | 77 +++++++++++++++++++++++++++++++++++++++ docs/src/man/api.md | 38 +++++++++++++++++++ docs/src/man/hts-files.md | 43 +++++++++------------- 7 files changed, 260 insertions(+), 28 deletions(-) create mode 100644 docs/Project.toml create mode 100644 docs/make.jl create mode 100644 docs/src/assets/logo.svg create mode 100644 docs/src/index.md create mode 100644 docs/src/man/api.md diff --git a/README.md b/README.md index 645914e..45061cc 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,76 @@ -# XAM.jl +# XAM.jl -In development: minimal working separation of SAM and BAM from BioAlignments. +[![Project Status: WIP – Initial development is in progress, but there has not yet been a stable, usable release suitable for the public.](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip) +[![Latest Release](https://img.shields.io/github/release/BioJulia/XAM.jl.svg)](https://github.com/BioJulia/XAM.jl/releases/latest) +[![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/XAM.jl/blob/master/LICENSE) +[![Stable documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/XAM.jl/stable) +[![Latest documentation](https://img.shields.io/badge/docs-dev-blue.svg)](https://biojulia.github.io/XAM.jl/dev/) +[![Join the chat at https://gitter.im/BioJulia/XAM.jl](https://badges.gitter.im/BioJulia/XAM.jl.svg)](https://gitter.im/BioJulia/XAM.jl) + +> This project follows the [semver](http://semver.org) pro forma and uses the [git-flow branching model](https://nvie.com/posts/a-successful-git-branching-model/ "original +blog post"). + +## Description +XAM provides I/O and utilities for manipulating SAM and BAM formatted alignment map files. + +## Installation +The latest version of XAM is made available to install through BioJulia's package registry. +By default, Julia's package manager only includes the "General" package registry. + +To add the BioJulia registry from the [Julia REPL](https://docs.julialang.org/en/v1/manual/getting-started/), press `]` to enter [pkg mode](https://docs.julialang.org/en/v1/stdlib/Pkg/), then enter the following command: +```julia +registry add https://github.com/BioJulia/BioJuliaRegistry.git +``` + +Once the registry is added to your configuration, you can install XAM while in [pkg mode](https://docs.julialang.org/en/v1/stdlib/Pkg/) with the following: +```julia +add XAM +``` + +If you are interested in the cutting edge of the development, please check out the [develop branch](https://github.com/BioJulia/XAM.jl/tree/develop) to try new features before release. + +## Testing +XAM is tested against Julia `1.X` on Linux, OS X, and Windows. + +**Latest build status:** + +[![Unit tests](https://github.com/BioJulia/XAM.jl/workflows/Unit%20tests/badge.svg?branch=master)](https://github.com/BioJulia/XAM.jl/actions?query=workflow%3A%22Unit+tests%22+branch%3Amaster) +[![Documentation](https://github.com/BioJulia/XAM.jl/workflows/Documentation/badge.svg?branch=master)](https://github.com/BioJulia/XAM.jl/actions?query=workflow%3ADocumentation+branch%3Amaster) +[![codecov](https://codecov.io/gh/BioJulia/XAM.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/BioJulia/XAM.jl) + +## Contributing +We appreciate [contributions](https://github.com/BioJulia/XAM.jl/graphs/contributors) from users including reporting bugs, fixing issues, improving performance and adding new features. + +Take a look at the [contributing files](https://github.com/BioJulia/Contributing) detailed contributor and maintainer guidelines, and code of conduct. + +### Financial contributions +We also welcome financial contributions in full transparency on our [open collective](https://opencollective.com/biojulia). +Anyone can file an expense. +If the expense makes sense for the development the core contributors and the person who filed the expense will be reimbursed. + + +## Backers & Sponsors +Thank you to all our backers and sponsors! + +Love our work and community? [Become a backer](https://opencollective.com/biojulia#backer). + +[![backers](https://opencollective.com/biojulia/backers.svg?width=890)](https://opencollective.com/biojulia#backers) + +Does your company use BioJulia? +Help keep BioJulia feature rich and healthy by [sponsoring the project](https://opencollective.com/biojulia#sponsor). +Your logo will show up here with a link to your website. + +[![](https://opencollective.com/biojulia/sponsor/0/avatar.svg)](https://opencollective.com/biojulia/sponsor/0/website) +[![](https://opencollective.com/biojulia/sponsor/1/avatar.svg)](https://opencollective.com/biojulia/sponsor/1/website) +[![](https://opencollective.com/biojulia/sponsor/2/avatar.svg)](https://opencollective.com/biojulia/sponsor/2/website) +[![](https://opencollective.com/biojulia/sponsor/3/avatar.svg)](https://opencollective.com/biojulia/sponsor/3/website) +[![](https://opencollective.com/biojulia/sponsor/4/avatar.svg)](https://opencollective.com/biojulia/sponsor/4/website) +[![](https://opencollective.com/biojulia/sponsor/5/avatar.svg)](https://opencollective.com/biojulia/sponsor/5/website) +[![](https://opencollective.com/biojulia/sponsor/6/avatar.svg)](https://opencollective.com/biojulia/sponsor/6/website) +[![](https://opencollective.com/biojulia/sponsor/7/avatar.svg)](https://opencollective.com/biojulia/sponsor/7/website) +[![](https://opencollective.com/biojulia/sponsor/8/avatar.svg)](https://opencollective.com/biojulia/sponsor/8/website) +[![](https://opencollective.com/biojulia/sponsor/9/avatar.svg)](https://opencollective.com/biojulia/sponsor/9/website) + + +## Questions? +If you have a question about contributing or using BioJulia software, come on over and chat to us on [Gitter](https://gitter.im/BioJulia/General), or you can try the [Bio category of the Julia discourse site](https://discourse.julialang.org/c/domain/bio). diff --git a/docs/Project.toml b/docs/Project.toml new file mode 100644 index 0000000..3506869 --- /dev/null +++ b/docs/Project.toml @@ -0,0 +1,6 @@ +[deps] +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[compat] +Documenter = "0.24" diff --git a/docs/make.jl b/docs/make.jl new file mode 100644 index 0000000..fcbd601 --- /dev/null +++ b/docs/make.jl @@ -0,0 +1,21 @@ +using Pkg +using Documenter, XAM + +makedocs( + format = Documenter.HTML( + edit_link = "develop" + ), + modules = [XAM, XAM.SAM, XAM.BAM], + sitename = "XAM.jl", + pages = [ + "Home" => "index.md", + "SAM and BAM" => "man/hts-files.md", + "API Reference" => "man/api.md" + ], + authors = replace(join(Pkg.TOML.parsefile("Project.toml")["authors"], ", "), r" <.*?>" => "" ) * ", The BioJulia Organisation, and other contributors." +) +deploydocs( + repo = "github.com/BioJulia/XAM.jl.git", + devbranch = "develop", + push_preview = true +) diff --git a/docs/src/assets/logo.svg b/docs/src/assets/logo.svg new file mode 100644 index 0000000..1b2ea24 --- /dev/null +++ b/docs/src/assets/logo.svg @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/src/index.md b/docs/src/index.md new file mode 100644 index 0000000..97d0001 --- /dev/null +++ b/docs/src/index.md @@ -0,0 +1,77 @@ +# XAM.jl + +[![Project Status: WIP – Initial development is in progress, but there has not yet been a stable, usable release suitable for the public.](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip) +[![Latest Release](https://img.shields.io/github/release/BioJulia/XAM.jl.svg)](https://github.com/BioJulia/XAM.jl/releases/latest) +[![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/XAM.jl/blob/master/LICENSE) +[![Stable documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/XAM.jl/stable) +[![Latest documentation](https://img.shields.io/badge/docs-dev-blue.svg)](https://biojulia.github.io/XAM.jl/dev/) +[![Join the chat at https://gitter.im/BioJulia/XAM.jl](https://badges.gitter.im/BioJulia/XAM.jl.svg)](https://gitter.im/BioJulia/XAM.jl) + +> This project follows the [semver](http://semver.org) pro forma and uses the [git-flow branching model](https://nvie.com/posts/a-successful-git-branching-model/). + +## Description + +XAM provides I/O and utilities for manipulating SAM and BAM formatted alignment map files. + + +## Installation +The latest version of XAM is made available to install through BioJulia's package registry. +By default, Julia's package manager only includes the "General" package registry. + +To add the BioJulia registry from the [Julia REPL](https://docs.julialang.org/en/v1/manual/getting-started/), press `]` to enter [pkg mode](https://docs.julialang.org/en/v1/stdlib/Pkg/), then enter the following command: +```julia +registry add https://github.com/BioJulia/BioJuliaRegistry.git +``` + +Once the registry is added to your configuration, you can install XAM while in [pkg mode](https://docs.julialang.org/en/v1/stdlib/Pkg/) with the following: +```julia +add XAM +``` + +If you are interested in the cutting edge of the development, please check out the [develop branch](https://github.com/BioJulia/XAM.jl/tree/develop) to try new features before release. + +## Testing +XAM is tested against Julia `1.X` on Linux, OS X, and Windows. + +**Latest build status:** + +[![Unit tests](https://github.com/BioJulia/XAM.jl/workflows/Unit%20tests/badge.svg?branch=master)](https://github.com/BioJulia/XAM.jl/actions?query=workflow%3A%22Unit+tests%22+branch%3Amaster) +[![Documentation](https://github.com/BioJulia/XAM.jl/workflows/Documentation/badge.svg?branch=master)](https://github.com/BioJulia/XAM.jl/actions?query=workflow%3ADocumentation+branch%3Amaster) +[![codecov](https://codecov.io/gh/BioJulia/XAM.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/BioJulia/XAM.jl) + +## Contributing +We appreciate [contributions](https://github.com/BioJulia/XAM.jl/graphs/contributors) from users including reporting bugs, fixing issues, improving performance and adding new features. + +Take a look at the [contributing files](https://github.com/BioJulia/Contributing) detailed contributor and maintainer guidelines, and code of conduct. + +### Financial contributions +We also welcome financial contributions in full transparency on our [open collective](https://opencollective.com/biojulia). +Anyone can file an expense. +If the expense makes sense for the development the core contributors and the person who filed the expense will be reimbursed. + + +## Backers & Sponsors +Thank you to all our backers and sponsors! + +Love our work and community? [Become a backer](https://opencollective.com/biojulia#backer). + +[![backers](https://opencollective.com/biojulia/backers.svg?width=890)](https://opencollective.com/biojulia#backers) + +Does your company use BioJulia? +Help keep BioJulia feature rich and healthy by [sponsoring the project](https://opencollective.com/biojulia#sponsor). +Your logo will show up here with a link to your website. + +[![](https://opencollective.com/biojulia/sponsor/0/avatar.svg)](https://opencollective.com/biojulia/sponsor/0/website) +[![](https://opencollective.com/biojulia/sponsor/1/avatar.svg)](https://opencollective.com/biojulia/sponsor/1/website) +[![](https://opencollective.com/biojulia/sponsor/2/avatar.svg)](https://opencollective.com/biojulia/sponsor/2/website) +[![](https://opencollective.com/biojulia/sponsor/3/avatar.svg)](https://opencollective.com/biojulia/sponsor/3/website) +[![](https://opencollective.com/biojulia/sponsor/4/avatar.svg)](https://opencollective.com/biojulia/sponsor/4/website) +[![](https://opencollective.com/biojulia/sponsor/5/avatar.svg)](https://opencollective.com/biojulia/sponsor/5/website) +[![](https://opencollective.com/biojulia/sponsor/6/avatar.svg)](https://opencollective.com/biojulia/sponsor/6/website) +[![](https://opencollective.com/biojulia/sponsor/7/avatar.svg)](https://opencollective.com/biojulia/sponsor/7/website) +[![](https://opencollective.com/biojulia/sponsor/8/avatar.svg)](https://opencollective.com/biojulia/sponsor/8/website) +[![](https://opencollective.com/biojulia/sponsor/9/avatar.svg)](https://opencollective.com/biojulia/sponsor/9/website) + + +## Questions? +If you have a question about contributing or using BioJulia software, come on over and chat to us on [Gitter](https://gitter.im/BioJulia/General), or you can try the [Bio category of the Julia discourse site](https://discourse.julialang.org/c/domain/bio). diff --git a/docs/src/man/api.md b/docs/src/man/api.md new file mode 100644 index 0000000..d30bca2 --- /dev/null +++ b/docs/src/man/api.md @@ -0,0 +1,38 @@ +```@meta +CurrentModule = XAM +DocTestSetup = quote + using XAM +end +``` + +# API Reference + +## SAM API + +### Public + +```@autodocs +Modules = [XAM.SAM] +private = false +``` + +### Internal +```@autodocs +Modules = [XAM.SAM] +public = false +``` + +## BAM API + +### Public + +```@autodocs +Modules = [XAM.BAM] +private = false +``` + +### Internal +```@autodocs +Modules = [XAM.BAM] +public = false +``` diff --git a/docs/src/man/hts-files.md b/docs/src/man/hts-files.md index 81783a9..de87b32 100644 --- a/docs/src/man/hts-files.md +++ b/docs/src/man/hts-files.md @@ -1,15 +1,10 @@ # SAM and BAM - ## Introduction -High-throughput sequencing (HTS) technologies generate a large amount of data in the form of a large number of nucleotide sequencing reads. -One of the most common tasks in bioinformatics is to align these reads against known reference genomes, chromosomes, or contigs. -BioAlignments provides several data formats commonly used for this kind of task. +The `XAM` package offers high-performance tools for SAM and BAM file formats, which are the most popular file formats. -BioAlignments offers high-performance tools for SAM and BAM file formats, which are the most popular file formats. - -If you have questions about the SAM and BAM formats or any of the terminology used when discussing these formats, see the published [specification][samtools-spec], which is maintained by the [samtools group][samtools]. +If you have questions about the SAM and BAM formats or any of the terminology used when discussing these formats, see the published [specification](https://samtools.github.io/hts-specs/SAMv1.pdf), which is maintained by the [samtools group](https://samtools.github.io/). A very very simple SAM file looks like the following: @@ -28,7 +23,7 @@ Where the first two lines are part of the "header", and the following lines are Each record describes how a read aligns to some reference sequence. Sometimes one record describes one read, but there are other cases like chimeric reads and split alignments, where multiple records apply to one read. In the example above, `r003` is a chimeric read, and `r004` is a split alignment, and `r001` are mate pair reads. -Again, we refer you to the official [specification][samtools-spec] for more details. +Again, we refer you to the official [specification](https://samtools.github.io/hts-specs/SAMv1.pdf) for more details. A BAM file stores this same information but in a binary and compressible format that does not make for pretty printing here! @@ -37,7 +32,7 @@ A BAM file stores this same information but in a binary and compressible format A typical script iterating over all records in a file looks like below: ```julia -using BioAlignments +using XAM # Open a BAM file. reader = open(BAM.Reader, "data.bam") @@ -72,7 +67,7 @@ end Both `SAM.Reader` and `BAM.Reader` implement the `header` function, which returns a `SAM.Header` object. To extract certain information out of the headers, you can use the `find` method on the header to extract information according to SAM/BAM tag. -Again we refer you to the [specification][samtools-spec] for full details of all the different tags that can occur in headers, and what they mean. +Again we refer you to the [specification](https://samtools.github.io/hts-specs/SAMv1.pdf) for full details of all the different tags that can occur in headers, and what they mean. Below is an example of extracting all the info about the reference sequences from the BAM header. In SAM/BAM, any description of a reference sequence is stored in the header, under a tag denoted `SQ` (think `reference SeQuence`!). @@ -110,7 +105,8 @@ In the above we can see there were 7 sequences in the reference: 5 chromosomes, ## SAM and BAM Records -BioAlignments supports the following accessors for `SAM.Record` types. +### SAM.Record +The `XAM` package supports the following accessors for `SAM.Record` types. ```@docs XAM.SAM.flag @@ -134,7 +130,8 @@ XAM.SAM.quality XAM.SAM.auxdata ``` -BioAlignments supports the following accessors for `BAM.Record` types. +### BAM.Record +The `XAM` package supports the following accessors for `BAM.Record` types. ```@docs XAM.BAM.flag @@ -178,7 +175,7 @@ Tagged auxiliary data follows a format of `TAG:TYPE:VALUE`. | 'H' | Byte array in Hex format | | 'B' | Integer of numeric array | -For more information about these tags and their types we refer you to the [SAM/BAM specification][samtools-spec] and the additional [optional fields specification][samtags] document. +For more information about these tags and their types we refer you to the [SAM/BAM specification](https://samtools.github.io/hts-specs/SAMv1.pdf) and the additional [optional fields specification](https://samtools.github.io/hts-specs/SAMtags.pdf) document. There are some tags that are reserved, predefined standard tags, for specific uses. @@ -200,8 +197,8 @@ end ## Getting records in a range -BioAlignments supports the BAI index to fetch records in a specific range from a BAM file. -[Samtools][samtools] provides `index` subcommand to create an index file (.bai) from a sorted BAM file. +The `XAM` package supports the BAI index to fetch records in a specific range from a BAM file. +[Samtools](https://samtools.github.io/) provides `index` subcommand to create an index file (.bai) from a sorted BAM file. ```console $ samtools index -b SRR1238088.sort.bam @@ -209,7 +206,7 @@ $ ls SRR1238088.sort.bam* SRR1238088.sort.bam SRR1238088.sort.bam.bai ``` -`eachoverlap(reader, chrom, range)` returns an iterator of BAM records overlapping the query interval: +The method `eachoverlap(reader, chrom, range)` returns an iterator of BAM records overlapping the query interval: ```julia reader = open(BAM.Reader, "SRR1238088.sort.bam", index="SRR1238088.sort.bam.bai") @@ -222,14 +219,14 @@ close(reader) ## Getting records overlapping genomic features -`eachoverlap` also accepts the `Interval` type defined in [GenomicFeatures.jl][genomicfeatures]. +The `eachoverlap` method also accepts the `Interval` type defined in [GenomicFeatures.jl](https://github.com/BioJulia/GenomicFeatures.jl). This allows you to do things like first read in the genomic features from a GFF3 file, and then for each feature, iterate over all the BAM records that overlap with that feature. ```julia -# Load GFF3 module. using GenomicFeatures -using BioAlignments +using GFF3 +using XAM # Load genomic features from a GFF3 file. features = open(collect, GFF3.Reader, "TAIR10_GFF3_genes.gff") @@ -289,7 +286,7 @@ SAM.Writer(IOStream()) ``` -To make a BAM Writer is slightly different, as you need to use a specific stream type from the [BGZFStreams][bgzfstreams] package: +To make a BAM Writer is slightly different, as you need to use a specific stream type from the (https://github.com/BioJulia/BGZFStreams.jl)[https://github.com/BioJulia/BGZFStreams.jl] package: ```julia julia> using BGZFStreams @@ -305,9 +302,3 @@ Once you have a BAM or SAM writer, you can use the `write` method to write `BAM. julia> write(bamw, rec) # Here rec is a `BAM.Record` 330780 ``` - -[samtools]: https://samtools.github.io/ -[samtools-spec]: https://samtools.github.io/hts-specs/SAMv1.pdf -[samtags]: https://samtools.github.io/hts-specs/SAMtags.pdf -[bgzfstreams]: https://github.com/BioJulia/BGZFStreams.jl -[genomicfeatures]: https://github.com/BioJulia/GenomicFeatures.jl From 085a724b4fc06564abe66352f81b1248500f1189 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Fri, 28 Feb 2020 14:30:56 +1100 Subject: [PATCH 14/14] Update project status --- README.md | 2 +- docs/src/index.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 45061cc..850a435 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # XAM.jl -[![Project Status: WIP – Initial development is in progress, but there has not yet been a stable, usable release suitable for the public.](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip) +[![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) [![Latest Release](https://img.shields.io/github/release/BioJulia/XAM.jl.svg)](https://github.com/BioJulia/XAM.jl/releases/latest) [![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/XAM.jl/blob/master/LICENSE) [![Stable documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/XAM.jl/stable) diff --git a/docs/src/index.md b/docs/src/index.md index 97d0001..a3bbf39 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,6 +1,6 @@ # XAM.jl -[![Project Status: WIP – Initial development is in progress, but there has not yet been a stable, usable release suitable for the public.](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip) +[![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) [![Latest Release](https://img.shields.io/github/release/BioJulia/XAM.jl.svg)](https://github.com/BioJulia/XAM.jl/releases/latest) [![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/XAM.jl/blob/master/LICENSE) [![Stable documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/XAM.jl/stable)