1
0
Fork 0
mirror of https://github.com/MillironX/XAM.jl.git synced 2024-11-23 02:09:55 +00:00

Merge pull request #52 from BioJulia/release/0.3.0

Release v0.3.0
This commit is contained in:
Thomas A. Christensen II 2022-10-10 15:55:49 -05:00 committed by GitHub
commit 946e77a734
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
25 changed files with 323 additions and 237 deletions

View file

@ -1,17 +1,45 @@
name: CompatHelper name: CompatHelper
on: on:
schedule: schedule:
- cron: '0 0 * * *' - cron: 0 0 * * *
workflow_dispatch:
permissions:
contents: write
pull-requests: write
jobs: jobs:
CompatHelper: CompatHelper:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Add CompatHelper - name: Check if Julia is already available in the PATH
run: julia --color=yes -e 'using Pkg; Pkg.add("CompatHelper")' id: julia_in_path
- name: Run CompatHelper run: which julia
continue-on-error: true
- name: Install Julia, but only if it is not already available in the PATH
uses: julia-actions/setup-julia@v1
with:
version: '1'
arch: ${{ runner.arch }}
if: steps.julia_in_path.outcome != 'success'
- name: "Add the General registry via Git"
run: |
import Pkg
ENV["JULIA_PKG_SERVER"] = ""
Pkg.Registry.add("General")
shell: julia --color=yes {0}
- name: "Install CompatHelper"
run: |
import Pkg
name = "CompatHelper"
uuid = "aa819f21-2bde-4658-8897-bab36330d9b7"
version = "3"
Pkg.add(; name, uuid, version)
shell: julia --color=yes {0}
- name: "Run CompatHelper"
run: |
import CompatHelper
CompatHelper.main()
shell: julia --color=yes {0}
env: env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }} COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
run: julia --color=yes -e 'using CompatHelper; CompatHelper.main(master_branch = "master")' # COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }}

View file

@ -1,5 +1,4 @@
name: Build Documentation name: Documentation
on: on:
push: push:
branches: branches:
@ -10,17 +9,15 @@ on:
pull_request: pull_request:
jobs: jobs:
build: Documenter:
permissions:
contents: write
name: Documentation
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v2
- uses: julia-actions/setup-julia@v1 - uses: julia-actions/julia-buildpkg@v1
with: - uses: julia-actions/julia-docdeploy@v1
version: '1'
- name: Install Dependencies
run: julia --color=yes --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
- name: Build and Deploy
env: env:
# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # For authentication with GitHub Actions token GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} # For authentication with SSH deploy key DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
run: julia --color=yes --project=docs/ docs/make.jl

View file

@ -1,12 +1,22 @@
name: TagBot name: TagBot
on: on:
schedule: issue_comment:
- cron: '0 0 * * *' types:
- created
workflow_dispatch:
inputs:
lookback:
default: 3
permissions:
contents: write
jobs: jobs:
TagBot: TagBot:
if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot'
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: JuliaRegistries/TagBot@v1 - uses: JuliaRegistries/TagBot@v1
with: with:
token: ${{ secrets.GITHUB_TOKEN }} token: ${{ secrets.GITHUB_TOKEN }}
ssh: ${{ secrets.TAGBOT_KEY }} ssh: ${{ secrets.DOCUMENTER_KEY }}

View file

@ -6,16 +6,20 @@ on:
jobs: jobs:
test: test:
name: Julia ${{ matrix.julia-version }} - ${{ matrix.os }} - ${{ matrix.julia-arch }} - ${{ github.event_name }}
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
continue-on-error: ${{ matrix.experimental }} continue-on-error: ${{ matrix.experimental }}
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
julia-version: julia-version:
- '1.0' # LTS - '1.6' # LTS
- '1' - '1'
julia-arch: [x64, x86] julia-arch: [x64, x86]
os: [ubuntu-latest, windows-latest, macOS-latest] os: [ubuntu-latest, windows-latest, macOS-latest]
exclude:
- os: macOS-latest
julia-arch: x86
experimental: [false] experimental: [false]
include: include:
- julia-version: nightly - julia-version: nightly
@ -31,7 +35,7 @@ jobs:
with: with:
version: ${{ matrix.julia-version }} version: ${{ matrix.julia-version }}
- name: Run Tests - name: Run Tests
uses: julia-actions/julia-runtest@latest uses: julia-actions/julia-runtest@v1
- name: Create CodeCov - name: Create CodeCov
uses: julia-actions/julia-processcoverage@v1 uses: julia-actions/julia-processcoverage@v1
- name: Upload CodeCov - name: Upload CodeCov

27
CHANGELOG.md Normal file
View file

@ -0,0 +1,27 @@
# Changelog
All notable changes to HapLink.jl will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
## [0.3.0]
## Added
- Crosschecks for SAM and BAM ([#29](https://github.com/BioJulia/XAM.jl/pull/29))
- Improved documentation for flags ([#43](https://github.com/BioJulia/XAM.jl/pull/43))
### Changed
- `BAM.quality` performance improved ([#21](https://github.com/BioJulia/XAM.jl/issues/21))
- Updated BioAlignments to v2.2 and BioSequences to v3 ([#48](https://github.com/BioJulia/XAM.jl/pull/48))
### Fixed
- `BAM.Record` layout now matches the BAM specs ([#26](https://github.com/BioJulia/XAM.jl/pull/26))
[Unreleased]: https://github.com/BioJulia/XAM.jl/compare/v0.3.0...HEAD
[0.3.0]: https://github.com/BioJulia/XAM.jl/compare/v0.2.8...v0.3.0

View file

@ -1,7 +1,7 @@
name = "XAM" name = "XAM"
uuid = "d759349c-bcba-11e9-07c2-5b90f8f05f7c" uuid = "d759349c-bcba-11e9-07c2-5b90f8f05f7c"
authors = ["Kenta Sato <bicycle1885@gmail.com>", "Ben J. Ward <ward9250@gmail.com>", "Ciarán O'Mara <Ciaran.OMara@utas.edu.au>"] authors = ["Kenta Sato <bicycle1885@gmail.com>", "Ben J. Ward <ward9250@gmail.com>", "Ciarán O'Mara <Ciaran.OMara@utas.edu.au>"]
version = "0.2.8" version = "0.3.0"
[deps] [deps]
Automa = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b" Automa = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b"
@ -17,18 +17,19 @@ TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
[compat] [compat]
Automa = "0.7, 0.8" Automa = "0.7, 0.8"
BGZFStreams = "0.3.1" BGZFStreams = "0.3.1"
BioAlignments = "2" BioAlignments = "2.2"
BioGenerics = "0.1" BioGenerics = "0.1"
BioSequences = "2.0.4" BioSequences = "3"
FormatSpecimens = "1.1" FormatSpecimens = "1.1"
GenomicFeatures = "2" GenomicFeatures = "2"
Indexes = "0.1" Indexes = "0.1"
TranscodingStreams = "0.6, 0.7, 0.8, 0.9" TranscodingStreams = "0.6, 0.7, 0.8, 0.9"
julia = "1" julia = "1.6"
[extras] [extras]
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
FormatSpecimens = "3372ea36-2a1a-11e9-3eb7-996970b6ffbd" FormatSpecimens = "3372ea36-2a1a-11e9-3eb7-996970b6ffbd"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[targets] [targets]
test = ["FormatSpecimens", "Test"] test = ["Documenter", "FormatSpecimens", "Test"]

View file

@ -5,7 +5,6 @@
[![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/XAM.jl/blob/master/LICENSE) [![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/XAM.jl/blob/master/LICENSE)
[![Stable documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/XAM.jl/stable) [![Stable documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/XAM.jl/stable)
[![Latest documentation](https://img.shields.io/badge/docs-dev-blue.svg)](https://biojulia.github.io/XAM.jl/dev/) [![Latest documentation](https://img.shields.io/badge/docs-dev-blue.svg)](https://biojulia.github.io/XAM.jl/dev/)
[![Join the chat at https://gitter.im/BioJulia/XAM.jl](https://badges.gitter.im/BioJulia/XAM.jl.svg)](https://gitter.im/BioJulia/XAM.jl)
> This project follows the [semver](http://semver.org) pro forma and uses the [git-flow branching model](https://nvie.com/posts/a-successful-git-branching-model/ "original > This project follows the [semver](http://semver.org) pro forma and uses the [git-flow branching model](https://nvie.com/posts/a-successful-git-branching-model/ "original
blog post"). blog post").
@ -66,4 +65,4 @@ Your logo will show up here with a link to your website.
## Questions? ## Questions?
If you have a question about contributing or using BioJulia software, come on over and chat to us on [Gitter](https://gitter.im/BioJulia/General), or you can try the [Bio category of the Julia discourse site](https://discourse.julialang.org/c/domain/bio). If you have a question about contributing or using BioJulia software, come on over and chat to us on [the Julia Slack workspace](https://julialang.org/slack/), or you can try the [Bio category of the Julia discourse site](https://discourse.julialang.org/c/domain/bio).

View file

@ -3,4 +3,4 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
[compat] [compat]
Documenter = "0.24" Documenter = "0.27"

View file

@ -2,6 +2,8 @@ using Pkg
using Documenter, XAM using Documenter, XAM
makedocs( makedocs(
checkdocs = :all,
linkcheck = true,
format = Documenter.HTML( format = Documenter.HTML(
edit_link = "develop" edit_link = "develop"
), ),

View file

@ -5,7 +5,6 @@
[![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/XAM.jl/blob/master/LICENSE) [![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/XAM.jl/blob/master/LICENSE)
[![Stable documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/XAM.jl/stable) [![Stable documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/XAM.jl/stable)
[![Latest documentation](https://img.shields.io/badge/docs-dev-blue.svg)](https://biojulia.github.io/XAM.jl/dev/) [![Latest documentation](https://img.shields.io/badge/docs-dev-blue.svg)](https://biojulia.github.io/XAM.jl/dev/)
[![Join the chat at https://gitter.im/BioJulia/XAM.jl](https://badges.gitter.im/BioJulia/XAM.jl.svg)](https://gitter.im/BioJulia/XAM.jl)
> This project follows the [semver](http://semver.org) pro forma and uses the [git-flow branching model](https://nvie.com/posts/a-successful-git-branching-model/). > This project follows the [semver](http://semver.org) pro forma and uses the [git-flow branching model](https://nvie.com/posts/a-successful-git-branching-model/).
@ -65,4 +64,4 @@ Your logo will show up here with a link to your website.
## Questions? ## Questions?
If you have a question about contributing or using BioJulia software, come on over and chat to us on [Gitter](https://gitter.im/BioJulia/General), or you can try the [Bio category of the Julia discourse site](https://discourse.julialang.org/c/domain/bio). If you have a question about contributing or using BioJulia software, come on over and chat to us on [the Julia Slack workspace](https://julialang.org/slack/), or you can try the [Bio category of the Julia discourse site](https://discourse.julialang.org/c/domain/bio).

View file

@ -4,6 +4,27 @@ export
SAM, SAM,
BAM BAM
"""
flag(record::Union{SAM.Record, BAM.Record})::UInt16
Get the bitwise flags of `record`. The returned value is a `UInt16` of each flag
being OR'd together. The possible flags are:
0x0001 template having multiple segments in sequencing
0x0002 each segment properly aligned according to the aligner
0x0004 segment unmapped
0x0008 next segment in the template unmapped
0x0010 SEQ being reverse complemented
0x0020 SEQ of the next segment in the template being reverse complemented
0x0040 the first segment in the template
0x0080 the last segment in the template
0x0100 secondary alignment
0x0200 not passing filters, such as platform/vendor quality controls
0x0400 PCR or optical duplicate
0x0800 supplementary alignment
"""
function flag end
include("sam/sam.jl") include("sam/sam.jl")
include("bam/bam.jl") include("bam/bam.jl")

View file

@ -6,6 +6,7 @@ module BAM
using BioGenerics using BioGenerics
using GenomicFeatures using GenomicFeatures
using XAM.SAM using XAM.SAM
import ..XAM: flag
import BGZFStreams import BGZFStreams
import BioAlignments import BioAlignments

View file

@ -50,12 +50,15 @@ function Base.iterate(iter::OverlapIterator)
if refindex === nothing if refindex === nothing
throw(ArgumentError("sequence name $(iter.refname) is not found in the header")) throw(ArgumentError("sequence name $(iter.refname) is not found in the header"))
end end
@assert iter.reader.index !== nothing
@assert iter.reader.index !== nothing "Reader index cannot be nothing."
chunks = Indexes.overlapchunks(iter.reader.index.index, refindex, iter.interval) chunks = Indexes.overlapchunks(iter.reader.index.index, refindex, iter.interval)
if !isempty(chunks) if isempty(chunks)
seek(iter.reader, first(chunks).start) return nothing
end end
state = OverlapIteratorState(refindex, chunks, 1, Record()) state = OverlapIteratorState(refindex, chunks, 1, Record())
seek(iter.reader, state.chunks[state.chunkid].start)
return iterate(iter, state) return iterate(iter, state)
end end
@ -65,7 +68,7 @@ function Base.iterate(iter::OverlapIterator, state)
while BGZFStreams.virtualoffset(iter.reader.stream) < chunk.stop while BGZFStreams.virtualoffset(iter.reader.stream) < chunk.stop
read!(iter.reader, state.record) read!(iter.reader, state.record)
c = compare_intervals(state.record, (state.refindex, iter.interval)) c = compare_intervals(state.record, (state.refindex, iter.interval))
if c == 0 if c == 0 # overlapping
return copy(state.record), state return copy(state.record), state
end end
if c > 0 if c > 0
@ -78,6 +81,7 @@ function Base.iterate(iter::OverlapIterator, state)
seek(iter.reader, state.chunks[state.chunkid].start) seek(iter.reader, state.chunks[state.chunkid].start)
end end
end end
# no more overlapping records
return nothing return nothing
end end
@ -89,7 +93,7 @@ function compare_intervals(record::Record, interval::Tuple{Int,UnitRange{Int}})
return -1 return -1
end end
if rid > interval[1] || (rid == interval[1] && position(record) > last(interval[2])) if rid > interval[1] || (rid == interval[1] && leftposition(record) > last(interval[2]))
# strictly right # strictly right
return +1 return +1
end end

View file

@ -11,18 +11,21 @@ mutable struct Record
block_size::Int32 block_size::Int32
refid::Int32 refid::Int32
pos::Int32 pos::Int32
bin_mq_nl::UInt32 l_read_name::UInt8
flag_nc::UInt32 mapq::UInt8
bin::UInt16
n_cigar_op::UInt16
flag::UInt16
l_seq::Int32 l_seq::Int32
next_refid::Int32 next_refid::Int32
next_pos::Int32 next_pos::Int32
tlen::Int32 tlen::Int32
# variable length data # variable length data
data::Vector{UInt8} data::Vector{UInt8}
reader::Reader reader::Union{Reader, Nothing}
function Record() function Record()
return new(0, 0, 0, 0, 0, 0, 0, 0, 0, UInt8[]) return new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, UInt8[])
end end
end end
@ -49,8 +52,11 @@ function Base.:(==)(a::Record, b::Record)
return a.block_size == b.block_size && return a.block_size == b.block_size &&
a.refid == b.refid && a.refid == b.refid &&
a.pos == b.pos && a.pos == b.pos &&
a.bin_mq_nl == b.bin_mq_nl && a.l_read_name == b.l_read_name &&
a.flag_nc == b.flag_nc && a.mapq == b.mapq &&
a.bin == b.bin &&
a.n_cigar_op == b.n_cigar_op &&
a.flag == b.flag &&
a.l_seq == b.l_seq && a.l_seq == b.l_seq &&
a.next_refid == b.next_refid && a.next_refid == b.next_refid &&
a.next_pos == b.next_pos && a.next_pos == b.next_pos &&
@ -60,19 +66,13 @@ end
function Base.copy(record::Record) function Base.copy(record::Record)
copy = Record() copy = Record()
copy.block_size = record.block_size GC.@preserve copy record begin
copy.refid = record.refid dst_pointer = Ptr{UInt8}(pointer_from_objref(copy))
copy.pos = record.pos src_pointer = Ptr{UInt8}(pointer_from_objref(record))
copy.bin_mq_nl = record.bin_mq_nl unsafe_copyto!(dst_pointer, src_pointer, FIXED_FIELDS_BYTES)
copy.flag_nc = record.flag_nc
copy.l_seq = record.l_seq
copy.next_refid = record.next_refid
copy.next_pos = record.next_pos
copy.tlen = record.tlen
copy.data = record.data[1:data_size(record)]
if isdefined(record, :reader)
copy.reader = record.reader
end end
copy.data = record.data[1:data_size(record)]
copy.reader = record.reader
return copy return copy
end end
@ -80,8 +80,11 @@ function Base.empty!(record::Record)
record.block_size = 0 record.block_size = 0
record.refid = 0 record.refid = 0
record.pos = 0 record.pos = 0
record.bin_mq_nl = 0 record.l_read_name = 0
record.flag_nc = 0 record.mapq = 0
record.bin = 0
record.flag = 0
record.n_cigar_op = 0
record.l_seq = 0 record.l_seq = 0
record.next_refid = 0 record.next_refid = 0
record.next_pos = 0 record.next_pos = 0
@ -125,14 +128,9 @@ end
# Accessor Fuctions # Accessor Fuctions
# ----------------- # -----------------
"""
flag(record::Record)::UInt16
Get the bitwise flag of `record`.
"""
function flag(record::Record)::UInt16 function flag(record::Record)::UInt16
checkfilled(record) checkfilled(record)
return UInt16(record.flag_nc >> 16) return record.flag
end end
function hasflag(record::Record) function hasflag(record::Record)
@ -318,7 +316,7 @@ end
Get the mapping quality of `record`. Get the mapping quality of `record`.
""" """
function mappingquality(record::Record)::UInt8 function mappingquality(record::Record)::UInt8
return UInt8((record.bin_mq_nl >> 8) & 0xff) return record.mapq
end end
function hasmappingquality(record::Record) function hasmappingquality(record::Record)
@ -397,7 +395,7 @@ function extract_cigar_rle(data::Vector{UInt8}, offset, n)
end end
function cigar_position(record::Record, checkCG::Bool = true)::Tuple{Int, Int} function cigar_position(record::Record, checkCG::Bool = true)::Tuple{Int, Int}
cigaridx, nops = seqname_length(record) + 1, record.flag_nc & 0xFFFF cigaridx, nops = seqname_length(record) + 1, record.n_cigar_op
if !checkCG if !checkCG
return cigaridx, nops return cigaridx, nops
end end
@ -507,13 +505,16 @@ function hastemplength(record::Record)
end end
""" """
sequence(record::Record)::BioSequences.LongDNASeq sequence(record::Record)::BioSequences.LongDNA{4}
Get the segment sequence of `record`. Get the segment sequence of `record`.
""" """
function sequence(record::Record)::BioSequences.LongDNASeq function sequence(record::Record)
checkfilled(record) checkfilled(record)
seqlen = seqlength(record) seqlen = seqlength(record)
if seqlen == 0
return nothing
end
data = Vector{UInt64}(undef, cld(seqlen, 16)) data = Vector{UInt64}(undef, cld(seqlen, 16))
src::Ptr{UInt64} = pointer(record.data, seqname_length(record) + n_cigar_op(record, false) * 4 + 1) src::Ptr{UInt64} = pointer(record.data, seqname_length(record) + n_cigar_op(record, false) * 4 + 1)
for i in 1:lastindex(data) for i in 1:lastindex(data)
@ -521,7 +522,7 @@ function sequence(record::Record)::BioSequences.LongDNASeq
x = unsafe_load(src, i) x = unsafe_load(src, i)
data[i] = (x & 0x0f0f0f0f0f0f0f0f) << 4 | (x & 0xf0f0f0f0f0f0f0f0) >> 4 data[i] = (x & 0x0f0f0f0f0f0f0f0f) << 4 | (x & 0xf0f0f0f0f0f0f0f0) >> 4
end end
return BioSequences.LongDNASeq(data, 1:seqlen, false) return BioSequences.LongDNA{4}(data, UInt(seqlen))
end end
function hassequence(record::Record) function hassequence(record::Record)
@ -543,15 +544,15 @@ function hasseqlength(record::Record)
end end
""" """
quality(record::Record)::Vector{UInt8} quality(record::Record)
Get the base quality of `record`. Get the base quality of `record`.
""" """
function quality(record::Record)::Vector{UInt8} function quality(record::Record)
checkfilled(record) checkfilled(record)
seqlen = seqlength(record) seqlen = seqlength(record)
offset = seqname_length(record) + n_cigar_op(record, false) * 4 + cld(seqlen, 2) offset = seqname_length(record) + n_cigar_op(record, false) * 4 + cld(seqlen, 2)
return [reinterpret(Int8, record.data[i+offset]) for i in 1:seqlen] return record.data[(1+offset):(seqlen+offset)]
end end
function hasquality(record::Record) function hasquality(record::Record)
@ -661,5 +662,5 @@ end
# Return the length of the read name. # Return the length of the read name.
function seqname_length(record::Record) function seqname_length(record::Record)
return record.bin_mq_nl & 0xff return record.l_read_name
end end

View file

@ -37,7 +37,7 @@ function Base.write(writer::Writer, record::Record)
end end
function write_header(stream, header, refseqnames, refseqlens) function write_header(stream, header, refseqnames, refseqlens)
@assert length(refseqnames) == length(refseqlens) @assert length(refseqnames) == length(refseqlens) "Lengths of refseq names and lengths must match."
n = 0 n = 0
# magic bytes # magic bytes

View file

@ -16,11 +16,14 @@ for (name, bits, doc) in [
(:QCFAIL, UInt16(0x200), "QC failure" ), (:QCFAIL, UInt16(0x200), "QC failure" ),
(:DUP, UInt16(0x400), "optical or PCR duplicate" ), (:DUP, UInt16(0x400), "optical or PCR duplicate" ),
(:SUPPLEMENTARY, UInt16(0x800), "supplementary alignment" ),] (:SUPPLEMENTARY, UInt16(0x800), "supplementary alignment" ),]
@assert bits isa UInt16 @assert bits isa UInt16 "The bits must be of type UInt16."
sym = Symbol("FLAG_", name) sym = Symbol("FLAG_", name)
doc = string(@sprintf("0x%04x: ", bits), doc) docstring = """ $sym
SAM/BAM flag: $doc
See also: [`flag`](@ref)
"""
@eval begin @eval begin
@doc $(doc) const $(sym) = $(bits) @doc $(docstring) const $(sym) = $(bits)
end end
end end

View file

@ -38,7 +38,7 @@ function Base.iterate(header::Header, i=1)
end end
""" """
find(header::Header, key::AbstractString)::Vector{MetaInfo} findall(header::Header, key::AbstractString)::Vector{MetaInfo}
Find metainfo objects satisfying `SAM.tag(metainfo) == key`. Find metainfo objects satisfying `SAM.tag(metainfo) == key`.
""" """

View file

@ -10,12 +10,16 @@ function Reader(state::State{S}) where {S <: TranscodingStream}
rdr = Reader(state, Header()) rdr = Reader(state, Header())
cs, ln, f = readheader!(rdr.state.stream, rdr.header, (sam_machine_header.start_state, rdr.state.linenum)) cs, ln = readheader!(rdr.state.stream, rdr.header, (sam_machine_header.start_state, rdr.state.linenum))
rdr.state.state = sam_machine_body.start_state rdr.state.state = sam_machine_body.start_state # Get the reader ready to read the body.
rdr.state.linenum = ln rdr.state.linenum = ln
rdr.state.filled = false rdr.state.filled = false
if cs != -1 && cs != 0 #Note: the header is finished when the state machine fails to transition after a new line (state 1).
throw(ArgumentError("Malformed SAM file header at line $(ln). Machine failed to transition from state $(cs)."))
end
return rdr return rdr
end end
@ -65,18 +69,18 @@ end
function index!(record::MetaInfo) function index!(record::MetaInfo)
stream = TranscodingStreams.NoopStream(IOBuffer(record.data)) stream = TranscodingStreams.NoopStream(IOBuffer(record.data))
found = index!(stream, record) cs = index!(stream, record)
if !found if cs != 0
throw(ArgumentError("invalid SAM metadata")) throw(ArgumentError("Invalid SAM metadata. Machine failed to transition from state $(cs)."))
end end
return record return record
end end
function index!(record::Record) function index!(record::Record)
stream = TranscodingStreams.NoopStream(IOBuffer(record.data)) stream = TranscodingStreams.NoopStream(IOBuffer(record.data))
found = index!(stream, record) cs = index!(stream, record)
if !found if cs != 0
throw(ArgumentError("invalid SAM record")) throw(ArgumentError("Invalid SAM record. Machine failed to transition from state $(cs)."))
end end
return record return record
end end
@ -110,5 +114,6 @@ function Base.read!(rdr::Reader, record::Record)
throw(EOFError()) throw(EOFError())
end end
throw(ArgumentError("malformed SAM file")) throw(ArgumentError("Malformed SAM file record at line $(ln). Machine failed to transition from state $(cs)."))
end end

View file

@ -41,7 +41,7 @@ const sam_machine_metainfo, sam_machine_record, sam_machine_header, sam_machine_
co.actions[:enter] = [:pos1] co.actions[:enter] = [:pos1]
co.actions[:exit] = [:metainfo_tag] co.actions[:exit] = [:metainfo_tag]
comment = re"[^\r\n]*" comment = re"[^\r\n]*" # Note: Only single line comments are allowed.
comment.actions[:enter] = [:pos1] comment.actions[:enter] = [:pos1]
comment.actions[:exit] = [:metainfo_val] comment.actions[:exit] = [:metainfo_val]
@ -137,7 +137,7 @@ const sam_machine_metainfo, sam_machine_record, sam_machine_header, sam_machine_
header = rep(cat(metainfo, newline)) header = rep(cat(metainfo, newline))
header.actions[:exit] = [:header] header.actions[:exit] = [:header]
body = record * rep(newline * record) * opt(newline) body = rep(cat(record, newline))
body.actions[:exit] = [:body] body.actions[:exit] = [:body]
sam = cat(header, body) sam = cat(header, body)
@ -168,13 +168,6 @@ function appendfrom!(dst, dpos, src, spos, n)
return dst return dst
end end
const action_metainfo = quote
appendfrom!(metainfo.data, 1, data, @markpos, p-@markpos)
metainfo.filled = 1:(p-@markpos)
found_metainfo = true
end
const sam_actions_metainfo = Dict( const sam_actions_metainfo = Dict(
:mark => :(@mark), :mark => :(@mark),
:pos1 => :(pos1 = @relpos(p)), :pos1 => :(pos1 = @relpos(p)),
@ -183,7 +176,10 @@ const sam_actions_metainfo = Dict(
:metainfo_val => :(metainfo.val = pos1:@relpos(p-1)), :metainfo_val => :(metainfo.val = pos1:@relpos(p-1)),
:metainfo_dict_key => :(push!(metainfo.dictkey, pos2:@relpos(p-1))), :metainfo_dict_key => :(push!(metainfo.dictkey, pos2:@relpos(p-1))),
:metainfo_dict_val => :(push!(metainfo.dictval, pos2:@relpos(p-1))), :metainfo_dict_val => :(push!(metainfo.dictval, pos2:@relpos(p-1))),
:metainfo => action_metainfo :metainfo => quote
appendfrom!(metainfo.data, 1, data, @markpos, p-@markpos)
metainfo.filled = 1:(p-@markpos)
end
) )
const sam_actions_header = merge( const sam_actions_header = merge(
@ -191,16 +187,11 @@ const sam_actions_header = merge(
Dict( Dict(
:countline => :(linenum += 1), :countline => :(linenum += 1),
:metainfo => quote :metainfo => quote
$(action_metainfo) $(sam_actions_metainfo[:metainfo])
push!(header, metainfo) push!(header, metainfo)
metainfo = MetaInfo() metainfo = MetaInfo()
end, end,
:header => quote :header => :(@escape)
finish_header = true
@escape
end
) )
) )
@ -222,9 +213,6 @@ const sam_actions_record = Dict(
:record => quote :record => quote
appendfrom!(record.data, 1, data, @markpos, p-@markpos) appendfrom!(record.data, 1, data, @markpos, p-@markpos)
record.filled = 1:(p-@markpos) record.filled = 1:(p-@markpos)
found_record = true
@escape
end end
) )
@ -232,18 +220,15 @@ const sam_actions_body = merge(
sam_actions_record, sam_actions_record,
Dict( Dict(
:countline => :(linenum += 1), :countline => :(linenum += 1),
:body => quote :record => quote
finish_body = true found_record = true
$(sam_actions_record[:record])
@escape @escape
end end,
:body => :(@escape)
) )
) )
# const sam_actions = merge(
# sam_actions_header,
# sam_actions_body
# )
const sam_context = Automa.CodeGenContext( const sam_context = Automa.CodeGenContext(
generator = :goto, generator = :goto,
checkbounds = false, checkbounds = false,
@ -253,43 +238,24 @@ const sam_context = Automa.CodeGenContext(
const sam_initcode_metainfo = quote const sam_initcode_metainfo = quote
pos1 = 0 pos1 = 0
pos2 = 0 pos2 = 0
found_metainfo = false
end
const sam_initcode_record = quote
pos = 0
found_record = false
end end
const sam_initcode_header = quote const sam_initcode_header = quote
$(sam_initcode_metainfo) $(sam_initcode_metainfo)
metainfo = MetaInfo() metainfo = MetaInfo()
finish_header = false
cs, linenum = state cs, linenum = state
end end
const sam_initcode_record = quote
pos = 0
end
const sam_initcode_body = quote const sam_initcode_body = quote
$(sam_initcode_record) $(sam_initcode_record)
finish_body = false found_record = false
cs, linenum = state cs, linenum = state
end end
const sam_loopcode_metainfo = quote
if cs < 0
throw(ArgumentError("malformed metainfo at pos $(p)"))
end
if found_metainfo
@goto __return__
end
end
const sam_returncode_metainfo = quote
return found_metainfo
end
Automa.Stream.generate_reader( Automa.Stream.generate_reader(
:index!, :index!,
sam_machine_metainfo, sam_machine_metainfo,
@ -297,19 +263,10 @@ Automa.Stream.generate_reader(
actions = sam_actions_metainfo, actions = sam_actions_metainfo,
context = sam_context, context = sam_context,
initcode = sam_initcode_metainfo, initcode = sam_initcode_metainfo,
loopcode = sam_loopcode_metainfo,
returncode = sam_returncode_metainfo
) |> eval ) |> eval
const sam_loopcode_header = quote
if finish_header
@goto __return__
end
end
const sam_returncode_header = quote const sam_returncode_header = quote
return cs, linenum, finish_header return cs, linenum
end end
Automa.Stream.generate_reader( Automa.Stream.generate_reader(
@ -319,38 +276,13 @@ Automa.Stream.generate_reader(
actions = sam_actions_header, actions = sam_actions_header,
context = sam_context, context = sam_context,
initcode = sam_initcode_header, initcode = sam_initcode_header,
loopcode = sam_loopcode_header,
returncode = sam_returncode_header returncode = sam_returncode_header
) |> eval ) |> eval
const sam_loopcode_body = quote
const sam_loopcode_record = quote
if cs < 0
throw(ArgumentError("malformed SAM record at position $(p), line $(linenum)"))
end
# # if cs != 0
# # throw(ArgumentError(string("failed to index ", $(record_type), " ~>", repr(String(data[p:min(p+7,p_end)])))))
# # end
if found_record if found_record
@goto __return__ @goto __return__
end end
end
const sam_loopcode_body = quote
$(sam_loopcode_record)
if finish_body
@goto __return__
end
end
const sam_returncode_record = quote
return found_record
end end
Automa.Stream.generate_reader( Automa.Stream.generate_reader(
@ -360,12 +292,9 @@ Automa.Stream.generate_reader(
actions = sam_actions_record, actions = sam_actions_record,
context = sam_context, context = sam_context,
initcode = sam_initcode_record, initcode = sam_initcode_record,
loopcode = sam_loopcode_record,
returncode = sam_returncode_record
) |> eval ) |> eval
const sam_returncode_body = quote const sam_returncode_body = quote
return cs, linenum, found_record return cs, linenum, found_record
end end

View file

@ -2,10 +2,11 @@
# ========== # ==========
mutable struct Record mutable struct Record
# data and filled range # Data and filled range.
data::Vector{UInt8} data::Vector{UInt8}
filled::UnitRange{Int} filled::UnitRange{Int} # Note: Specifies the data in use.
# indexes
# Mandatory fields.
qname::UnitRange{Int} qname::UnitRange{Int}
flag::UnitRange{Int} flag::UnitRange{Int}
rname::UnitRange{Int} rname::UnitRange{Int}
@ -17,6 +18,8 @@ mutable struct Record
tlen::UnitRange{Int} tlen::UnitRange{Int}
seq::UnitRange{Int} seq::UnitRange{Int}
qual::UnitRange{Int} qual::UnitRange{Int}
# Auxiliary fields.
fields::Vector{UnitRange{Int}} fields::Vector{UnitRange{Int}}
end end
@ -147,11 +150,6 @@ end
# Accessor Functions # Accessor Functions
# ------------------ # ------------------
"""
flag(record::Record)::UInt16
Get the bitwise flag of `record`.
"""
function flag(record::Record)::UInt16 function flag(record::Record)::UInt16
checkfilled(record) checkfilled(record)
return unsafe_parse_decimal(UInt16, record.data, record.flag) return unsafe_parse_decimal(UInt16, record.data, record.flag)
@ -206,9 +204,9 @@ Get the 1-based leftmost mapping position of `record`.
function position(record::Record)::Int function position(record::Record)::Int
checkfilled(record) checkfilled(record)
pos = unsafe_parse_decimal(Int, record.data, record.pos) pos = unsafe_parse_decimal(Int, record.data, record.pos)
if pos == 0 # if pos == 0
missingerror(:position) # missingerror(:position)
end # end
return pos return pos
end end
@ -263,9 +261,9 @@ Get the position of the mate/next read of `record`.
function nextposition(record::Record)::Int function nextposition(record::Record)::Int
checkfilled(record) checkfilled(record)
pos = unsafe_parse_decimal(Int, record.data, record.pnext) pos = unsafe_parse_decimal(Int, record.data, record.pnext)
if pos == 0 # if pos == 0
missingerror(:nextposition) # missingerror(:nextposition)
end # end
return pos return pos
end end
@ -299,7 +297,8 @@ Get the CIGAR string of `record`.
function cigar(record::Record)::String function cigar(record::Record)::String
checkfilled(record) checkfilled(record)
if ismissing(record, record.cigar) if ismissing(record, record.cigar)
missingerror(:cigar) # missingerror(:cigar)
return ""
end end
return String(record.data[record.cigar]) return String(record.data[record.cigar])
end end
@ -377,9 +376,9 @@ Get the template length of `record`.
function templength(record::Record)::Int function templength(record::Record)::Int
checkfilled(record) checkfilled(record)
len = unsafe_parse_decimal(Int, record.data, record.tlen) len = unsafe_parse_decimal(Int, record.data, record.tlen)
if len == 0 # if len == 0
missingerror(:tlen) # missingerror(:tlen)
end # end
return len return len
end end
@ -388,17 +387,18 @@ function hastemplength(record::Record)
end end
""" """
sequence(record::Record)::BioSequences.LongDNASeq sequence(record::Record)::BioSequences.LongDNA{4}
Get the segment sequence of `record`. Get the segment sequence of `record`.
""" """
function sequence(record::Record)::BioSequences.LongDNASeq function sequence(record::Record)
checkfilled(record) checkfilled(record)
if ismissing(record, record.seq) if ismissing(record, record.seq)
missingerror(:sequence) # missingerror(:sequence)
return nothing
end end
seqlen = length(record.seq) seqlen = length(record.seq)
ret = BioSequences.LongDNASeq(seqlen) ret = BioSequences.LongDNA{4}(undef, seqlen)
copyto!(ret, 1, record.data, first(record.seq), seqlen) copyto!(ret, 1, record.data, first(record.seq), seqlen)
return ret return ret
end end
@ -494,7 +494,7 @@ function Base.getindex(record::Record, tag::AbstractString)
end end
if typ == UInt8('A') if typ == UInt8('A')
@assert lo == hi @assert lo == hi "Values lo and hi must be equivalent."
return Char(record.data[lo]) return Char(record.data[lo])
end end
if typ == UInt8('i') if typ == UInt8('i')

View file

@ -11,6 +11,7 @@ import BioGenerics.Exceptions: missingerror
import BioGenerics.Automa: State import BioGenerics.Automa: State
import BioSequences import BioSequences
import TranscodingStreams: TranscodingStreams, TranscodingStream import TranscodingStreams: TranscodingStreams, TranscodingStream
import ..XAM: flag
using Printf: @sprintf using Printf: @sprintf
@ -46,20 +47,6 @@ function unsafe_parse_decimal(::Type{T}, data::Vector{UInt8}, range::UnitRange{I
return sign * x return sign * x
end end
# #TODO: update BioCore.Ragel.State (will likely change with TrnscodingStreams).
# import BufferedStreams: BufferedStreams, BufferedInputStream
# # A type keeping track of a ragel-based parser's state.
# mutable struct State{T<:BufferedInputStream}
# stream::T # input stream
# cs::Int # current DFA state of Ragel
# linenum::Int # line number: parser is responsible for updating this
# finished::Bool # true if finished (regardless of where in the stream we are)
# end
# function State(initstate::Int, input::BufferedInputStream)
# return State(input, initstate, 1, false)
# end
include("flags.jl") include("flags.jl")
include("metainfo.jl") include("metainfo.jl")

View file

@ -1,4 +1,5 @@
using Test using Test
using Documenter
using BioGenerics using BioGenerics
using FormatSpecimens using FormatSpecimens
@ -23,6 +24,13 @@ function randrange(range)
end end
include("test_sam.jl") @testset "XAM" begin
include("test_bam.jl") include("test_sam.jl")
include("test_issues.jl") include("test_bam.jl")
include("test_issues.jl")
include("test_crosscheck.jl")
# Include doctests.
DocMeta.setdocmeta!(XAM, :DocTestSetup, :(using XAM); recursive=true)
doctest(XAM; manual = false)
end

View file

@ -104,9 +104,12 @@
unsafe_copyto!(array, BAM.FIXED_FIELDS_BYTES + 1, record.data, 1, dsize) unsafe_copyto!(array, BAM.FIXED_FIELDS_BYTES + 1, record.data, 1, dsize)
end end
new_record = convert(BAM.Record, array) new_record = convert(BAM.Record, array)
@test record.bin_mq_nl == new_record.bin_mq_nl @test record.l_read_name == new_record.l_read_name
@test record.mapq == new_record.mapq
@test record.bin == new_record.bin
@test record.block_size == new_record.block_size @test record.block_size == new_record.block_size
@test record.flag_nc == new_record.flag_nc @test record.flag == new_record.flag
@test record.n_cigar_op == new_record.n_cigar_op
@test record.l_seq == new_record.l_seq @test record.l_seq == new_record.l_seq
@test record.next_refid == new_record.next_refid @test record.next_refid == new_record.next_refid
@test record.next_pos == new_record.next_pos @test record.next_pos == new_record.next_pos
@ -162,18 +165,21 @@
if length(xs) != length(ys) if length(xs) != length(ys)
return false return false
end end
for (x, y) in zip(xs, ys) for (a, b) in zip(xs, ys)
if !( if !(
x.block_size == y.block_size && a.block_size == b.block_size &&
x.refid == y.refid && a.refid == b.refid &&
x.pos == y.pos && a.pos == b.pos &&
x.bin_mq_nl == y.bin_mq_nl && a.l_read_name == b.l_read_name &&
x.flag_nc == y.flag_nc && a.mapq == b.mapq &&
x.l_seq == y.l_seq && a.bin == b.bin &&
x.next_refid == y.next_refid && a.n_cigar_op == b.n_cigar_op &&
x.next_pos == y.next_pos && a.flag == b.flag &&
x.tlen == y.tlen && a.l_seq == b.l_seq &&
x.data[1:BAM.data_size(x)] == y.data[1:BAM.data_size(y)]) a.next_refid == b.next_refid &&
a.next_pos == b.next_pos &&
a.tlen == b.tlen &&
a.data[1:BAM.data_size(a)] == b.data[1:BAM.data_size(b)])
return false return false
end end
end end

54
test/test_crosscheck.jl Normal file
View file

@ -0,0 +1,54 @@
@testset "Cross Check Properties" begin
Broadcast.broadcastable(x::XAM.BAM.Record) = Ref(x) #TODO: consider moving to XAM.jl.
Broadcast.broadcastable(x::XAM.SAM.Record) = Ref(x) #TODO: consider moving to XAM.jl
function crosscheck(bam::BAM.Record, sam::SAM.Record, property::Symbol)
bam_property = getproperty(XAM.BAM, property)
sam_property = getproperty(XAM.SAM, property)
if bam_property(bam) != sam_property(sam)
@warn "$property result is not the same" bam_property(bam) sam_property(sam)
return false
end
return true
end
samdir = path_of_format("SAM")
bamdir = path_of_format("BAM")
filenames = [
"ce#1",
"ce#2",
"ce#5",
"ce#5b",
"ce#unmap",
"ce#unmap1",
"ce#unmap2",
]
properties = [
:position,# POS
:tempname,# QNAME
:mappingquality,# MAPQ
:cigar, # CIGAR
:flag, # FLAG
:sequence, # SEQ
:nextposition, # PNEXT
:templength, # TLEN
]
for filename in filenames
records_bam = collect(open(BAM.Reader, joinpath(bamdir, filename * ".bam")))
records_sam = collect(open(SAM.Reader, joinpath(samdir, filename * ".sam")))
for (bam, sam) in zip(records_bam, records_sam)
@test all(crosscheck.(bam, sam, properties)) == true
end
end
end # testset Crosscheck