1
0
Fork 0
mirror of https://github.com/MillironX/XAM.jl.git synced 2024-12-23 13:28:16 +00:00

Naive code roundup

This commit is contained in:
Ciarán O'Mara 2020-01-18 05:24:09 +11:00
parent 282d80a758
commit 7a56931b90
16 changed files with 2864 additions and 1 deletions

View file

@ -1,5 +1,10 @@
module XAM
greet() = print("Hello World!")
export
BAM,
SAM
include("sam/sam.jl")
include("bam/bam.jl")
end # module

153
src/bam/auxdata.jl Normal file
View file

@ -0,0 +1,153 @@
# BAM Auxiliary Data
# ==================
struct AuxData <: AbstractDict{String,Any}
data::Vector{UInt8}
end
function Base.getindex(aux::AuxData, tag::AbstractString)
checkauxtag(tag)
return getauxvalue(aux.data, 1, length(aux.data), UInt8(tag[1]), UInt8(tag[2]))
end
function Base.length(aux::AuxData)
data = aux.data
p = 1
len = 0
while p length(data)
len += 1
p = next_tag_position(data, p)
end
return len
end
function Base.iterate(aux::AuxData, pos=1)
if pos > length(aux.data)
return nothing
end
data = aux.data
@label doit
t1 = data[pos]
t2 = data[pos+1]
pos, typ = loadauxtype(data, pos + 2)
pos, value = loadauxvalue(data, pos, typ)
if t1 == t2 == 0xff
@goto doit
end
return Pair{String,Any}(String([t1, t2]), value), pos
end
# Internals
# ---------
function checkauxtag(tag::AbstractString)
if sizeof(tag) != 2
throw(ArgumentError("tag length must be 2"))
end
end
function getauxvalue(data::Vector{UInt8}, start::Int, stop::Int, t1::UInt8, t2::UInt8)
pos = findauxtag(data, start, stop, t1, t2)
if pos == 0
throw(KeyError(String([t1, t2])))
end
pos, T = loadauxtype(data, pos + 2)
_, val = loadauxvalue(data, pos, T)
return val
end
function loadauxtype(data::Vector{UInt8}, p::Int)
function auxtype(b)
return (
b == UInt8('A') ? Char :
b == UInt8('c') ? Int8 :
b == UInt8('C') ? UInt8 :
b == UInt8('s') ? Int16 :
b == UInt8('S') ? UInt16 :
b == UInt8('i') ? Int32 :
b == UInt8('I') ? UInt32 :
b == UInt8('f') ? Float32 :
b == UInt8('Z') ? String :
error("invalid type tag: '$(Char(b))'"))
end
t = data[p]
if t == UInt8('B')
return p + 2, Vector{auxtype(data[p+1])}
else
return p + 1, auxtype(t)
end
end
function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{T}) where T
return p + sizeof(T), unsafe_load(Ptr{T}(pointer(data, p)))
end
function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{Char})
return p + 1, Char(unsafe_load(pointer(data, p)))
end
function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{Vector{T}}) where T
n = unsafe_load(Ptr{Int32}(pointer(data, p)))
p += 4
xs = Vector{T}(undef, n)
unsafe_copyto!(pointer(xs), Ptr{T}(pointer(data, p)), n)
return p + n * sizeof(T), xs
end
function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{String})
dataptr = pointer(data, p)
endptr = ccall(:memchr, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), dataptr, '\0', length(data) - p + 1)
q::Int = p + (endptr - dataptr) - 1
return q + 2, String(data[p:q])
end
function findauxtag(data::Vector{UInt8}, start::Int, stop::Int, t1::UInt8, t2::UInt8)
pos = start
while pos stop && !(data[pos] == t1 && data[pos+1] == t2)
pos = next_tag_position(data, pos)
end
if pos > stop
return 0
else
return pos
end
end
# Find the starting position of a next tag in `data` after `p`.
# `(data[p], data[p+1])` is supposed to be a current tag.
function next_tag_position(data::Vector{UInt8}, p::Int)
typ = Char(data[p+2])
p += 3
if typ == 'A'
p += 1
elseif typ == 'c' || typ == 'C'
p += 1
elseif typ == 's' || typ == 'S'
p += 2
elseif typ == 'i' || typ == 'I'
p += 4
elseif typ == 'f'
p += 4
elseif typ == 'd'
p += 8
elseif typ == 'Z' || typ == 'H'
while data[p] != 0x00 # NULL-terminalted string
p += 1
end
p += 1
elseif typ == 'B'
eltyp = Char(data[p])
elsize = eltyp == 'c' || eltyp == 'C' ? 1 :
eltyp == 's' || eltyp == 'S' ? 2 :
eltyp == 'i' || eltye == 'I' || eltyp == 'f' ? 4 :
error("invalid type tag: '$(Char(eltyp))'")
p += 1
n = unsafe_load(Ptr{Int32}(pointer(data, p)))
p += 4 + elsize * n
else
error("invalid type tag: '$(Char(typ))'")
end
return p
end

55
src/bam/bai.jl Normal file
View file

@ -0,0 +1,55 @@
# BAI
# ===
#
# Index for BAM files.
# An index type for the BAM file format.
struct BAI
# BGZF file index
index::GenomicFeatures.Indexes.BGZFIndex
# number of unmapped reads
n_no_coors::Union{Nothing, Int}
end
"""
BAI(filename::AbstractString)
Load a BAI index from `filename`.
"""
function BAI(filename::AbstractString)
return open(read_bai, filename)
end
"""
BAI(input::IO)
Load a BAI index from `input`.
"""
function BAI(input::IO)
return read_bai(input)
end
# Read a BAI object from `input`.
function read_bai(input::IO)
# check magic bytes
B = read(input, UInt8)
A = read(input, UInt8)
I = read(input, UInt8)
x = read(input, UInt8)
if B != UInt8('B') || A != UInt8('A') || I != UInt8('I') || x != 0x01
error("input is not a valid BAI file")
end
# read contents
n_refs = read(input, Int32)
index = GenomicFeatures.Indexes.read_bgzfindex(input, n_refs)
if !eof(input)
n_no_coors = read(input, UInt64)
else
n_no_coors = nothing
end
return BAI(index, n_no_coors)
end

19
src/bam/bam.jl Normal file
View file

@ -0,0 +1,19 @@
# BAM File Format
# ===============
module BAM
import BGZFStreams
import BioAlignments: BioAlignments, SAM
import GenomicFeatures: GenomicFeatures, Interval
import BioSequences
import BioCore: BioCore, isfilled
include("bai.jl")
include("auxdata.jl")
include("reader.jl")
include("record.jl")
include("writer.jl")
include("overlap.jl")
end

95
src/bam/overlap.jl Normal file
View file

@ -0,0 +1,95 @@
# BAM Overlap
# ===========
struct OverlapIterator{T}
reader::Reader{T}
refname::String
interval::UnitRange{Int}
end
function Base.IteratorSize(::Type{OverlapIterator{T}}) where T
return Base.SizeUnknown()
end
function Base.eltype(::Type{OverlapIterator{T}}) where T
return Record
end
function GenomicFeatures.eachoverlap(reader::Reader, interval::Interval)
return GenomicFeatures.eachoverlap(reader, interval.seqname, interval.first:interval.last)
end
function GenomicFeatures.eachoverlap(reader::Reader, interval)
return GenomicFeatures.eachoverlap(reader, convert(Interval, interval))
end
function GenomicFeatures.eachoverlap(reader::Reader, refname::AbstractString, interval::UnitRange)
return OverlapIterator(reader, String(refname), interval)
end
# Iterator
# --------
mutable struct OverlapIteratorState
# reference index
refindex::Int
# possibly overlapping chunks
chunks::Vector{GenomicFeatures.Indexes.Chunk}
# current chunk index
chunkid::Int
# pre-allocated record
record::Record
end
function Base.iterate(iter::OverlapIterator)
refindex = findfirst(isequal(iter.refname), iter.reader.refseqnames)
if refindex == 0
throw(ArgumentError("sequence name $(iter.refname) is not found in the header"))
end
@assert iter.reader.index !== nothing
chunks = GenomicFeatures.Indexes.overlapchunks(iter.reader.index.index, refindex, iter.interval)
if !isempty(chunks)
seek(iter.reader, first(chunks).start)
end
state = OverlapIteratorState(refindex, chunks, 1, Record())
return iterate(iter, state)
end
function Base.iterate(iter::OverlapIterator, state)
while state.chunkid lastindex(state.chunks)
chunk = state.chunks[state.chunkid]
while BGZFStreams.virtualoffset(iter.reader.stream) < chunk.stop
read!(iter.reader, state.record)
c = compare_intervals(state.record, (state.refindex, iter.interval))
if c == 0
return copy(state.record), state
elseif c > 0
# no more overlapping records in this chunk since records are sorted
break
end
end
state.chunkid += 1
if state.chunkid lastindex(state.chunks)
seek(iter.reader, state.chunks[state.chunkid].start)
end
end
return nothing
end
function compare_intervals(record::Record, interval::Tuple{Int,UnitRange{Int}})
rid = refid(record)
if rid < interval[1] || (rid == interval[1] && rightposition(record) < first(interval[2]))
# strictly left
return -1
elseif rid > interval[1] || (rid == interval[1] && position(record) > last(interval[2]))
# strictly right
return +1
else
# overlapping
return 0
end
end

145
src/bam/reader.jl Normal file
View file

@ -0,0 +1,145 @@
# BAM Reader
# ==========
"""
BAM.Reader(input::IO; index=nothing)
Create a data reader of the BAM file format.
# Arguments
* `input`: data source
* `index=nothing`: filepath to a random access index (currently *bai* is supported)
"""
mutable struct Reader{T} <: BioCore.IO.AbstractReader
stream::BGZFStreams.BGZFStream{T}
header::SAM.Header
start_offset::BGZFStreams.VirtualOffset
refseqnames::Vector{String}
refseqlens::Vector{Int}
index::Union{Nothing, BAI}
end
function Base.eltype(::Type{Reader{T}}) where T
return Record
end
function BioCore.IO.stream(reader::Reader)
return reader.stream
end
function Reader(input::IO; index=nothing)
if isa(index, AbstractString)
index = BAI(index)
else
if index != nothing
error("unrecognizable index argument")
end
end
reader = init_bam_reader(input)
reader.index = index
return reader
end
function Base.show(io::IO, reader::Reader)
println(io, summary(reader), ":")
print(io, " number of contigs: ", length(reader.refseqnames))
end
"""
header(reader::Reader; fillSQ::Bool=false)::SAM.Header
Get the header of `reader`.
If `fillSQ` is `true`, this function fills missing "SQ" metainfo in the header.
"""
function header(reader::Reader; fillSQ::Bool=false)::SAM.Header
header = reader.header
if fillSQ
if !isempty(findall(reader.header, "SQ"))
throw(ArgumentError("SAM header already has SQ records"))
end
header = copy(header)
for (name, len) in zip(reader.refseqnames, reader.refseqlens)
push!(header, SAM.MetaInfo("SQ", ["SN" => name, "LN" => len]))
end
end
return header
end
function BioCore.header(reader::Reader)
return header(reader)
end
function Base.seek(reader::Reader, voffset::BGZFStreams.VirtualOffset)
seek(reader.stream, voffset)
end
function Base.seekstart(reader::Reader)
seek(reader.stream, reader.start_offset)
end
function Base.iterate(reader::Reader, rec=Record())
if eof(reader)
return nothing
end
read!(reader, rec)
return copy(rec), rec
end
# Initialize a BAM reader by reading the header section.
function init_bam_reader(input::BGZFStreams.BGZFStream)
# magic bytes
B = read(input, UInt8)
A = read(input, UInt8)
M = read(input, UInt8)
x = read(input, UInt8)
if B != UInt8('B') || A != UInt8('A') || M != UInt8('M') || x != 0x01
error("input was not a valid BAM file")
end
# SAM header
textlen = read(input, Int32)
samreader = SAM.Reader(IOBuffer(read(input, textlen)))
# reference sequences
refseqnames = String[]
refseqlens = Int[]
n_refs = read(input, Int32)
for _ in 1:n_refs
namelen = read(input, Int32)
data = read(input, namelen)
seqname = unsafe_string(pointer(data))
seqlen = read(input, Int32)
push!(refseqnames, seqname)
push!(refseqlens, seqlen)
end
voffset = isa(input.io, Base.AbstractPipe) ?
BGZFStreams.VirtualOffset(0, 0) :
BGZFStreams.virtualoffset(input)
return Reader(
input,
samreader.header,
voffset,
refseqnames,
refseqlens,
nothing)
end
function init_bam_reader(input::IO)
return init_bam_reader(BGZFStreams.BGZFStream(input))
end
function _read!(reader::Reader, record)
unsafe_read(
reader.stream,
pointer_from_objref(record),
FIXED_FIELDS_BYTES)
dsize = data_size(record)
if length(record.data) < dsize
resize!(record.data, dsize)
end
unsafe_read(reader.stream, pointer(record.data), dsize)
record.reader = reader
return record
end

634
src/bam/record.jl Normal file
View file

@ -0,0 +1,634 @@
# BAM Record
# ==========
"""
BAM.Record()
Create an unfilled BAM record.
"""
mutable struct Record
# fixed-length fields (see BMA specs for the details)
block_size::Int32
refid::Int32
pos::Int32
bin_mq_nl::UInt32
flag_nc::UInt32
l_seq::Int32
next_refid::Int32
next_pos::Int32
tlen::Int32
# variable length data
data::Vector{UInt8}
reader::Reader
function Record()
return new(0, 0, 0, 0, 0, 0, 0, 0, 0, UInt8[])
end
end
# the data size of fixed-length fields (block_size-tlen)
const FIXED_FIELDS_BYTES = 36
function Record(data::Vector{UInt8})
return convert(Record, data)
end
function Base.convert(::Type{Record}, data::Vector{UInt8})
length(data) < FIXED_FIELDS_BYTES && throw(ArgumentError("data too short"))
record = Record()
dst_pointer = Ptr{UInt8}(pointer_from_objref(record))
unsafe_copyto!(dst_pointer, pointer(data), FIXED_FIELDS_BYTES)
dsize = data_size(record)
resize!(record.data, dsize)
length(data) < dsize + FIXED_FIELDS_BYTES && throw(ArgumentError("data too short"))
unsafe_copyto!(record.data, 1, data, FIXED_FIELDS_BYTES + 1, dsize)
return record
end
function Base.copy(record::Record)
copy = Record()
copy.block_size = record.block_size
copy.refid = record.refid
copy.pos = record.pos
copy.bin_mq_nl = record.bin_mq_nl
copy.flag_nc = record.flag_nc
copy.l_seq = record.l_seq
copy.next_refid = record.next_refid
copy.next_pos = record.next_pos
copy.tlen = record.tlen
copy.data = record.data[1:data_size(record)]
if isdefined(record, :reader)
copy.reader = record.reader
end
return copy
end
function Base.show(io::IO, record::Record)
print(io, summary(record), ':')
if isfilled(record)
println(io)
println(io, " template name: ", tempname(record))
println(io, " flag: ", flag(record))
println(io, " reference ID: ", refid(record))
println(io, " position: ", position(record))
println(io, " mapping quality: ", mappingquality(record))
println(io, " CIGAR: ", cigar(record))
println(io, " next reference ID: ", nextrefid(record))
println(io, " next position: ", nextposition(record))
println(io, " template length: ", templength(record))
println(io, " sequence: ", sequence(record))
# TODO: pretty print base quality
println(io, " base quality: ", quality(record))
print(io, " auxiliary data:")
for field in keys(auxdata(record))
print(io, ' ', field, '=', record[field])
end
else
print(io, " <not filled>")
end
end
function Base.read!(reader::Reader, record::Record)
return _read!(reader, record)
end
# Accessor Fuctions
# -----------------
"""
flag(record::Record)::UInt16
Get the bitwise flag of `record`.
"""
function flag(record::Record)::UInt16
checkfilled(record)
return UInt16(record.flag_nc >> 16)
end
function hasflag(record::Record)
return isfilled(record)
end
"""
ismapped(record::Record)::Bool
Test if `record` is mapped.
"""
function ismapped(record::Record)::Bool
return flag(record) & SAM.FLAG_UNMAP == 0
end
"""
isprimary(record::Record)::Bool
Test if `record` is a primary line of the read.
This is equivalent to `flag(record) & 0x900 == 0`.
"""
function isprimary(record::Record)::Bool
return flag(record) & 0x900 == 0
end
"""
ispositivestrand(record::Record)::Bool
Test if `record` is aligned to the positive strand.
This is equivalent to `flag(record) & 0x10 == 0`.
"""
function ispositivestrand(record::Record)::Bool
flag(record) & 0x10 == 0
end
"""
refid(record::Record)::Int
Get the reference sequence ID of `record`.
The ID is 1-based (i.e. the first sequence is 1) and is 0 for a record without a mapping position.
See also: `BAM.rname`
"""
function refid(record::Record)::Int
checkfilled(record)
return record.refid + 1
end
function hasrefid(record::Record)
return isfilled(record)
end
function checked_refid(record::Record)
id = refid(record)
if id == 0
throw(ArgumentError("record is not mapped"))
elseif !isdefined(record, :reader)
throw(ArgumentError("reader is not defined"))
end
return id
end
"""
refname(record::Record)::String
Get the reference sequence name of `record`.
See also: `BAM.refid`
"""
function refname(record::Record)::String
checkfilled(record)
id = checked_refid(record)
return record.reader.refseqnames[id]
end
"""
reflen(record::Record)::Int
Get the length of the reference sequence this record applies to.
"""
function reflen(record::Record)::Int
checkfilled(record)
id = checked_refid(record)
return record.reader.refseqlens[id]
end
function hasrefname(record::Record)
return hasrefid(record)
end
"""
position(record::Record)::Int
Get the 1-based leftmost mapping position of `record`.
"""
function position(record::Record)::Int
checkfilled(record)
return record.pos + 1
end
function hasposition(record::Record)
return isfilled(record)
end
"""
rightposition(record::Record)::Int
Get the 1-based rightmost mapping position of `record`.
"""
function rightposition(record::Record)::Int
checkfilled(record)
return Int32(position(record) + alignlength(record) - 1)
end
function hasrightposition(record::Record)
return isfilled(record) && ismapped(record)
end
"""
isnextmapped(record::Record)::Bool
Test if the mate/next read of `record` is mapped.
"""
function isnextmapped(record::Record)::Bool
return isfilled(record) && (flag(record) & FLAG_MUNMAP == 0)
end
"""
nextrefid(record::Record)::Int
Get the next/mate reference sequence ID of `record`.
"""
function nextrefid(record::Record)::Int
checkfilled(record)
return record.next_refid + 1
end
function hasnextrefid(record::Record)
return isfilled(record)
end
"""
nextrefname(record::Record)::String
Get the reference name of the mate/next read of `record`.
"""
function nextrefname(record::Record)::String
checkfilled(record)
id = nextrefid(record)
if id == 0
throw(ArgumentError("next record is not mapped"))
elseif !isdefined(record, :reader)
throw(ArgumentError("reader is not defined"))
end
return record.reader.refseqnames[id]
end
function hasnextrefname(record::Record)
return isfilled(record) && isnextmapped(record)
end
"""
nextposition(record::Record)::Int
Get the 1-based leftmost mapping position of the next/mate read of `record`.
"""
function nextposition(record::Record)::Int
checkfilled(record)
return record.next_pos + 1
end
function hasnextposition(record::Record)
return isfilled(record)
end
"""
mappingquality(record::Record)::UInt8
Get the mapping quality of `record`.
"""
function mappingquality(record::Record)::UInt8
return UInt8((record.bin_mq_nl >> 8) & 0xff)
end
function hasmappingquality(record::Record)
return isfilled(record)
end
"""
n_cigar_op(record::Record, checkCG::Bool = true)
Return the number of operations in the CIGAR string of `record`.
Note that in the BAM specification, the field called `cigar` typically stores the cigar string of the record.
However, this is not always true, sometimes the true cigar is very long, and due to some constraints of the BAM format, the actual cigar string is stored in an extra tag: `CG:B,I`, and the `cigar` field stores a pseudo-cigar string.
Calling this method with `checkCG` set to `true` (default) this method will always yield the number of operations in the true cigar string, because this is probably what you want, the vast majority of the time.
If you have a record that stores the true cigar in a `CG:B,I` tag, but you still want to get the number of operations in the `cigar` field of the BAM record, then set `checkCG` to `false`.
"""
function n_cigar_op(record::Record, checkCG::Bool = true)
return cigar_position(record, checkCG)[2]
end
"""
cigar(record::Record)::String
Get the CIGAR string of `record`.
Note that in the BAM specification, the field called `cigar` typically stores the cigar string of the record.
However, this is not always true, sometimes the true cigar is very long, and due to some constraints of the BAM format, the actual cigar string is stored in an extra tag: `CG:B,I`, and the `cigar` field stores a pseudo-cigar string.
Calling this method with `checkCG` set to `true` (default) this method will always yield the true cigar string, because this is probably what you want the vast majority of the time.
If you have a record that stores the true cigar in a `CG:B,I` tag, but you still want to access the pseudo-cigar that is stored in the `cigar` field of the BAM record, then you can set checkCG to `false`.
See also `BAM.cigar_rle`.
"""
function cigar(record::Record, checkCG::Bool = true)::String
buf = IOBuffer()
for (op, len) in zip(cigar_rle(record, checkCG)...)
print(buf, len, convert(Char, op))
end
return String(take!(buf))
end
"""
cigar_rle(record::Record, checkCG::Bool = true)::Tuple{Vector{BioAlignments.Operation},Vector{Int}}
Get a run-length encoded tuple `(ops, lens)` of the CIGAR string in `record`.
Note that in the BAM specification, the field called `cigar` typically stores the cigar string of the record.
However, this is not always true, sometimes the true cigar is very long, and due to some constraints of the BAM format, the actual cigar string is stored in an extra tag: `CG:B,I`, and the `cigar` field stores a pseudo-cigar string.
Calling this method with `checkCG` set to `true` (default) this method will always yield the true cigar string, because this is probably what you want the vast majority of the time.
If you have a record that stores the true cigar in a `CG:B,I` tag, but you still want to access the pseudo-cigar that is stored in the `cigar` field of the BAM record, then you can set checkCG to `false`.
See also `BAM.cigar`.
"""
function cigar_rle(record::Record, checkCG::Bool = true)::Tuple{Vector{BioAlignments.Operation},Vector{Int}}
checkfilled(record)
idx, nops = cigar_position(record, checkCG)
ops, lens = extract_cigar_rle(record.data, idx, nops)
return ops, lens
end
function extract_cigar_rle(data::Vector{UInt8}, offset, n)
ops = Vector{BioAlignments.Operation}()
lens = Vector{Int}()
for i in offset:4:offset + (n - 1) * 4
x = unsafe_load(Ptr{UInt32}(pointer(data, i)))
op = BioAlignments.Operation(x & 0x0F)
push!(ops, op)
push!(lens, x >> 4)
end
return ops, lens
end
function cigar_position(record::Record, checkCG::Bool = true)::Tuple{Int, Int}
cigaridx, nops = seqname_length(record) + 1, record.flag_nc & 0xFFFF
if !checkCG
return cigaridx, nops
end
if nops != 2
return cigaridx, nops
end
x = unsafe_load(Ptr{UInt32}(pointer(record.data, cigaridx)))
if x != UInt32(seqlength(record) << 4 | 4)
return cigaridx, nops
end
start = auxdata_position(record)
stop = data_size(record)
tagidx = findauxtag(record.data, start, stop, UInt8('C'), UInt8('G'))
if tagidx == 0
return cigaridx, nops
end
# Tag exists, validate type is BI.
typ = unsafe_load(Ptr{UInt16}(pointer(record.data, tagidx += 2)))
if typ != (UInt16('I') << 8 | UInt16('B'))
return cigaridx, nops
end
# If got this far, the CG tag is valid and contains the cigar.
# Get the true n_cigar_ops, and return it and the idx of the first
nops = UInt32(unsafe_load(Ptr{Int32}(pointer(record.data, tagidx += 2))))
tagidx += 4
return tagidx, nops
end
"""
alignment(record::Record)::BioAlignments.Alignment
Get the alignment of `record`.
"""
function alignment(record::Record)::BioAlignments.Alignment
checkfilled(record)
if !ismapped(record)
return BioAlignments.Alignment(BioAlignments.AlignmentAnchor[])
end
seqpos = 0
refpos = position(record) - 1
anchors = [BioAlignments.AlignmentAnchor(seqpos, refpos, BioAlignments.OP_START)]
for (op, len) in zip(cigar_rle(record)...)
if BioAlignments.ismatchop(op)
seqpos += len
refpos += len
elseif BioAlignments.isinsertop(op)
seqpos += len
elseif BioAlignments.isdeleteop(op)
refpos += len
else
error("operation $(op) is not supported")
end
push!(anchors, BioAlignments.AlignmentAnchor(seqpos, refpos, op))
end
return BioAlignments.Alignment(anchors)
end
function hasalignment(record::Record)
return ismapped(record)
end
"""
alignlength(record::Record)::Int
Get the alignment length of `record`.
"""
function alignlength(record::Record)::Int
offset = seqname_length(record)
length::Int = 0
for i in offset + 1:4:offset + n_cigar_op(record, false) * 4
x = unsafe_load(Ptr{UInt32}(pointer(record.data, i)))
op = BioAlignments.Operation(x & 0x0F)
if BioAlignments.ismatchop(op) || BioAlignments.isdeleteop(op)
length += x >> 4
end
end
return length
end
"""
tempname(record::Record)::String
Get the query template name of `record`.
"""
function tempname(record::Record)::String
checkfilled(record)
# drop the last NUL character
return unsafe_string(pointer(record.data), max(seqname_length(record) - 1, 0))
end
function hastempname(record::Record)
return isfilled(record)
end
"""
templength(record::Record)::Int
Get the template length of `record`.
"""
function templength(record::Record)::Int
checkfilled(record)
return record.tlen
end
function hastemplength(record::Record)
return isfilled(record)
end
"""
sequence(record::Record)::BioSequences.DNASequence
Get the segment sequence of `record`.
"""
function sequence(record::Record)::BioSequences.DNASequence
checkfilled(record)
seqlen = seqlength(record)
data = Vector{UInt64}(undef, cld(seqlen, 16))
src::Ptr{UInt64} = pointer(record.data, seqname_length(record) + n_cigar_op(record, false) * 4 + 1)
for i in 1:lastindex(data)
# copy data flipping high and low nybble
x = unsafe_load(src, i)
data[i] = (x & 0x0f0f0f0f0f0f0f0f) << 4 | (x & 0xf0f0f0f0f0f0f0f0) >> 4
end
return BioSequences.DNASequence(data, 1:seqlen, false)
end
function hassequence(record::Record)
return isfilled(record)
end
"""
seqlength(record::Record)::Int
Get the sequence length of `record`.
"""
function seqlength(record::Record)::Int
checkfilled(record)
return record.l_seq % Int
end
function hasseqlength(record::Record)
return isfilled(record)
end
"""
quality(record::Record)::Vector{UInt8}
Get the base quality of `record`.
"""
function quality(record::Record)::Vector{UInt8}
checkfilled(record)
seqlen = seqlength(record)
offset = seqname_length(record) + n_cigar_op(record, false) * 4 + cld(seqlen, 2)
return [reinterpret(Int8, record.data[i+offset]) for i in 1:seqlen]
end
function hasquality(record::Record)
return isfilled(record)
end
"""
auxdata(record::Record)::BAM.AuxData
Get the auxiliary data of `record`.
"""
function auxdata(record::Record)
checkfilled(record)
return AuxData(record.data[auxdata_position(record):data_size(record)])
end
function hasauxdata(record::Record)
return isfilled(record)
end
function Base.getindex(record::Record, tag::AbstractString)
checkauxtag(tag)
start = auxdata_position(record)
stop = data_size(record)
return getauxvalue(record.data, start, stop, UInt8(tag[1]), UInt8(tag[2]))
end
function Base.haskey(record::Record, tag::AbstractString)
checkauxtag(tag)
start = auxdata_position(record)
stop = data_size(record)
return findauxtag(record.data, start, stop, UInt8(tag[1]), UInt8(tag[2])) > 0
end
function Base.keys(record::Record)
return collect(keys(auxdata(record)))
end
function Base.values(record::Record)
return [record[key] for key in keys(record)]
end
# BioCore Methods
# -----------
function BioCore.isfilled(record::Record)
return record.block_size != 0
end
function BioCore.seqname(record::Record)
return tempname(record)
end
function BioCore.hasseqname(record::Record)
return hastempname(record)
end
function BioCore.sequence(record::Record)
return sequence(record)
end
function BioCore.hassequence(record::Record)
return hassequence(record)
end
function BioCore.leftposition(record::Record)
return position(record)
end
function BioCore.hasleftposition(record::Record)
return hasposition(record)
end
function BioCore.rightposition(record::Record)
return rightposition(record)
end
function BioCore.hasrightposition(record::Record)
return hasrightposition(record)
end
# Helper Functions
# ----------------
# Return the size of the `.data` field.
function data_size(record::Record)
if isfilled(record)
return record.block_size - FIXED_FIELDS_BYTES + sizeof(record.block_size)
else
return 0
end
end
function checkfilled(record::Record)
if !isfilled(record)
throw(ArgumentError("unfilled BAM record"))
end
end
function auxdata_position(record::Record)
seqlen = seqlength(record)
return seqname_length(record) + n_cigar_op(record, false) * 4 + cld(seqlen, 2) + seqlen + 1
end
# Return the length of the read name.
function seqname_length(record::Record)
return record.bin_mq_nl & 0xff
end

62
src/bam/writer.jl Normal file
View file

@ -0,0 +1,62 @@
# BAM Writer
# ==========
"""
BAM.Writer(output::BGZFStream, header::SAM.Header)
Create a data writer of the BAM file format.
# Arguments
* `output`: data sink
* `header`: SAM header object
"""
mutable struct Writer <: BioCore.IO.AbstractWriter
stream::BGZFStreams.BGZFStream
end
function Writer(stream::BGZFStreams.BGZFStream, header::SAM.Header)
refseqnames = String[]
refseqlens = Int[]
for metainfo in findall(header, "SQ")
push!(refseqnames, metainfo["SN"])
push!(refseqlens, parse(Int, metainfo["LN"]))
end
write_header(stream, header, refseqnames, refseqlens)
return Writer(stream)
end
function BioCore.IO.stream(writer::Writer)
return writer.stream
end
function Base.write(writer::Writer, record::Record)
n = 0
n += unsafe_write(writer.stream, pointer_from_objref(record), FIXED_FIELDS_BYTES)
n += unsafe_write(writer.stream, pointer(record.data), data_size(record))
return n
end
function write_header(stream, header, refseqnames, refseqlens)
@assert length(refseqnames) == length(refseqlens)
n = 0
# magic bytes
n += write(stream, "BAM\1")
# SAM header
buf = IOBuffer()
l = write(SAM.Writer(buf), header)
n += write(stream, Int32(l))
n += write(stream, take!(buf))
# reference sequences
n += write(stream, Int32(length(refseqnames)))
for (seqname, seqlen) in zip(refseqnames, refseqlens)
namelen = length(seqname)
n += write(stream, Int32(namelen + 1))
n += write(stream, seqname, '\0')
n += write(stream, Int32(seqlen))
end
return n
end

26
src/sam/flags.jl Normal file
View file

@ -0,0 +1,26 @@
# SAM Flags
# =========
#
# Bitwise flags (or FLAG).
for (name, bits, doc) in [
(:PAIRED, UInt16(0x001), "the read is paired in sequencing, no matter whether it is mapped in a pair"),
(:PROPER_PAIR, UInt16(0x002), "the read is mapped in a proper pair" ),
(:UNMAP, UInt16(0x004), "the read itself is unmapped; conflictive with SAM.FLAG_PROPER_PAIR" ),
(:MUNMAP, UInt16(0x008), "the mate is unmapped" ),
(:REVERSE, UInt16(0x010), "the read is mapped to the reverse strand" ),
(:MREVERSE, UInt16(0x020), "the mate is mapped to the reverse strand" ),
(:READ1, UInt16(0x040), "this is read1" ),
(:READ2, UInt16(0x080), "this is read2" ),
(:SECONDARY, UInt16(0x100), "not primary alignment" ),
(:QCFAIL, UInt16(0x200), "QC failure" ),
(:DUP, UInt16(0x400), "optical or PCR duplicate" ),
(:SUPPLEMENTARY, UInt16(0x800), "supplementary alignment" ),]
@assert bits isa UInt16
sym = Symbol("FLAG_", name)
doc = string(@sprintf("0x%04x: ", bits), doc)
@eval begin
@doc $(doc) const $(sym) = $(bits)
end
end

53
src/sam/header.jl Normal file
View file

@ -0,0 +1,53 @@
# SAM Header
# ==========
struct Header
metainfo::Vector{MetaInfo}
end
"""
SAM.Header()
Create an empty header.
"""
function Header()
return Header(MetaInfo[])
end
function Base.copy(header::Header)
return Header(header.metainfo)
end
function Base.eltype(::Type{Header})
return MetaInfo
end
function Base.length(header::Header)
return length(header.metainfo)
end
function Base.iterate(header::Header, i=1)
if i > length(header.metainfo)
return nothing
end
return header.metainfo[i], i + 1
end
"""
find(header::Header, key::AbstractString)::Vector{MetaInfo}
Find metainfo objects satisfying `SAM.tag(metainfo) == key`.
"""
function Base.findall(header::Header, key::AbstractString)::Vector{MetaInfo}
return filter(m -> isequalkey(m, key), header.metainfo)
end
function Base.pushfirst!(header::Header, metainfo::MetaInfo)
pushfirst!(header.metainfo, metainfo)
return header
end
function Base.push!(header::Header, metainfo::MetaInfo)
push!(header.metainfo, metainfo)
return header
end

235
src/sam/metainfo.jl Normal file
View file

@ -0,0 +1,235 @@
# SAM Meta-Information
# ====================
mutable struct MetaInfo
# data and filled range
data::Vector{UInt8}
filled::UnitRange{Int}
# indexes
tag::UnitRange{Int}
val::UnitRange{Int}
dictkey::Vector{UnitRange{Int}}
dictval::Vector{UnitRange{Int}}
end
function MetaInfo(data::Vector{UInt8}=UInt8[])
metainfo = MetaInfo(data, 1:0, 1:0, 1:0, UnitRange{Int}[], UnitRange{Int}[])
if !isempty(data)
index!(metainfo)
end
return metainfo
end
"""
MetaInfo(str::AbstractString)
Create a SAM metainfo from `str`.
# Examples
julia> SAM.MetaInfo("@CO\tsome comment")
BioAlignments.SAM.MetaInfo:
tag: CO
value: some comment
julia> SAM.MetaInfo("@SQ\tSN:chr1\tLN:12345")
BioAlignments.SAM.MetaInfo:
tag: SQ
value: SN=chr1 LN=12345
"""
function MetaInfo(str::AbstractString)
return MetaInfo(Vector{UInt8}(str))
end
"""
MetaInfo(tag::AbstractString, value)
Create a SAM metainfo with `tag` and `value`.
`tag` is a two-byte ASCII string. If `tag` is `"CO"`, `value` must be a string; otherwise, `value` is an iterable object with key and value pairs.
# Examples
julia> SAM.MetaInfo("CO", "some comment")
BioAlignments.SAM.MetaInfo:
tag: CO
value: some comment
julia> string(ans)
"@CO\tsome comment"
julia> SAM.MetaInfo("SQ", ["SN" => "chr1", "LN" => 12345])
BioAlignments.SAM.MetaInfo:
tag: SQ
value: SN=chr1 LN=12345
julia> string(ans)
"@SQ\tSN:chr1\tLN:12345"
"""
function MetaInfo(tag::AbstractString, value)
buf = IOBuffer()
if tag == "CO" # comment
if !isa(value, AbstractString)
throw(ArgumentError("value must be a string"))
end
write(buf, "@CO\t", value)
elseif occursin(r"[A-Z][A-Z]", tag)
print(buf, '@', tag)
for (key, val) in value
print(buf, '\t', key, ':', val)
end
else
throw(ArgumentError("tag must match r\"[A-Z][A-Z]\""))
end
return MetaInfo(take!(buf))
end
function initialize!(metainfo::MetaInfo)
metainfo.filled = 1:0
metainfo.tag = 1:0
metainfo.val = 1:0
empty!(metainfo.dictkey)
empty!(metainfo.dictval)
return metainfo
end
function isfilled(metainfo::MetaInfo)
return !isempty(metainfo.filled)
end
function datarange(metainfo::MetaInfo)
return metainfo.filled
end
function checkfilled(metainfo::MetaInfo)
if !isfilled(metainfo)
throw(ArgumentError("unfilled SAM metainfo"))
end
end
function isequalkey(metainfo::MetaInfo, key::AbstractString)
if !isfilled(metainfo) || sizeof(key) != 2
return false
end
k1, k2 = UInt8(key[1]), UInt8(key[2])
return metainfo.data[metainfo.tag[1]] == k1 && metainfo.data[metainfo.tag[2]] == k2
end
function Base.show(io::IO, metainfo::MetaInfo)
print(io, summary(metainfo), ':')
if isfilled(metainfo)
println(io)
println(io, " tag: ", tag(metainfo))
print(io, " value:")
if !iscomment(metainfo)
for (key, val) in zip(keys(metainfo), values(metainfo))
print(io, ' ', key, '=', val)
end
else
print(io, ' ', value(metainfo))
end
else
print(io, " <not filled>")
end
end
function Base.print(io::IO, metainfo::MetaInfo)
write(io, metainfo)
return nothing
end
function Base.write(io::IO, metainfo::MetaInfo)
checkfilled(metainfo)
r = datarange(metainfo)
return unsafe_write(io, pointer(metainfo.data, first(r)), length(r))
end
# Accessor Functions
# ------------------
"""
iscomment(metainfo::MetaInfo)::Bool
Test if `metainfo` is a comment (i.e. its tag is "CO").
"""
function iscomment(metainfo::MetaInfo)::Bool
return isequalkey(metainfo, "CO")
end
"""
tag(metainfo::MetaInfo)::String
Get the tag of `metainfo`.
"""
function tag(metainfo::MetaInfo)::String
checkfilled(metainfo)
return String(metainfo.data[metainfo.tag])
end
"""
value(metainfo::MetaInfo)::String
Get the value of `metainfo` as a string.
"""
function value(metainfo::MetaInfo)::String
checkfilled(metainfo)
return String(metainfo.data[metainfo.val])
end
"""
keyvalues(metainfo::MetaInfo)::Vector{Pair{String,String}}
Get the values of `metainfo` as string pairs.
"""
function keyvalues(metainfo::MetaInfo)::Vector{Pair{String,String}}
checkfilled(metainfo)
if iscomment(metainfo)
throw(ArgumentError("not a dictionary"))
end
return Pair{String,String}[key => val for (key, val) in zip(keys(metainfo), values(metainfo))]
end
function Base.keys(metainfo::MetaInfo)
checkfilled(metainfo)
if iscomment(metainfo)
throw(ArgumentError("not a dictionary"))
end
return [String(metainfo.data[r]) for r in metainfo.dictkey]
end
function Base.values(metainfo::MetaInfo)
checkfilled(metainfo)
if iscomment(metainfo)
throw(ArgumentError("not a dictionary"))
end
return [String(metainfo.data[r]) for r in metainfo.dictval]
end
function Base.haskey(metainfo::MetaInfo, key::AbstractString)
return findkey(metainfo, key) > 0
end
function Base.getindex(metainfo::MetaInfo, key::AbstractString)
i = findkey(metainfo, key)
if i == 0
throw(KeyError(key))
end
return String(metainfo.data[metainfo.dictval[i]])
end
function findkey(metainfo::MetaInfo, key::AbstractString)
checkfilled(metainfo)
if sizeof(key) != 2
return 0
end
t1, t2 = UInt8(key[1]), UInt8(key[2])
for (i, k) in enumerate(metainfo.dictkey)
if metainfo.data[first(k)] == t1 && metainfo.data[first(k)+1] == t2
return i
end
end
return 0
end

265
src/sam/reader.jl Normal file
View file

@ -0,0 +1,265 @@
# SAM Reader
# =========
mutable struct Reader <: BioCore.IO.AbstractReader
state::BioCore.Ragel.State
header::Header
function Reader(input::BufferedStreams.BufferedInputStream)
reader = new(BioCore.Ragel.State(sam_header_machine.start_state, input), Header())
readheader!(reader)
reader.state.cs = sam_body_machine.start_state
return reader
end
end
"""
SAM.Reader(input::IO)
Create a data reader of the SAM file format.
# Arguments
* `input`: data source
"""
function Reader(input::IO)
return Reader(BufferedStreams.BufferedInputStream(input))
end
function BioCore.IO.stream(reader::Reader)
return reader.state.stream
end
"""
header(reader::Reader)::Header
Get the header of `reader`.
"""
function header(reader::Reader)::Header
return reader.header
end
function BioCore.header(reader::Reader)
return header(reader)
end
function Base.eltype(::Type{Reader})
return Record
end
# file = header . body
# header = metainfo*
# body = record*
isinteractive() && info("compiling SAM")
const sam_metainfo_machine, sam_record_machine, sam_header_machine, sam_body_machine = (function ()
cat = Automa.RegExp.cat
rep = Automa.RegExp.rep
alt = Automa.RegExp.alt
opt = Automa.RegExp.opt
any = Automa.RegExp.any
metainfo = let
tag = re"[A-Z][A-Z]" \ cat("CO")
tag.actions[:enter] = [:mark1]
tag.actions[:exit] = [:metainfo_tag]
dict = let
key = re"[A-Za-z][A-Za-z0-9]"
key.actions[:enter] = [:mark2]
key.actions[:exit] = [:metainfo_dict_key]
val = re"[ -~]+"
val.actions[:enter] = [:mark2]
val.actions[:exit] = [:metainfo_dict_val]
keyval = cat(key, ':', val)
cat(keyval, rep(cat('\t', keyval)))
end
dict.actions[:enter] = [:mark1]
dict.actions[:exit] = [:metainfo_val]
co = cat("CO")
co.actions[:enter] = [:mark1]
co.actions[:exit] = [:metainfo_tag]
comment = re"[^\r\n]*"
comment.actions[:enter] = [:mark1]
comment.actions[:exit] = [:metainfo_val]
cat('@', alt(cat(tag, '\t', dict), cat(co, '\t', comment)))
end
metainfo.actions[:enter] = [:anchor]
metainfo.actions[:exit] = [:metainfo]
record = let
qname = re"[!-?A-~]+"
qname.actions[:enter] = [:mark]
qname.actions[:exit] = [:record_qname]
flag = re"[0-9]+"
flag.actions[:enter] = [:mark]
flag.actions[:exit] = [:record_flag]
rname = re"\*|[!-()+-<>-~][!-~]*"
rname.actions[:enter] = [:mark]
rname.actions[:exit] = [:record_rname]
pos = re"[0-9]+"
pos.actions[:enter] = [:mark]
pos.actions[:exit] = [:record_pos]
mapq = re"[0-9]+"
mapq.actions[:enter] = [:mark]
mapq.actions[:exit] = [:record_mapq]
cigar = re"\*|([0-9]+[MIDNSHPX=])+"
cigar.actions[:enter] = [:mark]
cigar.actions[:exit] = [:record_cigar]
rnext = re"\*|=|[!-()+-<>-~][!-~]*"
rnext.actions[:enter] = [:mark]
rnext.actions[:exit] = [:record_rnext]
pnext = re"[0-9]+"
pnext.actions[:enter] = [:mark]
pnext.actions[:exit] = [:record_pnext]
tlen = re"[-+]?[0-9]+"
tlen.actions[:enter] = [:mark]
tlen.actions[:exit] = [:record_tlen]
seq = re"\*|[A-Za-z=.]+"
seq.actions[:enter] = [:mark]
seq.actions[:exit] = [:record_seq]
qual = re"[!-~]+"
qual.actions[:enter] = [:mark]
qual.actions[:exit] = [:record_qual]
field = let
tag = re"[A-Za-z][A-Za-z0-9]"
val = alt(
re"A:[!-~]",
re"i:[-+]?[0-9]+",
re"f:[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?",
re"Z:[ !-~]*",
re"H:([0-9A-F][0-9A-F])*",
re"B:[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+")
cat(tag, ':', val)
end
field.actions[:enter] = [:mark]
field.actions[:exit] = [:record_field]
cat(
qname, '\t',
flag, '\t',
rname, '\t',
pos, '\t',
mapq, '\t',
cigar, '\t',
rnext, '\t',
pnext, '\t',
tlen, '\t',
seq, '\t',
qual,
rep(cat('\t', field)))
end
record.actions[:enter] = [:anchor]
record.actions[:exit] = [:record]
newline = let
lf = re"\n"
lf.actions[:enter] = [:countline]
cat(re"\r?", lf)
end
header = rep(cat(metainfo, newline))
header.actions[:exit] = [:header]
header = cat(header, opt(any() \ cat('@'))) # look ahead
body = rep(cat(record, newline))
return map(Automa.compile, (metainfo, record, header, body))
end)()
const sam_metainfo_actions = Dict(
:metainfo_tag => :(record.tag = (mark1:p-1) .- offset),
:metainfo_val => :(record.val = (mark1:p-1) .- offset),
:metainfo_dict_key => :(push!(record.dictkey, (mark2:p-1) .- offset)),
:metainfo_dict_val => :(push!(record.dictval, (mark2:p-1) .- offset)),
:metainfo => quote
BioCore.ReaderHelper.resize_and_copy!(record.data, data, offset+1:p-1)
record.filled = (offset+1:p-1) .- offset
end,
:anchor => :(),
:mark1 => :(mark1 = p),
:mark2 => :(mark2 = p))
eval(
BioCore.ReaderHelper.generate_index_function(
MetaInfo,
sam_metainfo_machine,
:(mark1 = mark2 = offset = 0),
sam_metainfo_actions))
eval(
BioCore.ReaderHelper.generate_readheader_function(
Reader,
MetaInfo,
sam_header_machine,
:(mark1 = mark2 = offset = 0),
merge(sam_metainfo_actions, Dict(
:metainfo => quote
BioCore.ReaderHelper.resize_and_copy!(record.data, data, BioCore.ReaderHelper.upanchor!(stream):p-1)
record.filled = (offset+1:p-1) .- offset
@assert isfilled(record)
push!(reader.header.metainfo, record)
BioCore.ReaderHelper.ensure_margin!(stream)
record = MetaInfo()
end,
:header => :(finish_header = true; @escape),
:countline => :(linenum += 1),
:anchor => :(BioCore.ReaderHelper.anchor!(stream, p); offset = p - 1))),
quote
if !eof(stream)
stream.position -= 1 # cancel look-ahead
end
end))
const sam_record_actions = Dict(
:record_qname => :(record.qname = (mark:p-1) .- offset),
:record_flag => :(record.flag = (mark:p-1) .- offset),
:record_rname => :(record.rname = (mark:p-1) .- offset),
:record_pos => :(record.pos = (mark:p-1) .- offset),
:record_mapq => :(record.mapq = (mark:p-1) .- offset),
:record_cigar => :(record.cigar = (mark:p-1) .- offset),
:record_rnext => :(record.rnext = (mark:p-1) .- offset),
:record_pnext => :(record.pnext = (mark:p-1) .- offset),
:record_tlen => :(record.tlen = (mark:p-1) .- offset),
:record_seq => :(record.seq = (mark:p-1) .- offset),
:record_qual => :(record.qual = (mark:p-1) .- offset),
:record_field => :(push!(record.fields, (mark:p-1) .- offset)),
:record => quote
BioCore.ReaderHelper.resize_and_copy!(record.data, data, 1:p-1)
record.filled = (offset+1:p-1) .- offset
end,
:anchor => :(),
:mark => :(mark = p))
eval(
BioCore.ReaderHelper.generate_index_function(
Record,
sam_record_machine,
:(mark = offset = 0),
sam_record_actions))
eval(
BioCore.ReaderHelper.generate_read_function(
Reader,
sam_body_machine,
:(mark = offset = 0),
merge(sam_record_actions, Dict(
:record => quote
BioCore.ReaderHelper.resize_and_copy!(record.data, data, BioCore.ReaderHelper.upanchor!(stream):p-1)
record.filled = (offset+1:p-1) .- offset
found_record = true
@escape
end,
:countline => :(linenum += 1),
:anchor => :(BioCore.ReaderHelper.anchor!(stream, p); offset = p - 1)))))

624
src/sam/record.jl Normal file
View file

@ -0,0 +1,624 @@
# SAM Record
# ==========
mutable struct Record
# data and filled range
data::Vector{UInt8}
filled::UnitRange{Int}
# indexes
qname::UnitRange{Int}
flag::UnitRange{Int}
rname::UnitRange{Int}
pos::UnitRange{Int}
mapq::UnitRange{Int}
cigar::UnitRange{Int}
rnext::UnitRange{Int}
pnext::UnitRange{Int}
tlen::UnitRange{Int}
seq::UnitRange{Int}
qual::UnitRange{Int}
fields::Vector{UnitRange{Int}}
end
"""
SAM.Record()
Create an unfilled SAM record.
"""
function Record()
return Record(
UInt8[], 1:0,
# qname-mapq
1:0, 1:0, 1:0, 1:0, 1:0,
# cigar-seq
1:0, 1:0, 1:0, 1:0, 1:0,
# qual and fields
1:0, UnitRange{Int}[])
end
"""
SAM.Record(data::Vector{UInt8})
Create a SAM record from `data`.
This function verifies the format and indexes fields for accessors.
Note that the ownership of `data` is transferred to a new record object.
"""
function Record(data::Vector{UInt8})
return convert(Record, data)
end
function Base.convert(::Type{Record}, data::Vector{UInt8})
record = Record(
data, 1:0,
# qname-mapq
1:0, 1:0, 1:0, 1:0, 1:0,
# cigar-seq
1:0, 1:0, 1:0, 1:0, 1:0,
# qual and fields
1:0, UnitRange{Int}[])
index!(record)
return record
end
"""
SAM.Record(str::AbstractString)
Create a SAM record from `str`.
This function verifies the format and indexes fields for accessors.
"""
function Record(str::AbstractString)
return convert(Record, str)
end
function Base.convert(::Type{Record}, str::AbstractString)
return Record(Vector{UInt8}(str))
end
function Base.show(io::IO, record::Record)
print(io, summary(record), ':')
if isfilled(record)
println(io)
println(io, " template name: ", hastempname(record) ? tempname(record) : "<missing>")
println(io, " flag: ", hasflag(record) ? flag(record) : "<missing>")
println(io, " reference: ", hasrefname(record) ? refname(record) : "<missing>")
println(io, " position: ", hasposition(record) ? position(record) : "<missing>")
println(io, " mapping quality: ", hasmappingquality(record) ? mappingquality(record) : "<missing>")
println(io, " CIGAR: ", hascigar(record) ? cigar(record) : "<missing>")
println(io, " next reference: ", hasnextrefname(record) ? nextrefname(record) : "<missing>")
println(io, " next position: ", hasnextposition(record) ? nextposition(record) : "<missing>")
println(io, " template length: ", hastemplength(record) ? templength(record) : "<missing>")
println(io, " sequence: ", hassequence(record) ? sequence(String, record) : "<missing>")
println(io, " base quality: ", hasquality(record) ? quality(String, record) : "<missing>")
print(io, " auxiliary data:")
for field in record.fields
print(io, ' ', String(record.data[field]))
end
else
print(io, " <not filled>")
end
end
function Base.print(io::IO, record::Record)
write(io, record)
return nothing
end
function Base.write(io::IO, record::Record)
checkfilled(record)
return unsafe_write(io, pointer(record.data, first(record.filled)), length(record.filled))
end
function Base.copy(record::Record)
return Record(
copy(record.data),
record.filled,
record.qname,
record.flag,
record.rname,
record.pos,
record.mapq,
record.cigar,
record.rnext,
record.pnext,
record.tlen,
record.seq,
record.qual,
copy(record.fields))
end
# Accessor Functions
# ------------------
"""
flag(record::Record)::UInt16
Get the bitwise flag of `record`.
"""
function flag(record::Record)::UInt16
checkfilled(record)
return unsafe_parse_decimal(UInt16, record.data, record.flag)
end
function hasflag(record::Record)
return isfilled(record)
end
"""
ismapped(record::Record)::Bool
Test if `record` is mapped.
"""
function ismapped(record::Record)::Bool
return isfilled(record) && (flag(record) & FLAG_UNMAP == 0)
end
"""
isprimary(record::Record)::Bool
Test if `record` is a primary line of the read.
This is equivalent to `flag(record) & 0x900 == 0`.
"""
function isprimary(record::Record)::Bool
return flag(record) & 0x900 == 0
end
"""
refname(record::Record)::String
Get the reference sequence name of `record`.
"""
function refname(record::Record)
checkfilled(record)
if ismissing(record, record.rname)
missingerror(:refname)
end
return String(record.data[record.rname])
end
function hasrefname(record::Record)
return isfilled(record) && !ismissing(record, record.rname)
end
"""
position(record::Record)::Int
Get the 1-based leftmost mapping position of `record`.
"""
function position(record::Record)::Int
checkfilled(record)
pos = unsafe_parse_decimal(Int, record.data, record.pos)
if pos == 0
missingerror(:position)
end
return pos
end
function hasposition(record::Record)
return isfilled(record) && (length(record.pos) != 1 || record.data[first(record.pos)] != UInt8('0'))
end
"""
rightposition(record::Record)::Int
Get the 1-based rightmost mapping position of `record`.
"""
function rightposition(record::Record)
return position(record) + alignlength(record) - 1
end
function hasrightposition(record::Record)
return hasposition(record) && hasalignment(record)
end
"""
isnextmapped(record::Record)::Bool
Test if the mate/next read of `record` is mapped.
"""
function isnextmapped(record::Record)::Bool
return isfilled(record) && (flag(record) & FLAG_MUNMAP == 0)
end
"""
nextrefname(record::Record)::String
Get the reference name of the mate/next read of `record`.
"""
function nextrefname(record::Record)::String
checkfilled(record)
if ismissing(record, record.rnext)
missingerror(:nextrefname)
end
return String(record.data[record.rnext])
end
function hasnextrefname(record::Record)
return isfilled(record) && !ismissing(record, record.rnext)
end
"""
nextposition(record::Record)::Int
Get the position of the mate/next read of `record`.
"""
function nextposition(record::Record)::Int
checkfilled(record)
pos = unsafe_parse_decimal(Int, record.data, record.pnext)
if pos == 0
missingerror(:nextposition)
end
return pos
end
function hasnextposition(record::Record)
return isfilled(record) && (length(record.pnext) != 1 || record.data[first(record.pnext)] != UInt8('0'))
end
"""
mappingquality(record::Record)::UInt8
Get the mapping quality of `record`.
"""
function mappingquality(record::Record)::UInt8
checkfilled(record)
qual = unsafe_parse_decimal(UInt8, record.data, record.mapq)
if qual == 0xff
missingerror(:mappingquality)
end
return qual
end
function hasmappingquality(record::Record)
return isfilled(record) && unsafe_parse_decimal(UInt8, record.data, record.mapq) != 0xff
end
"""
cigar(record::Record)::String
Get the CIGAR string of `record`.
"""
function cigar(record::Record)::String
checkfilled(record)
if ismissing(record, record.cigar)
missingerror(:cigar)
end
return String(record.data[record.cigar])
end
function hascigar(record::Record)
return isfilled(record) && !ismissing(record, record.cigar)
end
"""
alignment(record::Record)::BioAlignments.Alignment
Get the alignment of `record`.
"""
function alignment(record::Record)::BioAlignments.Alignment
if ismapped(record)
return BioAlignments.Alignment(cigar(record), 1, position(record))
else
return BioAlignments.Alignment(BioAlignments.AlignmentAnchor[])
end
end
function hasalignment(record::Record)
return isfilled(record) && hascigar(record)
end
"""
alignlength(record::Record)::Int
Get the alignment length of `record`.
"""
function alignlength(record::Record)::Int
if length(record.cigar) == 1 && record.data[first(record.cigar)] == UInt8('*')
return 0
end
ret::Int = 0
len = 0 # operation length
for i in record.cigar
c = record.data[i]
if c UInt8('0'):UInt8('9')
len = len * 10 + (c - UInt8('0'))
else
op = convert(BioAlignments.Operation, Char(c))
if BioAlignments.ismatchop(op) || BioAlignments.isdeleteop(op)
ret += len
len = 0
end
end
end
return ret
end
"""
tempname(record::Record)::String
Get the query template name of `record`.
"""
function tempname(record::Record)::String
checkfilled(record)
if ismissing(record, record.qname)
missingerror(:tempname)
end
return String(record.data[record.qname])
end
function hastempname(record::Record)
return isfilled(record) && !ismissing(record, record.qname)
end
"""
templength(record::Record)::Int
Get the template length of `record`.
"""
function templength(record::Record)::Int
checkfilled(record)
len = unsafe_parse_decimal(Int, record.data, record.tlen)
if len == 0
missingerror(:tlen)
end
return len
end
function hastemplength(record::Record)
return isfilled(record) && (length(record.tlen) != 1 || record.data[first(record.tlen)] != UInt8('0'))
end
"""
sequence(record::Record)::BioSequences.DNASequence
Get the segment sequence of `record`.
"""
function sequence(record::Record)::BioSequences.DNASequence
checkfilled(record)
if ismissing(record, record.seq)
missingerror(:sequence)
end
seqlen = length(record.seq)
ret = BioSequences.DNASequence(seqlen)
BioSequences.encode_copy!(ret, 1, record.data, first(record.seq), seqlen)
return ret
end
function hassequence(record::Record)
return isfilled(record) && !ismissing(record, record.seq)
end
"""
sequence(::Type{String}, record::Record)::String
Get the segment sequence of `record` as `String`.
"""
function sequence(::Type{String}, record::Record)::String
checkfilled(record)
return String(record.data[record.seq])
end
"""
seqlength(record::Record)::Int
Get the sequence length of `record`.
"""
function seqlength(record::Record)::Int
checkfilled(record)
if ismissing(record, record.seq)
missingerror(:seq)
end
return length(record.seq)
end
function hasseqlength(record::Record)
return isfilled(record) && !ismissing(record, record.seq)
end
"""
quality(record::Record)::Vector{UInt8}
Get the Phred-scaled base quality of `record`.
"""
function quality(record::Record)::Vector{UInt8}
checkfilled(record)
if ismissing(record, record.qual)
missingerror(:quality)
end
qual = record.data[record.qual]
for i in 1:lastindex(qual)
@inbounds qual[i] -= 33
end
return qual
end
function hasquality(record::Record)
return isfilled(record) && !ismissing(record, record.qual)
end
"""
quality(::Type{String}, record::Record)::String
Get the ASCII-encoded base quality of `record`.
"""
function quality(::Type{String}, record::Record)::String
checkfilled(record)
return String(record.data[record.qual])
end
"""
auxdata(record::Record)::Dict{String,Any}
Get the auxiliary data (optional fields) of `record`.
"""
function auxdata(record::Record)::Dict{String,Any}
checkfilled(record)
return Dict(k => record[k] for k in keys(record))
end
function Base.haskey(record::Record, tag::AbstractString)
return findauxtag(record, tag) > 0
end
function Base.getindex(record::Record, tag::AbstractString)
i = findauxtag(record, tag)
if i == 0
throw(KeyError(tag))
end
field = record.fields[i]
# data type
typ = record.data[first(field)+3]
lo = first(field) + 5
if i == lastindex(record.fields)
hi = last(field)
else
hi = first(record.fields[i+1]) - 2
end
if typ == UInt8('A')
@assert lo == hi
return Char(record.data[lo])
elseif typ == UInt8('i')
return unsafe_parse_decimal(Int, record.data, lo:hi)
elseif typ == UInt8('f')
# TODO: Call a C function directly for speed?
return parse(Float32, SubString(record.data[lo:hi]))
elseif typ == UInt8('Z')
return String(record.data[lo:hi])
elseif typ == UInt8('H')
return parse_hexarray(record.data, lo:hi)
elseif typ == UInt8('B')
return parse_typedarray(record.data, lo:hi)
else
throw(ArgumentError("type code '$(Char(typ))' is not defined"))
end
end
function Base.keys(record::Record)
checkfilled(record)
return [String(record.data[first(f):first(f)+1]) for f in record.fields]
end
function Base.values(record::Record)
return [record[k] for k in keys(record)]
end
# Bio Methods
# -----------
function BioCore.isfilled(record::Record)
return !isempty(record.filled)
end
function BioCore.seqname(record::Record)
return tempname(record)
end
function BioCore.hasseqname(record::Record)
return hastempname(record)
end
function BioCore.sequence(record::Record)
return sequence(record)
end
function BioCore.hassequence(record::Record)
return hassequence(record)
end
function BioCore.rightposition(record::Record)
return rightposition(record)
end
function BioCore.hasrightposition(record::Record)
return hasrightposition(record)
end
function BioCore.leftposition(record::Record)
return position(record)
end
function BioCore.hasleftposition(record::Record)
return hasposition(record)
end
# Helper Functions
# ----------------
function initialize!(record::Record)
record.filled = 1:0
record.qname = 1:0
record.flag = 1:0
record.rname = 1:0
record.pos = 1:0
record.mapq = 1:0
record.cigar = 1:0
record.rnext = 1:0
record.pnext = 1:0
record.tlen = 1:0
record.seq = 1:0
record.qual = 1:0
empty!(record.fields)
return record
end
function checkfilled(record::Record)
if !isfilled(record)
throw(ArgumentError("unfilled SAM record"))
end
end
function findauxtag(record::Record, tag::AbstractString)
checkfilled(record)
if sizeof(tag) != 2
return 0
end
t1, t2 = UInt8(tag[1]), UInt8(tag[2])
for (i, field) in enumerate(record.fields)
p = first(field)
if record.data[p] == t1 && record.data[p+1] == t2
return i
end
end
return 0
end
function parse_hexarray(data::Vector{UInt8}, range::UnitRange{Int})
@assert iseven(length(range))
ret = Vector{UInt8}(length(range) >> 1)
byte2hex(b) = b 0x30:0x39 ? (b - 0x30) : b 0x41:0x46 ? (b - 0x41 + 0x0A) : error("not in [0-9A-F]")
j = 1
for i in first(range):2:last(range)-1
ret[j] = (byte2hex(data[range[i]]) << 4) | byte2hex(data[range[i+1]])
j += 1
end
return ret
end
function parse_typedarray(data::Vector{UInt8}, range::UnitRange{Int})
# format: [cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+
t = data[first(range)]
xs = split(String(data[first(range)+2:last(range)]))
if t == UInt8('c')
return [parse(Int8, x) for x in xs]
elseif t == UInt8('C')
return [parse(UInt8, x) for x in xs]
elseif t == UInt8('s')
return [parse(Int16, x) for x in xs]
elseif t == UInt8('S')
return [parse(UInt16, x) for x in xs]
elseif t == UInt8('i')
return [parse(Int32, x) for x in xs]
elseif t == UInt8('I')
return [parse(UInt32, x) for x in xs]
elseif t == UInt8('f')
return [parse(Float32, x) for x in xs]
else
throw(ArgumentError("type code '$(Char(t))' is not defined"))
end
end
function ismissing(record::Record, range::UnitRange{Int})
return length(range) == 1 && record.data[first(range)] == UInt8('*')
end

23
src/sam/sam.jl Normal file
View file

@ -0,0 +1,23 @@
# SAM File Format
# ===============
module SAM
import Automa
import Automa.RegExp: @re_str
import BioAlignments
import BioCore.Exceptions: missingerror
import BioCore.RecordHelper: unsafe_parse_decimal
import BioCore: BioCore, isfilled
import BioSequences
import BufferedStreams
using Printf: @sprintf
include("flags.jl")
include("metainfo.jl")
include("record.jl")
include("header.jl")
include("reader.jl")
include("writer.jl")
end

43
src/sam/writer.jl Normal file
View file

@ -0,0 +1,43 @@
# SAM Writer
# ==========
"""
Writer(output::IO, header::Header=Header())
Create a data writer of the SAM file format.
# Arguments
* `output`: data sink
* `header=Header()`: SAM header object
"""
mutable struct Writer <: BioCore.IO.AbstractWriter
stream::IO
function Writer(output::IO, header::Header=Header())
writer = new(output)
write(writer, header)
return writer
end
end
function BioCore.IO.stream(writer::Writer)
return writer.stream
end
function Base.write(writer::Writer, header::Header)
n = 0
for metainfo in header
n += write(writer, metainfo)
end
return n
end
function Base.write(writer::Writer, metainfo::MetaInfo)
checkfilled(metainfo)
return write(writer.stream, metainfo, '\n')
end
function Base.write(writer::Writer, record::Record)
checkfilled(record)
return write(writer.stream, record, '\n')
end

426
test/runtests.jl Normal file
View file

@ -0,0 +1,426 @@
using Test
using BioAlignments
using BioSymbols
import BGZFStreams: BGZFStream
import BioCore.Exceptions: MissingFieldException
import BioCore.Testing.get_bio_fmt_specimens
import BioSequences: @dna_str, @aa_str
import GenomicFeatures
import YAML
# Generate a random range within `range`.
function randrange(range)
x = rand(range)
y = rand(range)
if x < y
return x:y
else
return y:x
end
end
@testset "SAM" begin
samdir = joinpath(get_bio_fmt_specimens("master", false, true), "SAM")
@testset "MetaInfo" begin
metainfo = SAM.MetaInfo()
@test !isfilled(metainfo)
@test occursin("not filled", repr(metainfo))
metainfo = SAM.MetaInfo("CO", "some comment (parens)")
@test isfilled(metainfo)
@test string(metainfo) == "@CO\tsome comment (parens)"
@test occursin("CO", repr(metainfo))
@test SAM.tag(metainfo) == "CO"
@test SAM.value(metainfo) == "some comment (parens)"
@test_throws ArgumentError keys(metainfo)
@test_throws ArgumentError values(metainfo)
metainfo = SAM.MetaInfo("HD", ["VN" => "1.0", "SO" => "coordinate"])
@test isfilled(metainfo)
@test string(metainfo) == "@HD\tVN:1.0\tSO:coordinate"
@test occursin("HD", repr(metainfo))
@test SAM.tag(metainfo) == "HD"
@test SAM.value(metainfo) == "VN:1.0\tSO:coordinate"
@test keys(metainfo) == ["VN", "SO"]
@test values(metainfo) == ["1.0", "coordinate"]
@test SAM.keyvalues(metainfo) == ["VN" => "1.0", "SO" => "coordinate"]
@test haskey(metainfo, "VN")
@test haskey(metainfo, "SO")
@test !haskey(metainfo, "GO")
@test metainfo["VN"] == "1.0"
@test metainfo["SO"] == "coordinate"
@test_throws KeyError metainfo["GO"]
end
@testset "Header" begin
header = SAM.Header()
@test isempty(header)
push!(header, SAM.MetaInfo("@HD\tVN:1.0\tSO:coordinate"))
@test !isempty(header)
@test length(header) == 1
push!(header, SAM.MetaInfo("@CO\tsome comment"))
@test length(header) == 2
@test isa(collect(header), Vector{SAM.MetaInfo})
end
@testset "Record" begin
record = SAM.Record()
@test !isfilled(record)
@test !SAM.ismapped(record)
@test repr(record) == "BioAlignments.SAM.Record: <not filled>"
@test_throws ArgumentError SAM.flag(record)
record = SAM.Record("r001\t99\tchr1\t7\t30\t8M2I4M1D3M\t=\t37\t39\tTTAGATAAAGGATACTG\t*")
@test isfilled(record)
@test occursin(r"^BioAlignments.SAM.Record:\n", repr(record))
@test SAM.ismapped(record)
@test SAM.isprimary(record)
@test SAM.hastempname(record)
@test SAM.tempname(record) == "r001"
@test SAM.hasflag(record)
@test SAM.flag(record) === UInt16(99)
@test SAM.hasrefname(record)
@test SAM.refname(record) == "chr1"
@test SAM.hasposition(record)
@test SAM.position(record) === 7
@test SAM.hasmappingquality(record)
@test SAM.mappingquality(record) === UInt8(30)
@test SAM.hascigar(record)
@test SAM.cigar(record) == "8M2I4M1D3M"
@test SAM.hasnextrefname(record)
@test SAM.nextrefname(record) == "="
@test SAM.hasnextposition(record)
@test SAM.nextposition(record) === 37
@test SAM.hastemplength(record)
@test SAM.templength(record) === 39
@test SAM.hassequence(record)
@test SAM.sequence(record) == dna"TTAGATAAAGGATACTG"
@test !SAM.hasquality(record)
@test_throws MissingFieldException SAM.quality(record)
end
@testset "Reader" begin
reader = open(SAM.Reader, joinpath(samdir, "ce#1.sam"))
@test isa(reader, SAM.Reader)
@test eltype(reader) === SAM.Record
# header
h = header(reader)
@test string.(findall(header(reader), "SQ")) == ["@SQ\tSN:CHROMOSOME_I\tLN:1009800"]
# first record
record = SAM.Record()
read!(reader, record)
@test SAM.ismapped(record)
@test SAM.refname(record) == "CHROMOSOME_I"
@test SAM.position(record) == leftposition(record) == 2
@test SAM.rightposition(record) == rightposition(record) == 102
@test SAM.tempname(record) == seqname(record) == "SRR065390.14978392"
@test SAM.sequence(record) == sequence(record) == dna"CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA"
@test SAM.sequence(String, record) == "CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA"
@test SAM.seqlength(record) == 100
@test SAM.quality(record) == (b"#############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC" .- 33)
@test SAM.quality(String, record) == "#############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"
@test SAM.flag(record) == 16
@test SAM.cigar(record) == "27M1D73M"
@test SAM.alignment(record) == Alignment([
AlignmentAnchor( 0, 1, OP_START),
AlignmentAnchor( 27, 28, OP_MATCH),
AlignmentAnchor( 27, 29, OP_DELETE),
AlignmentAnchor(100, 102, OP_MATCH)])
@test record["XG"] == 1
@test record["XM"] == 5
@test record["XN"] == 0
@test record["XO"] == 1
@test record["AS"] == -18
@test record["XS"] == -18
@test record["YT"] == "UU"
@test eof(reader)
close(reader)
# iterator
@test length(collect(open(SAM.Reader, joinpath(samdir, "ce#1.sam")))) == 1
@test length(collect(open(SAM.Reader, joinpath(samdir, "ce#2.sam")))) == 2
# IOStream
@test length(collect(SAM.Reader(open(joinpath(samdir, "ce#1.sam"))))) == 1
@test length(collect(SAM.Reader(open(joinpath(samdir, "ce#2.sam"))))) == 2
end
@testset "Round trip" begin
function compare_records(xs, ys)
if length(xs) != length(ys)
return false
end
for (x, y) in zip(xs, ys)
if x.data[x.filled] != y.data[y.filled]
return false
end
end
return true
end
for specimen in YAML.load_file(joinpath(samdir, "index.yml"))
filepath = joinpath(samdir, specimen["filename"])
mktemp() do path, io
# copy
reader = open(SAM.Reader, filepath)
writer = SAM.Writer(io, header(reader))
records = SAM.Record[]
for record in reader
push!(records, record)
write(writer, record)
end
close(reader)
close(writer)
@test compare_records(open(collect, SAM.Reader, path), records)
end
end
end
end
@testset "BAM" begin
bamdir = joinpath(get_bio_fmt_specimens("master", false), "BAM")
@testset "AuxData" begin
auxdata = BAM.AuxData(UInt8[])
@test isempty(auxdata)
buf = IOBuffer()
write(buf, "NM", UInt8('s'), Int16(1))
auxdata = BAM.AuxData(take!(buf))
@test length(auxdata) == 1
@test auxdata["NM"] === Int16(1)
@test collect(auxdata) == ["NM" => Int16(1)]
buf = IOBuffer()
write(buf, "AS", UInt8('c'), Int8(-18))
write(buf, "NM", UInt8('s'), Int16(1))
write(buf, "XA", UInt8('f'), Float32(3.14))
write(buf, "XB", UInt8('Z'), "some text\0")
write(buf, "XC", UInt8('B'), UInt8('i'), Int32(3), Int32[10, -5, 8])
auxdata = BAM.AuxData(take!(buf))
@test length(auxdata) == 5
@test auxdata["AS"] === Int8(-18)
@test auxdata["NM"] === Int16(1)
@test auxdata["XA"] === Float32(3.14)
@test auxdata["XB"] == "some text"
@test auxdata["XC"] == Int32[10, -5, 8]
@test convert(Dict{String,Any}, auxdata) == Dict(
"AS" => Int8(-18),
"NM" => Int16(1),
"XA" => Float32(3.14),
"XB" => "some text",
"XC" => Int32[10, -5, 8])
end
@testset "Record" begin
record = BAM.Record()
@test !isfilled(record)
@test repr(record) == "BioAlignments.BAM.Record: <not filled>"
@test_throws ArgumentError BAM.flag(record)
end
@testset "Reader" begin
reader = open(BAM.Reader, joinpath(bamdir, "ce#1.bam"))
@test isa(reader, BAM.Reader)
@test eltype(reader) === BAM.Record
@test startswith(repr(reader), "BioAlignments.BAM.Reader{IOStream}:")
# header
h = header(reader)
@test isa(h, SAM.Header)
# first record
record = BAM.Record()
read!(reader, record)
@test BAM.ismapped(record)
@test BAM.isprimary(record)
@test ! BAM.ispositivestrand(record)
@test BAM.refname(record) == "CHROMOSOME_I"
@test BAM.refid(record) === 1
@test BAM.hasnextrefid(record)
@test BAM.nextrefid(record) === 0
@test BAM.hasposition(record) === hasleftposition(record) === true
@test BAM.position(record) === leftposition(record) === 2
@test BAM.hasnextposition(record)
@test BAM.nextposition(record) === 0
@test rightposition(record) == 102
@test BAM.hastempname(record) === hasseqname(record) === true
@test BAM.tempname(record) == seqname(record) == "SRR065390.14978392"
@test BAM.hassequence(record) === hassequence(record) === true
@test BAM.sequence(record) == sequence(record) == dna"""
CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCT
AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA
"""
@test BAM.seqlength(record) === 100
@test BAM.hasquality(record)
@test eltype(BAM.quality(record)) == UInt8
@test BAM.quality(record) == [Int(x) - 33 for x in "#############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"]
@test BAM.flag(record) === UInt16(16)
@test BAM.cigar(record) == "27M1D73M"
@test BAM.alignment(record) == Alignment([
AlignmentAnchor( 0, 1, OP_START),
AlignmentAnchor( 27, 28, OP_MATCH),
AlignmentAnchor( 27, 29, OP_DELETE),
AlignmentAnchor(100, 102, OP_MATCH)])
@test record["XG"] == 1
@test record["XM"] == 5
@test record["XN"] == 0
@test record["XO"] == 1
@test record["AS"] == -18
@test record["XS"] == -18
@test record["YT"] == "UU"
@test keys(record) == ["XG","XM","XN","XO","AS","XS","YT"]
@test values(record) == [1, 5, 0, 1, -18, -18, "UU"]
@test eof(reader)
close(reader)
# Test conversion from byte array to record
dsize = BAM.data_size(record)
array = Vector{UInt8}(undef, BAM.FIXED_FIELDS_BYTES + dsize)
GC.@preserve array record begin
ptr = Ptr{UInt8}(pointer_from_objref(record))
unsafe_copyto!(pointer(array), ptr, BAM.FIXED_FIELDS_BYTES)
unsafe_copyto!(array, BAM.FIXED_FIELDS_BYTES + 1, record.data, 1, dsize)
end
new_record = convert(BAM.Record, array)
@test record.bin_mq_nl == new_record.bin_mq_nl
@test record.block_size == new_record.block_size
@test record.flag_nc == new_record.flag_nc
@test record.l_seq == new_record.l_seq
@test record.next_refid == new_record.next_refid
@test record.next_pos == new_record.next_pos
@test record.refid == new_record.refid
@test record.pos == new_record.pos
@test record.tlen == new_record.tlen
@test record.data == new_record.data
# iterator
@test length(collect(open(BAM.Reader, joinpath(bamdir, "ce#1.bam")))) == 1
@test length(collect(open(BAM.Reader, joinpath(bamdir, "ce#2.bam")))) == 2
# IOStream
@test length(collect(BAM.Reader(open(joinpath(bamdir, "ce#1.bam"))))) == 1
@test length(collect(BAM.Reader(open(joinpath(bamdir, "ce#2.bam"))))) == 2
end
@testset "Read long CIGARs" begin
function get_cigar_lens(rec::BAM.Record)
cigar_ops, cigar_n = BAM.cigar_rle(rec)
field_ops, field_n = BAM.cigar_rle(rec, false)
cigar_l = length(cigar_ops)
field_l = length(field_ops)
return cigar_l, field_l
end
function check_cigar_vs_field(rec::BAM.Record)
cigar = BAM.cigar(rec)
field = BAM.cigar(rec, false)
cigar_l, field_l = get_cigar_lens(rec)
return cigar != field && cigar_l != field_l
end
function check_cigar_lens(rec::BAM.Record, field_len, cigar_len)
cigar_l, field_l = get_cigar_lens(rec)
return cigar_l == cigar_len && field_l == field_len
end
reader = open(BAM.Reader, joinpath(bamdir, "cigar-64k.bam"))
rec = BAM.Record()
read!(reader, rec)
@test !check_cigar_vs_field(rec)
read!(reader, rec)
@test check_cigar_vs_field(rec)
@test check_cigar_lens(rec, 2, 72091)
end
function compare_records(xs, ys)
if length(xs) != length(ys)
return false
end
for (x, y) in zip(xs, ys)
if !(
x.block_size == y.block_size &&
x.refid == y.refid &&
x.pos == y.pos &&
x.bin_mq_nl == y.bin_mq_nl &&
x.flag_nc == y.flag_nc &&
x.l_seq == y.l_seq &&
x.next_refid == y.next_refid &&
x.next_pos == y.next_pos &&
x.tlen == y.tlen &&
x.data[1:BAM.data_size(x)] == y.data[1:BAM.data_size(y)])
return false
end
end
return true
end
@testset "Round trip" begin
for specimen in YAML.load_file(joinpath(bamdir, "index.yml"))
filepath = joinpath(bamdir, specimen["filename"])
mktemp() do path, _
# copy
if occursin("bai", get(specimen, "tags", ""))
reader = open(BAM.Reader, filepath, index=filepath * ".bai")
else
reader = open(BAM.Reader, filepath)
end
writer = BAM.Writer(
BGZFStream(path, "w"), BAM.header(reader, fillSQ=isempty(findall(header(reader), "SQ"))))
records = BAM.Record[]
for record in reader
push!(records, record)
write(writer, record)
end
close(reader)
close(writer)
@test compare_records(open(collect, BAM.Reader, path), records)
end
end
end
@testset "Random access" begin
filepath = joinpath(bamdir, "GSE25840_GSM424320_GM06985_gencode_spliced.head.bam")
reader = open(BAM.Reader, filepath, index=filepath * ".bai")
@test isa(eachoverlap(reader, "chr1", 1:100), BAM.OverlapIterator)
@test isa(eachoverlap(reader, GenomicFeatures.Interval("chr1", 1, 100)), BAM.OverlapIterator)
# expected values are counted using samtools
for (refname, interval, expected) in [
("chr1", 1_000:10000, 21),
("chr1", 8_000:10000, 20),
("chr1", 766_000:800_000, 142),
("chr1", 786_000:800_000, 1),
("chr1", 796_000:800_000, 0)]
intsect = eachoverlap(reader, refname, interval)
@test eltype(intsect) == BAM.Record
@test count(_ -> true, intsect) == expected
# check that the intersection iterator is stateless
@test count(_ -> true, intsect) == expected
end
# randomized tests
for n in 1:50
refindex = 1
refname = "chr1"
range = randrange(1:1_000_000)
seekstart(reader)
# linear scan
expected = filter(collect(reader)) do record
BAM.compare_intervals(record, (refindex, range)) == 0
end
# indexed scan
actual = collect(eachoverlap(reader, refname, range))
@test compare_records(actual, expected)
end
close(reader)
filepath = joinpath(bamdir, "R_12h_D06.uniq.q40.bam")
reader = open(BAM.Reader, filepath, index=filepath * ".bai")
@test isempty(collect(eachoverlap(reader, "chr19", 5823708:5846478)))
close(reader)
end
end