From 7a56931b90a17986631e4b6c9373845a9a02109d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Sat, 18 Jan 2020 05:24:09 +1100 Subject: [PATCH] Naive code roundup --- src/XAM.jl | 7 +- src/bam/auxdata.jl | 153 +++++++++++ src/bam/bai.jl | 55 ++++ src/bam/bam.jl | 19 ++ src/bam/overlap.jl | 95 +++++++ src/bam/reader.jl | 145 ++++++++++ src/bam/record.jl | 634 ++++++++++++++++++++++++++++++++++++++++++++ src/bam/writer.jl | 62 +++++ src/sam/flags.jl | 26 ++ src/sam/header.jl | 53 ++++ src/sam/metainfo.jl | 235 ++++++++++++++++ src/sam/reader.jl | 265 ++++++++++++++++++ src/sam/record.jl | 624 +++++++++++++++++++++++++++++++++++++++++++ src/sam/sam.jl | 23 ++ src/sam/writer.jl | 43 +++ test/runtests.jl | 426 +++++++++++++++++++++++++++++ 16 files changed, 2864 insertions(+), 1 deletion(-) create mode 100644 src/bam/auxdata.jl create mode 100644 src/bam/bai.jl create mode 100644 src/bam/bam.jl create mode 100644 src/bam/overlap.jl create mode 100644 src/bam/reader.jl create mode 100644 src/bam/record.jl create mode 100644 src/bam/writer.jl create mode 100644 src/sam/flags.jl create mode 100644 src/sam/header.jl create mode 100644 src/sam/metainfo.jl create mode 100644 src/sam/reader.jl create mode 100644 src/sam/record.jl create mode 100644 src/sam/sam.jl create mode 100644 src/sam/writer.jl create mode 100644 test/runtests.jl diff --git a/src/XAM.jl b/src/XAM.jl index 5305716..7fc5234 100644 --- a/src/XAM.jl +++ b/src/XAM.jl @@ -1,5 +1,10 @@ module XAM -greet() = print("Hello World!") +export + BAM, + SAM + +include("sam/sam.jl") +include("bam/bam.jl") end # module diff --git a/src/bam/auxdata.jl b/src/bam/auxdata.jl new file mode 100644 index 0000000..2bcfac2 --- /dev/null +++ b/src/bam/auxdata.jl @@ -0,0 +1,153 @@ +# BAM Auxiliary Data +# ================== + +struct AuxData <: AbstractDict{String,Any} + data::Vector{UInt8} +end + +function Base.getindex(aux::AuxData, tag::AbstractString) + checkauxtag(tag) + return getauxvalue(aux.data, 1, length(aux.data), UInt8(tag[1]), UInt8(tag[2])) +end + +function Base.length(aux::AuxData) + data = aux.data + p = 1 + len = 0 + while p ≤ length(data) + len += 1 + p = next_tag_position(data, p) + end + return len +end + +function Base.iterate(aux::AuxData, pos=1) + if pos > length(aux.data) + return nothing + end + + data = aux.data + @label doit + t1 = data[pos] + t2 = data[pos+1] + pos, typ = loadauxtype(data, pos + 2) + pos, value = loadauxvalue(data, pos, typ) + if t1 == t2 == 0xff + @goto doit + end + return Pair{String,Any}(String([t1, t2]), value), pos +end + + +# Internals +# --------- + +function checkauxtag(tag::AbstractString) + if sizeof(tag) != 2 + throw(ArgumentError("tag length must be 2")) + end +end + +function getauxvalue(data::Vector{UInt8}, start::Int, stop::Int, t1::UInt8, t2::UInt8) + pos = findauxtag(data, start, stop, t1, t2) + if pos == 0 + throw(KeyError(String([t1, t2]))) + end + pos, T = loadauxtype(data, pos + 2) + _, val = loadauxvalue(data, pos, T) + return val +end + +function loadauxtype(data::Vector{UInt8}, p::Int) + function auxtype(b) + return ( + b == UInt8('A') ? Char : + b == UInt8('c') ? Int8 : + b == UInt8('C') ? UInt8 : + b == UInt8('s') ? Int16 : + b == UInt8('S') ? UInt16 : + b == UInt8('i') ? Int32 : + b == UInt8('I') ? UInt32 : + b == UInt8('f') ? Float32 : + b == UInt8('Z') ? String : + error("invalid type tag: '$(Char(b))'")) + end + t = data[p] + if t == UInt8('B') + return p + 2, Vector{auxtype(data[p+1])} + else + return p + 1, auxtype(t) + end +end + +function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{T}) where T + return p + sizeof(T), unsafe_load(Ptr{T}(pointer(data, p))) +end + +function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{Char}) + return p + 1, Char(unsafe_load(pointer(data, p))) +end + +function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{Vector{T}}) where T + n = unsafe_load(Ptr{Int32}(pointer(data, p))) + p += 4 + xs = Vector{T}(undef, n) + unsafe_copyto!(pointer(xs), Ptr{T}(pointer(data, p)), n) + return p + n * sizeof(T), xs +end + +function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{String}) + dataptr = pointer(data, p) + endptr = ccall(:memchr, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), dataptr, '\0', length(data) - p + 1) + q::Int = p + (endptr - dataptr) - 1 + return q + 2, String(data[p:q]) +end + +function findauxtag(data::Vector{UInt8}, start::Int, stop::Int, t1::UInt8, t2::UInt8) + pos = start + while pos ≤ stop && !(data[pos] == t1 && data[pos+1] == t2) + pos = next_tag_position(data, pos) + end + if pos > stop + return 0 + else + return pos + end +end + +# Find the starting position of a next tag in `data` after `p`. +# `(data[p], data[p+1])` is supposed to be a current tag. +function next_tag_position(data::Vector{UInt8}, p::Int) + typ = Char(data[p+2]) + p += 3 + if typ == 'A' + p += 1 + elseif typ == 'c' || typ == 'C' + p += 1 + elseif typ == 's' || typ == 'S' + p += 2 + elseif typ == 'i' || typ == 'I' + p += 4 + elseif typ == 'f' + p += 4 + elseif typ == 'd' + p += 8 + elseif typ == 'Z' || typ == 'H' + while data[p] != 0x00 # NULL-terminalted string + p += 1 + end + p += 1 + elseif typ == 'B' + eltyp = Char(data[p]) + elsize = eltyp == 'c' || eltyp == 'C' ? 1 : + eltyp == 's' || eltyp == 'S' ? 2 : + eltyp == 'i' || eltye == 'I' || eltyp == 'f' ? 4 : + error("invalid type tag: '$(Char(eltyp))'") + p += 1 + n = unsafe_load(Ptr{Int32}(pointer(data, p))) + p += 4 + elsize * n + else + error("invalid type tag: '$(Char(typ))'") + end + return p +end diff --git a/src/bam/bai.jl b/src/bam/bai.jl new file mode 100644 index 0000000..e5caf0a --- /dev/null +++ b/src/bam/bai.jl @@ -0,0 +1,55 @@ +# BAI +# === +# +# Index for BAM files. + + +# An index type for the BAM file format. +struct BAI + # BGZF file index + index::GenomicFeatures.Indexes.BGZFIndex + + # number of unmapped reads + n_no_coors::Union{Nothing, Int} +end + +""" + BAI(filename::AbstractString) + +Load a BAI index from `filename`. +""" +function BAI(filename::AbstractString) + return open(read_bai, filename) +end + +""" + BAI(input::IO) + +Load a BAI index from `input`. +""" +function BAI(input::IO) + return read_bai(input) +end + +# Read a BAI object from `input`. +function read_bai(input::IO) + # check magic bytes + B = read(input, UInt8) + A = read(input, UInt8) + I = read(input, UInt8) + x = read(input, UInt8) + if B != UInt8('B') || A != UInt8('A') || I != UInt8('I') || x != 0x01 + error("input is not a valid BAI file") + end + + # read contents + n_refs = read(input, Int32) + index = GenomicFeatures.Indexes.read_bgzfindex(input, n_refs) + if !eof(input) + n_no_coors = read(input, UInt64) + else + n_no_coors = nothing + end + + return BAI(index, n_no_coors) +end diff --git a/src/bam/bam.jl b/src/bam/bam.jl new file mode 100644 index 0000000..f3317f6 --- /dev/null +++ b/src/bam/bam.jl @@ -0,0 +1,19 @@ +# BAM File Format +# =============== + +module BAM + +import BGZFStreams +import BioAlignments: BioAlignments, SAM +import GenomicFeatures: GenomicFeatures, Interval +import BioSequences +import BioCore: BioCore, isfilled + +include("bai.jl") +include("auxdata.jl") +include("reader.jl") +include("record.jl") +include("writer.jl") +include("overlap.jl") + +end diff --git a/src/bam/overlap.jl b/src/bam/overlap.jl new file mode 100644 index 0000000..10641d3 --- /dev/null +++ b/src/bam/overlap.jl @@ -0,0 +1,95 @@ +# BAM Overlap +# =========== + +struct OverlapIterator{T} + reader::Reader{T} + refname::String + interval::UnitRange{Int} +end + +function Base.IteratorSize(::Type{OverlapIterator{T}}) where T + return Base.SizeUnknown() +end + +function Base.eltype(::Type{OverlapIterator{T}}) where T + return Record +end + +function GenomicFeatures.eachoverlap(reader::Reader, interval::Interval) + return GenomicFeatures.eachoverlap(reader, interval.seqname, interval.first:interval.last) +end + +function GenomicFeatures.eachoverlap(reader::Reader, interval) + return GenomicFeatures.eachoverlap(reader, convert(Interval, interval)) +end + +function GenomicFeatures.eachoverlap(reader::Reader, refname::AbstractString, interval::UnitRange) + return OverlapIterator(reader, String(refname), interval) +end + + +# Iterator +# -------- + +mutable struct OverlapIteratorState + # reference index + refindex::Int + + # possibly overlapping chunks + chunks::Vector{GenomicFeatures.Indexes.Chunk} + + # current chunk index + chunkid::Int + + # pre-allocated record + record::Record +end + +function Base.iterate(iter::OverlapIterator) + refindex = findfirst(isequal(iter.refname), iter.reader.refseqnames) + if refindex == 0 + throw(ArgumentError("sequence name $(iter.refname) is not found in the header")) + end + @assert iter.reader.index !== nothing + chunks = GenomicFeatures.Indexes.overlapchunks(iter.reader.index.index, refindex, iter.interval) + if !isempty(chunks) + seek(iter.reader, first(chunks).start) + end + state = OverlapIteratorState(refindex, chunks, 1, Record()) + return iterate(iter, state) +end + +function Base.iterate(iter::OverlapIterator, state) + while state.chunkid ≤ lastindex(state.chunks) + chunk = state.chunks[state.chunkid] + while BGZFStreams.virtualoffset(iter.reader.stream) < chunk.stop + read!(iter.reader, state.record) + c = compare_intervals(state.record, (state.refindex, iter.interval)) + if c == 0 + return copy(state.record), state + elseif c > 0 + # no more overlapping records in this chunk since records are sorted + break + end + end + state.chunkid += 1 + if state.chunkid ≤ lastindex(state.chunks) + seek(iter.reader, state.chunks[state.chunkid].start) + end + end + return nothing +end + +function compare_intervals(record::Record, interval::Tuple{Int,UnitRange{Int}}) + rid = refid(record) + if rid < interval[1] || (rid == interval[1] && rightposition(record) < first(interval[2])) + # strictly left + return -1 + elseif rid > interval[1] || (rid == interval[1] && position(record) > last(interval[2])) + # strictly right + return +1 + else + # overlapping + return 0 + end +end diff --git a/src/bam/reader.jl b/src/bam/reader.jl new file mode 100644 index 0000000..5e92b8e --- /dev/null +++ b/src/bam/reader.jl @@ -0,0 +1,145 @@ +# BAM Reader +# ========== + +""" + BAM.Reader(input::IO; index=nothing) + +Create a data reader of the BAM file format. + +# Arguments +* `input`: data source +* `index=nothing`: filepath to a random access index (currently *bai* is supported) +""" +mutable struct Reader{T} <: BioCore.IO.AbstractReader + stream::BGZFStreams.BGZFStream{T} + header::SAM.Header + start_offset::BGZFStreams.VirtualOffset + refseqnames::Vector{String} + refseqlens::Vector{Int} + index::Union{Nothing, BAI} +end + +function Base.eltype(::Type{Reader{T}}) where T + return Record +end + +function BioCore.IO.stream(reader::Reader) + return reader.stream +end + +function Reader(input::IO; index=nothing) + if isa(index, AbstractString) + index = BAI(index) + else + if index != nothing + error("unrecognizable index argument") + end + end + reader = init_bam_reader(input) + reader.index = index + return reader +end + +function Base.show(io::IO, reader::Reader) + println(io, summary(reader), ":") + print(io, " number of contigs: ", length(reader.refseqnames)) +end + +""" + header(reader::Reader; fillSQ::Bool=false)::SAM.Header + +Get the header of `reader`. + +If `fillSQ` is `true`, this function fills missing "SQ" metainfo in the header. +""" +function header(reader::Reader; fillSQ::Bool=false)::SAM.Header + header = reader.header + if fillSQ + if !isempty(findall(reader.header, "SQ")) + throw(ArgumentError("SAM header already has SQ records")) + end + header = copy(header) + for (name, len) in zip(reader.refseqnames, reader.refseqlens) + push!(header, SAM.MetaInfo("SQ", ["SN" => name, "LN" => len])) + end + end + return header +end + +function BioCore.header(reader::Reader) + return header(reader) +end + +function Base.seek(reader::Reader, voffset::BGZFStreams.VirtualOffset) + seek(reader.stream, voffset) +end + +function Base.seekstart(reader::Reader) + seek(reader.stream, reader.start_offset) +end + +function Base.iterate(reader::Reader, rec=Record()) + if eof(reader) + return nothing + end + read!(reader, rec) + return copy(rec), rec +end + +# Initialize a BAM reader by reading the header section. +function init_bam_reader(input::BGZFStreams.BGZFStream) + # magic bytes + B = read(input, UInt8) + A = read(input, UInt8) + M = read(input, UInt8) + x = read(input, UInt8) + if B != UInt8('B') || A != UInt8('A') || M != UInt8('M') || x != 0x01 + error("input was not a valid BAM file") + end + + # SAM header + textlen = read(input, Int32) + samreader = SAM.Reader(IOBuffer(read(input, textlen))) + + # reference sequences + refseqnames = String[] + refseqlens = Int[] + n_refs = read(input, Int32) + for _ in 1:n_refs + namelen = read(input, Int32) + data = read(input, namelen) + seqname = unsafe_string(pointer(data)) + seqlen = read(input, Int32) + push!(refseqnames, seqname) + push!(refseqlens, seqlen) + end + + voffset = isa(input.io, Base.AbstractPipe) ? + BGZFStreams.VirtualOffset(0, 0) : + BGZFStreams.virtualoffset(input) + return Reader( + input, + samreader.header, + voffset, + refseqnames, + refseqlens, + nothing) +end + +function init_bam_reader(input::IO) + return init_bam_reader(BGZFStreams.BGZFStream(input)) +end + +function _read!(reader::Reader, record) + unsafe_read( + reader.stream, + pointer_from_objref(record), + FIXED_FIELDS_BYTES) + dsize = data_size(record) + if length(record.data) < dsize + resize!(record.data, dsize) + end + unsafe_read(reader.stream, pointer(record.data), dsize) + record.reader = reader + return record +end diff --git a/src/bam/record.jl b/src/bam/record.jl new file mode 100644 index 0000000..31226d2 --- /dev/null +++ b/src/bam/record.jl @@ -0,0 +1,634 @@ +# BAM Record +# ========== + +""" + BAM.Record() + +Create an unfilled BAM record. +""" +mutable struct Record + # fixed-length fields (see BMA specs for the details) + block_size::Int32 + refid::Int32 + pos::Int32 + bin_mq_nl::UInt32 + flag_nc::UInt32 + l_seq::Int32 + next_refid::Int32 + next_pos::Int32 + tlen::Int32 + # variable length data + data::Vector{UInt8} + reader::Reader + + function Record() + return new(0, 0, 0, 0, 0, 0, 0, 0, 0, UInt8[]) + end +end + +# the data size of fixed-length fields (block_size-tlen) +const FIXED_FIELDS_BYTES = 36 + +function Record(data::Vector{UInt8}) + return convert(Record, data) +end + +function Base.convert(::Type{Record}, data::Vector{UInt8}) + length(data) < FIXED_FIELDS_BYTES && throw(ArgumentError("data too short")) + record = Record() + dst_pointer = Ptr{UInt8}(pointer_from_objref(record)) + unsafe_copyto!(dst_pointer, pointer(data), FIXED_FIELDS_BYTES) + dsize = data_size(record) + resize!(record.data, dsize) + length(data) < dsize + FIXED_FIELDS_BYTES && throw(ArgumentError("data too short")) + unsafe_copyto!(record.data, 1, data, FIXED_FIELDS_BYTES + 1, dsize) + return record +end + +function Base.copy(record::Record) + copy = Record() + copy.block_size = record.block_size + copy.refid = record.refid + copy.pos = record.pos + copy.bin_mq_nl = record.bin_mq_nl + copy.flag_nc = record.flag_nc + copy.l_seq = record.l_seq + copy.next_refid = record.next_refid + copy.next_pos = record.next_pos + copy.tlen = record.tlen + copy.data = record.data[1:data_size(record)] + if isdefined(record, :reader) + copy.reader = record.reader + end + return copy +end + +function Base.show(io::IO, record::Record) + print(io, summary(record), ':') + if isfilled(record) + println(io) + println(io, " template name: ", tempname(record)) + println(io, " flag: ", flag(record)) + println(io, " reference ID: ", refid(record)) + println(io, " position: ", position(record)) + println(io, " mapping quality: ", mappingquality(record)) + println(io, " CIGAR: ", cigar(record)) + println(io, " next reference ID: ", nextrefid(record)) + println(io, " next position: ", nextposition(record)) + println(io, " template length: ", templength(record)) + println(io, " sequence: ", sequence(record)) + # TODO: pretty print base quality + println(io, " base quality: ", quality(record)) + print(io, " auxiliary data:") + for field in keys(auxdata(record)) + print(io, ' ', field, '=', record[field]) + end + else + print(io, " ") + end +end + +function Base.read!(reader::Reader, record::Record) + return _read!(reader, record) +end + + +# Accessor Fuctions +# ----------------- + +""" + flag(record::Record)::UInt16 + +Get the bitwise flag of `record`. +""" +function flag(record::Record)::UInt16 + checkfilled(record) + return UInt16(record.flag_nc >> 16) +end + +function hasflag(record::Record) + return isfilled(record) +end + +""" + ismapped(record::Record)::Bool + +Test if `record` is mapped. +""" +function ismapped(record::Record)::Bool + return flag(record) & SAM.FLAG_UNMAP == 0 +end + +""" + isprimary(record::Record)::Bool + +Test if `record` is a primary line of the read. + +This is equivalent to `flag(record) & 0x900 == 0`. +""" +function isprimary(record::Record)::Bool + return flag(record) & 0x900 == 0 +end + +""" + ispositivestrand(record::Record)::Bool + +Test if `record` is aligned to the positive strand. + +This is equivalent to `flag(record) & 0x10 == 0`. +""" +function ispositivestrand(record::Record)::Bool + flag(record) & 0x10 == 0 +end + +""" + refid(record::Record)::Int + +Get the reference sequence ID of `record`. + +The ID is 1-based (i.e. the first sequence is 1) and is 0 for a record without a mapping position. + +See also: `BAM.rname` +""" +function refid(record::Record)::Int + checkfilled(record) + return record.refid + 1 +end + +function hasrefid(record::Record) + return isfilled(record) +end + +function checked_refid(record::Record) + id = refid(record) + if id == 0 + throw(ArgumentError("record is not mapped")) + elseif !isdefined(record, :reader) + throw(ArgumentError("reader is not defined")) + end + return id +end + +""" + refname(record::Record)::String + +Get the reference sequence name of `record`. + +See also: `BAM.refid` +""" +function refname(record::Record)::String + checkfilled(record) + id = checked_refid(record) + return record.reader.refseqnames[id] +end + +""" + reflen(record::Record)::Int + +Get the length of the reference sequence this record applies to. +""" +function reflen(record::Record)::Int + checkfilled(record) + id = checked_refid(record) + return record.reader.refseqlens[id] +end + +function hasrefname(record::Record) + return hasrefid(record) +end + +""" + position(record::Record)::Int + +Get the 1-based leftmost mapping position of `record`. +""" +function position(record::Record)::Int + checkfilled(record) + return record.pos + 1 +end + +function hasposition(record::Record) + return isfilled(record) +end + +""" + rightposition(record::Record)::Int + +Get the 1-based rightmost mapping position of `record`. +""" +function rightposition(record::Record)::Int + checkfilled(record) + return Int32(position(record) + alignlength(record) - 1) +end + +function hasrightposition(record::Record) + return isfilled(record) && ismapped(record) +end + +""" + isnextmapped(record::Record)::Bool + +Test if the mate/next read of `record` is mapped. +""" +function isnextmapped(record::Record)::Bool + return isfilled(record) && (flag(record) & FLAG_MUNMAP == 0) +end + +""" + nextrefid(record::Record)::Int + +Get the next/mate reference sequence ID of `record`. +""" +function nextrefid(record::Record)::Int + checkfilled(record) + return record.next_refid + 1 +end + +function hasnextrefid(record::Record) + return isfilled(record) +end + +""" + nextrefname(record::Record)::String + +Get the reference name of the mate/next read of `record`. +""" +function nextrefname(record::Record)::String + checkfilled(record) + id = nextrefid(record) + if id == 0 + throw(ArgumentError("next record is not mapped")) + elseif !isdefined(record, :reader) + throw(ArgumentError("reader is not defined")) + end + return record.reader.refseqnames[id] +end + +function hasnextrefname(record::Record) + return isfilled(record) && isnextmapped(record) +end + +""" + nextposition(record::Record)::Int + +Get the 1-based leftmost mapping position of the next/mate read of `record`. +""" +function nextposition(record::Record)::Int + checkfilled(record) + return record.next_pos + 1 +end + +function hasnextposition(record::Record) + return isfilled(record) +end + +""" + mappingquality(record::Record)::UInt8 + +Get the mapping quality of `record`. +""" +function mappingquality(record::Record)::UInt8 + return UInt8((record.bin_mq_nl >> 8) & 0xff) +end + +function hasmappingquality(record::Record) + return isfilled(record) +end + +""" + n_cigar_op(record::Record, checkCG::Bool = true) + +Return the number of operations in the CIGAR string of `record`. + +Note that in the BAM specification, the field called `cigar` typically stores the cigar string of the record. +However, this is not always true, sometimes the true cigar is very long, and due to some constraints of the BAM format, the actual cigar string is stored in an extra tag: `CG:B,I`, and the `cigar` field stores a pseudo-cigar string. + +Calling this method with `checkCG` set to `true` (default) this method will always yield the number of operations in the true cigar string, because this is probably what you want, the vast majority of the time. + +If you have a record that stores the true cigar in a `CG:B,I` tag, but you still want to get the number of operations in the `cigar` field of the BAM record, then set `checkCG` to `false`. +""" +function n_cigar_op(record::Record, checkCG::Bool = true) + return cigar_position(record, checkCG)[2] +end + +""" + cigar(record::Record)::String + +Get the CIGAR string of `record`. + +Note that in the BAM specification, the field called `cigar` typically stores the cigar string of the record. +However, this is not always true, sometimes the true cigar is very long, and due to some constraints of the BAM format, the actual cigar string is stored in an extra tag: `CG:B,I`, and the `cigar` field stores a pseudo-cigar string. + +Calling this method with `checkCG` set to `true` (default) this method will always yield the true cigar string, because this is probably what you want the vast majority of the time. + +If you have a record that stores the true cigar in a `CG:B,I` tag, but you still want to access the pseudo-cigar that is stored in the `cigar` field of the BAM record, then you can set checkCG to `false`. + +See also `BAM.cigar_rle`. +""" +function cigar(record::Record, checkCG::Bool = true)::String + buf = IOBuffer() + for (op, len) in zip(cigar_rle(record, checkCG)...) + print(buf, len, convert(Char, op)) + end + return String(take!(buf)) +end + +""" + cigar_rle(record::Record, checkCG::Bool = true)::Tuple{Vector{BioAlignments.Operation},Vector{Int}} + +Get a run-length encoded tuple `(ops, lens)` of the CIGAR string in `record`. + +Note that in the BAM specification, the field called `cigar` typically stores the cigar string of the record. +However, this is not always true, sometimes the true cigar is very long, and due to some constraints of the BAM format, the actual cigar string is stored in an extra tag: `CG:B,I`, and the `cigar` field stores a pseudo-cigar string. + +Calling this method with `checkCG` set to `true` (default) this method will always yield the true cigar string, because this is probably what you want the vast majority of the time. + +If you have a record that stores the true cigar in a `CG:B,I` tag, but you still want to access the pseudo-cigar that is stored in the `cigar` field of the BAM record, then you can set checkCG to `false`. + +See also `BAM.cigar`. +""" +function cigar_rle(record::Record, checkCG::Bool = true)::Tuple{Vector{BioAlignments.Operation},Vector{Int}} + checkfilled(record) + idx, nops = cigar_position(record, checkCG) + ops, lens = extract_cigar_rle(record.data, idx, nops) + return ops, lens +end + +function extract_cigar_rle(data::Vector{UInt8}, offset, n) + ops = Vector{BioAlignments.Operation}() + lens = Vector{Int}() + for i in offset:4:offset + (n - 1) * 4 + x = unsafe_load(Ptr{UInt32}(pointer(data, i))) + op = BioAlignments.Operation(x & 0x0F) + push!(ops, op) + push!(lens, x >> 4) + end + return ops, lens +end + +function cigar_position(record::Record, checkCG::Bool = true)::Tuple{Int, Int} + cigaridx, nops = seqname_length(record) + 1, record.flag_nc & 0xFFFF + if !checkCG + return cigaridx, nops + end + if nops != 2 + return cigaridx, nops + end + x = unsafe_load(Ptr{UInt32}(pointer(record.data, cigaridx))) + if x != UInt32(seqlength(record) << 4 | 4) + return cigaridx, nops + end + start = auxdata_position(record) + stop = data_size(record) + tagidx = findauxtag(record.data, start, stop, UInt8('C'), UInt8('G')) + if tagidx == 0 + return cigaridx, nops + end + # Tag exists, validate type is BI. + typ = unsafe_load(Ptr{UInt16}(pointer(record.data, tagidx += 2))) + if typ != (UInt16('I') << 8 | UInt16('B')) + return cigaridx, nops + end + # If got this far, the CG tag is valid and contains the cigar. + # Get the true n_cigar_ops, and return it and the idx of the first + nops = UInt32(unsafe_load(Ptr{Int32}(pointer(record.data, tagidx += 2)))) + tagidx += 4 + return tagidx, nops +end + +""" + alignment(record::Record)::BioAlignments.Alignment + +Get the alignment of `record`. +""" +function alignment(record::Record)::BioAlignments.Alignment + checkfilled(record) + if !ismapped(record) + return BioAlignments.Alignment(BioAlignments.AlignmentAnchor[]) + end + seqpos = 0 + refpos = position(record) - 1 + anchors = [BioAlignments.AlignmentAnchor(seqpos, refpos, BioAlignments.OP_START)] + for (op, len) in zip(cigar_rle(record)...) + if BioAlignments.ismatchop(op) + seqpos += len + refpos += len + elseif BioAlignments.isinsertop(op) + seqpos += len + elseif BioAlignments.isdeleteop(op) + refpos += len + else + error("operation $(op) is not supported") + end + push!(anchors, BioAlignments.AlignmentAnchor(seqpos, refpos, op)) + end + return BioAlignments.Alignment(anchors) +end + +function hasalignment(record::Record) + return ismapped(record) +end + +""" + alignlength(record::Record)::Int + +Get the alignment length of `record`. +""" +function alignlength(record::Record)::Int + offset = seqname_length(record) + length::Int = 0 + for i in offset + 1:4:offset + n_cigar_op(record, false) * 4 + x = unsafe_load(Ptr{UInt32}(pointer(record.data, i))) + op = BioAlignments.Operation(x & 0x0F) + if BioAlignments.ismatchop(op) || BioAlignments.isdeleteop(op) + length += x >> 4 + end + end + return length +end + +""" + tempname(record::Record)::String + +Get the query template name of `record`. +""" +function tempname(record::Record)::String + checkfilled(record) + # drop the last NUL character + return unsafe_string(pointer(record.data), max(seqname_length(record) - 1, 0)) +end + +function hastempname(record::Record) + return isfilled(record) +end + +""" + templength(record::Record)::Int + +Get the template length of `record`. +""" +function templength(record::Record)::Int + checkfilled(record) + return record.tlen +end + +function hastemplength(record::Record) + return isfilled(record) +end + +""" + sequence(record::Record)::BioSequences.DNASequence + +Get the segment sequence of `record`. +""" +function sequence(record::Record)::BioSequences.DNASequence + checkfilled(record) + seqlen = seqlength(record) + data = Vector{UInt64}(undef, cld(seqlen, 16)) + src::Ptr{UInt64} = pointer(record.data, seqname_length(record) + n_cigar_op(record, false) * 4 + 1) + for i in 1:lastindex(data) + # copy data flipping high and low nybble + x = unsafe_load(src, i) + data[i] = (x & 0x0f0f0f0f0f0f0f0f) << 4 | (x & 0xf0f0f0f0f0f0f0f0) >> 4 + end + return BioSequences.DNASequence(data, 1:seqlen, false) +end + +function hassequence(record::Record) + return isfilled(record) +end + +""" + seqlength(record::Record)::Int + +Get the sequence length of `record`. +""" +function seqlength(record::Record)::Int + checkfilled(record) + return record.l_seq % Int +end + +function hasseqlength(record::Record) + return isfilled(record) +end + +""" + quality(record::Record)::Vector{UInt8} + +Get the base quality of `record`. +""" +function quality(record::Record)::Vector{UInt8} + checkfilled(record) + seqlen = seqlength(record) + offset = seqname_length(record) + n_cigar_op(record, false) * 4 + cld(seqlen, 2) + return [reinterpret(Int8, record.data[i+offset]) for i in 1:seqlen] +end + +function hasquality(record::Record) + return isfilled(record) +end + +""" + auxdata(record::Record)::BAM.AuxData + +Get the auxiliary data of `record`. +""" +function auxdata(record::Record) + checkfilled(record) + return AuxData(record.data[auxdata_position(record):data_size(record)]) +end + +function hasauxdata(record::Record) + return isfilled(record) +end + +function Base.getindex(record::Record, tag::AbstractString) + checkauxtag(tag) + start = auxdata_position(record) + stop = data_size(record) + return getauxvalue(record.data, start, stop, UInt8(tag[1]), UInt8(tag[2])) +end + +function Base.haskey(record::Record, tag::AbstractString) + checkauxtag(tag) + start = auxdata_position(record) + stop = data_size(record) + return findauxtag(record.data, start, stop, UInt8(tag[1]), UInt8(tag[2])) > 0 +end + +function Base.keys(record::Record) + return collect(keys(auxdata(record))) +end + +function Base.values(record::Record) + return [record[key] for key in keys(record)] +end + + +# BioCore Methods +# ----------- + +function BioCore.isfilled(record::Record) + return record.block_size != 0 +end + +function BioCore.seqname(record::Record) + return tempname(record) +end + +function BioCore.hasseqname(record::Record) + return hastempname(record) +end + +function BioCore.sequence(record::Record) + return sequence(record) +end + +function BioCore.hassequence(record::Record) + return hassequence(record) +end + +function BioCore.leftposition(record::Record) + return position(record) +end + +function BioCore.hasleftposition(record::Record) + return hasposition(record) +end + +function BioCore.rightposition(record::Record) + return rightposition(record) +end + +function BioCore.hasrightposition(record::Record) + return hasrightposition(record) +end + + +# Helper Functions +# ---------------- + +# Return the size of the `.data` field. +function data_size(record::Record) + if isfilled(record) + return record.block_size - FIXED_FIELDS_BYTES + sizeof(record.block_size) + else + return 0 + end +end + +function checkfilled(record::Record) + if !isfilled(record) + throw(ArgumentError("unfilled BAM record")) + end +end + +function auxdata_position(record::Record) + seqlen = seqlength(record) + return seqname_length(record) + n_cigar_op(record, false) * 4 + cld(seqlen, 2) + seqlen + 1 +end + +# Return the length of the read name. +function seqname_length(record::Record) + return record.bin_mq_nl & 0xff +end diff --git a/src/bam/writer.jl b/src/bam/writer.jl new file mode 100644 index 0000000..f017424 --- /dev/null +++ b/src/bam/writer.jl @@ -0,0 +1,62 @@ +# BAM Writer +# ========== + +""" + BAM.Writer(output::BGZFStream, header::SAM.Header) + +Create a data writer of the BAM file format. + +# Arguments +* `output`: data sink +* `header`: SAM header object +""" +mutable struct Writer <: BioCore.IO.AbstractWriter + stream::BGZFStreams.BGZFStream +end + +function Writer(stream::BGZFStreams.BGZFStream, header::SAM.Header) + refseqnames = String[] + refseqlens = Int[] + for metainfo in findall(header, "SQ") + push!(refseqnames, metainfo["SN"]) + push!(refseqlens, parse(Int, metainfo["LN"])) + end + write_header(stream, header, refseqnames, refseqlens) + return Writer(stream) +end + +function BioCore.IO.stream(writer::Writer) + return writer.stream +end + +function Base.write(writer::Writer, record::Record) + n = 0 + n += unsafe_write(writer.stream, pointer_from_objref(record), FIXED_FIELDS_BYTES) + n += unsafe_write(writer.stream, pointer(record.data), data_size(record)) + return n +end + +function write_header(stream, header, refseqnames, refseqlens) + @assert length(refseqnames) == length(refseqlens) + n = 0 + + # magic bytes + n += write(stream, "BAM\1") + + # SAM header + buf = IOBuffer() + l = write(SAM.Writer(buf), header) + n += write(stream, Int32(l)) + n += write(stream, take!(buf)) + + # reference sequences + n += write(stream, Int32(length(refseqnames))) + for (seqname, seqlen) in zip(refseqnames, refseqlens) + namelen = length(seqname) + n += write(stream, Int32(namelen + 1)) + n += write(stream, seqname, '\0') + n += write(stream, Int32(seqlen)) + end + + return n +end diff --git a/src/sam/flags.jl b/src/sam/flags.jl new file mode 100644 index 0000000..0b629d1 --- /dev/null +++ b/src/sam/flags.jl @@ -0,0 +1,26 @@ +# SAM Flags +# ========= +# +# Bitwise flags (or FLAG). + +for (name, bits, doc) in [ + (:PAIRED, UInt16(0x001), "the read is paired in sequencing, no matter whether it is mapped in a pair"), + (:PROPER_PAIR, UInt16(0x002), "the read is mapped in a proper pair" ), + (:UNMAP, UInt16(0x004), "the read itself is unmapped; conflictive with SAM.FLAG_PROPER_PAIR" ), + (:MUNMAP, UInt16(0x008), "the mate is unmapped" ), + (:REVERSE, UInt16(0x010), "the read is mapped to the reverse strand" ), + (:MREVERSE, UInt16(0x020), "the mate is mapped to the reverse strand" ), + (:READ1, UInt16(0x040), "this is read1" ), + (:READ2, UInt16(0x080), "this is read2" ), + (:SECONDARY, UInt16(0x100), "not primary alignment" ), + (:QCFAIL, UInt16(0x200), "QC failure" ), + (:DUP, UInt16(0x400), "optical or PCR duplicate" ), + (:SUPPLEMENTARY, UInt16(0x800), "supplementary alignment" ),] + @assert bits isa UInt16 + sym = Symbol("FLAG_", name) + doc = string(@sprintf("0x%04x: ", bits), doc) + @eval begin + @doc $(doc) const $(sym) = $(bits) + end +end + diff --git a/src/sam/header.jl b/src/sam/header.jl new file mode 100644 index 0000000..1369e37 --- /dev/null +++ b/src/sam/header.jl @@ -0,0 +1,53 @@ +# SAM Header +# ========== + +struct Header + metainfo::Vector{MetaInfo} +end + +""" + SAM.Header() + +Create an empty header. +""" +function Header() + return Header(MetaInfo[]) +end + +function Base.copy(header::Header) + return Header(header.metainfo) +end + +function Base.eltype(::Type{Header}) + return MetaInfo +end + +function Base.length(header::Header) + return length(header.metainfo) +end + +function Base.iterate(header::Header, i=1) + if i > length(header.metainfo) + return nothing + end + return header.metainfo[i], i + 1 +end + +""" + find(header::Header, key::AbstractString)::Vector{MetaInfo} + +Find metainfo objects satisfying `SAM.tag(metainfo) == key`. +""" +function Base.findall(header::Header, key::AbstractString)::Vector{MetaInfo} + return filter(m -> isequalkey(m, key), header.metainfo) +end + +function Base.pushfirst!(header::Header, metainfo::MetaInfo) + pushfirst!(header.metainfo, metainfo) + return header +end + +function Base.push!(header::Header, metainfo::MetaInfo) + push!(header.metainfo, metainfo) + return header +end diff --git a/src/sam/metainfo.jl b/src/sam/metainfo.jl new file mode 100644 index 0000000..f40e6ae --- /dev/null +++ b/src/sam/metainfo.jl @@ -0,0 +1,235 @@ +# SAM Meta-Information +# ==================== + +mutable struct MetaInfo + # data and filled range + data::Vector{UInt8} + filled::UnitRange{Int} + # indexes + tag::UnitRange{Int} + val::UnitRange{Int} + dictkey::Vector{UnitRange{Int}} + dictval::Vector{UnitRange{Int}} +end + +function MetaInfo(data::Vector{UInt8}=UInt8[]) + metainfo = MetaInfo(data, 1:0, 1:0, 1:0, UnitRange{Int}[], UnitRange{Int}[]) + if !isempty(data) + index!(metainfo) + end + return metainfo +end + +""" + MetaInfo(str::AbstractString) + +Create a SAM metainfo from `str`. + +# Examples + + julia> SAM.MetaInfo("@CO\tsome comment") + BioAlignments.SAM.MetaInfo: + tag: CO + value: some comment + + julia> SAM.MetaInfo("@SQ\tSN:chr1\tLN:12345") + BioAlignments.SAM.MetaInfo: + tag: SQ + value: SN=chr1 LN=12345 + +""" +function MetaInfo(str::AbstractString) + return MetaInfo(Vector{UInt8}(str)) +end + +""" + MetaInfo(tag::AbstractString, value) + +Create a SAM metainfo with `tag` and `value`. + +`tag` is a two-byte ASCII string. If `tag` is `"CO"`, `value` must be a string; otherwise, `value` is an iterable object with key and value pairs. + +# Examples + + julia> SAM.MetaInfo("CO", "some comment") + BioAlignments.SAM.MetaInfo: + tag: CO + value: some comment + + julia> string(ans) + "@CO\tsome comment" + + julia> SAM.MetaInfo("SQ", ["SN" => "chr1", "LN" => 12345]) + BioAlignments.SAM.MetaInfo: + tag: SQ + value: SN=chr1 LN=12345 + + julia> string(ans) + "@SQ\tSN:chr1\tLN:12345" + +""" +function MetaInfo(tag::AbstractString, value) + buf = IOBuffer() + if tag == "CO" # comment + if !isa(value, AbstractString) + throw(ArgumentError("value must be a string")) + end + write(buf, "@CO\t", value) + elseif occursin(r"[A-Z][A-Z]", tag) + print(buf, '@', tag) + for (key, val) in value + print(buf, '\t', key, ':', val) + end + else + throw(ArgumentError("tag must match r\"[A-Z][A-Z]\"")) + end + return MetaInfo(take!(buf)) +end + +function initialize!(metainfo::MetaInfo) + metainfo.filled = 1:0 + metainfo.tag = 1:0 + metainfo.val = 1:0 + empty!(metainfo.dictkey) + empty!(metainfo.dictval) + return metainfo +end + +function isfilled(metainfo::MetaInfo) + return !isempty(metainfo.filled) +end + +function datarange(metainfo::MetaInfo) + return metainfo.filled +end + +function checkfilled(metainfo::MetaInfo) + if !isfilled(metainfo) + throw(ArgumentError("unfilled SAM metainfo")) + end +end + +function isequalkey(metainfo::MetaInfo, key::AbstractString) + if !isfilled(metainfo) || sizeof(key) != 2 + return false + end + k1, k2 = UInt8(key[1]), UInt8(key[2]) + return metainfo.data[metainfo.tag[1]] == k1 && metainfo.data[metainfo.tag[2]] == k2 +end + +function Base.show(io::IO, metainfo::MetaInfo) + print(io, summary(metainfo), ':') + if isfilled(metainfo) + println(io) + println(io, " tag: ", tag(metainfo)) + print(io, " value:") + if !iscomment(metainfo) + for (key, val) in zip(keys(metainfo), values(metainfo)) + print(io, ' ', key, '=', val) + end + else + print(io, ' ', value(metainfo)) + end + else + print(io, " ") + end +end + +function Base.print(io::IO, metainfo::MetaInfo) + write(io, metainfo) + return nothing +end + +function Base.write(io::IO, metainfo::MetaInfo) + checkfilled(metainfo) + r = datarange(metainfo) + return unsafe_write(io, pointer(metainfo.data, first(r)), length(r)) +end + + +# Accessor Functions +# ------------------ + +""" + iscomment(metainfo::MetaInfo)::Bool + +Test if `metainfo` is a comment (i.e. its tag is "CO"). +""" +function iscomment(metainfo::MetaInfo)::Bool + return isequalkey(metainfo, "CO") +end + +""" + tag(metainfo::MetaInfo)::String + +Get the tag of `metainfo`. +""" +function tag(metainfo::MetaInfo)::String + checkfilled(metainfo) + return String(metainfo.data[metainfo.tag]) +end + +""" + value(metainfo::MetaInfo)::String + +Get the value of `metainfo` as a string. +""" +function value(metainfo::MetaInfo)::String + checkfilled(metainfo) + return String(metainfo.data[metainfo.val]) +end + +""" + keyvalues(metainfo::MetaInfo)::Vector{Pair{String,String}} + +Get the values of `metainfo` as string pairs. +""" +function keyvalues(metainfo::MetaInfo)::Vector{Pair{String,String}} + checkfilled(metainfo) + if iscomment(metainfo) + throw(ArgumentError("not a dictionary")) + end + return Pair{String,String}[key => val for (key, val) in zip(keys(metainfo), values(metainfo))] +end + +function Base.keys(metainfo::MetaInfo) + checkfilled(metainfo) + if iscomment(metainfo) + throw(ArgumentError("not a dictionary")) + end + return [String(metainfo.data[r]) for r in metainfo.dictkey] +end + +function Base.values(metainfo::MetaInfo) + checkfilled(metainfo) + if iscomment(metainfo) + throw(ArgumentError("not a dictionary")) + end + return [String(metainfo.data[r]) for r in metainfo.dictval] +end + +function Base.haskey(metainfo::MetaInfo, key::AbstractString) + return findkey(metainfo, key) > 0 +end + +function Base.getindex(metainfo::MetaInfo, key::AbstractString) + i = findkey(metainfo, key) + if i == 0 + throw(KeyError(key)) + end + return String(metainfo.data[metainfo.dictval[i]]) +end + +function findkey(metainfo::MetaInfo, key::AbstractString) + checkfilled(metainfo) + if sizeof(key) != 2 + return 0 + end + t1, t2 = UInt8(key[1]), UInt8(key[2]) + for (i, k) in enumerate(metainfo.dictkey) + if metainfo.data[first(k)] == t1 && metainfo.data[first(k)+1] == t2 + return i + end + end + return 0 +end diff --git a/src/sam/reader.jl b/src/sam/reader.jl new file mode 100644 index 0000000..ee016e4 --- /dev/null +++ b/src/sam/reader.jl @@ -0,0 +1,265 @@ +# SAM Reader +# ========= + +mutable struct Reader <: BioCore.IO.AbstractReader + state::BioCore.Ragel.State + header::Header + + function Reader(input::BufferedStreams.BufferedInputStream) + reader = new(BioCore.Ragel.State(sam_header_machine.start_state, input), Header()) + readheader!(reader) + reader.state.cs = sam_body_machine.start_state + return reader + end +end + +""" + SAM.Reader(input::IO) + +Create a data reader of the SAM file format. + +# Arguments +* `input`: data source +""" +function Reader(input::IO) + return Reader(BufferedStreams.BufferedInputStream(input)) +end + +function BioCore.IO.stream(reader::Reader) + return reader.state.stream +end + +""" + header(reader::Reader)::Header + +Get the header of `reader`. +""" +function header(reader::Reader)::Header + return reader.header +end + +function BioCore.header(reader::Reader) + return header(reader) +end + +function Base.eltype(::Type{Reader}) + return Record +end + +# file = header . body +# header = metainfo* +# body = record* +isinteractive() && info("compiling SAM") +const sam_metainfo_machine, sam_record_machine, sam_header_machine, sam_body_machine = (function () + cat = Automa.RegExp.cat + rep = Automa.RegExp.rep + alt = Automa.RegExp.alt + opt = Automa.RegExp.opt + any = Automa.RegExp.any + + metainfo = let + tag = re"[A-Z][A-Z]" \ cat("CO") + tag.actions[:enter] = [:mark1] + tag.actions[:exit] = [:metainfo_tag] + + dict = let + key = re"[A-Za-z][A-Za-z0-9]" + key.actions[:enter] = [:mark2] + key.actions[:exit] = [:metainfo_dict_key] + val = re"[ -~]+" + val.actions[:enter] = [:mark2] + val.actions[:exit] = [:metainfo_dict_val] + keyval = cat(key, ':', val) + + cat(keyval, rep(cat('\t', keyval))) + end + dict.actions[:enter] = [:mark1] + dict.actions[:exit] = [:metainfo_val] + + co = cat("CO") + co.actions[:enter] = [:mark1] + co.actions[:exit] = [:metainfo_tag] + + comment = re"[^\r\n]*" + comment.actions[:enter] = [:mark1] + comment.actions[:exit] = [:metainfo_val] + + cat('@', alt(cat(tag, '\t', dict), cat(co, '\t', comment))) + end + metainfo.actions[:enter] = [:anchor] + metainfo.actions[:exit] = [:metainfo] + + record = let + qname = re"[!-?A-~]+" + qname.actions[:enter] = [:mark] + qname.actions[:exit] = [:record_qname] + + flag = re"[0-9]+" + flag.actions[:enter] = [:mark] + flag.actions[:exit] = [:record_flag] + + rname = re"\*|[!-()+-<>-~][!-~]*" + rname.actions[:enter] = [:mark] + rname.actions[:exit] = [:record_rname] + + pos = re"[0-9]+" + pos.actions[:enter] = [:mark] + pos.actions[:exit] = [:record_pos] + + mapq = re"[0-9]+" + mapq.actions[:enter] = [:mark] + mapq.actions[:exit] = [:record_mapq] + + cigar = re"\*|([0-9]+[MIDNSHPX=])+" + cigar.actions[:enter] = [:mark] + cigar.actions[:exit] = [:record_cigar] + + rnext = re"\*|=|[!-()+-<>-~][!-~]*" + rnext.actions[:enter] = [:mark] + rnext.actions[:exit] = [:record_rnext] + + pnext = re"[0-9]+" + pnext.actions[:enter] = [:mark] + pnext.actions[:exit] = [:record_pnext] + + tlen = re"[-+]?[0-9]+" + tlen.actions[:enter] = [:mark] + tlen.actions[:exit] = [:record_tlen] + + seq = re"\*|[A-Za-z=.]+" + seq.actions[:enter] = [:mark] + seq.actions[:exit] = [:record_seq] + + qual = re"[!-~]+" + qual.actions[:enter] = [:mark] + qual.actions[:exit] = [:record_qual] + + field = let + tag = re"[A-Za-z][A-Za-z0-9]" + val = alt( + re"A:[!-~]", + re"i:[-+]?[0-9]+", + re"f:[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?", + re"Z:[ !-~]*", + re"H:([0-9A-F][0-9A-F])*", + re"B:[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+") + + cat(tag, ':', val) + end + field.actions[:enter] = [:mark] + field.actions[:exit] = [:record_field] + + cat( + qname, '\t', + flag, '\t', + rname, '\t', + pos, '\t', + mapq, '\t', + cigar, '\t', + rnext, '\t', + pnext, '\t', + tlen, '\t', + seq, '\t', + qual, + rep(cat('\t', field))) + end + record.actions[:enter] = [:anchor] + record.actions[:exit] = [:record] + + newline = let + lf = re"\n" + lf.actions[:enter] = [:countline] + + cat(re"\r?", lf) + end + + header′ = rep(cat(metainfo, newline)) + header′.actions[:exit] = [:header] + header = cat(header′, opt(any() \ cat('@'))) # look ahead + + body = rep(cat(record, newline)) + + return map(Automa.compile, (metainfo, record, header, body)) +end)() + +const sam_metainfo_actions = Dict( + :metainfo_tag => :(record.tag = (mark1:p-1) .- offset), + :metainfo_val => :(record.val = (mark1:p-1) .- offset), + :metainfo_dict_key => :(push!(record.dictkey, (mark2:p-1) .- offset)), + :metainfo_dict_val => :(push!(record.dictval, (mark2:p-1) .- offset)), + :metainfo => quote + BioCore.ReaderHelper.resize_and_copy!(record.data, data, offset+1:p-1) + record.filled = (offset+1:p-1) .- offset + end, + :anchor => :(), + :mark1 => :(mark1 = p), + :mark2 => :(mark2 = p)) +eval( + BioCore.ReaderHelper.generate_index_function( + MetaInfo, + sam_metainfo_machine, + :(mark1 = mark2 = offset = 0), + sam_metainfo_actions)) +eval( + BioCore.ReaderHelper.generate_readheader_function( + Reader, + MetaInfo, + sam_header_machine, + :(mark1 = mark2 = offset = 0), + merge(sam_metainfo_actions, Dict( + :metainfo => quote + BioCore.ReaderHelper.resize_and_copy!(record.data, data, BioCore.ReaderHelper.upanchor!(stream):p-1) + record.filled = (offset+1:p-1) .- offset + @assert isfilled(record) + push!(reader.header.metainfo, record) + BioCore.ReaderHelper.ensure_margin!(stream) + record = MetaInfo() + end, + :header => :(finish_header = true; @escape), + :countline => :(linenum += 1), + :anchor => :(BioCore.ReaderHelper.anchor!(stream, p); offset = p - 1))), + quote + if !eof(stream) + stream.position -= 1 # cancel look-ahead + end + end)) + +const sam_record_actions = Dict( + :record_qname => :(record.qname = (mark:p-1) .- offset), + :record_flag => :(record.flag = (mark:p-1) .- offset), + :record_rname => :(record.rname = (mark:p-1) .- offset), + :record_pos => :(record.pos = (mark:p-1) .- offset), + :record_mapq => :(record.mapq = (mark:p-1) .- offset), + :record_cigar => :(record.cigar = (mark:p-1) .- offset), + :record_rnext => :(record.rnext = (mark:p-1) .- offset), + :record_pnext => :(record.pnext = (mark:p-1) .- offset), + :record_tlen => :(record.tlen = (mark:p-1) .- offset), + :record_seq => :(record.seq = (mark:p-1) .- offset), + :record_qual => :(record.qual = (mark:p-1) .- offset), + :record_field => :(push!(record.fields, (mark:p-1) .- offset)), + :record => quote + BioCore.ReaderHelper.resize_and_copy!(record.data, data, 1:p-1) + record.filled = (offset+1:p-1) .- offset + end, + :anchor => :(), + :mark => :(mark = p)) +eval( + BioCore.ReaderHelper.generate_index_function( + Record, + sam_record_machine, + :(mark = offset = 0), + sam_record_actions)) +eval( + BioCore.ReaderHelper.generate_read_function( + Reader, + sam_body_machine, + :(mark = offset = 0), + merge(sam_record_actions, Dict( + :record => quote + BioCore.ReaderHelper.resize_and_copy!(record.data, data, BioCore.ReaderHelper.upanchor!(stream):p-1) + record.filled = (offset+1:p-1) .- offset + found_record = true + @escape + end, + :countline => :(linenum += 1), + :anchor => :(BioCore.ReaderHelper.anchor!(stream, p); offset = p - 1))))) diff --git a/src/sam/record.jl b/src/sam/record.jl new file mode 100644 index 0000000..25d39d8 --- /dev/null +++ b/src/sam/record.jl @@ -0,0 +1,624 @@ +# SAM Record +# ========== + +mutable struct Record + # data and filled range + data::Vector{UInt8} + filled::UnitRange{Int} + # indexes + qname::UnitRange{Int} + flag::UnitRange{Int} + rname::UnitRange{Int} + pos::UnitRange{Int} + mapq::UnitRange{Int} + cigar::UnitRange{Int} + rnext::UnitRange{Int} + pnext::UnitRange{Int} + tlen::UnitRange{Int} + seq::UnitRange{Int} + qual::UnitRange{Int} + fields::Vector{UnitRange{Int}} +end + +""" + SAM.Record() + +Create an unfilled SAM record. +""" +function Record() + return Record( + UInt8[], 1:0, + # qname-mapq + 1:0, 1:0, 1:0, 1:0, 1:0, + # cigar-seq + 1:0, 1:0, 1:0, 1:0, 1:0, + # qual and fields + 1:0, UnitRange{Int}[]) +end + +""" + SAM.Record(data::Vector{UInt8}) + +Create a SAM record from `data`. +This function verifies the format and indexes fields for accessors. +Note that the ownership of `data` is transferred to a new record object. +""" +function Record(data::Vector{UInt8}) + return convert(Record, data) +end + +function Base.convert(::Type{Record}, data::Vector{UInt8}) + record = Record( + data, 1:0, + # qname-mapq + 1:0, 1:0, 1:0, 1:0, 1:0, + # cigar-seq + 1:0, 1:0, 1:0, 1:0, 1:0, + # qual and fields + 1:0, UnitRange{Int}[]) + index!(record) + return record +end + +""" + SAM.Record(str::AbstractString) + +Create a SAM record from `str`. +This function verifies the format and indexes fields for accessors. +""" +function Record(str::AbstractString) + return convert(Record, str) +end + +function Base.convert(::Type{Record}, str::AbstractString) + return Record(Vector{UInt8}(str)) +end + +function Base.show(io::IO, record::Record) + print(io, summary(record), ':') + if isfilled(record) + println(io) + println(io, " template name: ", hastempname(record) ? tempname(record) : "") + println(io, " flag: ", hasflag(record) ? flag(record) : "") + println(io, " reference: ", hasrefname(record) ? refname(record) : "") + println(io, " position: ", hasposition(record) ? position(record) : "") + println(io, " mapping quality: ", hasmappingquality(record) ? mappingquality(record) : "") + println(io, " CIGAR: ", hascigar(record) ? cigar(record) : "") + println(io, " next reference: ", hasnextrefname(record) ? nextrefname(record) : "") + println(io, " next position: ", hasnextposition(record) ? nextposition(record) : "") + println(io, " template length: ", hastemplength(record) ? templength(record) : "") + println(io, " sequence: ", hassequence(record) ? sequence(String, record) : "") + println(io, " base quality: ", hasquality(record) ? quality(String, record) : "") + print(io, " auxiliary data:") + for field in record.fields + print(io, ' ', String(record.data[field])) + end + else + print(io, " ") + end +end + +function Base.print(io::IO, record::Record) + write(io, record) + return nothing +end + +function Base.write(io::IO, record::Record) + checkfilled(record) + return unsafe_write(io, pointer(record.data, first(record.filled)), length(record.filled)) +end + +function Base.copy(record::Record) + return Record( + copy(record.data), + record.filled, + record.qname, + record.flag, + record.rname, + record.pos, + record.mapq, + record.cigar, + record.rnext, + record.pnext, + record.tlen, + record.seq, + record.qual, + copy(record.fields)) +end + + +# Accessor Functions +# ------------------ + +""" + flag(record::Record)::UInt16 + +Get the bitwise flag of `record`. +""" +function flag(record::Record)::UInt16 + checkfilled(record) + return unsafe_parse_decimal(UInt16, record.data, record.flag) +end + +function hasflag(record::Record) + return isfilled(record) +end + +""" + ismapped(record::Record)::Bool + +Test if `record` is mapped. +""" +function ismapped(record::Record)::Bool + return isfilled(record) && (flag(record) & FLAG_UNMAP == 0) +end + +""" + isprimary(record::Record)::Bool + +Test if `record` is a primary line of the read. + +This is equivalent to `flag(record) & 0x900 == 0`. +""" +function isprimary(record::Record)::Bool + return flag(record) & 0x900 == 0 +end + +""" + refname(record::Record)::String + +Get the reference sequence name of `record`. +""" +function refname(record::Record) + checkfilled(record) + if ismissing(record, record.rname) + missingerror(:refname) + end + return String(record.data[record.rname]) +end + +function hasrefname(record::Record) + return isfilled(record) && !ismissing(record, record.rname) +end + +""" + position(record::Record)::Int + +Get the 1-based leftmost mapping position of `record`. +""" +function position(record::Record)::Int + checkfilled(record) + pos = unsafe_parse_decimal(Int, record.data, record.pos) + if pos == 0 + missingerror(:position) + end + return pos +end + +function hasposition(record::Record) + return isfilled(record) && (length(record.pos) != 1 || record.data[first(record.pos)] != UInt8('0')) +end + +""" + rightposition(record::Record)::Int + +Get the 1-based rightmost mapping position of `record`. +""" +function rightposition(record::Record) + return position(record) + alignlength(record) - 1 +end + +function hasrightposition(record::Record) + return hasposition(record) && hasalignment(record) +end + +""" + isnextmapped(record::Record)::Bool + +Test if the mate/next read of `record` is mapped. +""" +function isnextmapped(record::Record)::Bool + return isfilled(record) && (flag(record) & FLAG_MUNMAP == 0) +end + +""" + nextrefname(record::Record)::String + +Get the reference name of the mate/next read of `record`. +""" +function nextrefname(record::Record)::String + checkfilled(record) + if ismissing(record, record.rnext) + missingerror(:nextrefname) + end + return String(record.data[record.rnext]) +end + +function hasnextrefname(record::Record) + return isfilled(record) && !ismissing(record, record.rnext) +end + +""" + nextposition(record::Record)::Int + +Get the position of the mate/next read of `record`. +""" +function nextposition(record::Record)::Int + checkfilled(record) + pos = unsafe_parse_decimal(Int, record.data, record.pnext) + if pos == 0 + missingerror(:nextposition) + end + return pos +end + +function hasnextposition(record::Record) + return isfilled(record) && (length(record.pnext) != 1 || record.data[first(record.pnext)] != UInt8('0')) +end + +""" + mappingquality(record::Record)::UInt8 + +Get the mapping quality of `record`. +""" +function mappingquality(record::Record)::UInt8 + checkfilled(record) + qual = unsafe_parse_decimal(UInt8, record.data, record.mapq) + if qual == 0xff + missingerror(:mappingquality) + end + return qual +end + +function hasmappingquality(record::Record) + return isfilled(record) && unsafe_parse_decimal(UInt8, record.data, record.mapq) != 0xff +end + +""" + cigar(record::Record)::String + +Get the CIGAR string of `record`. +""" +function cigar(record::Record)::String + checkfilled(record) + if ismissing(record, record.cigar) + missingerror(:cigar) + end + return String(record.data[record.cigar]) +end + +function hascigar(record::Record) + return isfilled(record) && !ismissing(record, record.cigar) +end + +""" + alignment(record::Record)::BioAlignments.Alignment + +Get the alignment of `record`. +""" +function alignment(record::Record)::BioAlignments.Alignment + if ismapped(record) + return BioAlignments.Alignment(cigar(record), 1, position(record)) + else + return BioAlignments.Alignment(BioAlignments.AlignmentAnchor[]) + end +end + +function hasalignment(record::Record) + return isfilled(record) && hascigar(record) +end + +""" + alignlength(record::Record)::Int + +Get the alignment length of `record`. +""" +function alignlength(record::Record)::Int + if length(record.cigar) == 1 && record.data[first(record.cigar)] == UInt8('*') + return 0 + end + ret::Int = 0 + len = 0 # operation length + for i in record.cigar + c = record.data[i] + if c ∈ UInt8('0'):UInt8('9') + len = len * 10 + (c - UInt8('0')) + else + op = convert(BioAlignments.Operation, Char(c)) + if BioAlignments.ismatchop(op) || BioAlignments.isdeleteop(op) + ret += len + len = 0 + end + end + end + return ret +end + +""" + tempname(record::Record)::String + +Get the query template name of `record`. +""" +function tempname(record::Record)::String + checkfilled(record) + if ismissing(record, record.qname) + missingerror(:tempname) + end + return String(record.data[record.qname]) +end + +function hastempname(record::Record) + return isfilled(record) && !ismissing(record, record.qname) +end + +""" + templength(record::Record)::Int + +Get the template length of `record`. +""" +function templength(record::Record)::Int + checkfilled(record) + len = unsafe_parse_decimal(Int, record.data, record.tlen) + if len == 0 + missingerror(:tlen) + end + return len +end + +function hastemplength(record::Record) + return isfilled(record) && (length(record.tlen) != 1 || record.data[first(record.tlen)] != UInt8('0')) +end + +""" + sequence(record::Record)::BioSequences.DNASequence + +Get the segment sequence of `record`. +""" +function sequence(record::Record)::BioSequences.DNASequence + checkfilled(record) + if ismissing(record, record.seq) + missingerror(:sequence) + end + seqlen = length(record.seq) + ret = BioSequences.DNASequence(seqlen) + BioSequences.encode_copy!(ret, 1, record.data, first(record.seq), seqlen) + return ret +end + +function hassequence(record::Record) + return isfilled(record) && !ismissing(record, record.seq) +end +""" + sequence(::Type{String}, record::Record)::String + +Get the segment sequence of `record` as `String`. +""" +function sequence(::Type{String}, record::Record)::String + checkfilled(record) + return String(record.data[record.seq]) +end + +""" + seqlength(record::Record)::Int + +Get the sequence length of `record`. +""" +function seqlength(record::Record)::Int + checkfilled(record) + if ismissing(record, record.seq) + missingerror(:seq) + end + return length(record.seq) +end + +function hasseqlength(record::Record) + return isfilled(record) && !ismissing(record, record.seq) +end + +""" + quality(record::Record)::Vector{UInt8} + +Get the Phred-scaled base quality of `record`. +""" +function quality(record::Record)::Vector{UInt8} + checkfilled(record) + if ismissing(record, record.qual) + missingerror(:quality) + end + qual = record.data[record.qual] + for i in 1:lastindex(qual) + @inbounds qual[i] -= 33 + end + return qual +end + +function hasquality(record::Record) + return isfilled(record) && !ismissing(record, record.qual) +end + +""" + quality(::Type{String}, record::Record)::String + +Get the ASCII-encoded base quality of `record`. +""" +function quality(::Type{String}, record::Record)::String + checkfilled(record) + return String(record.data[record.qual]) +end + +""" + auxdata(record::Record)::Dict{String,Any} + +Get the auxiliary data (optional fields) of `record`. +""" +function auxdata(record::Record)::Dict{String,Any} + checkfilled(record) + return Dict(k => record[k] for k in keys(record)) +end + +function Base.haskey(record::Record, tag::AbstractString) + return findauxtag(record, tag) > 0 +end + +function Base.getindex(record::Record, tag::AbstractString) + i = findauxtag(record, tag) + if i == 0 + throw(KeyError(tag)) + end + field = record.fields[i] + # data type + typ = record.data[first(field)+3] + lo = first(field) + 5 + if i == lastindex(record.fields) + hi = last(field) + else + hi = first(record.fields[i+1]) - 2 + end + if typ == UInt8('A') + @assert lo == hi + return Char(record.data[lo]) + elseif typ == UInt8('i') + return unsafe_parse_decimal(Int, record.data, lo:hi) + elseif typ == UInt8('f') + # TODO: Call a C function directly for speed? + return parse(Float32, SubString(record.data[lo:hi])) + elseif typ == UInt8('Z') + return String(record.data[lo:hi]) + elseif typ == UInt8('H') + return parse_hexarray(record.data, lo:hi) + elseif typ == UInt8('B') + return parse_typedarray(record.data, lo:hi) + else + throw(ArgumentError("type code '$(Char(typ))' is not defined")) + end +end + +function Base.keys(record::Record) + checkfilled(record) + return [String(record.data[first(f):first(f)+1]) for f in record.fields] +end + +function Base.values(record::Record) + return [record[k] for k in keys(record)] +end + + +# Bio Methods +# ----------- + +function BioCore.isfilled(record::Record) + return !isempty(record.filled) +end + +function BioCore.seqname(record::Record) + return tempname(record) +end + +function BioCore.hasseqname(record::Record) + return hastempname(record) +end + +function BioCore.sequence(record::Record) + return sequence(record) +end + +function BioCore.hassequence(record::Record) + return hassequence(record) +end + +function BioCore.rightposition(record::Record) + return rightposition(record) +end + +function BioCore.hasrightposition(record::Record) + return hasrightposition(record) +end + +function BioCore.leftposition(record::Record) + return position(record) +end + +function BioCore.hasleftposition(record::Record) + return hasposition(record) +end + + +# Helper Functions +# ---------------- + +function initialize!(record::Record) + record.filled = 1:0 + record.qname = 1:0 + record.flag = 1:0 + record.rname = 1:0 + record.pos = 1:0 + record.mapq = 1:0 + record.cigar = 1:0 + record.rnext = 1:0 + record.pnext = 1:0 + record.tlen = 1:0 + record.seq = 1:0 + record.qual = 1:0 + empty!(record.fields) + return record +end + +function checkfilled(record::Record) + if !isfilled(record) + throw(ArgumentError("unfilled SAM record")) + end +end + +function findauxtag(record::Record, tag::AbstractString) + checkfilled(record) + if sizeof(tag) != 2 + return 0 + end + t1, t2 = UInt8(tag[1]), UInt8(tag[2]) + for (i, field) in enumerate(record.fields) + p = first(field) + if record.data[p] == t1 && record.data[p+1] == t2 + return i + end + end + return 0 +end + +function parse_hexarray(data::Vector{UInt8}, range::UnitRange{Int}) + @assert iseven(length(range)) + ret = Vector{UInt8}(length(range) >> 1) + byte2hex(b) = b ∈ 0x30:0x39 ? (b - 0x30) : b ∈ 0x41:0x46 ? (b - 0x41 + 0x0A) : error("not in [0-9A-F]") + j = 1 + for i in first(range):2:last(range)-1 + ret[j] = (byte2hex(data[range[i]]) << 4) | byte2hex(data[range[i+1]]) + j += 1 + end + return ret +end + +function parse_typedarray(data::Vector{UInt8}, range::UnitRange{Int}) + # format: [cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+ + t = data[first(range)] + xs = split(String(data[first(range)+2:last(range)])) + if t == UInt8('c') + return [parse(Int8, x) for x in xs] + elseif t == UInt8('C') + return [parse(UInt8, x) for x in xs] + elseif t == UInt8('s') + return [parse(Int16, x) for x in xs] + elseif t == UInt8('S') + return [parse(UInt16, x) for x in xs] + elseif t == UInt8('i') + return [parse(Int32, x) for x in xs] + elseif t == UInt8('I') + return [parse(UInt32, x) for x in xs] + elseif t == UInt8('f') + return [parse(Float32, x) for x in xs] + else + throw(ArgumentError("type code '$(Char(t))' is not defined")) + end +end + +function ismissing(record::Record, range::UnitRange{Int}) + return length(range) == 1 && record.data[first(range)] == UInt8('*') +end diff --git a/src/sam/sam.jl b/src/sam/sam.jl new file mode 100644 index 0000000..2b46653 --- /dev/null +++ b/src/sam/sam.jl @@ -0,0 +1,23 @@ +# SAM File Format +# =============== + +module SAM + +import Automa +import Automa.RegExp: @re_str +import BioAlignments +import BioCore.Exceptions: missingerror +import BioCore.RecordHelper: unsafe_parse_decimal +import BioCore: BioCore, isfilled +import BioSequences +import BufferedStreams +using Printf: @sprintf + +include("flags.jl") +include("metainfo.jl") +include("record.jl") +include("header.jl") +include("reader.jl") +include("writer.jl") + +end diff --git a/src/sam/writer.jl b/src/sam/writer.jl new file mode 100644 index 0000000..e7a5ddd --- /dev/null +++ b/src/sam/writer.jl @@ -0,0 +1,43 @@ +# SAM Writer +# ========== + +""" + Writer(output::IO, header::Header=Header()) + +Create a data writer of the SAM file format. + +# Arguments +* `output`: data sink +* `header=Header()`: SAM header object +""" +mutable struct Writer <: BioCore.IO.AbstractWriter + stream::IO + + function Writer(output::IO, header::Header=Header()) + writer = new(output) + write(writer, header) + return writer + end +end + +function BioCore.IO.stream(writer::Writer) + return writer.stream +end + +function Base.write(writer::Writer, header::Header) + n = 0 + for metainfo in header + n += write(writer, metainfo) + end + return n +end + +function Base.write(writer::Writer, metainfo::MetaInfo) + checkfilled(metainfo) + return write(writer.stream, metainfo, '\n') +end + +function Base.write(writer::Writer, record::Record) + checkfilled(record) + return write(writer.stream, record, '\n') +end diff --git a/test/runtests.jl b/test/runtests.jl new file mode 100644 index 0000000..1182fb5 --- /dev/null +++ b/test/runtests.jl @@ -0,0 +1,426 @@ +using Test +using BioAlignments +using BioSymbols +import BGZFStreams: BGZFStream +import BioCore.Exceptions: MissingFieldException +import BioCore.Testing.get_bio_fmt_specimens +import BioSequences: @dna_str, @aa_str +import GenomicFeatures +import YAML + +# Generate a random range within `range`. +function randrange(range) + x = rand(range) + y = rand(range) + if x < y + return x:y + else + return y:x + end +end + +@testset "SAM" begin + samdir = joinpath(get_bio_fmt_specimens("master", false, true), "SAM") + + @testset "MetaInfo" begin + metainfo = SAM.MetaInfo() + @test !isfilled(metainfo) + @test occursin("not filled", repr(metainfo)) + + metainfo = SAM.MetaInfo("CO", "some comment (parens)") + @test isfilled(metainfo) + @test string(metainfo) == "@CO\tsome comment (parens)" + @test occursin("CO", repr(metainfo)) + @test SAM.tag(metainfo) == "CO" + @test SAM.value(metainfo) == "some comment (parens)" + @test_throws ArgumentError keys(metainfo) + @test_throws ArgumentError values(metainfo) + + metainfo = SAM.MetaInfo("HD", ["VN" => "1.0", "SO" => "coordinate"]) + @test isfilled(metainfo) + @test string(metainfo) == "@HD\tVN:1.0\tSO:coordinate" + @test occursin("HD", repr(metainfo)) + @test SAM.tag(metainfo) == "HD" + @test SAM.value(metainfo) == "VN:1.0\tSO:coordinate" + @test keys(metainfo) == ["VN", "SO"] + @test values(metainfo) == ["1.0", "coordinate"] + @test SAM.keyvalues(metainfo) == ["VN" => "1.0", "SO" => "coordinate"] + @test haskey(metainfo, "VN") + @test haskey(metainfo, "SO") + @test !haskey(metainfo, "GO") + @test metainfo["VN"] == "1.0" + @test metainfo["SO"] == "coordinate" + @test_throws KeyError metainfo["GO"] + end + + @testset "Header" begin + header = SAM.Header() + @test isempty(header) + push!(header, SAM.MetaInfo("@HD\tVN:1.0\tSO:coordinate")) + @test !isempty(header) + @test length(header) == 1 + push!(header, SAM.MetaInfo("@CO\tsome comment")) + @test length(header) == 2 + @test isa(collect(header), Vector{SAM.MetaInfo}) + end + + @testset "Record" begin + record = SAM.Record() + @test !isfilled(record) + @test !SAM.ismapped(record) + @test repr(record) == "BioAlignments.SAM.Record: " + @test_throws ArgumentError SAM.flag(record) + + record = SAM.Record("r001\t99\tchr1\t7\t30\t8M2I4M1D3M\t=\t37\t39\tTTAGATAAAGGATACTG\t*") + @test isfilled(record) + @test occursin(r"^BioAlignments.SAM.Record:\n", repr(record)) + @test SAM.ismapped(record) + @test SAM.isprimary(record) + @test SAM.hastempname(record) + @test SAM.tempname(record) == "r001" + @test SAM.hasflag(record) + @test SAM.flag(record) === UInt16(99) + @test SAM.hasrefname(record) + @test SAM.refname(record) == "chr1" + @test SAM.hasposition(record) + @test SAM.position(record) === 7 + @test SAM.hasmappingquality(record) + @test SAM.mappingquality(record) === UInt8(30) + @test SAM.hascigar(record) + @test SAM.cigar(record) == "8M2I4M1D3M" + @test SAM.hasnextrefname(record) + @test SAM.nextrefname(record) == "=" + @test SAM.hasnextposition(record) + @test SAM.nextposition(record) === 37 + @test SAM.hastemplength(record) + @test SAM.templength(record) === 39 + @test SAM.hassequence(record) + @test SAM.sequence(record) == dna"TTAGATAAAGGATACTG" + @test !SAM.hasquality(record) + @test_throws MissingFieldException SAM.quality(record) + end + + @testset "Reader" begin + reader = open(SAM.Reader, joinpath(samdir, "ce#1.sam")) + @test isa(reader, SAM.Reader) + @test eltype(reader) === SAM.Record + + # header + h = header(reader) + @test string.(findall(header(reader), "SQ")) == ["@SQ\tSN:CHROMOSOME_I\tLN:1009800"] + + # first record + record = SAM.Record() + read!(reader, record) + @test SAM.ismapped(record) + @test SAM.refname(record) == "CHROMOSOME_I" + @test SAM.position(record) == leftposition(record) == 2 + @test SAM.rightposition(record) == rightposition(record) == 102 + @test SAM.tempname(record) == seqname(record) == "SRR065390.14978392" + @test SAM.sequence(record) == sequence(record) == dna"CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA" + @test SAM.sequence(String, record) == "CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA" + @test SAM.seqlength(record) == 100 + @test SAM.quality(record) == (b"#############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC" .- 33) + @test SAM.quality(String, record) == "#############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC" + @test SAM.flag(record) == 16 + @test SAM.cigar(record) == "27M1D73M" + @test SAM.alignment(record) == Alignment([ + AlignmentAnchor( 0, 1, OP_START), + AlignmentAnchor( 27, 28, OP_MATCH), + AlignmentAnchor( 27, 29, OP_DELETE), + AlignmentAnchor(100, 102, OP_MATCH)]) + @test record["XG"] == 1 + @test record["XM"] == 5 + @test record["XN"] == 0 + @test record["XO"] == 1 + @test record["AS"] == -18 + @test record["XS"] == -18 + @test record["YT"] == "UU" + @test eof(reader) + close(reader) + + # iterator + @test length(collect(open(SAM.Reader, joinpath(samdir, "ce#1.sam")))) == 1 + @test length(collect(open(SAM.Reader, joinpath(samdir, "ce#2.sam")))) == 2 + + # IOStream + @test length(collect(SAM.Reader(open(joinpath(samdir, "ce#1.sam"))))) == 1 + @test length(collect(SAM.Reader(open(joinpath(samdir, "ce#2.sam"))))) == 2 + end + + @testset "Round trip" begin + function compare_records(xs, ys) + if length(xs) != length(ys) + return false + end + for (x, y) in zip(xs, ys) + if x.data[x.filled] != y.data[y.filled] + return false + end + end + return true + end + for specimen in YAML.load_file(joinpath(samdir, "index.yml")) + filepath = joinpath(samdir, specimen["filename"]) + mktemp() do path, io + # copy + reader = open(SAM.Reader, filepath) + writer = SAM.Writer(io, header(reader)) + records = SAM.Record[] + for record in reader + push!(records, record) + write(writer, record) + end + close(reader) + close(writer) + @test compare_records(open(collect, SAM.Reader, path), records) + end + end + end +end + +@testset "BAM" begin + bamdir = joinpath(get_bio_fmt_specimens("master", false), "BAM") + + @testset "AuxData" begin + auxdata = BAM.AuxData(UInt8[]) + @test isempty(auxdata) + + buf = IOBuffer() + write(buf, "NM", UInt8('s'), Int16(1)) + auxdata = BAM.AuxData(take!(buf)) + @test length(auxdata) == 1 + @test auxdata["NM"] === Int16(1) + @test collect(auxdata) == ["NM" => Int16(1)] + + buf = IOBuffer() + write(buf, "AS", UInt8('c'), Int8(-18)) + write(buf, "NM", UInt8('s'), Int16(1)) + write(buf, "XA", UInt8('f'), Float32(3.14)) + write(buf, "XB", UInt8('Z'), "some text\0") + write(buf, "XC", UInt8('B'), UInt8('i'), Int32(3), Int32[10, -5, 8]) + auxdata = BAM.AuxData(take!(buf)) + @test length(auxdata) == 5 + @test auxdata["AS"] === Int8(-18) + @test auxdata["NM"] === Int16(1) + @test auxdata["XA"] === Float32(3.14) + @test auxdata["XB"] == "some text" + @test auxdata["XC"] == Int32[10, -5, 8] + @test convert(Dict{String,Any}, auxdata) == Dict( + "AS" => Int8(-18), + "NM" => Int16(1), + "XA" => Float32(3.14), + "XB" => "some text", + "XC" => Int32[10, -5, 8]) + end + + @testset "Record" begin + record = BAM.Record() + @test !isfilled(record) + @test repr(record) == "BioAlignments.BAM.Record: " + @test_throws ArgumentError BAM.flag(record) + end + + @testset "Reader" begin + reader = open(BAM.Reader, joinpath(bamdir, "ce#1.bam")) + @test isa(reader, BAM.Reader) + @test eltype(reader) === BAM.Record + @test startswith(repr(reader), "BioAlignments.BAM.Reader{IOStream}:") + + # header + h = header(reader) + @test isa(h, SAM.Header) + + # first record + record = BAM.Record() + read!(reader, record) + @test BAM.ismapped(record) + @test BAM.isprimary(record) + @test ! BAM.ispositivestrand(record) + @test BAM.refname(record) == "CHROMOSOME_I" + @test BAM.refid(record) === 1 + @test BAM.hasnextrefid(record) + @test BAM.nextrefid(record) === 0 + @test BAM.hasposition(record) === hasleftposition(record) === true + @test BAM.position(record) === leftposition(record) === 2 + @test BAM.hasnextposition(record) + @test BAM.nextposition(record) === 0 + @test rightposition(record) == 102 + @test BAM.hastempname(record) === hasseqname(record) === true + @test BAM.tempname(record) == seqname(record) == "SRR065390.14978392" + @test BAM.hassequence(record) === hassequence(record) === true + @test BAM.sequence(record) == sequence(record) == dna""" + CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCT + AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA + """ + @test BAM.seqlength(record) === 100 + @test BAM.hasquality(record) + @test eltype(BAM.quality(record)) == UInt8 + @test BAM.quality(record) == [Int(x) - 33 for x in "#############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"] + @test BAM.flag(record) === UInt16(16) + @test BAM.cigar(record) == "27M1D73M" + @test BAM.alignment(record) == Alignment([ + AlignmentAnchor( 0, 1, OP_START), + AlignmentAnchor( 27, 28, OP_MATCH), + AlignmentAnchor( 27, 29, OP_DELETE), + AlignmentAnchor(100, 102, OP_MATCH)]) + @test record["XG"] == 1 + @test record["XM"] == 5 + @test record["XN"] == 0 + @test record["XO"] == 1 + @test record["AS"] == -18 + @test record["XS"] == -18 + @test record["YT"] == "UU" + @test keys(record) == ["XG","XM","XN","XO","AS","XS","YT"] + @test values(record) == [1, 5, 0, 1, -18, -18, "UU"] + @test eof(reader) + close(reader) + + # Test conversion from byte array to record + dsize = BAM.data_size(record) + array = Vector{UInt8}(undef, BAM.FIXED_FIELDS_BYTES + dsize) + GC.@preserve array record begin + ptr = Ptr{UInt8}(pointer_from_objref(record)) + unsafe_copyto!(pointer(array), ptr, BAM.FIXED_FIELDS_BYTES) + unsafe_copyto!(array, BAM.FIXED_FIELDS_BYTES + 1, record.data, 1, dsize) + end + new_record = convert(BAM.Record, array) + @test record.bin_mq_nl == new_record.bin_mq_nl + @test record.block_size == new_record.block_size + @test record.flag_nc == new_record.flag_nc + @test record.l_seq == new_record.l_seq + @test record.next_refid == new_record.next_refid + @test record.next_pos == new_record.next_pos + @test record.refid == new_record.refid + @test record.pos == new_record.pos + @test record.tlen == new_record.tlen + @test record.data == new_record.data + + # iterator + @test length(collect(open(BAM.Reader, joinpath(bamdir, "ce#1.bam")))) == 1 + @test length(collect(open(BAM.Reader, joinpath(bamdir, "ce#2.bam")))) == 2 + + # IOStream + @test length(collect(BAM.Reader(open(joinpath(bamdir, "ce#1.bam"))))) == 1 + @test length(collect(BAM.Reader(open(joinpath(bamdir, "ce#2.bam"))))) == 2 + end + + @testset "Read long CIGARs" begin + function get_cigar_lens(rec::BAM.Record) + cigar_ops, cigar_n = BAM.cigar_rle(rec) + field_ops, field_n = BAM.cigar_rle(rec, false) + cigar_l = length(cigar_ops) + field_l = length(field_ops) + return cigar_l, field_l + end + + function check_cigar_vs_field(rec::BAM.Record) + cigar = BAM.cigar(rec) + field = BAM.cigar(rec, false) + cigar_l, field_l = get_cigar_lens(rec) + return cigar != field && cigar_l != field_l + end + + function check_cigar_lens(rec::BAM.Record, field_len, cigar_len) + cigar_l, field_l = get_cigar_lens(rec) + return cigar_l == cigar_len && field_l == field_len + end + + reader = open(BAM.Reader, joinpath(bamdir, "cigar-64k.bam")) + rec = BAM.Record() + read!(reader, rec) + @test !check_cigar_vs_field(rec) + read!(reader, rec) + @test check_cigar_vs_field(rec) + @test check_cigar_lens(rec, 2, 72091) + end + + function compare_records(xs, ys) + if length(xs) != length(ys) + return false + end + for (x, y) in zip(xs, ys) + if !( + x.block_size == y.block_size && + x.refid == y.refid && + x.pos == y.pos && + x.bin_mq_nl == y.bin_mq_nl && + x.flag_nc == y.flag_nc && + x.l_seq == y.l_seq && + x.next_refid == y.next_refid && + x.next_pos == y.next_pos && + x.tlen == y.tlen && + x.data[1:BAM.data_size(x)] == y.data[1:BAM.data_size(y)]) + return false + end + end + return true + end + + @testset "Round trip" begin + for specimen in YAML.load_file(joinpath(bamdir, "index.yml")) + filepath = joinpath(bamdir, specimen["filename"]) + mktemp() do path, _ + # copy + if occursin("bai", get(specimen, "tags", "")) + reader = open(BAM.Reader, filepath, index=filepath * ".bai") + else + reader = open(BAM.Reader, filepath) + end + writer = BAM.Writer( + BGZFStream(path, "w"), BAM.header(reader, fillSQ=isempty(findall(header(reader), "SQ")))) + records = BAM.Record[] + for record in reader + push!(records, record) + write(writer, record) + end + close(reader) + close(writer) + @test compare_records(open(collect, BAM.Reader, path), records) + end + end + end + + @testset "Random access" begin + filepath = joinpath(bamdir, "GSE25840_GSM424320_GM06985_gencode_spliced.head.bam") + reader = open(BAM.Reader, filepath, index=filepath * ".bai") + + @test isa(eachoverlap(reader, "chr1", 1:100), BAM.OverlapIterator) + @test isa(eachoverlap(reader, GenomicFeatures.Interval("chr1", 1, 100)), BAM.OverlapIterator) + + # expected values are counted using samtools + for (refname, interval, expected) in [ + ("chr1", 1_000:10000, 21), + ("chr1", 8_000:10000, 20), + ("chr1", 766_000:800_000, 142), + ("chr1", 786_000:800_000, 1), + ("chr1", 796_000:800_000, 0)] + intsect = eachoverlap(reader, refname, interval) + @test eltype(intsect) == BAM.Record + @test count(_ -> true, intsect) == expected + # check that the intersection iterator is stateless + @test count(_ -> true, intsect) == expected + end + + # randomized tests + for n in 1:50 + refindex = 1 + refname = "chr1" + range = randrange(1:1_000_000) + seekstart(reader) + # linear scan + expected = filter(collect(reader)) do record + BAM.compare_intervals(record, (refindex, range)) == 0 + end + # indexed scan + actual = collect(eachoverlap(reader, refname, range)) + @test compare_records(actual, expected) + end + close(reader) + + filepath = joinpath(bamdir, "R_12h_D06.uniq.q40.bam") + reader = open(BAM.Reader, filepath, index=filepath * ".bai") + @test isempty(collect(eachoverlap(reader, "chr19", 5823708:5846478))) + close(reader) + end +end