diff --git a/Project.toml b/Project.toml index 10f7cb4..2acfbc3 100644 --- a/Project.toml +++ b/Project.toml @@ -1,28 +1,28 @@ name = "XAM" uuid = "d759349c-bcba-11e9-07c2-5b90f8f05f7c" authors = ["Kenta Sato ", "Ben J. Ward ", "Ciarán O'Mara "] -version = "0.1.1" +version = "0.2.0" [deps] Automa = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b" BGZFStreams = "28d598bf-9b8f-59f1-b38c-5a06b4a0f5e6" BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e" -BioCore = "37cfa864-2cd6-5c12-ad9e-b6597d696c81" +BioGenerics = "47718e42-2ac5-11e9-14af-e5595289c2ea" BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" -BufferedStreams = "e1450e63-4bb3-523b-b2a4-4ffa8c0fd77d" GenomicFeatures = "899a7d2d-5c61-547b-bef9-6698a8d05446" Indexes = "4ffb77ac-cb80-11e8-1b35-4b78cc642f6d" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" [compat] Automa = "0.7, 0.8" BGZFStreams = "0.3" BioAlignments = "2" -BioCore = "2" +BioGenerics = "0.1" BioSequences = "2" -BufferedStreams = "1" GenomicFeatures = "2" Indexes = "0.1" +TranscodingStreams = "0.6, 0.7, 0.8, 0.9" julia = "1.1" [extras] diff --git a/src/bam/auxdata.jl b/src/bam/auxdata.jl index 2bcfac2..3ce0050 100644 --- a/src/bam/auxdata.jl +++ b/src/bam/auxdata.jl @@ -72,12 +72,15 @@ function loadauxtype(data::Vector{UInt8}, p::Int) b == UInt8('Z') ? String : error("invalid type tag: '$(Char(b))'")) end + t = data[p] + if t == UInt8('B') return p + 2, Vector{auxtype(data[p+1])} - else - return p + 1, auxtype(t) end + + return p + 1, auxtype(t) + end function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{T}) where T @@ -105,14 +108,17 @@ end function findauxtag(data::Vector{UInt8}, start::Int, stop::Int, t1::UInt8, t2::UInt8) pos = start + while pos ≤ stop && !(data[pos] == t1 && data[pos+1] == t2) pos = next_tag_position(data, pos) end + if pos > stop return 0 - else - return pos end + + return pos + end # Find the starting position of a next tag in `data` after `p`. @@ -120,24 +126,40 @@ end function next_tag_position(data::Vector{UInt8}, p::Int) typ = Char(data[p+2]) p += 3 + if typ == 'A' - p += 1 - elseif typ == 'c' || typ == 'C' - p += 1 - elseif typ == 's' || typ == 'S' - p += 2 - elseif typ == 'i' || typ == 'I' - p += 4 - elseif typ == 'f' - p += 4 - elseif typ == 'd' - p += 8 - elseif typ == 'Z' || typ == 'H' + return p += 1 + end + + if typ == 'c' || typ == 'C' + return p += 1 + end + + if typ == 's' || typ == 'S' + return p += 2 + end + + if typ == 'i' || typ == 'I' + return p += 4 + end + + if typ == 'f' + return p += 4 + end + + if typ == 'd' + return p += 8 + end + + if typ == 'Z' || typ == 'H' while data[p] != 0x00 # NULL-terminalted string p += 1 end - p += 1 - elseif typ == 'B' + return p += 1 + + end + + if typ == 'B' eltyp = Char(data[p]) elsize = eltyp == 'c' || eltyp == 'C' ? 1 : eltyp == 's' || eltyp == 'S' ? 2 : @@ -145,9 +167,9 @@ function next_tag_position(data::Vector{UInt8}, p::Int) error("invalid type tag: '$(Char(eltyp))'") p += 1 n = unsafe_load(Ptr{Int32}(pointer(data, p))) - p += 4 + elsize * n - else - error("invalid type tag: '$(Char(typ))'") + return p += 4 + elsize * n end - return p + + error("invalid type tag: '$(Char(typ))'") + end diff --git a/src/bam/bam.jl b/src/bam/bam.jl index e055231..6fd1174 100644 --- a/src/bam/bam.jl +++ b/src/bam/bam.jl @@ -3,7 +3,7 @@ module BAM -using BioCore +using BioGenerics using GenomicFeatures using XAM.SAM @@ -11,7 +11,7 @@ import BGZFStreams import BioAlignments import Indexes import BioSequences -import BioCore: isfilled, header +import BioGenerics: isfilled, header import GenomicFeatures: eachoverlap diff --git a/src/bam/overlap.jl b/src/bam/overlap.jl index c3475ba..bb7b3c1 100644 --- a/src/bam/overlap.jl +++ b/src/bam/overlap.jl @@ -16,7 +16,7 @@ function Base.eltype(::Type{OverlapIterator{T}}) where T end function GenomicFeatures.eachoverlap(reader::Reader, interval::Interval) - return GenomicFeatures.eachoverlap(reader, interval.seqname, interval.first:interval.last) + return GenomicFeatures.eachoverlap(reader, GenomicFeatures.seqname(interval), GenomicFeatures.leftposition(interval):GenomicFeatures.rightposition(interval)) end function GenomicFeatures.eachoverlap(reader::Reader, interval) @@ -67,7 +67,8 @@ function Base.iterate(iter::OverlapIterator, state) c = compare_intervals(state.record, (state.refindex, iter.interval)) if c == 0 return copy(state.record), state - elseif c > 0 + end + if c > 0 # no more overlapping records in this chunk since records are sorted break end @@ -82,14 +83,18 @@ end function compare_intervals(record::Record, interval::Tuple{Int,UnitRange{Int}}) rid = refid(record) + if rid < interval[1] || (rid == interval[1] && rightposition(record) < first(interval[2])) # strictly left return -1 - elseif rid > interval[1] || (rid == interval[1] && position(record) > last(interval[2])) + end + + if rid > interval[1] || (rid == interval[1] && position(record) > last(interval[2])) # strictly right return +1 - else - # overlapping - return 0 end + + # overlapping + return 0 + end diff --git a/src/bam/reader.jl b/src/bam/reader.jl index 906110a..953f5ca 100644 --- a/src/bam/reader.jl +++ b/src/bam/reader.jl @@ -10,7 +10,7 @@ Create a data reader of the BAM file format. * `input`: data source * `index=nothing`: filepath to a random access index (currently *bai* is supported) """ -mutable struct Reader{T} <: BioCore.IO.AbstractReader +mutable struct Reader{T} <: BioGenerics.IO.AbstractReader stream::BGZFStreams.BGZFStream{T} header::SAM.Header start_offset::BGZFStreams.VirtualOffset @@ -23,17 +23,15 @@ function Base.eltype(::Type{Reader{T}}) where T return Record end -function BioCore.IO.stream(reader::Reader) +function BioGenerics.IO.stream(reader::Reader) return reader.stream end function Reader(input::IO; index=nothing) if isa(index, AbstractString) index = BAI(index) - else - if index != nothing - error("unrecognizable index argument") - end + elseif index != nothing + error("unrecognizable index argument") end reader = init_bam_reader(input) reader.index = index @@ -89,6 +87,7 @@ function init_bam_reader(input::BGZFStreams.BGZFStream) A = read(input, UInt8) M = read(input, UInt8) x = read(input, UInt8) + if B != UInt8('B') || A != UInt8('A') || M != UInt8('M') || x != 0x01 error("input was not a valid BAM file") end @@ -98,21 +97,22 @@ function init_bam_reader(input::BGZFStreams.BGZFStream) samreader = SAM.Reader(IOBuffer(read(input, textlen))) # reference sequences - refseqnames = String[] - refseqlens = Int[] n_refs = read(input, Int32) - for _ in 1:n_refs + refseqnames = Vector{String}(undef, n_refs) + refseqlens = Vector{Int}(undef, n_refs) + @inbounds for i in 1:n_refs namelen = read(input, Int32) data = read(input, namelen) seqname = unsafe_string(pointer(data)) seqlen = read(input, Int32) - push!(refseqnames, seqname) - push!(refseqlens, seqlen) + refseqnames[i] = seqname + refseqlens[i] = seqlen end voffset = isa(input.io, Base.AbstractPipe) ? BGZFStreams.VirtualOffset(0, 0) : BGZFStreams.virtualoffset(input) + return Reader( input, samreader.header, diff --git a/src/bam/record.jl b/src/bam/record.jl index 107f127..13c5608 100644 --- a/src/bam/record.jl +++ b/src/bam/record.jl @@ -163,7 +163,8 @@ function checked_refid(record::Record) id = refid(record) if id == 0 throw(ArgumentError("record is not mapped")) - elseif !isdefined(record, :reader) + end + if !isdefined(record, :reader) throw(ArgumentError("reader is not defined")) end return id @@ -565,42 +566,42 @@ function Base.values(record::Record) end -# BioCore Methods +# BioGenerics Methods # ----------- -function BioCore.isfilled(record::Record) +function BioGenerics.isfilled(record::Record) return record.block_size != 0 end -function BioCore.seqname(record::Record) +function BioGenerics.seqname(record::Record) return tempname(record) end -function BioCore.hasseqname(record::Record) +function BioGenerics.hasseqname(record::Record) return hastempname(record) end -function BioCore.sequence(record::Record) +function BioGenerics.sequence(record::Record) return sequence(record) end -function BioCore.hassequence(record::Record) +function BioGenerics.hassequence(record::Record) return hassequence(record) end -function BioCore.leftposition(record::Record) +function BioGenerics.leftposition(record::Record) return position(record) end -function BioCore.hasleftposition(record::Record) +function BioGenerics.hasleftposition(record::Record) return hasposition(record) end -function BioCore.rightposition(record::Record) +function BioGenerics.rightposition(record::Record) return rightposition(record) end -function BioCore.hasrightposition(record::Record) +function BioGenerics.hasrightposition(record::Record) return hasrightposition(record) end @@ -612,9 +613,10 @@ end function data_size(record::Record) if isfilled(record) return record.block_size - FIXED_FIELDS_BYTES + sizeof(record.block_size) - else - return 0 end + + return 0 + end function checkfilled(record::Record) diff --git a/src/bam/writer.jl b/src/bam/writer.jl index f017424..2460b3f 100644 --- a/src/bam/writer.jl +++ b/src/bam/writer.jl @@ -10,7 +10,7 @@ Create a data writer of the BAM file format. * `output`: data sink * `header`: SAM header object """ -mutable struct Writer <: BioCore.IO.AbstractWriter +mutable struct Writer <: BioGenerics.IO.AbstractWriter stream::BGZFStreams.BGZFStream end @@ -25,7 +25,7 @@ function Writer(stream::BGZFStreams.BGZFStream, header::SAM.Header) return Writer(stream) end -function BioCore.IO.stream(writer::Writer) +function BioGenerics.IO.stream(writer::Writer) return writer.stream end diff --git a/src/sam/reader.jl b/src/sam/reader.jl index d26a840..3270957 100644 --- a/src/sam/reader.jl +++ b/src/sam/reader.jl @@ -1,16 +1,22 @@ # SAM Reader # ========= -mutable struct Reader <: BioCore.IO.AbstractReader - state::BioCore.Ragel.State +mutable struct Reader{S <: TranscodingStream} <: BioGenerics.IO.AbstractReader + state::State{S} header::Header +end - function Reader(input::BufferedStreams.BufferedInputStream) - reader = new(BioCore.Ragel.State(sam_header_machine.start_state, input), Header()) - readheader!(reader) - reader.state.cs = sam_body_machine.start_state - return reader - end +function Reader(state::State{S}) where {S <: TranscodingStream} + + rdr = Reader(state, Header()) + + cs, ln, f = readheader!(rdr.state.stream, rdr.header, (sam_machine_header.start_state, rdr.state.linenum)) + + rdr.state.state = sam_machine_body.start_state + rdr.state.linenum = ln + rdr.state.filled = false + + return rdr end """ @@ -22,10 +28,22 @@ Create a data reader of the SAM file format. * `input`: data source """ function Reader(input::IO) - return Reader(BufferedStreams.BufferedInputStream(input)) + + if input isa TranscodingStream + return Reader(State(input, 1, 1, false)) + end + + stream = TranscodingStreams.NoopStream(input) + + return Reader(State(stream, 1, 1, false)) + end -function BioCore.IO.stream(reader::Reader) +function Base.eltype(::Type{<:Reader}) + return Record +end + +function BioGenerics.IO.stream(reader::Reader) return reader.state.stream end @@ -38,224 +56,42 @@ function header(reader::Reader)::Header return reader.header end -function Base.eltype(::Type{Reader}) - return Record +function Base.close(reader::Reader) + if reader.state.stream isa IO + close(reader.state.stream) + end + return nothing end -# file = header . body -# header = metainfo* -# body = record* -isinteractive() && info("compiling SAM") -const sam_metainfo_machine, sam_record_machine, sam_header_machine, sam_body_machine = (function () - cat = Automa.RegExp.cat - rep = Automa.RegExp.rep - alt = Automa.RegExp.alt - opt = Automa.RegExp.opt - any = Automa.RegExp.any - - metainfo = let - tag = re"[A-Z][A-Z]" \ cat("CO") - tag.actions[:enter] = [:mark1] - tag.actions[:exit] = [:metainfo_tag] - - dict = let - key = re"[A-Za-z][A-Za-z0-9]" - key.actions[:enter] = [:mark2] - key.actions[:exit] = [:metainfo_dict_key] - val = re"[ -~]+" - val.actions[:enter] = [:mark2] - val.actions[:exit] = [:metainfo_dict_val] - keyval = cat(key, ':', val) - - cat(keyval, rep(cat('\t', keyval))) - end - dict.actions[:enter] = [:mark1] - dict.actions[:exit] = [:metainfo_val] - - co = cat("CO") - co.actions[:enter] = [:mark1] - co.actions[:exit] = [:metainfo_tag] - - comment = re"[^\r\n]*" - comment.actions[:enter] = [:mark1] - comment.actions[:exit] = [:metainfo_val] - - cat('@', alt(cat(tag, '\t', dict), cat(co, '\t', comment))) +function index!(record::MetaInfo) + stream = TranscodingStreams.NoopStream(IOBuffer(record.data)) + found = index!(stream, record) + if !found + throw(ArgumentError("invalid SAM metadata")) end - metainfo.actions[:enter] = [:anchor] - metainfo.actions[:exit] = [:metainfo] + return record +end - record = let - qname = re"[!-?A-~]+" - qname.actions[:enter] = [:mark] - qname.actions[:exit] = [:record_qname] - - flag = re"[0-9]+" - flag.actions[:enter] = [:mark] - flag.actions[:exit] = [:record_flag] - - rname = re"\*|[!-()+-<>-~][!-~]*" - rname.actions[:enter] = [:mark] - rname.actions[:exit] = [:record_rname] - - pos = re"[0-9]+" - pos.actions[:enter] = [:mark] - pos.actions[:exit] = [:record_pos] - - mapq = re"[0-9]+" - mapq.actions[:enter] = [:mark] - mapq.actions[:exit] = [:record_mapq] - - cigar = re"\*|([0-9]+[MIDNSHPX=])+" - cigar.actions[:enter] = [:mark] - cigar.actions[:exit] = [:record_cigar] - - rnext = re"\*|=|[!-()+-<>-~][!-~]*" - rnext.actions[:enter] = [:mark] - rnext.actions[:exit] = [:record_rnext] - - pnext = re"[0-9]+" - pnext.actions[:enter] = [:mark] - pnext.actions[:exit] = [:record_pnext] - - tlen = re"[-+]?[0-9]+" - tlen.actions[:enter] = [:mark] - tlen.actions[:exit] = [:record_tlen] - - seq = re"\*|[A-Za-z=.]+" - seq.actions[:enter] = [:mark] - seq.actions[:exit] = [:record_seq] - - qual = re"[!-~]+" - qual.actions[:enter] = [:mark] - qual.actions[:exit] = [:record_qual] - - field = let - tag = re"[A-Za-z][A-Za-z0-9]" - val = alt( - re"A:[!-~]", - re"i:[-+]?[0-9]+", - re"f:[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?", - re"Z:[ !-~]*", - re"H:([0-9A-F][0-9A-F])*", - re"B:[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+") - - cat(tag, ':', val) - end - field.actions[:enter] = [:mark] - field.actions[:exit] = [:record_field] - - cat( - qname, '\t', - flag, '\t', - rname, '\t', - pos, '\t', - mapq, '\t', - cigar, '\t', - rnext, '\t', - pnext, '\t', - tlen, '\t', - seq, '\t', - qual, - rep(cat('\t', field))) +function index!(record::Record) + stream = TranscodingStreams.NoopStream(IOBuffer(record.data)) + found = index!(stream, record) + if !found + throw(ArgumentError("invalid SAM record")) end - record.actions[:enter] = [:anchor] - record.actions[:exit] = [:record] + return record +end - newline = let - lf = re"\n" - lf.actions[:enter] = [:countline] +function Base.read!(rdr::Reader, rec::Record) - cat(re"\r?", lf) + cs, ln, f = readrecord!(rdr.state.stream, rec, (rdr.state.state, rdr.state.linenum)) + + rdr.state.state = cs + rdr.state.linenum = ln + rdr.state.filled = f + + if !f + cs == 0 && throw(EOFError()) + throw(ArgumentError("malformed SAM file")) end - - header′ = rep(cat(metainfo, newline)) - header′.actions[:exit] = [:header] - header = cat(header′, opt(any() \ cat('@'))) # look ahead - - body = rep(cat(record, newline)) - - return map(Automa.compile, (metainfo, record, header, body)) -end)() - -const sam_metainfo_actions = Dict( - :metainfo_tag => :(record.tag = (mark1:p-1) .- offset), - :metainfo_val => :(record.val = (mark1:p-1) .- offset), - :metainfo_dict_key => :(push!(record.dictkey, (mark2:p-1) .- offset)), - :metainfo_dict_val => :(push!(record.dictval, (mark2:p-1) .- offset)), - :metainfo => quote - BioCore.ReaderHelper.resize_and_copy!(record.data, data, offset+1:p-1) - record.filled = (offset+1:p-1) .- offset - end, - :anchor => :(), - :mark1 => :(mark1 = p), - :mark2 => :(mark2 = p)) -eval( - BioCore.ReaderHelper.generate_index_function( - MetaInfo, - sam_metainfo_machine, - :(mark1 = mark2 = offset = 0), - sam_metainfo_actions)) -eval( - BioCore.ReaderHelper.generate_readheader_function( - Reader, - MetaInfo, - sam_header_machine, - :(mark1 = mark2 = offset = 0), - merge(sam_metainfo_actions, Dict( - :metainfo => quote - BioCore.ReaderHelper.resize_and_copy!(record.data, data, BioCore.ReaderHelper.upanchor!(stream):p-1) - record.filled = (offset+1:p-1) .- offset - @assert isfilled(record) - push!(reader.header.metainfo, record) - BioCore.ReaderHelper.ensure_margin!(stream) - record = MetaInfo() - end, - :header => :(finish_header = true; @escape), - :countline => :(linenum += 1), - :anchor => :(BioCore.ReaderHelper.anchor!(stream, p); offset = p - 1))), - quote - if !eof(stream) - stream.position -= 1 # cancel look-ahead - end - end)) - -const sam_record_actions = Dict( - :record_qname => :(record.qname = (mark:p-1) .- offset), - :record_flag => :(record.flag = (mark:p-1) .- offset), - :record_rname => :(record.rname = (mark:p-1) .- offset), - :record_pos => :(record.pos = (mark:p-1) .- offset), - :record_mapq => :(record.mapq = (mark:p-1) .- offset), - :record_cigar => :(record.cigar = (mark:p-1) .- offset), - :record_rnext => :(record.rnext = (mark:p-1) .- offset), - :record_pnext => :(record.pnext = (mark:p-1) .- offset), - :record_tlen => :(record.tlen = (mark:p-1) .- offset), - :record_seq => :(record.seq = (mark:p-1) .- offset), - :record_qual => :(record.qual = (mark:p-1) .- offset), - :record_field => :(push!(record.fields, (mark:p-1) .- offset)), - :record => quote - BioCore.ReaderHelper.resize_and_copy!(record.data, data, 1:p-1) - record.filled = (offset+1:p-1) .- offset - end, - :anchor => :(), - :mark => :(mark = p)) -eval( - BioCore.ReaderHelper.generate_index_function( - Record, - sam_record_machine, - :(mark = offset = 0), - sam_record_actions)) -eval( - BioCore.ReaderHelper.generate_read_function( - Reader, - sam_body_machine, - :(mark = offset = 0), - merge(sam_record_actions, Dict( - :record => quote - BioCore.ReaderHelper.resize_and_copy!(record.data, data, BioCore.ReaderHelper.upanchor!(stream):p-1) - record.filled = (offset+1:p-1) .- offset - found_record = true - @escape - end, - :countline => :(linenum += 1), - :anchor => :(BioCore.ReaderHelper.anchor!(stream, p); offset = p - 1))))) + return rec +end diff --git a/src/sam/readrecord.jl b/src/sam/readrecord.jl new file mode 100644 index 0000000..e7ea75f --- /dev/null +++ b/src/sam/readrecord.jl @@ -0,0 +1,395 @@ +# Automa.jl generated readrecord! and readmetainfo! functions +# ======================================== + +# file = header . body +# header = metainfo* +# body = record* +const sam_machine_metainfo, sam_machine_record, sam_machine_header, sam_machine_body, sam_machine = (function () + + isinteractive() && info("compiling SAM") + + cat = Automa.RegExp.cat + rep = Automa.RegExp.rep + alt = Automa.RegExp.alt + opt = Automa.RegExp.opt + any = Automa.RegExp.any + + metainfo = let + tag = re"[A-Z][A-Z]" \ cat("CO") + tag.actions[:enter] = [:pos1] + tag.actions[:exit] = [:metainfo_tag] + + dict = let + key = re"[A-Za-z][A-Za-z0-9]" + key.actions[:enter] = [:pos2] + key.actions[:exit] = [:metainfo_dict_key] + val = re"[ -~]+" + val.actions[:enter] = [:pos2] + val.actions[:exit] = [:metainfo_dict_val] + keyval = cat(key, ':', val) + + cat(keyval, rep(cat('\t', keyval))) + end + dict.actions[:enter] = [:pos1] + dict.actions[:exit] = [:metainfo_val] + + co = cat("CO") + co.actions[:enter] = [:pos1] + co.actions[:exit] = [:metainfo_tag] + + comment = re"[^\r\n]*" + comment.actions[:enter] = [:pos1] + comment.actions[:exit] = [:metainfo_val] + + cat('@', alt(cat(tag, '\t', dict), cat(co, '\t', comment))) + end + metainfo.actions[:enter] = [:mark] + metainfo.actions[:exit] = [:metainfo] + + record = let + qname = re"[!-?A-~]+" + qname.actions[:enter] = [:pos] + qname.actions[:exit] = [:record_qname] + + flag = re"[0-9]+" + flag.actions[:enter] = [:pos] + flag.actions[:exit] = [:record_flag] + + rname = re"\*|[!-()+-<>-~][!-~]*" + rname.actions[:enter] = [:pos] + rname.actions[:exit] = [:record_rname] + + pos = re"[0-9]+" + pos.actions[:enter] = [:pos] + pos.actions[:exit] = [:record_pos] + + mapq = re"[0-9]+" + mapq.actions[:enter] = [:pos] + mapq.actions[:exit] = [:record_mapq] + + cigar = re"\*|([0-9]+[MIDNSHPX=])+" + cigar.actions[:enter] = [:pos] + cigar.actions[:exit] = [:record_cigar] + + rnext = re"\*|=|[!-()+-<>-~][!-~]*" + rnext.actions[:enter] = [:pos] + rnext.actions[:exit] = [:record_rnext] + + pnext = re"[0-9]+" + pnext.actions[:enter] = [:pos] + pnext.actions[:exit] = [:record_pnext] + + tlen = re"[-+]?[0-9]+" + tlen.actions[:enter] = [:pos] + tlen.actions[:exit] = [:record_tlen] + + seq = re"\*|[A-Za-z=.]+" + seq.actions[:enter] = [:pos] + seq.actions[:exit] = [:record_seq] + + qual = re"[!-~]+" + qual.actions[:enter] = [:pos] + qual.actions[:exit] = [:record_qual] + + field = let + tag = re"[A-Za-z][A-Za-z0-9]" + val = alt( + re"A:[!-~]", + re"i:[-+]?[0-9]+", + re"f:[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?", + re"Z:[ !-~]*", + re"H:([0-9A-F][0-9A-F])*", + re"B:[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+") + + cat(tag, ':', val) + end + field.actions[:enter] = [:pos] + field.actions[:exit] = [:record_field] + + cat( + qname, '\t', + flag, '\t', + rname, '\t', + pos, '\t', + mapq, '\t', + cigar, '\t', + rnext, '\t', + pnext, '\t', + tlen, '\t', + seq, '\t', + qual, + rep(cat('\t', field))) + end + record.actions[:enter] = [:mark] + record.actions[:exit] = [:record] + + newline = let + lf = re"\n" + lf.actions[:enter] = [:countline] + + cat(re"\r?", lf) + end + + header′ = rep(cat(metainfo, newline)) + header′.actions[:exit] = [:header] + header = cat(header′, opt(any() \ cat('@'))) # look ahead + + body = rep(cat(record, newline)) + body.actions[:exit] = [:body] + + sam = cat(header, body) + + return map(Automa.compile, (metainfo, record, header, body, sam)) +end)() + +# write("sam_machine_metainfo.dot", Automa.machine2dot(sam_machine_metainfo)) +# run(`dot -Tsvg -o sam_machine_metainfo.svg sam_machine_metainfo.dot`) +# +# write("sam_machine_record.dot", Automa.machine2dot(sam_machine_record)) +# run(`dot -Tsvg -o sam_machine_record.svg sam_machine_record.dot`) +# +# write("sam_machine_header.dot", Automa.machine2dot(sam_machine_header)) +# run(`dot -Tsvg -o sam_machine_header.svg sam_machine_header.dot`) +# +# write("sam_machine_body.dot", Automa.machine2dot(sam_machine_body)) +# run(`dot -Tsvg -o sam_machine_body.svg sam_machine_body.dot`) +# +# write("sam_machine.dot", Automa.machine2dot(sam_machine)) +# run(`dot -Tsvg -o sam_machine.svg sam_machine.dot`) + +function appendfrom!(dst, dpos, src, spos, n) + if length(dst) < dpos + n - 1 + resize!(dst, dpos + n - 1) + end + copyto!(dst, dpos, src, spos, n) + return dst +end + +const sam_actions_metainfo = Dict( + :mark => :(@mark), + :pos1 => :(pos1 = @relpos(p)), + :pos2 => :(pos2 = @relpos(p)), + :metainfo_tag => :(metainfo.tag = pos1:@relpos(p-1)), + :metainfo_val => :(metainfo.val = pos1:@relpos(p-1)), + :metainfo_dict_key => :(push!(metainfo.dictkey, pos2:@relpos(p-1))), + :metainfo_dict_val => :(push!(metainfo.dictval, pos2:@relpos(p-1))), + :metainfo => quote + let markpos = @markpos() + + appendfrom!(metainfo.data, 1, data, markpos, length(markpos:p-1)) + + metainfo.filled = @relpos(markpos):@relpos(p-1) + + found_metainfo = true + end + end +) + +const sam_actions_header = merge( + sam_actions_metainfo, + Dict( + :countline => :(linenum += 1), + :header => quote + + finish_header = true + + if !eof(stream) + p -= 1 # cancel look-ahead + end + + @escape + end + ) +) + +const sam_actions_record = Dict( + :mark => :(@mark), + :pos => :(pos = @relpos(p)), + :record_qname => :(record.qname = pos:@relpos(p-1)), + :record_flag => :(record.flag = pos:@relpos(p-1)), + :record_rname => :(record.rname = pos:@relpos(p-1)), + :record_pos => :(record.pos = pos:@relpos(p-1)), + :record_mapq => :(record.mapq = pos:@relpos(p-1)), + :record_cigar => :(record.cigar = pos:@relpos(p-1)), + :record_rnext => :(record.rnext = pos:@relpos(p-1)), + :record_pnext => :(record.pnext = pos:@relpos(p-1)), + :record_tlen => :(record.tlen = pos:@relpos(p-1)), + :record_seq => :(record.seq = pos:@relpos(p-1)), + :record_qual => :(record.qual = pos:@relpos(p-1)), + :record_field => :(push!(record.fields, pos:@relpos(p-1))), + :record => quote + let markpos = @markpos() + + appendfrom!(record.data, 1, data, markpos, length(markpos:p-1)) + + record.filled = @relpos(markpos):@relpos(p-1) + + found_record = true + @escape + end + end +) + +const sam_actions_body = merge( + sam_actions_record, + Dict( + :countline => :(linenum += 1), + :body => quote + finish_body = true + @escape + end + ) +) + +# const sam_actions = merge( +# sam_actions_header, +# sam_actions_body +# ) + +const sam_context = Automa.CodeGenContext( + generator = :goto, + checkbounds = false, + loopunroll = 0 +) + +const sam_initcode_metainfo = quote + pos1 = 0 + pos2 = 0 + found_metainfo = false +end + +const sam_initcode_record = quote + pos = 0 + found_record = false +end + +const sam_initcode_header = quote + $(sam_initcode_metainfo) + metainfo = MetaInfo() + finish_header = false + cs, linenum = state +end + +const sam_initcode_body = quote + $(sam_initcode_record) + finish_body = false + cs, linenum = state +end + +const sam_loopcode_metainfo = quote + + if cs < 0 + throw(ArgumentError("malformed metainfo at pos $(p)")) + end + + if found_metainfo + @goto __return__ + end +end + +const sam_returncode_metainfo = quote + return found_metainfo +end + + +Automa.Stream.generate_reader( + :index!, + sam_machine_metainfo, + arguments = (:(metainfo::MetaInfo),), + actions = sam_actions_metainfo, + context = sam_context, + initcode = sam_initcode_metainfo, + loopcode = sam_loopcode_metainfo, + returncode = sam_returncode_metainfo +) |> eval + +const sam_loopcode_header = quote + + if cs < 0 + throw(ArgumentError("malformed metainfo at line $(linenum)")) + end + + if found_metainfo + push!(header, metainfo) + found_metainfo = false + end + + metainfo = MetaInfo() + + if finish_header + @goto __return__ + end +end + +const sam_returncode_header = quote + return cs, linenum, finish_header +end + +Automa.Stream.generate_reader( + :readheader!, + sam_machine_header, + arguments = (:(header::SAM.Header), :(state::Tuple{Int,Int})), + actions = sam_actions_header, + context = sam_context, + initcode = sam_initcode_header, + loopcode = sam_loopcode_header, + returncode = sam_returncode_header +) |> eval + + +const sam_loopcode_record = quote + + if cs < 0 + throw(ArgumentError("malformed SAM record at position $(p), line $(linenum)")) + end + + # # if cs != 0 + # # throw(ArgumentError(string("failed to index ", $(record_type), " ~>", repr(String(data[p:min(p+7,p_end)]))))) + # # end + + if found_record + @goto __return__ + end + +end + +const sam_loopcode_body = quote + + $(sam_loopcode_record) + + if finish_body + @goto __return__ + end +end + +const sam_returncode_record = quote + return found_record +end + +Automa.Stream.generate_reader( + :index!, + sam_machine_record, + arguments = (:(record::Record),), + actions = sam_actions_record, + context = sam_context, + initcode = sam_initcode_record, + loopcode = sam_loopcode_record, + returncode = sam_returncode_record +) |> eval + + + +const sam_returncode_body = quote + return cs, linenum, found_record +end + +Automa.Stream.generate_reader( + :readrecord!, + sam_machine_body, + arguments = (:(record::Record), :(state::Tuple{Int,Int})), + actions = sam_actions_body, + context = sam_context, + initcode = sam_initcode_body, + loopcode = sam_loopcode_body, + returncode = sam_returncode_body +) |> eval diff --git a/src/sam/record.jl b/src/sam/record.jl index e627727..a61e0df 100644 --- a/src/sam/record.jl +++ b/src/sam/record.jl @@ -299,9 +299,10 @@ Get the alignment of `record`. function alignment(record::Record)::BioAlignments.Alignment if ismapped(record) return BioAlignments.Alignment(cigar(record), 1, position(record)) - else - return BioAlignments.Alignment(BioAlignments.AlignmentAnchor[]) end + + return BioAlignments.Alignment(BioAlignments.AlignmentAnchor[]) + end function hasalignment(record::Record) @@ -474,23 +475,30 @@ function Base.getindex(record::Record, tag::AbstractString) else hi = first(record.fields[i+1]) - 2 end + if typ == UInt8('A') @assert lo == hi return Char(record.data[lo]) - elseif typ == UInt8('i') + end + if typ == UInt8('i') return unsafe_parse_decimal(Int, record.data, lo:hi) - elseif typ == UInt8('f') + end + if typ == UInt8('f') # TODO: Call a C function directly for speed? return parse(Float32, SubString(record.data[lo:hi])) - elseif typ == UInt8('Z') - return String(record.data[lo:hi]) - elseif typ == UInt8('H') - return parse_hexarray(record.data, lo:hi) - elseif typ == UInt8('B') - return parse_typedarray(record.data, lo:hi) - else - throw(ArgumentError("type code '$(Char(typ))' is not defined")) end + if typ == UInt8('Z') + return String(record.data[lo:hi]) + end + if typ == UInt8('H') + return parse_hexarray(record.data, lo:hi) + end + if typ == UInt8('B') + return parse_typedarray(record.data, lo:hi) + end + + throw(ArgumentError("type code '$(Char(typ))' is not defined")) + end function Base.keys(record::Record) @@ -506,39 +514,39 @@ end # Bio Methods # ----------- -function BioCore.isfilled(record::Record) +function BioGenerics.isfilled(record::Record) return !isempty(record.filled) end -function BioCore.seqname(record::Record) +function BioGenerics.seqname(record::Record) return tempname(record) end -function BioCore.hasseqname(record::Record) +function BioGenerics.hasseqname(record::Record) return hastempname(record) end -function BioCore.sequence(record::Record) +function BioGenerics.sequence(record::Record) return sequence(record) end -function BioCore.hassequence(record::Record) +function BioGenerics.hassequence(record::Record) return hassequence(record) end -function BioCore.rightposition(record::Record) +function BioGenerics.rightposition(record::Record) return rightposition(record) end -function BioCore.hasrightposition(record::Record) +function BioGenerics.hasrightposition(record::Record) return hasrightposition(record) end -function BioCore.leftposition(record::Record) +function BioGenerics.leftposition(record::Record) return position(record) end -function BioCore.hasleftposition(record::Record) +function BioGenerics.hasleftposition(record::Record) return hasposition(record) end @@ -602,21 +610,28 @@ function parse_typedarray(data::Vector{UInt8}, range::UnitRange{Int}) xs = split(String(data[first(range)+2:last(range)])) if t == UInt8('c') return [parse(Int8, x) for x in xs] - elseif t == UInt8('C') - return [parse(UInt8, x) for x in xs] - elseif t == UInt8('s') - return [parse(Int16, x) for x in xs] - elseif t == UInt8('S') - return [parse(UInt16, x) for x in xs] - elseif t == UInt8('i') - return [parse(Int32, x) for x in xs] - elseif t == UInt8('I') - return [parse(UInt32, x) for x in xs] - elseif t == UInt8('f') - return [parse(Float32, x) for x in xs] - else - throw(ArgumentError("type code '$(Char(t))' is not defined")) end + if t == UInt8('C') + return [parse(UInt8, x) for x in xs] + end + if t == UInt8('s') + return [parse(Int16, x) for x in xs] + end + if t == UInt8('S') + return [parse(UInt16, x) for x in xs] + end + if t == UInt8('i') + return [parse(Int32, x) for x in xs] + end + if t == UInt8('I') + return [parse(UInt32, x) for x in xs] + end + if t == UInt8('f') + return [parse(Float32, x) for x in xs] + end + + throw(ArgumentError("type code '$(Char(t))' is not defined")) + end function ismissing(record::Record, range::UnitRange{Int}) diff --git a/src/sam/sam.jl b/src/sam/sam.jl index b917f37..95e36f6 100644 --- a/src/sam/sam.jl +++ b/src/sam/sam.jl @@ -3,23 +3,73 @@ module SAM -using BioCore +using BioGenerics import Automa import Automa.RegExp: @re_str +import Automa.Stream: @mark, @markpos, @relpos, @abspos import BioAlignments -import BioCore.Exceptions: missingerror -import BioCore.RecordHelper: unsafe_parse_decimal -import BioCore: isfilled, header +import BioGenerics: BioGenerics, isfilled, header +import BioGenerics.Exceptions: missingerror +import BioGenerics.Automa: State import BioSequences -import BufferedStreams +import TranscodingStreams: TranscodingStreams, TranscodingStream + using Printf: @sprintf + +#TODO: update import BioCore.RecordHelper: unsafe_parse_decimal +# r"[0-9]+" must match `data[range]`. +function unsafe_parse_decimal(::Type{T}, data::Vector{UInt8}, range::UnitRange{Int}) where {T<:Unsigned} + x = zero(T) + @inbounds for i in range + x = Base.Checked.checked_mul(x, 10 % T) + x = Base.Checked.checked_add(x, (data[i] - UInt8('0')) % T) + end + return x +end + +# r"[-+]?[0-9]+" must match `data[range]`. +function unsafe_parse_decimal(::Type{T}, data::Vector{UInt8}, range::UnitRange{Int}) where {T<:Signed} + lo = first(range) + if data[lo] == UInt8('-') + sign = T(-1) + lo += 1 + elseif data[lo] == UInt8('+') + sign = T(+1) + lo += 1 + else + sign = T(+1) + end + x = zero(T) + @inbounds for i in lo:last(range) + x = Base.Checked.checked_mul(x, 10 % T) + x = Base.Checked.checked_add(x, (data[i] - UInt8('0')) % T) + end + return sign * x +end + +# #TODO: update BioCore.Ragel.State (will likely change with TrnscodingStreams). +# import BufferedStreams: BufferedStreams, BufferedInputStream +# # A type keeping track of a ragel-based parser's state. +# mutable struct State{T<:BufferedInputStream} +# stream::T # input stream +# cs::Int # current DFA state of Ragel +# linenum::Int # line number: parser is responsible for updating this +# finished::Bool # true if finished (regardless of where in the stream we are) +# end + +# function State(initstate::Int, input::BufferedInputStream) +# return State(input, initstate, 1, false) +# end + + include("flags.jl") include("metainfo.jl") include("record.jl") include("header.jl") include("reader.jl") +include("readrecord.jl") include("writer.jl") end diff --git a/src/sam/writer.jl b/src/sam/writer.jl index e7a5ddd..801ed68 100644 --- a/src/sam/writer.jl +++ b/src/sam/writer.jl @@ -10,7 +10,7 @@ Create a data writer of the SAM file format. * `output`: data sink * `header=Header()`: SAM header object """ -mutable struct Writer <: BioCore.IO.AbstractWriter +mutable struct Writer <: BioGenerics.IO.AbstractWriter stream::IO function Writer(output::IO, header::Header=Header()) @@ -20,7 +20,7 @@ mutable struct Writer <: BioCore.IO.AbstractWriter end end -function BioCore.IO.stream(writer::Writer) +function BioGenerics.IO.stream(writer::Writer) return writer.stream end diff --git a/test/runtests.jl b/test/runtests.jl index d76d99f..10ca2d0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,25 +1,16 @@ using Test + +using BioGenerics +using FormatSpecimens using GenomicFeatures using XAM + import BioAlignments: Alignment, AlignmentAnchor, OP_START, OP_MATCH, OP_DELETE -using FormatSpecimens import BGZFStreams: BGZFStream -import BioCore.Exceptions: MissingFieldException +import BioGenerics.Exceptions: MissingFieldException import BioSequences: @dna_str, @aa_str -import BioCore: - header, - isfilled, - seqname, - hasseqname, - sequence, - hassequence, - leftposition, - rightposition, - hasleftposition, - hasrightposition - # Generate a random range within `range`. function randrange(range) x = rand(range)