From c4f3c4b0df70264cfba627263f9f7fb74c51666d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Sun, 2 Feb 2020 22:50:25 +1100 Subject: [PATCH 1/9] Migrate to BioGenerics Basic find and replace. --- Project.toml | 4 ++-- src/bam/bam.jl | 4 ++-- src/bam/reader.jl | 4 ++-- src/bam/record.jl | 20 ++++++++++---------- src/bam/writer.jl | 4 ++-- src/sam/reader.jl | 30 +++++++++++++++--------------- src/sam/record.jl | 18 +++++++++--------- src/sam/sam.jl | 8 ++++---- src/sam/writer.jl | 4 ++-- test/runtests.jl | 19 +++++-------------- 10 files changed, 53 insertions(+), 62 deletions(-) diff --git a/Project.toml b/Project.toml index 10f7cb4..850b570 100644 --- a/Project.toml +++ b/Project.toml @@ -7,7 +7,7 @@ version = "0.1.1" Automa = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b" BGZFStreams = "28d598bf-9b8f-59f1-b38c-5a06b4a0f5e6" BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e" -BioCore = "37cfa864-2cd6-5c12-ad9e-b6597d696c81" +BioGenerics = "47718e42-2ac5-11e9-14af-e5595289c2ea" BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" BufferedStreams = "e1450e63-4bb3-523b-b2a4-4ffa8c0fd77d" GenomicFeatures = "899a7d2d-5c61-547b-bef9-6698a8d05446" @@ -18,7 +18,7 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Automa = "0.7, 0.8" BGZFStreams = "0.3" BioAlignments = "2" -BioCore = "2" +BioGenerics = "0.1" BioSequences = "2" BufferedStreams = "1" GenomicFeatures = "2" diff --git a/src/bam/bam.jl b/src/bam/bam.jl index e055231..6fd1174 100644 --- a/src/bam/bam.jl +++ b/src/bam/bam.jl @@ -3,7 +3,7 @@ module BAM -using BioCore +using BioGenerics using GenomicFeatures using XAM.SAM @@ -11,7 +11,7 @@ import BGZFStreams import BioAlignments import Indexes import BioSequences -import BioCore: isfilled, header +import BioGenerics: isfilled, header import GenomicFeatures: eachoverlap diff --git a/src/bam/reader.jl b/src/bam/reader.jl index 0fabb8f..953f5ca 100644 --- a/src/bam/reader.jl +++ b/src/bam/reader.jl @@ -10,7 +10,7 @@ Create a data reader of the BAM 
file format. * `input`: data source * `index=nothing`: filepath to a random access index (currently *bai* is supported) """ -mutable struct Reader{T} <: BioCore.IO.AbstractReader +mutable struct Reader{T} <: BioGenerics.IO.AbstractReader stream::BGZFStreams.BGZFStream{T} header::SAM.Header start_offset::BGZFStreams.VirtualOffset @@ -23,7 +23,7 @@ function Base.eltype(::Type{Reader{T}}) where T return Record end -function BioCore.IO.stream(reader::Reader) +function BioGenerics.IO.stream(reader::Reader) return reader.stream end diff --git a/src/bam/record.jl b/src/bam/record.jl index ad9c6f1..13c5608 100644 --- a/src/bam/record.jl +++ b/src/bam/record.jl @@ -566,42 +566,42 @@ function Base.values(record::Record) end -# BioCore Methods +# BioGenerics Methods # ----------- -function BioCore.isfilled(record::Record) +function BioGenerics.isfilled(record::Record) return record.block_size != 0 end -function BioCore.seqname(record::Record) +function BioGenerics.seqname(record::Record) return tempname(record) end -function BioCore.hasseqname(record::Record) +function BioGenerics.hasseqname(record::Record) return hastempname(record) end -function BioCore.sequence(record::Record) +function BioGenerics.sequence(record::Record) return sequence(record) end -function BioCore.hassequence(record::Record) +function BioGenerics.hassequence(record::Record) return hassequence(record) end -function BioCore.leftposition(record::Record) +function BioGenerics.leftposition(record::Record) return position(record) end -function BioCore.hasleftposition(record::Record) +function BioGenerics.hasleftposition(record::Record) return hasposition(record) end -function BioCore.rightposition(record::Record) +function BioGenerics.rightposition(record::Record) return rightposition(record) end -function BioCore.hasrightposition(record::Record) +function BioGenerics.hasrightposition(record::Record) return hasrightposition(record) end diff --git a/src/bam/writer.jl b/src/bam/writer.jl index f017424..2460b3f 
100644 --- a/src/bam/writer.jl +++ b/src/bam/writer.jl @@ -10,7 +10,7 @@ Create a data writer of the BAM file format. * `output`: data sink * `header`: SAM header object """ -mutable struct Writer <: BioCore.IO.AbstractWriter +mutable struct Writer <: BioGenerics.IO.AbstractWriter stream::BGZFStreams.BGZFStream end @@ -25,7 +25,7 @@ function Writer(stream::BGZFStreams.BGZFStream, header::SAM.Header) return Writer(stream) end -function BioCore.IO.stream(writer::Writer) +function BioGenerics.IO.stream(writer::Writer) return writer.stream end diff --git a/src/sam/reader.jl b/src/sam/reader.jl index d26a840..5e4f0ea 100644 --- a/src/sam/reader.jl +++ b/src/sam/reader.jl @@ -1,12 +1,12 @@ # SAM Reader # ========= -mutable struct Reader <: BioCore.IO.AbstractReader - state::BioCore.Ragel.State +mutable struct Reader <: BioGenerics.IO.AbstractReader + state::BioGenerics.Ragel.State header::Header function Reader(input::BufferedStreams.BufferedInputStream) - reader = new(BioCore.Ragel.State(sam_header_machine.start_state, input), Header()) + reader = new(BioGenerics.Ragel.State(sam_header_machine.start_state, input), Header()) readheader!(reader) reader.state.cs = sam_body_machine.start_state return reader @@ -25,7 +25,7 @@ function Reader(input::IO) return Reader(BufferedStreams.BufferedInputStream(input)) end -function BioCore.IO.stream(reader::Reader) +function BioGenerics.IO.stream(reader::Reader) return reader.state.stream end @@ -184,36 +184,36 @@ const sam_metainfo_actions = Dict( :metainfo_dict_key => :(push!(record.dictkey, (mark2:p-1) .- offset)), :metainfo_dict_val => :(push!(record.dictval, (mark2:p-1) .- offset)), :metainfo => quote - BioCore.ReaderHelper.resize_and_copy!(record.data, data, offset+1:p-1) + BioGenerics.ReaderHelper.resize_and_copy!(record.data, data, offset+1:p-1) record.filled = (offset+1:p-1) .- offset end, :anchor => :(), :mark1 => :(mark1 = p), :mark2 => :(mark2 = p)) eval( - BioCore.ReaderHelper.generate_index_function( + 
BioGenerics.ReaderHelper.generate_index_function( MetaInfo, sam_metainfo_machine, :(mark1 = mark2 = offset = 0), sam_metainfo_actions)) eval( - BioCore.ReaderHelper.generate_readheader_function( + BioGenerics.ReaderHelper.generate_readheader_function( Reader, MetaInfo, sam_header_machine, :(mark1 = mark2 = offset = 0), merge(sam_metainfo_actions, Dict( :metainfo => quote - BioCore.ReaderHelper.resize_and_copy!(record.data, data, BioCore.ReaderHelper.upanchor!(stream):p-1) + BioGenerics.ReaderHelper.resize_and_copy!(record.data, data, BioGenerics.ReaderHelper.upanchor!(stream):p-1) record.filled = (offset+1:p-1) .- offset @assert isfilled(record) push!(reader.header.metainfo, record) - BioCore.ReaderHelper.ensure_margin!(stream) + BioGenerics.ReaderHelper.ensure_margin!(stream) record = MetaInfo() end, :header => :(finish_header = true; @escape), :countline => :(linenum += 1), - :anchor => :(BioCore.ReaderHelper.anchor!(stream, p); offset = p - 1))), + :anchor => :(BioGenerics.ReaderHelper.anchor!(stream, p); offset = p - 1))), quote if !eof(stream) stream.position -= 1 # cancel look-ahead @@ -234,28 +234,28 @@ const sam_record_actions = Dict( :record_qual => :(record.qual = (mark:p-1) .- offset), :record_field => :(push!(record.fields, (mark:p-1) .- offset)), :record => quote - BioCore.ReaderHelper.resize_and_copy!(record.data, data, 1:p-1) + BioGenerics.ReaderHelper.resize_and_copy!(record.data, data, 1:p-1) record.filled = (offset+1:p-1) .- offset end, :anchor => :(), :mark => :(mark = p)) eval( - BioCore.ReaderHelper.generate_index_function( + BioGenerics.ReaderHelper.generate_index_function( Record, sam_record_machine, :(mark = offset = 0), sam_record_actions)) eval( - BioCore.ReaderHelper.generate_read_function( + BioGenerics.ReaderHelper.generate_read_function( Reader, sam_body_machine, :(mark = offset = 0), merge(sam_record_actions, Dict( :record => quote - BioCore.ReaderHelper.resize_and_copy!(record.data, data, BioCore.ReaderHelper.upanchor!(stream):p-1) + 
BioGenerics.ReaderHelper.resize_and_copy!(record.data, data, BioGenerics.ReaderHelper.upanchor!(stream):p-1) record.filled = (offset+1:p-1) .- offset found_record = true @escape end, :countline => :(linenum += 1), - :anchor => :(BioCore.ReaderHelper.anchor!(stream, p); offset = p - 1))))) + :anchor => :(BioGenerics.ReaderHelper.anchor!(stream, p); offset = p - 1))))) diff --git a/src/sam/record.jl b/src/sam/record.jl index 8989747..a61e0df 100644 --- a/src/sam/record.jl +++ b/src/sam/record.jl @@ -514,39 +514,39 @@ end # Bio Methods # ----------- -function BioCore.isfilled(record::Record) +function BioGenerics.isfilled(record::Record) return !isempty(record.filled) end -function BioCore.seqname(record::Record) +function BioGenerics.seqname(record::Record) return tempname(record) end -function BioCore.hasseqname(record::Record) +function BioGenerics.hasseqname(record::Record) return hastempname(record) end -function BioCore.sequence(record::Record) +function BioGenerics.sequence(record::Record) return sequence(record) end -function BioCore.hassequence(record::Record) +function BioGenerics.hassequence(record::Record) return hassequence(record) end -function BioCore.rightposition(record::Record) +function BioGenerics.rightposition(record::Record) return rightposition(record) end -function BioCore.hasrightposition(record::Record) +function BioGenerics.hasrightposition(record::Record) return hasrightposition(record) end -function BioCore.leftposition(record::Record) +function BioGenerics.leftposition(record::Record) return position(record) end -function BioCore.hasleftposition(record::Record) +function BioGenerics.hasleftposition(record::Record) return hasposition(record) end diff --git a/src/sam/sam.jl b/src/sam/sam.jl index b917f37..d0f006b 100644 --- a/src/sam/sam.jl +++ b/src/sam/sam.jl @@ -3,14 +3,14 @@ module SAM -using BioCore +using BioGenerics import Automa import Automa.RegExp: @re_str import BioAlignments -import BioCore.Exceptions: missingerror -import 
BioCore.RecordHelper: unsafe_parse_decimal -import BioCore: isfilled, header +import BioGenerics.Exceptions: missingerror +import BioGenerics.RecordHelper: unsafe_parse_decimal +import BioGenerics: isfilled, header import BioSequences import BufferedStreams using Printf: @sprintf diff --git a/src/sam/writer.jl b/src/sam/writer.jl index e7a5ddd..801ed68 100644 --- a/src/sam/writer.jl +++ b/src/sam/writer.jl @@ -10,7 +10,7 @@ Create a data writer of the SAM file format. * `output`: data sink * `header=Header()`: SAM header object """ -mutable struct Writer <: BioCore.IO.AbstractWriter +mutable struct Writer <: BioGenerics.IO.AbstractWriter stream::IO function Writer(output::IO, header::Header=Header()) @@ -20,7 +20,7 @@ mutable struct Writer <: BioCore.IO.AbstractWriter end end -function BioCore.IO.stream(writer::Writer) +function BioGenerics.IO.stream(writer::Writer) return writer.stream end diff --git a/test/runtests.jl b/test/runtests.jl index d76d99f..10ca2d0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,25 +1,16 @@ using Test + +using BioGenerics +using FormatSpecimens using GenomicFeatures using XAM + import BioAlignments: Alignment, AlignmentAnchor, OP_START, OP_MATCH, OP_DELETE -using FormatSpecimens import BGZFStreams: BGZFStream -import BioCore.Exceptions: MissingFieldException +import BioGenerics.Exceptions: MissingFieldException import BioSequences: @dna_str, @aa_str -import BioCore: - header, - isfilled, - seqname, - hasseqname, - sequence, - hassequence, - leftposition, - rightposition, - hasleftposition, - hasrightposition - # Generate a random range within `range`. 
function randrange(range) x = rand(range) From 2cad552f6308b28cd7a99aa3584da98716019bc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Sun, 2 Feb 2020 22:51:19 +1100 Subject: [PATCH 2/9] Quick fix for BioCore.RecordHelper: unsafe_parse_decimal --- src/sam/sam.jl | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/src/sam/sam.jl b/src/sam/sam.jl index d0f006b..f7b6c92 100644 --- a/src/sam/sam.jl +++ b/src/sam/sam.jl @@ -9,12 +9,43 @@ import Automa import Automa.RegExp: @re_str import BioAlignments import BioGenerics.Exceptions: missingerror -import BioGenerics.RecordHelper: unsafe_parse_decimal import BioGenerics: isfilled, header import BioSequences import BufferedStreams using Printf: @sprintf + +#TODO: update import BioCore.RecordHelper: unsafe_parse_decimal +# r"[0-9]+" must match `data[range]`. +function unsafe_parse_decimal(::Type{T}, data::Vector{UInt8}, range::UnitRange{Int}) where {T<:Unsigned} + x = zero(T) + @inbounds for i in range + x = Base.Checked.checked_mul(x, 10 % T) + x = Base.Checked.checked_add(x, (data[i] - UInt8('0')) % T) + end + return x +end + +# r"[-+]?[0-9]+" must match `data[range]`. +function unsafe_parse_decimal(::Type{T}, data::Vector{UInt8}, range::UnitRange{Int}) where {T<:Signed} + lo = first(range) + if data[lo] == UInt8('-') + sign = T(-1) + lo += 1 + elseif data[lo] == UInt8('+') + sign = T(+1) + lo += 1 + else + sign = T(+1) + end + x = zero(T) + @inbounds for i in lo:last(range) + x = Base.Checked.checked_mul(x, 10 % T) + x = Base.Checked.checked_add(x, (data[i] - UInt8('0')) % T) + end + return sign * x +end + include("flags.jl") include("metainfo.jl") include("record.jl") From c59a1576755f109e989545e2510a433487b4ec3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Tue, 27 Aug 2019 08:52:35 +1000 Subject: [PATCH 3/9] Quick fix for BioCore.Ragel.State This fix is likely to be replaced when migrating to TranscodingStreams. 
--- src/sam/reader.jl | 4 ++-- src/sam/sam.jl | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/sam/reader.jl b/src/sam/reader.jl index 5e4f0ea..1dc206b 100644 --- a/src/sam/reader.jl +++ b/src/sam/reader.jl @@ -2,11 +2,11 @@ # ========= mutable struct Reader <: BioGenerics.IO.AbstractReader - state::BioGenerics.Ragel.State + state::State header::Header function Reader(input::BufferedStreams.BufferedInputStream) - reader = new(BioGenerics.Ragel.State(sam_header_machine.start_state, input), Header()) + reader = new(State(sam_header_machine.start_state, input), Header()) readheader!(reader) reader.state.cs = sam_body_machine.start_state return reader diff --git a/src/sam/sam.jl b/src/sam/sam.jl index f7b6c92..628b085 100644 --- a/src/sam/sam.jl +++ b/src/sam/sam.jl @@ -46,6 +46,21 @@ function unsafe_parse_decimal(::Type{T}, data::Vector{UInt8}, range::UnitRange{I return sign * x end +#TODO: update BioCore.Ragel.State (will likely change with TranscodingStreams). +import BufferedStreams: BufferedStreams, BufferedInputStream +# A type keeping track of a ragel-based parser's state. +mutable struct State{T<:BufferedInputStream} + stream::T # input stream + cs::Int # current DFA state of Ragel + linenum::Int # line number: parser is responsible for updating this + finished::Bool # true if finished (regardless of where in the stream we are) +end + +function State(initstate::Int, input::BufferedInputStream) + return State(input, initstate, 1, false) +end + + include("flags.jl") include("metainfo.jl") include("record.jl") From 365f7922fae84beea86a350e35887f3c54d8df18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Sun, 2 Feb 2020 23:16:46 +1100 Subject: [PATCH 4/9] Quick fix for BioGenerics.ReaderHelper functions This fix is likely to be replaced when migrating to TranscodingStreams. 
--- src/sam/reader.jl | 191 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 180 insertions(+), 11 deletions(-) diff --git a/src/sam/reader.jl b/src/sam/reader.jl index 1dc206b..ceb7a86 100644 --- a/src/sam/reader.jl +++ b/src/sam/reader.jl @@ -1,6 +1,175 @@ # SAM Reader # ========= +@inline function anchor!(stream::BufferedStreams.BufferedInputStream, p, immobilize = true) + stream.anchor = p + stream.immobilized = immobilize + return stream +end + +@inline function upanchor!(stream::BufferedStreams.BufferedInputStream) + @assert stream.anchor != 0 "upanchor! called with no anchor set" + anchor = stream.anchor + stream.anchor = 0 + stream.immobilized = false + return anchor +end + +function ensure_margin!(stream::BufferedStreams.BufferedInputStream) + if stream.position * 20 > length(stream.buffer) * 19 + BufferedStreams.shiftdata!(stream) + end + return nothing +end + +@inline function resize_and_copy!(dst::Vector{UInt8}, src::Vector{UInt8}, r::UnitRange{Int}) + return resize_and_copy!(dst, 1, src, r) +end + +@inline function resize_and_copy!(dst::Vector{UInt8}, dstart::Int, src::Vector{UInt8}, r::UnitRange{Int}) + rlen = length(r) + if length(dst) != dstart + rlen - 1 + resize!(dst, dstart + rlen - 1) + end + copyto!(dst, dstart, src, first(r), rlen) + return dst +end + +function generate_index_function(record_type, machine, init_code, actions; kwargs...) 
+ kwargs = Dict(kwargs) + context = Automa.CodeGenContext( + generator = get(kwargs, :generator, :goto), + checkbounds = get(kwargs, :checkbounds, false), + loopunroll = get(kwargs, :loopunroll, 0) + ) + quote + function index!(record::$(record_type)) + data = record.data + p = 1 + p_end = p_eof = sizeof(data) + initialize!(record) + $(init_code) + cs = $(machine.start_state) + $(Automa.generate_exec_code(context, machine, actions)) + if cs != 0 + throw(ArgumentError(string("failed to index ", $(record_type), " ~>", repr(String(data[p:min(p+7,p_end)]))))) + end + @assert isfilled(record) + return record + end + end +end + +function generate_readheader_function(reader_type, metainfo_type, machine, init_code, actions, finish_code=:()) + quote + function readheader!(reader::$(reader_type)) + _readheader!(reader, reader.state) + end + + function _readheader!(reader::$(reader_type), state::State) + stream = state.stream + ensure_margin!(stream) + cs = state.cs + linenum = state.linenum + data = stream.buffer + p = stream.position + p_end = stream.available + p_eof = -1 + finish_header = false + record = $(metainfo_type)() + + $(init_code) + + while true + $(Automa.generate_exec_code(Automa.CodeGenContext(generator=:table), machine, actions)) + + state.cs = cs + state.finished = cs == 0 + state.linenum = linenum + stream.position = p + + if cs < 0 + error("$($(reader_type)) file format error on line ", linenum) + elseif finish_header + $(finish_code) + break + elseif p > p_eof ≥ 0 + error("incomplete $($(reader_type)) input on line ", linenum) + else + hits_eof = BufferedStreams.fillbuffer!(stream) == 0 + p = stream.position + p_end = stream.available + if hits_eof + p_eof = p_end + end + end + end + end + end +end + +function generate_read_function(reader_type, machine, init_code, actions; kwargs...) 
+ kwargs = Dict(kwargs) + context = Automa.CodeGenContext( + generator=get(kwargs, :generator, :goto), + checkbounds=get(kwargs, :checkbounds, false), + loopunroll=get(kwargs, :loopunroll, 0) + ) + quote + function Base.read!(reader::$(reader_type), record::eltype($(reader_type)))::eltype($(reader_type)) + return _read!(reader, reader.state, record) + end + + function _read!(reader::$(reader_type), state::State, record::eltype($(reader_type))) + stream = state.stream + ensure_margin!(stream) + cs = state.cs + linenum = state.linenum + data = stream.buffer + p = stream.position + p_end = stream.available + p_eof = -1 + found_record = false + initialize!(record) + + $(init_code) + + if state.finished + throw(EOFError()) + end + + while true + $(Automa.generate_exec_code(context, machine, actions)) + + state.cs = cs + state.finished |= cs == 0 + state.linenum = linenum + stream.position = p + + if cs < 0 + error($(reader_type), " file format error on line ", linenum, " ~>", repr(String(data[p:min(p+7,p_end)]))) + elseif found_record + break + elseif cs == 0 + throw(EOFError()) + elseif p > p_eof ≥ 0 + error("incomplete $($(reader_type)) input on line ", linenum) + elseif BufferedStreams.available_bytes(stream) < 64 + hits_eof = BufferedStreams.fillbuffer!(stream) == 0 + p = stream.position + p_end = stream.available + if hits_eof + p_eof = p_end + end + end + end + + @assert isfilled(record) + return record + end + end +end + mutable struct Reader <: BioGenerics.IO.AbstractReader state::State header::Header @@ -184,36 +353,36 @@ const sam_metainfo_actions = Dict( :metainfo_dict_key => :(push!(record.dictkey, (mark2:p-1) .- offset)), :metainfo_dict_val => :(push!(record.dictval, (mark2:p-1) .- offset)), :metainfo => quote - BioGenerics.ReaderHelper.resize_and_copy!(record.data, data, offset+1:p-1) + resize_and_copy!(record.data, data, offset+1:p-1) record.filled = (offset+1:p-1) .- offset end, :anchor => :(), :mark1 => :(mark1 = p), :mark2 => :(mark2 = p)) eval( - 
BioGenerics.ReaderHelper.generate_index_function( + generate_index_function( MetaInfo, sam_metainfo_machine, :(mark1 = mark2 = offset = 0), sam_metainfo_actions)) eval( - BioGenerics.ReaderHelper.generate_readheader_function( + generate_readheader_function( Reader, MetaInfo, sam_header_machine, :(mark1 = mark2 = offset = 0), merge(sam_metainfo_actions, Dict( :metainfo => quote - BioGenerics.ReaderHelper.resize_and_copy!(record.data, data, BioGenerics.ReaderHelper.upanchor!(stream):p-1) + resize_and_copy!(record.data, data, upanchor!(stream):p-1) record.filled = (offset+1:p-1) .- offset @assert isfilled(record) push!(reader.header.metainfo, record) - BioGenerics.ReaderHelper.ensure_margin!(stream) + ensure_margin!(stream) record = MetaInfo() end, :header => :(finish_header = true; @escape), :countline => :(linenum += 1), - :anchor => :(BioGenerics.ReaderHelper.anchor!(stream, p); offset = p - 1))), + :anchor => :(anchor!(stream, p); offset = p - 1))), quote if !eof(stream) stream.position -= 1 # cancel look-ahead @@ -234,28 +403,28 @@ const sam_record_actions = Dict( :record_qual => :(record.qual = (mark:p-1) .- offset), :record_field => :(push!(record.fields, (mark:p-1) .- offset)), :record => quote - BioGenerics.ReaderHelper.resize_and_copy!(record.data, data, 1:p-1) + resize_and_copy!(record.data, data, 1:p-1) record.filled = (offset+1:p-1) .- offset end, :anchor => :(), :mark => :(mark = p)) eval( - BioGenerics.ReaderHelper.generate_index_function( + generate_index_function( Record, sam_record_machine, :(mark = offset = 0), sam_record_actions)) eval( - BioGenerics.ReaderHelper.generate_read_function( + generate_read_function( Reader, sam_body_machine, :(mark = offset = 0), merge(sam_record_actions, Dict( :record => quote - BioGenerics.ReaderHelper.resize_and_copy!(record.data, data, BioGenerics.ReaderHelper.upanchor!(stream):p-1) + resize_and_copy!(record.data, data, upanchor!(stream):p-1) record.filled = (offset+1:p-1) .- offset found_record = true @escape end, 
:countline => :(linenum += 1), - :anchor => :(BioGenerics.ReaderHelper.anchor!(stream, p); offset = p - 1))))) + :anchor => :(anchor!(stream, p); offset = p - 1))))) From e63ce2e3986ada38e67f4e0b5477d8e541983299 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Sat, 7 Sep 2019 21:42:06 +1000 Subject: [PATCH 5/9] Group Automa into readrecord.jl --- src/sam/reader.jl | 387 ----------------------------------------- src/sam/readrecord.jl | 389 ++++++++++++++++++++++++++++++++++++++++++ src/sam/sam.jl | 1 + 3 files changed, 390 insertions(+), 387 deletions(-) create mode 100644 src/sam/readrecord.jl diff --git a/src/sam/reader.jl b/src/sam/reader.jl index ceb7a86..0406165 100644 --- a/src/sam/reader.jl +++ b/src/sam/reader.jl @@ -1,175 +1,6 @@ # SAM Reader # ========= -@inline function anchor!(stream::BufferedStreams.BufferedInputStream, p, immobilize = true) - stream.anchor = p - stream.immobilized = immobilize - return stream -end - -@inline function upanchor!(stream::BufferedStreams.BufferedInputStream) - @assert stream.anchor != 0 "upanchor! called with no anchor set" - anchor = stream.anchor - stream.anchor = 0 - stream.immobilized = false - return anchor -end - -function ensure_margin!(stream::BufferedStreams.BufferedInputStream) - if stream.position * 20 > length(stream.buffer) * 19 - BufferedStreams.shiftdata!(stream) - end - return nothing -end - -@inline function resize_and_copy!(dst::Vector{UInt8}, src::Vector{UInt8}, r::UnitRange{Int}) - return resize_and_copy!(dst, 1, src, r) -end - -@inline function resize_and_copy!(dst::Vector{UInt8}, dstart::Int, src::Vector{UInt8}, r::UnitRange{Int}) - rlen = length(r) - if length(dst) != dstart + rlen - 1 - resize!(dst, dstart + rlen - 1) - end - copyto!(dst, dstart, src, first(r), rlen) - return dst -end - -function generate_index_function(record_type, machine, init_code, actions; kwargs...) 
- kwargs = Dict(kwargs) - context = Automa.CodeGenContext( - generator = get(kwargs, :generator, :goto), - checkbounds = get(kwargs, :checkbounds, false), - loopunroll = get(kwargs, :loopunroll, 0) - ) - quote - function index!(record::$(record_type)) - data = record.data - p = 1 - p_end = p_eof = sizeof(data) - initialize!(record) - $(init_code) - cs = $(machine.start_state) - $(Automa.generate_exec_code(context, machine, actions)) - if cs != 0 - throw(ArgumentError(string("failed to index ", $(record_type), " ~>", repr(String(data[p:min(p+7,p_end)]))))) - end - @assert isfilled(record) - return record - end - end -end - -function generate_readheader_function(reader_type, metainfo_type, machine, init_code, actions, finish_code=:()) - quote - function readheader!(reader::$(reader_type)) - _readheader!(reader, reader.state) - end - - function _readheader!(reader::$(reader_type), state::State) - stream = state.stream - ensure_margin!(stream) - cs = state.cs - linenum = state.linenum - data = stream.buffer - p = stream.position - p_end = stream.available - p_eof = -1 - finish_header = false - record = $(metainfo_type)() - - $(init_code) - - while true - $(Automa.generate_exec_code(Automa.CodeGenContext(generator=:table), machine, actions)) - - state.cs = cs - state.finished = cs == 0 - state.linenum = linenum - stream.position = p - - if cs < 0 - error("$($(reader_type)) file format error on line ", linenum) - elseif finish_header - $(finish_code) - break - elseif p > p_eof ≥ 0 - error("incomplete $($(reader_type)) input on line ", linenum) - else - hits_eof = BufferedStreams.fillbuffer!(stream) == 0 - p = stream.position - p_end = stream.available - if hits_eof - p_eof = p_end - end - end - end - end - end -end - -function generate_read_function(reader_type, machine, init_code, actions; kwargs...) 
- kwargs = Dict(kwargs) - context = Automa.CodeGenContext( - generator=get(kwargs, :generator, :goto), - checkbounds=get(kwargs, :checkbounds, false), - loopunroll=get(kwargs, :loopunroll, 0) - ) - quote - function Base.read!(reader::$(reader_type), record::eltype($(reader_type)))::eltype($(reader_type)) - return _read!(reader, reader.state, record) - end - - function _read!(reader::$(reader_type), state::State, record::eltype($(reader_type))) - stream = state.stream - ensure_margin!(stream) - cs = state.cs - linenum = state.linenum - data = stream.buffer - p = stream.position - p_end = stream.available - p_eof = -1 - found_record = false - initialize!(record) - - $(init_code) - - if state.finished - throw(EOFError()) - end - - while true - $(Automa.generate_exec_code(context, machine, actions)) - - state.cs = cs - state.finished |= cs == 0 - state.linenum = linenum - stream.position = p - - if cs < 0 - error($(reader_type), " file format error on line ", linenum, " ~>", repr(String(data[p:min(p+7,p_end)]))) - elseif found_record - break - elseif cs == 0 - throw(EOFError()) - elseif p > p_eof ≥ 0 - error("incomplete $($(reader_type)) input on line ", linenum) - elseif BufferedStreams.available_bytes(stream) < 64 - hits_eof = BufferedStreams.fillbuffer!(stream) == 0 - p = stream.position - p_end = stream.available - if hits_eof - p_eof = p_end - end - end - end - - @assert isfilled(record) - return record - end - end -end - mutable struct Reader <: BioGenerics.IO.AbstractReader state::State header::Header @@ -210,221 +41,3 @@ end function Base.eltype(::Type{Reader}) return Record end - -# file = header . 
body -# header = metainfo* -# body = record* -isinteractive() && info("compiling SAM") -const sam_metainfo_machine, sam_record_machine, sam_header_machine, sam_body_machine = (function () - cat = Automa.RegExp.cat - rep = Automa.RegExp.rep - alt = Automa.RegExp.alt - opt = Automa.RegExp.opt - any = Automa.RegExp.any - - metainfo = let - tag = re"[A-Z][A-Z]" \ cat("CO") - tag.actions[:enter] = [:mark1] - tag.actions[:exit] = [:metainfo_tag] - - dict = let - key = re"[A-Za-z][A-Za-z0-9]" - key.actions[:enter] = [:mark2] - key.actions[:exit] = [:metainfo_dict_key] - val = re"[ -~]+" - val.actions[:enter] = [:mark2] - val.actions[:exit] = [:metainfo_dict_val] - keyval = cat(key, ':', val) - - cat(keyval, rep(cat('\t', keyval))) - end - dict.actions[:enter] = [:mark1] - dict.actions[:exit] = [:metainfo_val] - - co = cat("CO") - co.actions[:enter] = [:mark1] - co.actions[:exit] = [:metainfo_tag] - - comment = re"[^\r\n]*" - comment.actions[:enter] = [:mark1] - comment.actions[:exit] = [:metainfo_val] - - cat('@', alt(cat(tag, '\t', dict), cat(co, '\t', comment))) - end - metainfo.actions[:enter] = [:anchor] - metainfo.actions[:exit] = [:metainfo] - - record = let - qname = re"[!-?A-~]+" - qname.actions[:enter] = [:mark] - qname.actions[:exit] = [:record_qname] - - flag = re"[0-9]+" - flag.actions[:enter] = [:mark] - flag.actions[:exit] = [:record_flag] - - rname = re"\*|[!-()+-<>-~][!-~]*" - rname.actions[:enter] = [:mark] - rname.actions[:exit] = [:record_rname] - - pos = re"[0-9]+" - pos.actions[:enter] = [:mark] - pos.actions[:exit] = [:record_pos] - - mapq = re"[0-9]+" - mapq.actions[:enter] = [:mark] - mapq.actions[:exit] = [:record_mapq] - - cigar = re"\*|([0-9]+[MIDNSHPX=])+" - cigar.actions[:enter] = [:mark] - cigar.actions[:exit] = [:record_cigar] - - rnext = re"\*|=|[!-()+-<>-~][!-~]*" - rnext.actions[:enter] = [:mark] - rnext.actions[:exit] = [:record_rnext] - - pnext = re"[0-9]+" - pnext.actions[:enter] = [:mark] - pnext.actions[:exit] = [:record_pnext] - - 
tlen = re"[-+]?[0-9]+" - tlen.actions[:enter] = [:mark] - tlen.actions[:exit] = [:record_tlen] - - seq = re"\*|[A-Za-z=.]+" - seq.actions[:enter] = [:mark] - seq.actions[:exit] = [:record_seq] - - qual = re"[!-~]+" - qual.actions[:enter] = [:mark] - qual.actions[:exit] = [:record_qual] - - field = let - tag = re"[A-Za-z][A-Za-z0-9]" - val = alt( - re"A:[!-~]", - re"i:[-+]?[0-9]+", - re"f:[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?", - re"Z:[ !-~]*", - re"H:([0-9A-F][0-9A-F])*", - re"B:[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+") - - cat(tag, ':', val) - end - field.actions[:enter] = [:mark] - field.actions[:exit] = [:record_field] - - cat( - qname, '\t', - flag, '\t', - rname, '\t', - pos, '\t', - mapq, '\t', - cigar, '\t', - rnext, '\t', - pnext, '\t', - tlen, '\t', - seq, '\t', - qual, - rep(cat('\t', field))) - end - record.actions[:enter] = [:anchor] - record.actions[:exit] = [:record] - - newline = let - lf = re"\n" - lf.actions[:enter] = [:countline] - - cat(re"\r?", lf) - end - - header′ = rep(cat(metainfo, newline)) - header′.actions[:exit] = [:header] - header = cat(header′, opt(any() \ cat('@'))) # look ahead - - body = rep(cat(record, newline)) - - return map(Automa.compile, (metainfo, record, header, body)) -end)() - -const sam_metainfo_actions = Dict( - :metainfo_tag => :(record.tag = (mark1:p-1) .- offset), - :metainfo_val => :(record.val = (mark1:p-1) .- offset), - :metainfo_dict_key => :(push!(record.dictkey, (mark2:p-1) .- offset)), - :metainfo_dict_val => :(push!(record.dictval, (mark2:p-1) .- offset)), - :metainfo => quote - resize_and_copy!(record.data, data, offset+1:p-1) - record.filled = (offset+1:p-1) .- offset - end, - :anchor => :(), - :mark1 => :(mark1 = p), - :mark2 => :(mark2 = p)) -eval( - generate_index_function( - MetaInfo, - sam_metainfo_machine, - :(mark1 = mark2 = offset = 0), - sam_metainfo_actions)) -eval( - generate_readheader_function( - Reader, - MetaInfo, - sam_header_machine, - :(mark1 = mark2 = offset = 0), - 
merge(sam_metainfo_actions, Dict( - :metainfo => quote - resize_and_copy!(record.data, data, upanchor!(stream):p-1) - record.filled = (offset+1:p-1) .- offset - @assert isfilled(record) - push!(reader.header.metainfo, record) - ensure_margin!(stream) - record = MetaInfo() - end, - :header => :(finish_header = true; @escape), - :countline => :(linenum += 1), - :anchor => :(anchor!(stream, p); offset = p - 1))), - quote - if !eof(stream) - stream.position -= 1 # cancel look-ahead - end - end)) - -const sam_record_actions = Dict( - :record_qname => :(record.qname = (mark:p-1) .- offset), - :record_flag => :(record.flag = (mark:p-1) .- offset), - :record_rname => :(record.rname = (mark:p-1) .- offset), - :record_pos => :(record.pos = (mark:p-1) .- offset), - :record_mapq => :(record.mapq = (mark:p-1) .- offset), - :record_cigar => :(record.cigar = (mark:p-1) .- offset), - :record_rnext => :(record.rnext = (mark:p-1) .- offset), - :record_pnext => :(record.pnext = (mark:p-1) .- offset), - :record_tlen => :(record.tlen = (mark:p-1) .- offset), - :record_seq => :(record.seq = (mark:p-1) .- offset), - :record_qual => :(record.qual = (mark:p-1) .- offset), - :record_field => :(push!(record.fields, (mark:p-1) .- offset)), - :record => quote - resize_and_copy!(record.data, data, 1:p-1) - record.filled = (offset+1:p-1) .- offset - end, - :anchor => :(), - :mark => :(mark = p)) -eval( - generate_index_function( - Record, - sam_record_machine, - :(mark = offset = 0), - sam_record_actions)) -eval( - generate_read_function( - Reader, - sam_body_machine, - :(mark = offset = 0), - merge(sam_record_actions, Dict( - :record => quote - resize_and_copy!(record.data, data, upanchor!(stream):p-1) - record.filled = (offset+1:p-1) .- offset - found_record = true - @escape - end, - :countline => :(linenum += 1), - :anchor => :(anchor!(stream, p); offset = p - 1))))) diff --git a/src/sam/readrecord.jl b/src/sam/readrecord.jl new file mode 100644 index 0000000..4a18064 --- /dev/null +++ 
b/src/sam/readrecord.jl @@ -0,0 +1,389 @@ +@inline function anchor!(stream::BufferedStreams.BufferedInputStream, p, immobilize = true) + stream.anchor = p + stream.immobilized = immobilize + return stream +end + +@inline function upanchor!(stream::BufferedStreams.BufferedInputStream) + @assert stream.anchor != 0 "upanchor! called with no anchor set" + anchor = stream.anchor + stream.anchor = 0 + stream.immobilized = false + return anchor +end + +function ensure_margin!(stream::BufferedStreams.BufferedInputStream) + if stream.position * 20 > length(stream.buffer) * 19 + BufferedStreams.shiftdata!(stream) + end + return nothing +end + +@inline function resize_and_copy!(dst::Vector{UInt8}, src::Vector{UInt8}, r::UnitRange{Int}) + return resize_and_copy!(dst, 1, src, r) +end + +@inline function resize_and_copy!(dst::Vector{UInt8}, dstart::Int, src::Vector{UInt8}, r::UnitRange{Int}) + rlen = length(r) + if length(dst) != dstart + rlen - 1 + resize!(dst, dstart + rlen - 1) + end + copyto!(dst, dstart, src, first(r), rlen) + return dst +end + +function generate_index_function(record_type, machine, init_code, actions; kwargs...) 
+ kwargs = Dict(kwargs) + context = Automa.CodeGenContext( + generator = get(kwargs, :generator, :goto), + checkbounds = get(kwargs, :checkbounds, false), + loopunroll = get(kwargs, :loopunroll, 0) + ) + quote + function index!(record::$(record_type)) + data = record.data + p = 1 + p_end = p_eof = sizeof(data) + initialize!(record) + $(init_code) + cs = $(machine.start_state) + $(Automa.generate_exec_code(context, machine, actions)) + if cs != 0 + throw(ArgumentError(string("failed to index ", $(record_type), " ~>", repr(String(data[p:min(p+7,p_end)]))))) + end + @assert isfilled(record) + return record + end + end +end + +function generate_readheader_function(reader_type, metainfo_type, machine, init_code, actions, finish_code=:()) + quote + function readheader!(reader::$(reader_type)) + _readheader!(reader, reader.state) + end + + function _readheader!(reader::$(reader_type), state::State) + stream = state.stream + ensure_margin!(stream) + cs = state.cs + linenum = state.linenum + data = stream.buffer + p = stream.position + p_end = stream.available + p_eof = -1 + finish_header = false + record = $(metainfo_type)() + + $(init_code) + + while true + $(Automa.generate_exec_code(Automa.CodeGenContext(generator=:table), machine, actions)) + + state.cs = cs + state.finished = cs == 0 + state.linenum = linenum + stream.position = p + + if cs < 0 + error("$($(reader_type)) file format error on line ", linenum) + elseif finish_header + $(finish_code) + break + elseif p > p_eof ≥ 0 + error("incomplete $($(reader_type)) input on line ", linenum) + else + hits_eof = BufferedStreams.fillbuffer!(stream) == 0 + p = stream.position + p_end = stream.available + if hits_eof + p_eof = p_end + end + end + end + end + end +end + +function generate_read_function(reader_type, machine, init_code, actions; kwargs...) 
+ kwargs = Dict(kwargs) + context = Automa.CodeGenContext( + generator=get(kwargs, :generator, :goto), + checkbounds=get(kwargs, :checkbounds, false), + loopunroll=get(kwargs, :loopunroll, 0) + ) + quote + function Base.read!(reader::$(reader_type), record::eltype($(reader_type)))::eltype($(reader_type)) + return _read!(reader, reader.state, record) + end + + function _read!(reader::$(reader_type), state::State, record::eltype($(reader_type))) + stream = state.stream + ensure_margin!(stream) + cs = state.cs + linenum = state.linenum + data = stream.buffer + p = stream.position + p_end = stream.available + p_eof = -1 + found_record = false + initialize!(record) + + $(init_code) + + if state.finished + throw(EOFError()) + end + + while true + $(Automa.generate_exec_code(context, machine, actions)) + + state.cs = cs + state.finished |= cs == 0 + state.linenum = linenum + stream.position = p + + if cs < 0 + error($(reader_type), " file format error on line ", linenum, " ~>", repr(String(data[p:min(p+7,p_end)]))) + elseif found_record + break + elseif cs == 0 + throw(EOFError()) + elseif p > p_eof ≥ 0 + error("incomplete $($(reader_type)) input on line ", linenum) + elseif BufferedStreams.available_bytes(stream) < 64 + hits_eof = BufferedStreams.fillbuffer!(stream) == 0 + p = stream.position + p_end = stream.available + if hits_eof + p_eof = p_end + end + end + end + + @assert isfilled(record) + return record + end + end +end + +# Automa.jl generated readrecord! and readmetainfo! functions +# ======================================== + +# file = header . 
body +# header = metainfo* +# body = record* +isinteractive() && info("compiling SAM") +const sam_metainfo_machine, sam_record_machine, sam_header_machine, sam_body_machine = (function () + cat = Automa.RegExp.cat + rep = Automa.RegExp.rep + alt = Automa.RegExp.alt + opt = Automa.RegExp.opt + any = Automa.RegExp.any + + metainfo = let + tag = re"[A-Z][A-Z]" \ cat("CO") + tag.actions[:enter] = [:mark1] + tag.actions[:exit] = [:metainfo_tag] + + dict = let + key = re"[A-Za-z][A-Za-z0-9]" + key.actions[:enter] = [:mark2] + key.actions[:exit] = [:metainfo_dict_key] + val = re"[ -~]+" + val.actions[:enter] = [:mark2] + val.actions[:exit] = [:metainfo_dict_val] + keyval = cat(key, ':', val) + + cat(keyval, rep(cat('\t', keyval))) + end + dict.actions[:enter] = [:mark1] + dict.actions[:exit] = [:metainfo_val] + + co = cat("CO") + co.actions[:enter] = [:mark1] + co.actions[:exit] = [:metainfo_tag] + + comment = re"[^\r\n]*" + comment.actions[:enter] = [:mark1] + comment.actions[:exit] = [:metainfo_val] + + cat('@', alt(cat(tag, '\t', dict), cat(co, '\t', comment))) + end + metainfo.actions[:enter] = [:anchor] + metainfo.actions[:exit] = [:metainfo] + + record = let + qname = re"[!-?A-~]+" + qname.actions[:enter] = [:mark] + qname.actions[:exit] = [:record_qname] + + flag = re"[0-9]+" + flag.actions[:enter] = [:mark] + flag.actions[:exit] = [:record_flag] + + rname = re"\*|[!-()+-<>-~][!-~]*" + rname.actions[:enter] = [:mark] + rname.actions[:exit] = [:record_rname] + + pos = re"[0-9]+" + pos.actions[:enter] = [:mark] + pos.actions[:exit] = [:record_pos] + + mapq = re"[0-9]+" + mapq.actions[:enter] = [:mark] + mapq.actions[:exit] = [:record_mapq] + + cigar = re"\*|([0-9]+[MIDNSHPX=])+" + cigar.actions[:enter] = [:mark] + cigar.actions[:exit] = [:record_cigar] + + rnext = re"\*|=|[!-()+-<>-~][!-~]*" + rnext.actions[:enter] = [:mark] + rnext.actions[:exit] = [:record_rnext] + + pnext = re"[0-9]+" + pnext.actions[:enter] = [:mark] + pnext.actions[:exit] = [:record_pnext] + + 
tlen = re"[-+]?[0-9]+" + tlen.actions[:enter] = [:mark] + tlen.actions[:exit] = [:record_tlen] + + seq = re"\*|[A-Za-z=.]+" + seq.actions[:enter] = [:mark] + seq.actions[:exit] = [:record_seq] + + qual = re"[!-~]+" + qual.actions[:enter] = [:mark] + qual.actions[:exit] = [:record_qual] + + field = let + tag = re"[A-Za-z][A-Za-z0-9]" + val = alt( + re"A:[!-~]", + re"i:[-+]?[0-9]+", + re"f:[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?", + re"Z:[ !-~]*", + re"H:([0-9A-F][0-9A-F])*", + re"B:[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+") + + cat(tag, ':', val) + end + field.actions[:enter] = [:mark] + field.actions[:exit] = [:record_field] + + cat( + qname, '\t', + flag, '\t', + rname, '\t', + pos, '\t', + mapq, '\t', + cigar, '\t', + rnext, '\t', + pnext, '\t', + tlen, '\t', + seq, '\t', + qual, + rep(cat('\t', field))) + end + record.actions[:enter] = [:anchor] + record.actions[:exit] = [:record] + + newline = let + lf = re"\n" + lf.actions[:enter] = [:countline] + + cat(re"\r?", lf) + end + + header′ = rep(cat(metainfo, newline)) + header′.actions[:exit] = [:header] + header = cat(header′, opt(any() \ cat('@'))) # look ahead + + body = rep(cat(record, newline)) + + return map(Automa.compile, (metainfo, record, header, body)) +end)() + +const sam_metainfo_actions = Dict( + :metainfo_tag => :(record.tag = (mark1:p-1) .- offset), + :metainfo_val => :(record.val = (mark1:p-1) .- offset), + :metainfo_dict_key => :(push!(record.dictkey, (mark2:p-1) .- offset)), + :metainfo_dict_val => :(push!(record.dictval, (mark2:p-1) .- offset)), + :metainfo => quote + resize_and_copy!(record.data, data, offset+1:p-1) + record.filled = (offset+1:p-1) .- offset + end, + :anchor => :(), + :mark1 => :(mark1 = p), + :mark2 => :(mark2 = p)) +eval( + generate_index_function( + MetaInfo, + sam_metainfo_machine, + :(mark1 = mark2 = offset = 0), + sam_metainfo_actions)) +eval( + generate_readheader_function( + Reader, + MetaInfo, + sam_header_machine, + :(mark1 = mark2 = offset = 0), + 
merge(sam_metainfo_actions, Dict( + :metainfo => quote + resize_and_copy!(record.data, data, upanchor!(stream):p-1) + record.filled = (offset+1:p-1) .- offset + @assert isfilled(record) + push!(reader.header.metainfo, record) + ensure_margin!(stream) + record = MetaInfo() + end, + :header => :(finish_header = true; @escape), + :countline => :(linenum += 1), + :anchor => :(anchor!(stream, p); offset = p - 1))), + quote + if !eof(stream) + stream.position -= 1 # cancel look-ahead + end + end)) + +const sam_record_actions = Dict( + :record_qname => :(record.qname = (mark:p-1) .- offset), + :record_flag => :(record.flag = (mark:p-1) .- offset), + :record_rname => :(record.rname = (mark:p-1) .- offset), + :record_pos => :(record.pos = (mark:p-1) .- offset), + :record_mapq => :(record.mapq = (mark:p-1) .- offset), + :record_cigar => :(record.cigar = (mark:p-1) .- offset), + :record_rnext => :(record.rnext = (mark:p-1) .- offset), + :record_pnext => :(record.pnext = (mark:p-1) .- offset), + :record_tlen => :(record.tlen = (mark:p-1) .- offset), + :record_seq => :(record.seq = (mark:p-1) .- offset), + :record_qual => :(record.qual = (mark:p-1) .- offset), + :record_field => :(push!(record.fields, (mark:p-1) .- offset)), + :record => quote + resize_and_copy!(record.data, data, 1:p-1) + record.filled = (offset+1:p-1) .- offset + end, + :anchor => :(), + :mark => :(mark = p)) +eval( + generate_index_function( + Record, + sam_record_machine, + :(mark = offset = 0), + sam_record_actions)) +eval( + generate_read_function( + Reader, + sam_body_machine, + :(mark = offset = 0), + merge(sam_record_actions, Dict( + :record => quote + resize_and_copy!(record.data, data, upanchor!(stream):p-1) + record.filled = (offset+1:p-1) .- offset + found_record = true + @escape + end, + :countline => :(linenum += 1), + :anchor => :(anchor!(stream, p); offset = p - 1))))) diff --git a/src/sam/sam.jl b/src/sam/sam.jl index 628b085..5b6bab8 100644 --- a/src/sam/sam.jl +++ b/src/sam/sam.jl @@ -66,6 
+66,7 @@ include("metainfo.jl") include("record.jl") include("header.jl") include("reader.jl") +include("readrecord.jl") include("writer.jl") end From 892ad0d7dc8578fae4f9c24c4624561e5b8cfd54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Sun, 8 Sep 2019 09:08:12 +1000 Subject: [PATCH 6/9] Pipe into eval --- src/sam/readrecord.jl | 109 ++++++++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 51 deletions(-) diff --git a/src/sam/readrecord.jl b/src/sam/readrecord.jl index 4a18064..0f1811a 100644 --- a/src/sam/readrecord.jl +++ b/src/sam/readrecord.jl @@ -317,36 +317,39 @@ const sam_metainfo_actions = Dict( end, :anchor => :(), :mark1 => :(mark1 = p), - :mark2 => :(mark2 = p)) -eval( - generate_index_function( - MetaInfo, - sam_metainfo_machine, - :(mark1 = mark2 = offset = 0), - sam_metainfo_actions)) -eval( - generate_readheader_function( - Reader, - MetaInfo, - sam_header_machine, - :(mark1 = mark2 = offset = 0), - merge(sam_metainfo_actions, Dict( - :metainfo => quote - resize_and_copy!(record.data, data, upanchor!(stream):p-1) - record.filled = (offset+1:p-1) .- offset - @assert isfilled(record) - push!(reader.header.metainfo, record) - ensure_margin!(stream) - record = MetaInfo() - end, - :header => :(finish_header = true; @escape), - :countline => :(linenum += 1), - :anchor => :(anchor!(stream, p); offset = p - 1))), - quote - if !eof(stream) - stream.position -= 1 # cancel look-ahead - end - end)) + :mark2 => :(mark2 = p) +) + +generate_index_function( + MetaInfo, + sam_metainfo_machine, + :(mark1 = mark2 = offset = 0), + sam_metainfo_actions +) |> eval + +generate_readheader_function( + Reader, + MetaInfo, + sam_header_machine, + :(mark1 = mark2 = offset = 0), + merge(sam_metainfo_actions, Dict( + :metainfo => quote + resize_and_copy!(record.data, data, upanchor!(stream):p-1) + record.filled = (offset+1:p-1) .- offset + @assert isfilled(record) + push!(reader.header.metainfo, record) + ensure_margin!(stream) + 
record = MetaInfo() + end, + :header => :(finish_header = true; @escape), + :countline => :(linenum += 1), + :anchor => :(anchor!(stream, p); offset = p - 1))), + quote + if !eof(stream) + stream.position -= 1 # cancel look-ahead + end + end +) |> eval const sam_record_actions = Dict( :record_qname => :(record.qname = (mark:p-1) .- offset), @@ -366,24 +369,28 @@ const sam_record_actions = Dict( record.filled = (offset+1:p-1) .- offset end, :anchor => :(), - :mark => :(mark = p)) -eval( - generate_index_function( - Record, - sam_record_machine, - :(mark = offset = 0), - sam_record_actions)) -eval( - generate_read_function( - Reader, - sam_body_machine, - :(mark = offset = 0), - merge(sam_record_actions, Dict( - :record => quote - resize_and_copy!(record.data, data, upanchor!(stream):p-1) - record.filled = (offset+1:p-1) .- offset - found_record = true - @escape - end, - :countline => :(linenum += 1), - :anchor => :(anchor!(stream, p); offset = p - 1))))) + :mark => :(mark = p) +) + +generate_index_function( + Record, + sam_record_machine, + :(mark = offset = 0), + sam_record_actions +) |> eval + +generate_read_function( + Reader, + sam_body_machine, + :(mark = offset = 0), + merge(sam_record_actions, Dict( + :record => quote + resize_and_copy!(record.data, data, upanchor!(stream):p-1) + record.filled = (offset+1:p-1) .- offset + found_record = true + @escape + end, + :countline => :(linenum += 1), + :anchor => :(anchor!(stream, p); offset = p - 1)) + ) +) |> eval From 6cd89aba08cca53cc6888f6109edb7119907438e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Sat, 7 Sep 2019 21:47:00 +1000 Subject: [PATCH 7/9] Move isinteractive into machine --- src/sam/readrecord.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/sam/readrecord.jl b/src/sam/readrecord.jl index 0f1811a..1cb27d5 100644 --- a/src/sam/readrecord.jl +++ b/src/sam/readrecord.jl @@ -173,8 +173,10 @@ end # file = header . 
body # header = metainfo* # body = record* -isinteractive() && info("compiling SAM") const sam_metainfo_machine, sam_record_machine, sam_header_machine, sam_body_machine = (function () + + isinteractive() && info("compiling SAM") + cat = Automa.RegExp.cat rep = Automa.RegExp.rep alt = Automa.RegExp.alt From 12db2c4ea84c1ba045f49ae2cbac7f6b518fe583 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Sat, 30 Nov 2019 21:18:34 +1100 Subject: [PATCH 8/9] Align emission symbols for TranscodingStreams --- src/sam/readrecord.jl | 94 +++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/src/sam/readrecord.jl b/src/sam/readrecord.jl index 1cb27d5..82ebb93 100644 --- a/src/sam/readrecord.jl +++ b/src/sam/readrecord.jl @@ -185,79 +185,79 @@ const sam_metainfo_machine, sam_record_machine, sam_header_machine, sam_body_mac metainfo = let tag = re"[A-Z][A-Z]" \ cat("CO") - tag.actions[:enter] = [:mark1] + tag.actions[:enter] = [:pos1] tag.actions[:exit] = [:metainfo_tag] dict = let key = re"[A-Za-z][A-Za-z0-9]" - key.actions[:enter] = [:mark2] + key.actions[:enter] = [:pos2] key.actions[:exit] = [:metainfo_dict_key] val = re"[ -~]+" - val.actions[:enter] = [:mark2] + val.actions[:enter] = [:pos2] val.actions[:exit] = [:metainfo_dict_val] keyval = cat(key, ':', val) cat(keyval, rep(cat('\t', keyval))) end - dict.actions[:enter] = [:mark1] + dict.actions[:enter] = [:pos1] dict.actions[:exit] = [:metainfo_val] co = cat("CO") - co.actions[:enter] = [:mark1] + co.actions[:enter] = [:pos1] co.actions[:exit] = [:metainfo_tag] comment = re"[^\r\n]*" - comment.actions[:enter] = [:mark1] + comment.actions[:enter] = [:pos1] comment.actions[:exit] = [:metainfo_val] cat('@', alt(cat(tag, '\t', dict), cat(co, '\t', comment))) end - metainfo.actions[:enter] = [:anchor] + metainfo.actions[:enter] = [:mark] metainfo.actions[:exit] = [:metainfo] record = let qname = re"[!-?A-~]+" - qname.actions[:enter] = [:mark] + 
qname.actions[:enter] = [:pos] qname.actions[:exit] = [:record_qname] flag = re"[0-9]+" - flag.actions[:enter] = [:mark] + flag.actions[:enter] = [:pos] flag.actions[:exit] = [:record_flag] rname = re"\*|[!-()+-<>-~][!-~]*" - rname.actions[:enter] = [:mark] + rname.actions[:enter] = [:pos] rname.actions[:exit] = [:record_rname] pos = re"[0-9]+" - pos.actions[:enter] = [:mark] + pos.actions[:enter] = [:pos] pos.actions[:exit] = [:record_pos] mapq = re"[0-9]+" - mapq.actions[:enter] = [:mark] + mapq.actions[:enter] = [:pos] mapq.actions[:exit] = [:record_mapq] cigar = re"\*|([0-9]+[MIDNSHPX=])+" - cigar.actions[:enter] = [:mark] + cigar.actions[:enter] = [:pos] cigar.actions[:exit] = [:record_cigar] rnext = re"\*|=|[!-()+-<>-~][!-~]*" - rnext.actions[:enter] = [:mark] + rnext.actions[:enter] = [:pos] rnext.actions[:exit] = [:record_rnext] pnext = re"[0-9]+" - pnext.actions[:enter] = [:mark] + pnext.actions[:enter] = [:pos] pnext.actions[:exit] = [:record_pnext] tlen = re"[-+]?[0-9]+" - tlen.actions[:enter] = [:mark] + tlen.actions[:enter] = [:pos] tlen.actions[:exit] = [:record_tlen] seq = re"\*|[A-Za-z=.]+" - seq.actions[:enter] = [:mark] + seq.actions[:enter] = [:pos] seq.actions[:exit] = [:record_seq] qual = re"[!-~]+" - qual.actions[:enter] = [:mark] + qual.actions[:enter] = [:pos] qual.actions[:exit] = [:record_qual] field = let @@ -272,7 +272,7 @@ const sam_metainfo_machine, sam_record_machine, sam_header_machine, sam_body_mac cat(tag, ':', val) end - field.actions[:enter] = [:mark] + field.actions[:enter] = [:pos] field.actions[:exit] = [:record_field] cat( @@ -289,7 +289,7 @@ const sam_metainfo_machine, sam_record_machine, sam_header_machine, sam_body_mac qual, rep(cat('\t', field))) end - record.actions[:enter] = [:anchor] + record.actions[:enter] = [:mark] record.actions[:exit] = [:record] newline = let @@ -309,23 +309,23 @@ const sam_metainfo_machine, sam_record_machine, sam_header_machine, sam_body_mac end)() const sam_metainfo_actions = Dict( - 
:metainfo_tag => :(record.tag = (mark1:p-1) .- offset), - :metainfo_val => :(record.val = (mark1:p-1) .- offset), - :metainfo_dict_key => :(push!(record.dictkey, (mark2:p-1) .- offset)), - :metainfo_dict_val => :(push!(record.dictval, (mark2:p-1) .- offset)), + :metainfo_tag => :(record.tag = (pos1:p-1) .- offset), + :metainfo_val => :(record.val = (pos1:p-1) .- offset), + :metainfo_dict_key => :(push!(record.dictkey, (pos2:p-1) .- offset)), + :metainfo_dict_val => :(push!(record.dictval, (pos2:p-1) .- offset)), :metainfo => quote resize_and_copy!(record.data, data, offset+1:p-1) record.filled = (offset+1:p-1) .- offset end, - :anchor => :(), - :mark1 => :(mark1 = p), - :mark2 => :(mark2 = p) + :mark => :(), + :pos1 => :(pos1 = p), + :pos2 => :(pos2 = p) ) generate_index_function( MetaInfo, sam_metainfo_machine, - :(mark1 = mark2 = offset = 0), + :(pos1 = pos2 = offset = 0), sam_metainfo_actions ) |> eval @@ -333,7 +333,7 @@ generate_readheader_function( Reader, MetaInfo, sam_header_machine, - :(mark1 = mark2 = offset = 0), + :(pos1 = pos2 = offset = 0), merge(sam_metainfo_actions, Dict( :metainfo => quote resize_and_copy!(record.data, data, upanchor!(stream):p-1) @@ -345,7 +345,7 @@ generate_readheader_function( end, :header => :(finish_header = true; @escape), :countline => :(linenum += 1), - :anchor => :(anchor!(stream, p); offset = p - 1))), + :mark => :(anchor!(stream, p); offset = p - 1))), quote if !eof(stream) stream.position -= 1 # cancel look-ahead @@ -354,37 +354,37 @@ generate_readheader_function( ) |> eval const sam_record_actions = Dict( - :record_qname => :(record.qname = (mark:p-1) .- offset), - :record_flag => :(record.flag = (mark:p-1) .- offset), - :record_rname => :(record.rname = (mark:p-1) .- offset), - :record_pos => :(record.pos = (mark:p-1) .- offset), - :record_mapq => :(record.mapq = (mark:p-1) .- offset), - :record_cigar => :(record.cigar = (mark:p-1) .- offset), - :record_rnext => :(record.rnext = (mark:p-1) .- offset), - :record_pnext 
=> :(record.pnext = (mark:p-1) .- offset), - :record_tlen => :(record.tlen = (mark:p-1) .- offset), - :record_seq => :(record.seq = (mark:p-1) .- offset), - :record_qual => :(record.qual = (mark:p-1) .- offset), - :record_field => :(push!(record.fields, (mark:p-1) .- offset)), + :record_qname => :(record.qname = (pos:p-1) .- offset), + :record_flag => :(record.flag = (pos:p-1) .- offset), + :record_rname => :(record.rname = (pos:p-1) .- offset), + :record_pos => :(record.pos = (pos:p-1) .- offset), + :record_mapq => :(record.mapq = (pos:p-1) .- offset), + :record_cigar => :(record.cigar = (pos:p-1) .- offset), + :record_rnext => :(record.rnext = (pos:p-1) .- offset), + :record_pnext => :(record.pnext = (pos:p-1) .- offset), + :record_tlen => :(record.tlen = (pos:p-1) .- offset), + :record_seq => :(record.seq = (pos:p-1) .- offset), + :record_qual => :(record.qual = (pos:p-1) .- offset), + :record_field => :(push!(record.fields, (pos:p-1) .- offset)), :record => quote resize_and_copy!(record.data, data, 1:p-1) record.filled = (offset+1:p-1) .- offset end, - :anchor => :(), - :mark => :(mark = p) + :mark => :(), + :pos => :(pos = p) ) generate_index_function( Record, sam_record_machine, - :(mark = offset = 0), + :(pos = offset = 0), sam_record_actions ) |> eval generate_read_function( Reader, sam_body_machine, - :(mark = offset = 0), + :(pos = offset = 0), merge(sam_record_actions, Dict( :record => quote resize_and_copy!(record.data, data, upanchor!(stream):p-1) @@ -393,6 +393,6 @@ generate_read_function( @escape end, :countline => :(linenum += 1), - :anchor => :(anchor!(stream, p); offset = p - 1)) + :mark => :(anchor!(stream, p); offset = p - 1)) ) ) |> eval From de56faf0667078ff20bb17747f77b0dd422af2d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciar=C3=A1n=20O=27Mara?= Date: Thu, 20 Feb 2020 21:19:53 +1100 Subject: [PATCH 9/9] Migrate from BufferedStreams to TranscodingStreams --- Project.toml | 4 +- src/sam/reader.jl | 76 ++++++- src/sam/readrecord.jl | 489 
+++++++++++++++++++++--------------------- src/sam/sam.jl | 31 +-- 4 files changed, 327 insertions(+), 273 deletions(-) diff --git a/Project.toml b/Project.toml index 850b570..29455b6 100644 --- a/Project.toml +++ b/Project.toml @@ -9,10 +9,10 @@ BGZFStreams = "28d598bf-9b8f-59f1-b38c-5a06b4a0f5e6" BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e" BioGenerics = "47718e42-2ac5-11e9-14af-e5595289c2ea" BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" -BufferedStreams = "e1450e63-4bb3-523b-b2a4-4ffa8c0fd77d" GenomicFeatures = "899a7d2d-5c61-547b-bef9-6698a8d05446" Indexes = "4ffb77ac-cb80-11e8-1b35-4b78cc642f6d" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" [compat] Automa = "0.7, 0.8" @@ -20,9 +20,9 @@ BGZFStreams = "0.3" BioAlignments = "2" BioGenerics = "0.1" BioSequences = "2" -BufferedStreams = "1" GenomicFeatures = "2" Indexes = "0.1" +TranscodingStreams = "0.6, 0.7, 0.8, 0.9" julia = "1.1" [extras] diff --git a/src/sam/reader.jl b/src/sam/reader.jl index 0406165..3270957 100644 --- a/src/sam/reader.jl +++ b/src/sam/reader.jl @@ -1,16 +1,22 @@ # SAM Reader # ========= -mutable struct Reader <: BioGenerics.IO.AbstractReader - state::State +mutable struct Reader{S <: TranscodingStream} <: BioGenerics.IO.AbstractReader + state::State{S} header::Header +end - function Reader(input::BufferedStreams.BufferedInputStream) - reader = new(State(sam_header_machine.start_state, input), Header()) - readheader!(reader) - reader.state.cs = sam_body_machine.start_state - return reader - end +function Reader(state::State{S}) where {S <: TranscodingStream} + + rdr = Reader(state, Header()) + + cs, ln, f = readheader!(rdr.state.stream, rdr.header, (sam_machine_header.start_state, rdr.state.linenum)) + + rdr.state.state = sam_machine_body.start_state + rdr.state.linenum = ln + rdr.state.filled = false + + return rdr end """ @@ -22,7 +28,19 @@ Create a data reader of the SAM file format. 
* `input`: data source """ function Reader(input::IO) - return Reader(BufferedStreams.BufferedInputStream(input)) + + if input isa TranscodingStream + return Reader(State(input, 1, 1, false)) + end + + stream = TranscodingStreams.NoopStream(input) + + return Reader(State(stream, 1, 1, false)) + +end + +function Base.eltype(::Type{<:Reader}) + return Record end function BioGenerics.IO.stream(reader::Reader) @@ -38,6 +56,42 @@ function header(reader::Reader)::Header return reader.header end -function Base.eltype(::Type{Reader}) - return Record +function Base.close(reader::Reader) + if reader.state.stream isa IO + close(reader.state.stream) + end + return nothing +end + +function index!(record::MetaInfo) + stream = TranscodingStreams.NoopStream(IOBuffer(record.data)) + found = index!(stream, record) + if !found + throw(ArgumentError("invalid SAM metadata")) + end + return record +end + +function index!(record::Record) + stream = TranscodingStreams.NoopStream(IOBuffer(record.data)) + found = index!(stream, record) + if !found + throw(ArgumentError("invalid SAM record")) + end + return record +end + +function Base.read!(rdr::Reader, rec::Record) + + cs, ln, f = readrecord!(rdr.state.stream, rec, (rdr.state.state, rdr.state.linenum)) + + rdr.state.state = cs + rdr.state.linenum = ln + rdr.state.filled = f + + if !f + cs == 0 && throw(EOFError()) + throw(ArgumentError("malformed SAM file")) + end + return rec end diff --git a/src/sam/readrecord.jl b/src/sam/readrecord.jl index 82ebb93..e7ea75f 100644 --- a/src/sam/readrecord.jl +++ b/src/sam/readrecord.jl @@ -1,179 +1,10 @@ -@inline function anchor!(stream::BufferedStreams.BufferedInputStream, p, immobilize = true) - stream.anchor = p - stream.immobilized = immobilize - return stream -end - -@inline function upanchor!(stream::BufferedStreams.BufferedInputStream) - @assert stream.anchor != 0 "upanchor! 
called with no anchor set" - anchor = stream.anchor - stream.anchor = 0 - stream.immobilized = false - return anchor -end - -function ensure_margin!(stream::BufferedStreams.BufferedInputStream) - if stream.position * 20 > length(stream.buffer) * 19 - BufferedStreams.shiftdata!(stream) - end - return nothing -end - -@inline function resize_and_copy!(dst::Vector{UInt8}, src::Vector{UInt8}, r::UnitRange{Int}) - return resize_and_copy!(dst, 1, src, r) -end - -@inline function resize_and_copy!(dst::Vector{UInt8}, dstart::Int, src::Vector{UInt8}, r::UnitRange{Int}) - rlen = length(r) - if length(dst) != dstart + rlen - 1 - resize!(dst, dstart + rlen - 1) - end - copyto!(dst, dstart, src, first(r), rlen) - return dst -end - -function generate_index_function(record_type, machine, init_code, actions; kwargs...) - kwargs = Dict(kwargs) - context = Automa.CodeGenContext( - generator = get(kwargs, :generator, :goto), - checkbounds = get(kwargs, :checkbounds, false), - loopunroll = get(kwargs, :loopunroll, 0) - ) - quote - function index!(record::$(record_type)) - data = record.data - p = 1 - p_end = p_eof = sizeof(data) - initialize!(record) - $(init_code) - cs = $(machine.start_state) - $(Automa.generate_exec_code(context, machine, actions)) - if cs != 0 - throw(ArgumentError(string("failed to index ", $(record_type), " ~>", repr(String(data[p:min(p+7,p_end)]))))) - end - @assert isfilled(record) - return record - end - end -end - -function generate_readheader_function(reader_type, metainfo_type, machine, init_code, actions, finish_code=:()) - quote - function readheader!(reader::$(reader_type)) - _readheader!(reader, reader.state) - end - - function _readheader!(reader::$(reader_type), state::State) - stream = state.stream - ensure_margin!(stream) - cs = state.cs - linenum = state.linenum - data = stream.buffer - p = stream.position - p_end = stream.available - p_eof = -1 - finish_header = false - record = $(metainfo_type)() - - $(init_code) - - while true - 
$(Automa.generate_exec_code(Automa.CodeGenContext(generator=:table), machine, actions)) - - state.cs = cs - state.finished = cs == 0 - state.linenum = linenum - stream.position = p - - if cs < 0 - error("$($(reader_type)) file format error on line ", linenum) - elseif finish_header - $(finish_code) - break - elseif p > p_eof ≥ 0 - error("incomplete $($(reader_type)) input on line ", linenum) - else - hits_eof = BufferedStreams.fillbuffer!(stream) == 0 - p = stream.position - p_end = stream.available - if hits_eof - p_eof = p_end - end - end - end - end - end -end - -function generate_read_function(reader_type, machine, init_code, actions; kwargs...) - kwargs = Dict(kwargs) - context = Automa.CodeGenContext( - generator=get(kwargs, :generator, :goto), - checkbounds=get(kwargs, :checkbounds, false), - loopunroll=get(kwargs, :loopunroll, 0) - ) - quote - function Base.read!(reader::$(reader_type), record::eltype($(reader_type)))::eltype($(reader_type)) - return _read!(reader, reader.state, record) - end - - function _read!(reader::$(reader_type), state::State, record::eltype($(reader_type))) - stream = state.stream - ensure_margin!(stream) - cs = state.cs - linenum = state.linenum - data = stream.buffer - p = stream.position - p_end = stream.available - p_eof = -1 - found_record = false - initialize!(record) - - $(init_code) - - if state.finished - throw(EOFError()) - end - - while true - $(Automa.generate_exec_code(context, machine, actions)) - - state.cs = cs - state.finished |= cs == 0 - state.linenum = linenum - stream.position = p - - if cs < 0 - error($(reader_type), " file format error on line ", linenum, " ~>", repr(String(data[p:min(p+7,p_end)]))) - elseif found_record - break - elseif cs == 0 - throw(EOFError()) - elseif p > p_eof ≥ 0 - error("incomplete $($(reader_type)) input on line ", linenum) - elseif BufferedStreams.available_bytes(stream) < 64 - hits_eof = BufferedStreams.fillbuffer!(stream) == 0 - p = stream.position - p_end = stream.available - if 
hits_eof - p_eof = p_end - end - end - end - - @assert isfilled(record) - return record - end - end -end - # Automa.jl generated readrecord! and readmetainfo! functions # ======================================== # file = header . body # header = metainfo* # body = record* -const sam_metainfo_machine, sam_record_machine, sam_header_machine, sam_body_machine = (function () +const sam_machine_metainfo, sam_machine_record, sam_machine_header, sam_machine_body, sam_machine = (function () isinteractive() && info("compiling SAM") @@ -304,95 +135,261 @@ const sam_metainfo_machine, sam_record_machine, sam_header_machine, sam_body_mac header = cat(header′, opt(any() \ cat('@'))) # look ahead body = rep(cat(record, newline)) + body.actions[:exit] = [:body] - return map(Automa.compile, (metainfo, record, header, body)) + sam = cat(header, body) + + return map(Automa.compile, (metainfo, record, header, body, sam)) end)() -const sam_metainfo_actions = Dict( - :metainfo_tag => :(record.tag = (pos1:p-1) .- offset), - :metainfo_val => :(record.val = (pos1:p-1) .- offset), - :metainfo_dict_key => :(push!(record.dictkey, (pos2:p-1) .- offset)), - :metainfo_dict_val => :(push!(record.dictval, (pos2:p-1) .- offset)), +# write("sam_machine_metainfo.dot", Automa.machine2dot(sam_machine_metainfo)) +# run(`dot -Tsvg -o sam_machine_metainfo.svg sam_machine_metainfo.dot`) +# +# write("sam_machine_record.dot", Automa.machine2dot(sam_machine_record)) +# run(`dot -Tsvg -o sam_machine_record.svg sam_machine_record.dot`) +# +# write("sam_machine_header.dot", Automa.machine2dot(sam_machine_header)) +# run(`dot -Tsvg -o sam_machine_header.svg sam_machine_header.dot`) +# +# write("sam_machine_body.dot", Automa.machine2dot(sam_machine_body)) +# run(`dot -Tsvg -o sam_machine_body.svg sam_machine_body.dot`) +# +# write("sam_machine.dot", Automa.machine2dot(sam_machine)) +# run(`dot -Tsvg -o sam_machine.svg sam_machine.dot`) + +function appendfrom!(dst, dpos, src, spos, n) + if length(dst) < dpos + n - 
1 + resize!(dst, dpos + n - 1) + end + copyto!(dst, dpos, src, spos, n) + return dst +end + +const sam_actions_metainfo = Dict( + :mark => :(@mark), + :pos1 => :(pos1 = @relpos(p)), + :pos2 => :(pos2 = @relpos(p)), + :metainfo_tag => :(metainfo.tag = pos1:@relpos(p-1)), + :metainfo_val => :(metainfo.val = pos1:@relpos(p-1)), + :metainfo_dict_key => :(push!(metainfo.dictkey, pos2:@relpos(p-1))), + :metainfo_dict_val => :(push!(metainfo.dictval, pos2:@relpos(p-1))), :metainfo => quote - resize_and_copy!(record.data, data, offset+1:p-1) - record.filled = (offset+1:p-1) .- offset - end, - :mark => :(), - :pos1 => :(pos1 = p), - :pos2 => :(pos2 = p) -) + let markpos = @markpos() -generate_index_function( - MetaInfo, - sam_metainfo_machine, - :(pos1 = pos2 = offset = 0), - sam_metainfo_actions -) |> eval + appendfrom!(metainfo.data, 1, data, markpos, length(markpos:p-1)) -generate_readheader_function( - Reader, - MetaInfo, - sam_header_machine, - :(pos1 = pos2 = offset = 0), - merge(sam_metainfo_actions, Dict( - :metainfo => quote - resize_and_copy!(record.data, data, upanchor!(stream):p-1) - record.filled = (offset+1:p-1) .- offset - @assert isfilled(record) - push!(reader.header.metainfo, record) - ensure_margin!(stream) - record = MetaInfo() - end, - :header => :(finish_header = true; @escape), - :countline => :(linenum += 1), - :mark => :(anchor!(stream, p); offset = p - 1))), - quote - if !eof(stream) - stream.position -= 1 # cancel look-ahead + metainfo.filled = @relpos(markpos):@relpos(p-1) + + found_metainfo = true end end -) |> eval - -const sam_record_actions = Dict( - :record_qname => :(record.qname = (pos:p-1) .- offset), - :record_flag => :(record.flag = (pos:p-1) .- offset), - :record_rname => :(record.rname = (pos:p-1) .- offset), - :record_pos => :(record.pos = (pos:p-1) .- offset), - :record_mapq => :(record.mapq = (pos:p-1) .- offset), - :record_cigar => :(record.cigar = (pos:p-1) .- offset), - :record_rnext => :(record.rnext = (pos:p-1) .- offset), - 
:record_pnext => :(record.pnext = (pos:p-1) .- offset), - :record_tlen => :(record.tlen = (pos:p-1) .- offset), - :record_seq => :(record.seq = (pos:p-1) .- offset), - :record_qual => :(record.qual = (pos:p-1) .- offset), - :record_field => :(push!(record.fields, (pos:p-1) .- offset)), - :record => quote - resize_and_copy!(record.data, data, 1:p-1) - record.filled = (offset+1:p-1) .- offset - end, - :mark => :(), - :pos => :(pos = p) ) -generate_index_function( - Record, - sam_record_machine, - :(pos = offset = 0), - sam_record_actions -) |> eval +const sam_actions_header = merge( + sam_actions_metainfo, + Dict( + :countline => :(linenum += 1), + :header => quote + + finish_header = true + + if !eof(stream) + p -= 1 # cancel look-ahead + end + + @escape + end + ) +) + +const sam_actions_record = Dict( + :mark => :(@mark), + :pos => :(pos = @relpos(p)), + :record_qname => :(record.qname = pos:@relpos(p-1)), + :record_flag => :(record.flag = pos:@relpos(p-1)), + :record_rname => :(record.rname = pos:@relpos(p-1)), + :record_pos => :(record.pos = pos:@relpos(p-1)), + :record_mapq => :(record.mapq = pos:@relpos(p-1)), + :record_cigar => :(record.cigar = pos:@relpos(p-1)), + :record_rnext => :(record.rnext = pos:@relpos(p-1)), + :record_pnext => :(record.pnext = pos:@relpos(p-1)), + :record_tlen => :(record.tlen = pos:@relpos(p-1)), + :record_seq => :(record.seq = pos:@relpos(p-1)), + :record_qual => :(record.qual = pos:@relpos(p-1)), + :record_field => :(push!(record.fields, pos:@relpos(p-1))), + :record => quote + let markpos = @markpos() + + appendfrom!(record.data, 1, data, markpos, length(markpos:p-1)) + + record.filled = @relpos(markpos):@relpos(p-1) -generate_read_function( - Reader, - sam_body_machine, - :(pos = offset = 0), - merge(sam_record_actions, Dict( - :record => quote - resize_and_copy!(record.data, data, upanchor!(stream):p-1) - record.filled = (offset+1:p-1) .- offset found_record = true @escape - end, + end + end +) + +const sam_actions_body = merge( 
+ sam_actions_record, + Dict( :countline => :(linenum += 1), - :mark => :(anchor!(stream, p); offset = p - 1)) + :body => quote + finish_body = true + @escape + end ) +) + +# const sam_actions = merge( +# sam_actions_header, +# sam_actions_body +# ) + +const sam_context = Automa.CodeGenContext( + generator = :goto, + checkbounds = false, + loopunroll = 0 +) + +const sam_initcode_metainfo = quote + pos1 = 0 + pos2 = 0 + found_metainfo = false +end + +const sam_initcode_record = quote + pos = 0 + found_record = false +end + +const sam_initcode_header = quote + $(sam_initcode_metainfo) + metainfo = MetaInfo() + finish_header = false + cs, linenum = state +end + +const sam_initcode_body = quote + $(sam_initcode_record) + finish_body = false + cs, linenum = state +end + +const sam_loopcode_metainfo = quote + + if cs < 0 + throw(ArgumentError("malformed metainfo at pos $(p)")) + end + + if found_metainfo + @goto __return__ + end +end + +const sam_returncode_metainfo = quote + return found_metainfo +end + + +Automa.Stream.generate_reader( + :index!, + sam_machine_metainfo, + arguments = (:(metainfo::MetaInfo),), + actions = sam_actions_metainfo, + context = sam_context, + initcode = sam_initcode_metainfo, + loopcode = sam_loopcode_metainfo, + returncode = sam_returncode_metainfo +) |> eval + +const sam_loopcode_header = quote + + if cs < 0 + throw(ArgumentError("malformed metainfo at line $(linenum)")) + end + + if found_metainfo + push!(header, metainfo) + found_metainfo = false + end + + metainfo = MetaInfo() + + if finish_header + @goto __return__ + end +end + +const sam_returncode_header = quote + return cs, linenum, finish_header +end + +Automa.Stream.generate_reader( + :readheader!, + sam_machine_header, + arguments = (:(header::SAM.Header), :(state::Tuple{Int,Int})), + actions = sam_actions_header, + context = sam_context, + initcode = sam_initcode_header, + loopcode = sam_loopcode_header, + returncode = sam_returncode_header +) |> eval + + +const 
sam_loopcode_record = quote + + if cs < 0 + throw(ArgumentError("malformed SAM record at position $(p), line $(linenum)")) + end + + # # if cs != 0 + # # throw(ArgumentError(string("failed to index ", $(record_type), " ~>", repr(String(data[p:min(p+7,p_end)]))))) + # # end + + if found_record + @goto __return__ + end + +end + +const sam_loopcode_body = quote + + $(sam_loopcode_record) + + if finish_body + @goto __return__ + end +end + +const sam_returncode_record = quote + return found_record +end + +Automa.Stream.generate_reader( + :index!, + sam_machine_record, + arguments = (:(record::Record),), + actions = sam_actions_record, + context = sam_context, + initcode = sam_initcode_record, + loopcode = sam_loopcode_record, + returncode = sam_returncode_record +) |> eval + + + +const sam_returncode_body = quote + return cs, linenum, found_record +end + +Automa.Stream.generate_reader( + :readrecord!, + sam_machine_body, + arguments = (:(record::Record), :(state::Tuple{Int,Int})), + actions = sam_actions_body, + context = sam_context, + initcode = sam_initcode_body, + loopcode = sam_loopcode_body, + returncode = sam_returncode_body ) |> eval diff --git a/src/sam/sam.jl b/src/sam/sam.jl index 5b6bab8..95e36f6 100644 --- a/src/sam/sam.jl +++ b/src/sam/sam.jl @@ -7,11 +7,14 @@ using BioGenerics import Automa import Automa.RegExp: @re_str +import Automa.Stream: @mark, @markpos, @relpos, @abspos import BioAlignments +import BioGenerics: BioGenerics, isfilled, header import BioGenerics.Exceptions: missingerror -import BioGenerics: isfilled, header +import BioGenerics.Automa: State import BioSequences -import BufferedStreams +import TranscodingStreams: TranscodingStreams, TranscodingStream + using Printf: @sprintf @@ -46,19 +49,19 @@ function unsafe_parse_decimal(::Type{T}, data::Vector{UInt8}, range::UnitRange{I return sign * x end -#TODO: update BioCore.Ragel.State (will likely change with TrnscodingStreams). 
-import BufferedStreams: BufferedStreams, BufferedInputStream -# A type keeping track of a ragel-based parser's state. -mutable struct State{T<:BufferedInputStream} - stream::T # input stream - cs::Int # current DFA state of Ragel - linenum::Int # line number: parser is responsible for updating this - finished::Bool # true if finished (regardless of where in the stream we are) -end +# #TODO: update BioCore.Ragel.State (will likely change with TrnscodingStreams). +# import BufferedStreams: BufferedStreams, BufferedInputStream +# # A type keeping track of a ragel-based parser's state. +# mutable struct State{T<:BufferedInputStream} +# stream::T # input stream +# cs::Int # current DFA state of Ragel +# linenum::Int # line number: parser is responsible for updating this +# finished::Bool # true if finished (regardless of where in the stream we are) +# end -function State(initstate::Int, input::BufferedInputStream) - return State(input, initstate, 1, false) -end +# function State(initstate::Int, input::BufferedInputStream) +# return State(input, initstate, 1, false) +# end include("flags.jl")