mirror of
https://github.com/MillironX/XAM.jl.git
synced 2024-11-23 10:19:56 +00:00
Merge branch 'release/v0.2.0'
This commit is contained in:
commit
c139754845
13 changed files with 656 additions and 340 deletions
10
Project.toml
10
Project.toml
|
@ -1,28 +1,28 @@
|
||||||
name = "XAM"
|
name = "XAM"
|
||||||
uuid = "d759349c-bcba-11e9-07c2-5b90f8f05f7c"
|
uuid = "d759349c-bcba-11e9-07c2-5b90f8f05f7c"
|
||||||
authors = ["Kenta Sato <bicycle1885@gmail.com>", "Ben J. Ward <ward9250@gmail.com>", "Ciarán O'Mara <Ciaran.OMara@utas.edu.au>"]
|
authors = ["Kenta Sato <bicycle1885@gmail.com>", "Ben J. Ward <ward9250@gmail.com>", "Ciarán O'Mara <Ciaran.OMara@utas.edu.au>"]
|
||||||
version = "0.1.1"
|
version = "0.2.0"
|
||||||
|
|
||||||
[deps]
|
[deps]
|
||||||
Automa = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b"
|
Automa = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b"
|
||||||
BGZFStreams = "28d598bf-9b8f-59f1-b38c-5a06b4a0f5e6"
|
BGZFStreams = "28d598bf-9b8f-59f1-b38c-5a06b4a0f5e6"
|
||||||
BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"
|
BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"
|
||||||
BioCore = "37cfa864-2cd6-5c12-ad9e-b6597d696c81"
|
BioGenerics = "47718e42-2ac5-11e9-14af-e5595289c2ea"
|
||||||
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
|
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
|
||||||
BufferedStreams = "e1450e63-4bb3-523b-b2a4-4ffa8c0fd77d"
|
|
||||||
GenomicFeatures = "899a7d2d-5c61-547b-bef9-6698a8d05446"
|
GenomicFeatures = "899a7d2d-5c61-547b-bef9-6698a8d05446"
|
||||||
Indexes = "4ffb77ac-cb80-11e8-1b35-4b78cc642f6d"
|
Indexes = "4ffb77ac-cb80-11e8-1b35-4b78cc642f6d"
|
||||||
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
|
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
|
||||||
|
TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
|
||||||
|
|
||||||
[compat]
|
[compat]
|
||||||
Automa = "0.7, 0.8"
|
Automa = "0.7, 0.8"
|
||||||
BGZFStreams = "0.3"
|
BGZFStreams = "0.3"
|
||||||
BioAlignments = "2"
|
BioAlignments = "2"
|
||||||
BioCore = "2"
|
BioGenerics = "0.1"
|
||||||
BioSequences = "2"
|
BioSequences = "2"
|
||||||
BufferedStreams = "1"
|
|
||||||
GenomicFeatures = "2"
|
GenomicFeatures = "2"
|
||||||
Indexes = "0.1"
|
Indexes = "0.1"
|
||||||
|
TranscodingStreams = "0.6, 0.7, 0.8, 0.9"
|
||||||
julia = "1.1"
|
julia = "1.1"
|
||||||
|
|
||||||
[extras]
|
[extras]
|
||||||
|
|
|
@ -72,12 +72,15 @@ function loadauxtype(data::Vector{UInt8}, p::Int)
|
||||||
b == UInt8('Z') ? String :
|
b == UInt8('Z') ? String :
|
||||||
error("invalid type tag: '$(Char(b))'"))
|
error("invalid type tag: '$(Char(b))'"))
|
||||||
end
|
end
|
||||||
|
|
||||||
t = data[p]
|
t = data[p]
|
||||||
|
|
||||||
if t == UInt8('B')
|
if t == UInt8('B')
|
||||||
return p + 2, Vector{auxtype(data[p+1])}
|
return p + 2, Vector{auxtype(data[p+1])}
|
||||||
else
|
|
||||||
return p + 1, auxtype(t)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
return p + 1, auxtype(t)
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{T}) where T
|
function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{T}) where T
|
||||||
|
@ -105,14 +108,17 @@ end
|
||||||
|
|
||||||
function findauxtag(data::Vector{UInt8}, start::Int, stop::Int, t1::UInt8, t2::UInt8)
|
function findauxtag(data::Vector{UInt8}, start::Int, stop::Int, t1::UInt8, t2::UInt8)
|
||||||
pos = start
|
pos = start
|
||||||
|
|
||||||
while pos ≤ stop && !(data[pos] == t1 && data[pos+1] == t2)
|
while pos ≤ stop && !(data[pos] == t1 && data[pos+1] == t2)
|
||||||
pos = next_tag_position(data, pos)
|
pos = next_tag_position(data, pos)
|
||||||
end
|
end
|
||||||
|
|
||||||
if pos > stop
|
if pos > stop
|
||||||
return 0
|
return 0
|
||||||
else
|
|
||||||
return pos
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
return pos
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Find the starting position of a next tag in `data` after `p`.
|
# Find the starting position of a next tag in `data` after `p`.
|
||||||
|
@ -120,24 +126,40 @@ end
|
||||||
function next_tag_position(data::Vector{UInt8}, p::Int)
|
function next_tag_position(data::Vector{UInt8}, p::Int)
|
||||||
typ = Char(data[p+2])
|
typ = Char(data[p+2])
|
||||||
p += 3
|
p += 3
|
||||||
|
|
||||||
if typ == 'A'
|
if typ == 'A'
|
||||||
p += 1
|
return p += 1
|
||||||
elseif typ == 'c' || typ == 'C'
|
end
|
||||||
p += 1
|
|
||||||
elseif typ == 's' || typ == 'S'
|
if typ == 'c' || typ == 'C'
|
||||||
p += 2
|
return p += 1
|
||||||
elseif typ == 'i' || typ == 'I'
|
end
|
||||||
p += 4
|
|
||||||
elseif typ == 'f'
|
if typ == 's' || typ == 'S'
|
||||||
p += 4
|
return p += 2
|
||||||
elseif typ == 'd'
|
end
|
||||||
p += 8
|
|
||||||
elseif typ == 'Z' || typ == 'H'
|
if typ == 'i' || typ == 'I'
|
||||||
|
return p += 4
|
||||||
|
end
|
||||||
|
|
||||||
|
if typ == 'f'
|
||||||
|
return p += 4
|
||||||
|
end
|
||||||
|
|
||||||
|
if typ == 'd'
|
||||||
|
return p += 8
|
||||||
|
end
|
||||||
|
|
||||||
|
if typ == 'Z' || typ == 'H'
|
||||||
while data[p] != 0x00 # NULL-terminalted string
|
while data[p] != 0x00 # NULL-terminalted string
|
||||||
p += 1
|
p += 1
|
||||||
end
|
end
|
||||||
p += 1
|
return p += 1
|
||||||
elseif typ == 'B'
|
|
||||||
|
end
|
||||||
|
|
||||||
|
if typ == 'B'
|
||||||
eltyp = Char(data[p])
|
eltyp = Char(data[p])
|
||||||
elsize = eltyp == 'c' || eltyp == 'C' ? 1 :
|
elsize = eltyp == 'c' || eltyp == 'C' ? 1 :
|
||||||
eltyp == 's' || eltyp == 'S' ? 2 :
|
eltyp == 's' || eltyp == 'S' ? 2 :
|
||||||
|
@ -145,9 +167,9 @@ function next_tag_position(data::Vector{UInt8}, p::Int)
|
||||||
error("invalid type tag: '$(Char(eltyp))'")
|
error("invalid type tag: '$(Char(eltyp))'")
|
||||||
p += 1
|
p += 1
|
||||||
n = unsafe_load(Ptr{Int32}(pointer(data, p)))
|
n = unsafe_load(Ptr{Int32}(pointer(data, p)))
|
||||||
p += 4 + elsize * n
|
return p += 4 + elsize * n
|
||||||
else
|
|
||||||
error("invalid type tag: '$(Char(typ))'")
|
|
||||||
end
|
end
|
||||||
return p
|
|
||||||
|
error("invalid type tag: '$(Char(typ))'")
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
|
|
||||||
module BAM
|
module BAM
|
||||||
|
|
||||||
using BioCore
|
using BioGenerics
|
||||||
using GenomicFeatures
|
using GenomicFeatures
|
||||||
using XAM.SAM
|
using XAM.SAM
|
||||||
|
|
||||||
|
@ -11,7 +11,7 @@ import BGZFStreams
|
||||||
import BioAlignments
|
import BioAlignments
|
||||||
import Indexes
|
import Indexes
|
||||||
import BioSequences
|
import BioSequences
|
||||||
import BioCore: isfilled, header
|
import BioGenerics: isfilled, header
|
||||||
|
|
||||||
import GenomicFeatures: eachoverlap
|
import GenomicFeatures: eachoverlap
|
||||||
|
|
||||||
|
|
|
@ -16,7 +16,7 @@ function Base.eltype(::Type{OverlapIterator{T}}) where T
|
||||||
end
|
end
|
||||||
|
|
||||||
function GenomicFeatures.eachoverlap(reader::Reader, interval::Interval)
|
function GenomicFeatures.eachoverlap(reader::Reader, interval::Interval)
|
||||||
return GenomicFeatures.eachoverlap(reader, interval.seqname, interval.first:interval.last)
|
return GenomicFeatures.eachoverlap(reader, GenomicFeatures.seqname(interval), GenomicFeatures.leftposition(interval):GenomicFeatures.rightposition(interval))
|
||||||
end
|
end
|
||||||
|
|
||||||
function GenomicFeatures.eachoverlap(reader::Reader, interval)
|
function GenomicFeatures.eachoverlap(reader::Reader, interval)
|
||||||
|
@ -67,7 +67,8 @@ function Base.iterate(iter::OverlapIterator, state)
|
||||||
c = compare_intervals(state.record, (state.refindex, iter.interval))
|
c = compare_intervals(state.record, (state.refindex, iter.interval))
|
||||||
if c == 0
|
if c == 0
|
||||||
return copy(state.record), state
|
return copy(state.record), state
|
||||||
elseif c > 0
|
end
|
||||||
|
if c > 0
|
||||||
# no more overlapping records in this chunk since records are sorted
|
# no more overlapping records in this chunk since records are sorted
|
||||||
break
|
break
|
||||||
end
|
end
|
||||||
|
@ -82,14 +83,18 @@ end
|
||||||
|
|
||||||
function compare_intervals(record::Record, interval::Tuple{Int,UnitRange{Int}})
|
function compare_intervals(record::Record, interval::Tuple{Int,UnitRange{Int}})
|
||||||
rid = refid(record)
|
rid = refid(record)
|
||||||
|
|
||||||
if rid < interval[1] || (rid == interval[1] && rightposition(record) < first(interval[2]))
|
if rid < interval[1] || (rid == interval[1] && rightposition(record) < first(interval[2]))
|
||||||
# strictly left
|
# strictly left
|
||||||
return -1
|
return -1
|
||||||
elseif rid > interval[1] || (rid == interval[1] && position(record) > last(interval[2]))
|
end
|
||||||
|
|
||||||
|
if rid > interval[1] || (rid == interval[1] && position(record) > last(interval[2]))
|
||||||
# strictly right
|
# strictly right
|
||||||
return +1
|
return +1
|
||||||
else
|
|
||||||
# overlapping
|
|
||||||
return 0
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# overlapping
|
||||||
|
return 0
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
|
@ -10,7 +10,7 @@ Create a data reader of the BAM file format.
|
||||||
* `input`: data source
|
* `input`: data source
|
||||||
* `index=nothing`: filepath to a random access index (currently *bai* is supported)
|
* `index=nothing`: filepath to a random access index (currently *bai* is supported)
|
||||||
"""
|
"""
|
||||||
mutable struct Reader{T} <: BioCore.IO.AbstractReader
|
mutable struct Reader{T} <: BioGenerics.IO.AbstractReader
|
||||||
stream::BGZFStreams.BGZFStream{T}
|
stream::BGZFStreams.BGZFStream{T}
|
||||||
header::SAM.Header
|
header::SAM.Header
|
||||||
start_offset::BGZFStreams.VirtualOffset
|
start_offset::BGZFStreams.VirtualOffset
|
||||||
|
@ -23,17 +23,15 @@ function Base.eltype(::Type{Reader{T}}) where T
|
||||||
return Record
|
return Record
|
||||||
end
|
end
|
||||||
|
|
||||||
function BioCore.IO.stream(reader::Reader)
|
function BioGenerics.IO.stream(reader::Reader)
|
||||||
return reader.stream
|
return reader.stream
|
||||||
end
|
end
|
||||||
|
|
||||||
function Reader(input::IO; index=nothing)
|
function Reader(input::IO; index=nothing)
|
||||||
if isa(index, AbstractString)
|
if isa(index, AbstractString)
|
||||||
index = BAI(index)
|
index = BAI(index)
|
||||||
else
|
elseif index != nothing
|
||||||
if index != nothing
|
error("unrecognizable index argument")
|
||||||
error("unrecognizable index argument")
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
reader = init_bam_reader(input)
|
reader = init_bam_reader(input)
|
||||||
reader.index = index
|
reader.index = index
|
||||||
|
@ -89,6 +87,7 @@ function init_bam_reader(input::BGZFStreams.BGZFStream)
|
||||||
A = read(input, UInt8)
|
A = read(input, UInt8)
|
||||||
M = read(input, UInt8)
|
M = read(input, UInt8)
|
||||||
x = read(input, UInt8)
|
x = read(input, UInt8)
|
||||||
|
|
||||||
if B != UInt8('B') || A != UInt8('A') || M != UInt8('M') || x != 0x01
|
if B != UInt8('B') || A != UInt8('A') || M != UInt8('M') || x != 0x01
|
||||||
error("input was not a valid BAM file")
|
error("input was not a valid BAM file")
|
||||||
end
|
end
|
||||||
|
@ -98,21 +97,22 @@ function init_bam_reader(input::BGZFStreams.BGZFStream)
|
||||||
samreader = SAM.Reader(IOBuffer(read(input, textlen)))
|
samreader = SAM.Reader(IOBuffer(read(input, textlen)))
|
||||||
|
|
||||||
# reference sequences
|
# reference sequences
|
||||||
refseqnames = String[]
|
|
||||||
refseqlens = Int[]
|
|
||||||
n_refs = read(input, Int32)
|
n_refs = read(input, Int32)
|
||||||
for _ in 1:n_refs
|
refseqnames = Vector{String}(undef, n_refs)
|
||||||
|
refseqlens = Vector{Int}(undef, n_refs)
|
||||||
|
@inbounds for i in 1:n_refs
|
||||||
namelen = read(input, Int32)
|
namelen = read(input, Int32)
|
||||||
data = read(input, namelen)
|
data = read(input, namelen)
|
||||||
seqname = unsafe_string(pointer(data))
|
seqname = unsafe_string(pointer(data))
|
||||||
seqlen = read(input, Int32)
|
seqlen = read(input, Int32)
|
||||||
push!(refseqnames, seqname)
|
refseqnames[i] = seqname
|
||||||
push!(refseqlens, seqlen)
|
refseqlens[i] = seqlen
|
||||||
end
|
end
|
||||||
|
|
||||||
voffset = isa(input.io, Base.AbstractPipe) ?
|
voffset = isa(input.io, Base.AbstractPipe) ?
|
||||||
BGZFStreams.VirtualOffset(0, 0) :
|
BGZFStreams.VirtualOffset(0, 0) :
|
||||||
BGZFStreams.virtualoffset(input)
|
BGZFStreams.virtualoffset(input)
|
||||||
|
|
||||||
return Reader(
|
return Reader(
|
||||||
input,
|
input,
|
||||||
samreader.header,
|
samreader.header,
|
||||||
|
|
|
@ -163,7 +163,8 @@ function checked_refid(record::Record)
|
||||||
id = refid(record)
|
id = refid(record)
|
||||||
if id == 0
|
if id == 0
|
||||||
throw(ArgumentError("record is not mapped"))
|
throw(ArgumentError("record is not mapped"))
|
||||||
elseif !isdefined(record, :reader)
|
end
|
||||||
|
if !isdefined(record, :reader)
|
||||||
throw(ArgumentError("reader is not defined"))
|
throw(ArgumentError("reader is not defined"))
|
||||||
end
|
end
|
||||||
return id
|
return id
|
||||||
|
@ -565,42 +566,42 @@ function Base.values(record::Record)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
# BioCore Methods
|
# BioGenerics Methods
|
||||||
# -----------
|
# -----------
|
||||||
|
|
||||||
function BioCore.isfilled(record::Record)
|
function BioGenerics.isfilled(record::Record)
|
||||||
return record.block_size != 0
|
return record.block_size != 0
|
||||||
end
|
end
|
||||||
|
|
||||||
function BioCore.seqname(record::Record)
|
function BioGenerics.seqname(record::Record)
|
||||||
return tempname(record)
|
return tempname(record)
|
||||||
end
|
end
|
||||||
|
|
||||||
function BioCore.hasseqname(record::Record)
|
function BioGenerics.hasseqname(record::Record)
|
||||||
return hastempname(record)
|
return hastempname(record)
|
||||||
end
|
end
|
||||||
|
|
||||||
function BioCore.sequence(record::Record)
|
function BioGenerics.sequence(record::Record)
|
||||||
return sequence(record)
|
return sequence(record)
|
||||||
end
|
end
|
||||||
|
|
||||||
function BioCore.hassequence(record::Record)
|
function BioGenerics.hassequence(record::Record)
|
||||||
return hassequence(record)
|
return hassequence(record)
|
||||||
end
|
end
|
||||||
|
|
||||||
function BioCore.leftposition(record::Record)
|
function BioGenerics.leftposition(record::Record)
|
||||||
return position(record)
|
return position(record)
|
||||||
end
|
end
|
||||||
|
|
||||||
function BioCore.hasleftposition(record::Record)
|
function BioGenerics.hasleftposition(record::Record)
|
||||||
return hasposition(record)
|
return hasposition(record)
|
||||||
end
|
end
|
||||||
|
|
||||||
function BioCore.rightposition(record::Record)
|
function BioGenerics.rightposition(record::Record)
|
||||||
return rightposition(record)
|
return rightposition(record)
|
||||||
end
|
end
|
||||||
|
|
||||||
function BioCore.hasrightposition(record::Record)
|
function BioGenerics.hasrightposition(record::Record)
|
||||||
return hasrightposition(record)
|
return hasrightposition(record)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -612,9 +613,10 @@ end
|
||||||
function data_size(record::Record)
|
function data_size(record::Record)
|
||||||
if isfilled(record)
|
if isfilled(record)
|
||||||
return record.block_size - FIXED_FIELDS_BYTES + sizeof(record.block_size)
|
return record.block_size - FIXED_FIELDS_BYTES + sizeof(record.block_size)
|
||||||
else
|
|
||||||
return 0
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
return 0
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
function checkfilled(record::Record)
|
function checkfilled(record::Record)
|
||||||
|
|
|
@ -10,7 +10,7 @@ Create a data writer of the BAM file format.
|
||||||
* `output`: data sink
|
* `output`: data sink
|
||||||
* `header`: SAM header object
|
* `header`: SAM header object
|
||||||
"""
|
"""
|
||||||
mutable struct Writer <: BioCore.IO.AbstractWriter
|
mutable struct Writer <: BioGenerics.IO.AbstractWriter
|
||||||
stream::BGZFStreams.BGZFStream
|
stream::BGZFStreams.BGZFStream
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -25,7 +25,7 @@ function Writer(stream::BGZFStreams.BGZFStream, header::SAM.Header)
|
||||||
return Writer(stream)
|
return Writer(stream)
|
||||||
end
|
end
|
||||||
|
|
||||||
function BioCore.IO.stream(writer::Writer)
|
function BioGenerics.IO.stream(writer::Writer)
|
||||||
return writer.stream
|
return writer.stream
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -1,16 +1,22 @@
|
||||||
# SAM Reader
|
# SAM Reader
|
||||||
# =========
|
# =========
|
||||||
|
|
||||||
mutable struct Reader <: BioCore.IO.AbstractReader
|
mutable struct Reader{S <: TranscodingStream} <: BioGenerics.IO.AbstractReader
|
||||||
state::BioCore.Ragel.State
|
state::State{S}
|
||||||
header::Header
|
header::Header
|
||||||
|
end
|
||||||
|
|
||||||
function Reader(input::BufferedStreams.BufferedInputStream)
|
function Reader(state::State{S}) where {S <: TranscodingStream}
|
||||||
reader = new(BioCore.Ragel.State(sam_header_machine.start_state, input), Header())
|
|
||||||
readheader!(reader)
|
rdr = Reader(state, Header())
|
||||||
reader.state.cs = sam_body_machine.start_state
|
|
||||||
return reader
|
cs, ln, f = readheader!(rdr.state.stream, rdr.header, (sam_machine_header.start_state, rdr.state.linenum))
|
||||||
end
|
|
||||||
|
rdr.state.state = sam_machine_body.start_state
|
||||||
|
rdr.state.linenum = ln
|
||||||
|
rdr.state.filled = false
|
||||||
|
|
||||||
|
return rdr
|
||||||
end
|
end
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
@ -22,10 +28,22 @@ Create a data reader of the SAM file format.
|
||||||
* `input`: data source
|
* `input`: data source
|
||||||
"""
|
"""
|
||||||
function Reader(input::IO)
|
function Reader(input::IO)
|
||||||
return Reader(BufferedStreams.BufferedInputStream(input))
|
|
||||||
|
if input isa TranscodingStream
|
||||||
|
return Reader(State(input, 1, 1, false))
|
||||||
|
end
|
||||||
|
|
||||||
|
stream = TranscodingStreams.NoopStream(input)
|
||||||
|
|
||||||
|
return Reader(State(stream, 1, 1, false))
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
function BioCore.IO.stream(reader::Reader)
|
function Base.eltype(::Type{<:Reader})
|
||||||
|
return Record
|
||||||
|
end
|
||||||
|
|
||||||
|
function BioGenerics.IO.stream(reader::Reader)
|
||||||
return reader.state.stream
|
return reader.state.stream
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -38,224 +56,42 @@ function header(reader::Reader)::Header
|
||||||
return reader.header
|
return reader.header
|
||||||
end
|
end
|
||||||
|
|
||||||
function Base.eltype(::Type{Reader})
|
function Base.close(reader::Reader)
|
||||||
return Record
|
if reader.state.stream isa IO
|
||||||
|
close(reader.state.stream)
|
||||||
|
end
|
||||||
|
return nothing
|
||||||
end
|
end
|
||||||
|
|
||||||
# file = header . body
|
function index!(record::MetaInfo)
|
||||||
# header = metainfo*
|
stream = TranscodingStreams.NoopStream(IOBuffer(record.data))
|
||||||
# body = record*
|
found = index!(stream, record)
|
||||||
isinteractive() && info("compiling SAM")
|
if !found
|
||||||
const sam_metainfo_machine, sam_record_machine, sam_header_machine, sam_body_machine = (function ()
|
throw(ArgumentError("invalid SAM metadata"))
|
||||||
cat = Automa.RegExp.cat
|
|
||||||
rep = Automa.RegExp.rep
|
|
||||||
alt = Automa.RegExp.alt
|
|
||||||
opt = Automa.RegExp.opt
|
|
||||||
any = Automa.RegExp.any
|
|
||||||
|
|
||||||
metainfo = let
|
|
||||||
tag = re"[A-Z][A-Z]" \ cat("CO")
|
|
||||||
tag.actions[:enter] = [:mark1]
|
|
||||||
tag.actions[:exit] = [:metainfo_tag]
|
|
||||||
|
|
||||||
dict = let
|
|
||||||
key = re"[A-Za-z][A-Za-z0-9]"
|
|
||||||
key.actions[:enter] = [:mark2]
|
|
||||||
key.actions[:exit] = [:metainfo_dict_key]
|
|
||||||
val = re"[ -~]+"
|
|
||||||
val.actions[:enter] = [:mark2]
|
|
||||||
val.actions[:exit] = [:metainfo_dict_val]
|
|
||||||
keyval = cat(key, ':', val)
|
|
||||||
|
|
||||||
cat(keyval, rep(cat('\t', keyval)))
|
|
||||||
end
|
|
||||||
dict.actions[:enter] = [:mark1]
|
|
||||||
dict.actions[:exit] = [:metainfo_val]
|
|
||||||
|
|
||||||
co = cat("CO")
|
|
||||||
co.actions[:enter] = [:mark1]
|
|
||||||
co.actions[:exit] = [:metainfo_tag]
|
|
||||||
|
|
||||||
comment = re"[^\r\n]*"
|
|
||||||
comment.actions[:enter] = [:mark1]
|
|
||||||
comment.actions[:exit] = [:metainfo_val]
|
|
||||||
|
|
||||||
cat('@', alt(cat(tag, '\t', dict), cat(co, '\t', comment)))
|
|
||||||
end
|
end
|
||||||
metainfo.actions[:enter] = [:anchor]
|
return record
|
||||||
metainfo.actions[:exit] = [:metainfo]
|
end
|
||||||
|
|
||||||
record = let
|
function index!(record::Record)
|
||||||
qname = re"[!-?A-~]+"
|
stream = TranscodingStreams.NoopStream(IOBuffer(record.data))
|
||||||
qname.actions[:enter] = [:mark]
|
found = index!(stream, record)
|
||||||
qname.actions[:exit] = [:record_qname]
|
if !found
|
||||||
|
throw(ArgumentError("invalid SAM record"))
|
||||||
flag = re"[0-9]+"
|
|
||||||
flag.actions[:enter] = [:mark]
|
|
||||||
flag.actions[:exit] = [:record_flag]
|
|
||||||
|
|
||||||
rname = re"\*|[!-()+-<>-~][!-~]*"
|
|
||||||
rname.actions[:enter] = [:mark]
|
|
||||||
rname.actions[:exit] = [:record_rname]
|
|
||||||
|
|
||||||
pos = re"[0-9]+"
|
|
||||||
pos.actions[:enter] = [:mark]
|
|
||||||
pos.actions[:exit] = [:record_pos]
|
|
||||||
|
|
||||||
mapq = re"[0-9]+"
|
|
||||||
mapq.actions[:enter] = [:mark]
|
|
||||||
mapq.actions[:exit] = [:record_mapq]
|
|
||||||
|
|
||||||
cigar = re"\*|([0-9]+[MIDNSHPX=])+"
|
|
||||||
cigar.actions[:enter] = [:mark]
|
|
||||||
cigar.actions[:exit] = [:record_cigar]
|
|
||||||
|
|
||||||
rnext = re"\*|=|[!-()+-<>-~][!-~]*"
|
|
||||||
rnext.actions[:enter] = [:mark]
|
|
||||||
rnext.actions[:exit] = [:record_rnext]
|
|
||||||
|
|
||||||
pnext = re"[0-9]+"
|
|
||||||
pnext.actions[:enter] = [:mark]
|
|
||||||
pnext.actions[:exit] = [:record_pnext]
|
|
||||||
|
|
||||||
tlen = re"[-+]?[0-9]+"
|
|
||||||
tlen.actions[:enter] = [:mark]
|
|
||||||
tlen.actions[:exit] = [:record_tlen]
|
|
||||||
|
|
||||||
seq = re"\*|[A-Za-z=.]+"
|
|
||||||
seq.actions[:enter] = [:mark]
|
|
||||||
seq.actions[:exit] = [:record_seq]
|
|
||||||
|
|
||||||
qual = re"[!-~]+"
|
|
||||||
qual.actions[:enter] = [:mark]
|
|
||||||
qual.actions[:exit] = [:record_qual]
|
|
||||||
|
|
||||||
field = let
|
|
||||||
tag = re"[A-Za-z][A-Za-z0-9]"
|
|
||||||
val = alt(
|
|
||||||
re"A:[!-~]",
|
|
||||||
re"i:[-+]?[0-9]+",
|
|
||||||
re"f:[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?",
|
|
||||||
re"Z:[ !-~]*",
|
|
||||||
re"H:([0-9A-F][0-9A-F])*",
|
|
||||||
re"B:[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+")
|
|
||||||
|
|
||||||
cat(tag, ':', val)
|
|
||||||
end
|
|
||||||
field.actions[:enter] = [:mark]
|
|
||||||
field.actions[:exit] = [:record_field]
|
|
||||||
|
|
||||||
cat(
|
|
||||||
qname, '\t',
|
|
||||||
flag, '\t',
|
|
||||||
rname, '\t',
|
|
||||||
pos, '\t',
|
|
||||||
mapq, '\t',
|
|
||||||
cigar, '\t',
|
|
||||||
rnext, '\t',
|
|
||||||
pnext, '\t',
|
|
||||||
tlen, '\t',
|
|
||||||
seq, '\t',
|
|
||||||
qual,
|
|
||||||
rep(cat('\t', field)))
|
|
||||||
end
|
end
|
||||||
record.actions[:enter] = [:anchor]
|
return record
|
||||||
record.actions[:exit] = [:record]
|
end
|
||||||
|
|
||||||
newline = let
|
function Base.read!(rdr::Reader, rec::Record)
|
||||||
lf = re"\n"
|
|
||||||
lf.actions[:enter] = [:countline]
|
|
||||||
|
|
||||||
cat(re"\r?", lf)
|
cs, ln, f = readrecord!(rdr.state.stream, rec, (rdr.state.state, rdr.state.linenum))
|
||||||
|
|
||||||
|
rdr.state.state = cs
|
||||||
|
rdr.state.linenum = ln
|
||||||
|
rdr.state.filled = f
|
||||||
|
|
||||||
|
if !f
|
||||||
|
cs == 0 && throw(EOFError())
|
||||||
|
throw(ArgumentError("malformed SAM file"))
|
||||||
end
|
end
|
||||||
|
return rec
|
||||||
header′ = rep(cat(metainfo, newline))
|
end
|
||||||
header′.actions[:exit] = [:header]
|
|
||||||
header = cat(header′, opt(any() \ cat('@'))) # look ahead
|
|
||||||
|
|
||||||
body = rep(cat(record, newline))
|
|
||||||
|
|
||||||
return map(Automa.compile, (metainfo, record, header, body))
|
|
||||||
end)()
|
|
||||||
|
|
||||||
const sam_metainfo_actions = Dict(
|
|
||||||
:metainfo_tag => :(record.tag = (mark1:p-1) .- offset),
|
|
||||||
:metainfo_val => :(record.val = (mark1:p-1) .- offset),
|
|
||||||
:metainfo_dict_key => :(push!(record.dictkey, (mark2:p-1) .- offset)),
|
|
||||||
:metainfo_dict_val => :(push!(record.dictval, (mark2:p-1) .- offset)),
|
|
||||||
:metainfo => quote
|
|
||||||
BioCore.ReaderHelper.resize_and_copy!(record.data, data, offset+1:p-1)
|
|
||||||
record.filled = (offset+1:p-1) .- offset
|
|
||||||
end,
|
|
||||||
:anchor => :(),
|
|
||||||
:mark1 => :(mark1 = p),
|
|
||||||
:mark2 => :(mark2 = p))
|
|
||||||
eval(
|
|
||||||
BioCore.ReaderHelper.generate_index_function(
|
|
||||||
MetaInfo,
|
|
||||||
sam_metainfo_machine,
|
|
||||||
:(mark1 = mark2 = offset = 0),
|
|
||||||
sam_metainfo_actions))
|
|
||||||
eval(
|
|
||||||
BioCore.ReaderHelper.generate_readheader_function(
|
|
||||||
Reader,
|
|
||||||
MetaInfo,
|
|
||||||
sam_header_machine,
|
|
||||||
:(mark1 = mark2 = offset = 0),
|
|
||||||
merge(sam_metainfo_actions, Dict(
|
|
||||||
:metainfo => quote
|
|
||||||
BioCore.ReaderHelper.resize_and_copy!(record.data, data, BioCore.ReaderHelper.upanchor!(stream):p-1)
|
|
||||||
record.filled = (offset+1:p-1) .- offset
|
|
||||||
@assert isfilled(record)
|
|
||||||
push!(reader.header.metainfo, record)
|
|
||||||
BioCore.ReaderHelper.ensure_margin!(stream)
|
|
||||||
record = MetaInfo()
|
|
||||||
end,
|
|
||||||
:header => :(finish_header = true; @escape),
|
|
||||||
:countline => :(linenum += 1),
|
|
||||||
:anchor => :(BioCore.ReaderHelper.anchor!(stream, p); offset = p - 1))),
|
|
||||||
quote
|
|
||||||
if !eof(stream)
|
|
||||||
stream.position -= 1 # cancel look-ahead
|
|
||||||
end
|
|
||||||
end))
|
|
||||||
|
|
||||||
const sam_record_actions = Dict(
|
|
||||||
:record_qname => :(record.qname = (mark:p-1) .- offset),
|
|
||||||
:record_flag => :(record.flag = (mark:p-1) .- offset),
|
|
||||||
:record_rname => :(record.rname = (mark:p-1) .- offset),
|
|
||||||
:record_pos => :(record.pos = (mark:p-1) .- offset),
|
|
||||||
:record_mapq => :(record.mapq = (mark:p-1) .- offset),
|
|
||||||
:record_cigar => :(record.cigar = (mark:p-1) .- offset),
|
|
||||||
:record_rnext => :(record.rnext = (mark:p-1) .- offset),
|
|
||||||
:record_pnext => :(record.pnext = (mark:p-1) .- offset),
|
|
||||||
:record_tlen => :(record.tlen = (mark:p-1) .- offset),
|
|
||||||
:record_seq => :(record.seq = (mark:p-1) .- offset),
|
|
||||||
:record_qual => :(record.qual = (mark:p-1) .- offset),
|
|
||||||
:record_field => :(push!(record.fields, (mark:p-1) .- offset)),
|
|
||||||
:record => quote
|
|
||||||
BioCore.ReaderHelper.resize_and_copy!(record.data, data, 1:p-1)
|
|
||||||
record.filled = (offset+1:p-1) .- offset
|
|
||||||
end,
|
|
||||||
:anchor => :(),
|
|
||||||
:mark => :(mark = p))
|
|
||||||
eval(
|
|
||||||
BioCore.ReaderHelper.generate_index_function(
|
|
||||||
Record,
|
|
||||||
sam_record_machine,
|
|
||||||
:(mark = offset = 0),
|
|
||||||
sam_record_actions))
|
|
||||||
eval(
|
|
||||||
BioCore.ReaderHelper.generate_read_function(
|
|
||||||
Reader,
|
|
||||||
sam_body_machine,
|
|
||||||
:(mark = offset = 0),
|
|
||||||
merge(sam_record_actions, Dict(
|
|
||||||
:record => quote
|
|
||||||
BioCore.ReaderHelper.resize_and_copy!(record.data, data, BioCore.ReaderHelper.upanchor!(stream):p-1)
|
|
||||||
record.filled = (offset+1:p-1) .- offset
|
|
||||||
found_record = true
|
|
||||||
@escape
|
|
||||||
end,
|
|
||||||
:countline => :(linenum += 1),
|
|
||||||
:anchor => :(BioCore.ReaderHelper.anchor!(stream, p); offset = p - 1)))))
|
|
||||||
|
|
395
src/sam/readrecord.jl
Normal file
395
src/sam/readrecord.jl
Normal file
|
@ -0,0 +1,395 @@
|
||||||
|
# Automa.jl generated readrecord! and readmetainfo! functions
|
||||||
|
# ========================================
|
||||||
|
|
||||||
|
# file = header . body
|
||||||
|
# header = metainfo*
|
||||||
|
# body = record*
|
||||||
|
const sam_machine_metainfo, sam_machine_record, sam_machine_header, sam_machine_body, sam_machine = (function ()
|
||||||
|
|
||||||
|
isinteractive() && info("compiling SAM")
|
||||||
|
|
||||||
|
cat = Automa.RegExp.cat
|
||||||
|
rep = Automa.RegExp.rep
|
||||||
|
alt = Automa.RegExp.alt
|
||||||
|
opt = Automa.RegExp.opt
|
||||||
|
any = Automa.RegExp.any
|
||||||
|
|
||||||
|
metainfo = let
|
||||||
|
tag = re"[A-Z][A-Z]" \ cat("CO")
|
||||||
|
tag.actions[:enter] = [:pos1]
|
||||||
|
tag.actions[:exit] = [:metainfo_tag]
|
||||||
|
|
||||||
|
dict = let
|
||||||
|
key = re"[A-Za-z][A-Za-z0-9]"
|
||||||
|
key.actions[:enter] = [:pos2]
|
||||||
|
key.actions[:exit] = [:metainfo_dict_key]
|
||||||
|
val = re"[ -~]+"
|
||||||
|
val.actions[:enter] = [:pos2]
|
||||||
|
val.actions[:exit] = [:metainfo_dict_val]
|
||||||
|
keyval = cat(key, ':', val)
|
||||||
|
|
||||||
|
cat(keyval, rep(cat('\t', keyval)))
|
||||||
|
end
|
||||||
|
dict.actions[:enter] = [:pos1]
|
||||||
|
dict.actions[:exit] = [:metainfo_val]
|
||||||
|
|
||||||
|
co = cat("CO")
|
||||||
|
co.actions[:enter] = [:pos1]
|
||||||
|
co.actions[:exit] = [:metainfo_tag]
|
||||||
|
|
||||||
|
comment = re"[^\r\n]*"
|
||||||
|
comment.actions[:enter] = [:pos1]
|
||||||
|
comment.actions[:exit] = [:metainfo_val]
|
||||||
|
|
||||||
|
cat('@', alt(cat(tag, '\t', dict), cat(co, '\t', comment)))
|
||||||
|
end
|
||||||
|
metainfo.actions[:enter] = [:mark]
|
||||||
|
metainfo.actions[:exit] = [:metainfo]
|
||||||
|
|
||||||
|
record = let
|
||||||
|
qname = re"[!-?A-~]+"
|
||||||
|
qname.actions[:enter] = [:pos]
|
||||||
|
qname.actions[:exit] = [:record_qname]
|
||||||
|
|
||||||
|
flag = re"[0-9]+"
|
||||||
|
flag.actions[:enter] = [:pos]
|
||||||
|
flag.actions[:exit] = [:record_flag]
|
||||||
|
|
||||||
|
rname = re"\*|[!-()+-<>-~][!-~]*"
|
||||||
|
rname.actions[:enter] = [:pos]
|
||||||
|
rname.actions[:exit] = [:record_rname]
|
||||||
|
|
||||||
|
pos = re"[0-9]+"
|
||||||
|
pos.actions[:enter] = [:pos]
|
||||||
|
pos.actions[:exit] = [:record_pos]
|
||||||
|
|
||||||
|
mapq = re"[0-9]+"
|
||||||
|
mapq.actions[:enter] = [:pos]
|
||||||
|
mapq.actions[:exit] = [:record_mapq]
|
||||||
|
|
||||||
|
cigar = re"\*|([0-9]+[MIDNSHPX=])+"
|
||||||
|
cigar.actions[:enter] = [:pos]
|
||||||
|
cigar.actions[:exit] = [:record_cigar]
|
||||||
|
|
||||||
|
rnext = re"\*|=|[!-()+-<>-~][!-~]*"
|
||||||
|
rnext.actions[:enter] = [:pos]
|
||||||
|
rnext.actions[:exit] = [:record_rnext]
|
||||||
|
|
||||||
|
pnext = re"[0-9]+"
|
||||||
|
pnext.actions[:enter] = [:pos]
|
||||||
|
pnext.actions[:exit] = [:record_pnext]
|
||||||
|
|
||||||
|
tlen = re"[-+]?[0-9]+"
|
||||||
|
tlen.actions[:enter] = [:pos]
|
||||||
|
tlen.actions[:exit] = [:record_tlen]
|
||||||
|
|
||||||
|
seq = re"\*|[A-Za-z=.]+"
|
||||||
|
seq.actions[:enter] = [:pos]
|
||||||
|
seq.actions[:exit] = [:record_seq]
|
||||||
|
|
||||||
|
qual = re"[!-~]+"
|
||||||
|
qual.actions[:enter] = [:pos]
|
||||||
|
qual.actions[:exit] = [:record_qual]
|
||||||
|
|
||||||
|
field = let
|
||||||
|
tag = re"[A-Za-z][A-Za-z0-9]"
|
||||||
|
val = alt(
|
||||||
|
re"A:[!-~]",
|
||||||
|
re"i:[-+]?[0-9]+",
|
||||||
|
re"f:[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?",
|
||||||
|
re"Z:[ !-~]*",
|
||||||
|
re"H:([0-9A-F][0-9A-F])*",
|
||||||
|
re"B:[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+")
|
||||||
|
|
||||||
|
cat(tag, ':', val)
|
||||||
|
end
|
||||||
|
field.actions[:enter] = [:pos]
|
||||||
|
field.actions[:exit] = [:record_field]
|
||||||
|
|
||||||
|
cat(
|
||||||
|
qname, '\t',
|
||||||
|
flag, '\t',
|
||||||
|
rname, '\t',
|
||||||
|
pos, '\t',
|
||||||
|
mapq, '\t',
|
||||||
|
cigar, '\t',
|
||||||
|
rnext, '\t',
|
||||||
|
pnext, '\t',
|
||||||
|
tlen, '\t',
|
||||||
|
seq, '\t',
|
||||||
|
qual,
|
||||||
|
rep(cat('\t', field)))
|
||||||
|
end
|
||||||
|
record.actions[:enter] = [:mark]
|
||||||
|
record.actions[:exit] = [:record]
|
||||||
|
|
||||||
|
newline = let
|
||||||
|
lf = re"\n"
|
||||||
|
lf.actions[:enter] = [:countline]
|
||||||
|
|
||||||
|
cat(re"\r?", lf)
|
||||||
|
end
|
||||||
|
|
||||||
|
header′ = rep(cat(metainfo, newline))
|
||||||
|
header′.actions[:exit] = [:header]
|
||||||
|
header = cat(header′, opt(any() \ cat('@'))) # look ahead
|
||||||
|
|
||||||
|
body = rep(cat(record, newline))
|
||||||
|
body.actions[:exit] = [:body]
|
||||||
|
|
||||||
|
sam = cat(header, body)
|
||||||
|
|
||||||
|
return map(Automa.compile, (metainfo, record, header, body, sam))
|
||||||
|
end)()
|
||||||
|
|
||||||
|
# write("sam_machine_metainfo.dot", Automa.machine2dot(sam_machine_metainfo))
|
||||||
|
# run(`dot -Tsvg -o sam_machine_metainfo.svg sam_machine_metainfo.dot`)
|
||||||
|
#
|
||||||
|
# write("sam_machine_record.dot", Automa.machine2dot(sam_machine_record))
|
||||||
|
# run(`dot -Tsvg -o sam_machine_record.svg sam_machine_record.dot`)
|
||||||
|
#
|
||||||
|
# write("sam_machine_header.dot", Automa.machine2dot(sam_machine_header))
|
||||||
|
# run(`dot -Tsvg -o sam_machine_header.svg sam_machine_header.dot`)
|
||||||
|
#
|
||||||
|
# write("sam_machine_body.dot", Automa.machine2dot(sam_machine_body))
|
||||||
|
# run(`dot -Tsvg -o sam_machine_body.svg sam_machine_body.dot`)
|
||||||
|
#
|
||||||
|
# write("sam_machine.dot", Automa.machine2dot(sam_machine))
|
||||||
|
# run(`dot -Tsvg -o sam_machine.svg sam_machine.dot`)
|
||||||
|
|
||||||
|
function appendfrom!(dst, dpos, src, spos, n)
|
||||||
|
if length(dst) < dpos + n - 1
|
||||||
|
resize!(dst, dpos + n - 1)
|
||||||
|
end
|
||||||
|
copyto!(dst, dpos, src, spos, n)
|
||||||
|
return dst
|
||||||
|
end
|
||||||
|
|
||||||
|
const sam_actions_metainfo = Dict(
|
||||||
|
:mark => :(@mark),
|
||||||
|
:pos1 => :(pos1 = @relpos(p)),
|
||||||
|
:pos2 => :(pos2 = @relpos(p)),
|
||||||
|
:metainfo_tag => :(metainfo.tag = pos1:@relpos(p-1)),
|
||||||
|
:metainfo_val => :(metainfo.val = pos1:@relpos(p-1)),
|
||||||
|
:metainfo_dict_key => :(push!(metainfo.dictkey, pos2:@relpos(p-1))),
|
||||||
|
:metainfo_dict_val => :(push!(metainfo.dictval, pos2:@relpos(p-1))),
|
||||||
|
:metainfo => quote
|
||||||
|
let markpos = @markpos()
|
||||||
|
|
||||||
|
appendfrom!(metainfo.data, 1, data, markpos, length(markpos:p-1))
|
||||||
|
|
||||||
|
metainfo.filled = @relpos(markpos):@relpos(p-1)
|
||||||
|
|
||||||
|
found_metainfo = true
|
||||||
|
end
|
||||||
|
end
|
||||||
|
)
|
||||||
|
|
||||||
|
const sam_actions_header = merge(
|
||||||
|
sam_actions_metainfo,
|
||||||
|
Dict(
|
||||||
|
:countline => :(linenum += 1),
|
||||||
|
:header => quote
|
||||||
|
|
||||||
|
finish_header = true
|
||||||
|
|
||||||
|
if !eof(stream)
|
||||||
|
p -= 1 # cancel look-ahead
|
||||||
|
end
|
||||||
|
|
||||||
|
@escape
|
||||||
|
end
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
const sam_actions_record = Dict(
|
||||||
|
:mark => :(@mark),
|
||||||
|
:pos => :(pos = @relpos(p)),
|
||||||
|
:record_qname => :(record.qname = pos:@relpos(p-1)),
|
||||||
|
:record_flag => :(record.flag = pos:@relpos(p-1)),
|
||||||
|
:record_rname => :(record.rname = pos:@relpos(p-1)),
|
||||||
|
:record_pos => :(record.pos = pos:@relpos(p-1)),
|
||||||
|
:record_mapq => :(record.mapq = pos:@relpos(p-1)),
|
||||||
|
:record_cigar => :(record.cigar = pos:@relpos(p-1)),
|
||||||
|
:record_rnext => :(record.rnext = pos:@relpos(p-1)),
|
||||||
|
:record_pnext => :(record.pnext = pos:@relpos(p-1)),
|
||||||
|
:record_tlen => :(record.tlen = pos:@relpos(p-1)),
|
||||||
|
:record_seq => :(record.seq = pos:@relpos(p-1)),
|
||||||
|
:record_qual => :(record.qual = pos:@relpos(p-1)),
|
||||||
|
:record_field => :(push!(record.fields, pos:@relpos(p-1))),
|
||||||
|
:record => quote
|
||||||
|
let markpos = @markpos()
|
||||||
|
|
||||||
|
appendfrom!(record.data, 1, data, markpos, length(markpos:p-1))
|
||||||
|
|
||||||
|
record.filled = @relpos(markpos):@relpos(p-1)
|
||||||
|
|
||||||
|
found_record = true
|
||||||
|
@escape
|
||||||
|
end
|
||||||
|
end
|
||||||
|
)
|
||||||
|
|
||||||
|
const sam_actions_body = merge(
|
||||||
|
sam_actions_record,
|
||||||
|
Dict(
|
||||||
|
:countline => :(linenum += 1),
|
||||||
|
:body => quote
|
||||||
|
finish_body = true
|
||||||
|
@escape
|
||||||
|
end
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# const sam_actions = merge(
|
||||||
|
# sam_actions_header,
|
||||||
|
# sam_actions_body
|
||||||
|
# )
|
||||||
|
|
||||||
|
const sam_context = Automa.CodeGenContext(
|
||||||
|
generator = :goto,
|
||||||
|
checkbounds = false,
|
||||||
|
loopunroll = 0
|
||||||
|
)
|
||||||
|
|
||||||
|
const sam_initcode_metainfo = quote
|
||||||
|
pos1 = 0
|
||||||
|
pos2 = 0
|
||||||
|
found_metainfo = false
|
||||||
|
end
|
||||||
|
|
||||||
|
const sam_initcode_record = quote
|
||||||
|
pos = 0
|
||||||
|
found_record = false
|
||||||
|
end
|
||||||
|
|
||||||
|
const sam_initcode_header = quote
|
||||||
|
$(sam_initcode_metainfo)
|
||||||
|
metainfo = MetaInfo()
|
||||||
|
finish_header = false
|
||||||
|
cs, linenum = state
|
||||||
|
end
|
||||||
|
|
||||||
|
const sam_initcode_body = quote
|
||||||
|
$(sam_initcode_record)
|
||||||
|
finish_body = false
|
||||||
|
cs, linenum = state
|
||||||
|
end
|
||||||
|
|
||||||
|
const sam_loopcode_metainfo = quote
|
||||||
|
|
||||||
|
if cs < 0
|
||||||
|
throw(ArgumentError("malformed metainfo at pos $(p)"))
|
||||||
|
end
|
||||||
|
|
||||||
|
if found_metainfo
|
||||||
|
@goto __return__
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
const sam_returncode_metainfo = quote
|
||||||
|
return found_metainfo
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
Automa.Stream.generate_reader(
|
||||||
|
:index!,
|
||||||
|
sam_machine_metainfo,
|
||||||
|
arguments = (:(metainfo::MetaInfo),),
|
||||||
|
actions = sam_actions_metainfo,
|
||||||
|
context = sam_context,
|
||||||
|
initcode = sam_initcode_metainfo,
|
||||||
|
loopcode = sam_loopcode_metainfo,
|
||||||
|
returncode = sam_returncode_metainfo
|
||||||
|
) |> eval
|
||||||
|
|
||||||
|
const sam_loopcode_header = quote
|
||||||
|
|
||||||
|
if cs < 0
|
||||||
|
throw(ArgumentError("malformed metainfo at line $(linenum)"))
|
||||||
|
end
|
||||||
|
|
||||||
|
if found_metainfo
|
||||||
|
push!(header, metainfo)
|
||||||
|
found_metainfo = false
|
||||||
|
end
|
||||||
|
|
||||||
|
metainfo = MetaInfo()
|
||||||
|
|
||||||
|
if finish_header
|
||||||
|
@goto __return__
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
const sam_returncode_header = quote
|
||||||
|
return cs, linenum, finish_header
|
||||||
|
end
|
||||||
|
|
||||||
|
Automa.Stream.generate_reader(
|
||||||
|
:readheader!,
|
||||||
|
sam_machine_header,
|
||||||
|
arguments = (:(header::SAM.Header), :(state::Tuple{Int,Int})),
|
||||||
|
actions = sam_actions_header,
|
||||||
|
context = sam_context,
|
||||||
|
initcode = sam_initcode_header,
|
||||||
|
loopcode = sam_loopcode_header,
|
||||||
|
returncode = sam_returncode_header
|
||||||
|
) |> eval
|
||||||
|
|
||||||
|
|
||||||
|
const sam_loopcode_record = quote
|
||||||
|
|
||||||
|
if cs < 0
|
||||||
|
throw(ArgumentError("malformed SAM record at position $(p), line $(linenum)"))
|
||||||
|
end
|
||||||
|
|
||||||
|
# # if cs != 0
|
||||||
|
# # throw(ArgumentError(string("failed to index ", $(record_type), " ~>", repr(String(data[p:min(p+7,p_end)])))))
|
||||||
|
# # end
|
||||||
|
|
||||||
|
if found_record
|
||||||
|
@goto __return__
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
||||||
|
|
||||||
|
const sam_loopcode_body = quote
|
||||||
|
|
||||||
|
$(sam_loopcode_record)
|
||||||
|
|
||||||
|
if finish_body
|
||||||
|
@goto __return__
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
const sam_returncode_record = quote
|
||||||
|
return found_record
|
||||||
|
end
|
||||||
|
|
||||||
|
Automa.Stream.generate_reader(
|
||||||
|
:index!,
|
||||||
|
sam_machine_record,
|
||||||
|
arguments = (:(record::Record),),
|
||||||
|
actions = sam_actions_record,
|
||||||
|
context = sam_context,
|
||||||
|
initcode = sam_initcode_record,
|
||||||
|
loopcode = sam_loopcode_record,
|
||||||
|
returncode = sam_returncode_record
|
||||||
|
) |> eval
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
const sam_returncode_body = quote
|
||||||
|
return cs, linenum, found_record
|
||||||
|
end
|
||||||
|
|
||||||
|
Automa.Stream.generate_reader(
|
||||||
|
:readrecord!,
|
||||||
|
sam_machine_body,
|
||||||
|
arguments = (:(record::Record), :(state::Tuple{Int,Int})),
|
||||||
|
actions = sam_actions_body,
|
||||||
|
context = sam_context,
|
||||||
|
initcode = sam_initcode_body,
|
||||||
|
loopcode = sam_loopcode_body,
|
||||||
|
returncode = sam_returncode_body
|
||||||
|
) |> eval
|
|
@ -299,9 +299,10 @@ Get the alignment of `record`.
|
||||||
function alignment(record::Record)::BioAlignments.Alignment
|
function alignment(record::Record)::BioAlignments.Alignment
|
||||||
if ismapped(record)
|
if ismapped(record)
|
||||||
return BioAlignments.Alignment(cigar(record), 1, position(record))
|
return BioAlignments.Alignment(cigar(record), 1, position(record))
|
||||||
else
|
|
||||||
return BioAlignments.Alignment(BioAlignments.AlignmentAnchor[])
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
return BioAlignments.Alignment(BioAlignments.AlignmentAnchor[])
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
function hasalignment(record::Record)
|
function hasalignment(record::Record)
|
||||||
|
@ -474,23 +475,30 @@ function Base.getindex(record::Record, tag::AbstractString)
|
||||||
else
|
else
|
||||||
hi = first(record.fields[i+1]) - 2
|
hi = first(record.fields[i+1]) - 2
|
||||||
end
|
end
|
||||||
|
|
||||||
if typ == UInt8('A')
|
if typ == UInt8('A')
|
||||||
@assert lo == hi
|
@assert lo == hi
|
||||||
return Char(record.data[lo])
|
return Char(record.data[lo])
|
||||||
elseif typ == UInt8('i')
|
end
|
||||||
|
if typ == UInt8('i')
|
||||||
return unsafe_parse_decimal(Int, record.data, lo:hi)
|
return unsafe_parse_decimal(Int, record.data, lo:hi)
|
||||||
elseif typ == UInt8('f')
|
end
|
||||||
|
if typ == UInt8('f')
|
||||||
# TODO: Call a C function directly for speed?
|
# TODO: Call a C function directly for speed?
|
||||||
return parse(Float32, SubString(record.data[lo:hi]))
|
return parse(Float32, SubString(record.data[lo:hi]))
|
||||||
elseif typ == UInt8('Z')
|
|
||||||
return String(record.data[lo:hi])
|
|
||||||
elseif typ == UInt8('H')
|
|
||||||
return parse_hexarray(record.data, lo:hi)
|
|
||||||
elseif typ == UInt8('B')
|
|
||||||
return parse_typedarray(record.data, lo:hi)
|
|
||||||
else
|
|
||||||
throw(ArgumentError("type code '$(Char(typ))' is not defined"))
|
|
||||||
end
|
end
|
||||||
|
if typ == UInt8('Z')
|
||||||
|
return String(record.data[lo:hi])
|
||||||
|
end
|
||||||
|
if typ == UInt8('H')
|
||||||
|
return parse_hexarray(record.data, lo:hi)
|
||||||
|
end
|
||||||
|
if typ == UInt8('B')
|
||||||
|
return parse_typedarray(record.data, lo:hi)
|
||||||
|
end
|
||||||
|
|
||||||
|
throw(ArgumentError("type code '$(Char(typ))' is not defined"))
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
function Base.keys(record::Record)
|
function Base.keys(record::Record)
|
||||||
|
@ -506,39 +514,39 @@ end
|
||||||
# Bio Methods
|
# Bio Methods
|
||||||
# -----------
|
# -----------
|
||||||
|
|
||||||
function BioCore.isfilled(record::Record)
|
function BioGenerics.isfilled(record::Record)
|
||||||
return !isempty(record.filled)
|
return !isempty(record.filled)
|
||||||
end
|
end
|
||||||
|
|
||||||
function BioCore.seqname(record::Record)
|
function BioGenerics.seqname(record::Record)
|
||||||
return tempname(record)
|
return tempname(record)
|
||||||
end
|
end
|
||||||
|
|
||||||
function BioCore.hasseqname(record::Record)
|
function BioGenerics.hasseqname(record::Record)
|
||||||
return hastempname(record)
|
return hastempname(record)
|
||||||
end
|
end
|
||||||
|
|
||||||
function BioCore.sequence(record::Record)
|
function BioGenerics.sequence(record::Record)
|
||||||
return sequence(record)
|
return sequence(record)
|
||||||
end
|
end
|
||||||
|
|
||||||
function BioCore.hassequence(record::Record)
|
function BioGenerics.hassequence(record::Record)
|
||||||
return hassequence(record)
|
return hassequence(record)
|
||||||
end
|
end
|
||||||
|
|
||||||
function BioCore.rightposition(record::Record)
|
function BioGenerics.rightposition(record::Record)
|
||||||
return rightposition(record)
|
return rightposition(record)
|
||||||
end
|
end
|
||||||
|
|
||||||
function BioCore.hasrightposition(record::Record)
|
function BioGenerics.hasrightposition(record::Record)
|
||||||
return hasrightposition(record)
|
return hasrightposition(record)
|
||||||
end
|
end
|
||||||
|
|
||||||
function BioCore.leftposition(record::Record)
|
function BioGenerics.leftposition(record::Record)
|
||||||
return position(record)
|
return position(record)
|
||||||
end
|
end
|
||||||
|
|
||||||
function BioCore.hasleftposition(record::Record)
|
function BioGenerics.hasleftposition(record::Record)
|
||||||
return hasposition(record)
|
return hasposition(record)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -602,21 +610,28 @@ function parse_typedarray(data::Vector{UInt8}, range::UnitRange{Int})
|
||||||
xs = split(String(data[first(range)+2:last(range)]))
|
xs = split(String(data[first(range)+2:last(range)]))
|
||||||
if t == UInt8('c')
|
if t == UInt8('c')
|
||||||
return [parse(Int8, x) for x in xs]
|
return [parse(Int8, x) for x in xs]
|
||||||
elseif t == UInt8('C')
|
|
||||||
return [parse(UInt8, x) for x in xs]
|
|
||||||
elseif t == UInt8('s')
|
|
||||||
return [parse(Int16, x) for x in xs]
|
|
||||||
elseif t == UInt8('S')
|
|
||||||
return [parse(UInt16, x) for x in xs]
|
|
||||||
elseif t == UInt8('i')
|
|
||||||
return [parse(Int32, x) for x in xs]
|
|
||||||
elseif t == UInt8('I')
|
|
||||||
return [parse(UInt32, x) for x in xs]
|
|
||||||
elseif t == UInt8('f')
|
|
||||||
return [parse(Float32, x) for x in xs]
|
|
||||||
else
|
|
||||||
throw(ArgumentError("type code '$(Char(t))' is not defined"))
|
|
||||||
end
|
end
|
||||||
|
if t == UInt8('C')
|
||||||
|
return [parse(UInt8, x) for x in xs]
|
||||||
|
end
|
||||||
|
if t == UInt8('s')
|
||||||
|
return [parse(Int16, x) for x in xs]
|
||||||
|
end
|
||||||
|
if t == UInt8('S')
|
||||||
|
return [parse(UInt16, x) for x in xs]
|
||||||
|
end
|
||||||
|
if t == UInt8('i')
|
||||||
|
return [parse(Int32, x) for x in xs]
|
||||||
|
end
|
||||||
|
if t == UInt8('I')
|
||||||
|
return [parse(UInt32, x) for x in xs]
|
||||||
|
end
|
||||||
|
if t == UInt8('f')
|
||||||
|
return [parse(Float32, x) for x in xs]
|
||||||
|
end
|
||||||
|
|
||||||
|
throw(ArgumentError("type code '$(Char(t))' is not defined"))
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
function ismissing(record::Record, range::UnitRange{Int})
|
function ismissing(record::Record, range::UnitRange{Int})
|
||||||
|
|
|
@ -3,23 +3,73 @@
|
||||||
|
|
||||||
module SAM
|
module SAM
|
||||||
|
|
||||||
using BioCore
|
using BioGenerics
|
||||||
|
|
||||||
import Automa
|
import Automa
|
||||||
import Automa.RegExp: @re_str
|
import Automa.RegExp: @re_str
|
||||||
|
import Automa.Stream: @mark, @markpos, @relpos, @abspos
|
||||||
import BioAlignments
|
import BioAlignments
|
||||||
import BioCore.Exceptions: missingerror
|
import BioGenerics: BioGenerics, isfilled, header
|
||||||
import BioCore.RecordHelper: unsafe_parse_decimal
|
import BioGenerics.Exceptions: missingerror
|
||||||
import BioCore: isfilled, header
|
import BioGenerics.Automa: State
|
||||||
import BioSequences
|
import BioSequences
|
||||||
import BufferedStreams
|
import TranscodingStreams: TranscodingStreams, TranscodingStream
|
||||||
|
|
||||||
using Printf: @sprintf
|
using Printf: @sprintf
|
||||||
|
|
||||||
|
|
||||||
|
#TODO: update import BioCore.RecordHelper: unsafe_parse_decimal
|
||||||
|
# r"[0-9]+" must match `data[range]`.
|
||||||
|
function unsafe_parse_decimal(::Type{T}, data::Vector{UInt8}, range::UnitRange{Int}) where {T<:Unsigned}
|
||||||
|
x = zero(T)
|
||||||
|
@inbounds for i in range
|
||||||
|
x = Base.Checked.checked_mul(x, 10 % T)
|
||||||
|
x = Base.Checked.checked_add(x, (data[i] - UInt8('0')) % T)
|
||||||
|
end
|
||||||
|
return x
|
||||||
|
end
|
||||||
|
|
||||||
|
# r"[-+]?[0-9]+" must match `data[range]`.
|
||||||
|
function unsafe_parse_decimal(::Type{T}, data::Vector{UInt8}, range::UnitRange{Int}) where {T<:Signed}
|
||||||
|
lo = first(range)
|
||||||
|
if data[lo] == UInt8('-')
|
||||||
|
sign = T(-1)
|
||||||
|
lo += 1
|
||||||
|
elseif data[lo] == UInt8('+')
|
||||||
|
sign = T(+1)
|
||||||
|
lo += 1
|
||||||
|
else
|
||||||
|
sign = T(+1)
|
||||||
|
end
|
||||||
|
x = zero(T)
|
||||||
|
@inbounds for i in lo:last(range)
|
||||||
|
x = Base.Checked.checked_mul(x, 10 % T)
|
||||||
|
x = Base.Checked.checked_add(x, (data[i] - UInt8('0')) % T)
|
||||||
|
end
|
||||||
|
return sign * x
|
||||||
|
end
|
||||||
|
|
||||||
|
# #TODO: update BioCore.Ragel.State (will likely change with TrnscodingStreams).
|
||||||
|
# import BufferedStreams: BufferedStreams, BufferedInputStream
|
||||||
|
# # A type keeping track of a ragel-based parser's state.
|
||||||
|
# mutable struct State{T<:BufferedInputStream}
|
||||||
|
# stream::T # input stream
|
||||||
|
# cs::Int # current DFA state of Ragel
|
||||||
|
# linenum::Int # line number: parser is responsible for updating this
|
||||||
|
# finished::Bool # true if finished (regardless of where in the stream we are)
|
||||||
|
# end
|
||||||
|
|
||||||
|
# function State(initstate::Int, input::BufferedInputStream)
|
||||||
|
# return State(input, initstate, 1, false)
|
||||||
|
# end
|
||||||
|
|
||||||
|
|
||||||
include("flags.jl")
|
include("flags.jl")
|
||||||
include("metainfo.jl")
|
include("metainfo.jl")
|
||||||
include("record.jl")
|
include("record.jl")
|
||||||
include("header.jl")
|
include("header.jl")
|
||||||
include("reader.jl")
|
include("reader.jl")
|
||||||
|
include("readrecord.jl")
|
||||||
include("writer.jl")
|
include("writer.jl")
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
|
@ -10,7 +10,7 @@ Create a data writer of the SAM file format.
|
||||||
* `output`: data sink
|
* `output`: data sink
|
||||||
* `header=Header()`: SAM header object
|
* `header=Header()`: SAM header object
|
||||||
"""
|
"""
|
||||||
mutable struct Writer <: BioCore.IO.AbstractWriter
|
mutable struct Writer <: BioGenerics.IO.AbstractWriter
|
||||||
stream::IO
|
stream::IO
|
||||||
|
|
||||||
function Writer(output::IO, header::Header=Header())
|
function Writer(output::IO, header::Header=Header())
|
||||||
|
@ -20,7 +20,7 @@ mutable struct Writer <: BioCore.IO.AbstractWriter
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
function BioCore.IO.stream(writer::Writer)
|
function BioGenerics.IO.stream(writer::Writer)
|
||||||
return writer.stream
|
return writer.stream
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -1,25 +1,16 @@
|
||||||
using Test
|
using Test
|
||||||
|
|
||||||
|
using BioGenerics
|
||||||
|
using FormatSpecimens
|
||||||
using GenomicFeatures
|
using GenomicFeatures
|
||||||
using XAM
|
using XAM
|
||||||
|
|
||||||
import BioAlignments: Alignment, AlignmentAnchor, OP_START, OP_MATCH, OP_DELETE
|
import BioAlignments: Alignment, AlignmentAnchor, OP_START, OP_MATCH, OP_DELETE
|
||||||
using FormatSpecimens
|
|
||||||
import BGZFStreams: BGZFStream
|
import BGZFStreams: BGZFStream
|
||||||
import BioCore.Exceptions: MissingFieldException
|
import BioGenerics.Exceptions: MissingFieldException
|
||||||
import BioSequences: @dna_str, @aa_str
|
import BioSequences: @dna_str, @aa_str
|
||||||
|
|
||||||
|
|
||||||
import BioCore:
|
|
||||||
header,
|
|
||||||
isfilled,
|
|
||||||
seqname,
|
|
||||||
hasseqname,
|
|
||||||
sequence,
|
|
||||||
hassequence,
|
|
||||||
leftposition,
|
|
||||||
rightposition,
|
|
||||||
hasleftposition,
|
|
||||||
hasrightposition
|
|
||||||
|
|
||||||
# Generate a random range within `range`.
|
# Generate a random range within `range`.
|
||||||
function randrange(range)
|
function randrange(range)
|
||||||
x = rand(range)
|
x = rand(range)
|
||||||
|
|
Loading…
Reference in a new issue