1
0
Fork 0
mirror of https://github.com/MillironX/XAM.jl.git synced 2024-12-23 13:28:16 +00:00

Switch to CodecBGZF

This commit is contained in:
Jakob Nybo Nissen 2021-02-28 13:06:17 +01:00 committed by Ciarán O’Mara
parent 5bd793bc5f
commit bb5807795b
8 changed files with 24 additions and 19 deletions

View file

@ -5,10 +5,10 @@ version = "0.2.7"
[deps] [deps]
Automa = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b" Automa = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b"
BGZFStreams = "28d598bf-9b8f-59f1-b38c-5a06b4a0f5e6"
BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e" BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"
BioGenerics = "47718e42-2ac5-11e9-14af-e5595289c2ea" BioGenerics = "47718e42-2ac5-11e9-14af-e5595289c2ea"
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
CodecBGZF = "d9d91ef6-315d-495b-8131-db2ca24339d6"
GenomicFeatures = "899a7d2d-5c61-547b-bef9-6698a8d05446" GenomicFeatures = "899a7d2d-5c61-547b-bef9-6698a8d05446"
Indexes = "4ffb77ac-cb80-11e8-1b35-4b78cc642f6d" Indexes = "4ffb77ac-cb80-11e8-1b35-4b78cc642f6d"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
@ -16,12 +16,10 @@ TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
[compat] [compat]
Automa = "0.7, 0.8" Automa = "0.7, 0.8"
BGZFStreams = "0.3"
BioAlignments = "2" BioAlignments = "2"
BioGenerics = "0.1" BioGenerics = "0.1"
BioSequences = "2.0.4" BioSequences = "2.0.4"
GenomicFeatures = "2" GenomicFeatures = "2"
Indexes = "0.1"
TranscodingStreams = "0.6, 0.7, 0.8, 0.9" TranscodingStreams = "0.6, 0.7, 0.8, 0.9"
julia = "1" julia = "1"

View file

@ -6,8 +6,8 @@ module BAM
using BioGenerics using BioGenerics
using GenomicFeatures using GenomicFeatures
using XAM.SAM using XAM.SAM
using CodecBGZF
import BGZFStreams
import BioAlignments import BioAlignments
import Indexes import Indexes
import BioSequences import BioSequences

View file

@ -62,7 +62,7 @@ end
function Base.iterate(iter::OverlapIterator, state) function Base.iterate(iter::OverlapIterator, state)
while state.chunkid ≤ lastindex(state.chunks) while state.chunkid ≤ lastindex(state.chunks)
chunk = state.chunks[state.chunkid] chunk = state.chunks[state.chunkid]
while BGZFStreams.virtualoffset(iter.reader.stream) < chunk.stop while VirtualOffset(iter.reader.stream) < chunk.stop
read!(iter.reader, state.record) read!(iter.reader, state.record)
c = compare_intervals(state.record, (state.refindex, iter.interval)) c = compare_intervals(state.record, (state.refindex, iter.interval))
if c == 0 if c == 0

View file

@ -11,9 +11,9 @@ Create a data reader of the BAM file format.
* `index=nothing`: filepath to a random access index (currently *bai* is supported) * `index=nothing`: filepath to a random access index (currently *bai* is supported)
""" """
mutable struct Reader{T} <: BioGenerics.IO.AbstractReader mutable struct Reader{T} <: BioGenerics.IO.AbstractReader
stream::BGZFStreams.BGZFStream{T} stream::BGZFDecompressorStream{T}
header::SAM.Header header::SAM.Header
start_offset::BGZFStreams.VirtualOffset start_offset::VirtualOffset
refseqnames::Vector{String} refseqnames::Vector{String}
refseqlens::Vector{Int} refseqlens::Vector{Int}
index::Union{Nothing, BAI} index::Union{Nothing, BAI}
@ -64,7 +64,7 @@ function header(reader::Reader; fillSQ::Bool=false)::SAM.Header
return header return header
end end
function Base.seek(reader::Reader, voffset::BGZFStreams.VirtualOffset) function Base.seek(reader::Reader, voffset::CodecBGZF.VirtualOffset)
seek(reader.stream, voffset) seek(reader.stream, voffset)
end end
@ -80,7 +80,7 @@ function Base.iterate(reader::Reader, nextone = Record())
end end
# Initialize a BAM reader by reading the header section. # Initialize a BAM reader by reading the header section.
function init_bam_reader(input::BGZFStreams.BGZFStream) function init_bam_reader(input::BGZFDecompressorStream)
# magic bytes # magic bytes
B = read(input, UInt8) B = read(input, UInt8)
A = read(input, UInt8) A = read(input, UInt8)
@ -108,9 +108,7 @@ function init_bam_reader(input::BGZFStreams.BGZFStream)
refseqlens[i] = seqlen refseqlens[i] = seqlen
end end
voffset = isa(input.io, Base.AbstractPipe) ? voffset = VirtualOffset(input)
BGZFStreams.VirtualOffset(0, 0) :
BGZFStreams.virtualoffset(input)
return Reader( return Reader(
input, input,
@ -122,7 +120,7 @@ function init_bam_reader(input::BGZFStreams.BGZFStream)
end end
function init_bam_reader(input::IO) function init_bam_reader(input::IO)
return init_bam_reader(BGZFStreams.BGZFStream(input)) return init_bam_reader(BGZFDecompressorStream(input))
end end
function _read!(reader::Reader, record) function _read!(reader::Reader, record)

View file

@ -2,7 +2,7 @@
# ========== # ==========
""" """
BAM.Writer(output::BGZFStream, header::SAM.Header) BAM.Writer(output::BGZFCompressorStream, header::SAM.Header)
Create a data writer of the BAM file format. Create a data writer of the BAM file format.
@ -11,10 +11,10 @@ Create a data writer of the BAM file format.
* `header`: SAM header object * `header`: SAM header object
""" """
mutable struct Writer <: BioGenerics.IO.AbstractWriter mutable struct Writer <: BioGenerics.IO.AbstractWriter
stream::BGZFStreams.BGZFStream stream::BGZFCompressorStream
end end
function Writer(stream::BGZFStreams.BGZFStream, header::SAM.Header) function Writer(stream::BGZFCompressorStream, header::SAM.Header)
refseqnames = String[] refseqnames = String[]
refseqlens = Int[] refseqlens = Int[]
for metainfo in findall(header, "SQ") for metainfo in findall(header, "SQ")

9
test/Project.toml Normal file
View file

@ -0,0 +1,9 @@
[deps]
BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"
BioGenerics = "47718e42-2ac5-11e9-14af-e5595289c2ea"
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
CodecBGZF = "d9d91ef6-315d-495b-8131-db2ca24339d6"
FormatSpecimens = "3372ea36-2a1a-11e9-3eb7-996970b6ffbd"
GenomicFeatures = "899a7d2d-5c61-547b-bef9-6698a8d05446"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
XAM = "d759349c-bcba-11e9-07c2-5b90f8f05f7c"

View file

@ -4,9 +4,9 @@ using BioGenerics
using FormatSpecimens using FormatSpecimens
using GenomicFeatures using GenomicFeatures
using XAM using XAM
using CodecBGZF
import BioAlignments: Alignment, AlignmentAnchor, OP_START, OP_MATCH, OP_DELETE import BioAlignments: Alignment, AlignmentAnchor, OP_START, OP_MATCH, OP_DELETE
import BGZFStreams: BGZFStream
import BioGenerics.Exceptions: MissingFieldException import BioGenerics.Exceptions: MissingFieldException
import BioSequences: @dna_str, @aa_str import BioSequences: @dna_str, @aa_str

View file

@ -44,7 +44,6 @@
reader = open(BAM.Reader, joinpath(bamdir, "ce#1.bam")) reader = open(BAM.Reader, joinpath(bamdir, "ce#1.bam"))
@test isa(reader, BAM.Reader) @test isa(reader, BAM.Reader)
@test eltype(reader) === BAM.Record @test eltype(reader) === BAM.Record
@test startswith(repr(reader), "XAM.BAM.Reader{IOStream}:")
# header # header
h = header(reader) h = header(reader)
@ -199,7 +198,8 @@
header_original = header(reader) header_original = header(reader)
writer = BAM.Writer(BGZFStream(path, "w"), BAM.header(reader, fillSQ=isempty(findall(header(reader), "SQ")))) hdr = BAM.header(reader, fillSQ=isempty(findall(header(reader), "SQ")))
writer = BAM.Writer(BGZFCompressorStream(open(path, "w")), hdr)
records = BAM.Record[] records = BAM.Record[]
for record in reader for record in reader