diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml index 98d1792..b3fc56e 100644 --- a/.github/workflows/CompatHelper.yml +++ b/.github/workflows/CompatHelper.yml @@ -1,17 +1,45 @@ name: CompatHelper - on: schedule: - - cron: '0 0 * * *' - + - cron: 0 0 * * * + workflow_dispatch: +permissions: + contents: write + pull-requests: write jobs: CompatHelper: runs-on: ubuntu-latest steps: - - name: Add CompatHelper - run: julia --color=yes -e 'using Pkg; Pkg.add("CompatHelper")' - - name: Run CompatHelper + - name: Check if Julia is already available in the PATH + id: julia_in_path + run: which julia + continue-on-error: true + - name: Install Julia, but only if it is not already available in the PATH + uses: julia-actions/setup-julia@v1 + with: + version: '1' + arch: ${{ runner.arch }} + if: steps.julia_in_path.outcome != 'success' + - name: "Add the General registry via Git" + run: | + import Pkg + ENV["JULIA_PKG_SERVER"] = "" + Pkg.Registry.add("General") + shell: julia --color=yes {0} + - name: "Install CompatHelper" + run: | + import Pkg + name = "CompatHelper" + uuid = "aa819f21-2bde-4658-8897-bab36330d9b7" + version = "3" + Pkg.add(; name, uuid, version) + shell: julia --color=yes {0} + - name: "Run CompatHelper" + run: | + import CompatHelper + CompatHelper.main() + shell: julia --color=yes {0} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }} - run: julia --color=yes -e 'using CompatHelper; CompatHelper.main(master_branch = "master")' + COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} + # COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }} diff --git a/.github/workflows/Documentation.yml b/.github/workflows/Documentation.yml index 85a6cd9..7e683ea 100644 --- a/.github/workflows/Documentation.yml +++ b/.github/workflows/Documentation.yml @@ -1,5 +1,4 @@ -name: Build Documentation - +name: Documentation on: push: branches: @@ -10,17 +9,15 @@ on: pull_request: jobs: - build: + Documenter: + permissions: + contents: write + name: Documentation runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - uses: julia-actions/setup-julia@v1 - with: - version: '1' - - name: Install Dependencies - run: julia --color=yes --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()' - - name: Build and Deploy + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-docdeploy@v1 env: - # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # For authentication with GitHub Actions token - DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} # For authentication with SSH deploy key - run: julia --color=yes --project=docs/ docs/make.jl + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml index 086a8e5..9bdb8d7 100644 --- a/.github/workflows/TagBot.yml +++ b/.github/workflows/TagBot.yml @@ -1,12 +1,22 @@ name: TagBot + on: - schedule: - - cron: '0 0 * * *' + issue_comment: + types: + - created + workflow_dispatch: + inputs: + lookback: + default: 3 +permissions: + contents: write + jobs: TagBot: + if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot' runs-on: ubuntu-latest steps: - uses: JuliaRegistries/TagBot@v1 with: token: ${{ secrets.GITHUB_TOKEN }} - ssh: ${{ secrets.TAGBOT_KEY }} + ssh: ${{ secrets.DOCUMENTER_KEY }} diff --git a/.github/workflows/UnitTests.yml b/.github/workflows/UnitTests.yml index 9d5c078..a6cbe9e 100644 --- a/.github/workflows/UnitTests.yml +++ b/.github/workflows/UnitTests.yml @@ -6,16 +6,20 @@ on: jobs: test: + name: Julia ${{ matrix.julia-version }} - ${{ matrix.os }} - ${{ matrix.julia-arch }} - ${{ github.event_name }} runs-on: ${{ matrix.os }} continue-on-error: ${{ matrix.experimental }} strategy: fail-fast: false matrix: julia-version: - - '1.0' # LTS + - '1.6' # LTS - '1' julia-arch: [x64, x86] os: [ubuntu-latest, windows-latest, macOS-latest] + exclude: + - os: macOS-latest + julia-arch: x86 experimental: [false] include: - julia-version: nightly @@ -31,7 +35,7 @@ jobs: with: version: ${{ matrix.julia-version }} - name: Run Tests - uses: julia-actions/julia-runtest@latest + uses: julia-actions/julia-runtest@v1 - name: Create CodeCov uses: julia-actions/julia-processcoverage@v1 - name: Upload CodeCov diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..f4ef97d --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,27 @@ +# Changelog + +All notable changes to HapLink.jl will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +## [0.3.0] + +## Added + +- Crosschecks for SAM and BAM ([#29](https://github.com/BioJulia/XAM.jl/pull/29)) +- Improved documentation for flags ([#43](https://github.com/BioJulia/XAM.jl/pull/43)) + +### Changed + +- `BAM.quality` performance improved ([#21](https://github.com/BioJulia/XAM.jl/issues/21)) +- Updated BioAlignments to v2.2 and BioSequences to v3 ([#48](https://github.com/BioJulia/XAM.jl/pull/48)) + +### Fixed + +- `BAM.Record` layout now matches the BAM specs ([#26](https://github.com/BioJulia/XAM.jl/pull/26)) + +[Unreleased]: https://github.com/BioJulia/XAM.jl/compare/v0.3.0...HEAD +[0.3.0]: https://github.com/BioJulia/XAM.jl/compare/v0.2.8...v0.3.0 diff --git a/Project.toml b/Project.toml index 9b23f8f..4ff9efb 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "XAM" uuid = "d759349c-bcba-11e9-07c2-5b90f8f05f7c" authors = ["Kenta Sato ", "Ben J. Ward ", "CiarĂ¡n O'Mara "] -version = "0.2.8" +version = "0.3.0" [deps] Automa = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b" @@ -17,18 +17,19 @@ TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" [compat] Automa = "0.7, 0.8" BGZFStreams = "0.3.1" -BioAlignments = "2" +BioAlignments = "2.2" BioGenerics = "0.1" -BioSequences = "2.0.4" +BioSequences = "3" FormatSpecimens = "1.1" GenomicFeatures = "2" Indexes = "0.1" TranscodingStreams = "0.6, 0.7, 0.8, 0.9" -julia = "1" +julia = "1.6" [extras] +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" FormatSpecimens = "3372ea36-2a1a-11e9-3eb7-996970b6ffbd" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["FormatSpecimens", "Test"] +test = ["Documenter", "FormatSpecimens", "Test"] diff --git a/README.md b/README.md index 951c649..6b83fc1 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,6 @@ [![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/XAM.jl/blob/master/LICENSE) [![Stable documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/XAM.jl/stable) [![Latest documentation](https://img.shields.io/badge/docs-dev-blue.svg)](https://biojulia.github.io/XAM.jl/dev/) -[![Join the chat at https://gitter.im/BioJulia/XAM.jl](https://badges.gitter.im/BioJulia/XAM.jl.svg)](https://gitter.im/BioJulia/XAM.jl) > This project follows the [semver](http://semver.org) pro forma and uses the [git-flow branching model](https://nvie.com/posts/a-successful-git-branching-model/ "original blog post"). @@ -66,4 +65,4 @@ Your logo will show up here with a link to your website. ## Questions? -If you have a question about contributing or using BioJulia software, come on over and chat to us on [Gitter](https://gitter.im/BioJulia/General), or you can try the [Bio category of the Julia discourse site](https://discourse.julialang.org/c/domain/bio). +If you have a question about contributing or using BioJulia software, come on over and chat to us on [the Julia Slack workspace](https://julialang.org/slack/), or you can try the [Bio category of the Julia discourse site](https://discourse.julialang.org/c/domain/bio). diff --git a/docs/Project.toml b/docs/Project.toml index 3506869..b284c6f 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -3,4 +3,4 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [compat] -Documenter = "0.24" +Documenter = "0.27" diff --git a/docs/make.jl b/docs/make.jl index fcbd601..db96e6e 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -2,6 +2,8 @@ using Pkg using Documenter, XAM makedocs( + checkdocs = :all, + linkcheck = true, format = Documenter.HTML( edit_link = "develop" ), diff --git a/docs/src/index.md b/docs/src/index.md index de6ee30..43b19cf 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -5,7 +5,6 @@ [![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/XAM.jl/blob/master/LICENSE) [![Stable documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/XAM.jl/stable) [![Latest documentation](https://img.shields.io/badge/docs-dev-blue.svg)](https://biojulia.github.io/XAM.jl/dev/) -[![Join the chat at https://gitter.im/BioJulia/XAM.jl](https://badges.gitter.im/BioJulia/XAM.jl.svg)](https://gitter.im/BioJulia/XAM.jl) > This project follows the [semver](http://semver.org) pro forma and uses the [git-flow branching model](https://nvie.com/posts/a-successful-git-branching-model/). @@ -65,4 +64,4 @@ Your logo will show up here with a link to your website. ## Questions? -If you have a question about contributing or using BioJulia software, come on over and chat to us on [Gitter](https://gitter.im/BioJulia/General), or you can try the [Bio category of the Julia discourse site](https://discourse.julialang.org/c/domain/bio). +If you have a question about contributing or using BioJulia software, come on over and chat to us on [the Julia Slack workspace](https://julialang.org/slack/), or you can try the [Bio category of the Julia discourse site](https://discourse.julialang.org/c/domain/bio). diff --git a/src/XAM.jl b/src/XAM.jl index 8609747..2c5bad1 100644 --- a/src/XAM.jl +++ b/src/XAM.jl @@ -4,6 +4,27 @@ export SAM, BAM +""" + flag(record::Union{SAM.Record, BAM.Record})::UInt16 + +Get the bitwise flags of `record`. The returned value is a `UInt16` of each flag +being OR'd together. The possible flags are: + + 0x0001 template having multiple segments in sequencing + 0x0002 each segment properly aligned according to the aligner + 0x0004 segment unmapped + 0x0008 next segment in the template unmapped + 0x0010 SEQ being reverse complemented + 0x0020 SEQ of the next segment in the template being reverse complemented + 0x0040 the first segment in the template + 0x0080 the last segment in the template + 0x0100 secondary alignment + 0x0200 not passing filters, such as platform/vendor quality controls + 0x0400 PCR or optical duplicate + 0x0800 supplementary alignment +""" +function flag end + include("sam/sam.jl") include("bam/bam.jl") diff --git a/src/bam/bam.jl b/src/bam/bam.jl index 6fd1174..b3591d7 100644 --- a/src/bam/bam.jl +++ b/src/bam/bam.jl @@ -6,6 +6,7 @@ module BAM using BioGenerics using GenomicFeatures using XAM.SAM +import ..XAM: flag import BGZFStreams import BioAlignments diff --git a/src/bam/overlap.jl b/src/bam/overlap.jl index c6cf8ac..fb28b1f 100644 --- a/src/bam/overlap.jl +++ b/src/bam/overlap.jl @@ -50,12 +50,15 @@ function Base.iterate(iter::OverlapIterator) if refindex === nothing throw(ArgumentError("sequence name $(iter.refname) is not found in the header")) end - @assert iter.reader.index !== nothing + + @assert iter.reader.index !== nothing "Reader index cannot be nothing." + chunks = Indexes.overlapchunks(iter.reader.index.index, refindex, iter.interval) - if !isempty(chunks) - seek(iter.reader, first(chunks).start) + if isempty(chunks) + return nothing end state = OverlapIteratorState(refindex, chunks, 1, Record()) + seek(iter.reader, state.chunks[state.chunkid].start) return iterate(iter, state) end @@ -65,7 +68,7 @@ function Base.iterate(iter::OverlapIterator, state) while BGZFStreams.virtualoffset(iter.reader.stream) < chunk.stop read!(iter.reader, state.record) c = compare_intervals(state.record, (state.refindex, iter.interval)) - if c == 0 + if c == 0 # overlapping return copy(state.record), state end if c > 0 @@ -78,6 +81,7 @@ function Base.iterate(iter::OverlapIterator, state) seek(iter.reader, state.chunks[state.chunkid].start) end end + # no more overlapping records return nothing end @@ -89,7 +93,7 @@ function compare_intervals(record::Record, interval::Tuple{Int,UnitRange{Int}}) return -1 end - if rid > interval[1] || (rid == interval[1] && position(record) > last(interval[2])) + if rid > interval[1] || (rid == interval[1] && leftposition(record) > last(interval[2])) # strictly right return +1 end diff --git a/src/bam/record.jl b/src/bam/record.jl index 56e048d..6de18a5 100644 --- a/src/bam/record.jl +++ b/src/bam/record.jl @@ -11,18 +11,21 @@ mutable struct Record block_size::Int32 refid::Int32 pos::Int32 - bin_mq_nl::UInt32 - flag_nc::UInt32 + l_read_name::UInt8 + mapq::UInt8 + bin::UInt16 + n_cigar_op::UInt16 + flag::UInt16 l_seq::Int32 next_refid::Int32 next_pos::Int32 tlen::Int32 # variable length data data::Vector{UInt8} - reader::Reader + reader::Union{Reader, Nothing} function Record() - return new(0, 0, 0, 0, 0, 0, 0, 0, 0, UInt8[]) + return new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, UInt8[]) end end @@ -49,8 +52,11 @@ function Base.:(==)(a::Record, b::Record) return a.block_size == b.block_size && a.refid == b.refid && a.pos == b.pos && - a.bin_mq_nl == b.bin_mq_nl && - a.flag_nc == b.flag_nc && + a.l_read_name == b.l_read_name && + a.mapq == b.mapq && + a.bin == b.bin && + a.n_cigar_op == b.n_cigar_op && + a.flag == b.flag && a.l_seq == b.l_seq && a.next_refid == b.next_refid && a.next_pos == b.next_pos && @@ -60,19 +66,13 @@ end function Base.copy(record::Record) copy = Record() - copy.block_size = record.block_size - copy.refid = record.refid - copy.pos = record.pos - copy.bin_mq_nl = record.bin_mq_nl - copy.flag_nc = record.flag_nc - copy.l_seq = record.l_seq - copy.next_refid = record.next_refid - copy.next_pos = record.next_pos - copy.tlen = record.tlen - copy.data = record.data[1:data_size(record)] - if isdefined(record, :reader) - copy.reader = record.reader + GC.@preserve copy record begin + dst_pointer = Ptr{UInt8}(pointer_from_objref(copy)) + src_pointer = Ptr{UInt8}(pointer_from_objref(record)) + unsafe_copyto!(dst_pointer, src_pointer, FIXED_FIELDS_BYTES) end + copy.data = record.data[1:data_size(record)] + copy.reader = record.reader return copy end @@ -80,15 +80,18 @@ function Base.empty!(record::Record) record.block_size = 0 record.refid = 0 record.pos = 0 - record.bin_mq_nl = 0 - record.flag_nc = 0 + record.l_read_name = 0 + record.mapq = 0 + record.bin = 0 + record.flag = 0 + record.n_cigar_op = 0 record.l_seq = 0 record.next_refid = 0 record.next_pos = 0 record.tlen = 0 #Note: data will be overwritten and indexed using data_size. - + return record end @@ -125,14 +128,9 @@ end # Accessor Fuctions # ----------------- -""" - flag(record::Record)::UInt16 - -Get the bitwise flag of `record`. -""" function flag(record::Record)::UInt16 checkfilled(record) - return UInt16(record.flag_nc >> 16) + return record.flag end function hasflag(record::Record) @@ -318,7 +316,7 @@ end Get the mapping quality of `record`. """ function mappingquality(record::Record)::UInt8 - return UInt8((record.bin_mq_nl >> 8) & 0xff) + return record.mapq end function hasmappingquality(record::Record) @@ -397,7 +395,7 @@ function extract_cigar_rle(data::Vector{UInt8}, offset, n) end function cigar_position(record::Record, checkCG::Bool = true)::Tuple{Int, Int} - cigaridx, nops = seqname_length(record) + 1, record.flag_nc & 0xFFFF + cigaridx, nops = seqname_length(record) + 1, record.n_cigar_op if !checkCG return cigaridx, nops end @@ -507,13 +505,16 @@ function hastemplength(record::Record) end """ - sequence(record::Record)::BioSequences.LongDNASeq + sequence(record::Record)::BioSequences.LongDNA{4} Get the segment sequence of `record`. """ -function sequence(record::Record)::BioSequences.LongDNASeq +function sequence(record::Record) checkfilled(record) seqlen = seqlength(record) + if seqlen == 0 + return nothing + end data = Vector{UInt64}(undef, cld(seqlen, 16)) src::Ptr{UInt64} = pointer(record.data, seqname_length(record) + n_cigar_op(record, false) * 4 + 1) for i in 1:lastindex(data) @@ -521,7 +522,7 @@ function sequence(record::Record)::BioSequences.LongDNASeq x = unsafe_load(src, i) data[i] = (x & 0x0f0f0f0f0f0f0f0f) << 4 | (x & 0xf0f0f0f0f0f0f0f0) >> 4 end - return BioSequences.LongDNASeq(data, 1:seqlen, false) + return BioSequences.LongDNA{4}(data, UInt(seqlen)) end function hassequence(record::Record) @@ -543,15 +544,15 @@ function hasseqlength(record::Record) end """ - quality(record::Record)::Vector{UInt8} + quality(record::Record) -Get the base quality of `record`. +Get the base quality of `record`. """ -function quality(record::Record)::Vector{UInt8} +function quality(record::Record) checkfilled(record) seqlen = seqlength(record) offset = seqname_length(record) + n_cigar_op(record, false) * 4 + cld(seqlen, 2) - return [reinterpret(Int8, record.data[i+offset]) for i in 1:seqlen] + return record.data[(1+offset):(seqlen+offset)] end function hasquality(record::Record) @@ -661,5 +662,5 @@ end # Return the length of the read name. function seqname_length(record::Record) - return record.bin_mq_nl & 0xff + return record.l_read_name end diff --git a/src/bam/writer.jl b/src/bam/writer.jl index 2460b3f..81bdf23 100644 --- a/src/bam/writer.jl +++ b/src/bam/writer.jl @@ -37,7 +37,7 @@ function Base.write(writer::Writer, record::Record) end function write_header(stream, header, refseqnames, refseqlens) - @assert length(refseqnames) == length(refseqlens) + @assert length(refseqnames) == length(refseqlens) "Lengths of refseq names and lengths must match." n = 0 # magic bytes diff --git a/src/sam/flags.jl b/src/sam/flags.jl index 0b629d1..ded0df1 100644 --- a/src/sam/flags.jl +++ b/src/sam/flags.jl @@ -16,11 +16,14 @@ for (name, bits, doc) in [ (:QCFAIL, UInt16(0x200), "QC failure" ), (:DUP, UInt16(0x400), "optical or PCR duplicate" ), (:SUPPLEMENTARY, UInt16(0x800), "supplementary alignment" ),] - @assert bits isa UInt16 + @assert bits isa UInt16 "The bits must be of type UInt16." sym = Symbol("FLAG_", name) - doc = string(@sprintf("0x%04x: ", bits), doc) + docstring = """ $sym + SAM/BAM flag: $doc + + See also: [`flag`](@ref) + """ @eval begin - @doc $(doc) const $(sym) = $(bits) + @doc $(docstring) const $(sym) = $(bits) end end - diff --git a/src/sam/header.jl b/src/sam/header.jl index 668cb59..5111151 100644 --- a/src/sam/header.jl +++ b/src/sam/header.jl @@ -38,7 +38,7 @@ function Base.iterate(header::Header, i=1) end """ - find(header::Header, key::AbstractString)::Vector{MetaInfo} + findall(header::Header, key::AbstractString)::Vector{MetaInfo} Find metainfo objects satisfying `SAM.tag(metainfo) == key`. """ diff --git a/src/sam/reader.jl b/src/sam/reader.jl index aa5546e..46fd286 100644 --- a/src/sam/reader.jl +++ b/src/sam/reader.jl @@ -10,12 +10,16 @@ function Reader(state::State{S}) where {S <: TranscodingStream} rdr = Reader(state, Header()) - cs, ln, f = readheader!(rdr.state.stream, rdr.header, (sam_machine_header.start_state, rdr.state.linenum)) + cs, ln = readheader!(rdr.state.stream, rdr.header, (sam_machine_header.start_state, rdr.state.linenum)) - rdr.state.state = sam_machine_body.start_state + rdr.state.state = sam_machine_body.start_state # Get the reader ready to read the body. rdr.state.linenum = ln rdr.state.filled = false + if cs != -1 && cs != 0 #Note: the header is finished when the state machine fails to transition after a new line (state 1). + throw(ArgumentError("Malformed SAM file header at line $(ln). Machine failed to transition from state $(cs).")) + end + return rdr end @@ -65,18 +69,18 @@ end function index!(record::MetaInfo) stream = TranscodingStreams.NoopStream(IOBuffer(record.data)) - found = index!(stream, record) - if !found - throw(ArgumentError("invalid SAM metadata")) + cs = index!(stream, record) + if cs != 0 + throw(ArgumentError("Invalid SAM metadata. Machine failed to transition from state $(cs).")) end return record end function index!(record::Record) stream = TranscodingStreams.NoopStream(IOBuffer(record.data)) - found = index!(stream, record) - if !found - throw(ArgumentError("invalid SAM record")) + cs = index!(stream, record) + if cs != 0 + throw(ArgumentError("Invalid SAM record. Machine failed to transition from state $(cs).")) end return record end @@ -110,5 +114,6 @@ function Base.read!(rdr::Reader, record::Record) throw(EOFError()) end - throw(ArgumentError("malformed SAM file")) + throw(ArgumentError("Malformed SAM file record at line $(ln). Machine failed to transition from state $(cs).")) + end diff --git a/src/sam/readrecord.jl b/src/sam/readrecord.jl index e2d6669..a3c91c2 100644 --- a/src/sam/readrecord.jl +++ b/src/sam/readrecord.jl @@ -41,7 +41,7 @@ const sam_machine_metainfo, sam_machine_record, sam_machine_header, sam_machine_ co.actions[:enter] = [:pos1] co.actions[:exit] = [:metainfo_tag] - comment = re"[^\r\n]*" + comment = re"[^\r\n]*" # Note: Only single line comments are allowed. comment.actions[:enter] = [:pos1] comment.actions[:exit] = [:metainfo_val] @@ -137,7 +137,7 @@ const sam_machine_metainfo, sam_machine_record, sam_machine_header, sam_machine_ header = rep(cat(metainfo, newline)) header.actions[:exit] = [:header] - body = record * rep(newline * record) * opt(newline) + body = rep(cat(record, newline)) body.actions[:exit] = [:body] sam = cat(header, body) @@ -168,13 +168,6 @@ function appendfrom!(dst, dpos, src, spos, n) return dst end -const action_metainfo = quote - appendfrom!(metainfo.data, 1, data, @markpos, p-@markpos) - metainfo.filled = 1:(p-@markpos) - - found_metainfo = true -end - const sam_actions_metainfo = Dict( :mark => :(@mark), :pos1 => :(pos1 = @relpos(p)), @@ -183,7 +176,10 @@ const sam_actions_metainfo = Dict( :metainfo_val => :(metainfo.val = pos1:@relpos(p-1)), :metainfo_dict_key => :(push!(metainfo.dictkey, pos2:@relpos(p-1))), :metainfo_dict_val => :(push!(metainfo.dictval, pos2:@relpos(p-1))), - :metainfo => action_metainfo + :metainfo => quote + appendfrom!(metainfo.data, 1, data, @markpos, p-@markpos) + metainfo.filled = 1:(p-@markpos) + end ) const sam_actions_header = merge( @@ -191,16 +187,11 @@ const sam_actions_header = merge( Dict( :countline => :(linenum += 1), :metainfo => quote - $(action_metainfo) + $(sam_actions_metainfo[:metainfo]) push!(header, metainfo) metainfo = MetaInfo() end, - :header => quote - - finish_header = true - - @escape - end + :header => :(@escape) ) ) @@ -222,9 +213,6 @@ const sam_actions_record = Dict( :record => quote appendfrom!(record.data, 1, data, @markpos, p-@markpos) record.filled = 1:(p-@markpos) - - found_record = true - @escape end ) @@ -232,18 +220,15 @@ const sam_actions_body = merge( sam_actions_record, Dict( :countline => :(linenum += 1), - :body => quote - finish_body = true + :record => quote + found_record = true + $(sam_actions_record[:record]) @escape - end + end, + :body => :(@escape) ) ) -# const sam_actions = merge( -# sam_actions_header, -# sam_actions_body -# ) - const sam_context = Automa.CodeGenContext( generator = :goto, checkbounds = false, @@ -253,43 +238,24 @@ const sam_context = Automa.CodeGenContext( const sam_initcode_metainfo = quote pos1 = 0 pos2 = 0 - found_metainfo = false -end - -const sam_initcode_record = quote - pos = 0 - found_record = false end const sam_initcode_header = quote $(sam_initcode_metainfo) metainfo = MetaInfo() - finish_header = false cs, linenum = state end +const sam_initcode_record = quote + pos = 0 +end + const sam_initcode_body = quote $(sam_initcode_record) - finish_body = false + found_record = false cs, linenum = state end -const sam_loopcode_metainfo = quote - - if cs < 0 - throw(ArgumentError("malformed metainfo at pos $(p)")) - end - - if found_metainfo - @goto __return__ - end -end - -const sam_returncode_metainfo = quote - return found_metainfo -end - - Automa.Stream.generate_reader( :index!, sam_machine_metainfo, @@ -297,19 +263,10 @@ Automa.Stream.generate_reader( actions = sam_actions_metainfo, context = sam_context, initcode = sam_initcode_metainfo, - loopcode = sam_loopcode_metainfo, - returncode = sam_returncode_metainfo ) |> eval -const sam_loopcode_header = quote - - if finish_header - @goto __return__ - end -end - const sam_returncode_header = quote - return cs, linenum, finish_header + return cs, linenum end Automa.Stream.generate_reader( @@ -319,38 +276,13 @@ Automa.Stream.generate_reader( actions = sam_actions_header, context = sam_context, initcode = sam_initcode_header, - loopcode = sam_loopcode_header, returncode = sam_returncode_header ) |> eval - -const sam_loopcode_record = quote - - if cs < 0 - throw(ArgumentError("malformed SAM record at position $(p), line $(linenum)")) - end - - # # if cs != 0 - # # throw(ArgumentError(string("failed to index ", $(record_type), " ~>", repr(String(data[p:min(p+7,p_end)]))))) - # # end - +const sam_loopcode_body = quote if found_record @goto __return__ end - -end - -const sam_loopcode_body = quote - - $(sam_loopcode_record) - - if finish_body - @goto __return__ - end -end - -const sam_returncode_record = quote - return found_record end Automa.Stream.generate_reader( @@ -360,12 +292,9 @@ Automa.Stream.generate_reader( actions = sam_actions_record, context = sam_context, initcode = sam_initcode_record, - loopcode = sam_loopcode_record, - returncode = sam_returncode_record ) |> eval - const sam_returncode_body = quote return cs, linenum, found_record end diff --git a/src/sam/record.jl b/src/sam/record.jl index 2d94a12..ef8bb6d 100644 --- a/src/sam/record.jl +++ b/src/sam/record.jl @@ -2,10 +2,11 @@ # ========== mutable struct Record - # data and filled range + # Data and filled range. data::Vector{UInt8} - filled::UnitRange{Int} - # indexes + filled::UnitRange{Int} # Note: Specifies the data in use. + + # Mandatory fields. qname::UnitRange{Int} flag::UnitRange{Int} rname::UnitRange{Int} @@ -17,6 +18,8 @@ mutable struct Record tlen::UnitRange{Int} seq::UnitRange{Int} qual::UnitRange{Int} + + # Auxiliary fields. fields::Vector{UnitRange{Int}} end @@ -147,11 +150,6 @@ end # Accessor Functions # ------------------ -""" - flag(record::Record)::UInt16 - -Get the bitwise flag of `record`. -""" function flag(record::Record)::UInt16 checkfilled(record) return unsafe_parse_decimal(UInt16, record.data, record.flag) @@ -206,9 +204,9 @@ Get the 1-based leftmost mapping position of `record`. function position(record::Record)::Int checkfilled(record) pos = unsafe_parse_decimal(Int, record.data, record.pos) - if pos == 0 - missingerror(:position) - end + # if pos == 0 + # missingerror(:position) + # end return pos end @@ -263,9 +261,9 @@ Get the position of the mate/next read of `record`. function nextposition(record::Record)::Int checkfilled(record) pos = unsafe_parse_decimal(Int, record.data, record.pnext) - if pos == 0 - missingerror(:nextposition) - end + # if pos == 0 + # missingerror(:nextposition) + # end return pos end @@ -299,7 +297,8 @@ Get the CIGAR string of `record`. function cigar(record::Record)::String checkfilled(record) if ismissing(record, record.cigar) - missingerror(:cigar) + # missingerror(:cigar) + return "" end return String(record.data[record.cigar]) end @@ -377,9 +376,9 @@ Get the template length of `record`. function templength(record::Record)::Int checkfilled(record) len = unsafe_parse_decimal(Int, record.data, record.tlen) - if len == 0 - missingerror(:tlen) - end + # if len == 0 + # missingerror(:tlen) + # end return len end @@ -388,17 +387,18 @@ function hastemplength(record::Record) end """ - sequence(record::Record)::BioSequences.LongDNASeq + sequence(record::Record)::BioSequences.LongDNA{4} Get the segment sequence of `record`. """ -function sequence(record::Record)::BioSequences.LongDNASeq +function sequence(record::Record) checkfilled(record) if ismissing(record, record.seq) - missingerror(:sequence) + # missingerror(:sequence) + return nothing end seqlen = length(record.seq) - ret = BioSequences.LongDNASeq(seqlen) + ret = BioSequences.LongDNA{4}(undef, seqlen) copyto!(ret, 1, record.data, first(record.seq), seqlen) return ret end @@ -494,7 +494,7 @@ function Base.getindex(record::Record, tag::AbstractString) end if typ == UInt8('A') - @assert lo == hi + @assert lo == hi "Values lo and hi must be equivalent." return Char(record.data[lo]) end if typ == UInt8('i') diff --git a/src/sam/sam.jl b/src/sam/sam.jl index 9710269..fcd0821 100644 --- a/src/sam/sam.jl +++ b/src/sam/sam.jl @@ -11,6 +11,7 @@ import BioGenerics.Exceptions: missingerror import BioGenerics.Automa: State import BioSequences import TranscodingStreams: TranscodingStreams, TranscodingStream +import ..XAM: flag using Printf: @sprintf @@ -46,20 +47,6 @@ function unsafe_parse_decimal(::Type{T}, data::Vector{UInt8}, range::UnitRange{I return sign * x end -# #TODO: update BioCore.Ragel.State (will likely change with TrnscodingStreams). -# import BufferedStreams: BufferedStreams, BufferedInputStream -# # A type keeping track of a ragel-based parser's state. -# mutable struct State{T<:BufferedInputStream} -# stream::T # input stream -# cs::Int # current DFA state of Ragel -# linenum::Int # line number: parser is responsible for updating this -# finished::Bool # true if finished (regardless of where in the stream we are) -# end - -# function State(initstate::Int, input::BufferedInputStream) -# return State(input, initstate, 1, false) -# end - include("flags.jl") include("metainfo.jl") diff --git a/test/runtests.jl b/test/runtests.jl index 16f4725..d0a243e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,4 +1,5 @@ using Test +using Documenter using BioGenerics using FormatSpecimens @@ -23,6 +24,13 @@ function randrange(range) end -include("test_sam.jl") -include("test_bam.jl") -include("test_issues.jl") +@testset "XAM" begin + include("test_sam.jl") + include("test_bam.jl") + include("test_issues.jl") + include("test_crosscheck.jl") + + # Include doctests. + DocMeta.setdocmeta!(XAM, :DocTestSetup, :(using XAM); recursive=true) + doctest(XAM; manual = false) +end diff --git a/test/test_bam.jl b/test/test_bam.jl index 186d231..beb6d03 100644 --- a/test/test_bam.jl +++ b/test/test_bam.jl @@ -104,9 +104,12 @@ unsafe_copyto!(array, BAM.FIXED_FIELDS_BYTES + 1, record.data, 1, dsize) end new_record = convert(BAM.Record, array) - @test record.bin_mq_nl == new_record.bin_mq_nl + @test record.l_read_name == new_record.l_read_name + @test record.mapq == new_record.mapq + @test record.bin == new_record.bin @test record.block_size == new_record.block_size - @test record.flag_nc == new_record.flag_nc + @test record.flag == new_record.flag + @test record.n_cigar_op == new_record.n_cigar_op @test record.l_seq == new_record.l_seq @test record.next_refid == new_record.next_refid @test record.next_pos == new_record.next_pos @@ -162,18 +165,21 @@ if length(xs) != length(ys) return false end - for (x, y) in zip(xs, ys) + for (a, b) in zip(xs, ys) if !( - x.block_size == y.block_size && - x.refid == y.refid && - x.pos == y.pos && - x.bin_mq_nl == y.bin_mq_nl && - x.flag_nc == y.flag_nc && - x.l_seq == y.l_seq && - x.next_refid == y.next_refid && - x.next_pos == y.next_pos && - x.tlen == y.tlen && - x.data[1:BAM.data_size(x)] == y.data[1:BAM.data_size(y)]) + a.block_size == b.block_size && + a.refid == b.refid && + a.pos == b.pos && + a.l_read_name == b.l_read_name && + a.mapq == b.mapq && + a.bin == b.bin && + a.n_cigar_op == b.n_cigar_op && + a.flag == b.flag && + a.l_seq == b.l_seq && + a.next_refid == b.next_refid && + a.next_pos == b.next_pos && + a.tlen == b.tlen && + a.data[1:BAM.data_size(a)] == b.data[1:BAM.data_size(b)]) return false end end diff --git a/test/test_crosscheck.jl b/test/test_crosscheck.jl new file mode 100644 index 0000000..0e853ef --- /dev/null +++ b/test/test_crosscheck.jl @@ -0,0 +1,54 @@ +@testset "Cross Check Properties" begin + + Broadcast.broadcastable(x::XAM.BAM.Record) = Ref(x) #TODO: consider moving to XAM.jl. + Broadcast.broadcastable(x::XAM.SAM.Record) = Ref(x) #TODO: consider moving to XAM.jl + + function crosscheck(bam::BAM.Record, sam::SAM.Record, property::Symbol) + + bam_property = getproperty(XAM.BAM, property) + sam_property = getproperty(XAM.SAM, property) + + if bam_property(bam) != sam_property(sam) + @warn "$property result is not the same" bam_property(bam) sam_property(sam) + return false + end + + return true + end + + samdir = path_of_format("SAM") + bamdir = path_of_format("BAM") + + filenames = [ + "ce#1", + "ce#2", + "ce#5", + "ce#5b", + "ce#unmap", + "ce#unmap1", + "ce#unmap2", + ] + + properties = [ + :position,# POS + :tempname,# QNAME + :mappingquality,# MAPQ + :cigar, # CIGAR + :flag, # FLAG + :sequence, # SEQ + :nextposition, # PNEXT + :templength, # TLEN + ] + + for filename in filenames + + records_bam = collect(open(BAM.Reader, joinpath(bamdir, filename * ".bam"))) + records_sam = collect(open(SAM.Reader, joinpath(samdir, filename * ".sam"))) + + for (bam, sam) in zip(records_bam, records_sam) + @test all(crosscheck.(bam, sam, properties)) == true + end + + end + +end # testset Crosscheck diff --git a/test/test_sam.jl b/test/test_sam.jl index c6fadc1..195288d 100644 --- a/test/test_sam.jl +++ b/test/test_sam.jl @@ -199,6 +199,6 @@ records = open(collect, SAM.Reader, file_sam) @test records == [] - + end end