From 37965ac7eb80992a0580d86b7e4d9a67b65e752f Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Fri, 30 Dec 2022 14:22:45 -0600 Subject: [PATCH] Move Variant-related code to Variant.jl --- src/SequenceVariation.jl | 160 +------------------------------------- src/Variant.jl | 161 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 162 insertions(+), 159 deletions(-) create mode 100644 src/Variant.jl diff --git a/src/SequenceVariation.jl b/src/SequenceVariation.jl index 25f5260..dc439b6 100644 --- a/src/SequenceVariation.jl +++ b/src/SequenceVariation.jl @@ -32,165 +32,7 @@ struct Unsafe end struct Inapplicable end include("Edit.jl") - -# Edits are applied sequentially from first to last pos. -# The vector must always be sorted by pos. -struct Variant{S <: BioSequence, T <: BioSymbol} - ref::S - edits::Vector{Edit{S, T}} - - Variant{S, T}(ref::S, edits::Vector{Edit{S, T}}, ::Unsafe) where {S, T} = new(ref, edits) -end - -function Variant{S,T}(ref::S, edits::Vector{Edit{S, T}}) where {S<:BioSequence, T<:BioSymbol} - sort!(edits, by=x -> x.pos) - result = Variant{S, T}(ref, edits, Unsafe()) - is_valid(result) || error("TODO") # report what kind of error message? - return result -end - -function Variant(ref::S, edits::Vector{Edit{S, T}}) where {S<:BioSequence, T<:BioSymbol} - Variant{S, T}(ref, edits) -end - - - -function Base.show(io::IO, x::Variant) - n = length(x.edits) - print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):") - for i in x.edits - v = Variation(x.ref, i) - print(io, "\n ") - show(io, v) - end -end - -# Validate: -# A sequence is invalid if any of its operations are out of bounds, or the same position -# is affected by multiple edits. -function is_valid(v::Variant) - isempty(v.ref) && return false - valid_positions = 1:length(v.ref) - last_was_insert = false - for edit in v.edits - pos = edit.pos - op = edit.x - # Sanity check: for this to be a valid variant, it must be comprised of valid - # variations - is_valid(Variation(v.ref, edit)) || return false - - # For substitutions we simply do not allow another modification of the same base - if op isa Substitution - pos in valid_positions || return false - valid_positions = first(valid_positions) + 1 : last(valid_positions) - last_was_insert = false - # Insertions affect 0 reference bases, so it does not modify the valid positions - # for next op. However, we cannot have two insertions at the same position, because - # then the order of them is ambiguous - elseif op isa Insertion - pos in (first(valid_positions)-1+last_was_insert:last(valid_positions)+1) || return false - last_was_insert = true - # Deletions obviously invalidate the reference bases that are deleted. - elseif op isa Deletion - len = length(op) - pos in (first(valid_positions):last(valid_positions)-len+1) || return false - valid_positions = first(valid_positions) + len : last(valid_positions) - last_was_insert = false - end - end - return true -end - -function Variant(aln::PairwiseAlignment{T, T}) where {T <: LongSequence{<:Union{BS.AminoAcidAlphabet, BS.NucleicAcidAlphabet}}} - ref = aln.b - E = eltype(typeof(ref)) - edits = Edit{T, E}[] - refpos = first(aln.a.aln.anchors).refpos - seqpos = first(aln.a.aln.anchors).seqpos - markpos = 0 - n_gaps = n_ins = 0 - insertion_buffer = E[] - for (seqi, refi) in aln - isgap(refi) || (refpos += 1) - isgap(seqi) || (seqpos += 1) - - # Check for deletions - if isgap(seqi) - iszero(n_gaps) && (markpos = refpos) - n_gaps += 1 - elseif !iszero(n_gaps) - push!(edits, Edit{T, E}(Deletion(UInt(n_gaps)), UInt(markpos))) - n_gaps = 0 - end - - # Check for insertions - if isgap(refi) - iszero(n_ins) && (markpos = refpos + 1) - push!(insertion_buffer, seqi) - n_ins += 1 - elseif !iszero(n_ins) - seq = T(insertion_buffer) - push!(edits, Edit{T, E}(Insertion(seq), UInt(markpos))) - empty!(insertion_buffer) - n_ins = 0 - end - - # Substitutions - if !isgap(refi) && !isgap(seqi) && seqi != refi - push!(edits, Edit{T, E}(Substitution{E}(seqi), UInt(refpos))) - end - end - - # Check for clips at the end of the alignment - last_anchors = aln.a.aln.anchors[end-1:end] - - # Final indel, if applicable - if !any(anchor -> anchor.op == OP_SOFT_CLIP, last_anchors) - if !iszero(n_gaps) - push!(edits, Edit{T, E}(Deletion(UInt(n_gaps)), UInt(markpos))) - elseif !iszero(n_ins) - push!(edits, Edit{T, E}(Insertion(T(insertion_buffer)), UInt(markpos))) - end - end - - return Variant(ref, edits) -end - -edits(v::Variant) = v.edits -reference(v::Variant) = v.ref -Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits - -function reconstruct!(seq::S, x::Variant{S}) where S - len = length(x.ref) + sum(edit -> lendiff(edit), x.edits) - resize!(seq, len % UInt) - refpos = seqpos = 1 - for edit in x.edits - while refpos < edit.pos - seq[seqpos] = x.ref[refpos] - refpos += 1 - seqpos += 1 - end - editx = edit.x - if editx isa Substitution - seq[seqpos] = editx.x - seqpos += 1 - refpos += 1 - elseif editx isa Deletion - refpos += editx.len - elseif editx isa Insertion - for i in editx.x - seq[seqpos] = i - seqpos += 1 - end - end - end - while seqpos ≤ length(seq) - seq[seqpos] = x.ref[refpos] - refpos += 1 - seqpos += 1 - end - seq -end +include("Variant.jl") struct Variation{S <: BioSequence, T <: BioSymbol} ref::S diff --git a/src/Variant.jl b/src/Variant.jl new file mode 100644 index 0000000..02f2d03 --- /dev/null +++ b/src/Variant.jl @@ -0,0 +1,161 @@ +# Edits are applied sequentially from first to last pos. +# The vector must always be sorted by pos. +struct Variant{S<:BioSequence,T<:BioSymbol} + ref::S + edits::Vector{Edit{S,T}} + + Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits) +end + +function Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol} + sort!(edits; by=x -> x.pos) + result = Variant{S,T}(ref, edits, Unsafe()) + is_valid(result) || error("TODO") # report what kind of error message? + return result +end + +function Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol} + return Variant{S,T}(ref, edits) +end + +function Base.show(io::IO, x::Variant) + n = length(x.edits) + print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):") + for i in x.edits + v = Variation(x.ref, i) + print(io, "\n ") + show(io, v) + end +end + +# Validate: +# A sequence is invalid if any of its operations are out of bounds, or the same position +# is affected by multiple edits. +function is_valid(v::Variant) + isempty(v.ref) && return false + valid_positions = 1:length(v.ref) + last_was_insert = false + for edit in v.edits + pos = edit.pos + op = edit.x + # Sanity check: for this to be a valid variant, it must be comprised of valid + # variations + is_valid(Variation(v.ref, edit)) || return false + + # For substitutions we simply do not allow another modification of the same base + if op isa Substitution + pos in valid_positions || return false + valid_positions = (first(valid_positions) + 1):last(valid_positions) + last_was_insert = false + # Insertions affect 0 reference bases, so it does not modify the valid positions + # for next op. However, we cannot have two insertions at the same position, because + # then the order of them is ambiguous + elseif op isa Insertion + pos in + ((first(valid_positions) - 1 + last_was_insert):(last(valid_positions) + 1)) || + return false + last_was_insert = true + # Deletions obviously invalidate the reference bases that are deleted. + elseif op isa Deletion + len = length(op) + pos in (first(valid_positions):(last(valid_positions) - len + 1)) || + return false + valid_positions = (first(valid_positions) + len):last(valid_positions) + last_was_insert = false + end + end + return true +end + +function Variant( + aln::PairwiseAlignment{T,T} +) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}} + ref = aln.b + E = eltype(typeof(ref)) + edits = Edit{T,E}[] + refpos = first(aln.a.aln.anchors).refpos + seqpos = first(aln.a.aln.anchors).seqpos + markpos = 0 + n_gaps = n_ins = 0 + insertion_buffer = E[] + for (seqi, refi) in aln + isgap(refi) || (refpos += 1) + isgap(seqi) || (seqpos += 1) + + # Check for deletions + if isgap(seqi) + iszero(n_gaps) && (markpos = refpos) + n_gaps += 1 + elseif !iszero(n_gaps) + push!(edits, Edit{T,E}(Deletion(UInt(n_gaps)), UInt(markpos))) + n_gaps = 0 + end + + # Check for insertions + if isgap(refi) + iszero(n_ins) && (markpos = refpos + 1) + push!(insertion_buffer, seqi) + n_ins += 1 + elseif !iszero(n_ins) + seq = T(insertion_buffer) + push!(edits, Edit{T,E}(Insertion(seq), UInt(markpos))) + empty!(insertion_buffer) + n_ins = 0 + end + + # Substitutions + if !isgap(refi) && !isgap(seqi) && seqi != refi + push!(edits, Edit{T,E}(Substitution{E}(seqi), UInt(refpos))) + end + end + + # Check for clips at the end of the alignment + last_anchors = aln.a.aln.anchors[(end - 1):end] + + # Final indel, if applicable + if !any(anchor -> anchor.op == OP_SOFT_CLIP, last_anchors) + if !iszero(n_gaps) + push!(edits, Edit{T,E}(Deletion(UInt(n_gaps)), UInt(markpos))) + elseif !iszero(n_ins) + push!(edits, Edit{T,E}(Insertion(T(insertion_buffer)), UInt(markpos))) + end + end + + return Variant(ref, edits) +end + +edits(v::Variant) = v.edits +reference(v::Variant) = v.ref +Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits + +function reconstruct!(seq::S, x::Variant{S}) where {S} + len = length(x.ref) + sum(edit -> lendiff(edit), x.edits) + resize!(seq, len % UInt) + refpos = seqpos = 1 + for edit in x.edits + while refpos < edit.pos + seq[seqpos] = x.ref[refpos] + refpos += 1 + seqpos += 1 + end + editx = edit.x + if editx isa Substitution + seq[seqpos] = editx.x + seqpos += 1 + refpos += 1 + elseif editx isa Deletion + refpos += editx.len + elseif editx isa Insertion + for i in editx.x + seq[seqpos] = i + seqpos += 1 + end + end + end + while seqpos ≤ length(seq) + seq[seqpos] = x.ref[refpos] + refpos += 1 + seqpos += 1 + end + return seq +end