mirror of
https://github.com/MillironX/SequenceVariation.jl.git
synced 2024-11-25 22:59:56 +00:00
162 lines
5.1 KiB
Julia
162 lines
5.1 KiB
Julia
|
# Edits are applied sequentially from first to last pos.
|
||
|
# The vector must always be sorted by pos.
|
||
|
struct Variant{S<:BioSequence,T<:BioSymbol}
|
||
|
ref::S
|
||
|
edits::Vector{Edit{S,T}}
|
||
|
|
||
|
Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits)
|
||
|
end
|
||
|
|
||
|
function Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||
|
sort!(edits; by=x -> x.pos)
|
||
|
result = Variant{S,T}(ref, edits, Unsafe())
|
||
|
is_valid(result) || error("TODO") # report what kind of error message?
|
||
|
return result
|
||
|
end
|
||
|
|
||
|
function Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||
|
return Variant{S,T}(ref, edits)
|
||
|
end
|
||
|
|
||
|
function Base.show(io::IO, x::Variant)
|
||
|
n = length(x.edits)
|
||
|
print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):")
|
||
|
for i in x.edits
|
||
|
v = Variation(x.ref, i)
|
||
|
print(io, "\n ")
|
||
|
show(io, v)
|
||
|
end
|
||
|
end
|
||
|
|
||
|
# Validate:
|
||
|
# A sequence is invalid if any of its operations are out of bounds, or the same position
|
||
|
# is affected by multiple edits.
|
||
|
function is_valid(v::Variant)
|
||
|
isempty(v.ref) && return false
|
||
|
valid_positions = 1:length(v.ref)
|
||
|
last_was_insert = false
|
||
|
for edit in v.edits
|
||
|
pos = edit.pos
|
||
|
op = edit.x
|
||
|
# Sanity check: for this to be a valid variant, it must be comprised of valid
|
||
|
# variations
|
||
|
is_valid(Variation(v.ref, edit)) || return false
|
||
|
|
||
|
# For substitutions we simply do not allow another modification of the same base
|
||
|
if op isa Substitution
|
||
|
pos in valid_positions || return false
|
||
|
valid_positions = (first(valid_positions) + 1):last(valid_positions)
|
||
|
last_was_insert = false
|
||
|
# Insertions affect 0 reference bases, so it does not modify the valid positions
|
||
|
# for next op. However, we cannot have two insertions at the same position, because
|
||
|
# then the order of them is ambiguous
|
||
|
elseif op isa Insertion
|
||
|
pos in
|
||
|
((first(valid_positions) - 1 + last_was_insert):(last(valid_positions) + 1)) ||
|
||
|
return false
|
||
|
last_was_insert = true
|
||
|
# Deletions obviously invalidate the reference bases that are deleted.
|
||
|
elseif op isa Deletion
|
||
|
len = length(op)
|
||
|
pos in (first(valid_positions):(last(valid_positions) - len + 1)) ||
|
||
|
return false
|
||
|
valid_positions = (first(valid_positions) + len):last(valid_positions)
|
||
|
last_was_insert = false
|
||
|
end
|
||
|
end
|
||
|
return true
|
||
|
end
|
||
|
|
||
|
function Variant(
|
||
|
aln::PairwiseAlignment{T,T}
|
||
|
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
|
||
|
ref = aln.b
|
||
|
E = eltype(typeof(ref))
|
||
|
edits = Edit{T,E}[]
|
||
|
refpos = first(aln.a.aln.anchors).refpos
|
||
|
seqpos = first(aln.a.aln.anchors).seqpos
|
||
|
markpos = 0
|
||
|
n_gaps = n_ins = 0
|
||
|
insertion_buffer = E[]
|
||
|
for (seqi, refi) in aln
|
||
|
isgap(refi) || (refpos += 1)
|
||
|
isgap(seqi) || (seqpos += 1)
|
||
|
|
||
|
# Check for deletions
|
||
|
if isgap(seqi)
|
||
|
iszero(n_gaps) && (markpos = refpos)
|
||
|
n_gaps += 1
|
||
|
elseif !iszero(n_gaps)
|
||
|
push!(edits, Edit{T,E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
||
|
n_gaps = 0
|
||
|
end
|
||
|
|
||
|
# Check for insertions
|
||
|
if isgap(refi)
|
||
|
iszero(n_ins) && (markpos = refpos + 1)
|
||
|
push!(insertion_buffer, seqi)
|
||
|
n_ins += 1
|
||
|
elseif !iszero(n_ins)
|
||
|
seq = T(insertion_buffer)
|
||
|
push!(edits, Edit{T,E}(Insertion(seq), UInt(markpos)))
|
||
|
empty!(insertion_buffer)
|
||
|
n_ins = 0
|
||
|
end
|
||
|
|
||
|
# Substitutions
|
||
|
if !isgap(refi) && !isgap(seqi) && seqi != refi
|
||
|
push!(edits, Edit{T,E}(Substitution{E}(seqi), UInt(refpos)))
|
||
|
end
|
||
|
end
|
||
|
|
||
|
# Check for clips at the end of the alignment
|
||
|
last_anchors = aln.a.aln.anchors[(end - 1):end]
|
||
|
|
||
|
# Final indel, if applicable
|
||
|
if !any(anchor -> anchor.op == OP_SOFT_CLIP, last_anchors)
|
||
|
if !iszero(n_gaps)
|
||
|
push!(edits, Edit{T,E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
||
|
elseif !iszero(n_ins)
|
||
|
push!(edits, Edit{T,E}(Insertion(T(insertion_buffer)), UInt(markpos)))
|
||
|
end
|
||
|
end
|
||
|
|
||
|
return Variant(ref, edits)
|
||
|
end
|
||
|
|
||
|
edits(v::Variant) = v.edits
|
||
|
reference(v::Variant) = v.ref
|
||
|
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits
|
||
|
|
||
|
function reconstruct!(seq::S, x::Variant{S}) where {S}
|
||
|
len = length(x.ref) + sum(edit -> lendiff(edit), x.edits)
|
||
|
resize!(seq, len % UInt)
|
||
|
refpos = seqpos = 1
|
||
|
for edit in x.edits
|
||
|
while refpos < edit.pos
|
||
|
seq[seqpos] = x.ref[refpos]
|
||
|
refpos += 1
|
||
|
seqpos += 1
|
||
|
end
|
||
|
editx = edit.x
|
||
|
if editx isa Substitution
|
||
|
seq[seqpos] = editx.x
|
||
|
seqpos += 1
|
||
|
refpos += 1
|
||
|
elseif editx isa Deletion
|
||
|
refpos += editx.len
|
||
|
elseif editx isa Insertion
|
||
|
for i in editx.x
|
||
|
seq[seqpos] = i
|
||
|
seqpos += 1
|
||
|
end
|
||
|
end
|
||
|
end
|
||
|
while seqpos ≤ length(seq)
|
||
|
seq[seqpos] = x.ref[refpos]
|
||
|
refpos += 1
|
||
|
seqpos += 1
|
||
|
end
|
||
|
return seq
|
||
|
end
|