mirror of
https://github.com/MillironX/SequenceVariation.jl.git
synced 2024-11-22 05:19:55 +00:00
Move Variant-related code to Variant.jl
This commit is contained in:
parent
3f631ace8f
commit
37965ac7eb
2 changed files with 162 additions and 159 deletions
|
@ -32,165 +32,7 @@ struct Unsafe end
|
||||||
struct Inapplicable end
|
struct Inapplicable end
|
||||||
|
|
||||||
include("Edit.jl")
|
include("Edit.jl")
|
||||||
|
include("Variant.jl")
|
||||||
# Edits are applied sequentially from first to last pos.
|
|
||||||
# The vector must always be sorted by pos.
|
|
||||||
struct Variant{S <: BioSequence, T <: BioSymbol}
|
|
||||||
ref::S
|
|
||||||
edits::Vector{Edit{S, T}}
|
|
||||||
|
|
||||||
Variant{S, T}(ref::S, edits::Vector{Edit{S, T}}, ::Unsafe) where {S, T} = new(ref, edits)
|
|
||||||
end
|
|
||||||
|
|
||||||
function Variant{S,T}(ref::S, edits::Vector{Edit{S, T}}) where {S<:BioSequence, T<:BioSymbol}
|
|
||||||
sort!(edits, by=x -> x.pos)
|
|
||||||
result = Variant{S, T}(ref, edits, Unsafe())
|
|
||||||
is_valid(result) || error("TODO") # report what kind of error message?
|
|
||||||
return result
|
|
||||||
end
|
|
||||||
|
|
||||||
function Variant(ref::S, edits::Vector{Edit{S, T}}) where {S<:BioSequence, T<:BioSymbol}
|
|
||||||
Variant{S, T}(ref, edits)
|
|
||||||
end
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
function Base.show(io::IO, x::Variant)
|
|
||||||
n = length(x.edits)
|
|
||||||
print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):")
|
|
||||||
for i in x.edits
|
|
||||||
v = Variation(x.ref, i)
|
|
||||||
print(io, "\n ")
|
|
||||||
show(io, v)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
# Validate:
|
|
||||||
# A sequence is invalid if any of its operations are out of bounds, or the same position
|
|
||||||
# is affected by multiple edits.
|
|
||||||
function is_valid(v::Variant)
|
|
||||||
isempty(v.ref) && return false
|
|
||||||
valid_positions = 1:length(v.ref)
|
|
||||||
last_was_insert = false
|
|
||||||
for edit in v.edits
|
|
||||||
pos = edit.pos
|
|
||||||
op = edit.x
|
|
||||||
# Sanity check: for this to be a valid variant, it must be comprised of valid
|
|
||||||
# variations
|
|
||||||
is_valid(Variation(v.ref, edit)) || return false
|
|
||||||
|
|
||||||
# For substitutions we simply do not allow another modification of the same base
|
|
||||||
if op isa Substitution
|
|
||||||
pos in valid_positions || return false
|
|
||||||
valid_positions = first(valid_positions) + 1 : last(valid_positions)
|
|
||||||
last_was_insert = false
|
|
||||||
# Insertions affect 0 reference bases, so it does not modify the valid positions
|
|
||||||
# for next op. However, we cannot have two insertions at the same position, because
|
|
||||||
# then the order of them is ambiguous
|
|
||||||
elseif op isa Insertion
|
|
||||||
pos in (first(valid_positions)-1+last_was_insert:last(valid_positions)+1) || return false
|
|
||||||
last_was_insert = true
|
|
||||||
# Deletions obviously invalidate the reference bases that are deleted.
|
|
||||||
elseif op isa Deletion
|
|
||||||
len = length(op)
|
|
||||||
pos in (first(valid_positions):last(valid_positions)-len+1) || return false
|
|
||||||
valid_positions = first(valid_positions) + len : last(valid_positions)
|
|
||||||
last_was_insert = false
|
|
||||||
end
|
|
||||||
end
|
|
||||||
return true
|
|
||||||
end
|
|
||||||
|
|
||||||
function Variant(aln::PairwiseAlignment{T, T}) where {T <: LongSequence{<:Union{BS.AminoAcidAlphabet, BS.NucleicAcidAlphabet}}}
|
|
||||||
ref = aln.b
|
|
||||||
E = eltype(typeof(ref))
|
|
||||||
edits = Edit{T, E}[]
|
|
||||||
refpos = first(aln.a.aln.anchors).refpos
|
|
||||||
seqpos = first(aln.a.aln.anchors).seqpos
|
|
||||||
markpos = 0
|
|
||||||
n_gaps = n_ins = 0
|
|
||||||
insertion_buffer = E[]
|
|
||||||
for (seqi, refi) in aln
|
|
||||||
isgap(refi) || (refpos += 1)
|
|
||||||
isgap(seqi) || (seqpos += 1)
|
|
||||||
|
|
||||||
# Check for deletions
|
|
||||||
if isgap(seqi)
|
|
||||||
iszero(n_gaps) && (markpos = refpos)
|
|
||||||
n_gaps += 1
|
|
||||||
elseif !iszero(n_gaps)
|
|
||||||
push!(edits, Edit{T, E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
|
||||||
n_gaps = 0
|
|
||||||
end
|
|
||||||
|
|
||||||
# Check for insertions
|
|
||||||
if isgap(refi)
|
|
||||||
iszero(n_ins) && (markpos = refpos + 1)
|
|
||||||
push!(insertion_buffer, seqi)
|
|
||||||
n_ins += 1
|
|
||||||
elseif !iszero(n_ins)
|
|
||||||
seq = T(insertion_buffer)
|
|
||||||
push!(edits, Edit{T, E}(Insertion(seq), UInt(markpos)))
|
|
||||||
empty!(insertion_buffer)
|
|
||||||
n_ins = 0
|
|
||||||
end
|
|
||||||
|
|
||||||
# Substitutions
|
|
||||||
if !isgap(refi) && !isgap(seqi) && seqi != refi
|
|
||||||
push!(edits, Edit{T, E}(Substitution{E}(seqi), UInt(refpos)))
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
# Check for clips at the end of the alignment
|
|
||||||
last_anchors = aln.a.aln.anchors[end-1:end]
|
|
||||||
|
|
||||||
# Final indel, if applicable
|
|
||||||
if !any(anchor -> anchor.op == OP_SOFT_CLIP, last_anchors)
|
|
||||||
if !iszero(n_gaps)
|
|
||||||
push!(edits, Edit{T, E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
|
||||||
elseif !iszero(n_ins)
|
|
||||||
push!(edits, Edit{T, E}(Insertion(T(insertion_buffer)), UInt(markpos)))
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
return Variant(ref, edits)
|
|
||||||
end
|
|
||||||
|
|
||||||
edits(v::Variant) = v.edits
|
|
||||||
reference(v::Variant) = v.ref
|
|
||||||
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits
|
|
||||||
|
|
||||||
function reconstruct!(seq::S, x::Variant{S}) where S
|
|
||||||
len = length(x.ref) + sum(edit -> lendiff(edit), x.edits)
|
|
||||||
resize!(seq, len % UInt)
|
|
||||||
refpos = seqpos = 1
|
|
||||||
for edit in x.edits
|
|
||||||
while refpos < edit.pos
|
|
||||||
seq[seqpos] = x.ref[refpos]
|
|
||||||
refpos += 1
|
|
||||||
seqpos += 1
|
|
||||||
end
|
|
||||||
editx = edit.x
|
|
||||||
if editx isa Substitution
|
|
||||||
seq[seqpos] = editx.x
|
|
||||||
seqpos += 1
|
|
||||||
refpos += 1
|
|
||||||
elseif editx isa Deletion
|
|
||||||
refpos += editx.len
|
|
||||||
elseif editx isa Insertion
|
|
||||||
for i in editx.x
|
|
||||||
seq[seqpos] = i
|
|
||||||
seqpos += 1
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
while seqpos ≤ length(seq)
|
|
||||||
seq[seqpos] = x.ref[refpos]
|
|
||||||
refpos += 1
|
|
||||||
seqpos += 1
|
|
||||||
end
|
|
||||||
seq
|
|
||||||
end
|
|
||||||
|
|
||||||
struct Variation{S <: BioSequence, T <: BioSymbol}
|
struct Variation{S <: BioSequence, T <: BioSymbol}
|
||||||
ref::S
|
ref::S
|
||||||
|
|
161
src/Variant.jl
Normal file
161
src/Variant.jl
Normal file
|
@ -0,0 +1,161 @@
|
||||||
|
# Edits are applied sequentially from first to last pos.
|
||||||
|
# The vector must always be sorted by pos.
|
||||||
|
struct Variant{S<:BioSequence,T<:BioSymbol}
|
||||||
|
ref::S
|
||||||
|
edits::Vector{Edit{S,T}}
|
||||||
|
|
||||||
|
Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits)
|
||||||
|
end
|
||||||
|
|
||||||
|
function Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||||
|
sort!(edits; by=x -> x.pos)
|
||||||
|
result = Variant{S,T}(ref, edits, Unsafe())
|
||||||
|
is_valid(result) || error("TODO") # report what kind of error message?
|
||||||
|
return result
|
||||||
|
end
|
||||||
|
|
||||||
|
function Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||||
|
return Variant{S,T}(ref, edits)
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.show(io::IO, x::Variant)
|
||||||
|
n = length(x.edits)
|
||||||
|
print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):")
|
||||||
|
for i in x.edits
|
||||||
|
v = Variation(x.ref, i)
|
||||||
|
print(io, "\n ")
|
||||||
|
show(io, v)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Validate:
|
||||||
|
# A sequence is invalid if any of its operations are out of bounds, or the same position
|
||||||
|
# is affected by multiple edits.
|
||||||
|
function is_valid(v::Variant)
|
||||||
|
isempty(v.ref) && return false
|
||||||
|
valid_positions = 1:length(v.ref)
|
||||||
|
last_was_insert = false
|
||||||
|
for edit in v.edits
|
||||||
|
pos = edit.pos
|
||||||
|
op = edit.x
|
||||||
|
# Sanity check: for this to be a valid variant, it must be comprised of valid
|
||||||
|
# variations
|
||||||
|
is_valid(Variation(v.ref, edit)) || return false
|
||||||
|
|
||||||
|
# For substitutions we simply do not allow another modification of the same base
|
||||||
|
if op isa Substitution
|
||||||
|
pos in valid_positions || return false
|
||||||
|
valid_positions = (first(valid_positions) + 1):last(valid_positions)
|
||||||
|
last_was_insert = false
|
||||||
|
# Insertions affect 0 reference bases, so it does not modify the valid positions
|
||||||
|
# for next op. However, we cannot have two insertions at the same position, because
|
||||||
|
# then the order of them is ambiguous
|
||||||
|
elseif op isa Insertion
|
||||||
|
pos in
|
||||||
|
((first(valid_positions) - 1 + last_was_insert):(last(valid_positions) + 1)) ||
|
||||||
|
return false
|
||||||
|
last_was_insert = true
|
||||||
|
# Deletions obviously invalidate the reference bases that are deleted.
|
||||||
|
elseif op isa Deletion
|
||||||
|
len = length(op)
|
||||||
|
pos in (first(valid_positions):(last(valid_positions) - len + 1)) ||
|
||||||
|
return false
|
||||||
|
valid_positions = (first(valid_positions) + len):last(valid_positions)
|
||||||
|
last_was_insert = false
|
||||||
|
end
|
||||||
|
end
|
||||||
|
return true
|
||||||
|
end
|
||||||
|
|
||||||
|
function Variant(
|
||||||
|
aln::PairwiseAlignment{T,T}
|
||||||
|
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
|
||||||
|
ref = aln.b
|
||||||
|
E = eltype(typeof(ref))
|
||||||
|
edits = Edit{T,E}[]
|
||||||
|
refpos = first(aln.a.aln.anchors).refpos
|
||||||
|
seqpos = first(aln.a.aln.anchors).seqpos
|
||||||
|
markpos = 0
|
||||||
|
n_gaps = n_ins = 0
|
||||||
|
insertion_buffer = E[]
|
||||||
|
for (seqi, refi) in aln
|
||||||
|
isgap(refi) || (refpos += 1)
|
||||||
|
isgap(seqi) || (seqpos += 1)
|
||||||
|
|
||||||
|
# Check for deletions
|
||||||
|
if isgap(seqi)
|
||||||
|
iszero(n_gaps) && (markpos = refpos)
|
||||||
|
n_gaps += 1
|
||||||
|
elseif !iszero(n_gaps)
|
||||||
|
push!(edits, Edit{T,E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
||||||
|
n_gaps = 0
|
||||||
|
end
|
||||||
|
|
||||||
|
# Check for insertions
|
||||||
|
if isgap(refi)
|
||||||
|
iszero(n_ins) && (markpos = refpos + 1)
|
||||||
|
push!(insertion_buffer, seqi)
|
||||||
|
n_ins += 1
|
||||||
|
elseif !iszero(n_ins)
|
||||||
|
seq = T(insertion_buffer)
|
||||||
|
push!(edits, Edit{T,E}(Insertion(seq), UInt(markpos)))
|
||||||
|
empty!(insertion_buffer)
|
||||||
|
n_ins = 0
|
||||||
|
end
|
||||||
|
|
||||||
|
# Substitutions
|
||||||
|
if !isgap(refi) && !isgap(seqi) && seqi != refi
|
||||||
|
push!(edits, Edit{T,E}(Substitution{E}(seqi), UInt(refpos)))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Check for clips at the end of the alignment
|
||||||
|
last_anchors = aln.a.aln.anchors[(end - 1):end]
|
||||||
|
|
||||||
|
# Final indel, if applicable
|
||||||
|
if !any(anchor -> anchor.op == OP_SOFT_CLIP, last_anchors)
|
||||||
|
if !iszero(n_gaps)
|
||||||
|
push!(edits, Edit{T,E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
||||||
|
elseif !iszero(n_ins)
|
||||||
|
push!(edits, Edit{T,E}(Insertion(T(insertion_buffer)), UInt(markpos)))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
return Variant(ref, edits)
|
||||||
|
end
|
||||||
|
|
||||||
|
edits(v::Variant) = v.edits
|
||||||
|
reference(v::Variant) = v.ref
|
||||||
|
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits
|
||||||
|
|
||||||
|
function reconstruct!(seq::S, x::Variant{S}) where {S}
|
||||||
|
len = length(x.ref) + sum(edit -> lendiff(edit), x.edits)
|
||||||
|
resize!(seq, len % UInt)
|
||||||
|
refpos = seqpos = 1
|
||||||
|
for edit in x.edits
|
||||||
|
while refpos < edit.pos
|
||||||
|
seq[seqpos] = x.ref[refpos]
|
||||||
|
refpos += 1
|
||||||
|
seqpos += 1
|
||||||
|
end
|
||||||
|
editx = edit.x
|
||||||
|
if editx isa Substitution
|
||||||
|
seq[seqpos] = editx.x
|
||||||
|
seqpos += 1
|
||||||
|
refpos += 1
|
||||||
|
elseif editx isa Deletion
|
||||||
|
refpos += editx.len
|
||||||
|
elseif editx isa Insertion
|
||||||
|
for i in editx.x
|
||||||
|
seq[seqpos] = i
|
||||||
|
seqpos += 1
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
while seqpos ≤ length(seq)
|
||||||
|
seq[seqpos] = x.ref[refpos]
|
||||||
|
refpos += 1
|
||||||
|
seqpos += 1
|
||||||
|
end
|
||||||
|
return seq
|
||||||
|
end
|
Loading…
Reference in a new issue