mirror of
https://github.com/MillironX/SequenceVariation.jl.git
synced 2025-01-08 14:02:08 -05:00
Move Variant-related code to Variant.jl
This commit is contained in:
parent
3f631ace8f
commit
37965ac7eb
2 changed files with 162 additions and 159 deletions
|
@ -32,165 +32,7 @@ struct Unsafe end
|
|||
struct Inapplicable end
|
||||
|
||||
include("Edit.jl")
|
||||
|
||||
# Edits are applied sequentially from first to last pos.
|
||||
# The vector must always be sorted by pos.
|
||||
struct Variant{S <: BioSequence, T <: BioSymbol}
|
||||
ref::S
|
||||
edits::Vector{Edit{S, T}}
|
||||
|
||||
Variant{S, T}(ref::S, edits::Vector{Edit{S, T}}, ::Unsafe) where {S, T} = new(ref, edits)
|
||||
end
|
||||
|
||||
function Variant{S,T}(ref::S, edits::Vector{Edit{S, T}}) where {S<:BioSequence, T<:BioSymbol}
|
||||
sort!(edits, by=x -> x.pos)
|
||||
result = Variant{S, T}(ref, edits, Unsafe())
|
||||
is_valid(result) || error("TODO") # report what kind of error message?
|
||||
return result
|
||||
end
|
||||
|
||||
function Variant(ref::S, edits::Vector{Edit{S, T}}) where {S<:BioSequence, T<:BioSymbol}
|
||||
Variant{S, T}(ref, edits)
|
||||
end
|
||||
|
||||
|
||||
|
||||
function Base.show(io::IO, x::Variant)
|
||||
n = length(x.edits)
|
||||
print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):")
|
||||
for i in x.edits
|
||||
v = Variation(x.ref, i)
|
||||
print(io, "\n ")
|
||||
show(io, v)
|
||||
end
|
||||
end
|
||||
|
||||
# Validate:
|
||||
# A sequence is invalid if any of its operations are out of bounds, or the same position
|
||||
# is affected by multiple edits.
|
||||
function is_valid(v::Variant)
|
||||
isempty(v.ref) && return false
|
||||
valid_positions = 1:length(v.ref)
|
||||
last_was_insert = false
|
||||
for edit in v.edits
|
||||
pos = edit.pos
|
||||
op = edit.x
|
||||
# Sanity check: for this to be a valid variant, it must be comprised of valid
|
||||
# variations
|
||||
is_valid(Variation(v.ref, edit)) || return false
|
||||
|
||||
# For substitutions we simply do not allow another modification of the same base
|
||||
if op isa Substitution
|
||||
pos in valid_positions || return false
|
||||
valid_positions = first(valid_positions) + 1 : last(valid_positions)
|
||||
last_was_insert = false
|
||||
# Insertions affect 0 reference bases, so it does not modify the valid positions
|
||||
# for next op. However, we cannot have two insertions at the same position, because
|
||||
# then the order of them is ambiguous
|
||||
elseif op isa Insertion
|
||||
pos in (first(valid_positions)-1+last_was_insert:last(valid_positions)+1) || return false
|
||||
last_was_insert = true
|
||||
# Deletions obviously invalidate the reference bases that are deleted.
|
||||
elseif op isa Deletion
|
||||
len = length(op)
|
||||
pos in (first(valid_positions):last(valid_positions)-len+1) || return false
|
||||
valid_positions = first(valid_positions) + len : last(valid_positions)
|
||||
last_was_insert = false
|
||||
end
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
function Variant(aln::PairwiseAlignment{T, T}) where {T <: LongSequence{<:Union{BS.AminoAcidAlphabet, BS.NucleicAcidAlphabet}}}
|
||||
ref = aln.b
|
||||
E = eltype(typeof(ref))
|
||||
edits = Edit{T, E}[]
|
||||
refpos = first(aln.a.aln.anchors).refpos
|
||||
seqpos = first(aln.a.aln.anchors).seqpos
|
||||
markpos = 0
|
||||
n_gaps = n_ins = 0
|
||||
insertion_buffer = E[]
|
||||
for (seqi, refi) in aln
|
||||
isgap(refi) || (refpos += 1)
|
||||
isgap(seqi) || (seqpos += 1)
|
||||
|
||||
# Check for deletions
|
||||
if isgap(seqi)
|
||||
iszero(n_gaps) && (markpos = refpos)
|
||||
n_gaps += 1
|
||||
elseif !iszero(n_gaps)
|
||||
push!(edits, Edit{T, E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
||||
n_gaps = 0
|
||||
end
|
||||
|
||||
# Check for insertions
|
||||
if isgap(refi)
|
||||
iszero(n_ins) && (markpos = refpos + 1)
|
||||
push!(insertion_buffer, seqi)
|
||||
n_ins += 1
|
||||
elseif !iszero(n_ins)
|
||||
seq = T(insertion_buffer)
|
||||
push!(edits, Edit{T, E}(Insertion(seq), UInt(markpos)))
|
||||
empty!(insertion_buffer)
|
||||
n_ins = 0
|
||||
end
|
||||
|
||||
# Substitutions
|
||||
if !isgap(refi) && !isgap(seqi) && seqi != refi
|
||||
push!(edits, Edit{T, E}(Substitution{E}(seqi), UInt(refpos)))
|
||||
end
|
||||
end
|
||||
|
||||
# Check for clips at the end of the alignment
|
||||
last_anchors = aln.a.aln.anchors[end-1:end]
|
||||
|
||||
# Final indel, if applicable
|
||||
if !any(anchor -> anchor.op == OP_SOFT_CLIP, last_anchors)
|
||||
if !iszero(n_gaps)
|
||||
push!(edits, Edit{T, E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
||||
elseif !iszero(n_ins)
|
||||
push!(edits, Edit{T, E}(Insertion(T(insertion_buffer)), UInt(markpos)))
|
||||
end
|
||||
end
|
||||
|
||||
return Variant(ref, edits)
|
||||
end
|
||||
|
||||
edits(v::Variant) = v.edits
|
||||
reference(v::Variant) = v.ref
|
||||
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits
|
||||
|
||||
function reconstruct!(seq::S, x::Variant{S}) where S
|
||||
len = length(x.ref) + sum(edit -> lendiff(edit), x.edits)
|
||||
resize!(seq, len % UInt)
|
||||
refpos = seqpos = 1
|
||||
for edit in x.edits
|
||||
while refpos < edit.pos
|
||||
seq[seqpos] = x.ref[refpos]
|
||||
refpos += 1
|
||||
seqpos += 1
|
||||
end
|
||||
editx = edit.x
|
||||
if editx isa Substitution
|
||||
seq[seqpos] = editx.x
|
||||
seqpos += 1
|
||||
refpos += 1
|
||||
elseif editx isa Deletion
|
||||
refpos += editx.len
|
||||
elseif editx isa Insertion
|
||||
for i in editx.x
|
||||
seq[seqpos] = i
|
||||
seqpos += 1
|
||||
end
|
||||
end
|
||||
end
|
||||
while seqpos ≤ length(seq)
|
||||
seq[seqpos] = x.ref[refpos]
|
||||
refpos += 1
|
||||
seqpos += 1
|
||||
end
|
||||
seq
|
||||
end
|
||||
include("Variant.jl")
|
||||
|
||||
struct Variation{S <: BioSequence, T <: BioSymbol}
|
||||
ref::S
|
||||
|
|
161
src/Variant.jl
Normal file
161
src/Variant.jl
Normal file
|
@ -0,0 +1,161 @@
|
|||
# Edits are applied sequentially from first to last pos.
|
||||
# The vector must always be sorted by pos.
|
||||
struct Variant{S<:BioSequence,T<:BioSymbol}
|
||||
ref::S
|
||||
edits::Vector{Edit{S,T}}
|
||||
|
||||
Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits)
|
||||
end
|
||||
|
||||
function Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
sort!(edits; by=x -> x.pos)
|
||||
result = Variant{S,T}(ref, edits, Unsafe())
|
||||
is_valid(result) || error("TODO") # report what kind of error message?
|
||||
return result
|
||||
end
|
||||
|
||||
function Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
return Variant{S,T}(ref, edits)
|
||||
end
|
||||
|
||||
function Base.show(io::IO, x::Variant)
|
||||
n = length(x.edits)
|
||||
print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):")
|
||||
for i in x.edits
|
||||
v = Variation(x.ref, i)
|
||||
print(io, "\n ")
|
||||
show(io, v)
|
||||
end
|
||||
end
|
||||
|
||||
# Validate:
|
||||
# A sequence is invalid if any of its operations are out of bounds, or the same position
|
||||
# is affected by multiple edits.
|
||||
function is_valid(v::Variant)
|
||||
isempty(v.ref) && return false
|
||||
valid_positions = 1:length(v.ref)
|
||||
last_was_insert = false
|
||||
for edit in v.edits
|
||||
pos = edit.pos
|
||||
op = edit.x
|
||||
# Sanity check: for this to be a valid variant, it must be comprised of valid
|
||||
# variations
|
||||
is_valid(Variation(v.ref, edit)) || return false
|
||||
|
||||
# For substitutions we simply do not allow another modification of the same base
|
||||
if op isa Substitution
|
||||
pos in valid_positions || return false
|
||||
valid_positions = (first(valid_positions) + 1):last(valid_positions)
|
||||
last_was_insert = false
|
||||
# Insertions affect 0 reference bases, so it does not modify the valid positions
|
||||
# for next op. However, we cannot have two insertions at the same position, because
|
||||
# then the order of them is ambiguous
|
||||
elseif op isa Insertion
|
||||
pos in
|
||||
((first(valid_positions) - 1 + last_was_insert):(last(valid_positions) + 1)) ||
|
||||
return false
|
||||
last_was_insert = true
|
||||
# Deletions obviously invalidate the reference bases that are deleted.
|
||||
elseif op isa Deletion
|
||||
len = length(op)
|
||||
pos in (first(valid_positions):(last(valid_positions) - len + 1)) ||
|
||||
return false
|
||||
valid_positions = (first(valid_positions) + len):last(valid_positions)
|
||||
last_was_insert = false
|
||||
end
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
function Variant(
|
||||
aln::PairwiseAlignment{T,T}
|
||||
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
|
||||
ref = aln.b
|
||||
E = eltype(typeof(ref))
|
||||
edits = Edit{T,E}[]
|
||||
refpos = first(aln.a.aln.anchors).refpos
|
||||
seqpos = first(aln.a.aln.anchors).seqpos
|
||||
markpos = 0
|
||||
n_gaps = n_ins = 0
|
||||
insertion_buffer = E[]
|
||||
for (seqi, refi) in aln
|
||||
isgap(refi) || (refpos += 1)
|
||||
isgap(seqi) || (seqpos += 1)
|
||||
|
||||
# Check for deletions
|
||||
if isgap(seqi)
|
||||
iszero(n_gaps) && (markpos = refpos)
|
||||
n_gaps += 1
|
||||
elseif !iszero(n_gaps)
|
||||
push!(edits, Edit{T,E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
||||
n_gaps = 0
|
||||
end
|
||||
|
||||
# Check for insertions
|
||||
if isgap(refi)
|
||||
iszero(n_ins) && (markpos = refpos + 1)
|
||||
push!(insertion_buffer, seqi)
|
||||
n_ins += 1
|
||||
elseif !iszero(n_ins)
|
||||
seq = T(insertion_buffer)
|
||||
push!(edits, Edit{T,E}(Insertion(seq), UInt(markpos)))
|
||||
empty!(insertion_buffer)
|
||||
n_ins = 0
|
||||
end
|
||||
|
||||
# Substitutions
|
||||
if !isgap(refi) && !isgap(seqi) && seqi != refi
|
||||
push!(edits, Edit{T,E}(Substitution{E}(seqi), UInt(refpos)))
|
||||
end
|
||||
end
|
||||
|
||||
# Check for clips at the end of the alignment
|
||||
last_anchors = aln.a.aln.anchors[(end - 1):end]
|
||||
|
||||
# Final indel, if applicable
|
||||
if !any(anchor -> anchor.op == OP_SOFT_CLIP, last_anchors)
|
||||
if !iszero(n_gaps)
|
||||
push!(edits, Edit{T,E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
||||
elseif !iszero(n_ins)
|
||||
push!(edits, Edit{T,E}(Insertion(T(insertion_buffer)), UInt(markpos)))
|
||||
end
|
||||
end
|
||||
|
||||
return Variant(ref, edits)
|
||||
end
|
||||
|
||||
edits(v::Variant) = v.edits
|
||||
reference(v::Variant) = v.ref
|
||||
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits
|
||||
|
||||
function reconstruct!(seq::S, x::Variant{S}) where {S}
|
||||
len = length(x.ref) + sum(edit -> lendiff(edit), x.edits)
|
||||
resize!(seq, len % UInt)
|
||||
refpos = seqpos = 1
|
||||
for edit in x.edits
|
||||
while refpos < edit.pos
|
||||
seq[seqpos] = x.ref[refpos]
|
||||
refpos += 1
|
||||
seqpos += 1
|
||||
end
|
||||
editx = edit.x
|
||||
if editx isa Substitution
|
||||
seq[seqpos] = editx.x
|
||||
seqpos += 1
|
||||
refpos += 1
|
||||
elseif editx isa Deletion
|
||||
refpos += editx.len
|
||||
elseif editx isa Insertion
|
||||
for i in editx.x
|
||||
seq[seqpos] = i
|
||||
seqpos += 1
|
||||
end
|
||||
end
|
||||
end
|
||||
while seqpos ≤ length(seq)
|
||||
seq[seqpos] = x.ref[refpos]
|
||||
refpos += 1
|
||||
seqpos += 1
|
||||
end
|
||||
return seq
|
||||
end
|
Loading…
Reference in a new issue