Move Variant-related code to Variant.jl

This commit is contained in:
Thomas A. Christensen II 2022-12-30 14:22:45 -06:00
parent 3f631ace8f
commit 37965ac7eb
2 changed files with 162 additions and 159 deletions

View file

@ -32,165 +32,7 @@ struct Unsafe end
struct Inapplicable end struct Inapplicable end
include("Edit.jl") include("Edit.jl")
include("Variant.jl")
# Edits are applied sequentially from first to last pos.
# The vector must always be sorted by pos.
struct Variant{S <: BioSequence, T <: BioSymbol}
ref::S
edits::Vector{Edit{S, T}}
Variant{S, T}(ref::S, edits::Vector{Edit{S, T}}, ::Unsafe) where {S, T} = new(ref, edits)
end
function Variant{S,T}(ref::S, edits::Vector{Edit{S, T}}) where {S<:BioSequence, T<:BioSymbol}
sort!(edits, by=x -> x.pos)
result = Variant{S, T}(ref, edits, Unsafe())
is_valid(result) || error("TODO") # report what kind of error message?
return result
end
function Variant(ref::S, edits::Vector{Edit{S, T}}) where {S<:BioSequence, T<:BioSymbol}
Variant{S, T}(ref, edits)
end
function Base.show(io::IO, x::Variant)
n = length(x.edits)
print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):")
for i in x.edits
v = Variation(x.ref, i)
print(io, "\n ")
show(io, v)
end
end
# Validate:
# A sequence is invalid if any of its operations are out of bounds, or the same position
# is affected by multiple edits.
function is_valid(v::Variant)
isempty(v.ref) && return false
valid_positions = 1:length(v.ref)
last_was_insert = false
for edit in v.edits
pos = edit.pos
op = edit.x
# Sanity check: for this to be a valid variant, it must be comprised of valid
# variations
is_valid(Variation(v.ref, edit)) || return false
# For substitutions we simply do not allow another modification of the same base
if op isa Substitution
pos in valid_positions || return false
valid_positions = first(valid_positions) + 1 : last(valid_positions)
last_was_insert = false
# Insertions affect 0 reference bases, so it does not modify the valid positions
# for next op. However, we cannot have two insertions at the same position, because
# then the order of them is ambiguous
elseif op isa Insertion
pos in (first(valid_positions)-1+last_was_insert:last(valid_positions)+1) || return false
last_was_insert = true
# Deletions obviously invalidate the reference bases that are deleted.
elseif op isa Deletion
len = length(op)
pos in (first(valid_positions):last(valid_positions)-len+1) || return false
valid_positions = first(valid_positions) + len : last(valid_positions)
last_was_insert = false
end
end
return true
end
function Variant(aln::PairwiseAlignment{T, T}) where {T <: LongSequence{<:Union{BS.AminoAcidAlphabet, BS.NucleicAcidAlphabet}}}
ref = aln.b
E = eltype(typeof(ref))
edits = Edit{T, E}[]
refpos = first(aln.a.aln.anchors).refpos
seqpos = first(aln.a.aln.anchors).seqpos
markpos = 0
n_gaps = n_ins = 0
insertion_buffer = E[]
for (seqi, refi) in aln
isgap(refi) || (refpos += 1)
isgap(seqi) || (seqpos += 1)
# Check for deletions
if isgap(seqi)
iszero(n_gaps) && (markpos = refpos)
n_gaps += 1
elseif !iszero(n_gaps)
push!(edits, Edit{T, E}(Deletion(UInt(n_gaps)), UInt(markpos)))
n_gaps = 0
end
# Check for insertions
if isgap(refi)
iszero(n_ins) && (markpos = refpos + 1)
push!(insertion_buffer, seqi)
n_ins += 1
elseif !iszero(n_ins)
seq = T(insertion_buffer)
push!(edits, Edit{T, E}(Insertion(seq), UInt(markpos)))
empty!(insertion_buffer)
n_ins = 0
end
# Substitutions
if !isgap(refi) && !isgap(seqi) && seqi != refi
push!(edits, Edit{T, E}(Substitution{E}(seqi), UInt(refpos)))
end
end
# Check for clips at the end of the alignment
last_anchors = aln.a.aln.anchors[end-1:end]
# Final indel, if applicable
if !any(anchor -> anchor.op == OP_SOFT_CLIP, last_anchors)
if !iszero(n_gaps)
push!(edits, Edit{T, E}(Deletion(UInt(n_gaps)), UInt(markpos)))
elseif !iszero(n_ins)
push!(edits, Edit{T, E}(Insertion(T(insertion_buffer)), UInt(markpos)))
end
end
return Variant(ref, edits)
end
edits(v::Variant) = v.edits
reference(v::Variant) = v.ref
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits
function reconstruct!(seq::S, x::Variant{S}) where S
len = length(x.ref) + sum(edit -> lendiff(edit), x.edits)
resize!(seq, len % UInt)
refpos = seqpos = 1
for edit in x.edits
while refpos < edit.pos
seq[seqpos] = x.ref[refpos]
refpos += 1
seqpos += 1
end
editx = edit.x
if editx isa Substitution
seq[seqpos] = editx.x
seqpos += 1
refpos += 1
elseif editx isa Deletion
refpos += editx.len
elseif editx isa Insertion
for i in editx.x
seq[seqpos] = i
seqpos += 1
end
end
end
while seqpos length(seq)
seq[seqpos] = x.ref[refpos]
refpos += 1
seqpos += 1
end
seq
end
struct Variation{S <: BioSequence, T <: BioSymbol} struct Variation{S <: BioSequence, T <: BioSymbol}
ref::S ref::S

161
src/Variant.jl Normal file
View file

@ -0,0 +1,161 @@
# Edits are applied sequentially from first to last pos.
# The vector must always be sorted by pos.
struct Variant{S<:BioSequence,T<:BioSymbol}
ref::S
edits::Vector{Edit{S,T}}
Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits)
end
function Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
sort!(edits; by=x -> x.pos)
result = Variant{S,T}(ref, edits, Unsafe())
is_valid(result) || error("TODO") # report what kind of error message?
return result
end
function Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
return Variant{S,T}(ref, edits)
end
function Base.show(io::IO, x::Variant)
n = length(x.edits)
print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):")
for i in x.edits
v = Variation(x.ref, i)
print(io, "\n ")
show(io, v)
end
end
# Validate:
# A sequence is invalid if any of its operations are out of bounds, or the same position
# is affected by multiple edits.
function is_valid(v::Variant)
isempty(v.ref) && return false
valid_positions = 1:length(v.ref)
last_was_insert = false
for edit in v.edits
pos = edit.pos
op = edit.x
# Sanity check: for this to be a valid variant, it must be comprised of valid
# variations
is_valid(Variation(v.ref, edit)) || return false
# For substitutions we simply do not allow another modification of the same base
if op isa Substitution
pos in valid_positions || return false
valid_positions = (first(valid_positions) + 1):last(valid_positions)
last_was_insert = false
# Insertions affect 0 reference bases, so it does not modify the valid positions
# for next op. However, we cannot have two insertions at the same position, because
# then the order of them is ambiguous
elseif op isa Insertion
pos in
((first(valid_positions) - 1 + last_was_insert):(last(valid_positions) + 1)) ||
return false
last_was_insert = true
# Deletions obviously invalidate the reference bases that are deleted.
elseif op isa Deletion
len = length(op)
pos in (first(valid_positions):(last(valid_positions) - len + 1)) ||
return false
valid_positions = (first(valid_positions) + len):last(valid_positions)
last_was_insert = false
end
end
return true
end
function Variant(
aln::PairwiseAlignment{T,T}
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
ref = aln.b
E = eltype(typeof(ref))
edits = Edit{T,E}[]
refpos = first(aln.a.aln.anchors).refpos
seqpos = first(aln.a.aln.anchors).seqpos
markpos = 0
n_gaps = n_ins = 0
insertion_buffer = E[]
for (seqi, refi) in aln
isgap(refi) || (refpos += 1)
isgap(seqi) || (seqpos += 1)
# Check for deletions
if isgap(seqi)
iszero(n_gaps) && (markpos = refpos)
n_gaps += 1
elseif !iszero(n_gaps)
push!(edits, Edit{T,E}(Deletion(UInt(n_gaps)), UInt(markpos)))
n_gaps = 0
end
# Check for insertions
if isgap(refi)
iszero(n_ins) && (markpos = refpos + 1)
push!(insertion_buffer, seqi)
n_ins += 1
elseif !iszero(n_ins)
seq = T(insertion_buffer)
push!(edits, Edit{T,E}(Insertion(seq), UInt(markpos)))
empty!(insertion_buffer)
n_ins = 0
end
# Substitutions
if !isgap(refi) && !isgap(seqi) && seqi != refi
push!(edits, Edit{T,E}(Substitution{E}(seqi), UInt(refpos)))
end
end
# Check for clips at the end of the alignment
last_anchors = aln.a.aln.anchors[(end - 1):end]
# Final indel, if applicable
if !any(anchor -> anchor.op == OP_SOFT_CLIP, last_anchors)
if !iszero(n_gaps)
push!(edits, Edit{T,E}(Deletion(UInt(n_gaps)), UInt(markpos)))
elseif !iszero(n_ins)
push!(edits, Edit{T,E}(Insertion(T(insertion_buffer)), UInt(markpos)))
end
end
return Variant(ref, edits)
end
edits(v::Variant) = v.edits
reference(v::Variant) = v.ref
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits
function reconstruct!(seq::S, x::Variant{S}) where {S}
len = length(x.ref) + sum(edit -> lendiff(edit), x.edits)
resize!(seq, len % UInt)
refpos = seqpos = 1
for edit in x.edits
while refpos < edit.pos
seq[seqpos] = x.ref[refpos]
refpos += 1
seqpos += 1
end
editx = edit.x
if editx isa Substitution
seq[seqpos] = editx.x
seqpos += 1
refpos += 1
elseif editx isa Deletion
refpos += editx.len
elseif editx isa Insertion
for i in editx.x
seq[seqpos] = i
seqpos += 1
end
end
end
while seqpos length(seq)
seq[seqpos] = x.ref[refpos]
refpos += 1
seqpos += 1
end
return seq
end