mirror of
https://github.com/MillironX/SequenceVariation.jl.git
synced 2024-11-22 13:29:56 +00:00
Start fixing
This commit is contained in:
parent
64ae3bc862
commit
40749eb6f5
1 changed files with 195 additions and 50 deletions
|
@ -26,46 +26,116 @@ using BioAlignments
|
||||||
import BioAlignments: OP_START, OP_SEQ_MATCH, OP_SEQ_MISMATCH, OP_INSERT, OP_DELETE
|
import BioAlignments: OP_START, OP_SEQ_MATCH, OP_SEQ_MISMATCH, OP_INSERT, OP_DELETE
|
||||||
import BioSymbols: BioSymbol
|
import BioSymbols: BioSymbol
|
||||||
|
|
||||||
|
const ALN_MODEL = AffineGapScoreModel(BLOSUM62, gap_open=-12, gap_extend=-2)
|
||||||
|
|
||||||
|
@sum_type Edit{S, T}
|
||||||
|
Substitution{S, T}(::T)
|
||||||
|
Deletion{S, T}(::UInt)
|
||||||
|
Insertion{S, T}(::S)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
Edit
|
||||||
|
|
||||||
|
Abstract type representing a type of nucleotide edit: Deletion, insertion or
|
||||||
|
substitution.
|
||||||
|
"""
|
||||||
abstract type Edit end
|
abstract type Edit end
|
||||||
|
|
||||||
|
"""
|
||||||
|
Substitution{T <: BioSymbol} <: Edit
|
||||||
|
|
||||||
|
Represents the presence of a `T` at a given position. The position is stored
|
||||||
|
outside this struct.
|
||||||
|
"""
|
||||||
struct Substitution{T <: BioSymbol} <: Edit
|
struct Substitution{T <: BioSymbol} <: Edit
|
||||||
symbol::T
|
symbol::T
|
||||||
end
|
end
|
||||||
|
|
||||||
|
Base.eltype(::Type{<:Substitution{T}}) where T = T
|
||||||
|
|
||||||
|
"""
|
||||||
|
Deletion <: Edit
|
||||||
|
|
||||||
|
Represents the deletion of N symbols. The location of the deletion is stored
|
||||||
|
outside this struct
|
||||||
|
"""
|
||||||
struct Deletion <: Edit
|
struct Deletion <: Edit
|
||||||
len::Int
|
len::UInt
|
||||||
end
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
Insertion{S <: BioSequence} <: Edit
|
||||||
|
|
||||||
|
Represents the insertion of a `T` into a sequence. The location of the insertion
|
||||||
|
is stored outside the struct.
|
||||||
|
"""
|
||||||
struct Insertion{S <: BioSequence} <: Edit
|
struct Insertion{S <: BioSequence} <: Edit
|
||||||
seq::S
|
seq::S
|
||||||
end
|
end
|
||||||
|
|
||||||
Base.:(==)(x::Insertion{A}, y::Insertion{A}) where A = x.seq == y.seq
|
Base.:(==)(x::Insertion{A}, y::Insertion{A}) where A = x.seq == y.seq
|
||||||
|
|
||||||
|
"""
|
||||||
|
Diff{E <: Edit}
|
||||||
|
|
||||||
|
Represents an `Edit` og type `E` at a given position.
|
||||||
|
"""
|
||||||
struct Diff{E <: Edit}
|
struct Diff{E <: Edit}
|
||||||
pos::Int
|
pos::UInt
|
||||||
edit::E
|
edit::E
|
||||||
end
|
end
|
||||||
|
|
||||||
struct Variant{S <: BioSequence, E <: Edit}
|
Diff(pos::Integer, edit::Edit) = Diff{typeof(edit)}(pos, edit)
|
||||||
|
|
||||||
|
"""
|
||||||
|
Variant{S <: BioSequence, D <: Diff}
|
||||||
|
|
||||||
|
Represents a series of diffs of type `D` against a reference of type `S`.
|
||||||
|
See also `Variation`
|
||||||
|
"""
|
||||||
|
struct Variant{S <: BioSequence, D <: Diff}
|
||||||
ref::S
|
ref::S
|
||||||
diffs::Vector{Diff{E}}
|
diffs::Vector{D}
|
||||||
end
|
end
|
||||||
|
|
||||||
struct Variation{S <: BioSequence, E <: Edit}
|
#function Variant(ref::BioSequence, diffs::Vector{D}) where D <: Diff
|
||||||
ref::S
|
# return Variant{typeof(ref), D}(ref, diffs)
|
||||||
diff::Diff{E}
|
#end
|
||||||
|
|
||||||
|
function Base.show(io::IO, ::MIME"text/plain", x::Variant{S,D}) where {S,D}
|
||||||
|
cols = displaysize(io)[2] - 3
|
||||||
|
recur_io = IOContext(io, :SHOWN_SET => x.diffs)
|
||||||
|
print(io, summary(x), ":")
|
||||||
|
for i in x.diffs
|
||||||
|
v = Variation{S, typeof(i)}(x.ref, i)
|
||||||
|
str = sprint(show, v, context=recur_io, sizehint=0)
|
||||||
|
print(io, "\n ", Base._truncate_at_width_or_chars(str, cols, "\r\n"))
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
###
|
#Variant(s::BioSequence, v::Vector{<:Diff}) = Variant{typeof(s),eltype(v)}(s,v)
|
||||||
|
|
||||||
|
"""
|
||||||
|
Variation{S <: BioSequence, D <: Diff}
|
||||||
|
|
||||||
|
Represent a single diff of type `D` is a sequence of type `S`. See also `Variant`
|
||||||
|
"""
|
||||||
|
struct Variation{S <: BioSequence, D <: Diff}
|
||||||
|
ref::S
|
||||||
|
diff::D
|
||||||
|
end
|
||||||
|
|
||||||
|
#Variation(ref::BioSequence, diff::Diff) = Variation{typeof(ref), typeof(diffs)}(ref, diff)
|
||||||
|
|
||||||
function check(v::Variation{S, <:Substitution{T}}) where {S, T}
|
function check(v::Variation{S, <:Substitution{T}}) where {S, T}
|
||||||
T == eltype(S) || throw(TypeError(:check, "", eltype(S), T))
|
T == eltype(S) || throw(TypeError(:check, "", eltype(S), T))
|
||||||
checkbounds(v.ref, v.diff.pos)
|
checkbounds(v.ref, v.diff.pos)
|
||||||
end
|
end
|
||||||
|
|
||||||
check(v::Variation{S, Deletion}) where S = checkbounds(v.ref, v.diff.pos:(v.diff.pos+v.diff.edit.len)-1)
|
check(v::Variation{S, Diff{Deletion}}) where S = checkbounds(v.ref, v.diff.pos:(v.diff.pos+v.diff.edit.len)-1)
|
||||||
|
|
||||||
function check(v::Variation{S, <:Insertion}) where S
|
function check(v::Variation{S, <:Diff{<:Insertion}}) where S
|
||||||
length(v.diff.edit.seq) > 0 || throw(ArgumentError("Insertions cannot be length 0"))
|
length(v.diff.edit.seq) > 0 || throw(ArgumentError("Insertions cannot be length 0"))
|
||||||
# We can have insertions at the very end, after the reference sequence
|
# We can have insertions at the very end, after the reference sequence
|
||||||
v.diff.pos == lastindex(v.ref) + 1 && return nothing
|
v.diff.pos == lastindex(v.ref) + 1 && return nothing
|
||||||
|
@ -76,27 +146,67 @@ Base.show(io::IO, x::Diff{<:Substitution}) = print(io, x.pos, x.edit.symbol)
|
||||||
Base.show(io::IO, x::Diff{Deletion}) = print(io, 'Δ', x.pos, '-', x.pos + x.edit.len - 1)
|
Base.show(io::IO, x::Diff{Deletion}) = print(io, 'Δ', x.pos, '-', x.pos + x.edit.len - 1)
|
||||||
Base.show(io::IO, x::Diff{<:Insertion}) = print(io, x.pos, x.edit.seq)
|
Base.show(io::IO, x::Diff{<:Insertion}) = print(io, x.pos, x.edit.seq)
|
||||||
|
|
||||||
function Base.show(io::IO, x::Variant{S, <:Substitution}) where S
|
function Base.show(io::IO, x::Variation{S, <:Diff{<:Substitution}}) where S
|
||||||
print(io, x.ref[x.diff.pos], x.diff.pos, x.diff.edit.symbol)
|
print(io, x.ref[x.diff.pos], x.diff.pos, x.diff.edit.symbol)
|
||||||
end
|
end
|
||||||
|
|
||||||
Base.show(io::IO, x::Variant{S, Deletion}) where S = show(io, x.diff)
|
Base.show(io::IO, x::Variation{S, Diff{Deletion}}) where S = show(io, x.diff)
|
||||||
Base.show(io::IO, x::Variant{S, <:Insertion} where S) = show(io, x.diff)
|
Base.show(io::IO, x::Variation{S, <:Diff{<:Insertion}} where S) = show(io, x.diff)
|
||||||
|
|
||||||
Base.:(==)(x::T, y::T) where {T <: Variant} = (x.ref === y.ref) & (x.diff == y.diff)
|
Base.:(==)(x::T, y::T) where {T <: Variation} = (x.ref === y.ref) & (x.diff == y.diff)
|
||||||
|
|
||||||
#################
|
|
||||||
|
|
||||||
#=
|
|
||||||
function variations(ref::S, refaln::S, seqaln::S) where {S <: BioSequence}
|
function variations(ref::S, refaln::S, seqaln::S) where {S <: BioSequence}
|
||||||
aln = AlignedSequence(seqaln, refaln)
|
aln = AlignedSequence(seqaln, refaln)
|
||||||
return variations(ref, refaln, seqaln, aln.aln.anchors)
|
return variations(ref, refaln, seqaln, aln.aln.anchors)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
"Calculate all substitutions between two sequences"
|
||||||
|
function substitutions(ref::S, seq::S) where {S <: BioSequence}
|
||||||
|
if length(ref) != length(seq)
|
||||||
|
throw(DimensionMismatch("Sequences must match in length"))
|
||||||
|
end
|
||||||
|
if !(count(isgap, ref) == count(isgap, seq) == 0)
|
||||||
|
throw(ArgumentError("Neither sequence can contain gaps"))
|
||||||
|
end
|
||||||
|
diffs = Diff{Substitution{eltype(S)}}[]
|
||||||
|
@inbounds for i in eachindex(ref)
|
||||||
|
refi, seqi = ref[i], seq[i]
|
||||||
|
if refi != seqi
|
||||||
|
push!(diffs, Diff(i, Substitution(seqi)))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
return Variant(ref, diffs)
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
function variant(ref::S, seq::S) where {S <: BioSequence}
|
||||||
|
aln = pairalign(GlobalAlignment(), seq, ref, ALN_MODEL).aln
|
||||||
|
diffs = Diff[]
|
||||||
|
result = Variant(ref, diffs)
|
||||||
|
refpos = 0
|
||||||
|
markpos = 0
|
||||||
|
n_gaps = n_ins = 0
|
||||||
|
for (seqi, refi) in aln
|
||||||
|
isgap(refi) || (refpos += 1)
|
||||||
|
|
||||||
|
# Check for deletions
|
||||||
|
if isgap(seqi)
|
||||||
|
iszero(n_gaps) && (markpos = refi)
|
||||||
|
n_gaps += 1
|
||||||
|
else
|
||||||
|
if !iszero(n_gaps)
|
||||||
|
push(diffs, Diff{Deletion}Deletion(n_gaps))
|
||||||
|
end
|
||||||
|
n_gaps = 0
|
||||||
|
end
|
||||||
|
end
|
||||||
|
#=
|
||||||
function variations(ref::S, refaln::S, seqaln::S, anchors::Vector{AlignmentAnchor}) where {S <: BioSequence{A}} where A
|
function variations(ref::S, refaln::S, seqaln::S, anchors::Vector{AlignmentAnchor}) where {S <: BioSequence{A}} where A
|
||||||
result = SeqVar{A}[]
|
result = Variant(dna"TAG", Diff[])
|
||||||
|
diffs = result.diffs
|
||||||
isempty(anchors) && return result
|
isempty(anchors) && return result
|
||||||
firstop = first(anchors)
|
firstop = first(anchors)
|
||||||
|
|
||||||
if (firstop.op !== OP_START) || (firstop.refpos != 0 | firstop.seqpos != 0)
|
if (firstop.op !== OP_START) || (firstop.refpos != 0 | firstop.seqpos != 0)
|
||||||
throw(ArgumentError("Alignment must begin with OP_START at positions (0, 0)"))
|
throw(ArgumentError("Alignment must begin with OP_START at positions (0, 0)"))
|
||||||
end
|
end
|
||||||
|
@ -110,18 +220,18 @@ function variations(ref::S, refaln::S, seqaln::S, anchors::Vector{AlignmentAncho
|
||||||
if anchor.op == OP_SEQ_MISMATCH
|
if anchor.op == OP_SEQ_MISMATCH
|
||||||
p = seqalnpos
|
p = seqalnpos
|
||||||
for pos in refpos:refend
|
for pos in refpos:refend
|
||||||
push!(result, SeqVar(ref, pos, Substitution{eltype(A)}(seqaln[p])))
|
push!(diffs, Diff(pos, Substitution{eltype(A)}(seqaln[p])))
|
||||||
p += 1
|
p += 1
|
||||||
end
|
end
|
||||||
|
|
||||||
# Deletions are trivial
|
# Deletions are trivial
|
||||||
elseif anchor.op == OP_DELETE
|
elseif anchor.op == OP_DELETE
|
||||||
len = refend - refpos + 1
|
len = refend - refpos + 1
|
||||||
push!(result, SeqVar(ref, refpos, Deletion(len)))
|
push!(diffs, Diff(refpos, Deletion(len)))
|
||||||
|
|
||||||
# Insertions are a little more tricky
|
# Insertions are a little more tricky
|
||||||
elseif anchor.op == OP_INSERT
|
elseif anchor.op == OP_INSERT
|
||||||
push!(result, SeqVar(ref, refend + 1, Insertion{A}(seqaln[seqalnpos:seqalnend])))
|
push!(diffs, Diff(refend + 1, Insertion{S}(seqaln[seqalnpos:seqalnend])))
|
||||||
|
|
||||||
elseif anchor.op == OP_SEQ_MATCH
|
elseif anchor.op == OP_SEQ_MATCH
|
||||||
nothing
|
nothing
|
||||||
|
@ -130,9 +240,11 @@ function variations(ref::S, refaln::S, seqaln::S, anchors::Vector{AlignmentAncho
|
||||||
end
|
end
|
||||||
refpos, seqpos, seqalnpos = refend + 1, seqend + 1, seqalnend + 1
|
refpos, seqpos, seqalnpos = refend + 1, seqend + 1, seqalnend + 1
|
||||||
end
|
end
|
||||||
|
|
||||||
return result
|
return result
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
function posmap(gap, oldref::BioSequence, newref::BioSequence)
|
function posmap(gap, oldref::BioSequence, newref::BioSequence)
|
||||||
if length(oldref) != length(newref)
|
if length(oldref) != length(newref)
|
||||||
throw(ArgumentError("Sequences must be equal-length and aligned"))
|
throw(ArgumentError("Sequences must be equal-length and aligned"))
|
||||||
|
@ -156,51 +268,84 @@ If `o` maps to a gap in the new sequence, `v[o] === nothing`.
|
||||||
"""
|
"""
|
||||||
posmap(oldref::S, newref::S) where {A, S <: BioSequence{A}} = posmap(gap(eltype(A)), oldref, newref)
|
posmap(oldref::S, newref::S) where {A, S <: BioSequence{A}} = posmap(gap(eltype(A)), oldref, newref)
|
||||||
|
|
||||||
function rereference(var::SeqVar{A, S}, posmap, ref::LongSequence{A}) where {A, S <: Substitution}
|
|
||||||
newpos = posmap[var.pos]
|
function rereference(diff::Diff{<:Substitution}, posmap)
|
||||||
newpos === nothing && throw(ArgumentError("Position $(var.pos) maps to a gap in reference"))
|
newpos = posmap[diff.pos]
|
||||||
return SeqVar{A, S}(ref, newpos::Int, var.var)
|
newpos === nothing && throw(ArgumentError("Position $(diff.pos) maps to a gap in reference"))
|
||||||
|
return typeof(diff)(newpos::Int, diff.edit)
|
||||||
end
|
end
|
||||||
|
|
||||||
function checkflanks(posmap, varpos::Int, flank::Int)
|
|
||||||
for i in max(1, varpos - flank) : min(length(posmap), varpos + flank)
|
function checkflanks(posmap, pos::Int, flank::Int)
|
||||||
posmap[i] === nothing && throw(ArgumentError("Position $i maps to a deletion"))
|
for i in max(1, pos - flank) : min(length(posmap), pos + flank)
|
||||||
|
posmap[i] === nothing && throw(ArgumentError("Flanking position $i maps to a deletion"))
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
function rereference(var::SeqVar{A, Deletion}, posmap, ref::LongSequence{A}, flank::Int=5) where A
|
function rereference(diff::Diff{Deletion}, posmap, flank::Int=5)
|
||||||
checkbounds(posmap, var.pos:var.pos + var.var.len - 1)
|
checkbounds(posmap, diff.pos:diff.pos + diff.edit.len - 1)
|
||||||
checkflanks(posmap, var.pos, flank)
|
checkflanks(posmap, diff.pos, flank)
|
||||||
return SeqVar{A, Deletion}(ref, posmap[var.pos], var.var)
|
return typeof(diff)(posmap[diff.pos], diff.edit.len)
|
||||||
end
|
end
|
||||||
|
|
||||||
function rereference(var::SeqVar{A, Insertion{A}}, posmap, ref::LongSequence{A}, flank::Int=5) where A
|
function rereference(diff::Diff{<:Insertion}, posmap, flank::Int=5)
|
||||||
checkflanks(posmap, var.pos, flank)
|
checkflanks(posmap, diff.pos, flank)
|
||||||
return SeqVar{A, Insertion{A}}(ref, posmap[var.pos], var.var)
|
return typeof(diff)(posmap[diff.pos], diff.edit)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
# Assumes same reference, and that they are not mutally exclusive
|
# Assumes same reference, and that they are not mutally exclusive
|
||||||
# (e.g no substitution in deleted areas)
|
# (e.g no substitution in deleted areas)
|
||||||
#=
|
function reconstruct(v::Variant)
|
||||||
function reconstruct(v::Vector{<:SeqVar{A}}) where A
|
isempty(v.diffs) && return copy(v.ref)
|
||||||
isempty(v) && throw(ArgumentError("Need at least one Variation to reconstruct sequence"))
|
diffs = sort(v.diffs, by=x -> x.pos)
|
||||||
srt = sort(v, by=x -> x.pos)
|
|
||||||
len = length(v[1].ref)
|
# First, get length of result
|
||||||
for i in srt
|
len::Int = length(v.ref)
|
||||||
if i isa SeqVar{A, <:Deletion}
|
for diff in diffs
|
||||||
len -= i.var.len
|
if diff.edit isa Deletion
|
||||||
elseif i isa SeqVar{A, <:Insertion}
|
len = len - diff.edit.len
|
||||||
len += length(i.var.seq)
|
elseif diff.edit isa Insertion
|
||||||
|
len = len + length(diff.edit.seq)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Fill in
|
||||||
ref = copy(v[1].ref)
|
dst = typeof(v.ref)(len)
|
||||||
oldpos, newpos = 1, 1
|
src, diffi, srci::Int, dsti = v.ref, 1, 1, 1
|
||||||
for i in srt
|
diff = diffs[diffi]
|
||||||
=#
|
while dsti ≤ len
|
||||||
|
while (srci < diff.pos) & (dsti ≤ len)
|
||||||
|
dst[dsti] = src[srci]
|
||||||
|
srci, dsti = srci + 1, dsti + 1
|
||||||
|
end
|
||||||
|
diff === nothing && break
|
||||||
|
if diff.edit isa Substitution
|
||||||
|
dst[dsti] = diff.edit.symbol
|
||||||
|
srci, dsti = srci + 1, dsti + 1
|
||||||
|
elseif diff.edit isa Insertion
|
||||||
|
for i in 1:(length(diff.edit.seq)::Int)
|
||||||
|
dst[dsti] = diff.edit.seq[i]
|
||||||
|
dsti += 1
|
||||||
|
end
|
||||||
|
elseif diff.edit isa Deletion
|
||||||
|
srci += diff.edit.len
|
||||||
|
end
|
||||||
|
diffi += 1
|
||||||
|
diff = diffi > length(diffs) ? nothing : diffs[diffi]
|
||||||
|
end
|
||||||
|
return dst
|
||||||
|
end
|
||||||
=#
|
=#
|
||||||
|
|
||||||
|
export Diff,
|
||||||
|
Edit,
|
||||||
|
Variation,
|
||||||
|
Variant,
|
||||||
|
Substitution,
|
||||||
|
Insertion,
|
||||||
|
Deletion,
|
||||||
|
variations,
|
||||||
|
substitutions
|
||||||
|
|
||||||
end # module
|
end # module
|
||||||
|
|
Loading…
Reference in a new issue