mirror of
https://github.com/MillironX/SequenceVariation.jl.git
synced 2024-12-23 08:48:17 +00:00
Start fixing
This commit is contained in:
parent
64ae3bc862
commit
40749eb6f5
1 changed files with 195 additions and 50 deletions
|
@ -26,46 +26,116 @@ using BioAlignments
|
|||
import BioAlignments: OP_START, OP_SEQ_MATCH, OP_SEQ_MISMATCH, OP_INSERT, OP_DELETE
|
||||
import BioSymbols: BioSymbol
|
||||
|
||||
const ALN_MODEL = AffineGapScoreModel(BLOSUM62, gap_open=-12, gap_extend=-2)
|
||||
|
||||
@sum_type Edit{S, T}
|
||||
Substitution{S, T}(::T)
|
||||
Deletion{S, T}(::UInt)
|
||||
Insertion{S, T}(::S)
|
||||
end
|
||||
|
||||
"""
|
||||
Edit
|
||||
|
||||
Abstract type representing a type of nucleotide edit: Deletion, insertion or
|
||||
substitution.
|
||||
"""
|
||||
abstract type Edit end
|
||||
|
||||
"""
|
||||
Substitution{T <: BioSymbol} <: Edit
|
||||
|
||||
Represents the presence of a `T` at a given position. The position is stored
|
||||
outside this struct.
|
||||
"""
|
||||
struct Substitution{T <: BioSymbol} <: Edit
|
||||
symbol::T
|
||||
end
|
||||
|
||||
Base.eltype(::Type{<:Substitution{T}}) where T = T
|
||||
|
||||
"""
|
||||
Deletion <: Edit
|
||||
|
||||
Represents the deletion of N symbols. The location of the deletion is stored
|
||||
outside this struct
|
||||
"""
|
||||
struct Deletion <: Edit
|
||||
len::Int
|
||||
len::UInt
|
||||
end
|
||||
|
||||
"""
|
||||
Insertion{S <: BioSequence} <: Edit
|
||||
|
||||
Represents the insertion of a `T` into a sequence. The location of the insertion
|
||||
is stored outside the struct.
|
||||
"""
|
||||
struct Insertion{S <: BioSequence} <: Edit
|
||||
seq::S
|
||||
end
|
||||
|
||||
Base.:(==)(x::Insertion{A}, y::Insertion{A}) where A = x.seq == y.seq
|
||||
|
||||
"""
|
||||
Diff{E <: Edit}
|
||||
|
||||
Represents an `Edit` og type `E` at a given position.
|
||||
"""
|
||||
struct Diff{E <: Edit}
|
||||
pos::Int
|
||||
pos::UInt
|
||||
edit::E
|
||||
end
|
||||
|
||||
struct Variant{S <: BioSequence, E <: Edit}
|
||||
Diff(pos::Integer, edit::Edit) = Diff{typeof(edit)}(pos, edit)
|
||||
|
||||
"""
|
||||
Variant{S <: BioSequence, D <: Diff}
|
||||
|
||||
Represents a series of diffs of type `D` against a reference of type `S`.
|
||||
See also `Variation`
|
||||
"""
|
||||
struct Variant{S <: BioSequence, D <: Diff}
|
||||
ref::S
|
||||
diffs::Vector{Diff{E}}
|
||||
diffs::Vector{D}
|
||||
end
|
||||
|
||||
struct Variation{S <: BioSequence, E <: Edit}
|
||||
ref::S
|
||||
diff::Diff{E}
|
||||
#function Variant(ref::BioSequence, diffs::Vector{D}) where D <: Diff
|
||||
# return Variant{typeof(ref), D}(ref, diffs)
|
||||
#end
|
||||
|
||||
function Base.show(io::IO, ::MIME"text/plain", x::Variant{S,D}) where {S,D}
|
||||
cols = displaysize(io)[2] - 3
|
||||
recur_io = IOContext(io, :SHOWN_SET => x.diffs)
|
||||
print(io, summary(x), ":")
|
||||
for i in x.diffs
|
||||
v = Variation{S, typeof(i)}(x.ref, i)
|
||||
str = sprint(show, v, context=recur_io, sizehint=0)
|
||||
print(io, "\n ", Base._truncate_at_width_or_chars(str, cols, "\r\n"))
|
||||
end
|
||||
end
|
||||
|
||||
###
|
||||
#Variant(s::BioSequence, v::Vector{<:Diff}) = Variant{typeof(s),eltype(v)}(s,v)
|
||||
|
||||
"""
|
||||
Variation{S <: BioSequence, D <: Diff}
|
||||
|
||||
Represent a single diff of type `D` is a sequence of type `S`. See also `Variant`
|
||||
"""
|
||||
struct Variation{S <: BioSequence, D <: Diff}
|
||||
ref::S
|
||||
diff::D
|
||||
end
|
||||
|
||||
#Variation(ref::BioSequence, diff::Diff) = Variation{typeof(ref), typeof(diffs)}(ref, diff)
|
||||
|
||||
function check(v::Variation{S, <:Substitution{T}}) where {S, T}
|
||||
T == eltype(S) || throw(TypeError(:check, "", eltype(S), T))
|
||||
checkbounds(v.ref, v.diff.pos)
|
||||
end
|
||||
|
||||
check(v::Variation{S, Deletion}) where S = checkbounds(v.ref, v.diff.pos:(v.diff.pos+v.diff.edit.len)-1)
|
||||
check(v::Variation{S, Diff{Deletion}}) where S = checkbounds(v.ref, v.diff.pos:(v.diff.pos+v.diff.edit.len)-1)
|
||||
|
||||
function check(v::Variation{S, <:Insertion}) where S
|
||||
function check(v::Variation{S, <:Diff{<:Insertion}}) where S
|
||||
length(v.diff.edit.seq) > 0 || throw(ArgumentError("Insertions cannot be length 0"))
|
||||
# We can have insertions at the very end, after the reference sequence
|
||||
v.diff.pos == lastindex(v.ref) + 1 && return nothing
|
||||
|
@ -76,27 +146,67 @@ Base.show(io::IO, x::Diff{<:Substitution}) = print(io, x.pos, x.edit.symbol)
|
|||
Base.show(io::IO, x::Diff{Deletion}) = print(io, 'Δ', x.pos, '-', x.pos + x.edit.len - 1)
|
||||
Base.show(io::IO, x::Diff{<:Insertion}) = print(io, x.pos, x.edit.seq)
|
||||
|
||||
function Base.show(io::IO, x::Variant{S, <:Substitution}) where S
|
||||
function Base.show(io::IO, x::Variation{S, <:Diff{<:Substitution}}) where S
|
||||
print(io, x.ref[x.diff.pos], x.diff.pos, x.diff.edit.symbol)
|
||||
end
|
||||
|
||||
Base.show(io::IO, x::Variant{S, Deletion}) where S = show(io, x.diff)
|
||||
Base.show(io::IO, x::Variant{S, <:Insertion} where S) = show(io, x.diff)
|
||||
Base.show(io::IO, x::Variation{S, Diff{Deletion}}) where S = show(io, x.diff)
|
||||
Base.show(io::IO, x::Variation{S, <:Diff{<:Insertion}} where S) = show(io, x.diff)
|
||||
|
||||
Base.:(==)(x::T, y::T) where {T <: Variant} = (x.ref === y.ref) & (x.diff == y.diff)
|
||||
Base.:(==)(x::T, y::T) where {T <: Variation} = (x.ref === y.ref) & (x.diff == y.diff)
|
||||
|
||||
#################
|
||||
|
||||
#=
|
||||
function variations(ref::S, refaln::S, seqaln::S) where {S <: BioSequence}
|
||||
aln = AlignedSequence(seqaln, refaln)
|
||||
return variations(ref, refaln, seqaln, aln.aln.anchors)
|
||||
end
|
||||
|
||||
"Calculate all substitutions between two sequences"
|
||||
function substitutions(ref::S, seq::S) where {S <: BioSequence}
|
||||
if length(ref) != length(seq)
|
||||
throw(DimensionMismatch("Sequences must match in length"))
|
||||
end
|
||||
if !(count(isgap, ref) == count(isgap, seq) == 0)
|
||||
throw(ArgumentError("Neither sequence can contain gaps"))
|
||||
end
|
||||
diffs = Diff{Substitution{eltype(S)}}[]
|
||||
@inbounds for i in eachindex(ref)
|
||||
refi, seqi = ref[i], seq[i]
|
||||
if refi != seqi
|
||||
push!(diffs, Diff(i, Substitution(seqi)))
|
||||
end
|
||||
end
|
||||
return Variant(ref, diffs)
|
||||
end
|
||||
|
||||
|
||||
function variant(ref::S, seq::S) where {S <: BioSequence}
|
||||
aln = pairalign(GlobalAlignment(), seq, ref, ALN_MODEL).aln
|
||||
diffs = Diff[]
|
||||
result = Variant(ref, diffs)
|
||||
refpos = 0
|
||||
markpos = 0
|
||||
n_gaps = n_ins = 0
|
||||
for (seqi, refi) in aln
|
||||
isgap(refi) || (refpos += 1)
|
||||
|
||||
# Check for deletions
|
||||
if isgap(seqi)
|
||||
iszero(n_gaps) && (markpos = refi)
|
||||
n_gaps += 1
|
||||
else
|
||||
if !iszero(n_gaps)
|
||||
push(diffs, Diff{Deletion}Deletion(n_gaps))
|
||||
end
|
||||
n_gaps = 0
|
||||
end
|
||||
end
|
||||
#=
|
||||
function variations(ref::S, refaln::S, seqaln::S, anchors::Vector{AlignmentAnchor}) where {S <: BioSequence{A}} where A
|
||||
result = SeqVar{A}[]
|
||||
result = Variant(dna"TAG", Diff[])
|
||||
diffs = result.diffs
|
||||
isempty(anchors) && return result
|
||||
firstop = first(anchors)
|
||||
|
||||
if (firstop.op !== OP_START) || (firstop.refpos != 0 | firstop.seqpos != 0)
|
||||
throw(ArgumentError("Alignment must begin with OP_START at positions (0, 0)"))
|
||||
end
|
||||
|
@ -110,18 +220,18 @@ function variations(ref::S, refaln::S, seqaln::S, anchors::Vector{AlignmentAncho
|
|||
if anchor.op == OP_SEQ_MISMATCH
|
||||
p = seqalnpos
|
||||
for pos in refpos:refend
|
||||
push!(result, SeqVar(ref, pos, Substitution{eltype(A)}(seqaln[p])))
|
||||
push!(diffs, Diff(pos, Substitution{eltype(A)}(seqaln[p])))
|
||||
p += 1
|
||||
end
|
||||
|
||||
# Deletions are trivial
|
||||
elseif anchor.op == OP_DELETE
|
||||
len = refend - refpos + 1
|
||||
push!(result, SeqVar(ref, refpos, Deletion(len)))
|
||||
push!(diffs, Diff(refpos, Deletion(len)))
|
||||
|
||||
# Insertions are a little more tricky
|
||||
elseif anchor.op == OP_INSERT
|
||||
push!(result, SeqVar(ref, refend + 1, Insertion{A}(seqaln[seqalnpos:seqalnend])))
|
||||
push!(diffs, Diff(refend + 1, Insertion{S}(seqaln[seqalnpos:seqalnend])))
|
||||
|
||||
elseif anchor.op == OP_SEQ_MATCH
|
||||
nothing
|
||||
|
@ -130,9 +240,11 @@ function variations(ref::S, refaln::S, seqaln::S, anchors::Vector{AlignmentAncho
|
|||
end
|
||||
refpos, seqpos, seqalnpos = refend + 1, seqend + 1, seqalnend + 1
|
||||
end
|
||||
|
||||
return result
|
||||
end
|
||||
|
||||
|
||||
function posmap(gap, oldref::BioSequence, newref::BioSequence)
|
||||
if length(oldref) != length(newref)
|
||||
throw(ArgumentError("Sequences must be equal-length and aligned"))
|
||||
|
@ -156,51 +268,84 @@ If `o` maps to a gap in the new sequence, `v[o] === nothing`.
|
|||
"""
|
||||
posmap(oldref::S, newref::S) where {A, S <: BioSequence{A}} = posmap(gap(eltype(A)), oldref, newref)
|
||||
|
||||
function rereference(var::SeqVar{A, S}, posmap, ref::LongSequence{A}) where {A, S <: Substitution}
|
||||
newpos = posmap[var.pos]
|
||||
newpos === nothing && throw(ArgumentError("Position $(var.pos) maps to a gap in reference"))
|
||||
return SeqVar{A, S}(ref, newpos::Int, var.var)
|
||||
|
||||
function rereference(diff::Diff{<:Substitution}, posmap)
|
||||
newpos = posmap[diff.pos]
|
||||
newpos === nothing && throw(ArgumentError("Position $(diff.pos) maps to a gap in reference"))
|
||||
return typeof(diff)(newpos::Int, diff.edit)
|
||||
end
|
||||
|
||||
function checkflanks(posmap, varpos::Int, flank::Int)
|
||||
for i in max(1, varpos - flank) : min(length(posmap), varpos + flank)
|
||||
posmap[i] === nothing && throw(ArgumentError("Position $i maps to a deletion"))
|
||||
|
||||
function checkflanks(posmap, pos::Int, flank::Int)
|
||||
for i in max(1, pos - flank) : min(length(posmap), pos + flank)
|
||||
posmap[i] === nothing && throw(ArgumentError("Flanking position $i maps to a deletion"))
|
||||
end
|
||||
end
|
||||
|
||||
function rereference(var::SeqVar{A, Deletion}, posmap, ref::LongSequence{A}, flank::Int=5) where A
|
||||
checkbounds(posmap, var.pos:var.pos + var.var.len - 1)
|
||||
checkflanks(posmap, var.pos, flank)
|
||||
return SeqVar{A, Deletion}(ref, posmap[var.pos], var.var)
|
||||
function rereference(diff::Diff{Deletion}, posmap, flank::Int=5)
|
||||
checkbounds(posmap, diff.pos:diff.pos + diff.edit.len - 1)
|
||||
checkflanks(posmap, diff.pos, flank)
|
||||
return typeof(diff)(posmap[diff.pos], diff.edit.len)
|
||||
end
|
||||
|
||||
function rereference(var::SeqVar{A, Insertion{A}}, posmap, ref::LongSequence{A}, flank::Int=5) where A
|
||||
checkflanks(posmap, var.pos, flank)
|
||||
return SeqVar{A, Insertion{A}}(ref, posmap[var.pos], var.var)
|
||||
function rereference(diff::Diff{<:Insertion}, posmap, flank::Int=5)
|
||||
checkflanks(posmap, diff.pos, flank)
|
||||
return typeof(diff)(posmap[diff.pos], diff.edit)
|
||||
end
|
||||
|
||||
|
||||
# Assumes same reference, and that they are not mutally exclusive
|
||||
# (e.g no substitution in deleted areas)
|
||||
#=
|
||||
function reconstruct(v::Vector{<:SeqVar{A}}) where A
|
||||
isempty(v) && throw(ArgumentError("Need at least one Variation to reconstruct sequence"))
|
||||
srt = sort(v, by=x -> x.pos)
|
||||
len = length(v[1].ref)
|
||||
for i in srt
|
||||
if i isa SeqVar{A, <:Deletion}
|
||||
len -= i.var.len
|
||||
elseif i isa SeqVar{A, <:Insertion}
|
||||
len += length(i.var.seq)
|
||||
function reconstruct(v::Variant)
|
||||
isempty(v.diffs) && return copy(v.ref)
|
||||
diffs = sort(v.diffs, by=x -> x.pos)
|
||||
|
||||
# First, get length of result
|
||||
len::Int = length(v.ref)
|
||||
for diff in diffs
|
||||
if diff.edit isa Deletion
|
||||
len = len - diff.edit.len
|
||||
elseif diff.edit isa Insertion
|
||||
len = len + length(diff.edit.seq)
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
ref = copy(v[1].ref)
|
||||
oldpos, newpos = 1, 1
|
||||
for i in srt
|
||||
=#
|
||||
# Fill in
|
||||
dst = typeof(v.ref)(len)
|
||||
src, diffi, srci::Int, dsti = v.ref, 1, 1, 1
|
||||
diff = diffs[diffi]
|
||||
while dsti ≤ len
|
||||
while (srci < diff.pos) & (dsti ≤ len)
|
||||
dst[dsti] = src[srci]
|
||||
srci, dsti = srci + 1, dsti + 1
|
||||
end
|
||||
diff === nothing && break
|
||||
if diff.edit isa Substitution
|
||||
dst[dsti] = diff.edit.symbol
|
||||
srci, dsti = srci + 1, dsti + 1
|
||||
elseif diff.edit isa Insertion
|
||||
for i in 1:(length(diff.edit.seq)::Int)
|
||||
dst[dsti] = diff.edit.seq[i]
|
||||
dsti += 1
|
||||
end
|
||||
elseif diff.edit isa Deletion
|
||||
srci += diff.edit.len
|
||||
end
|
||||
diffi += 1
|
||||
diff = diffi > length(diffs) ? nothing : diffs[diffi]
|
||||
end
|
||||
return dst
|
||||
end
|
||||
=#
|
||||
|
||||
export Diff,
|
||||
Edit,
|
||||
Variation,
|
||||
Variant,
|
||||
Substitution,
|
||||
Insertion,
|
||||
Deletion,
|
||||
variations,
|
||||
substitutions
|
||||
|
||||
end # module
|
||||
|
|
Loading…
Reference in a new issue