Start fixing

This commit is contained in:
Jakob Nybo Nissen 2021-03-02 13:20:25 +01:00
parent 64ae3bc862
commit 40749eb6f5

View file

@ -26,46 +26,116 @@ using BioAlignments
import BioAlignments: OP_START, OP_SEQ_MATCH, OP_SEQ_MISMATCH, OP_INSERT, OP_DELETE import BioAlignments: OP_START, OP_SEQ_MATCH, OP_SEQ_MISMATCH, OP_INSERT, OP_DELETE
import BioSymbols: BioSymbol import BioSymbols: BioSymbol
const ALN_MODEL = AffineGapScoreModel(BLOSUM62, gap_open=-12, gap_extend=-2)
@sum_type Edit{S, T}
Substitution{S, T}(::T)
Deletion{S, T}(::UInt)
Insertion{S, T}(::S)
end
"""
Edit
Abstract type representing a type of nucleotide edit: Deletion, insertion or
substitution.
"""
abstract type Edit end abstract type Edit end
"""
Substitution{T <: BioSymbol} <: Edit
Represents the presence of a `T` at a given position. The position is stored
outside this struct.
"""
struct Substitution{T <: BioSymbol} <: Edit struct Substitution{T <: BioSymbol} <: Edit
symbol::T symbol::T
end end
Base.eltype(::Type{<:Substitution{T}}) where T = T
"""
Deletion <: Edit
Represents the deletion of N symbols. The location of the deletion is stored
outside this struct
"""
struct Deletion <: Edit struct Deletion <: Edit
len::Int len::UInt
end end
"""
Insertion{S <: BioSequence} <: Edit
Represents the insertion of a `T` into a sequence. The location of the insertion
is stored outside the struct.
"""
struct Insertion{S <: BioSequence} <: Edit struct Insertion{S <: BioSequence} <: Edit
seq::S seq::S
end end
Base.:(==)(x::Insertion{A}, y::Insertion{A}) where A = x.seq == y.seq Base.:(==)(x::Insertion{A}, y::Insertion{A}) where A = x.seq == y.seq
"""
Diff{E <: Edit}
Represents an `Edit` og type `E` at a given position.
"""
struct Diff{E <: Edit} struct Diff{E <: Edit}
pos::Int pos::UInt
edit::E edit::E
end end
struct Variant{S <: BioSequence, E <: Edit} Diff(pos::Integer, edit::Edit) = Diff{typeof(edit)}(pos, edit)
"""
Variant{S <: BioSequence, D <: Diff}
Represents a series of diffs of type `D` against a reference of type `S`.
See also `Variation`
"""
struct Variant{S <: BioSequence, D <: Diff}
ref::S ref::S
diffs::Vector{Diff{E}} diffs::Vector{D}
end end
struct Variation{S <: BioSequence, E <: Edit} #function Variant(ref::BioSequence, diffs::Vector{D}) where D <: Diff
ref::S # return Variant{typeof(ref), D}(ref, diffs)
diff::Diff{E} #end
function Base.show(io::IO, ::MIME"text/plain", x::Variant{S,D}) where {S,D}
cols = displaysize(io)[2] - 3
recur_io = IOContext(io, :SHOWN_SET => x.diffs)
print(io, summary(x), ":")
for i in x.diffs
v = Variation{S, typeof(i)}(x.ref, i)
str = sprint(show, v, context=recur_io, sizehint=0)
print(io, "\n ", Base._truncate_at_width_or_chars(str, cols, "\r\n"))
end
end end
### #Variant(s::BioSequence, v::Vector{<:Diff}) = Variant{typeof(s),eltype(v)}(s,v)
"""
Variation{S <: BioSequence, D <: Diff}
Represent a single diff of type `D` is a sequence of type `S`. See also `Variant`
"""
struct Variation{S <: BioSequence, D <: Diff}
ref::S
diff::D
end
#Variation(ref::BioSequence, diff::Diff) = Variation{typeof(ref), typeof(diffs)}(ref, diff)
function check(v::Variation{S, <:Substitution{T}}) where {S, T} function check(v::Variation{S, <:Substitution{T}}) where {S, T}
T == eltype(S) || throw(TypeError(:check, "", eltype(S), T)) T == eltype(S) || throw(TypeError(:check, "", eltype(S), T))
checkbounds(v.ref, v.diff.pos) checkbounds(v.ref, v.diff.pos)
end end
check(v::Variation{S, Deletion}) where S = checkbounds(v.ref, v.diff.pos:(v.diff.pos+v.diff.edit.len)-1) check(v::Variation{S, Diff{Deletion}}) where S = checkbounds(v.ref, v.diff.pos:(v.diff.pos+v.diff.edit.len)-1)
function check(v::Variation{S, <:Insertion}) where S function check(v::Variation{S, <:Diff{<:Insertion}}) where S
length(v.diff.edit.seq) > 0 || throw(ArgumentError("Insertions cannot be length 0")) length(v.diff.edit.seq) > 0 || throw(ArgumentError("Insertions cannot be length 0"))
# We can have insertions at the very end, after the reference sequence # We can have insertions at the very end, after the reference sequence
v.diff.pos == lastindex(v.ref) + 1 && return nothing v.diff.pos == lastindex(v.ref) + 1 && return nothing
@ -76,27 +146,67 @@ Base.show(io::IO, x::Diff{<:Substitution}) = print(io, x.pos, x.edit.symbol)
Base.show(io::IO, x::Diff{Deletion}) = print(io, 'Δ', x.pos, '-', x.pos + x.edit.len - 1) Base.show(io::IO, x::Diff{Deletion}) = print(io, 'Δ', x.pos, '-', x.pos + x.edit.len - 1)
Base.show(io::IO, x::Diff{<:Insertion}) = print(io, x.pos, x.edit.seq) Base.show(io::IO, x::Diff{<:Insertion}) = print(io, x.pos, x.edit.seq)
function Base.show(io::IO, x::Variant{S, <:Substitution}) where S function Base.show(io::IO, x::Variation{S, <:Diff{<:Substitution}}) where S
print(io, x.ref[x.diff.pos], x.diff.pos, x.diff.edit.symbol) print(io, x.ref[x.diff.pos], x.diff.pos, x.diff.edit.symbol)
end end
Base.show(io::IO, x::Variant{S, Deletion}) where S = show(io, x.diff) Base.show(io::IO, x::Variation{S, Diff{Deletion}}) where S = show(io, x.diff)
Base.show(io::IO, x::Variant{S, <:Insertion} where S) = show(io, x.diff) Base.show(io::IO, x::Variation{S, <:Diff{<:Insertion}} where S) = show(io, x.diff)
Base.:(==)(x::T, y::T) where {T <: Variant} = (x.ref === y.ref) & (x.diff == y.diff) Base.:(==)(x::T, y::T) where {T <: Variation} = (x.ref === y.ref) & (x.diff == y.diff)
#################
#=
function variations(ref::S, refaln::S, seqaln::S) where {S <: BioSequence} function variations(ref::S, refaln::S, seqaln::S) where {S <: BioSequence}
aln = AlignedSequence(seqaln, refaln) aln = AlignedSequence(seqaln, refaln)
return variations(ref, refaln, seqaln, aln.aln.anchors) return variations(ref, refaln, seqaln, aln.aln.anchors)
end end
"Calculate all substitutions between two sequences"
function substitutions(ref::S, seq::S) where {S <: BioSequence}
if length(ref) != length(seq)
throw(DimensionMismatch("Sequences must match in length"))
end
if !(count(isgap, ref) == count(isgap, seq) == 0)
throw(ArgumentError("Neither sequence can contain gaps"))
end
diffs = Diff{Substitution{eltype(S)}}[]
@inbounds for i in eachindex(ref)
refi, seqi = ref[i], seq[i]
if refi != seqi
push!(diffs, Diff(i, Substitution(seqi)))
end
end
return Variant(ref, diffs)
end
function variant(ref::S, seq::S) where {S <: BioSequence}
aln = pairalign(GlobalAlignment(), seq, ref, ALN_MODEL).aln
diffs = Diff[]
result = Variant(ref, diffs)
refpos = 0
markpos = 0
n_gaps = n_ins = 0
for (seqi, refi) in aln
isgap(refi) || (refpos += 1)
# Check for deletions
if isgap(seqi)
iszero(n_gaps) && (markpos = refi)
n_gaps += 1
else
if !iszero(n_gaps)
push(diffs, Diff{Deletion}Deletion(n_gaps))
end
n_gaps = 0
end
end
#=
function variations(ref::S, refaln::S, seqaln::S, anchors::Vector{AlignmentAnchor}) where {S <: BioSequence{A}} where A function variations(ref::S, refaln::S, seqaln::S, anchors::Vector{AlignmentAnchor}) where {S <: BioSequence{A}} where A
result = SeqVar{A}[] result = Variant(dna"TAG", Diff[])
diffs = result.diffs
isempty(anchors) && return result isempty(anchors) && return result
firstop = first(anchors) firstop = first(anchors)
if (firstop.op !== OP_START) || (firstop.refpos != 0 | firstop.seqpos != 0) if (firstop.op !== OP_START) || (firstop.refpos != 0 | firstop.seqpos != 0)
throw(ArgumentError("Alignment must begin with OP_START at positions (0, 0)")) throw(ArgumentError("Alignment must begin with OP_START at positions (0, 0)"))
end end
@ -110,18 +220,18 @@ function variations(ref::S, refaln::S, seqaln::S, anchors::Vector{AlignmentAncho
if anchor.op == OP_SEQ_MISMATCH if anchor.op == OP_SEQ_MISMATCH
p = seqalnpos p = seqalnpos
for pos in refpos:refend for pos in refpos:refend
push!(result, SeqVar(ref, pos, Substitution{eltype(A)}(seqaln[p]))) push!(diffs, Diff(pos, Substitution{eltype(A)}(seqaln[p])))
p += 1 p += 1
end end
# Deletions are trivial # Deletions are trivial
elseif anchor.op == OP_DELETE elseif anchor.op == OP_DELETE
len = refend - refpos + 1 len = refend - refpos + 1
push!(result, SeqVar(ref, refpos, Deletion(len))) push!(diffs, Diff(refpos, Deletion(len)))
# Insertions are a little more tricky # Insertions are a little more tricky
elseif anchor.op == OP_INSERT elseif anchor.op == OP_INSERT
push!(result, SeqVar(ref, refend + 1, Insertion{A}(seqaln[seqalnpos:seqalnend]))) push!(diffs, Diff(refend + 1, Insertion{S}(seqaln[seqalnpos:seqalnend])))
elseif anchor.op == OP_SEQ_MATCH elseif anchor.op == OP_SEQ_MATCH
nothing nothing
@ -130,9 +240,11 @@ function variations(ref::S, refaln::S, seqaln::S, anchors::Vector{AlignmentAncho
end end
refpos, seqpos, seqalnpos = refend + 1, seqend + 1, seqalnend + 1 refpos, seqpos, seqalnpos = refend + 1, seqend + 1, seqalnend + 1
end end
return result return result
end end
function posmap(gap, oldref::BioSequence, newref::BioSequence) function posmap(gap, oldref::BioSequence, newref::BioSequence)
if length(oldref) != length(newref) if length(oldref) != length(newref)
throw(ArgumentError("Sequences must be equal-length and aligned")) throw(ArgumentError("Sequences must be equal-length and aligned"))
@ -156,51 +268,84 @@ If `o` maps to a gap in the new sequence, `v[o] === nothing`.
""" """
posmap(oldref::S, newref::S) where {A, S <: BioSequence{A}} = posmap(gap(eltype(A)), oldref, newref) posmap(oldref::S, newref::S) where {A, S <: BioSequence{A}} = posmap(gap(eltype(A)), oldref, newref)
function rereference(var::SeqVar{A, S}, posmap, ref::LongSequence{A}) where {A, S <: Substitution}
newpos = posmap[var.pos] function rereference(diff::Diff{<:Substitution}, posmap)
newpos === nothing && throw(ArgumentError("Position $(var.pos) maps to a gap in reference")) newpos = posmap[diff.pos]
return SeqVar{A, S}(ref, newpos::Int, var.var) newpos === nothing && throw(ArgumentError("Position $(diff.pos) maps to a gap in reference"))
return typeof(diff)(newpos::Int, diff.edit)
end end
function checkflanks(posmap, varpos::Int, flank::Int)
for i in max(1, varpos - flank) : min(length(posmap), varpos + flank) function checkflanks(posmap, pos::Int, flank::Int)
posmap[i] === nothing && throw(ArgumentError("Position $i maps to a deletion")) for i in max(1, pos - flank) : min(length(posmap), pos + flank)
posmap[i] === nothing && throw(ArgumentError("Flanking position $i maps to a deletion"))
end end
end end
function rereference(var::SeqVar{A, Deletion}, posmap, ref::LongSequence{A}, flank::Int=5) where A function rereference(diff::Diff{Deletion}, posmap, flank::Int=5)
checkbounds(posmap, var.pos:var.pos + var.var.len - 1) checkbounds(posmap, diff.pos:diff.pos + diff.edit.len - 1)
checkflanks(posmap, var.pos, flank) checkflanks(posmap, diff.pos, flank)
return SeqVar{A, Deletion}(ref, posmap[var.pos], var.var) return typeof(diff)(posmap[diff.pos], diff.edit.len)
end end
function rereference(var::SeqVar{A, Insertion{A}}, posmap, ref::LongSequence{A}, flank::Int=5) where A function rereference(diff::Diff{<:Insertion}, posmap, flank::Int=5)
checkflanks(posmap, var.pos, flank) checkflanks(posmap, diff.pos, flank)
return SeqVar{A, Insertion{A}}(ref, posmap[var.pos], var.var) return typeof(diff)(posmap[diff.pos], diff.edit)
end end
# Assumes same reference, and that they are not mutally exclusive # Assumes same reference, and that they are not mutally exclusive
# (e.g no substitution in deleted areas) # (e.g no substitution in deleted areas)
#= function reconstruct(v::Variant)
function reconstruct(v::Vector{<:SeqVar{A}}) where A isempty(v.diffs) && return copy(v.ref)
isempty(v) && throw(ArgumentError("Need at least one Variation to reconstruct sequence")) diffs = sort(v.diffs, by=x -> x.pos)
srt = sort(v, by=x -> x.pos)
len = length(v[1].ref) # First, get length of result
for i in srt len::Int = length(v.ref)
if i isa SeqVar{A, <:Deletion} for diff in diffs
len -= i.var.len if diff.edit isa Deletion
elseif i isa SeqVar{A, <:Insertion} len = len - diff.edit.len
len += length(i.var.seq) elseif diff.edit isa Insertion
len = len + length(diff.edit.seq)
end end
end end
# Fill in
ref = copy(v[1].ref) dst = typeof(v.ref)(len)
oldpos, newpos = 1, 1 src, diffi, srci::Int, dsti = v.ref, 1, 1, 1
for i in srt diff = diffs[diffi]
=# while dsti len
while (srci < diff.pos) & (dsti len)
dst[dsti] = src[srci]
srci, dsti = srci + 1, dsti + 1
end
diff === nothing && break
if diff.edit isa Substitution
dst[dsti] = diff.edit.symbol
srci, dsti = srci + 1, dsti + 1
elseif diff.edit isa Insertion
for i in 1:(length(diff.edit.seq)::Int)
dst[dsti] = diff.edit.seq[i]
dsti += 1
end
elseif diff.edit isa Deletion
srci += diff.edit.len
end
diffi += 1
diff = diffi > length(diffs) ? nothing : diffs[diffi]
end
return dst
end
=# =#
export Diff,
Edit,
Variation,
Variant,
Substitution,
Insertion,
Deletion,
variations,
substitutions
end # module end # module