From 40749eb6f5eec175f11025eb756666a8694d8f0c Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Tue, 2 Mar 2021 13:20:25 +0100 Subject: [PATCH] Start fixing --- src/SequenceVariation.jl | 245 +++++++++++++++++++++++++++++++-------- 1 file changed, 195 insertions(+), 50 deletions(-) diff --git a/src/SequenceVariation.jl b/src/SequenceVariation.jl index fe9b0c3..97f847d 100644 --- a/src/SequenceVariation.jl +++ b/src/SequenceVariation.jl @@ -26,46 +26,116 @@ using BioAlignments import BioAlignments: OP_START, OP_SEQ_MATCH, OP_SEQ_MISMATCH, OP_INSERT, OP_DELETE import BioSymbols: BioSymbol +const ALN_MODEL = AffineGapScoreModel(BLOSUM62, gap_open=-12, gap_extend=-2) + +@sum_type Edit{S, T} + Substitution{S, T}(::T) + Deletion{S, T}(::UInt) + Insertion{S, T}(::S) +end + +""" + Edit + +Abstract type representing a type of nucleotide edit: Deletion, insertion or +substitution. +""" abstract type Edit end +""" + Substitution{T <: BioSymbol} <: Edit + +Represents the presence of a `T` at a given position. The position is stored +outside this struct. +""" struct Substitution{T <: BioSymbol} <: Edit symbol::T end +Base.eltype(::Type{<:Substitution{T}}) where T = T + +""" + Deletion <: Edit + +Represents the deletion of N symbols. The location of the deletion is stored +outside this struct +""" struct Deletion <: Edit - len::Int + len::UInt end +""" + Insertion{S <: BioSequence} <: Edit + +Represents the insertion of a `T` into a sequence. The location of the insertion +is stored outside the struct. +""" struct Insertion{S <: BioSequence} <: Edit seq::S end Base.:(==)(x::Insertion{A}, y::Insertion{A}) where A = x.seq == y.seq +""" + Diff{E <: Edit} + +Represents an `Edit` og type `E` at a given position. +""" struct Diff{E <: Edit} - pos::Int + pos::UInt edit::E end -struct Variant{S <: BioSequence, E <: Edit} +Diff(pos::Integer, edit::Edit) = Diff{typeof(edit)}(pos, edit) + +""" + Variant{S <: BioSequence, D <: Diff} + +Represents a series of diffs of type `D` against a reference of type `S`. +See also `Variation` +""" +struct Variant{S <: BioSequence, D <: Diff} ref::S - diffs::Vector{Diff{E}} + diffs::Vector{D} end -struct Variation{S <: BioSequence, E <: Edit} - ref::S - diff::Diff{E} +#function Variant(ref::BioSequence, diffs::Vector{D}) where D <: Diff +# return Variant{typeof(ref), D}(ref, diffs) +#end + +function Base.show(io::IO, ::MIME"text/plain", x::Variant{S,D}) where {S,D} + cols = displaysize(io)[2] - 3 + recur_io = IOContext(io, :SHOWN_SET => x.diffs) + print(io, summary(x), ":") + for i in x.diffs + v = Variation{S, typeof(i)}(x.ref, i) + str = sprint(show, v, context=recur_io, sizehint=0) + print(io, "\n ", Base._truncate_at_width_or_chars(str, cols, "\r\n")) + end end -### +#Variant(s::BioSequence, v::Vector{<:Diff}) = Variant{typeof(s),eltype(v)}(s,v) + +""" + Variation{S <: BioSequence, D <: Diff} + +Represent a single diff of type `D` is a sequence of type `S`. See also `Variant` +""" +struct Variation{S <: BioSequence, D <: Diff} + ref::S + diff::D +end + +#Variation(ref::BioSequence, diff::Diff) = Variation{typeof(ref), typeof(diffs)}(ref, diff) + function check(v::Variation{S, <:Substitution{T}}) where {S, T} T == eltype(S) || throw(TypeError(:check, "", eltype(S), T)) checkbounds(v.ref, v.diff.pos) end -check(v::Variation{S, Deletion}) where S = checkbounds(v.ref, v.diff.pos:(v.diff.pos+v.diff.edit.len)-1) +check(v::Variation{S, Diff{Deletion}}) where S = checkbounds(v.ref, v.diff.pos:(v.diff.pos+v.diff.edit.len)-1) -function check(v::Variation{S, <:Insertion}) where S +function check(v::Variation{S, <:Diff{<:Insertion}}) where S length(v.diff.edit.seq) > 0 || throw(ArgumentError("Insertions cannot be length 0")) # We can have insertions at the very end, after the reference sequence v.diff.pos == lastindex(v.ref) + 1 && return nothing @@ -76,27 +146,67 @@ Base.show(io::IO, x::Diff{<:Substitution}) = print(io, x.pos, x.edit.symbol) Base.show(io::IO, x::Diff{Deletion}) = print(io, 'Δ', x.pos, '-', x.pos + x.edit.len - 1) Base.show(io::IO, x::Diff{<:Insertion}) = print(io, x.pos, x.edit.seq) -function Base.show(io::IO, x::Variant{S, <:Substitution}) where S +function Base.show(io::IO, x::Variation{S, <:Diff{<:Substitution}}) where S print(io, x.ref[x.diff.pos], x.diff.pos, x.diff.edit.symbol) end -Base.show(io::IO, x::Variant{S, Deletion}) where S = show(io, x.diff) -Base.show(io::IO, x::Variant{S, <:Insertion} where S) = show(io, x.diff) +Base.show(io::IO, x::Variation{S, Diff{Deletion}}) where S = show(io, x.diff) +Base.show(io::IO, x::Variation{S, <:Diff{<:Insertion}} where S) = show(io, x.diff) -Base.:(==)(x::T, y::T) where {T <: Variant} = (x.ref === y.ref) & (x.diff == y.diff) +Base.:(==)(x::T, y::T) where {T <: Variation} = (x.ref === y.ref) & (x.diff == y.diff) -################# - -#= function variations(ref::S, refaln::S, seqaln::S) where {S <: BioSequence} aln = AlignedSequence(seqaln, refaln) return variations(ref, refaln, seqaln, aln.aln.anchors) end +"Calculate all substitutions between two sequences" +function substitutions(ref::S, seq::S) where {S <: BioSequence} + if length(ref) != length(seq) + throw(DimensionMismatch("Sequences must match in length")) + end + if !(count(isgap, ref) == count(isgap, seq) == 0) + throw(ArgumentError("Neither sequence can contain gaps")) + end + diffs = Diff{Substitution{eltype(S)}}[] + @inbounds for i in eachindex(ref) + refi, seqi = ref[i], seq[i] + if refi != seqi + push!(diffs, Diff(i, Substitution(seqi))) + end + end + return Variant(ref, diffs) +end + + +function variant(ref::S, seq::S) where {S <: BioSequence} + aln = pairalign(GlobalAlignment(), seq, ref, ALN_MODEL).aln + diffs = Diff[] + result = Variant(ref, diffs) + refpos = 0 + markpos = 0 + n_gaps = n_ins = 0 + for (seqi, refi) in aln + isgap(refi) || (refpos += 1) + + # Check for deletions + if isgap(seqi) + iszero(n_gaps) && (markpos = refi) + n_gaps += 1 + else + if !iszero(n_gaps) + push(diffs, Diff{Deletion}Deletion(n_gaps)) + end + n_gaps = 0 + end + end +#= function variations(ref::S, refaln::S, seqaln::S, anchors::Vector{AlignmentAnchor}) where {S <: BioSequence{A}} where A - result = SeqVar{A}[] + result = Variant(dna"TAG", Diff[]) + diffs = result.diffs isempty(anchors) && return result firstop = first(anchors) + if (firstop.op !== OP_START) || (firstop.refpos != 0 | firstop.seqpos != 0) throw(ArgumentError("Alignment must begin with OP_START at positions (0, 0)")) end @@ -110,18 +220,18 @@ function variations(ref::S, refaln::S, seqaln::S, anchors::Vector{AlignmentAncho if anchor.op == OP_SEQ_MISMATCH p = seqalnpos for pos in refpos:refend - push!(result, SeqVar(ref, pos, Substitution{eltype(A)}(seqaln[p]))) + push!(diffs, Diff(pos, Substitution{eltype(A)}(seqaln[p]))) p += 1 end # Deletions are trivial elseif anchor.op == OP_DELETE len = refend - refpos + 1 - push!(result, SeqVar(ref, refpos, Deletion(len))) + push!(diffs, Diff(refpos, Deletion(len))) # Insertions are a little more tricky elseif anchor.op == OP_INSERT - push!(result, SeqVar(ref, refend + 1, Insertion{A}(seqaln[seqalnpos:seqalnend]))) + push!(diffs, Diff(refend + 1, Insertion{S}(seqaln[seqalnpos:seqalnend]))) elseif anchor.op == OP_SEQ_MATCH nothing @@ -130,9 +240,11 @@ function variations(ref::S, refaln::S, seqaln::S, anchors::Vector{AlignmentAncho end refpos, seqpos, seqalnpos = refend + 1, seqend + 1, seqalnend + 1 end + return result end + function posmap(gap, oldref::BioSequence, newref::BioSequence) if length(oldref) != length(newref) throw(ArgumentError("Sequences must be equal-length and aligned")) @@ -156,51 +268,84 @@ If `o` maps to a gap in the new sequence, `v[o] === nothing`. """ posmap(oldref::S, newref::S) where {A, S <: BioSequence{A}} = posmap(gap(eltype(A)), oldref, newref) -function rereference(var::SeqVar{A, S}, posmap, ref::LongSequence{A}) where {A, S <: Substitution} - newpos = posmap[var.pos] - newpos === nothing && throw(ArgumentError("Position $(var.pos) maps to a gap in reference")) - return SeqVar{A, S}(ref, newpos::Int, var.var) + +function rereference(diff::Diff{<:Substitution}, posmap) + newpos = posmap[diff.pos] + newpos === nothing && throw(ArgumentError("Position $(diff.pos) maps to a gap in reference")) + return typeof(diff)(newpos::Int, diff.edit) end -function checkflanks(posmap, varpos::Int, flank::Int) - for i in max(1, varpos - flank) : min(length(posmap), varpos + flank) - posmap[i] === nothing && throw(ArgumentError("Position $i maps to a deletion")) + +function checkflanks(posmap, pos::Int, flank::Int) + for i in max(1, pos - flank) : min(length(posmap), pos + flank) + posmap[i] === nothing && throw(ArgumentError("Flanking position $i maps to a deletion")) end end -function rereference(var::SeqVar{A, Deletion}, posmap, ref::LongSequence{A}, flank::Int=5) where A - checkbounds(posmap, var.pos:var.pos + var.var.len - 1) - checkflanks(posmap, var.pos, flank) - return SeqVar{A, Deletion}(ref, posmap[var.pos], var.var) +function rereference(diff::Diff{Deletion}, posmap, flank::Int=5) + checkbounds(posmap, diff.pos:diff.pos + diff.edit.len - 1) + checkflanks(posmap, diff.pos, flank) + return typeof(diff)(posmap[diff.pos], diff.edit.len) end -function rereference(var::SeqVar{A, Insertion{A}}, posmap, ref::LongSequence{A}, flank::Int=5) where A - checkflanks(posmap, var.pos, flank) - return SeqVar{A, Insertion{A}}(ref, posmap[var.pos], var.var) +function rereference(diff::Diff{<:Insertion}, posmap, flank::Int=5) + checkflanks(posmap, diff.pos, flank) + return typeof(diff)(posmap[diff.pos], diff.edit) end # Assumes same reference, and that they are not mutally exclusive # (e.g no substitution in deleted areas) -#= -function reconstruct(v::Vector{<:SeqVar{A}}) where A - isempty(v) && throw(ArgumentError("Need at least one Variation to reconstruct sequence")) - srt = sort(v, by=x -> x.pos) - len = length(v[1].ref) - for i in srt - if i isa SeqVar{A, <:Deletion} - len -= i.var.len - elseif i isa SeqVar{A, <:Insertion} - len += length(i.var.seq) +function reconstruct(v::Variant) + isempty(v.diffs) && return copy(v.ref) + diffs = sort(v.diffs, by=x -> x.pos) + + # First, get length of result + len::Int = length(v.ref) + for diff in diffs + if diff.edit isa Deletion + len = len - diff.edit.len + elseif diff.edit isa Insertion + len = len + length(diff.edit.seq) end end - - ref = copy(v[1].ref) - oldpos, newpos = 1, 1 - for i in srt -=# + # Fill in + dst = typeof(v.ref)(len) + src, diffi, srci::Int, dsti = v.ref, 1, 1, 1 + diff = diffs[diffi] + while dsti ≤ len + while (srci < diff.pos) & (dsti ≤ len) + dst[dsti] = src[srci] + srci, dsti = srci + 1, dsti + 1 + end + diff === nothing && break + if diff.edit isa Substitution + dst[dsti] = diff.edit.symbol + srci, dsti = srci + 1, dsti + 1 + elseif diff.edit isa Insertion + for i in 1:(length(diff.edit.seq)::Int) + dst[dsti] = diff.edit.seq[i] + dsti += 1 + end + elseif diff.edit isa Deletion + srci += diff.edit.len + end + diffi += 1 + diff = diffi > length(diffs) ? nothing : diffs[diffi] + end + return dst +end =# +export Diff, + Edit, + Variation, + Variant, + Substitution, + Insertion, + Deletion, + variations, + substitutions end # module