mirror of
https://github.com/MillironX/SequenceVariation.jl.git
synced 2024-11-25 14:49:55 +00:00
Full rewrite
This commit is contained in:
parent
642658f592
commit
87277268f6
4 changed files with 222 additions and 415 deletions
|
@ -114,12 +114,6 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
|
||||||
[[Logging]]
|
[[Logging]]
|
||||||
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
|
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
|
||||||
|
|
||||||
[[MacroTools]]
|
|
||||||
deps = ["Markdown", "Random"]
|
|
||||||
git-tree-sha1 = "6a8a2a625ab0dea913aba95c11370589e0239ff0"
|
|
||||||
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
|
|
||||||
version = "0.5.6"
|
|
||||||
|
|
||||||
[[Markdown]]
|
[[Markdown]]
|
||||||
deps = ["Base64"]
|
deps = ["Base64"]
|
||||||
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
|
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
|
||||||
|
@ -189,12 +183,6 @@ version = "0.1.2"
|
||||||
deps = ["LinearAlgebra", "SparseArrays"]
|
deps = ["LinearAlgebra", "SparseArrays"]
|
||||||
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
|
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
|
||||||
|
|
||||||
[[SumTypes]]
|
|
||||||
deps = ["MacroTools"]
|
|
||||||
git-tree-sha1 = "be127ca4ba297c1574457baa0ac3092b0054395e"
|
|
||||||
uuid = "8e1ec7a9-0e02-4297-b0fe-6433085c89f2"
|
|
||||||
version = "0.1.1"
|
|
||||||
|
|
||||||
[[TOML]]
|
[[TOML]]
|
||||||
deps = ["Dates"]
|
deps = ["Dates"]
|
||||||
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
|
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
|
||||||
|
|
|
@ -7,4 +7,3 @@ version = "0.1.0"
|
||||||
BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"
|
BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"
|
||||||
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
|
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
|
||||||
BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
|
BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
|
||||||
SumTypes = "8e1ec7a9-0e02-4297-b0fe-6433085c89f2"
|
|
||||||
|
|
|
@ -1,38 +1,39 @@
|
||||||
module SequenceVariation
|
module SequenceVariation
|
||||||
|
|
||||||
# TODO: Add functionality to move a Variation to a new reference
|
"""
|
||||||
# needs to be done with heavy checks to make sure the alignment of the two
|
Needs to be able to:
|
||||||
# is not full of gaps in the area where the Variation is
|
* Given a sequence and a reference, create a `Variant` that unambiguously represents
|
||||||
#=
|
the sequence
|
||||||
Substitution should work just if the ref nuc don't match a deletion
|
|
||||||
Insertions/deletions should work if there are only matches (+/- 3 nucs), or at the ends
|
|
||||||
|
|
||||||
Perhaps first make an array of oldref -> newref, with some positions marked nothing?
|
* Given a `Variant` and a new reference, translate the variant to the new reference.
|
||||||
=#
|
|
||||||
# TODO: Do we need to prevent calling indels at the ends of alignments?
|
|
||||||
# No, we need a filtering step that trims the alignment before calling Variation
|
|
||||||
|
|
||||||
#=
|
"""
|
||||||
Extract alignment
|
|
||||||
Trim ends of gaps
|
|
||||||
Call Variations
|
|
||||||
|
|
||||||
=#
|
|
||||||
|
|
||||||
# TODO: Add inversions?
|
|
||||||
|
|
||||||
using BioSequences
|
|
||||||
using BioAlignments
|
|
||||||
import BioSymbols: BioSymbol
|
import BioSymbols: BioSymbol
|
||||||
using SumTypes
|
using BioAlignments
|
||||||
|
using BioSequences
|
||||||
|
|
||||||
const ALN_MODEL = AffineGapScoreModel(BLOSUM62, gap_open=-12, gap_extend=-2)
|
#=
|
||||||
|
import Automa
|
||||||
|
import Automa.RegExp: @re_str
|
||||||
|
=#
|
||||||
|
|
||||||
@sum_type Edit{S, T} begin
|
struct Unsafe end
|
||||||
Substitution{S, T}(::T)
|
|
||||||
Deletion{S, T}(::UInt)
|
const DEFAULT_AA_ALN_MODEL = AffineGapScoreModel(BLOSUM62, gap_open=-12, gap_extend=-2)
|
||||||
Insertion{S, T}(::S)
|
const DEFAULT_DNA_ALN_MODEL = AffineGapScoreModel(EDNAFULL, gap_open=-12, gap_extend=-2)
|
||||||
end
|
model(::AminoAcidSeq) = DEFAULT_AA_ALN_MODEL
|
||||||
|
model(::NucleotideSeq) = DEFAULT_DNA_ALN_MODEL
|
||||||
|
|
||||||
|
#=
|
||||||
|
const CONTEXT = Automa.CodeGenContext(
|
||||||
|
vars=Automa.Variables(:p, :p_end, :p_eof, :ts, :te, :cs, :data, :mem, :byte),
|
||||||
|
generator=:goto,
|
||||||
|
checkbounds=false
|
||||||
|
)
|
||||||
|
=#
|
||||||
|
|
||||||
|
const BYTES = Union{String, SubString{String}, Vector{UInt8}}
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Substitution
|
Substitution
|
||||||
|
@ -40,15 +41,10 @@ end
|
||||||
Represents the presence of a `T` at a given position. The position is stored
|
Represents the presence of a `T` at a given position. The position is stored
|
||||||
outside this struct.
|
outside this struct.
|
||||||
"""
|
"""
|
||||||
Substitution
|
struct Substitution{T <: BioSymbol}
|
||||||
|
x::T
|
||||||
function Base.parse(::Type{Substitution{S, T}}, s::AbstractString) where {S, T}
|
|
||||||
mat = match(r"^[A-Z]([0-9]+)([A-Z])$", strip(s))
|
|
||||||
symbol = T(first(mat[2]))
|
|
||||||
Diff(parse(UInt, mat[1]), Substitution{S, T}(symbol))
|
|
||||||
end
|
end
|
||||||
|
Substitution(x::BioSymbol) = Substitution{typeof(x)}(x)
|
||||||
Base.eltype(::Type{<:Substitution{S, T}}) where {S, T} = T
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Deletion
|
Deletion
|
||||||
|
@ -56,168 +52,213 @@ Base.eltype(::Type{<:Substitution{S, T}}) where {S, T} = T
|
||||||
Represents the deletion of N symbols. The location of the deletion is stored
|
Represents the deletion of N symbols. The location of the deletion is stored
|
||||||
outside this struct
|
outside this struct
|
||||||
"""
|
"""
|
||||||
Deletion
|
struct Deletion
|
||||||
|
len::UInt
|
||||||
|
|
||||||
function Base.parse(::Type{T}, s::AbstractString) where {T <: Deletion}
|
function Deletion(len::UInt)
|
||||||
mat = match(r"^Δ([0-9]+)-([0-9]+)$", strip(s))
|
iszero(len) && error("Deletion must be at least 1 symbol")
|
||||||
start = parse(UInt, mat[1])
|
new(len)
|
||||||
stop = parse(UInt, mat[2])
|
end
|
||||||
start ≤ stop || error("Indel cannot have negative range")
|
|
||||||
Diff(start, T(stop - start + 1))
|
|
||||||
end
|
end
|
||||||
|
Deletion(x::Integer) = Deletion(convert(UInt, x))
|
||||||
|
Base.length(x::Deletion) = Int(x.len)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Insertion
|
Insertion{S <: BioSequence}
|
||||||
|
|
||||||
Represents the insertion of a `T` into a sequence. The location of the insertion
|
Represents the insertion of a `S` into a sequence. The location of the insertion
|
||||||
is stored outside the struct.
|
is stored outside the struct.
|
||||||
"""
|
"""
|
||||||
Insertion
|
struct Insertion{S <: BioSequence}
|
||||||
|
x::S
|
||||||
|
|
||||||
function Base.parse(::Type{<:Insertion{S}}, s::AbstractString) where S
|
function Insertion{S}(x::S) where {S <: BioSequence}
|
||||||
mat = match(r"^([0-9]+)([A-Z]+)$", strip(s))
|
isempty(x) && error("Insertion must be at least 1 symbol")
|
||||||
seq = S(mat[2])
|
new(x)
|
||||||
Diff(parse(UInt, mat[1]), Insertion(seq))
|
end
|
||||||
end
|
end
|
||||||
|
Insertion(s::BioSequence) = Insertion{typeof(s)}(s)
|
||||||
function Insertion(s::BioSequence)
|
|
||||||
length(s) == 0 && throw(ArgumentError("Insertions cannot be length 0"))
|
|
||||||
Insertion{typeof(s), eltype(s)}(s)
|
|
||||||
end
|
|
||||||
|
|
||||||
Base.:(==)(x::Insertion, y::Insertion) = x.seq == y.seq
|
Base.:(==)(x::Insertion, y::Insertion) = x.seq == y.seq
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Diff{E <: Edit}
|
Edit{S <: BioSequence, T <: BioSymbol}
|
||||||
|
|
||||||
Represents an `Edit` og type `E` at a given position.
|
An edit of either `Substitution{T}`, `Insertion{S}` or `Deletion` at a position.
|
||||||
|
If deletion: Deletion of length L at ref pos `pos:pos+L-1`
|
||||||
|
If insertion: Insertion of length L b/w ref pos `pos:pos+1`
|
||||||
"""
|
"""
|
||||||
struct Diff{S <: BioSequence, T <: BioSymbol}
|
struct Edit{S <: BioSequence, T <: BioSymbol}
|
||||||
|
x::Union{Substitution{T}, Deletion, Insertion{S}}
|
||||||
pos::UInt
|
pos::UInt
|
||||||
edit::Edit{S, T}
|
|
||||||
end
|
end
|
||||||
|
|
||||||
function Base.parse(::Type{Diff{S, T}}, s::AbstractString) where {S, T}
|
#=
|
||||||
beginning = first(s)
|
@noinline throw_parse_error(T, p::Integer) = error("Failed to parse $T at byte $p")
|
||||||
if beginning == 'Δ'
|
|
||||||
parse(Deletion{S, T}, s)
|
# Parse substitution
|
||||||
elseif isnumeric(beginning)
|
let
|
||||||
parse(Insertion{S, T}, s)
|
machine = let
|
||||||
else
|
biosymbol = re"[A-Za-z]"
|
||||||
parse(Substitution{S, T}, s)
|
number = re"[0-9]+"
|
||||||
|
|
||||||
|
biosymbol.actions[:enter] = [:enter]
|
||||||
|
number.actions[:all] = [:digit]
|
||||||
|
|
||||||
|
Automa.compile(biosymbol * number * biosymbol)
|
||||||
|
end
|
||||||
|
actions = Dict(
|
||||||
|
:enter => quote
|
||||||
|
symbol = T(Char(byte))
|
||||||
|
if firstsymbol === nothing
|
||||||
|
firstsymbol = symbol
|
||||||
|
else
|
||||||
|
lastsymbol = symbol
|
||||||
|
end
|
||||||
|
end,
|
||||||
|
:digit => :(num = UInt(10)*num + (byte - 0x30) % UInt),
|
||||||
|
)
|
||||||
|
@eval begin
|
||||||
|
function Base.parse(::Type{Edit{S, T}}, data::BYTES) where {S, T}
|
||||||
|
$(Automa.generate_init_code(CONTEXT, machine))
|
||||||
|
p_eof = p_end = sizeof(data)
|
||||||
|
firstsymbol = lastsymbol = nothing
|
||||||
|
num = UInt(0)
|
||||||
|
$(Automa.generate_exec_code(CONTEXT, machine, actions))
|
||||||
|
iszero(cs) || throw_parse_error(Edit{S, T}, p)
|
||||||
|
if firstsymbol == lastsymbol
|
||||||
|
error("First symbol and last symbol are identical")
|
||||||
|
end
|
||||||
|
return Edit{S, T}(Substitution{T}(lastsymbol), num)
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
=#
|
||||||
|
|
||||||
"""
|
# Edits are applied sequentially from first to last pos.
|
||||||
Variant{S <: BioSequence, T <: BioSymbol}
|
# The vector must always be sorted by pos.
|
||||||
|
|
||||||
Represents a series of diffs of type `Diff{S, T}` against a reference of type `S`.
|
|
||||||
See also `Variation`.
|
|
||||||
"""
|
|
||||||
struct Variant{S <: BioSequence, T <: BioSymbol}
|
struct Variant{S <: BioSequence, T <: BioSymbol}
|
||||||
ref::S
|
ref::S
|
||||||
diffs::Vector{Diff{S, T}}
|
edits::Vector{Edit{S, T}}
|
||||||
|
|
||||||
|
Variant{S, T}(ref::S, edits::Vector{Edit{S, T}}, ::Unsafe) where {S, T} = new(ref, edits)
|
||||||
end
|
end
|
||||||
|
|
||||||
function Base.show(io::IO, ::MIME"text/plain", x::Variant{S,T}) where {S,T}
|
function Variant{S,T}(ref::S, edits::Vector{Edit{S, T}}) where {S<:BioSequence, T<:BioSymbol}
|
||||||
cols = displaysize(io)[2] - 3
|
sort!(edits, by=x -> x.pos)
|
||||||
recur_io = IOContext(io, :SHOWN_SET => x.diffs)
|
result = Variant{S, T}(ref, edits, Unsafe())
|
||||||
print(io, summary(x), ":")
|
is_valid(result) || error("TODO") # report what kind of error message?
|
||||||
for i in x.diffs
|
return result
|
||||||
v = Variation{S, eltype(S)}(x.ref, i)
|
end
|
||||||
str = sprint(show, v, context=recur_io, sizehint=0)
|
|
||||||
print(io, "\n ", Base._truncate_at_width_or_chars(str, cols, "\r\n"))
|
function Variant(ref::S, edits::Vector{Edit{S, T}}) where {S<:BioSequence, T<:BioSymbol}
|
||||||
|
Variant{S, T}(ref, edits)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Validate:
|
||||||
|
# A sequence is invalid if any of its operations are out of bounds, or the same position
|
||||||
|
# is affected by multiple edits.
|
||||||
|
function is_valid(v::Variant)
|
||||||
|
valid_positions = 1:length(v.ref)
|
||||||
|
last_was_insert = false
|
||||||
|
for edit in v.edits
|
||||||
|
pos = edit.pos
|
||||||
|
op = edit.x
|
||||||
|
# For substitutions we simply do not allow another modification of the same base
|
||||||
|
if op isa Substitution
|
||||||
|
pos in valid_positions || return false
|
||||||
|
valid_positions = first(valid_positions) + 1 : last(valid_positions)
|
||||||
|
last_was_insert = false
|
||||||
|
# Insertions affect 0 reference bases, so it does not modify the valid positions
|
||||||
|
# for next op. However, we cannot have two insertions at the same position, because
|
||||||
|
# then the order of them is ambiguous
|
||||||
|
elseif op isa Insertion
|
||||||
|
pos in (first(valid_positions)+last_was_insert:last(valid_positions)) || return false
|
||||||
|
last_was_insert = true
|
||||||
|
# Deletions obviously invalidate the reference bases that are deleted.
|
||||||
|
elseif op isa Deletion
|
||||||
|
len = length(op)
|
||||||
|
pos in (first(valid_positions):last(valid_positions)-len+1) || return false
|
||||||
|
valid_positions = first(valid_positions) + len : last(valid_positions)
|
||||||
|
last_was_insert = false
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
return true
|
||||||
end
|
end
|
||||||
|
|
||||||
"""
|
#=
|
||||||
Variation{S <: BioSequence, T <: BioSymbol}
|
# Substituion can only occur in 1:len
|
||||||
|
# Deletions of len L only in 1:(len-L)+1
|
||||||
|
# Insertions at 1:len+1
|
||||||
|
"If a Variant passes this validation, then it should be possible to unambiguously
|
||||||
|
reconstruct the variant."
|
||||||
|
function validate(v::Variant)
|
||||||
|
len = length(v.ref)
|
||||||
|
sort!(v.edits, by=x -> x.pos)
|
||||||
|
lower = 1
|
||||||
|
|
||||||
Represent a single diff against a biosequence. See also `Variant`
|
# No not allow two inserts right after each other at same pos, because then it's
|
||||||
"""
|
# ambiguous which to pick first.
|
||||||
struct Variation{S <: BioSequence, T <: BioSymbol}
|
last_was_insert = false
|
||||||
ref::S
|
for edit in v.edits
|
||||||
diff::Diff{S, T}
|
editx = edit.x
|
||||||
end
|
pos = edit.pos
|
||||||
|
if editx isa Substitution
|
||||||
@case Edit function check((x,)::Substitution{S, T}) where {S, T}
|
(pos < lower || pos > len) && error()
|
||||||
(ref, pos) -> begin
|
lower = pos + 1
|
||||||
eltype(ref) == T || throw(TypeError(:check, "", eltype(ref), T))
|
last_was_insert = false
|
||||||
checkbounds(ref, pos)
|
elseif editx isa Deletion
|
||||||
|
(pos < lower || pos > (len - editx.len + 1)) && error()
|
||||||
|
lower = pos + editx.len
|
||||||
|
last_was_insert = false
|
||||||
|
else # Insertion
|
||||||
|
(pos < lower + last_was_insert || pos > len + 1) && error()
|
||||||
|
lower = pos
|
||||||
|
last_was_insert = true
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
return v
|
||||||
|
end
|
||||||
|
=#
|
||||||
|
|
||||||
|
# TODO: We NEED to include diffs at ends. Yes, they may not be proper mutations
|
||||||
|
# but we need it to reconstruct the actual seqs
|
||||||
|
function variant(seq::T, ref::T) where {T <: LongSequence{<:Union{AminoAcidAlphabet, NucleicAcidAlphabet}}}
|
||||||
|
return variant(pairalign(OverlapAlignment(), seq, ref, model(seq)).aln)
|
||||||
end
|
end
|
||||||
|
|
||||||
@case Edit function check((seq,)::Insertion{S, T}) where {S, T}
|
function variant(aln::PairwiseAlignment{T, T}) where {T <: LongSequence{<:Union{AminoAcidAlphabet, NucleicAcidAlphabet}}}
|
||||||
(ref, pos) -> begin
|
ref = aln.b
|
||||||
# We can have insertions at the very end, after the reference sequence
|
E = eltype(typeof(ref))
|
||||||
pos == lastindex(ref) + 1 && return nothing
|
edits = Edit{T, E}[]
|
||||||
checkbounds(ref, pos)
|
result = Variant(ref, edits)
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
@case Edit function check((len,)::Deletion{S, T}) where {S, T}
|
|
||||||
(ref, pos) -> begin
|
|
||||||
checkbounds(ref, pos:(pos+len)-1)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
@assert SumTypes.iscomplete(check, Edit)
|
|
||||||
|
|
||||||
function check(x::Variation)
|
|
||||||
check(x.diff.edit)(x.ref, x.diff.pos)
|
|
||||||
end
|
|
||||||
|
|
||||||
@case Edit _show((x,)::Substitution) = (io, pos) -> print(io, pos, x)
|
|
||||||
@case Edit _show((len,)::Deletion) = (io, pos) -> print(io, 'Δ', pos, '-', pos + len - 1)
|
|
||||||
@case Edit _show((seq,)::Insertion) = (io, pos) -> print(io, pos, seq)
|
|
||||||
|
|
||||||
@assert SumTypes.iscomplete(_show, Edit)
|
|
||||||
|
|
||||||
Base.show(io::IO, x::Diff) = _show(x.edit)(io, x.pos)
|
|
||||||
|
|
||||||
function Base.show(io::IO, x::Variation)
|
|
||||||
if x.diff.edit.data isa Substitution
|
|
||||||
print(io, x.ref[x.diff.pos])
|
|
||||||
end
|
|
||||||
show(io, x.diff)
|
|
||||||
end
|
|
||||||
|
|
||||||
Base.:(==)(x::T, y::T) where {T <: Variation} = (x.ref === y.ref) & (x.diff == y.diff)
|
|
||||||
|
|
||||||
function variant(seq::LongAminoAcidSeq, ref::LongAminoAcidSeq)
|
|
||||||
aln = pairalign(OverlapAlignment(), seq, ref, ALN_MODEL).aln
|
|
||||||
diffs = Diff{LongAminoAcidSeq, AminoAcid}[]
|
|
||||||
result = Variant(ref, diffs)
|
|
||||||
refpos = seqpos = 0
|
refpos = seqpos = 0
|
||||||
markpos = 0
|
markpos = 0
|
||||||
n_gaps = n_ins = 0
|
n_gaps = n_ins = 0
|
||||||
insertion_buffer = AminoAcid[]
|
insertion_buffer = E[]
|
||||||
for (seqi, refi) in aln
|
for (seqi, refi) in aln
|
||||||
isgap(refi) || (refpos += 1)
|
isgap(refi) || (refpos += 1)
|
||||||
isgap(seqi) || (seqpos += 1)
|
isgap(seqi) || (seqpos += 1)
|
||||||
|
|
||||||
# Check for deletions
|
# Check for deletions
|
||||||
if isgap(seqi)
|
if isgap(seqi)
|
||||||
iszero(seqpos) && continue # skip indels at start
|
|
||||||
iszero(n_gaps) && (markpos = refpos)
|
iszero(n_gaps) && (markpos = refpos)
|
||||||
n_gaps += 1
|
n_gaps += 1
|
||||||
else
|
else
|
||||||
if !iszero(n_gaps)
|
if !iszero(n_gaps)
|
||||||
push!(diffs, Diff(UInt(markpos), Deletion{LongAminoAcidSeq, AminoAcid}(UInt(n_gaps))))
|
push!(edits, Edit{T, E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
||||||
n_gaps = 0
|
n_gaps = 0
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
# Check for insertions
|
# Check for insertions
|
||||||
if isgap(refi)
|
if isgap(refi)
|
||||||
iszero(refpos) && continue # skip indels at start
|
|
||||||
iszero(n_ins) && (markpos = refpos + 1)
|
iszero(n_ins) && (markpos = refpos + 1)
|
||||||
push!(insertion_buffer, seqi)
|
push!(insertion_buffer, seqi)
|
||||||
n_ins += 1
|
n_ins += 1
|
||||||
else
|
else
|
||||||
if !iszero(n_ins)
|
if !iszero(n_ins)
|
||||||
seq = LongAminoAcidSeq(insertion_buffer)
|
seq = T(insertion_buffer)
|
||||||
push!(diffs, Diff(UInt(markpos), Insertion(seq)))
|
push!(edits, Edit{T, E}(Insertion(seq), UInt(markpos)))
|
||||||
empty!(insertion_buffer)
|
empty!(insertion_buffer)
|
||||||
n_ins = 0
|
n_ins = 0
|
||||||
end
|
end
|
||||||
|
@ -225,36 +266,44 @@ function variant(seq::LongAminoAcidSeq, ref::LongAminoAcidSeq)
|
||||||
|
|
||||||
# Substitutions
|
# Substitutions
|
||||||
if !isgap(refi) && !isgap(seqi) && seqi != refi
|
if !isgap(refi) && !isgap(seqi) && seqi != refi
|
||||||
push!(diffs, Diff(UInt(refpos), Substitution{LongAminoAcidSeq, AminoAcid}(seqi)))
|
push!(edits, Edit{T, E}(Substitution{E}(seqi), UInt(refpos)))
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
# At the end of the loop?
|
|
||||||
|
# Final indel, if applicable
|
||||||
|
if !iszero(n_gaps)
|
||||||
|
push!(edits, Edit{T, E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
||||||
|
elseif !iszero(n_ins)
|
||||||
|
push!(edits, Edit{T, E}(Insertion(T(insertion_buffer)), UInt(markpos)))
|
||||||
|
end
|
||||||
|
|
||||||
return result
|
return result
|
||||||
end
|
end
|
||||||
|
|
||||||
@case Edit lendiff((x,)::Substitution) = 0
|
function lendiff(edit::Edit)
|
||||||
@case Edit lendiff((len,)::Deletion) = -(len % Int)
|
x = edit.x
|
||||||
@case Edit lendiff((seq,)::Insertion) = length(seq) % Int
|
x isa Substitution ? 0 : (x isa Deletion ? -length(x) : length(x.x))
|
||||||
|
end
|
||||||
|
|
||||||
function reconstruct!(seq::S, x::Variant{S}) where S
|
function reconstruct!(seq::S, x::Variant{S}) where S
|
||||||
sort!(x.diffs, by=y -> y.pos)
|
len = length(x.ref) + sum(edit -> lendiff(edit), x.edits)
|
||||||
len = length(x.ref) + sum(diff -> lendiff(diff.edit), x.diffs)
|
|
||||||
resize!(seq, len % UInt)
|
resize!(seq, len % UInt)
|
||||||
refpos = seqpos = 1
|
refpos = seqpos = 1
|
||||||
for diff in x.diffs
|
for edit in x.edits
|
||||||
while refpos < diff.pos
|
while refpos < edit.pos
|
||||||
seq[seqpos] = x.ref[refpos]
|
seq[seqpos] = x.ref[refpos]
|
||||||
refpos += 1
|
refpos += 1
|
||||||
seqpos += 1
|
seqpos += 1
|
||||||
end
|
end
|
||||||
if diff.edit.data isa Substitution
|
editx = edit.x
|
||||||
seq[seqpos] = diff.edit.data._1
|
if editx isa Substitution
|
||||||
|
seq[seqpos] = editx.x
|
||||||
seqpos += 1
|
seqpos += 1
|
||||||
refpos += 1
|
refpos += 1
|
||||||
elseif diff.edit.data isa Deletion
|
elseif editx isa Deletion
|
||||||
refpos += diff.edit.data._1
|
refpos += editx.len
|
||||||
elseif diff.edit.data isa Insertion
|
elseif editx isa Insertion
|
||||||
for i in diff.edit.data._1
|
for i in editx.x
|
||||||
seq[seqpos] = i
|
seq[seqpos] = i
|
||||||
seqpos += 1
|
seqpos += 1
|
||||||
end
|
end
|
||||||
|
@ -268,194 +317,21 @@ function reconstruct!(seq::S, x::Variant{S}) where S
|
||||||
seq
|
seq
|
||||||
end
|
end
|
||||||
|
|
||||||
"""
|
struct Variation{S <: BioSequence, T <: BioSymbol}
|
||||||
reconstruct(x::Variant{S})
|
ref::S
|
||||||
|
edit::Edit{S, T}
|
||||||
|
end
|
||||||
|
|
||||||
Reconstruct the sequence of type `S` that created the variant. It is assumed the
|
function Base.in(v::Variation, var::Variant)
|
||||||
variant is well-formed, e.g. no substitutions in deleted sequences, or
|
if v.ref != var.ref
|
||||||
deletions/insertions of the same area multiple times.
|
error("References must be equal")
|
||||||
"""
|
end
|
||||||
reconstruct(x::Variant{S}) where S = reconstruct!(S(), x)
|
any(v.edit == edit for edit in var.edits)
|
||||||
|
end
|
||||||
|
|
||||||
export Substitution, Insertion, Deletion, Diff, Variant, Variation
|
# How to check if a Variation is present in a sequence?
|
||||||
|
function hasvariation(seq::S, var::Variation{S, T}) where {S, T}
|
||||||
|
var in variant(seq, var.ref)
|
||||||
|
end
|
||||||
|
|
||||||
end # module
|
end # module
|
||||||
|
|
||||||
#=
|
|
||||||
|
|
||||||
function variations(ref::S, refaln::S, seqaln::S) where {S <: BioSequence}
|
|
||||||
aln = AlignedSequence(seqaln, refaln)
|
|
||||||
return variations(ref, refaln, seqaln, aln.aln.anchors)
|
|
||||||
end
|
|
||||||
|
|
||||||
"Calculate all substitutions between two sequences"
|
|
||||||
function substitutions(ref::S, seq::S) where {S <: BioSequence}
|
|
||||||
if length(ref) != length(seq)
|
|
||||||
throw(DimensionMismatch("Sequences must match in length"))
|
|
||||||
end
|
|
||||||
if !(count(isgap, ref) == count(isgap, seq) == 0)
|
|
||||||
throw(ArgumentError("Neither sequence can contain gaps"))
|
|
||||||
end
|
|
||||||
diffs = Diff{Substitution{eltype(S)}}[]
|
|
||||||
@inbounds for i in eachindex(ref)
|
|
||||||
refi, seqi = ref[i], seq[i]
|
|
||||||
if refi != seqi
|
|
||||||
push!(diffs, Diff(i, Substitution(seqi)))
|
|
||||||
end
|
|
||||||
end
|
|
||||||
return Variant(ref, diffs)
|
|
||||||
end
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#=
|
|
||||||
function variations(ref::S, refaln::S, seqaln::S, anchors::Vector{AlignmentAnchor}) where {S <: BioSequence{A}} where A
|
|
||||||
result = Variant(dna"TAG", Diff[])
|
|
||||||
diffs = result.diffs
|
|
||||||
isempty(anchors) && return result
|
|
||||||
firstop = first(anchors)
|
|
||||||
|
|
||||||
if (firstop.op !== OP_START) || (firstop.refpos != 0 | firstop.seqpos != 0)
|
|
||||||
throw(ArgumentError("Alignment must begin with OP_START at positions (0, 0)"))
|
|
||||||
end
|
|
||||||
# refpos and seqpos refer to the ungapped sequences. seqaln refer to the gapped one
|
|
||||||
refpos, seqpos, seqalnpos, seqalnend = 1, 1, 1, 1
|
|
||||||
for anchor in @view anchors[2:end]
|
|
||||||
seqend, refend = anchor.seqpos, anchor.refpos
|
|
||||||
seqalnend = seqalnpos + max(seqend - seqpos, refend - refpos)
|
|
||||||
|
|
||||||
# For mismatches, we add one Variation per mismatch
|
|
||||||
if anchor.op == OP_SEQ_MISMATCH
|
|
||||||
p = seqalnpos
|
|
||||||
for pos in refpos:refend
|
|
||||||
push!(diffs, Diff(pos, Substitution{eltype(A)}(seqaln[p])))
|
|
||||||
p += 1
|
|
||||||
end
|
|
||||||
|
|
||||||
# Deletions are trivial
|
|
||||||
elseif anchor.op == OP_DELETE
|
|
||||||
len = refend - refpos + 1
|
|
||||||
push!(diffs, Diff(refpos, Deletion(len)))
|
|
||||||
|
|
||||||
# Insertions are a little more tricky
|
|
||||||
elseif anchor.op == OP_INSERT
|
|
||||||
push!(diffs, Diff(refend + 1, Insertion{S}(seqaln[seqalnpos:seqalnend])))
|
|
||||||
|
|
||||||
elseif anchor.op == OP_SEQ_MATCH
|
|
||||||
nothing
|
|
||||||
else
|
|
||||||
throw(ArgumentError("Cannot create Variation from operation $(anchor.op)"))
|
|
||||||
end
|
|
||||||
refpos, seqpos, seqalnpos = refend + 1, seqend + 1, seqalnend + 1
|
|
||||||
end
|
|
||||||
|
|
||||||
return result
|
|
||||||
end
|
|
||||||
|
|
||||||
|
|
||||||
function posmap(gap, oldref::BioSequence, newref::BioSequence)
|
|
||||||
if length(oldref) != length(newref)
|
|
||||||
throw(ArgumentError("Sequences must be equal-length and aligned"))
|
|
||||||
end
|
|
||||||
result = Vector{Union{Int, Nothing}}(undef, length(oldref))
|
|
||||||
oldpos, newpos = 0, 0
|
|
||||||
for (o, n) in zip(oldref, newref)
|
|
||||||
oldpos += o != gap
|
|
||||||
newpos += n != gap
|
|
||||||
o != gap && (result[oldpos] = n == gap ? nothing : newpos)
|
|
||||||
end
|
|
||||||
return resize!(result, oldpos)
|
|
||||||
end
|
|
||||||
|
|
||||||
"""
|
|
||||||
posmap(oldref::S, newref::S)
|
|
||||||
|
|
||||||
Given two aligned sequences, creates a vector `v` such that `v[o] = n`, where `o` is
|
|
||||||
a position in the old (ungapped) sequence and `n` is a position in the new ungapped sequence.
|
|
||||||
If `o` maps to a gap in the new sequence, `v[o] === nothing`.
|
|
||||||
"""
|
|
||||||
posmap(oldref::S, newref::S) where {A, S <: BioSequence{A}} = posmap(gap(eltype(A)), oldref, newref)
|
|
||||||
|
|
||||||
|
|
||||||
function rereference(diff::Diff{<:Substitution}, posmap)
|
|
||||||
newpos = posmap[diff.pos]
|
|
||||||
newpos === nothing && throw(ArgumentError("Position $(diff.pos) maps to a gap in reference"))
|
|
||||||
return typeof(diff)(newpos::Int, diff.edit)
|
|
||||||
end
|
|
||||||
|
|
||||||
|
|
||||||
function checkflanks(posmap, pos::Int, flank::Int)
|
|
||||||
for i in max(1, pos - flank) : min(length(posmap), pos + flank)
|
|
||||||
posmap[i] === nothing && throw(ArgumentError("Flanking position $i maps to a deletion"))
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
function rereference(diff::Diff{Deletion}, posmap, flank::Int=5)
|
|
||||||
checkbounds(posmap, diff.pos:diff.pos + diff.edit.len - 1)
|
|
||||||
checkflanks(posmap, diff.pos, flank)
|
|
||||||
return typeof(diff)(posmap[diff.pos], diff.edit.len)
|
|
||||||
end
|
|
||||||
|
|
||||||
function rereference(diff::Diff{<:Insertion}, posmap, flank::Int=5)
|
|
||||||
checkflanks(posmap, diff.pos, flank)
|
|
||||||
return typeof(diff)(posmap[diff.pos], diff.edit)
|
|
||||||
end
|
|
||||||
|
|
||||||
|
|
||||||
# Assumes same reference, and that they are not mutally exclusive
|
|
||||||
# (e.g no substitution in deleted areas)
|
|
||||||
function reconstruct(v::Variant)
|
|
||||||
isempty(v.diffs) && return copy(v.ref)
|
|
||||||
diffs = sort(v.diffs, by=x -> x.pos)
|
|
||||||
|
|
||||||
# First, get length of result
|
|
||||||
len::Int = length(v.ref)
|
|
||||||
for diff in diffs
|
|
||||||
if diff.edit isa Deletion
|
|
||||||
len = len - diff.edit.len
|
|
||||||
elseif diff.edit isa Insertion
|
|
||||||
len = len + length(diff.edit.seq)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
# Fill in
|
|
||||||
dst = typeof(v.ref)(len)
|
|
||||||
src, diffi, srci::Int, dsti = v.ref, 1, 1, 1
|
|
||||||
diff = diffs[diffi]
|
|
||||||
while dsti ≤ len
|
|
||||||
while (srci < diff.pos) & (dsti ≤ len)
|
|
||||||
dst[dsti] = src[srci]
|
|
||||||
srci, dsti = srci + 1, dsti + 1
|
|
||||||
end
|
|
||||||
diff === nothing && break
|
|
||||||
if diff.edit isa Substitution
|
|
||||||
dst[dsti] = diff.edit.symbol
|
|
||||||
srci, dsti = srci + 1, dsti + 1
|
|
||||||
elseif diff.edit isa Insertion
|
|
||||||
for i in 1:(length(diff.edit.seq)::Int)
|
|
||||||
dst[dsti] = diff.edit.seq[i]
|
|
||||||
dsti += 1
|
|
||||||
end
|
|
||||||
elseif diff.edit isa Deletion
|
|
||||||
srci += diff.edit.len
|
|
||||||
end
|
|
||||||
diffi += 1
|
|
||||||
diff = diffi > length(diffs) ? nothing : diffs[diffi]
|
|
||||||
end
|
|
||||||
return dst
|
|
||||||
end
|
|
||||||
=#
|
|
||||||
|
|
||||||
export Diff,
|
|
||||||
Edit,
|
|
||||||
Variation,
|
|
||||||
Variant,
|
|
||||||
Substitution,
|
|
||||||
Insertion,
|
|
||||||
Deletion,
|
|
||||||
variations,
|
|
||||||
substitutions
|
|
||||||
|
|
||||||
end # module
|
|
||||||
|
|
||||||
=#
|
|
|
@ -1,56 +0,0 @@
|
||||||
const DELETION_PATTERN = r"^Δ(\d+)(?:-(\d+))?$"
|
|
||||||
const INSERTION_PATTERN = r"^(\d+)([A-Z]+)$"
|
|
||||||
const SUBSTITUTION_PATTERN = r"^([A-Z])(\d+)([A-Z])$"
|
|
||||||
const MULTI_SUBST_PATTERN = r"^([A-Z](?:/[A-Z])*)(\d+)([A-Z](?:/[A-Z])*)$"
|
|
||||||
|
|
||||||
function Base.parse(::Type{<:SeqVar{A}}, ref::LongSequence{A}, st::AbstractString) where A
|
|
||||||
m = match(DELETION_PATTERN, st)
|
|
||||||
if m !== nothing
|
|
||||||
return parse(SeqVar{A, Deletion}, ref, m)
|
|
||||||
end
|
|
||||||
m = match(SUBSTITUTION_PATTERN, st)
|
|
||||||
if m !== nothing
|
|
||||||
return parse(SeqVar{A, Substitution{eltype(A)}}, ref, m)
|
|
||||||
end
|
|
||||||
m = match(INSERTION_PATTERN, st)
|
|
||||||
if m !== nothing
|
|
||||||
return parse(SeqVar{A, Insertion{A}}, ref, m)
|
|
||||||
else
|
|
||||||
throw(ArgumentError("Cannot parse $st as Variation"))
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
function Base.parse(::Type{SeqVar{A, Deletion}}, ref::LongSequence{A}, m::RegexMatch) where A
|
|
||||||
pos = parse(Int, m[1])
|
|
||||||
len = m === nothing ? 1 : parse(Int, m[2]) - pos + 1
|
|
||||||
return SeqVar(ref, pos, Deletion(len))
|
|
||||||
end
|
|
||||||
|
|
||||||
function Base.parse(::Type{<:SeqVar{A, Substitution{T}}}, ref::LongSequence{A}, m::RegexMatch) where {A,T}
|
|
||||||
if T !== eltype(A)
|
|
||||||
throw(ArgumentError("Substitution type must be alphabet eltype"))
|
|
||||||
end
|
|
||||||
refst, pos, altst = m[1], m[2], m[3]
|
|
||||||
pos = parse(Int, pos)
|
|
||||||
refT = T(first(refst))
|
|
||||||
altT = T(first(altst))
|
|
||||||
return SeqVar(ref, pos, Substitution{T}(altT))
|
|
||||||
end
|
|
||||||
|
|
||||||
function Base.parse(::Type{<:SeqVar{A, Insertion{A}}}, ref::LongSequence{A}, m::RegexMatch) where A
|
|
||||||
pos = parse(Int, m[1])
|
|
||||||
# We accept an insertion immediately after the reference, right?
|
|
||||||
checkbounds(ref, pos - (pos == length(ref)))
|
|
||||||
seq = LongSequence{A}(m[2])
|
|
||||||
return SeqVar(ref, pos, Insertion{A}(seq))
|
|
||||||
end
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
totext(m::SeqVar, seqid::IdDict{<:LongSequence, AbstractString}) = string(seqid[m.ref], '\t', m)
|
|
||||||
|
|
||||||
function fromtext(st::AbstractString, seqid::Dict{String, LongSequence{A}}) where A
|
|
||||||
name, varstr = split(st, '\t', limit=2)
|
|
||||||
ref = seqid[name]
|
|
||||||
return parse(SeqVar{A}, ref, varstr)
|
|
||||||
end
|
|
Loading…
Reference in a new issue