mirror of
https://github.com/MillironX/SequenceVariation.jl.git
synced 2024-12-23 08:48:17 +00:00
Switch to SumTypes
This commit is contained in:
parent
40749eb6f5
commit
354f18e0e4
3 changed files with 229 additions and 89 deletions
|
@ -1,5 +1,11 @@
|
|||
# This file is machine-generated - editing it directly is not advised
|
||||
|
||||
[[ArgTools]]
|
||||
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
|
||||
|
||||
[[Artifacts]]
|
||||
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
|
||||
|
||||
[[Automa]]
|
||||
deps = ["DataStructures", "Printf", "Random", "Test", "TranscodingStreams"]
|
||||
git-tree-sha1 = "c81526bf5f6fb4616b4e22a3cd62ac20e255fd3c"
|
||||
|
@ -62,6 +68,10 @@ uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
|
|||
deps = ["Random", "Serialization", "Sockets"]
|
||||
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
|
||||
|
||||
[[Downloads]]
|
||||
deps = ["ArgTools", "LibCURL", "NetworkOptions"]
|
||||
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
|
||||
|
||||
[[IndexableBitVectors]]
|
||||
deps = ["Random", "Test"]
|
||||
git-tree-sha1 = "b7f5e42dc867b8a8654a5f899064632dac05bc82"
|
||||
|
@ -78,10 +88,22 @@ git-tree-sha1 = "6c9fcd87677231ae293f6806fad928c216ab6658"
|
|||
uuid = "524e6230-43b7-53ae-be76-1e9e4d08d11b"
|
||||
version = "1.0.0"
|
||||
|
||||
[[LibCURL]]
|
||||
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
|
||||
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
|
||||
|
||||
[[LibCURL_jll]]
|
||||
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
|
||||
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
|
||||
|
||||
[[LibGit2]]
|
||||
deps = ["Printf"]
|
||||
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
|
||||
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
|
||||
|
||||
[[LibSSH2_jll]]
|
||||
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
|
||||
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
|
||||
|
||||
[[Libdl]]
|
||||
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
|
||||
|
||||
|
@ -92,20 +114,36 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
|
|||
[[Logging]]
|
||||
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
|
||||
|
||||
[[MacroTools]]
|
||||
deps = ["Markdown", "Random"]
|
||||
git-tree-sha1 = "6a8a2a625ab0dea913aba95c11370589e0239ff0"
|
||||
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
|
||||
version = "0.5.6"
|
||||
|
||||
[[Markdown]]
|
||||
deps = ["Base64"]
|
||||
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
|
||||
|
||||
[[MbedTLS_jll]]
|
||||
deps = ["Artifacts", "Libdl"]
|
||||
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
|
||||
|
||||
[[Mmap]]
|
||||
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
|
||||
|
||||
[[MozillaCACerts_jll]]
|
||||
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
|
||||
|
||||
[[NetworkOptions]]
|
||||
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
|
||||
|
||||
[[OrderedCollections]]
|
||||
git-tree-sha1 = "16c08bf5dba06609fe45e30860092d6fa41fde7b"
|
||||
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
|
||||
version = "1.3.1"
|
||||
|
||||
[[Pkg]]
|
||||
deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
|
||||
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs"]
|
||||
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
|
||||
|
||||
[[Printf]]
|
||||
|
@ -117,7 +155,7 @@ deps = ["Printf"]
|
|||
uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
|
||||
|
||||
[[REPL]]
|
||||
deps = ["InteractiveUtils", "Markdown", "Sockets"]
|
||||
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
|
||||
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
|
||||
|
||||
[[Random]]
|
||||
|
@ -151,8 +189,22 @@ version = "0.1.2"
|
|||
deps = ["LinearAlgebra", "SparseArrays"]
|
||||
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
|
||||
|
||||
[[SumTypes]]
|
||||
deps = ["MacroTools"]
|
||||
git-tree-sha1 = "be127ca4ba297c1574457baa0ac3092b0054395e"
|
||||
uuid = "8e1ec7a9-0e02-4297-b0fe-6433085c89f2"
|
||||
version = "0.1.1"
|
||||
|
||||
[[TOML]]
|
||||
deps = ["Dates"]
|
||||
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
|
||||
|
||||
[[Tar]]
|
||||
deps = ["ArgTools", "SHA"]
|
||||
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
|
||||
|
||||
[[Test]]
|
||||
deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
|
||||
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
|
||||
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
|
||||
|
||||
[[TranscodingStreams]]
|
||||
|
@ -172,3 +224,11 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
|
|||
|
||||
[[Unicode]]
|
||||
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
|
||||
|
||||
[[Zlib_jll]]
|
||||
deps = ["Libdl"]
|
||||
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
|
||||
|
||||
[[nghttp2_jll]]
|
||||
deps = ["Artifacts", "Libdl"]
|
||||
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
|
||||
|
|
|
@ -7,3 +7,4 @@ version = "0.1.0"
|
|||
BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"
|
||||
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
|
||||
BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
|
||||
SumTypes = "8e1ec7a9-0e02-4297-b0fe-6433085c89f2"
|
||||
|
|
|
@ -23,138 +23,235 @@ Call Variations
|
|||
|
||||
using BioSequences
|
||||
using BioAlignments
|
||||
import BioAlignments: OP_START, OP_SEQ_MATCH, OP_SEQ_MISMATCH, OP_INSERT, OP_DELETE
|
||||
import BioSymbols: BioSymbol
|
||||
using SumTypes
|
||||
|
||||
const ALN_MODEL = AffineGapScoreModel(BLOSUM62, gap_open=-12, gap_extend=-2)
|
||||
|
||||
@sum_type Edit{S, T}
|
||||
@sum_type Edit{S, T} begin
|
||||
Substitution{S, T}(::T)
|
||||
Deletion{S, T}(::UInt)
|
||||
Insertion{S, T}(::S)
|
||||
end
|
||||
|
||||
"""
|
||||
Edit
|
||||
|
||||
Abstract type representing a type of nucleotide edit: Deletion, insertion or
|
||||
substitution.
|
||||
"""
|
||||
abstract type Edit end
|
||||
|
||||
"""
|
||||
Substitution{T <: BioSymbol} <: Edit
|
||||
Substitution
|
||||
|
||||
Represents the presence of a `T` at a given position. The position is stored
|
||||
outside this struct.
|
||||
"""
|
||||
struct Substitution{T <: BioSymbol} <: Edit
|
||||
symbol::T
|
||||
end
|
||||
Substitution
|
||||
|
||||
Base.eltype(::Type{<:Substitution{T}}) where T = T
|
||||
Base.eltype(::Type{<:Substitution{S, T}}) where {S, T} = T
|
||||
|
||||
"""
|
||||
Deletion <: Edit
|
||||
Deletion
|
||||
|
||||
Represents the deletion of N symbols. The location of the deletion is stored
|
||||
outside this struct
|
||||
"""
|
||||
struct Deletion <: Edit
|
||||
len::UInt
|
||||
end
|
||||
Deletion
|
||||
|
||||
"""
|
||||
Insertion{S <: BioSequence} <: Edit
|
||||
Insertion
|
||||
|
||||
Represents the insertion of a `T` into a sequence. The location of the insertion
|
||||
is stored outside the struct.
|
||||
"""
|
||||
struct Insertion{S <: BioSequence} <: Edit
|
||||
seq::S
|
||||
Insertion
|
||||
|
||||
function Insertion(s::BioSequence)
|
||||
length(s) == 0 && throw(ArgumentError("Insertions cannot be length 0"))
|
||||
Insertion{typeof(s), eltype(s)}(s)
|
||||
end
|
||||
|
||||
Base.:(==)(x::Insertion{A}, y::Insertion{A}) where A = x.seq == y.seq
|
||||
Base.:(==)(x::Insertion, y::Insertion) = x.seq == y.seq
|
||||
|
||||
"""
|
||||
Diff{E <: Edit}
|
||||
|
||||
Represents an `Edit` og type `E` at a given position.
|
||||
"""
|
||||
struct Diff{E <: Edit}
|
||||
struct Diff{S <: BioSequence, T <: BioSymbol}
|
||||
pos::UInt
|
||||
edit::E
|
||||
edit::Edit{S, T}
|
||||
end
|
||||
|
||||
Diff(pos::Integer, edit::Edit) = Diff{typeof(edit)}(pos, edit)
|
||||
|
||||
"""
|
||||
Variant{S <: BioSequence, D <: Diff}
|
||||
Variant{S <: BioSequence, T <: BioSymbol}
|
||||
|
||||
Represents a series of diffs of type `D` against a reference of type `S`.
|
||||
See also `Variation`
|
||||
Represents a series of diffs of type `Diff{S, T}` against a reference of type `S`.
|
||||
See also `Variation`.
|
||||
"""
|
||||
struct Variant{S <: BioSequence, D <: Diff}
|
||||
struct Variant{S <: BioSequence, T <: BioSymbol}
|
||||
ref::S
|
||||
diffs::Vector{D}
|
||||
diffs::Vector{Diff{S, T}}
|
||||
end
|
||||
|
||||
#function Variant(ref::BioSequence, diffs::Vector{D}) where D <: Diff
|
||||
# return Variant{typeof(ref), D}(ref, diffs)
|
||||
#end
|
||||
|
||||
function Base.show(io::IO, ::MIME"text/plain", x::Variant{S,D}) where {S,D}
|
||||
function Base.show(io::IO, ::MIME"text/plain", x::Variant{S,T}) where {S,T}
|
||||
cols = displaysize(io)[2] - 3
|
||||
recur_io = IOContext(io, :SHOWN_SET => x.diffs)
|
||||
print(io, summary(x), ":")
|
||||
for i in x.diffs
|
||||
v = Variation{S, typeof(i)}(x.ref, i)
|
||||
v = Variation{S, eltype(S)}(x.ref, i)
|
||||
str = sprint(show, v, context=recur_io, sizehint=0)
|
||||
print(io, "\n ", Base._truncate_at_width_or_chars(str, cols, "\r\n"))
|
||||
end
|
||||
end
|
||||
|
||||
#Variant(s::BioSequence, v::Vector{<:Diff}) = Variant{typeof(s),eltype(v)}(s,v)
|
||||
|
||||
"""
|
||||
Variation{S <: BioSequence, D <: Diff}
|
||||
Variation{S <: BioSequence, T <: BioSymbol}
|
||||
|
||||
Represent a single diff of type `D` is a sequence of type `S`. See also `Variant`
|
||||
Represent a single diff against a biosequence. See also `Variant`
|
||||
"""
|
||||
struct Variation{S <: BioSequence, D <: Diff}
|
||||
struct Variation{S <: BioSequence, T <: BioSymbol}
|
||||
ref::S
|
||||
diff::D
|
||||
diff::Diff{S, T}
|
||||
end
|
||||
|
||||
#Variation(ref::BioSequence, diff::Diff) = Variation{typeof(ref), typeof(diffs)}(ref, diff)
|
||||
|
||||
function check(v::Variation{S, <:Substitution{T}}) where {S, T}
|
||||
T == eltype(S) || throw(TypeError(:check, "", eltype(S), T))
|
||||
checkbounds(v.ref, v.diff.pos)
|
||||
@case Edit function check((x,)::Substitution{S, T}) where {S, T}
|
||||
(ref, pos) -> begin
|
||||
eltype(ref) == T || throw(TypeError(:check, "", eltype(ref), T))
|
||||
checkbounds(ref, pos)
|
||||
end
|
||||
end
|
||||
|
||||
check(v::Variation{S, Diff{Deletion}}) where S = checkbounds(v.ref, v.diff.pos:(v.diff.pos+v.diff.edit.len)-1)
|
||||
|
||||
function check(v::Variation{S, <:Diff{<:Insertion}}) where S
|
||||
length(v.diff.edit.seq) > 0 || throw(ArgumentError("Insertions cannot be length 0"))
|
||||
# We can have insertions at the very end, after the reference sequence
|
||||
v.diff.pos == lastindex(v.ref) + 1 && return nothing
|
||||
checkbounds(v.ref, v.diff.pos)
|
||||
@case Edit function check((seq,)::Insertion{S, T}) where {S, T}
|
||||
(ref, pos) -> begin
|
||||
# We can have insertions at the very end, after the reference sequence
|
||||
pos == lastindex(ref) + 1 && return nothing
|
||||
checkbounds(ref, pos)
|
||||
end
|
||||
end
|
||||
|
||||
Base.show(io::IO, x::Diff{<:Substitution}) = print(io, x.pos, x.edit.symbol)
|
||||
Base.show(io::IO, x::Diff{Deletion}) = print(io, 'Δ', x.pos, '-', x.pos + x.edit.len - 1)
|
||||
Base.show(io::IO, x::Diff{<:Insertion}) = print(io, x.pos, x.edit.seq)
|
||||
|
||||
function Base.show(io::IO, x::Variation{S, <:Diff{<:Substitution}}) where S
|
||||
print(io, x.ref[x.diff.pos], x.diff.pos, x.diff.edit.symbol)
|
||||
@case Edit function check((len,)::Deletion{S, T}) where {S, T}
|
||||
(ref, pos) -> begin
|
||||
checkbounds(ref, pos:(pos+len)-1)
|
||||
end
|
||||
end
|
||||
|
||||
Base.show(io::IO, x::Variation{S, Diff{Deletion}}) where S = show(io, x.diff)
|
||||
Base.show(io::IO, x::Variation{S, <:Diff{<:Insertion}} where S) = show(io, x.diff)
|
||||
@assert SumTypes.iscomplete(check, Edit)
|
||||
|
||||
function check(x::Variation)
|
||||
check(x.diff.edit)(x.ref, x.diff.pos)
|
||||
end
|
||||
|
||||
@case Edit _show((x,)::Substitution) = (io, pos) -> print(io, pos, x)
|
||||
@case Edit _show((len,)::Deletion) = (io, pos) -> print(io, 'Δ', pos, '-', pos + len - 1)
|
||||
@case Edit _show((seq,)::Insertion) = (io, pos) -> print(io, pos, seq)
|
||||
|
||||
@assert SumTypes.iscomplete(_show, Edit)
|
||||
|
||||
Base.show(io::IO, x::Diff) = _show(x.edit)(io, x.pos)
|
||||
|
||||
function Base.show(io::IO, x::Variation)
|
||||
if x.diff.edit.data isa Substitution
|
||||
print(io, x.ref[x.diff.pos])
|
||||
end
|
||||
show(io, x.diff)
|
||||
end
|
||||
|
||||
Base.:(==)(x::T, y::T) where {T <: Variation} = (x.ref === y.ref) & (x.diff == y.diff)
|
||||
|
||||
function variant(seq::LongAminoAcidSeq, ref::LongAminoAcidSeq)
|
||||
aln = pairalign(OverlapAlignment(), seq, ref, ALN_MODEL).aln
|
||||
diffs = Diff{LongAminoAcidSeq, AminoAcid}[]
|
||||
result = Variant(ref, diffs)
|
||||
refpos = seqpos = 0
|
||||
markpos = 0
|
||||
n_gaps = n_ins = 0
|
||||
insertion_buffer = AminoAcid[]
|
||||
for (seqi, refi) in aln
|
||||
isgap(refi) || (refpos += 1)
|
||||
isgap(seqi) || (seqpos += 1)
|
||||
|
||||
# Check for deletions
|
||||
if isgap(seqi)
|
||||
iszero(seqpos) && continue # skip indels at start
|
||||
iszero(n_gaps) && (markpos = refpos)
|
||||
n_gaps += 1
|
||||
else
|
||||
if !iszero(n_gaps)
|
||||
push!(diffs, Diff(UInt(markpos), Deletion{LongAminoAcidSeq, AminoAcid}(UInt(n_gaps))))
|
||||
n_gaps = 0
|
||||
end
|
||||
end
|
||||
|
||||
# Check for insertions
|
||||
if isgap(refi)
|
||||
iszero(refpos) && continue # skip indels at start
|
||||
iszero(n_ins) && (markpos = refpos + 1)
|
||||
push!(insertion_buffer, seqi)
|
||||
n_ins += 1
|
||||
else
|
||||
if !iszero(n_ins)
|
||||
seq = LongAminoAcidSeq(insertion_buffer)
|
||||
push!(diffs, Diff(UInt(markpos), Insertion(seq)))
|
||||
empty!(insertion_buffer)
|
||||
n_ins = 0
|
||||
end
|
||||
end
|
||||
|
||||
# Substitutions
|
||||
if !isgap(refi) && !isgap(seqi) && seqi != refi
|
||||
push!(diffs, Diff(UInt(refpos), Substitution{LongAminoAcidSeq, AminoAcid}(seqi)))
|
||||
end
|
||||
end
|
||||
# At the end of the loop?
|
||||
return result
|
||||
end
|
||||
|
||||
@case Edit lendiff((x,)::Substitution) = 0
|
||||
@case Edit lendiff((len,)::Deletion) = -(len % Int)
|
||||
@case Edit lendiff((seq,)::Insertion) = length(seq) % Int
|
||||
|
||||
function reconstruct!(seq::S, x::Variant{S}) where S
|
||||
sort!(x.diffs, by=y -> y.pos)
|
||||
len = length(x.ref) + sum(diff -> lendiff(diff.edit), x.diffs)
|
||||
resize!(seq, len % UInt)
|
||||
refpos = seqpos = 1
|
||||
for diff in x.diffs
|
||||
while refpos < diff.pos
|
||||
seq[seqpos] = x.ref[refpos]
|
||||
refpos += 1
|
||||
seqpos += 1
|
||||
end
|
||||
if diff.edit.data isa Substitution
|
||||
seq[seqpos] = diff.edit.data._1
|
||||
seqpos += 1
|
||||
refpos += 1
|
||||
elseif diff.edit.data isa Deletion
|
||||
refpos += diff.edit.data._1
|
||||
elseif diff.edit.data isa Insertion
|
||||
for i in diff.edit.data._1
|
||||
seq[seqpos] = i
|
||||
seqpos += 1
|
||||
end
|
||||
end
|
||||
end
|
||||
while seqpos ≤ length(seq)
|
||||
seq[seqpos] = x.ref[refpos]
|
||||
refpos += 1
|
||||
seqpos += 1
|
||||
end
|
||||
seq
|
||||
end
|
||||
|
||||
"""
|
||||
reconstruct(x::Variant{S})
|
||||
|
||||
Reconstruct the sequence of type `S` that created the variant. It is assumed the
|
||||
variant is well-formed, e.g. no substitutions in deleted sequences, or
|
||||
deletions/insertions of the same area multiple times.
|
||||
"""
|
||||
reconstruct(x::Variant{S}) where S = reconstruct!(S(), x)
|
||||
|
||||
export Substitution, Insertion, Deletion, Diff, Variant, Variation
|
||||
|
||||
end # module
|
||||
|
||||
#=
|
||||
|
||||
function variations(ref::S, refaln::S, seqaln::S) where {S <: BioSequence}
|
||||
aln = AlignedSequence(seqaln, refaln)
|
||||
return variations(ref, refaln, seqaln, aln.aln.anchors)
|
||||
|
@ -179,27 +276,7 @@ function substitutions(ref::S, seq::S) where {S <: BioSequence}
|
|||
end
|
||||
|
||||
|
||||
function variant(ref::S, seq::S) where {S <: BioSequence}
|
||||
aln = pairalign(GlobalAlignment(), seq, ref, ALN_MODEL).aln
|
||||
diffs = Diff[]
|
||||
result = Variant(ref, diffs)
|
||||
refpos = 0
|
||||
markpos = 0
|
||||
n_gaps = n_ins = 0
|
||||
for (seqi, refi) in aln
|
||||
isgap(refi) || (refpos += 1)
|
||||
|
||||
# Check for deletions
|
||||
if isgap(seqi)
|
||||
iszero(n_gaps) && (markpos = refi)
|
||||
n_gaps += 1
|
||||
else
|
||||
if !iszero(n_gaps)
|
||||
push(diffs, Diff{Deletion}Deletion(n_gaps))
|
||||
end
|
||||
n_gaps = 0
|
||||
end
|
||||
end
|
||||
|
||||
#=
|
||||
function variations(ref::S, refaln::S, seqaln::S, anchors::Vector{AlignmentAnchor}) where {S <: BioSequence{A}} where A
|
||||
result = Variant(dna"TAG", Diff[])
|
||||
|
@ -349,3 +426,5 @@ export Diff,
|
|||
substitutions
|
||||
|
||||
end # module
|
||||
|
||||
=#
|
Loading…
Reference in a new issue