Switch to SumTypes

This commit is contained in:
Jakob Nybo Nissen 2021-03-02 15:50:37 +01:00
parent 40749eb6f5
commit 354f18e0e4
3 changed files with 229 additions and 89 deletions

View file

@ -1,5 +1,11 @@
# This file is machine-generated - editing it directly is not advised
[[ArgTools]]
uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
[[Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
[[Automa]]
deps = ["DataStructures", "Printf", "Random", "Test", "TranscodingStreams"]
git-tree-sha1 = "c81526bf5f6fb4616b4e22a3cd62ac20e255fd3c"
@ -62,6 +68,10 @@ uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[Downloads]]
deps = ["ArgTools", "LibCURL", "NetworkOptions"]
uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
[[IndexableBitVectors]]
deps = ["Random", "Test"]
git-tree-sha1 = "b7f5e42dc867b8a8654a5f899064632dac05bc82"
@ -78,10 +88,22 @@ git-tree-sha1 = "6c9fcd87677231ae293f6806fad928c216ab6658"
uuid = "524e6230-43b7-53ae-be76-1e9e4d08d11b"
version = "1.0.0"
[[LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
[[LibCURL_jll]]
deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
[[LibGit2]]
deps = ["Printf"]
deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[LibSSH2_jll]]
deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
[[Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
@ -92,20 +114,36 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[MacroTools]]
deps = ["Markdown", "Random"]
git-tree-sha1 = "6a8a2a625ab0dea913aba95c11370589e0239ff0"
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
version = "0.5.6"
[[Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[MbedTLS_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
[[Mmap]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
[[MozillaCACerts_jll]]
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
[[NetworkOptions]]
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
[[OrderedCollections]]
git-tree-sha1 = "16c08bf5dba06609fe45e30860092d6fa41fde7b"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.3.1"
[[Pkg]]
deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
[[Printf]]
@ -117,7 +155,7 @@ deps = ["Printf"]
uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
[[REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets"]
deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[Random]]
@ -151,8 +189,22 @@ version = "0.1.2"
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
[[SumTypes]]
deps = ["MacroTools"]
git-tree-sha1 = "be127ca4ba297c1574457baa0ac3092b0054395e"
uuid = "8e1ec7a9-0e02-4297-b0fe-6433085c89f2"
version = "0.1.1"
[[TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
[[Tar]]
deps = ["ArgTools", "SHA"]
uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
[[Test]]
deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[TranscodingStreams]]
@ -172,3 +224,11 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[[Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
[[nghttp2_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"

View file

@ -7,3 +7,4 @@ version = "0.1.0"
BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
SumTypes = "8e1ec7a9-0e02-4297-b0fe-6433085c89f2"

View file

@ -23,138 +23,235 @@ Call Variations
using BioSequences
using BioAlignments
import BioAlignments: OP_START, OP_SEQ_MATCH, OP_SEQ_MISMATCH, OP_INSERT, OP_DELETE
import BioSymbols: BioSymbol
using SumTypes
const ALN_MODEL = AffineGapScoreModel(BLOSUM62, gap_open=-12, gap_extend=-2)
@sum_type Edit{S, T}
@sum_type Edit{S, T} begin
Substitution{S, T}(::T)
Deletion{S, T}(::UInt)
Insertion{S, T}(::S)
end
"""
Edit
Abstract type representing a type of nucleotide edit: Deletion, insertion or
substitution.
"""
abstract type Edit end
"""
Substitution{T <: BioSymbol} <: Edit
Substitution
Represents the presence of a `T` at a given position. The position is stored
outside this struct.
"""
struct Substitution{T <: BioSymbol} <: Edit
symbol::T
end
Substitution
Base.eltype(::Type{<:Substitution{T}}) where T = T
Base.eltype(::Type{<:Substitution{S, T}}) where {S, T} = T
"""
Deletion <: Edit
Deletion
Represents the deletion of N symbols. The location of the deletion is stored
outside this struct
"""
struct Deletion <: Edit
len::UInt
end
Deletion
"""
Insertion{S <: BioSequence} <: Edit
Insertion
Represents the insertion of a `T` into a sequence. The location of the insertion
is stored outside the struct.
"""
struct Insertion{S <: BioSequence} <: Edit
seq::S
Insertion
function Insertion(s::BioSequence)
length(s) == 0 && throw(ArgumentError("Insertions cannot be length 0"))
Insertion{typeof(s), eltype(s)}(s)
end
Base.:(==)(x::Insertion{A}, y::Insertion{A}) where A = x.seq == y.seq
Base.:(==)(x::Insertion, y::Insertion) = x.seq == y.seq
"""
Diff{E <: Edit}
Represents an `Edit` og type `E` at a given position.
"""
struct Diff{E <: Edit}
struct Diff{S <: BioSequence, T <: BioSymbol}
pos::UInt
edit::E
edit::Edit{S, T}
end
Diff(pos::Integer, edit::Edit) = Diff{typeof(edit)}(pos, edit)
"""
Variant{S <: BioSequence, D <: Diff}
Variant{S <: BioSequence, T <: BioSymbol}
Represents a series of diffs of type `D` against a reference of type `S`.
See also `Variation`
Represents a series of diffs of type `Diff{S, T}` against a reference of type `S`.
See also `Variation`.
"""
struct Variant{S <: BioSequence, D <: Diff}
struct Variant{S <: BioSequence, T <: BioSymbol}
ref::S
diffs::Vector{D}
diffs::Vector{Diff{S, T}}
end
#function Variant(ref::BioSequence, diffs::Vector{D}) where D <: Diff
# return Variant{typeof(ref), D}(ref, diffs)
#end
function Base.show(io::IO, ::MIME"text/plain", x::Variant{S,D}) where {S,D}
function Base.show(io::IO, ::MIME"text/plain", x::Variant{S,T}) where {S,T}
cols = displaysize(io)[2] - 3
recur_io = IOContext(io, :SHOWN_SET => x.diffs)
print(io, summary(x), ":")
for i in x.diffs
v = Variation{S, typeof(i)}(x.ref, i)
v = Variation{S, eltype(S)}(x.ref, i)
str = sprint(show, v, context=recur_io, sizehint=0)
print(io, "\n ", Base._truncate_at_width_or_chars(str, cols, "\r\n"))
end
end
#Variant(s::BioSequence, v::Vector{<:Diff}) = Variant{typeof(s),eltype(v)}(s,v)
"""
Variation{S <: BioSequence, D <: Diff}
Variation{S <: BioSequence, T <: BioSymbol}
Represent a single diff of type `D` is a sequence of type `S`. See also `Variant`
Represent a single diff against a biosequence. See also `Variant`
"""
struct Variation{S <: BioSequence, D <: Diff}
struct Variation{S <: BioSequence, T <: BioSymbol}
ref::S
diff::D
diff::Diff{S, T}
end
#Variation(ref::BioSequence, diff::Diff) = Variation{typeof(ref), typeof(diffs)}(ref, diff)
function check(v::Variation{S, <:Substitution{T}}) where {S, T}
T == eltype(S) || throw(TypeError(:check, "", eltype(S), T))
checkbounds(v.ref, v.diff.pos)
@case Edit function check((x,)::Substitution{S, T}) where {S, T}
(ref, pos) -> begin
eltype(ref) == T || throw(TypeError(:check, "", eltype(ref), T))
checkbounds(ref, pos)
end
end
check(v::Variation{S, Diff{Deletion}}) where S = checkbounds(v.ref, v.diff.pos:(v.diff.pos+v.diff.edit.len)-1)
function check(v::Variation{S, <:Diff{<:Insertion}}) where S
length(v.diff.edit.seq) > 0 || throw(ArgumentError("Insertions cannot be length 0"))
# We can have insertions at the very end, after the reference sequence
v.diff.pos == lastindex(v.ref) + 1 && return nothing
checkbounds(v.ref, v.diff.pos)
@case Edit function check((seq,)::Insertion{S, T}) where {S, T}
(ref, pos) -> begin
# We can have insertions at the very end, after the reference sequence
pos == lastindex(ref) + 1 && return nothing
checkbounds(ref, pos)
end
end
Base.show(io::IO, x::Diff{<:Substitution}) = print(io, x.pos, x.edit.symbol)
Base.show(io::IO, x::Diff{Deletion}) = print(io, 'Δ', x.pos, '-', x.pos + x.edit.len - 1)
Base.show(io::IO, x::Diff{<:Insertion}) = print(io, x.pos, x.edit.seq)
function Base.show(io::IO, x::Variation{S, <:Diff{<:Substitution}}) where S
print(io, x.ref[x.diff.pos], x.diff.pos, x.diff.edit.symbol)
@case Edit function check((len,)::Deletion{S, T}) where {S, T}
(ref, pos) -> begin
checkbounds(ref, pos:(pos+len)-1)
end
end
Base.show(io::IO, x::Variation{S, Diff{Deletion}}) where S = show(io, x.diff)
Base.show(io::IO, x::Variation{S, <:Diff{<:Insertion}} where S) = show(io, x.diff)
@assert SumTypes.iscomplete(check, Edit)
function check(x::Variation)
check(x.diff.edit)(x.ref, x.diff.pos)
end
@case Edit _show((x,)::Substitution) = (io, pos) -> print(io, pos, x)
@case Edit _show((len,)::Deletion) = (io, pos) -> print(io, 'Δ', pos, '-', pos + len - 1)
@case Edit _show((seq,)::Insertion) = (io, pos) -> print(io, pos, seq)
@assert SumTypes.iscomplete(_show, Edit)
Base.show(io::IO, x::Diff) = _show(x.edit)(io, x.pos)
function Base.show(io::IO, x::Variation)
if x.diff.edit.data isa Substitution
print(io, x.ref[x.diff.pos])
end
show(io, x.diff)
end
Base.:(==)(x::T, y::T) where {T <: Variation} = (x.ref === y.ref) & (x.diff == y.diff)
function variant(seq::LongAminoAcidSeq, ref::LongAminoAcidSeq)
aln = pairalign(OverlapAlignment(), seq, ref, ALN_MODEL).aln
diffs = Diff{LongAminoAcidSeq, AminoAcid}[]
result = Variant(ref, diffs)
refpos = seqpos = 0
markpos = 0
n_gaps = n_ins = 0
insertion_buffer = AminoAcid[]
for (seqi, refi) in aln
isgap(refi) || (refpos += 1)
isgap(seqi) || (seqpos += 1)
# Check for deletions
if isgap(seqi)
iszero(seqpos) && continue # skip indels at start
iszero(n_gaps) && (markpos = refpos)
n_gaps += 1
else
if !iszero(n_gaps)
push!(diffs, Diff(UInt(markpos), Deletion{LongAminoAcidSeq, AminoAcid}(UInt(n_gaps))))
n_gaps = 0
end
end
# Check for insertions
if isgap(refi)
iszero(refpos) && continue # skip indels at start
iszero(n_ins) && (markpos = refpos + 1)
push!(insertion_buffer, seqi)
n_ins += 1
else
if !iszero(n_ins)
seq = LongAminoAcidSeq(insertion_buffer)
push!(diffs, Diff(UInt(markpos), Insertion(seq)))
empty!(insertion_buffer)
n_ins = 0
end
end
# Substitutions
if !isgap(refi) && !isgap(seqi) && seqi != refi
push!(diffs, Diff(UInt(refpos), Substitution{LongAminoAcidSeq, AminoAcid}(seqi)))
end
end
# At the end of the loop?
return result
end
@case Edit lendiff((x,)::Substitution) = 0
@case Edit lendiff((len,)::Deletion) = -(len % Int)
@case Edit lendiff((seq,)::Insertion) = length(seq) % Int
function reconstruct!(seq::S, x::Variant{S}) where S
sort!(x.diffs, by=y -> y.pos)
len = length(x.ref) + sum(diff -> lendiff(diff.edit), x.diffs)
resize!(seq, len % UInt)
refpos = seqpos = 1
for diff in x.diffs
while refpos < diff.pos
seq[seqpos] = x.ref[refpos]
refpos += 1
seqpos += 1
end
if diff.edit.data isa Substitution
seq[seqpos] = diff.edit.data._1
seqpos += 1
refpos += 1
elseif diff.edit.data isa Deletion
refpos += diff.edit.data._1
elseif diff.edit.data isa Insertion
for i in diff.edit.data._1
seq[seqpos] = i
seqpos += 1
end
end
end
while seqpos length(seq)
seq[seqpos] = x.ref[refpos]
refpos += 1
seqpos += 1
end
seq
end
"""
reconstruct(x::Variant{S})
Reconstruct the sequence of type `S` that created the variant. It is assumed the
variant is well-formed, e.g. no substitutions in deleted sequences, or
deletions/insertions of the same area multiple times.
"""
reconstruct(x::Variant{S}) where S = reconstruct!(S(), x)
export Substitution, Insertion, Deletion, Diff, Variant, Variation
end # module
#=
function variations(ref::S, refaln::S, seqaln::S) where {S <: BioSequence}
aln = AlignedSequence(seqaln, refaln)
return variations(ref, refaln, seqaln, aln.aln.anchors)
@ -179,27 +276,7 @@ function substitutions(ref::S, seq::S) where {S <: BioSequence}
end
function variant(ref::S, seq::S) where {S <: BioSequence}
aln = pairalign(GlobalAlignment(), seq, ref, ALN_MODEL).aln
diffs = Diff[]
result = Variant(ref, diffs)
refpos = 0
markpos = 0
n_gaps = n_ins = 0
for (seqi, refi) in aln
isgap(refi) || (refpos += 1)
# Check for deletions
if isgap(seqi)
iszero(n_gaps) && (markpos = refi)
n_gaps += 1
else
if !iszero(n_gaps)
push(diffs, Diff{Deletion}Deletion(n_gaps))
end
n_gaps = 0
end
end
#=
function variations(ref::S, refaln::S, seqaln::S, anchors::Vector{AlignmentAnchor}) where {S <: BioSequence{A}} where A
result = Variant(dna"TAG", Diff[])
@ -349,3 +426,5 @@ export Diff,
substitutions
end # module
=#