From 354f18e0e49abe9a45378ef47c8e5fce8781bd13 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Tue, 2 Mar 2021 15:50:37 +0100 Subject: [PATCH] Switch to SumTypes --- Manifest.toml | 68 ++++++++++- Project.toml | 1 + src/SequenceVariation.jl | 249 ++++++++++++++++++++++++++------------- 3 files changed, 229 insertions(+), 89 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 24a30d3..6fd3f1b 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -1,5 +1,11 @@ # This file is machine-generated - editing it directly is not advised +[[ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" + +[[Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + [[Automa]] deps = ["DataStructures", "Printf", "Random", "Test", "TranscodingStreams"] git-tree-sha1 = "c81526bf5f6fb4616b4e22a3cd62ac20e255fd3c" @@ -62,6 +68,10 @@ uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" deps = ["Random", "Serialization", "Sockets"] uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" +[[Downloads]] +deps = ["ArgTools", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" + [[IndexableBitVectors]] deps = ["Random", "Test"] git-tree-sha1 = "b7f5e42dc867b8a8654a5f899064632dac05bc82" @@ -78,10 +88,22 @@ git-tree-sha1 = "6c9fcd87677231ae293f6806fad928c216ab6658" uuid = "524e6230-43b7-53ae-be76-1e9e4d08d11b" version = "1.0.0" +[[LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" + +[[LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" + [[LibGit2]] -deps = ["Printf"] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" +[[LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" + [[Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" @@ -92,20 +114,36 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" [[Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" +[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "6a8a2a625ab0dea913aba95c11370589e0239ff0" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.6" + [[Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" +[[MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" + [[Mmap]] uuid = "a63ad114-7e13-5084-954f-fe012c677804" +[[MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" + +[[NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" + [[OrderedCollections]] git-tree-sha1 = "16c08bf5dba06609fe45e30860092d6fa41fde7b" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" version = "1.3.1" [[Pkg]] -deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [[Printf]] @@ -117,7 +155,7 @@ deps = ["Printf"] uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" [[REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets"] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" [[Random]] @@ -151,8 +189,22 @@ version = "0.1.2" deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +[[SumTypes]] +deps = ["MacroTools"] +git-tree-sha1 = "be127ca4ba297c1574457baa0ac3092b0054395e" +uuid = "8e1ec7a9-0e02-4297-b0fe-6433085c89f2" +version = "0.1.1" + +[[TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" + +[[Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" + [[Test]] -deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [[TranscodingStreams]] @@ -172,3 +224,11 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [[Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" + +[[nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" diff --git a/Project.toml b/Project.toml index 0cf160b..5c82b94 100644 --- a/Project.toml +++ b/Project.toml @@ -7,3 +7,4 @@ version = "0.1.0" BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e" BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9" +SumTypes = "8e1ec7a9-0e02-4297-b0fe-6433085c89f2" diff --git a/src/SequenceVariation.jl b/src/SequenceVariation.jl index 97f847d..d16ee4a 100644 --- a/src/SequenceVariation.jl +++ b/src/SequenceVariation.jl @@ -23,138 +23,235 @@ Call Variations using BioSequences using BioAlignments -import BioAlignments: OP_START, OP_SEQ_MATCH, OP_SEQ_MISMATCH, OP_INSERT, OP_DELETE import BioSymbols: BioSymbol +using SumTypes const ALN_MODEL = AffineGapScoreModel(BLOSUM62, gap_open=-12, gap_extend=-2) -@sum_type Edit{S, T} +@sum_type Edit{S, T} begin Substitution{S, T}(::T) Deletion{S, T}(::UInt) Insertion{S, T}(::S) end """ - Edit - -Abstract type representing a type of nucleotide edit: Deletion, insertion or -substitution. -""" -abstract type Edit end - -""" - Substitution{T <: BioSymbol} <: Edit + Substitution Represents the presence of a `T` at a given position. The position is stored outside this struct. """ -struct Substitution{T <: BioSymbol} <: Edit - symbol::T -end +Substitution -Base.eltype(::Type{<:Substitution{T}}) where T = T +Base.eltype(::Type{<:Substitution{S, T}}) where {S, T} = T """ - Deletion <: Edit + Deletion Represents the deletion of N symbols. The location of the deletion is stored outside this struct """ -struct Deletion <: Edit - len::UInt -end +Deletion """ - Insertion{S <: BioSequence} <: Edit + Insertion Represents the insertion of a `T` into a sequence. The location of the insertion is stored outside the struct. """ -struct Insertion{S <: BioSequence} <: Edit - seq::S +Insertion + +function Insertion(s::BioSequence) + length(s) == 0 && throw(ArgumentError("Insertions cannot be length 0")) + Insertion{typeof(s), eltype(s)}(s) end -Base.:(==)(x::Insertion{A}, y::Insertion{A}) where A = x.seq == y.seq +Base.:(==)(x::Insertion, y::Insertion) = x.seq == y.seq """ Diff{E <: Edit} Represents an `Edit` og type `E` at a given position. """ -struct Diff{E <: Edit} +struct Diff{S <: BioSequence, T <: BioSymbol} pos::UInt - edit::E + edit::Edit{S, T} end -Diff(pos::Integer, edit::Edit) = Diff{typeof(edit)}(pos, edit) - """ - Variant{S <: BioSequence, D <: Diff} + Variant{S <: BioSequence, T <: BioSymbol} -Represents a series of diffs of type `D` against a reference of type `S`. -See also `Variation` +Represents a series of diffs of type `Diff{S, T}` against a reference of type `S`. +See also `Variation`. """ -struct Variant{S <: BioSequence, D <: Diff} +struct Variant{S <: BioSequence, T <: BioSymbol} ref::S - diffs::Vector{D} + diffs::Vector{Diff{S, T}} end -#function Variant(ref::BioSequence, diffs::Vector{D}) where D <: Diff -# return Variant{typeof(ref), D}(ref, diffs) -#end - -function Base.show(io::IO, ::MIME"text/plain", x::Variant{S,D}) where {S,D} +function Base.show(io::IO, ::MIME"text/plain", x::Variant{S,T}) where {S,T} cols = displaysize(io)[2] - 3 recur_io = IOContext(io, :SHOWN_SET => x.diffs) print(io, summary(x), ":") for i in x.diffs - v = Variation{S, typeof(i)}(x.ref, i) + v = Variation{S, eltype(S)}(x.ref, i) str = sprint(show, v, context=recur_io, sizehint=0) print(io, "\n ", Base._truncate_at_width_or_chars(str, cols, "\r\n")) end end -#Variant(s::BioSequence, v::Vector{<:Diff}) = Variant{typeof(s),eltype(v)}(s,v) - """ - Variation{S <: BioSequence, D <: Diff} + Variation{S <: BioSequence, T <: BioSymbol} -Represent a single diff of type `D` is a sequence of type `S`. See also `Variant` +Represent a single diff against a biosequence. See also `Variant` """ -struct Variation{S <: BioSequence, D <: Diff} +struct Variation{S <: BioSequence, T <: BioSymbol} ref::S - diff::D + diff::Diff{S, T} end -#Variation(ref::BioSequence, diff::Diff) = Variation{typeof(ref), typeof(diffs)}(ref, diff) - -function check(v::Variation{S, <:Substitution{T}}) where {S, T} - T == eltype(S) || throw(TypeError(:check, "", eltype(S), T)) - checkbounds(v.ref, v.diff.pos) +@case Edit function check((x,)::Substitution{S, T}) where {S, T} + (ref, pos) -> begin + eltype(ref) == T || throw(TypeError(:check, "", eltype(ref), T)) + checkbounds(ref, pos) + end end -check(v::Variation{S, Diff{Deletion}}) where S = checkbounds(v.ref, v.diff.pos:(v.diff.pos+v.diff.edit.len)-1) - -function check(v::Variation{S, <:Diff{<:Insertion}}) where S - length(v.diff.edit.seq) > 0 || throw(ArgumentError("Insertions cannot be length 0")) - # We can have insertions at the very end, after the reference sequence - v.diff.pos == lastindex(v.ref) + 1 && return nothing - checkbounds(v.ref, v.diff.pos) +@case Edit function check((seq,)::Insertion{S, T}) where {S, T} + (ref, pos) -> begin + # We can have insertions at the very end, after the reference sequence + pos == lastindex(ref) + 1 && return nothing + checkbounds(ref, pos) + end end -Base.show(io::IO, x::Diff{<:Substitution}) = print(io, x.pos, x.edit.symbol) -Base.show(io::IO, x::Diff{Deletion}) = print(io, 'Δ', x.pos, '-', x.pos + x.edit.len - 1) -Base.show(io::IO, x::Diff{<:Insertion}) = print(io, x.pos, x.edit.seq) - -function Base.show(io::IO, x::Variation{S, <:Diff{<:Substitution}}) where S - print(io, x.ref[x.diff.pos], x.diff.pos, x.diff.edit.symbol) +@case Edit function check((len,)::Deletion{S, T}) where {S, T} + (ref, pos) -> begin + checkbounds(ref, pos:(pos+len)-1) + end end -Base.show(io::IO, x::Variation{S, Diff{Deletion}}) where S = show(io, x.diff) -Base.show(io::IO, x::Variation{S, <:Diff{<:Insertion}} where S) = show(io, x.diff) +@assert SumTypes.iscomplete(check, Edit) + +function check(x::Variation) + check(x.diff.edit)(x.ref, x.diff.pos) +end + +@case Edit _show((x,)::Substitution) = (io, pos) -> print(io, pos, x) +@case Edit _show((len,)::Deletion) = (io, pos) -> print(io, 'Δ', pos, '-', pos + len - 1) +@case Edit _show((seq,)::Insertion) = (io, pos) -> print(io, pos, seq) + +@assert SumTypes.iscomplete(_show, Edit) + +Base.show(io::IO, x::Diff) = _show(x.edit)(io, x.pos) + +function Base.show(io::IO, x::Variation) + if x.diff.edit.data isa Substitution + print(io, x.ref[x.diff.pos]) + end + show(io, x.diff) +end Base.:(==)(x::T, y::T) where {T <: Variation} = (x.ref === y.ref) & (x.diff == y.diff) +function variant(seq::LongAminoAcidSeq, ref::LongAminoAcidSeq) + aln = pairalign(OverlapAlignment(), seq, ref, ALN_MODEL).aln + diffs = Diff{LongAminoAcidSeq, AminoAcid}[] + result = Variant(ref, diffs) + refpos = seqpos = 0 + markpos = 0 + n_gaps = n_ins = 0 + insertion_buffer = AminoAcid[] + for (seqi, refi) in aln + isgap(refi) || (refpos += 1) + isgap(seqi) || (seqpos += 1) + + # Check for deletions + if isgap(seqi) + iszero(seqpos) && continue # skip indels at start + iszero(n_gaps) && (markpos = refpos) + n_gaps += 1 + else + if !iszero(n_gaps) + push!(diffs, Diff(UInt(markpos), Deletion{LongAminoAcidSeq, AminoAcid}(UInt(n_gaps)))) + n_gaps = 0 + end + end + + # Check for insertions + if isgap(refi) + iszero(refpos) && continue # skip indels at start + iszero(n_ins) && (markpos = refpos + 1) + push!(insertion_buffer, seqi) + n_ins += 1 + else + if !iszero(n_ins) + seq = LongAminoAcidSeq(insertion_buffer) + push!(diffs, Diff(UInt(markpos), Insertion(seq))) + empty!(insertion_buffer) + n_ins = 0 + end + end + + # Substitutions + if !isgap(refi) && !isgap(seqi) && seqi != refi + push!(diffs, Diff(UInt(refpos), Substitution{LongAminoAcidSeq, AminoAcid}(seqi))) + end + end + # At the end of the loop? + return result +end + +@case Edit lendiff((x,)::Substitution) = 0 +@case Edit lendiff((len,)::Deletion) = -(len % Int) +@case Edit lendiff((seq,)::Insertion) = length(seq) % Int + +function reconstruct!(seq::S, x::Variant{S}) where S + sort!(x.diffs, by=y -> y.pos) + len = length(x.ref) + sum(diff -> lendiff(diff.edit), x.diffs) + resize!(seq, len % UInt) + refpos = seqpos = 1 + for diff in x.diffs + while refpos < diff.pos + seq[seqpos] = x.ref[refpos] + refpos += 1 + seqpos += 1 + end + if diff.edit.data isa Substitution + seq[seqpos] = diff.edit.data._1 + seqpos += 1 + refpos += 1 + elseif diff.edit.data isa Deletion + refpos += diff.edit.data._1 + elseif diff.edit.data isa Insertion + for i in diff.edit.data._1 + seq[seqpos] = i + seqpos += 1 + end + end + end + while seqpos ≤ length(seq) + seq[seqpos] = x.ref[refpos] + refpos += 1 + seqpos += 1 + end + seq +end + +""" + reconstruct(x::Variant{S}) + +Reconstruct the sequence of type `S` that created the variant. It is assumed the +variant is well-formed, e.g. no substitutions in deleted sequences, or +deletions/insertions of the same area multiple times. +""" +reconstruct(x::Variant{S}) where S = reconstruct!(S(), x) + +export Substitution, Insertion, Deletion, Diff, Variant, Variation + +end # module + +#= + function variations(ref::S, refaln::S, seqaln::S) where {S <: BioSequence} aln = AlignedSequence(seqaln, refaln) return variations(ref, refaln, seqaln, aln.aln.anchors) @@ -179,27 +276,7 @@ function substitutions(ref::S, seq::S) where {S <: BioSequence} end -function variant(ref::S, seq::S) where {S <: BioSequence} - aln = pairalign(GlobalAlignment(), seq, ref, ALN_MODEL).aln - diffs = Diff[] - result = Variant(ref, diffs) - refpos = 0 - markpos = 0 - n_gaps = n_ins = 0 - for (seqi, refi) in aln - isgap(refi) || (refpos += 1) - - # Check for deletions - if isgap(seqi) - iszero(n_gaps) && (markpos = refi) - n_gaps += 1 - else - if !iszero(n_gaps) - push(diffs, Diff{Deletion}Deletion(n_gaps)) - end - n_gaps = 0 - end - end + #= function variations(ref::S, refaln::S, seqaln::S, anchors::Vector{AlignmentAnchor}) where {S <: BioSequence{A}} where A result = Variant(dna"TAG", Diff[]) @@ -349,3 +426,5 @@ export Diff, substitutions end # module + +=# \ No newline at end of file