From ad3fe4303f29b59a0fe765c0644cbf243c368bb6 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Thu, 1 Oct 2020 17:03:01 +0200 Subject: [PATCH] Initial commit --- Manifest.toml | 174 +++++++++++++++++++++++++++++++ Project.toml | 3 + src/SequenceVariation.jl | 218 +++++++++++++++++++++++++++++++++++++++ src/parsing.jl | 56 ++++++++++ 4 files changed, 451 insertions(+) create mode 100644 Manifest.toml create mode 100644 Project.toml create mode 100644 src/SequenceVariation.jl create mode 100644 src/parsing.jl diff --git a/Manifest.toml b/Manifest.toml new file mode 100644 index 0000000..24a30d3 --- /dev/null +++ b/Manifest.toml @@ -0,0 +1,174 @@ +# This file is machine-generated - editing it directly is not advised + +[[Automa]] +deps = ["DataStructures", "Printf", "Random", "Test", "TranscodingStreams"] +git-tree-sha1 = "c81526bf5f6fb4616b4e22a3cd62ac20e255fd3c" +uuid = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b" +version = "0.8.0" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BioAlignments]] +deps = ["BioGenerics", "BioSequences", "BioSymbols", "IntervalTrees", "LinearAlgebra"] +git-tree-sha1 = "f610a3a965f187890edb0b1fdef4f30d77852edd" +uuid = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e" +version = "2.0.0" + +[[BioGenerics]] +deps = ["TranscodingStreams"] +git-tree-sha1 = "57deb413ca9f4c8bc7d4c6e98ebe217ff728c737" +uuid = "47718e42-2ac5-11e9-14af-e5595289c2ea" +version = "0.1.0" + +[[BioSequences]] +deps = ["BioGenerics", "BioSymbols", "Combinatorics", "IndexableBitVectors", "Printf", "Random", "StableRNGs", "Twiddle"] +git-tree-sha1 = "093ccb9211bdc71924abf8e74a0790af11da35a7" +uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" +version = "2.0.5" + +[[BioSymbols]] +deps = ["Automa"] +git-tree-sha1 = "ec77888ac3e78f9d372c2b533bdb52668f9e2b09" +uuid = "3c28c6f8-a34d-59c4-9654-267d177fcfa9" +version = "4.0.4" + +[[Combinatorics]] +git-tree-sha1 = "08c8b6831dc00bfea825826be0bc8336fc369860" +uuid = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" +version = "1.0.2" + +[[Compat]] +deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "8cd7b7d1c7f6fcbe7e8743a58adf57788ec7f787" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "3.18.0" + +[[DataStructures]] +deps = ["Compat", "InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "0347f23484a96d56e7096eb1f55c6975be34b11a" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.18.6" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[IndexableBitVectors]] +deps = ["Random", "Test"] +git-tree-sha1 = "b7f5e42dc867b8a8654a5f899064632dac05bc82" +uuid = "1cb3b9ac-1ffd-5777-9e6b-a3d42300664d" +version = "1.0.0" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[IntervalTrees]] +deps = ["InteractiveUtils", "Profile", "Random", "Test"] +git-tree-sha1 = "6c9fcd87677231ae293f6806fad928c216ab6658" +uuid = "524e6230-43b7-53ae-be76-1e9e4d08d11b" +version = "1.0.0" + +[[LibGit2]] +deps = ["Printf"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[OrderedCollections]] +git-tree-sha1 = "16c08bf5dba06609fe45e30860092d6fa41fde7b" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.3.1" + +[[Pkg]] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[Profile]] +deps = ["Printf"] +uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[StableRNGs]] +deps = ["Random", "Test"] +git-tree-sha1 = "b57c4216b6c163a3a9d674f6b9f7b99cdccdb959" +uuid = "860ef19b-820b-49d6-a774-d7a799459cd3" +version = "0.1.2" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[Test]] +deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TranscodingStreams]] +deps = ["Random", "Test"] +git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c" +uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" +version = "0.9.5" + +[[Twiddle]] +git-tree-sha1 = "29509c4862bfb5da9e76eb6937125ab93986270a" +uuid = "7200193e-83a8-5a55-b20d-5d36d44a0795" +version = "1.1.2" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" diff --git a/Project.toml b/Project.toml new file mode 100644 index 0000000..99ad32f --- /dev/null +++ b/Project.toml @@ -0,0 +1,3 @@ +[deps] +BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e" +BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" diff --git a/src/SequenceVariation.jl b/src/SequenceVariation.jl new file mode 100644 index 0000000..80ca2a4 --- /dev/null +++ b/src/SequenceVariation.jl @@ -0,0 +1,218 @@ + +module t + +# TODO: Add functionality to move a Variation to a new reference +# needs to be done with heavy checks to make sure the alignment of the two +# is not full of gaps in the area where the Variation is +#= +Substitution should work just if the ref nuc don't match a deletion +Insertions/deletions should work if there are only matches (+/- 3 nucs), or at the ends + +Perhaps first make an array of oldref -> newref, with some positions marked nothing? +=# +# TODO: Do we need to prevent calling indels at the ends of alignments? +# No, we need a filtering step that trims the alignment before calling Variation + +#= +Extract alignment +Trim ends of gaps +Call Variations + +=# + +# TODO: Add inversions? + +using BioSequences +using BioAlignments +import BioAlignments: OP_START, OP_SEQ_MATCH, OP_SEQ_MISMATCH, OP_INSERT, OP_DELETE + +abstract type Variation end + +struct Substitution{T} <: Variation + alt::T +end + +struct Deletion <: Variation + len::Int +end + +struct Insertion{A} <: Variation + seq::LongSequence{A} +end + +Base.:(==)(x::Insertion{A}, y::Insertion{A}) where A = x.seq == y.seq + +# Metadata (such as sequence identifier) is intentionally left out +struct SeqVar{A <: Alphabet, V <: Variation} + ref::LongSequence{A} + pos::Int + var::V + + function SeqVar{A, V}(ref::LongSequence{A}, pos::Int, var::V) where {A <: Alphabet, V <: Variation} + n = new(ref, pos, var) + checkvar(n) + return n + end +end + +function SeqVar{A}(ref::LongSequence{A}, pos::Int, var::Variation) where A + return SeqVar{A,typeof(var)}(ref, pos, var) +end + +function SeqVar(ref::LongSequence{A}, pos::Int, var::Variation) where A + return SeqVar{A, typeof(var)}(ref, pos, var) +end + +function checkvar(x::SeqVar{A, Deletion}) where A + checkbounds(x.ref, x.pos:x.pos + x.var.len - 1) +end + +function checkvar(x::SeqVar{A, Substitution{T}}) where {A, T} + if T !== eltype(A) + throw(ArgumentError("Substitution type must be alphabet eltype")) + end + checkbounds(x.ref, x.pos) +end + +function checkvar(x::SeqVar{A, Insertion{A}}) where A + if length(x.var.seq) < 1 + throw(ArgumentError("Insertions cannot be empty")) + end + if x.pos ∉ 1:lastindex(x.ref)+1 + throw(BoundsError(x.ref, x.pos)) + end +end + +function Base.show(io::IO, x::SeqVar{A, Deletion}) where A + print(io, 'Δ', x.pos) + if x.var.len > 1 + print(io, '-', x.pos + x.var.len - 1) + end +end + +function Base.show(io::IO, x::SeqVar{A, <:Substitution}) where A + print(io, x.ref[x.pos], x.pos, x.var.alt) +end + +function Base.show(io::IO, x::SeqVar{A, Insertion{A}}) where A + print(io, x.pos, x.var.seq) +end + +function Base.:(==)(x::T, y::T) where {T <: SeqVar} + return x.ref === y.ref && x.pos == y.pos && x.var == y.var +end + +function variations(ref::S, refaln::S, seqaln::S) where {S <: BioSequence} + aln = AlignedSequence(seqaln, refaln) + return variations(ref, refaln, seqaln, aln.aln.anchors) +end + +function variations(ref::S, refaln::S, seqaln::S, anchors::Vector{AlignmentAnchor}) where {S <: BioSequence{A}} where A + result = SeqVar{A}[] + isempty(anchors) && return result + firstop = first(anchors) + if (firstop.op !== OP_START) || (firstop.refpos != 0 | firstop.seqpos != 0) + throw(ArgumentError("Alignment must begin with OP_START at positions (0, 0)")) + end + # refpos and seqpos refer to the ungapped sequences. seqaln refer to the gapped one + refpos, seqpos, seqalnpos, seqalnend = 1, 1, 1, 1 + for anchor in @view anchors[2:end] + seqend, refend = anchor.seqpos, anchor.refpos + seqalnend = seqalnpos + max(seqend - seqpos, refend - refpos) + + # For mismatches, we add one Variation per mismatch + if anchor.op == OP_SEQ_MISMATCH + p = seqalnpos + for pos in refpos:refend + push!(result, SeqVar(ref, pos, Substitution{eltype(A)}(seqaln[p]))) + p += 1 + end + + # Deletions are trivial + elseif anchor.op == OP_DELETE + len = refend - refpos + 1 + push!(result, SeqVar(ref, refpos, Deletion(len))) + + # Insertions are a little more tricky + elseif anchor.op == OP_INSERT + push!(result, SeqVar(ref, refend + 1, Insertion{A}(seqaln[seqalnpos:seqalnend]))) + + elseif anchor.op == OP_SEQ_MATCH + nothing + else + throw(ArgumentError("Cannot create Variation from operation $(anchor.op)")) + end + refpos, seqpos, seqalnpos = refend + 1, seqend + 1, seqalnend + 1 + end + return result +end + +function posmap(gap, oldref::BioSequence, newref::BioSequence) + if length(oldref) != length(newref) + throw(ArgumentError("Sequences must be equal-length and aligned")) + end + result = Vector{Union{Int, Nothing}}(undef, length(oldref)) + oldpos, newpos = 0, 0 + for (o, n) in zip(oldref, newref) + oldpos += o != gap + newpos += n != gap + o != gap && (result[oldpos] = n == gap ? nothing : newpos) + end + return resize!(result, oldpos) +end + +""" + posmap(oldref::S, newref::S) + +Given two aligned sequences, creates a vector `v` such that `v[o] = n`, where `o` is +a position in the old (ungapped) sequence and `n` is a position in the new ungapped sequence. +If `o` maps to a gap in the new sequence, `v[o] === nothing`. +""" +posmap(oldref::S, newref::S) where {A, S <: BioSequence{A}} = posmap(gap(eltype(A)), oldref, newref) + +function rereference(var::SeqVar{A, S}, posmap, ref::LongSequence{A}) where {A, S <: Substitution} + newpos = posmap[var.pos] + newpos === nothing && throw(ArgumentError("Position $(var.pos) maps to a gap in reference")) + return SeqVar{A, S}(ref, newpos::Int, var.var) +end + +function checkflanks(posmap, varpos::Int, flank::Int) + for i in max(1, varpos - flank) : min(length(posmap), varpos + flank) + posmap[i] === nothing && throw(ArgumentError("Position $i maps to a deletion")) + end +end + +function rereference(var::SeqVar{A, Deletion}, posmap, ref::LongSequence{A}, flank::Int=5) where A + checkbounds(posmap, var.pos:var.pos + var.var.len - 1) + checkflanks(posmap, var.pos, flank) + return SeqVar{A, Deletion}(ref, posmap[var.pos], var.var) +end + +function rereference(var::SeqVar{A, Insertion{A}}, posmap, ref::LongSequence{A}, flank::Int=5) where A + checkflanks(posmap, var.pos, flank) + return SeqVar{A, Insertion{A}}(ref, posmap[var.pos], var.var) +end + + +# Assumes same reference, and that they are not mutally exclusive +# (e.g no substitution in deleted areas) +#= +function reconstruct(v::Vector{<:SeqVar{A}}) where A + isempty(v) && throw(ArgumentError("Need at least one Variation to reconstruct sequence")) + srt = sort(v, by=x -> x.pos) + len = length(v[1].ref) + for i in srt + if i isa SeqVar{A, <:Deletion} + len -= i.var.len + elseif i isa SeqVar{A, <:Insertion} + len += length(i.var.seq) + end + end + + + ref = copy(v[1].ref) + oldpos, newpos = 1, 1 + for i in srt +=# + +end diff --git a/src/parsing.jl b/src/parsing.jl new file mode 100644 index 0000000..1168f52 --- /dev/null +++ b/src/parsing.jl @@ -0,0 +1,56 @@ +const DELETION_PATTERN = r"^Δ(\d+)(?:-(\d+))?$" +const INSERTION_PATTERN = r"^(\d+)([A-Z]+)$" +const SUBSTITUTION_PATTERN = r"^([A-Z])(\d+)([A-Z])$" +const MULTI_SUBST_PATTERN = r"^([A-Z](?:/[A-Z])*)(\d+)([A-Z](?:/[A-Z])*)$" + +function Base.parse(::Type{<:SeqVar{A}}, ref::LongSequence{A}, st::AbstractString) where A + m = match(DELETION_PATTERN, st) + if m !== nothing + return parse(SeqVar{A, Deletion}, ref, m) + end + m = match(SUBSTITUTION_PATTERN, st) + if m !== nothing + return parse(SeqVar{A, Substitution{eltype(A)}}, ref, m) + end + m = match(INSERTION_PATTERN, st) + if m !== nothing + return parse(SeqVar{A, Insertion{A}}, ref, m) + else + throw(ArgumentError("Cannot parse $st as Variation")) + end +end + +function Base.parse(::Type{SeqVar{A, Deletion}}, ref::LongSequence{A}, m::RegexMatch) where A + pos = parse(Int, m[1]) + len = m === nothing ? 1 : parse(Int, m[2]) - pos + 1 + return SeqVar(ref, pos, Deletion(len)) +end + +function Base.parse(::Type{<:SeqVar{A, Substitution{T}}}, ref::LongSequence{A}, m::RegexMatch) where {A,T} + if T !== eltype(A) + throw(ArgumentError("Substitution type must be alphabet eltype")) + end + refst, pos, altst = m[1], m[2], m[3] + pos = parse(Int, pos) + refT = T(first(refst)) + altT = T(first(altst)) + return SeqVar(ref, pos, Substitution{T}(altT)) +end + +function Base.parse(::Type{<:SeqVar{A, Insertion{A}}}, ref::LongSequence{A}, m::RegexMatch) where A + pos = parse(Int, m[1]) + # We accept an insertion immediately after the reference, right? + checkbounds(ref, pos - (pos == length(ref))) + seq = LongSequence{A}(m[2]) + return SeqVar(ref, pos, Insertion{A}(seq)) +end + + + +totext(m::SeqVar, seqid::IdDict{<:LongSequence, AbstractString}) = string(seqid[m.ref], '\t', m) + +function fromtext(st::AbstractString, seqid::Dict{String, LongSequence{A}}) where A + name, varstr = split(st, '\t', limit=2) + ref = seqid[name] + return parse(SeqVar{A}, ref, varstr) +end \ No newline at end of file