mirror of
https://github.com/MillironX/SequenceVariation.jl.git
synced 2024-11-25 14:49:55 +00:00
Initial commit
This commit is contained in:
commit
ad3fe4303f
4 changed files with 451 additions and 0 deletions
174
Manifest.toml
Normal file
174
Manifest.toml
Normal file
|
@ -0,0 +1,174 @@
|
|||
# This file is machine-generated - editing it directly is not advised
|
||||
|
||||
[[Automa]]
|
||||
deps = ["DataStructures", "Printf", "Random", "Test", "TranscodingStreams"]
|
||||
git-tree-sha1 = "c81526bf5f6fb4616b4e22a3cd62ac20e255fd3c"
|
||||
uuid = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b"
|
||||
version = "0.8.0"
|
||||
|
||||
[[Base64]]
|
||||
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
|
||||
|
||||
[[BioAlignments]]
|
||||
deps = ["BioGenerics", "BioSequences", "BioSymbols", "IntervalTrees", "LinearAlgebra"]
|
||||
git-tree-sha1 = "f610a3a965f187890edb0b1fdef4f30d77852edd"
|
||||
uuid = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"
|
||||
version = "2.0.0"
|
||||
|
||||
[[BioGenerics]]
|
||||
deps = ["TranscodingStreams"]
|
||||
git-tree-sha1 = "57deb413ca9f4c8bc7d4c6e98ebe217ff728c737"
|
||||
uuid = "47718e42-2ac5-11e9-14af-e5595289c2ea"
|
||||
version = "0.1.0"
|
||||
|
||||
[[BioSequences]]
|
||||
deps = ["BioGenerics", "BioSymbols", "Combinatorics", "IndexableBitVectors", "Printf", "Random", "StableRNGs", "Twiddle"]
|
||||
git-tree-sha1 = "093ccb9211bdc71924abf8e74a0790af11da35a7"
|
||||
uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
|
||||
version = "2.0.5"
|
||||
|
||||
[[BioSymbols]]
|
||||
deps = ["Automa"]
|
||||
git-tree-sha1 = "ec77888ac3e78f9d372c2b533bdb52668f9e2b09"
|
||||
uuid = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
|
||||
version = "4.0.4"
|
||||
|
||||
[[Combinatorics]]
|
||||
git-tree-sha1 = "08c8b6831dc00bfea825826be0bc8336fc369860"
|
||||
uuid = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
|
||||
version = "1.0.2"
|
||||
|
||||
[[Compat]]
|
||||
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
|
||||
git-tree-sha1 = "8cd7b7d1c7f6fcbe7e8743a58adf57788ec7f787"
|
||||
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
|
||||
version = "3.18.0"
|
||||
|
||||
[[DataStructures]]
|
||||
deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
|
||||
git-tree-sha1 = "0347f23484a96d56e7096eb1f55c6975be34b11a"
|
||||
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
|
||||
version = "0.18.6"
|
||||
|
||||
[[Dates]]
|
||||
deps = ["Printf"]
|
||||
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
|
||||
|
||||
[[DelimitedFiles]]
|
||||
deps = ["Mmap"]
|
||||
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
|
||||
|
||||
[[Distributed]]
|
||||
deps = ["Random", "Serialization", "Sockets"]
|
||||
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
|
||||
|
||||
[[IndexableBitVectors]]
|
||||
deps = ["Random", "Test"]
|
||||
git-tree-sha1 = "b7f5e42dc867b8a8654a5f899064632dac05bc82"
|
||||
uuid = "1cb3b9ac-1ffd-5777-9e6b-a3d42300664d"
|
||||
version = "1.0.0"
|
||||
|
||||
[[InteractiveUtils]]
|
||||
deps = ["Markdown"]
|
||||
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
|
||||
|
||||
[[IntervalTrees]]
|
||||
deps = ["InteractiveUtils", "Profile", "Random", "Test"]
|
||||
git-tree-sha1 = "6c9fcd87677231ae293f6806fad928c216ab6658"
|
||||
uuid = "524e6230-43b7-53ae-be76-1e9e4d08d11b"
|
||||
version = "1.0.0"
|
||||
|
||||
[[LibGit2]]
|
||||
deps = ["Printf"]
|
||||
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
|
||||
|
||||
[[Libdl]]
|
||||
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
|
||||
|
||||
[[LinearAlgebra]]
|
||||
deps = ["Libdl"]
|
||||
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
|
||||
|
||||
[[Logging]]
|
||||
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
|
||||
|
||||
[[Markdown]]
|
||||
deps = ["Base64"]
|
||||
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
|
||||
|
||||
[[Mmap]]
|
||||
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
|
||||
|
||||
[[OrderedCollections]]
|
||||
git-tree-sha1 = "16c08bf5dba06609fe45e30860092d6fa41fde7b"
|
||||
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
|
||||
version = "1.3.1"
|
||||
|
||||
[[Pkg]]
|
||||
deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
|
||||
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
|
||||
|
||||
[[Printf]]
|
||||
deps = ["Unicode"]
|
||||
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
|
||||
|
||||
[[Profile]]
|
||||
deps = ["Printf"]
|
||||
uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
|
||||
|
||||
[[REPL]]
|
||||
deps = ["InteractiveUtils", "Markdown", "Sockets"]
|
||||
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
|
||||
|
||||
[[Random]]
|
||||
deps = ["Serialization"]
|
||||
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
|
||||
|
||||
[[SHA]]
|
||||
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
|
||||
|
||||
[[Serialization]]
|
||||
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
|
||||
|
||||
[[SharedArrays]]
|
||||
deps = ["Distributed", "Mmap", "Random", "Serialization"]
|
||||
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
|
||||
|
||||
[[Sockets]]
|
||||
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
|
||||
|
||||
[[SparseArrays]]
|
||||
deps = ["LinearAlgebra", "Random"]
|
||||
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
|
||||
|
||||
[[StableRNGs]]
|
||||
deps = ["Random", "Test"]
|
||||
git-tree-sha1 = "b57c4216b6c163a3a9d674f6b9f7b99cdccdb959"
|
||||
uuid = "860ef19b-820b-49d6-a774-d7a799459cd3"
|
||||
version = "0.1.2"
|
||||
|
||||
[[Statistics]]
|
||||
deps = ["LinearAlgebra", "SparseArrays"]
|
||||
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
|
||||
|
||||
[[Test]]
|
||||
deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
|
||||
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
|
||||
|
||||
[[TranscodingStreams]]
|
||||
deps = ["Random", "Test"]
|
||||
git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c"
|
||||
uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
|
||||
version = "0.9.5"
|
||||
|
||||
[[Twiddle]]
|
||||
git-tree-sha1 = "29509c4862bfb5da9e76eb6937125ab93986270a"
|
||||
uuid = "7200193e-83a8-5a55-b20d-5d36d44a0795"
|
||||
version = "1.1.2"
|
||||
|
||||
[[UUIDs]]
|
||||
deps = ["Random", "SHA"]
|
||||
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
|
||||
|
||||
[[Unicode]]
|
||||
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
|
3
Project.toml
Normal file
3
Project.toml
Normal file
|
@ -0,0 +1,3 @@
|
|||
[deps]
|
||||
BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"
|
||||
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
|
218
src/SequenceVariation.jl
Normal file
218
src/SequenceVariation.jl
Normal file
|
@ -0,0 +1,218 @@
|
|||
|
||||
module t
|
||||
|
||||
# TODO: Add functionality to move a Variation to a new reference
|
||||
# needs to be done with heavy checks to make sure the alignment of the two
|
||||
# is not full of gaps in the area where the Variation is
|
||||
#=
|
||||
Substitution should work just if the ref nuc don't match a deletion
|
||||
Insertions/deletions should work if there are only matches (+/- 3 nucs), or at the ends
|
||||
|
||||
Perhaps first make an array of oldref -> newref, with some positions marked nothing?
|
||||
=#
|
||||
# TODO: Do we need to prevent calling indels at the ends of alignments?
|
||||
# No, we need a filtering step that trims the alignment before calling Variation
|
||||
|
||||
#=
|
||||
Extract alignment
|
||||
Trim ends of gaps
|
||||
Call Variations
|
||||
|
||||
=#
|
||||
|
||||
# TODO: Add inversions?
|
||||
|
||||
using BioSequences
|
||||
using BioAlignments
|
||||
import BioAlignments: OP_START, OP_SEQ_MATCH, OP_SEQ_MISMATCH, OP_INSERT, OP_DELETE
|
||||
|
||||
abstract type Variation end
|
||||
|
||||
struct Substitution{T} <: Variation
|
||||
alt::T
|
||||
end
|
||||
|
||||
struct Deletion <: Variation
|
||||
len::Int
|
||||
end
|
||||
|
||||
struct Insertion{A} <: Variation
|
||||
seq::LongSequence{A}
|
||||
end
|
||||
|
||||
Base.:(==)(x::Insertion{A}, y::Insertion{A}) where A = x.seq == y.seq
|
||||
|
||||
# Metadata (such as sequence identifier) is intentionally left out
|
||||
struct SeqVar{A <: Alphabet, V <: Variation}
|
||||
ref::LongSequence{A}
|
||||
pos::Int
|
||||
var::V
|
||||
|
||||
function SeqVar{A, V}(ref::LongSequence{A}, pos::Int, var::V) where {A <: Alphabet, V <: Variation}
|
||||
n = new(ref, pos, var)
|
||||
checkvar(n)
|
||||
return n
|
||||
end
|
||||
end
|
||||
|
||||
function SeqVar{A}(ref::LongSequence{A}, pos::Int, var::Variation) where A
|
||||
return SeqVar{A,typeof(var)}(ref, pos, var)
|
||||
end
|
||||
|
||||
function SeqVar(ref::LongSequence{A}, pos::Int, var::Variation) where A
|
||||
return SeqVar{A, typeof(var)}(ref, pos, var)
|
||||
end
|
||||
|
||||
function checkvar(x::SeqVar{A, Deletion}) where A
|
||||
checkbounds(x.ref, x.pos:x.pos + x.var.len - 1)
|
||||
end
|
||||
|
||||
function checkvar(x::SeqVar{A, Substitution{T}}) where {A, T}
|
||||
if T !== eltype(A)
|
||||
throw(ArgumentError("Substitution type must be alphabet eltype"))
|
||||
end
|
||||
checkbounds(x.ref, x.pos)
|
||||
end
|
||||
|
||||
function checkvar(x::SeqVar{A, Insertion{A}}) where A
|
||||
if length(x.var.seq) < 1
|
||||
throw(ArgumentError("Insertions cannot be empty"))
|
||||
end
|
||||
if x.pos ∉ 1:lastindex(x.ref)+1
|
||||
throw(BoundsError(x.ref, x.pos))
|
||||
end
|
||||
end
|
||||
|
||||
function Base.show(io::IO, x::SeqVar{A, Deletion}) where A
|
||||
print(io, 'Δ', x.pos)
|
||||
if x.var.len > 1
|
||||
print(io, '-', x.pos + x.var.len - 1)
|
||||
end
|
||||
end
|
||||
|
||||
function Base.show(io::IO, x::SeqVar{A, <:Substitution}) where A
|
||||
print(io, x.ref[x.pos], x.pos, x.var.alt)
|
||||
end
|
||||
|
||||
function Base.show(io::IO, x::SeqVar{A, Insertion{A}}) where A
|
||||
print(io, x.pos, x.var.seq)
|
||||
end
|
||||
|
||||
function Base.:(==)(x::T, y::T) where {T <: SeqVar}
|
||||
return x.ref === y.ref && x.pos == y.pos && x.var == y.var
|
||||
end
|
||||
|
||||
function variations(ref::S, refaln::S, seqaln::S) where {S <: BioSequence}
|
||||
aln = AlignedSequence(seqaln, refaln)
|
||||
return variations(ref, refaln, seqaln, aln.aln.anchors)
|
||||
end
|
||||
|
||||
function variations(ref::S, refaln::S, seqaln::S, anchors::Vector{AlignmentAnchor}) where {S <: BioSequence{A}} where A
|
||||
result = SeqVar{A}[]
|
||||
isempty(anchors) && return result
|
||||
firstop = first(anchors)
|
||||
if (firstop.op !== OP_START) || (firstop.refpos != 0 | firstop.seqpos != 0)
|
||||
throw(ArgumentError("Alignment must begin with OP_START at positions (0, 0)"))
|
||||
end
|
||||
# refpos and seqpos refer to the ungapped sequences. seqaln refer to the gapped one
|
||||
refpos, seqpos, seqalnpos, seqalnend = 1, 1, 1, 1
|
||||
for anchor in @view anchors[2:end]
|
||||
seqend, refend = anchor.seqpos, anchor.refpos
|
||||
seqalnend = seqalnpos + max(seqend - seqpos, refend - refpos)
|
||||
|
||||
# For mismatches, we add one Variation per mismatch
|
||||
if anchor.op == OP_SEQ_MISMATCH
|
||||
p = seqalnpos
|
||||
for pos in refpos:refend
|
||||
push!(result, SeqVar(ref, pos, Substitution{eltype(A)}(seqaln[p])))
|
||||
p += 1
|
||||
end
|
||||
|
||||
# Deletions are trivial
|
||||
elseif anchor.op == OP_DELETE
|
||||
len = refend - refpos + 1
|
||||
push!(result, SeqVar(ref, refpos, Deletion(len)))
|
||||
|
||||
# Insertions are a little more tricky
|
||||
elseif anchor.op == OP_INSERT
|
||||
push!(result, SeqVar(ref, refend + 1, Insertion{A}(seqaln[seqalnpos:seqalnend])))
|
||||
|
||||
elseif anchor.op == OP_SEQ_MATCH
|
||||
nothing
|
||||
else
|
||||
throw(ArgumentError("Cannot create Variation from operation $(anchor.op)"))
|
||||
end
|
||||
refpos, seqpos, seqalnpos = refend + 1, seqend + 1, seqalnend + 1
|
||||
end
|
||||
return result
|
||||
end
|
||||
|
||||
function posmap(gap, oldref::BioSequence, newref::BioSequence)
|
||||
if length(oldref) != length(newref)
|
||||
throw(ArgumentError("Sequences must be equal-length and aligned"))
|
||||
end
|
||||
result = Vector{Union{Int, Nothing}}(undef, length(oldref))
|
||||
oldpos, newpos = 0, 0
|
||||
for (o, n) in zip(oldref, newref)
|
||||
oldpos += o != gap
|
||||
newpos += n != gap
|
||||
o != gap && (result[oldpos] = n == gap ? nothing : newpos)
|
||||
end
|
||||
return resize!(result, oldpos)
|
||||
end
|
||||
|
||||
"""
|
||||
posmap(oldref::S, newref::S)
|
||||
|
||||
Given two aligned sequences, creates a vector `v` such that `v[o] = n`, where `o` is
|
||||
a position in the old (ungapped) sequence and `n` is a position in the new ungapped sequence.
|
||||
If `o` maps to a gap in the new sequence, `v[o] === nothing`.
|
||||
"""
|
||||
posmap(oldref::S, newref::S) where {A, S <: BioSequence{A}} = posmap(gap(eltype(A)), oldref, newref)
|
||||
|
||||
function rereference(var::SeqVar{A, S}, posmap, ref::LongSequence{A}) where {A, S <: Substitution}
|
||||
newpos = posmap[var.pos]
|
||||
newpos === nothing && throw(ArgumentError("Position $(var.pos) maps to a gap in reference"))
|
||||
return SeqVar{A, S}(ref, newpos::Int, var.var)
|
||||
end
|
||||
|
||||
function checkflanks(posmap, varpos::Int, flank::Int)
|
||||
for i in max(1, varpos - flank) : min(length(posmap), varpos + flank)
|
||||
posmap[i] === nothing && throw(ArgumentError("Position $i maps to a deletion"))
|
||||
end
|
||||
end
|
||||
|
||||
function rereference(var::SeqVar{A, Deletion}, posmap, ref::LongSequence{A}, flank::Int=5) where A
|
||||
checkbounds(posmap, var.pos:var.pos + var.var.len - 1)
|
||||
checkflanks(posmap, var.pos, flank)
|
||||
return SeqVar{A, Deletion}(ref, posmap[var.pos], var.var)
|
||||
end
|
||||
|
||||
function rereference(var::SeqVar{A, Insertion{A}}, posmap, ref::LongSequence{A}, flank::Int=5) where A
|
||||
checkflanks(posmap, var.pos, flank)
|
||||
return SeqVar{A, Insertion{A}}(ref, posmap[var.pos], var.var)
|
||||
end
|
||||
|
||||
|
||||
# Assumes same reference, and that they are not mutally exclusive
|
||||
# (e.g no substitution in deleted areas)
|
||||
#=
|
||||
function reconstruct(v::Vector{<:SeqVar{A}}) where A
|
||||
isempty(v) && throw(ArgumentError("Need at least one Variation to reconstruct sequence"))
|
||||
srt = sort(v, by=x -> x.pos)
|
||||
len = length(v[1].ref)
|
||||
for i in srt
|
||||
if i isa SeqVar{A, <:Deletion}
|
||||
len -= i.var.len
|
||||
elseif i isa SeqVar{A, <:Insertion}
|
||||
len += length(i.var.seq)
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
ref = copy(v[1].ref)
|
||||
oldpos, newpos = 1, 1
|
||||
for i in srt
|
||||
=#
|
||||
|
||||
end
|
56
src/parsing.jl
Normal file
56
src/parsing.jl
Normal file
|
@ -0,0 +1,56 @@
|
|||
const DELETION_PATTERN = r"^Δ(\d+)(?:-(\d+))?$"
|
||||
const INSERTION_PATTERN = r"^(\d+)([A-Z]+)$"
|
||||
const SUBSTITUTION_PATTERN = r"^([A-Z])(\d+)([A-Z])$"
|
||||
const MULTI_SUBST_PATTERN = r"^([A-Z](?:/[A-Z])*)(\d+)([A-Z](?:/[A-Z])*)$"
|
||||
|
||||
function Base.parse(::Type{<:SeqVar{A}}, ref::LongSequence{A}, st::AbstractString) where A
|
||||
m = match(DELETION_PATTERN, st)
|
||||
if m !== nothing
|
||||
return parse(SeqVar{A, Deletion}, ref, m)
|
||||
end
|
||||
m = match(SUBSTITUTION_PATTERN, st)
|
||||
if m !== nothing
|
||||
return parse(SeqVar{A, Substitution{eltype(A)}}, ref, m)
|
||||
end
|
||||
m = match(INSERTION_PATTERN, st)
|
||||
if m !== nothing
|
||||
return parse(SeqVar{A, Insertion{A}}, ref, m)
|
||||
else
|
||||
throw(ArgumentError("Cannot parse $st as Variation"))
|
||||
end
|
||||
end
|
||||
|
||||
function Base.parse(::Type{SeqVar{A, Deletion}}, ref::LongSequence{A}, m::RegexMatch) where A
|
||||
pos = parse(Int, m[1])
|
||||
len = m === nothing ? 1 : parse(Int, m[2]) - pos + 1
|
||||
return SeqVar(ref, pos, Deletion(len))
|
||||
end
|
||||
|
||||
function Base.parse(::Type{<:SeqVar{A, Substitution{T}}}, ref::LongSequence{A}, m::RegexMatch) where {A,T}
|
||||
if T !== eltype(A)
|
||||
throw(ArgumentError("Substitution type must be alphabet eltype"))
|
||||
end
|
||||
refst, pos, altst = m[1], m[2], m[3]
|
||||
pos = parse(Int, pos)
|
||||
refT = T(first(refst))
|
||||
altT = T(first(altst))
|
||||
return SeqVar(ref, pos, Substitution{T}(altT))
|
||||
end
|
||||
|
||||
function Base.parse(::Type{<:SeqVar{A, Insertion{A}}}, ref::LongSequence{A}, m::RegexMatch) where A
|
||||
pos = parse(Int, m[1])
|
||||
# We accept an insertion immediately after the reference, right?
|
||||
checkbounds(ref, pos - (pos == length(ref)))
|
||||
seq = LongSequence{A}(m[2])
|
||||
return SeqVar(ref, pos, Insertion{A}(seq))
|
||||
end
|
||||
|
||||
|
||||
|
||||
totext(m::SeqVar, seqid::IdDict{<:LongSequence, AbstractString}) = string(seqid[m.ref], '\t', m)
|
||||
|
||||
function fromtext(st::AbstractString, seqid::Dict{String, LongSequence{A}}) where A
|
||||
name, varstr = split(st, '\t', limit=2)
|
||||
ref = seqid[name]
|
||||
return parse(SeqVar{A}, ref, varstr)
|
||||
end
|
Loading…
Reference in a new issue