Initial commit

This commit is contained in:
Jakob Nybo Nissen 2020-10-01 17:03:01 +02:00
commit ad3fe4303f
4 changed files with 451 additions and 0 deletions

174
Manifest.toml Normal file
View file

@ -0,0 +1,174 @@
# This file is machine-generated - editing it directly is not advised
[[Automa]]
deps = ["DataStructures", "Printf", "Random", "Test", "TranscodingStreams"]
git-tree-sha1 = "c81526bf5f6fb4616b4e22a3cd62ac20e255fd3c"
uuid = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b"
version = "0.8.0"
[[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
[[BioAlignments]]
deps = ["BioGenerics", "BioSequences", "BioSymbols", "IntervalTrees", "LinearAlgebra"]
git-tree-sha1 = "f610a3a965f187890edb0b1fdef4f30d77852edd"
uuid = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"
version = "2.0.0"
[[BioGenerics]]
deps = ["TranscodingStreams"]
git-tree-sha1 = "57deb413ca9f4c8bc7d4c6e98ebe217ff728c737"
uuid = "47718e42-2ac5-11e9-14af-e5595289c2ea"
version = "0.1.0"
[[BioSequences]]
deps = ["BioGenerics", "BioSymbols", "Combinatorics", "IndexableBitVectors", "Printf", "Random", "StableRNGs", "Twiddle"]
git-tree-sha1 = "093ccb9211bdc71924abf8e74a0790af11da35a7"
uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
version = "2.0.5"
[[BioSymbols]]
deps = ["Automa"]
git-tree-sha1 = "ec77888ac3e78f9d372c2b533bdb52668f9e2b09"
uuid = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
version = "4.0.4"
[[Combinatorics]]
git-tree-sha1 = "08c8b6831dc00bfea825826be0bc8336fc369860"
uuid = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
version = "1.0.2"
[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "8cd7b7d1c7f6fcbe7e8743a58adf57788ec7f787"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "3.18.0"
[[DataStructures]]
deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
git-tree-sha1 = "0347f23484a96d56e7096eb1f55c6975be34b11a"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.18.6"
[[Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
[[DelimitedFiles]]
deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
[[Distributed]]
deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
[[IndexableBitVectors]]
deps = ["Random", "Test"]
git-tree-sha1 = "b7f5e42dc867b8a8654a5f899064632dac05bc82"
uuid = "1cb3b9ac-1ffd-5777-9e6b-a3d42300664d"
version = "1.0.0"
[[InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
[[IntervalTrees]]
deps = ["InteractiveUtils", "Profile", "Random", "Test"]
git-tree-sha1 = "6c9fcd87677231ae293f6806fad928c216ab6658"
uuid = "524e6230-43b7-53ae-be76-1e9e4d08d11b"
version = "1.0.0"
[[LibGit2]]
deps = ["Printf"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
[[Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
[[LinearAlgebra]]
deps = ["Libdl"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
[[Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
[[Mmap]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
[[OrderedCollections]]
git-tree-sha1 = "16c08bf5dba06609fe45e30860092d6fa41fde7b"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.3.1"
[[Pkg]]
deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
[[Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[[Profile]]
deps = ["Printf"]
uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
[[REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
[[Random]]
deps = ["Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
[[SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
[[Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
[[SharedArrays]]
deps = ["Distributed", "Mmap", "Random", "Serialization"]
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
[[Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
[[SparseArrays]]
deps = ["LinearAlgebra", "Random"]
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[StableRNGs]]
deps = ["Random", "Test"]
git-tree-sha1 = "b57c4216b6c163a3a9d674f6b9f7b99cdccdb959"
uuid = "860ef19b-820b-49d6-a774-d7a799459cd3"
version = "0.1.2"
[[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
[[Test]]
deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[[TranscodingStreams]]
deps = ["Random", "Test"]
git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c"
uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
version = "0.9.5"
[[Twiddle]]
git-tree-sha1 = "29509c4862bfb5da9e76eb6937125ab93986270a"
uuid = "7200193e-83a8-5a55-b20d-5d36d44a0795"
version = "1.1.2"
[[UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

3
Project.toml Normal file
View file

@ -0,0 +1,3 @@
[deps]
BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"

218
src/SequenceVariation.jl Normal file
View file

@ -0,0 +1,218 @@
module t
# TODO: Add functionality to move a Variation to a new reference
# needs to be done with heavy checks to make sure the alignment of the two
# is not full of gaps in the area where the Variation is
#=
Substitution should work just if the ref nuc don't match a deletion
Insertions/deletions should work if there are only matches (+/- 3 nucs), or at the ends
Perhaps first make an array of oldref -> newref, with some positions marked nothing?
=#
# TODO: Do we need to prevent calling indels at the ends of alignments?
# No, we need a filtering step that trims the alignment before calling Variation
#=
Extract alignment
Trim ends of gaps
Call Variations
=#
# TODO: Add inversions?
using BioSequences
using BioAlignments
import BioAlignments: OP_START, OP_SEQ_MATCH, OP_SEQ_MISMATCH, OP_INSERT, OP_DELETE
abstract type Variation end
struct Substitution{T} <: Variation
alt::T
end
struct Deletion <: Variation
len::Int
end
struct Insertion{A} <: Variation
seq::LongSequence{A}
end
Base.:(==)(x::Insertion{A}, y::Insertion{A}) where A = x.seq == y.seq
# Metadata (such as sequence identifier) is intentionally left out
struct SeqVar{A <: Alphabet, V <: Variation}
ref::LongSequence{A}
pos::Int
var::V
function SeqVar{A, V}(ref::LongSequence{A}, pos::Int, var::V) where {A <: Alphabet, V <: Variation}
n = new(ref, pos, var)
checkvar(n)
return n
end
end
function SeqVar{A}(ref::LongSequence{A}, pos::Int, var::Variation) where A
return SeqVar{A,typeof(var)}(ref, pos, var)
end
function SeqVar(ref::LongSequence{A}, pos::Int, var::Variation) where A
return SeqVar{A, typeof(var)}(ref, pos, var)
end
function checkvar(x::SeqVar{A, Deletion}) where A
checkbounds(x.ref, x.pos:x.pos + x.var.len - 1)
end
function checkvar(x::SeqVar{A, Substitution{T}}) where {A, T}
if T !== eltype(A)
throw(ArgumentError("Substitution type must be alphabet eltype"))
end
checkbounds(x.ref, x.pos)
end
function checkvar(x::SeqVar{A, Insertion{A}}) where A
if length(x.var.seq) < 1
throw(ArgumentError("Insertions cannot be empty"))
end
if x.pos 1:lastindex(x.ref)+1
throw(BoundsError(x.ref, x.pos))
end
end
function Base.show(io::IO, x::SeqVar{A, Deletion}) where A
print(io, 'Δ', x.pos)
if x.var.len > 1
print(io, '-', x.pos + x.var.len - 1)
end
end
function Base.show(io::IO, x::SeqVar{A, <:Substitution}) where A
print(io, x.ref[x.pos], x.pos, x.var.alt)
end
function Base.show(io::IO, x::SeqVar{A, Insertion{A}}) where A
print(io, x.pos, x.var.seq)
end
function Base.:(==)(x::T, y::T) where {T <: SeqVar}
return x.ref === y.ref && x.pos == y.pos && x.var == y.var
end
function variations(ref::S, refaln::S, seqaln::S) where {S <: BioSequence}
aln = AlignedSequence(seqaln, refaln)
return variations(ref, refaln, seqaln, aln.aln.anchors)
end
function variations(ref::S, refaln::S, seqaln::S, anchors::Vector{AlignmentAnchor}) where {S <: BioSequence{A}} where A
result = SeqVar{A}[]
isempty(anchors) && return result
firstop = first(anchors)
if (firstop.op !== OP_START) || (firstop.refpos != 0 | firstop.seqpos != 0)
throw(ArgumentError("Alignment must begin with OP_START at positions (0, 0)"))
end
# refpos and seqpos refer to the ungapped sequences. seqaln refer to the gapped one
refpos, seqpos, seqalnpos, seqalnend = 1, 1, 1, 1
for anchor in @view anchors[2:end]
seqend, refend = anchor.seqpos, anchor.refpos
seqalnend = seqalnpos + max(seqend - seqpos, refend - refpos)
# For mismatches, we add one Variation per mismatch
if anchor.op == OP_SEQ_MISMATCH
p = seqalnpos
for pos in refpos:refend
push!(result, SeqVar(ref, pos, Substitution{eltype(A)}(seqaln[p])))
p += 1
end
# Deletions are trivial
elseif anchor.op == OP_DELETE
len = refend - refpos + 1
push!(result, SeqVar(ref, refpos, Deletion(len)))
# Insertions are a little more tricky
elseif anchor.op == OP_INSERT
push!(result, SeqVar(ref, refend + 1, Insertion{A}(seqaln[seqalnpos:seqalnend])))
elseif anchor.op == OP_SEQ_MATCH
nothing
else
throw(ArgumentError("Cannot create Variation from operation $(anchor.op)"))
end
refpos, seqpos, seqalnpos = refend + 1, seqend + 1, seqalnend + 1
end
return result
end
function posmap(gap, oldref::BioSequence, newref::BioSequence)
if length(oldref) != length(newref)
throw(ArgumentError("Sequences must be equal-length and aligned"))
end
result = Vector{Union{Int, Nothing}}(undef, length(oldref))
oldpos, newpos = 0, 0
for (o, n) in zip(oldref, newref)
oldpos += o != gap
newpos += n != gap
o != gap && (result[oldpos] = n == gap ? nothing : newpos)
end
return resize!(result, oldpos)
end
"""
posmap(oldref::S, newref::S)
Given two aligned sequences, creates a vector `v` such that `v[o] = n`, where `o` is
a position in the old (ungapped) sequence and `n` is a position in the new ungapped sequence.
If `o` maps to a gap in the new sequence, `v[o] === nothing`.
"""
posmap(oldref::S, newref::S) where {A, S <: BioSequence{A}} = posmap(gap(eltype(A)), oldref, newref)
function rereference(var::SeqVar{A, S}, posmap, ref::LongSequence{A}) where {A, S <: Substitution}
newpos = posmap[var.pos]
newpos === nothing && throw(ArgumentError("Position $(var.pos) maps to a gap in reference"))
return SeqVar{A, S}(ref, newpos::Int, var.var)
end
function checkflanks(posmap, varpos::Int, flank::Int)
for i in max(1, varpos - flank) : min(length(posmap), varpos + flank)
posmap[i] === nothing && throw(ArgumentError("Position $i maps to a deletion"))
end
end
function rereference(var::SeqVar{A, Deletion}, posmap, ref::LongSequence{A}, flank::Int=5) where A
checkbounds(posmap, var.pos:var.pos + var.var.len - 1)
checkflanks(posmap, var.pos, flank)
return SeqVar{A, Deletion}(ref, posmap[var.pos], var.var)
end
function rereference(var::SeqVar{A, Insertion{A}}, posmap, ref::LongSequence{A}, flank::Int=5) where A
checkflanks(posmap, var.pos, flank)
return SeqVar{A, Insertion{A}}(ref, posmap[var.pos], var.var)
end
# Assumes same reference, and that they are not mutally exclusive
# (e.g no substitution in deleted areas)
#=
function reconstruct(v::Vector{<:SeqVar{A}}) where A
isempty(v) && throw(ArgumentError("Need at least one Variation to reconstruct sequence"))
srt = sort(v, by=x -> x.pos)
len = length(v[1].ref)
for i in srt
if i isa SeqVar{A, <:Deletion}
len -= i.var.len
elseif i isa SeqVar{A, <:Insertion}
len += length(i.var.seq)
end
end
ref = copy(v[1].ref)
oldpos, newpos = 1, 1
for i in srt
=#
end

56
src/parsing.jl Normal file
View file

@ -0,0 +1,56 @@
const DELETION_PATTERN = r"^Δ(\d+)(?:-(\d+))?$"
const INSERTION_PATTERN = r"^(\d+)([A-Z]+)$"
const SUBSTITUTION_PATTERN = r"^([A-Z])(\d+)([A-Z])$"
const MULTI_SUBST_PATTERN = r"^([A-Z](?:/[A-Z])*)(\d+)([A-Z](?:/[A-Z])*)$"
function Base.parse(::Type{<:SeqVar{A}}, ref::LongSequence{A}, st::AbstractString) where A
m = match(DELETION_PATTERN, st)
if m !== nothing
return parse(SeqVar{A, Deletion}, ref, m)
end
m = match(SUBSTITUTION_PATTERN, st)
if m !== nothing
return parse(SeqVar{A, Substitution{eltype(A)}}, ref, m)
end
m = match(INSERTION_PATTERN, st)
if m !== nothing
return parse(SeqVar{A, Insertion{A}}, ref, m)
else
throw(ArgumentError("Cannot parse $st as Variation"))
end
end
function Base.parse(::Type{SeqVar{A, Deletion}}, ref::LongSequence{A}, m::RegexMatch) where A
pos = parse(Int, m[1])
len = m === nothing ? 1 : parse(Int, m[2]) - pos + 1
return SeqVar(ref, pos, Deletion(len))
end
function Base.parse(::Type{<:SeqVar{A, Substitution{T}}}, ref::LongSequence{A}, m::RegexMatch) where {A,T}
if T !== eltype(A)
throw(ArgumentError("Substitution type must be alphabet eltype"))
end
refst, pos, altst = m[1], m[2], m[3]
pos = parse(Int, pos)
refT = T(first(refst))
altT = T(first(altst))
return SeqVar(ref, pos, Substitution{T}(altT))
end
function Base.parse(::Type{<:SeqVar{A, Insertion{A}}}, ref::LongSequence{A}, m::RegexMatch) where A
pos = parse(Int, m[1])
# We accept an insertion immediately after the reference, right?
checkbounds(ref, pos - (pos == length(ref)))
seq = LongSequence{A}(m[2])
return SeqVar(ref, pos, Insertion{A}(seq))
end
totext(m::SeqVar, seqid::IdDict{<:LongSequence, AbstractString}) = string(seqid[m.ref], '\t', m)
function fromtext(st::AbstractString, seqid::Dict{String, LongSequence{A}}) where A
name, varstr = split(st, '\t', limit=2)
ref = seqid[name]
return parse(SeqVar{A}, ref, varstr)
end