Move Edit-related code to Edit.jl

This commit is contained in:
Thomas A. Christensen II 2022-12-30 14:20:03 -06:00
parent a6ad47a568
commit 3f631ace8f
2 changed files with 143 additions and 142 deletions

142
src/Edit.jl Normal file
View file

@ -0,0 +1,142 @@
"""
Substitution
Represents the presence of a `T` at a given position. The position is stored
outside this struct.
"""
struct Substitution{T<:BioSymbol}
x::T
end
Base.:(==)(x::Substitution, y::Substitution) = x.x == y.x
Base.hash(x::Substitution, h::UInt) = hash(Substitution, hash(x.x, h))
"""
Deletion
Represents the deletion of N symbols. The location of the deletion is stored
outside this struct
"""
struct Deletion
len::UInt
function Deletion(len::UInt)
iszero(len) && error("Deletion must be at least 1 symbol")
return new(len)
end
end
Deletion(x::Integer) = Deletion(convert(UInt, x))
Base.length(x::Deletion) = Int(x.len)
Base.hash(x::Deletion, h::UInt) = hash(Deletion, hash(x.len, h))
"""
Insertion{S <: BioSequence}
Represents the insertion of a `S` into a sequence. The location of the insertion
is stored outside the struct.
"""
struct Insertion{S<:BioSequence}
seq::S
function Insertion{S}(x::S) where {S<:BioSequence}
isempty(x) && error("Insertion must be at least 1 symbol")
return new(x)
end
end
Insertion(s::BioSequence) = Insertion{typeof(s)}(s)
Base.length(x::Insertion) = length(x.seq)
Base.:(==)(x::Insertion, y::Insertion) = x.seq == y.seq
Base.hash(x::Insertion, h::UInt) = hash(Insertion, hash(x.seq, h))
"""
Edit{S <: BioSequence, T <: BioSymbol}
An edit of either `Substitution{T}`, `Insertion{S}` or `Deletion` at a position.
If deletion: Deletion of length L at ref pos `pos:pos+L-1`
If insertion: Insertion of length L b/w ref pos `pos:pos+1`
"""
struct Edit{S<:BioSequence,T<:BioSymbol}
x::Union{Substitution{T},Deletion,Insertion{S}}
pos::UInt
end
Base.:(==)(e1::Edit, e2::Edit) = e1.pos == e2.pos && e1.x == e2.x
Base.hash(x::Edit, h::UInt) = hash(Edit, hash((x.x, x.pos), h))
Base.length(e::Edit) = e isa Substitution ? 1 : length(mutation(e))
function Base.parse(::Type{T}, s::AbstractString) where {T<:Edit{Se,Sy}} where {Se,Sy}
return parse(T, String(s))
end
function Base.parse(::Type{<:Edit{Se,Sy}}, s::Union{String,SubString{String}}) where {Se,Sy}
# Either "Δ1-2", "11T" or "G16C"
if (m = match(r"^Δ(\d+)-(\d+)$", s); m) !== nothing
pos = parse(UInt, m[1])
stop = parse(UInt, m[2])
stop pos || throw(ArgumentError("Non-positive deletion length: \"" * s * "\""))
Edit{Se,Sy}(Deletion(stop - pos + 1), pos)
elseif (m = match(r"^(\d+)([A-Za-z]+)$", s); m) !== nothing
pos = parse(UInt, m[1])
seq = Se(m[2])
Edit{Se,Sy}(Insertion(seq), pos)
elseif (m = match(r"^[A-Za-z](\d+)([A-Za-z])$", s); m) !== nothing
pos = parse(UInt, m[1])
sym = Sy(first(m[2]))
Edit{Se,Sy}(Substitution(sym), pos)
else
throw(ArgumentError("Failed to parse edit \"" * s * '"'))
end
end
mutation(e::Edit) = e.x
BioGenerics.leftposition(e::Edit) = e.pos
function BioGenerics.rightposition(e::Edit)
if mutation(e) isa Substitution
return leftposition(e)
elseif mutation(e) isa Insertion
return leftposition(e) + 1
elseif mutation(e) isa Deletion
return leftposition(e) + length(e) - 1
else
error("Unknown mutation type $(typeof(mutation(e)))")
end
end
function lendiff(edit::Edit)
x = edit.x
return x isa Substitution ? 0 : (x isa Deletion ? -length(x) : length(x.x))
end
function _refbases(s::Substitution, reference::S, pos::UInt) where {S<:BioSequence}
return S([reference[pos]])
end
function _altbases(s::Substitution, reference::S, pos::UInt) where {S<:BioSequence}
return S([s.x])
end
function _refbases(d::Deletion, reference::S, pos::UInt) where {S<:BioSequence}
if pos == 1
return S(reference[UnitRange{Int}(pos, pos + length(d))])
else
return S(reference[UnitRange{Int}(pos - 1, pos + length(d) - 1)])
end
end
function _altbases(d::Deletion, reference::S, pos::UInt) where {S<:BioSequence}
if pos == 1
return S([reference[pos + 1]])
else
return S([reference[pos - 1]])
end
end
function _refbases(i::Insertion, reference::S, pos::UInt) where {S<:BioSequence}
return S([reference[pos]])
end
function _altbases(i::Insertion, reference::S, pos::UInt) where {S<:BioSequence}
if pos == 1
return S([i.seq..., reference[pos]])
else
return S([reference[pos], i.seq...])
end
end

View file

@ -31,107 +31,7 @@ const BS = BioSequences
struct Unsafe end struct Unsafe end
struct Inapplicable end struct Inapplicable end
""" include("Edit.jl")
Substitution
Represents the presence of a `T` at a given position. The position is stored
outside this struct.
"""
struct Substitution{T <: BioSymbol}
x::T
end
Base.:(==)(x::Substitution, y::Substitution) = x.x == y.x
Base.hash(x::Substitution, h::UInt) = hash(Substitution, hash(x.x, h))
"""
Deletion
Represents the deletion of N symbols. The location of the deletion is stored
outside this struct
"""
struct Deletion
len::UInt
function Deletion(len::UInt)
iszero(len) && error("Deletion must be at least 1 symbol")
new(len)
end
end
Deletion(x::Integer) = Deletion(convert(UInt, x))
Base.length(x::Deletion) = Int(x.len)
Base.hash(x::Deletion, h::UInt) = hash(Deletion, hash(x.len, h))
"""
Insertion{S <: BioSequence}
Represents the insertion of a `S` into a sequence. The location of the insertion
is stored outside the struct.
"""
struct Insertion{S <: BioSequence}
seq::S
function Insertion{S}(x::S) where {S <: BioSequence}
isempty(x) && error("Insertion must be at least 1 symbol")
new(x)
end
end
Insertion(s::BioSequence) = Insertion{typeof(s)}(s)
Base.length(x::Insertion) = length(x.seq)
Base.:(==)(x::Insertion, y::Insertion) = x.seq == y.seq
Base.hash(x::Insertion, h::UInt) = hash(Insertion, hash(x.seq, h))
"""
Edit{S <: BioSequence, T <: BioSymbol}
An edit of either `Substitution{T}`, `Insertion{S}` or `Deletion` at a position.
If deletion: Deletion of length L at ref pos `pos:pos+L-1`
If insertion: Insertion of length L b/w ref pos `pos:pos+1`
"""
struct Edit{S <: BioSequence, T <: BioSymbol}
x::Union{Substitution{T}, Deletion, Insertion{S}}
pos::UInt
end
Base.:(==)(e1::Edit, e2::Edit) = e1.pos == e2.pos && e1.x == e2.x
Base.hash(x::Edit, h::UInt) = hash(Edit, hash((x.x, x.pos), h))
Base.length(e::Edit) = e isa Substitution ? 1 : length(mutation(e))
function Base.parse(::Type{T}, s::AbstractString) where {T <: Edit{Se, Sy}} where {Se, Sy}
parse(T, String(s))
end
function Base.parse(::Type{<:Edit{Se, Sy}}, s::Union{String, SubString{String}}) where {Se, Sy}
# Either "Δ1-2", "11T" or "G16C"
if (m = match(r"^Δ(\d+)-(\d+)$", s); m) !== nothing
pos = parse(UInt, m[1])
stop = parse(UInt, m[2])
stop pos || throw(ArgumentError("Non-positive deletion length: \"" * s * "\""))
Edit{Se, Sy}(Deletion(stop - pos + 1), pos)
elseif (m = match(r"^(\d+)([A-Za-z]+)$", s); m) !== nothing
pos = parse(UInt, m[1])
seq = Se(m[2])
Edit{Se, Sy}(Insertion(seq), pos)
elseif (m = match(r"^[A-Za-z](\d+)([A-Za-z])$", s); m) !== nothing
pos = parse(UInt, m[1])
sym = Sy(first(m[2]))
Edit{Se, Sy}(Substitution(sym), pos)
else
throw(ArgumentError("Failed to parse edit \"" * s * '"'))
end
end
mutation(e::Edit) = e.x
BioGenerics.leftposition(e::Edit) = e.pos
function BioGenerics.rightposition(e::Edit)
if mutation(e) isa Substitution
return leftposition(e)
elseif mutation(e) isa Insertion
return leftposition(e) + 1
elseif mutation(e) isa Deletion
return leftposition(e) + length(e) - 1
else
error("Unknown mutation type $(typeof(mutation(e)))")
end
end
# Edits are applied sequentially from first to last pos. # Edits are applied sequentially from first to last pos.
# The vector must always be sorted by pos. # The vector must always be sorted by pos.
@ -260,11 +160,6 @@ edits(v::Variant) = v.edits
reference(v::Variant) = v.ref reference(v::Variant) = v.ref
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits
function lendiff(edit::Edit)
x = edit.x
x isa Substitution ? 0 : (x isa Deletion ? -length(x) : length(x.x))
end
function reconstruct!(seq::S, x::Variant{S}) where S function reconstruct!(seq::S, x::Variant{S}) where S
len = length(x.ref) + sum(edit -> lendiff(edit), x.edits) len = length(x.ref) + sum(edit -> lendiff(edit), x.edits)
resize!(seq, len % UInt) resize!(seq, len % UInt)
@ -422,42 +317,6 @@ function variations(v::Variant{S,T}) where {S,T}
return vs return vs
end end
function _refbases(s::Substitution, reference::S, pos::UInt) where S <: BioSequence
return S([reference[pos]])
end
function _altbases(s::Substitution, reference::S, pos::UInt) where S <: BioSequence
return S([s.x])
end
function _refbases(d::Deletion, reference::S, pos::UInt) where S <: BioSequence
if pos == 1
return S(reference[UnitRange{Int}(pos, pos+length(d))])
else
return S(reference[UnitRange{Int}(pos-1, pos+length(d)-1)])
end
end
function _altbases(d::Deletion, reference::S, pos::UInt) where S <: BioSequence
if pos == 1
return S([reference[pos+1]])
else
return S([reference[pos-1]])
end
end
function _refbases(i::Insertion, reference::S, pos::UInt) where S <: BioSequence
return S([reference[pos]])
end
function _altbases(i::Insertion, reference::S, pos::UInt) where S <: BioSequence
if pos == 1
return S([i.seq..., reference[pos]])
else
return S([reference[pos], i.seq...])
end
end
function refbases(v::Variation) function refbases(v::Variation)
return _refbases(mutation(v), reference(v), leftposition(v)) return _refbases(mutation(v), reference(v), leftposition(v))
end end