mirror of
https://github.com/MillironX/SequenceVariation.jl.git
synced 2024-11-22 13:29:56 +00:00
Implement translate
This commit is contained in:
parent
8024d50889
commit
92a2d40553
2 changed files with 95 additions and 21 deletions
|
@ -20,9 +20,12 @@ TODO now:
|
||||||
* Add tests
|
* Add tests
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import BioSymbols: BioSymbol
|
using BioSymbols: BioSymbol
|
||||||
using BioAlignments
|
using BioAlignments: BioAlignments, PairwiseAlignment
|
||||||
using BioSequences
|
using BioSequences: BioSequences, BioSequence, NucleotideSeq, AminoAcidSeq, LongSequence, isgap
|
||||||
|
|
||||||
|
const BA = BioAlignments
|
||||||
|
const BS = BioSequences
|
||||||
|
|
||||||
#=
|
#=
|
||||||
import Automa
|
import Automa
|
||||||
|
@ -30,11 +33,7 @@ import Automa.RegExp: @re_str
|
||||||
=#
|
=#
|
||||||
|
|
||||||
struct Unsafe end
|
struct Unsafe end
|
||||||
|
struct Inapplicable end
|
||||||
const DEFAULT_AA_ALN_MODEL = AffineGapScoreModel(BLOSUM62, gap_open=-10, gap_extend=-2)
|
|
||||||
const DEFAULT_DNA_ALN_MODEL = AffineGapScoreModel(EDNAFULL, gap_open=-25, gap_extend=-2)
|
|
||||||
model(::AminoAcidSeq) = DEFAULT_AA_ALN_MODEL
|
|
||||||
model(::NucleotideSeq) = DEFAULT_DNA_ALN_MODEL
|
|
||||||
|
|
||||||
#=
|
#=
|
||||||
const CONTEXT = Automa.CodeGenContext(
|
const CONTEXT = Automa.CodeGenContext(
|
||||||
|
@ -56,7 +55,6 @@ outside this struct.
|
||||||
struct Substitution{T <: BioSymbol}
|
struct Substitution{T <: BioSymbol}
|
||||||
x::T
|
x::T
|
||||||
end
|
end
|
||||||
Substitution(x::BioSymbol) = Substitution{typeof(x)}(x)
|
|
||||||
Base.:(==)(x::Substitution, y::Substitution) = x.x == y.x
|
Base.:(==)(x::Substitution, y::Substitution) = x.x == y.x
|
||||||
Base.hash(x::Substitution, h::UInt) = hash(Substitution, hash(x.x, h))
|
Base.hash(x::Substitution, h::UInt) = hash(Substitution, hash(x.x, h))
|
||||||
|
|
||||||
|
@ -85,7 +83,7 @@ Represents the insertion of a `S` into a sequence. The location of the insertion
|
||||||
is stored outside the struct.
|
is stored outside the struct.
|
||||||
"""
|
"""
|
||||||
struct Insertion{S <: BioSequence}
|
struct Insertion{S <: BioSequence}
|
||||||
x::S
|
seq::S
|
||||||
|
|
||||||
function Insertion{S}(x::S) where {S <: BioSequence}
|
function Insertion{S}(x::S) where {S <: BioSequence}
|
||||||
isempty(x) && error("Insertion must be at least 1 symbol")
|
isempty(x) && error("Insertion must be at least 1 symbol")
|
||||||
|
@ -93,8 +91,9 @@ struct Insertion{S <: BioSequence}
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
Insertion(s::BioSequence) = Insertion{typeof(s)}(s)
|
Insertion(s::BioSequence) = Insertion{typeof(s)}(s)
|
||||||
|
Base.length(x::Insertion) = length(x.seq)
|
||||||
Base.:(==)(x::Insertion, y::Insertion) = x.seq == y.seq
|
Base.:(==)(x::Insertion, y::Insertion) = x.seq == y.seq
|
||||||
Base.hash(x::Insertion, h::UInt) = hash(Insertion, hash(x.x, h))
|
Base.hash(x::Insertion, h::UInt) = hash(Insertion, hash(x.seq, h))
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Edit{S <: BioSequence, T <: BioSymbol}
|
Edit{S <: BioSequence, T <: BioSymbol}
|
||||||
|
@ -110,7 +109,10 @@ end
|
||||||
Base.:(==)(e1::Edit, e2::Edit) = e1.pos == e2.pos && e1.x == e2.x
|
Base.:(==)(e1::Edit, e2::Edit) = e1.pos == e2.pos && e1.x == e2.x
|
||||||
Base.hash(x::Edit, h::UInt) = hash(Edit, hash((x.x, x.pos), h))
|
Base.hash(x::Edit, h::UInt) = hash(Edit, hash((x.x, x.pos), h))
|
||||||
|
|
||||||
Base.parse(::Type{Edit}, s::AbstractString) = parse(Edit, String(s))
|
function Base.parse(::Type{T}, s::AbstractString) where {T <: Edit{Se, Sy}} where {Se, Sy}
|
||||||
|
parse(T, String(s))
|
||||||
|
end
|
||||||
|
|
||||||
function Base.parse(::Type{<:Edit{Se, Sy}}, s::Union{String, SubString{String}}) where {Se, Sy}
|
function Base.parse(::Type{<:Edit{Se, Sy}}, s::Union{String, SubString{String}}) where {Se, Sy}
|
||||||
# Either "Δ1-2", "11T" or "G16C"
|
# Either "Δ1-2", "11T" or "G16C"
|
||||||
if (m = match(r"^Δ(\d+)-(\d+)$", s); m) !== nothing
|
if (m = match(r"^Δ(\d+)-(\d+)$", s); m) !== nothing
|
||||||
|
@ -203,7 +205,6 @@ function Base.show(io::IO, x::Variant)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
# Validate:
|
# Validate:
|
||||||
# A sequence is invalid if any of its operations are out of bounds, or the same position
|
# A sequence is invalid if any of its operations are out of bounds, or the same position
|
||||||
# is affected by multiple edits.
|
# is affected by multiple edits.
|
||||||
|
@ -236,11 +237,7 @@ function is_valid(v::Variant)
|
||||||
return true
|
return true
|
||||||
end
|
end
|
||||||
|
|
||||||
function Variant(seq::T, ref::T) where {T <: LongSequence{<:Union{AminoAcidAlphabet, NucleicAcidAlphabet}}}
|
function Variant(aln::PairwiseAlignment{T, T}) where {T <: LongSequence{<:Union{BS.AminoAcidAlphabet, BS.NucleicAcidAlphabet}}}
|
||||||
return Variant(pairalign(OverlapAlignment(), seq, ref, model(seq)).aln)
|
|
||||||
end
|
|
||||||
|
|
||||||
function Variant(aln::PairwiseAlignment{T, T}) where {T <: LongSequence{<:Union{AminoAcidAlphabet, NucleicAcidAlphabet}}}
|
|
||||||
ref = aln.b
|
ref = aln.b
|
||||||
E = eltype(typeof(ref))
|
E = eltype(typeof(ref))
|
||||||
edits = Edit{T, E}[]
|
edits = Edit{T, E}[]
|
||||||
|
@ -375,9 +372,51 @@ function Base.in(v::Variation, var::Variant)
|
||||||
any(v.edit == edit for edit in var.edits)
|
any(v.edit == edit for edit in var.edits)
|
||||||
end
|
end
|
||||||
|
|
||||||
# How to check if a Variation is present in a sequence?
|
function translate(var::Variation{S, T}, aln::PairwiseAlignment{S, S}) where {S, T}
|
||||||
function hasvariation(seq::S, var::Variation{S, T}) where {S, T}
|
kind = var.edit.x
|
||||||
var in variant(seq, var.ref)
|
pos = var.edit.pos
|
||||||
|
seq, ref = aln.seq, aln.b
|
||||||
|
|
||||||
|
# Special case: Insertions may have a pos of 0, which cannot be mapped to
|
||||||
|
# the seq using ref2seq
|
||||||
|
if iszero(pos)
|
||||||
|
(s, r), _ = iterate(aln)
|
||||||
|
(isgap(s) | isgap(r)) && return Inapplicable()
|
||||||
|
return Variation{S, T}(seq, Edit{S, T}(Insertion(var.edit.x), 0))
|
||||||
|
end
|
||||||
|
|
||||||
|
(seqpos, op) = BA.ref2seq(aln, pos)
|
||||||
|
if kind isa Substitution
|
||||||
|
# If it's a substitution, return nothing if it maps to a deleted
|
||||||
|
# position, or substitutes to same base.
|
||||||
|
op in (BA.OP_MATCH, BA.OP_SEQ_MATCH, BA.OP_SEQ_MISMATCH) || return nothing
|
||||||
|
seq[seqpos] == kind.x && return nothing
|
||||||
|
edit = Edit{S, T}(kind, seqpos)
|
||||||
|
return Variation{S, T}(seq, edit, Unsafe())
|
||||||
|
elseif kind isa Deletion
|
||||||
|
# If it's a deletion, return nothing if the deleted part is already missing
|
||||||
|
# from the new reference.
|
||||||
|
(stop, op2) = BA.ref2seq(aln, pos + length(kind) - 1)
|
||||||
|
start = seqpos + op == BA.OP_DELETE
|
||||||
|
start < stop && return nothing
|
||||||
|
edit = Edit{S, T}(Deletion(stop - start + 1), start)
|
||||||
|
return Variation{S, T}(seq, edit, Unsafe())
|
||||||
|
else
|
||||||
|
# If it maps directly to a symbol, just insert
|
||||||
|
if op in (BA.OP_MATCH, BA.OP_SEQ_MATCH, BA.OP_SEQ_MISMATCH)
|
||||||
|
# This happens if there is already an insertion at the position
|
||||||
|
if pos != lastindex(ref) && first(ref2seq(aln, pos+1)) != seqpos + 1
|
||||||
|
return Inapplicable()
|
||||||
|
else
|
||||||
|
edit = Edit{S, T}(Insertion(var.edit.x), seqpos)
|
||||||
|
return Variation{S, T}(seq, edit, Unsafe())
|
||||||
|
end
|
||||||
|
# Alternatively, it can map to a deletion. In that case, it become really
|
||||||
|
# tricky to talk about the "same" insertion.
|
||||||
|
else
|
||||||
|
return Inapplicable()
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
export Insertion,
|
export Insertion,
|
||||||
|
|
35
test/runtests.jl
Normal file
35
test/runtests.jl
Normal file
|
@ -0,0 +1,35 @@
|
||||||
|
"""
|
||||||
|
Needs to be able to:
|
||||||
|
* Given a sequence and a reference, create a `Variant` that unambiguously represents
|
||||||
|
the sequence
|
||||||
|
|
||||||
|
* Given a `Variant` and a new reference, translate the variant to the new reference.
|
||||||
|
|
||||||
|
* Given a mutation and a reference and a sequence, determine if the sequence has that
|
||||||
|
mutation
|
||||||
|
|
||||||
|
TODO now:
|
||||||
|
* Create a string repr and parser for Edit, perhaps
|
||||||
|
* A243T for sub
|
||||||
|
* 119TAGGCTA for insertion
|
||||||
|
* TGAGCTA9 for deletion
|
||||||
|
* Create a parser + print/show for edit
|
||||||
|
* Play around with some NGS results rel. to picked reference.
|
||||||
|
* Is it easy to construct ref and variants? I.e. is API nice?
|
||||||
|
* Is it nice and easy to check if a mut is present?
|
||||||
|
*
|
||||||
|
|
||||||
|
* Implement "reference switching".
|
||||||
|
* Add tests
|
||||||
|
"""
|
||||||
|
|
||||||
|
using BioSequences
|
||||||
|
using BioAlignments
|
||||||
|
using SequenceVariation
|
||||||
|
|
||||||
|
const DNA_MODEL = BioAlignments.AffineGapScoreModel(EDNAFULL, gap_open=-25, gap_extend=-2)
|
||||||
|
|
||||||
|
align(a::BioSequence, b::BioSequence) = pairalign(GlobalAlignment(), a, b, DNA_MODEL).aln
|
||||||
|
seq1 = ungap!(dna"--ATGCGTGTTAGCAAC--TTATCGCG")
|
||||||
|
seq2 = ungap!(dna"TGATGCGTGT-AGCAACACTTATAGCG")
|
||||||
|
var = Variant(align(seq1, seq2))
|
Loading…
Reference in a new issue