You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
243 lines
8.1 KiB
Julia
243 lines
8.1 KiB
Julia
"""
|
|
Haplotype{S<:BioSequence,T<:BioSymbol}
|
|
|
|
A set of variations within a given sequence that are all found together. Depending on the
|
|
field, it might also be referred to as a "genotype" or "strain."
|
|
|
|
# Constructors
|
|
|
|
Haplotype(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
|
Haplotype(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
|
Haplotype(
|
|
aln::PairwiseAlignment{T,T}
|
|
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
|
|
|
|
When constructing a `Haplotype` from a vector of [`Edit`](@ref)s or [`Variation`](@ref)s,
|
|
the edits are applied sequentially from first to last position, therefore the vector must
|
|
always be sorted by position. These edits are sorted automatically if constructing from an
|
|
alignment.
|
|
"""
|
|
struct Haplotype{S<:BioSequence,T<:BioSymbol}
|
|
ref::S
|
|
edits::Vector{Edit{S,T}}
|
|
|
|
Haplotype{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits)
|
|
end
|
|
|
|
function Haplotype{S,T}(
|
|
ref::S, edits::Vector{Edit{S,T}}
|
|
) where {S<:BioSequence,T<:BioSymbol}
|
|
sort!(edits; by=x -> x.pos)
|
|
result = Haplotype{S,T}(ref, edits, Unsafe())
|
|
valid, message = _is_valid(result)
|
|
valid || error(message)
|
|
return result
|
|
end
|
|
|
|
function Haplotype(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
|
return Haplotype{S,T}(ref, edits)
|
|
end
|
|
|
|
function Base.show(io::IO, x::Haplotype)
|
|
n = length(x.edits)
|
|
print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):")
|
|
for i in x.edits
|
|
v = Variation(x.ref, i)
|
|
print(io, "\n ")
|
|
show(io, v)
|
|
end
|
|
end
|
|
|
|
"""
|
|
_is_valid(h::Haplotype{S,T}) where {S,T}
|
|
|
|
Validate `h`. `h` is invalid if any of its operations are out of bounds, or the same
|
|
position is affected by multiple edits.
|
|
"""
|
|
function _is_valid(h::Haplotype{S,T}) where {S,T}
|
|
# Empty references mean we have nothing to compare to
|
|
isempty(reference(h)) && return (false, "Empty reference")
|
|
# Empty edits simple means that this is the reference haplotype
|
|
isempty(_edits(h)) && return (true, "")
|
|
|
|
# It is valid for edits to exist within the space between the first edit and the entire
|
|
# length of the reference. The reason we can't use simply use the length of the reference
|
|
# is because we couldn't tell if a starting insertion overlapped with a future edit that
|
|
# way. Also, force the range to be a signed integer to avoid subtraction overflow errors
|
|
# when there is a starting insertion.
|
|
valid_positions = UnitRange{Int128}(
|
|
leftposition(first(_edits(h))), UInt64(length(reference(h)))
|
|
)
|
|
# Placeholder flag for iterating through multiple insertions at the same position
|
|
last_was_insert = false
|
|
|
|
for edit in _edits(h)
|
|
pos = leftposition(edit)
|
|
op = _mutation(edit)
|
|
|
|
# Sanity check: for this to be a valid variant, it must be comprised of valid
|
|
# variations
|
|
_is_valid(Variation{S,T}(h.ref, edit, Unsafe())) ||
|
|
return (false, "Invalid Variation")
|
|
|
|
if op isa Substitution
|
|
# For substitutions we simply do not allow another modification of the same base
|
|
pos in valid_positions ||
|
|
return (false, "Multiple modifications at same position")
|
|
|
|
# Invalidate this base's and all previous positions. This only works because
|
|
# we're working with a pre-sorted list of edits.
|
|
valid_positions = (pos + 1):last(valid_positions)
|
|
last_was_insert = false
|
|
elseif op isa Insertion
|
|
# Insertions affect 0 reference bases, so it does not modify the valid positions
|
|
# for next op. However, we cannot have two insertions at the same position, because
|
|
# then the order of them is ambiguous
|
|
pos in (
|
|
(first(valid_positions) - 1 + 2 * last_was_insert):(last(valid_positions) + 1)
|
|
) || return (false, "Multiple insertions at same position")
|
|
|
|
last_was_insert = true
|
|
elseif op isa Deletion
|
|
# Deletions obviously invalidate the reference bases that are deleted.
|
|
len = length(op)
|
|
pos in (first(valid_positions):(last(valid_positions) - len + 1)) ||
|
|
return (false, "Deletion out of range")
|
|
|
|
valid_positions = (first(valid_positions) + len):last(valid_positions)
|
|
last_was_insert = false
|
|
end
|
|
end
|
|
return (true, "")
|
|
end
|
|
|
|
function Haplotype(
|
|
aln::PairwiseAlignment{T,T}
|
|
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
|
|
ref = aln.b
|
|
E = eltype(typeof(ref))
|
|
edits = Edit{T,E}[]
|
|
refpos = first(aln.a.aln.anchors).refpos
|
|
seqpos = first(aln.a.aln.anchors).seqpos
|
|
markpos = 0
|
|
n_gaps = n_ins = 0
|
|
insertion_buffer = E[]
|
|
for (seqi, refi) in aln
|
|
isgap(refi) || (refpos += 1)
|
|
isgap(seqi) || (seqpos += 1)
|
|
|
|
# Check for deletions
|
|
if isgap(seqi)
|
|
iszero(n_gaps) && (markpos = refpos)
|
|
n_gaps += 1
|
|
elseif !iszero(n_gaps)
|
|
push!(edits, Edit{T,E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
|
n_gaps = 0
|
|
end
|
|
|
|
# Check for insertions
|
|
if isgap(refi)
|
|
iszero(n_ins) && (markpos = refpos + 1)
|
|
push!(insertion_buffer, seqi)
|
|
n_ins += 1
|
|
elseif !iszero(n_ins)
|
|
seq = T(insertion_buffer)
|
|
push!(edits, Edit{T,E}(Insertion(seq), UInt(markpos)))
|
|
empty!(insertion_buffer)
|
|
n_ins = 0
|
|
end
|
|
|
|
# Substitutions
|
|
if !isgap(refi) && !isgap(seqi) && seqi != refi
|
|
push!(edits, Edit{T,E}(Substitution{E}(seqi), UInt(refpos)))
|
|
end
|
|
end
|
|
|
|
# Check for clips at the end of the alignment
|
|
last_anchors = aln.a.aln.anchors[(end - 1):end]
|
|
|
|
# Final indel, if applicable
|
|
if !any(anchor -> anchor.op == OP_SOFT_CLIP, last_anchors)
|
|
if !iszero(n_gaps)
|
|
push!(edits, Edit{T,E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
|
elseif !iszero(n_ins)
|
|
push!(edits, Edit{T,E}(Insertion(T(insertion_buffer)), UInt(markpos)))
|
|
end
|
|
end
|
|
|
|
return Haplotype(ref, edits)
|
|
end
|
|
|
|
"""
|
|
_edits(h::Haplotype)
|
|
|
|
Gets the [`Edit`](@ref)s that comprise `h`
|
|
"""
|
|
_edits(h::Haplotype) = h.edits
|
|
|
|
"""
|
|
reference(h::Haplotype)
|
|
|
|
Gets the reference sequence of `h`.
|
|
"""
|
|
reference(h::Haplotype) = h.ref
|
|
Base.:(==)(x::Haplotype, y::Haplotype) = x.ref == y.ref && x.edits == y.edits
|
|
|
|
"""
|
|
reconstruct(h::Haplotype)
|
|
|
|
Apply the edits in `h` to the reference sequence of `h` and return the mutated sequence
|
|
"""
|
|
function reconstruct(h::Haplotype)
|
|
len = length(reference(h)) + sum(edit -> _lendiff(edit), _edits(h))
|
|
seq = copy(reference(h))
|
|
resize!(seq, len % UInt)
|
|
refpos = seqpos = 1
|
|
for edit in _edits(h)
|
|
while refpos < leftposition(edit)
|
|
seq[seqpos] = reference(h)[refpos]
|
|
refpos += 1
|
|
seqpos += 1
|
|
end
|
|
editx = _mutation(edit)
|
|
if editx isa Substitution
|
|
seq[seqpos] = editx.x
|
|
seqpos += 1
|
|
refpos += 1
|
|
elseif editx isa Deletion
|
|
refpos += editx.len
|
|
elseif editx isa Insertion
|
|
for i in editx.seq
|
|
seq[seqpos] = i
|
|
seqpos += 1
|
|
end
|
|
end
|
|
end
|
|
while seqpos ≤ length(seq)
|
|
seq[seqpos] = reference(h)[refpos]
|
|
refpos += 1
|
|
seqpos += 1
|
|
end
|
|
return seq
|
|
end
|
|
|
|
"""
|
|
translate(hap::Haplotype{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}
|
|
|
|
Convert the variations in `hap` to a new reference sequence based upon `aln`. The alignment
|
|
rules follow the conventions of
|
|
[`translate(::Variation, PairwiseAlignment)`](@ref translate(::Variation{S,T}, ::PairwiseAlignment{S,S}) where {S,T}).
|
|
Indels at the beginning or end may not be preserved. Returns a new
|
|
[`Haplotype`](@ref)
|
|
"""
|
|
function translate(hap::Haplotype{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}
|
|
vars = variations(hap)
|
|
new_ref = BA.sequence(aln)
|
|
translated_vars = Variation{S,T}[]
|
|
for v in vars
|
|
new_v = translate(v, aln)
|
|
isnothing(new_v) || push!(translated_vars, new_v)
|
|
end
|
|
return Haplotype(new_ref, translated_vars)
|
|
end
|