Rename Variant type to Haplotype

This commit is contained in:
Thomas A. Christensen II 2023-01-04 12:45:25 -06:00
parent 44aa41e628
commit 5839de3ce1
8 changed files with 62 additions and 60 deletions

View file

@ -18,8 +18,8 @@ Insertion
## Variants
```@docs
Variant
reference(::Variant)
Haplotype
reference(::Haplotype)
variations
reconstruct!
```
@ -49,7 +49,7 @@ _lendiff
```@docs
_edits
_is_valid(::Variant)
_is_valid(::Haplotype)
```
### Variations

View file

@ -6,7 +6,7 @@ CurrentModule = SequenceVariation
## Checking for variations in a known variant
Looking for a known [`Variation`](@ref) within a [`Variant`](@ref) is
Looking for a known [`Variation`](@ref) within a [`Haplotype`](@ref) is
efficiently accomplished using the `in` operator.
```@setup call_variants
@ -21,8 +21,8 @@ bos_ovis_alignment =
bos_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
bos_ovis_variant = Variant(bos_ovis_alignment)
bos_human_variant = Variant(bos_human_alignment)
bos_ovis_variant = Haplotype(bos_ovis_alignment)
bos_human_variant = Haplotype(bos_human_alignment)
```
```@example call_variants
@ -42,6 +42,6 @@ that aren't validated by another variant.
```@repl call_variants
sheeple = vcat(variations(bos_ovis_variant), variations(bos_human_variant));
Variant(bovine, sheeple)
Haplotype(bovine, sheeple)
reconstruct!(bovine, ans)
```

View file

@ -8,7 +8,7 @@ CurrentModule = SequenceVariation
The first step in working with sequence variation is to identify (call)
variations. SequenceVariation can directly call variants using the
`Variant(::PairwiseAlignment)` constructor of the [`Variant`](@ref) type.
`Haplotype(::PairwiseAlignment)` constructor of the [`Haplotype`](@ref) type.
```@repl call_variants
using SequenceVariation, BioAlignments, BioSequences
@ -22,8 +22,8 @@ bos_ovis_alignment =
bos_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
bos_ovis_variant = Variant(bos_ovis_alignment)
bos_human_variant = Variant(bos_human_alignment)
bos_ovis_variant = Haplotype(bos_ovis_alignment)
bos_human_variant = Haplotype(bos_human_alignment)
```
## Sequence reconstruction

View file

@ -27,7 +27,7 @@ mutation(Variation(bovine_ins, "25ACA"))
## Extraction
Sequence variations may also be extracted wholesale from a [`Variant`](@ref)
Sequence variations may also be extracted wholesale from a [`Haplotype`](@ref)
using the [`variations`](@ref) function.
```@setup call_variants
@ -42,8 +42,8 @@ bos_ovis_alignment =
bos_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
bos_ovis_variant = Variant(bos_ovis_alignment)
bos_human_variant = Variant(bos_human_alignment)
bos_ovis_variant = Haplotype(bos_ovis_alignment)
bos_human_variant = Haplotype(bos_human_alignment)
```
```@repl call_variants

View file

@ -1,41 +1,43 @@
"""
Variant{S<:BioSequence,T<:BioSymbol}
Haplotype{S<:BioSequence,T<:BioSymbol}
A set of variations within a given sequence that are all found together. Depending on the
field, it might also be referred to as a "genotype," "haplotype," or "strain."
# Constructors
Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
Variant(
Haplotype(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
Haplotype(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
Haplotype(
aln::PairwiseAlignment{T,T}
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
When constructing a `Variant` from a vector of [`Edit`](@ref)s or [`Variation`](@ref)s, the
edits are applied sequentially from first to last position, therefore the vector must always
be sorted by position. These edits are sorted automatically if constructing from an
When constructing a `Haplotype` from a vector of [`Edit`](@ref)s or [`Variation`](@ref)s,
the edits are applied sequentially from first to last position, therefore the vector must
always be sorted by position. These edits are sorted automatically if constructing from an
alignment.
"""
struct Variant{S<:BioSequence,T<:BioSymbol}
struct Haplotype{S<:BioSequence,T<:BioSymbol}
ref::S
edits::Vector{Edit{S,T}}
Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits)
Haplotype{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits)
end
function Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
function Haplotype{S,T}(
ref::S, edits::Vector{Edit{S,T}}
) where {S<:BioSequence,T<:BioSymbol}
sort!(edits; by=x -> x.pos)
result = Variant{S,T}(ref, edits, Unsafe())
result = Haplotype{S,T}(ref, edits, Unsafe())
_is_valid(result) || error("TODO") # report what kind of error message?
return result
end
function Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
return Variant{S,T}(ref, edits)
function Haplotype(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
return Haplotype{S,T}(ref, edits)
end
function Base.show(io::IO, x::Variant)
function Base.show(io::IO, x::Haplotype)
n = length(x.edits)
print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):")
for i in x.edits
@ -46,12 +48,12 @@ function Base.show(io::IO, x::Variant)
end
"""
is_valid(v::Variant)
is_valid(v::Haplotype)
Validate `v`. `v` is invalid if any of its operations are out of bounds, or the same
position is affected by multiple edits.
"""
function _is_valid(v::Variant)
function _is_valid(v::Haplotype)
isempty(v.ref) && return false
valid_positions = 1:length(v.ref)
last_was_insert = false
@ -87,7 +89,7 @@ function _is_valid(v::Variant)
return true
end
function Variant(
function Haplotype(
aln::PairwiseAlignment{T,T}
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
ref = aln.b
@ -141,30 +143,30 @@ function Variant(
end
end
return Variant(ref, edits)
return Haplotype(ref, edits)
end
"""
_edits(v::Variant)
_edits(v::Haplotype)
Gets the [`Edit`](@ref)s that comprise `v`
"""
_edits(v::Variant) = v.edits
_edits(v::Haplotype) = v.edits
"""
reference(v::Variant)
reference(v::Haplotype)
Gets the reference sequence of `v`.
"""
reference(v::Variant) = v.ref
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits
reference(v::Haplotype) = v.ref
Base.:(==)(x::Haplotype, y::Haplotype) = x.ref == y.ref && x.edits == y.edits
"""
reconstruct!(seq::S, x::Variant{S}) where {S}
reconstruct!(seq::S, x::Haplotype{S}) where {S}
Apply the edits in `x` to `seq` and return the mutated sequence
"""
function reconstruct!(seq::S, x::Variant{S}) where {S}
function reconstruct!(seq::S, x::Haplotype{S}) where {S}
len = length(x.ref) + sum(edit -> _lendiff(edit), _edits(x))
resize!(seq, len % UInt)
refpos = seqpos = 1

View file

@ -2,10 +2,10 @@ module SequenceVariation
"""
Needs to be able to:
* Given a sequence and a reference, create a `Variant` that unambiguously represents
* Given a sequence and a reference, create a `Haplotype` that unambiguously represents
the sequence
* Given a `Variant` and a new reference, translate the variant to the new reference.
* Given a `Haplotype` and a new reference, translate the variant to the new reference.
* Given a mutation and a reference and a sequence, determine if the sequence has that
mutation
@ -26,9 +26,9 @@ using BioSequences: BioSequences, BioSequence, NucleotideSeq, LongSequence, isga
using BioSymbols: BioSymbol
export Deletion
export Haplotype
export Insertion
export Substitution
export Variant
export Variation
export altbases
export mutation

View file

@ -12,7 +12,7 @@ and built-in validation.
Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
Generally speaking, the `Edit` constructor should be avoided to ensure corectness: use of
[`variations(::Variant)`](@ref) is encouraged, instead.
[`variations(::Haplotype)`](@ref) is encouraged, instead.
Constructing a `Variation` from an `AbstractString` will parse the from `edit` using the
following syntax:
@ -45,9 +45,9 @@ function Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
return Variation{S,T}(ref, e)
end
function Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
function Haplotype(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
edits = _edit.(vars)
return Variant{S,T}(ref, edits)
return Haplotype{S,T}(ref, edits)
end
"""
@ -108,7 +108,7 @@ function Base.show(io::IO, x::Variation)
end
end
function Base.in(v::Variation, var::Variant)
function Base.in(v::Variation, var::Haplotype)
if v.ref != var.ref
error("References must be equal")
end
@ -171,11 +171,11 @@ function translate(var::Variation{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}
end
"""
variations(v::Variant{S,T}) where {S,T}
variations(v::Haplotype{S,T}) where {S,T}
Converts the [`Edit`](@ref)s of `v` into a vector of [`Variation`](@ref)s.
"""
function variations(v::Variant{S,T}) where {S,T}
function variations(v::Haplotype{S,T}) where {S,T}
vs = Vector{Variation{S,T}}(undef, length(_edits(v)))
for (i, e) in enumerate(_edits(v))
vs[i] = Variation{S,T}(reference(v), e)

View file

@ -1,9 +1,9 @@
"""
Needs to be able to:
* Given a sequence and a reference, create a `Variant` that unambiguously represents
* Given a sequence and a reference, create a `Haplotype` that unambiguously represents
the sequence
* Given a `Variant` and a new reference, translate the variant to the new reference.
* Given a `Haplotype` and a new reference, translate the variant to the new reference.
* Given a mutation and a reference and a sequence, determine if the sequence has that
mutation
@ -34,12 +34,12 @@ const DNA_MODEL = BioAlignments.AffineGapScoreModel(EDNAFULL; gap_open=-25, gap_
align(a::BioSequence, b::BioSequence) = pairalign(GlobalAlignment(), a, b, DNA_MODEL).aln
seq1 = ungap!(dna"--ATGCGTGTTAGCAAC--TTATCGCG")
seq2 = ungap!(dna"TGATGCGTGT-AGCAACACTTATAGCG")
var = Variant(align(seq1, seq2))
var = Haplotype(align(seq1, seq2))
@testset "VariantRoundtrip" begin
@testset "HaplotypeRoundtrip" begin
for v in variations(var)
@test v in var
@test v in Variant(seq2, [v])
@test v in Haplotype(seq2, [v])
end
end
@ -51,7 +51,7 @@ end
read02 = AlignedSequence(mutseq[3:12], Alignment("10M", 1, 3))
aln01 = PairwiseAlignment(read01, refseq)
aln02 = PairwiseAlignment(read02, refseq)
@test Variant(aln01).edits == Variant(aln02).edits
@test Haplotype(aln01).edits == Haplotype(aln02).edits
end
@testset "VariationParsing" begin
@ -72,7 +72,7 @@ end
read = AlignedSequence(mutseq[1:10], Alignment("10M", 1, 1))
aln = PairwiseAlignment(read, refseq)
var = Variant(aln)
var = Haplotype(aln)
sub = Variation(refseq, "A4T")
@test first(variations(var)) == sub
@ -108,31 +108,31 @@ end
@test altbases(Variation(dna"ATCGA", "1C")) == dna"CA"
end
@testset "SoftclipVariant" begin
@testset "SoftclipHaplotype" begin
refseq = dna"GATTACA"
mutseq = dna"GATTACAAAA"
refvar = Variant(refseq, SequenceVariation.Edit{typeof(refseq),eltype(refseq)}[])
refvar = Haplotype(refseq, SequenceVariation.Edit{typeof(refseq),eltype(refseq)}[])
# Test for ending soft clip
@test Variant(
@test Haplotype(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S", 1, 1)), refseq)
) == refvar
# Test for ending soft+hard clip
@test Variant(
@test Haplotype(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S2H", 1, 1)), refseq)
) == refvar
# Test that ending insertions are still valid
@test length(
Variant(
Haplotype(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3I", 1, 1)), refseq)
).edits,
) == 1
# Test that out-of-bounds bases are still caught
@test_throws BoundsError Variant(
@test_throws BoundsError Haplotype(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3X", 1, 1)), refseq)
)
end