Rename Variant type to Haplotype

This commit is contained in:
Thomas A. Christensen II 2023-01-04 12:45:25 -06:00
parent 44aa41e628
commit 5839de3ce1
8 changed files with 62 additions and 60 deletions

View file

@ -18,8 +18,8 @@ Insertion
## Variants ## Variants
```@docs ```@docs
Variant Haplotype
reference(::Variant) reference(::Haplotype)
variations variations
reconstruct! reconstruct!
``` ```
@ -49,7 +49,7 @@ _lendiff
```@docs ```@docs
_edits _edits
_is_valid(::Variant) _is_valid(::Haplotype)
``` ```
### Variations ### Variations

View file

@ -6,7 +6,7 @@ CurrentModule = SequenceVariation
## Checking for variations in a known variant ## Checking for variations in a known variant
Looking for a known [`Variation`](@ref) within a [`Variant`](@ref) is Looking for a known [`Variation`](@ref) within a [`Haplotype`](@ref) is
efficiently accomplished using the `in` operator. efficiently accomplished using the `in` operator.
```@setup call_variants ```@setup call_variants
@ -21,8 +21,8 @@ bos_ovis_alignment =
bos_human_alignment = bos_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine); PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
bos_ovis_variant = Variant(bos_ovis_alignment) bos_ovis_variant = Haplotype(bos_ovis_alignment)
bos_human_variant = Variant(bos_human_alignment) bos_human_variant = Haplotype(bos_human_alignment)
``` ```
```@example call_variants ```@example call_variants
@ -42,6 +42,6 @@ that aren't validated by another variant.
```@repl call_variants ```@repl call_variants
sheeple = vcat(variations(bos_ovis_variant), variations(bos_human_variant)); sheeple = vcat(variations(bos_ovis_variant), variations(bos_human_variant));
Variant(bovine, sheeple) Haplotype(bovine, sheeple)
reconstruct!(bovine, ans) reconstruct!(bovine, ans)
``` ```

View file

@ -8,7 +8,7 @@ CurrentModule = SequenceVariation
The first step in working with sequence variation is to identify (call) The first step in working with sequence variation is to identify (call)
variations. SequenceVariation can directly call variants using the variations. SequenceVariation can directly call variants using the
`Variant(::PairwiseAlignment)` constructor of the [`Variant`](@ref) type. `Haplotype(::PairwiseAlignment)` constructor of the [`Haplotype`](@ref) type.
```@repl call_variants ```@repl call_variants
using SequenceVariation, BioAlignments, BioSequences using SequenceVariation, BioAlignments, BioSequences
@ -22,8 +22,8 @@ bos_ovis_alignment =
bos_human_alignment = bos_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine); PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
bos_ovis_variant = Variant(bos_ovis_alignment) bos_ovis_variant = Haplotype(bos_ovis_alignment)
bos_human_variant = Variant(bos_human_alignment) bos_human_variant = Haplotype(bos_human_alignment)
``` ```
## Sequence reconstruction ## Sequence reconstruction

View file

@ -27,7 +27,7 @@ mutation(Variation(bovine_ins, "25ACA"))
## Extraction ## Extraction
Sequence variations may also be extracted wholesale from a [`Variant`](@ref) Sequence variations may also be extracted wholesale from a [`Haplotype`](@ref)
using the [`variations`](@ref) function. using the [`variations`](@ref) function.
```@setup call_variants ```@setup call_variants
@ -42,8 +42,8 @@ bos_ovis_alignment =
bos_human_alignment = bos_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine); PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
bos_ovis_variant = Variant(bos_ovis_alignment) bos_ovis_variant = Haplotype(bos_ovis_alignment)
bos_human_variant = Variant(bos_human_alignment) bos_human_variant = Haplotype(bos_human_alignment)
``` ```
```@repl call_variants ```@repl call_variants

View file

@ -1,41 +1,43 @@
""" """
Variant{S<:BioSequence,T<:BioSymbol} Haplotype{S<:BioSequence,T<:BioSymbol}
A set of variations within a given sequence that are all found together. Depending on the A set of variations within a given sequence that are all found together. Depending on the
field, it might also be referred to as a "genotype," "haplotype," or "strain." field, it might also be referred to as a "genotype," "haplotype," or "strain."
# Constructors # Constructors
Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol} Haplotype(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol} Haplotype(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
Variant( Haplotype(
aln::PairwiseAlignment{T,T} aln::PairwiseAlignment{T,T}
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}} ) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
When constructing a `Variant` from a vector of [`Edit`](@ref)s or [`Variation`](@ref)s, the When constructing a `Haplotype` from a vector of [`Edit`](@ref)s or [`Variation`](@ref)s,
edits are applied sequentially from first to last position, therefore the vector must always the edits are applied sequentially from first to last position, therefore the vector must
be sorted by position. These edits are sorted automatically if constructing from an always be sorted by position. These edits are sorted automatically if constructing from an
alignment. alignment.
""" """
struct Variant{S<:BioSequence,T<:BioSymbol} struct Haplotype{S<:BioSequence,T<:BioSymbol}
ref::S ref::S
edits::Vector{Edit{S,T}} edits::Vector{Edit{S,T}}
Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits) Haplotype{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits)
end end
function Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol} function Haplotype{S,T}(
ref::S, edits::Vector{Edit{S,T}}
) where {S<:BioSequence,T<:BioSymbol}
sort!(edits; by=x -> x.pos) sort!(edits; by=x -> x.pos)
result = Variant{S,T}(ref, edits, Unsafe()) result = Haplotype{S,T}(ref, edits, Unsafe())
_is_valid(result) || error("TODO") # report what kind of error message? _is_valid(result) || error("TODO") # report what kind of error message?
return result return result
end end
function Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol} function Haplotype(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
return Variant{S,T}(ref, edits) return Haplotype{S,T}(ref, edits)
end end
function Base.show(io::IO, x::Variant) function Base.show(io::IO, x::Haplotype)
n = length(x.edits) n = length(x.edits)
print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):") print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):")
for i in x.edits for i in x.edits
@ -46,12 +48,12 @@ function Base.show(io::IO, x::Variant)
end end
""" """
is_valid(v::Variant) is_valid(v::Haplotype)
Validate `v`. `v` is invalid if any of its operations are out of bounds, or the same Validate `v`. `v` is invalid if any of its operations are out of bounds, or the same
position is affected by multiple edits. position is affected by multiple edits.
""" """
function _is_valid(v::Variant) function _is_valid(v::Haplotype)
isempty(v.ref) && return false isempty(v.ref) && return false
valid_positions = 1:length(v.ref) valid_positions = 1:length(v.ref)
last_was_insert = false last_was_insert = false
@ -87,7 +89,7 @@ function _is_valid(v::Variant)
return true return true
end end
function Variant( function Haplotype(
aln::PairwiseAlignment{T,T} aln::PairwiseAlignment{T,T}
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}} ) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
ref = aln.b ref = aln.b
@ -141,30 +143,30 @@ function Variant(
end end
end end
return Variant(ref, edits) return Haplotype(ref, edits)
end end
""" """
_edits(v::Variant) _edits(v::Haplotype)
Gets the [`Edit`](@ref)s that comprise `v` Gets the [`Edit`](@ref)s that comprise `v`
""" """
_edits(v::Variant) = v.edits _edits(v::Haplotype) = v.edits
""" """
reference(v::Variant) reference(v::Haplotype)
Gets the reference sequence of `v`. Gets the reference sequence of `v`.
""" """
reference(v::Variant) = v.ref reference(v::Haplotype) = v.ref
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits Base.:(==)(x::Haplotype, y::Haplotype) = x.ref == y.ref && x.edits == y.edits
""" """
reconstruct!(seq::S, x::Variant{S}) where {S} reconstruct!(seq::S, x::Haplotype{S}) where {S}
Apply the edits in `x` to `seq` and return the mutated sequence Apply the edits in `x` to `seq` and return the mutated sequence
""" """
function reconstruct!(seq::S, x::Variant{S}) where {S} function reconstruct!(seq::S, x::Haplotype{S}) where {S}
len = length(x.ref) + sum(edit -> _lendiff(edit), _edits(x)) len = length(x.ref) + sum(edit -> _lendiff(edit), _edits(x))
resize!(seq, len % UInt) resize!(seq, len % UInt)
refpos = seqpos = 1 refpos = seqpos = 1

View file

@ -2,10 +2,10 @@ module SequenceVariation
""" """
Needs to be able to: Needs to be able to:
* Given a sequence and a reference, create a `Variant` that unambiguously represents * Given a sequence and a reference, create a `Haplotype` that unambiguously represents
the sequence the sequence
* Given a `Variant` and a new reference, translate the variant to the new reference. * Given a `Haplotype` and a new reference, translate the variant to the new reference.
* Given a mutation and a reference and a sequence, determine if the sequence has that * Given a mutation and a reference and a sequence, determine if the sequence has that
mutation mutation
@ -26,9 +26,9 @@ using BioSequences: BioSequences, BioSequence, NucleotideSeq, LongSequence, isga
using BioSymbols: BioSymbol using BioSymbols: BioSymbol
export Deletion export Deletion
export Haplotype
export Insertion export Insertion
export Substitution export Substitution
export Variant
export Variation export Variation
export altbases export altbases
export mutation export mutation

View file

@ -12,7 +12,7 @@ and built-in validation.
Variation(ref::S, edit::AbstractString) where {S<:BioSequence} Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
Generally speaking, the `Edit` constructor should be avoided to ensure corectness: use of Generally speaking, the `Edit` constructor should be avoided to ensure corectness: use of
[`variations(::Variant)`](@ref) is encouraged, instead. [`variations(::Haplotype)`](@ref) is encouraged, instead.
Constructing a `Variation` from an `AbstractString` will parse the from `edit` using the Constructing a `Variation` from an `AbstractString` will parse the from `edit` using the
following syntax: following syntax:
@ -45,9 +45,9 @@ function Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
return Variation{S,T}(ref, e) return Variation{S,T}(ref, e)
end end
function Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol} function Haplotype(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
edits = _edit.(vars) edits = _edit.(vars)
return Variant{S,T}(ref, edits) return Haplotype{S,T}(ref, edits)
end end
""" """
@ -108,7 +108,7 @@ function Base.show(io::IO, x::Variation)
end end
end end
function Base.in(v::Variation, var::Variant) function Base.in(v::Variation, var::Haplotype)
if v.ref != var.ref if v.ref != var.ref
error("References must be equal") error("References must be equal")
end end
@ -171,11 +171,11 @@ function translate(var::Variation{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}
end end
""" """
variations(v::Variant{S,T}) where {S,T} variations(v::Haplotype{S,T}) where {S,T}
Converts the [`Edit`](@ref)s of `v` into a vector of [`Variation`](@ref)s. Converts the [`Edit`](@ref)s of `v` into a vector of [`Variation`](@ref)s.
""" """
function variations(v::Variant{S,T}) where {S,T} function variations(v::Haplotype{S,T}) where {S,T}
vs = Vector{Variation{S,T}}(undef, length(_edits(v))) vs = Vector{Variation{S,T}}(undef, length(_edits(v)))
for (i, e) in enumerate(_edits(v)) for (i, e) in enumerate(_edits(v))
vs[i] = Variation{S,T}(reference(v), e) vs[i] = Variation{S,T}(reference(v), e)

View file

@ -1,9 +1,9 @@
""" """
Needs to be able to: Needs to be able to:
* Given a sequence and a reference, create a `Variant` that unambiguously represents * Given a sequence and a reference, create a `Haplotype` that unambiguously represents
the sequence the sequence
* Given a `Variant` and a new reference, translate the variant to the new reference. * Given a `Haplotype` and a new reference, translate the variant to the new reference.
* Given a mutation and a reference and a sequence, determine if the sequence has that * Given a mutation and a reference and a sequence, determine if the sequence has that
mutation mutation
@ -34,12 +34,12 @@ const DNA_MODEL = BioAlignments.AffineGapScoreModel(EDNAFULL; gap_open=-25, gap_
align(a::BioSequence, b::BioSequence) = pairalign(GlobalAlignment(), a, b, DNA_MODEL).aln align(a::BioSequence, b::BioSequence) = pairalign(GlobalAlignment(), a, b, DNA_MODEL).aln
seq1 = ungap!(dna"--ATGCGTGTTAGCAAC--TTATCGCG") seq1 = ungap!(dna"--ATGCGTGTTAGCAAC--TTATCGCG")
seq2 = ungap!(dna"TGATGCGTGT-AGCAACACTTATAGCG") seq2 = ungap!(dna"TGATGCGTGT-AGCAACACTTATAGCG")
var = Variant(align(seq1, seq2)) var = Haplotype(align(seq1, seq2))
@testset "VariantRoundtrip" begin @testset "HaplotypeRoundtrip" begin
for v in variations(var) for v in variations(var)
@test v in var @test v in var
@test v in Variant(seq2, [v]) @test v in Haplotype(seq2, [v])
end end
end end
@ -51,7 +51,7 @@ end
read02 = AlignedSequence(mutseq[3:12], Alignment("10M", 1, 3)) read02 = AlignedSequence(mutseq[3:12], Alignment("10M", 1, 3))
aln01 = PairwiseAlignment(read01, refseq) aln01 = PairwiseAlignment(read01, refseq)
aln02 = PairwiseAlignment(read02, refseq) aln02 = PairwiseAlignment(read02, refseq)
@test Variant(aln01).edits == Variant(aln02).edits @test Haplotype(aln01).edits == Haplotype(aln02).edits
end end
@testset "VariationParsing" begin @testset "VariationParsing" begin
@ -72,7 +72,7 @@ end
read = AlignedSequence(mutseq[1:10], Alignment("10M", 1, 1)) read = AlignedSequence(mutseq[1:10], Alignment("10M", 1, 1))
aln = PairwiseAlignment(read, refseq) aln = PairwiseAlignment(read, refseq)
var = Variant(aln) var = Haplotype(aln)
sub = Variation(refseq, "A4T") sub = Variation(refseq, "A4T")
@test first(variations(var)) == sub @test first(variations(var)) == sub
@ -108,31 +108,31 @@ end
@test altbases(Variation(dna"ATCGA", "1C")) == dna"CA" @test altbases(Variation(dna"ATCGA", "1C")) == dna"CA"
end end
@testset "SoftclipVariant" begin @testset "SoftclipHaplotype" begin
refseq = dna"GATTACA" refseq = dna"GATTACA"
mutseq = dna"GATTACAAAA" mutseq = dna"GATTACAAAA"
refvar = Variant(refseq, SequenceVariation.Edit{typeof(refseq),eltype(refseq)}[]) refvar = Haplotype(refseq, SequenceVariation.Edit{typeof(refseq),eltype(refseq)}[])
# Test for ending soft clip # Test for ending soft clip
@test Variant( @test Haplotype(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S", 1, 1)), refseq) PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S", 1, 1)), refseq)
) == refvar ) == refvar
# Test for ending soft+hard clip # Test for ending soft+hard clip
@test Variant( @test Haplotype(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S2H", 1, 1)), refseq) PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S2H", 1, 1)), refseq)
) == refvar ) == refvar
# Test that ending insertions are still valid # Test that ending insertions are still valid
@test length( @test length(
Variant( Haplotype(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3I", 1, 1)), refseq) PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3I", 1, 1)), refseq)
).edits, ).edits,
) == 1 ) == 1
# Test that out-of-bounds bases are still caught # Test that out-of-bounds bases are still caught
@test_throws BoundsError Variant( @test_throws BoundsError Haplotype(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3X", 1, 1)), refseq) PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3X", 1, 1)), refseq)
) )
end end