From 5839de3ce1e406756953a4e55ce3a630349d7c9a Mon Sep 17 00:00:00 2001 From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com> Date: Wed, 4 Jan 2023 12:45:25 -0600 Subject: [PATCH] Rename `Variant` type to `Haplotype` --- docs/src/api.md | 6 ++--- docs/src/compare.md | 8 +++---- docs/src/variants.md | 6 ++--- docs/src/variations.md | 6 ++--- src/Haplotype.jl | 52 +++++++++++++++++++++------------------- src/SequenceVariation.jl | 6 ++--- src/Variation.jl | 12 +++++----- test/runtests.jl | 26 ++++++++++---------- 8 files changed, 62 insertions(+), 60 deletions(-) diff --git a/docs/src/api.md b/docs/src/api.md index 0d7688e..693b315 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -18,8 +18,8 @@ Insertion ## Variants ```@docs -Variant -reference(::Variant) +Haplotype +reference(::Haplotype) variations reconstruct! ``` @@ -49,7 +49,7 @@ _lendiff ```@docs _edits -_is_valid(::Variant) +_is_valid(::Haplotype) ``` ### Variations diff --git a/docs/src/compare.md b/docs/src/compare.md index 9ee3693..3717498 100644 --- a/docs/src/compare.md +++ b/docs/src/compare.md @@ -6,7 +6,7 @@ CurrentModule = SequenceVariation ## Checking for variations in a known variant -Looking for a known [`Variation`](@ref) within a [`Variant`](@ref) is +Looking for a known [`Variation`](@ref) within a [`Haplotype`](@ref) is efficiently accomplished using the `in` operator. ```@setup call_variants @@ -21,8 +21,8 @@ bos_ovis_alignment = bos_human_alignment = PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine); -bos_ovis_variant = Variant(bos_ovis_alignment) -bos_human_variant = Variant(bos_human_alignment) +bos_ovis_variant = Haplotype(bos_ovis_alignment) +bos_human_variant = Haplotype(bos_human_alignment) ``` ```@example call_variants @@ -42,6 +42,6 @@ that aren't validated by another variant. ```@repl call_variants sheeple = vcat(variations(bos_ovis_variant), variations(bos_human_variant)); -Variant(bovine, sheeple) +Haplotype(bovine, sheeple) reconstruct!(bovine, ans) ``` diff --git a/docs/src/variants.md b/docs/src/variants.md index c04abf0..805e1f3 100644 --- a/docs/src/variants.md +++ b/docs/src/variants.md @@ -8,7 +8,7 @@ CurrentModule = SequenceVariation The first step in working with sequence variation is to identify (call) variations. SequenceVariation can directly call variants using the -`Variant(::PairwiseAlignment)` constructor of the [`Variant`](@ref) type. +`Haplotype(::PairwiseAlignment)` constructor of the [`Haplotype`](@ref) type. ```@repl call_variants using SequenceVariation, BioAlignments, BioSequences @@ -22,8 +22,8 @@ bos_ovis_alignment = bos_human_alignment = PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine); -bos_ovis_variant = Variant(bos_ovis_alignment) -bos_human_variant = Variant(bos_human_alignment) +bos_ovis_variant = Haplotype(bos_ovis_alignment) +bos_human_variant = Haplotype(bos_human_alignment) ``` ## Sequence reconstruction diff --git a/docs/src/variations.md b/docs/src/variations.md index 229bae5..d374f0d 100644 --- a/docs/src/variations.md +++ b/docs/src/variations.md @@ -27,7 +27,7 @@ mutation(Variation(bovine_ins, "25ACA")) ## Extraction -Sequence variations may also be extracted wholesale from a [`Variant`](@ref) +Sequence variations may also be extracted wholesale from a [`Haplotype`](@ref) using the [`variations`](@ref) function. ```@setup call_variants @@ -42,8 +42,8 @@ bos_ovis_alignment = bos_human_alignment = PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine); -bos_ovis_variant = Variant(bos_ovis_alignment) -bos_human_variant = Variant(bos_human_alignment) +bos_ovis_variant = Haplotype(bos_ovis_alignment) +bos_human_variant = Haplotype(bos_human_alignment) ``` ```@repl call_variants diff --git a/src/Haplotype.jl b/src/Haplotype.jl index 2ce2aeb..945200d 100644 --- a/src/Haplotype.jl +++ b/src/Haplotype.jl @@ -1,41 +1,43 @@ """ - Variant{S<:BioSequence,T<:BioSymbol} + Haplotype{S<:BioSequence,T<:BioSymbol} A set of variations within a given sequence that are all found together. Depending on the field, it might also be referred to as a "genotype," "haplotype," or "strain." # Constructors - Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol} - Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol} - Variant( + Haplotype(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol} + Haplotype(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol} + Haplotype( aln::PairwiseAlignment{T,T} ) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}} -When constructing a `Variant` from a vector of [`Edit`](@ref)s or [`Variation`](@ref)s, the -edits are applied sequentially from first to last position, therefore the vector must always -be sorted by position. These edits are sorted automatically if constructing from an +When constructing a `Haplotype` from a vector of [`Edit`](@ref)s or [`Variation`](@ref)s, +the edits are applied sequentially from first to last position, therefore the vector must +always be sorted by position. These edits are sorted automatically if constructing from an alignment. """ -struct Variant{S<:BioSequence,T<:BioSymbol} +struct Haplotype{S<:BioSequence,T<:BioSymbol} ref::S edits::Vector{Edit{S,T}} - Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits) + Haplotype{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits) end -function Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol} +function Haplotype{S,T}( + ref::S, edits::Vector{Edit{S,T}} +) where {S<:BioSequence,T<:BioSymbol} sort!(edits; by=x -> x.pos) - result = Variant{S,T}(ref, edits, Unsafe()) + result = Haplotype{S,T}(ref, edits, Unsafe()) _is_valid(result) || error("TODO") # report what kind of error message? return result end -function Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol} - return Variant{S,T}(ref, edits) +function Haplotype(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol} + return Haplotype{S,T}(ref, edits) end -function Base.show(io::IO, x::Variant) +function Base.show(io::IO, x::Haplotype) n = length(x.edits) print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):") for i in x.edits @@ -46,12 +48,12 @@ function Base.show(io::IO, x::Variant) end """ - is_valid(v::Variant) + is_valid(v::Haplotype) Validate `v`. `v` is invalid if any of its operations are out of bounds, or the same position is affected by multiple edits. """ -function _is_valid(v::Variant) +function _is_valid(v::Haplotype) isempty(v.ref) && return false valid_positions = 1:length(v.ref) last_was_insert = false @@ -87,7 +89,7 @@ function _is_valid(v::Variant) return true end -function Variant( +function Haplotype( aln::PairwiseAlignment{T,T} ) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}} ref = aln.b @@ -141,30 +143,30 @@ function Variant( end end - return Variant(ref, edits) + return Haplotype(ref, edits) end """ - _edits(v::Variant) + _edits(v::Haplotype) Gets the [`Edit`](@ref)s that comprise `v` """ -_edits(v::Variant) = v.edits +_edits(v::Haplotype) = v.edits """ - reference(v::Variant) + reference(v::Haplotype) Gets the reference sequence of `v`. """ -reference(v::Variant) = v.ref -Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits +reference(v::Haplotype) = v.ref +Base.:(==)(x::Haplotype, y::Haplotype) = x.ref == y.ref && x.edits == y.edits """ - reconstruct!(seq::S, x::Variant{S}) where {S} + reconstruct!(seq::S, x::Haplotype{S}) where {S} Apply the edits in `x` to `seq` and return the mutated sequence """ -function reconstruct!(seq::S, x::Variant{S}) where {S} +function reconstruct!(seq::S, x::Haplotype{S}) where {S} len = length(x.ref) + sum(edit -> _lendiff(edit), _edits(x)) resize!(seq, len % UInt) refpos = seqpos = 1 diff --git a/src/SequenceVariation.jl b/src/SequenceVariation.jl index fa8f335..93305e6 100644 --- a/src/SequenceVariation.jl +++ b/src/SequenceVariation.jl @@ -2,10 +2,10 @@ module SequenceVariation """ Needs to be able to: -* Given a sequence and a reference, create a `Variant` that unambiguously represents +* Given a sequence and a reference, create a `Haplotype` that unambiguously represents the sequence -* Given a `Variant` and a new reference, translate the variant to the new reference. +* Given a `Haplotype` and a new reference, translate the variant to the new reference. * Given a mutation and a reference and a sequence, determine if the sequence has that mutation @@ -26,9 +26,9 @@ using BioSequences: BioSequences, BioSequence, NucleotideSeq, LongSequence, isga using BioSymbols: BioSymbol export Deletion +export Haplotype export Insertion export Substitution -export Variant export Variation export altbases export mutation diff --git a/src/Variation.jl b/src/Variation.jl index d38f08a..5121707 100644 --- a/src/Variation.jl +++ b/src/Variation.jl @@ -12,7 +12,7 @@ and built-in validation. Variation(ref::S, edit::AbstractString) where {S<:BioSequence} Generally speaking, the `Edit` constructor should be avoided to ensure corectness: use of -[`variations(::Variant)`](@ref) is encouraged, instead. +[`variations(::Haplotype)`](@ref) is encouraged, instead. Constructing a `Variation` from an `AbstractString` will parse the from `edit` using the following syntax: @@ -45,9 +45,9 @@ function Variation(ref::S, edit::AbstractString) where {S<:BioSequence} return Variation{S,T}(ref, e) end -function Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol} +function Haplotype(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol} edits = _edit.(vars) - return Variant{S,T}(ref, edits) + return Haplotype{S,T}(ref, edits) end """ @@ -108,7 +108,7 @@ function Base.show(io::IO, x::Variation) end end -function Base.in(v::Variation, var::Variant) +function Base.in(v::Variation, var::Haplotype) if v.ref != var.ref error("References must be equal") end @@ -171,11 +171,11 @@ function translate(var::Variation{S,T}, aln::PairwiseAlignment{S,S}) where {S,T} end """ - variations(v::Variant{S,T}) where {S,T} + variations(v::Haplotype{S,T}) where {S,T} Converts the [`Edit`](@ref)s of `v` into a vector of [`Variation`](@ref)s. """ -function variations(v::Variant{S,T}) where {S,T} +function variations(v::Haplotype{S,T}) where {S,T} vs = Vector{Variation{S,T}}(undef, length(_edits(v))) for (i, e) in enumerate(_edits(v)) vs[i] = Variation{S,T}(reference(v), e) diff --git a/test/runtests.jl b/test/runtests.jl index 5f163b0..0bc128b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,9 +1,9 @@ """ Needs to be able to: -* Given a sequence and a reference, create a `Variant` that unambiguously represents +* Given a sequence and a reference, create a `Haplotype` that unambiguously represents the sequence -* Given a `Variant` and a new reference, translate the variant to the new reference. +* Given a `Haplotype` and a new reference, translate the variant to the new reference. * Given a mutation and a reference and a sequence, determine if the sequence has that mutation @@ -34,12 +34,12 @@ const DNA_MODEL = BioAlignments.AffineGapScoreModel(EDNAFULL; gap_open=-25, gap_ align(a::BioSequence, b::BioSequence) = pairalign(GlobalAlignment(), a, b, DNA_MODEL).aln seq1 = ungap!(dna"--ATGCGTGTTAGCAAC--TTATCGCG") seq2 = ungap!(dna"TGATGCGTGT-AGCAACACTTATAGCG") -var = Variant(align(seq1, seq2)) +var = Haplotype(align(seq1, seq2)) -@testset "VariantRoundtrip" begin +@testset "HaplotypeRoundtrip" begin for v in variations(var) @test v in var - @test v in Variant(seq2, [v]) + @test v in Haplotype(seq2, [v]) end end @@ -51,7 +51,7 @@ end read02 = AlignedSequence(mutseq[3:12], Alignment("10M", 1, 3)) aln01 = PairwiseAlignment(read01, refseq) aln02 = PairwiseAlignment(read02, refseq) - @test Variant(aln01).edits == Variant(aln02).edits + @test Haplotype(aln01).edits == Haplotype(aln02).edits end @testset "VariationParsing" begin @@ -72,7 +72,7 @@ end read = AlignedSequence(mutseq[1:10], Alignment("10M", 1, 1)) aln = PairwiseAlignment(read, refseq) - var = Variant(aln) + var = Haplotype(aln) sub = Variation(refseq, "A4T") @test first(variations(var)) == sub @@ -108,31 +108,31 @@ end @test altbases(Variation(dna"ATCGA", "1C")) == dna"CA" end -@testset "SoftclipVariant" begin +@testset "SoftclipHaplotype" begin refseq = dna"GATTACA" mutseq = dna"GATTACAAAA" - refvar = Variant(refseq, SequenceVariation.Edit{typeof(refseq),eltype(refseq)}[]) + refvar = Haplotype(refseq, SequenceVariation.Edit{typeof(refseq),eltype(refseq)}[]) # Test for ending soft clip - @test Variant( + @test Haplotype( PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S", 1, 1)), refseq) ) == refvar # Test for ending soft+hard clip - @test Variant( + @test Haplotype( PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S2H", 1, 1)), refseq) ) == refvar # Test that ending insertions are still valid @test length( - Variant( + Haplotype( PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3I", 1, 1)), refseq) ).edits, ) == 1 # Test that out-of-bounds bases are still caught - @test_throws BoundsError Variant( + @test_throws BoundsError Haplotype( PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3X", 1, 1)), refseq) ) end