mirror of
https://github.com/MillironX/SequenceVariation.jl.git
synced 2024-11-22 05:19:55 +00:00
Rename Variant
type to Haplotype
This commit is contained in:
parent
44aa41e628
commit
5839de3ce1
8 changed files with 62 additions and 60 deletions
|
@ -18,8 +18,8 @@ Insertion
|
|||
## Variants
|
||||
|
||||
```@docs
|
||||
Variant
|
||||
reference(::Variant)
|
||||
Haplotype
|
||||
reference(::Haplotype)
|
||||
variations
|
||||
reconstruct!
|
||||
```
|
||||
|
@ -49,7 +49,7 @@ _lendiff
|
|||
|
||||
```@docs
|
||||
_edits
|
||||
_is_valid(::Variant)
|
||||
_is_valid(::Haplotype)
|
||||
```
|
||||
|
||||
### Variations
|
||||
|
|
|
@ -6,7 +6,7 @@ CurrentModule = SequenceVariation
|
|||
|
||||
## Checking for variations in a known variant
|
||||
|
||||
Looking for a known [`Variation`](@ref) within a [`Variant`](@ref) is
|
||||
Looking for a known [`Variation`](@ref) within a [`Haplotype`](@ref) is
|
||||
efficiently accomplished using the `in` operator.
|
||||
|
||||
```@setup call_variants
|
||||
|
@ -21,8 +21,8 @@ bos_ovis_alignment =
|
|||
bos_human_alignment =
|
||||
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
|
||||
|
||||
bos_ovis_variant = Variant(bos_ovis_alignment)
|
||||
bos_human_variant = Variant(bos_human_alignment)
|
||||
bos_ovis_variant = Haplotype(bos_ovis_alignment)
|
||||
bos_human_variant = Haplotype(bos_human_alignment)
|
||||
```
|
||||
|
||||
```@example call_variants
|
||||
|
@ -42,6 +42,6 @@ that aren't validated by another variant.
|
|||
|
||||
```@repl call_variants
|
||||
sheeple = vcat(variations(bos_ovis_variant), variations(bos_human_variant));
|
||||
Variant(bovine, sheeple)
|
||||
Haplotype(bovine, sheeple)
|
||||
reconstruct!(bovine, ans)
|
||||
```
|
||||
|
|
|
@ -8,7 +8,7 @@ CurrentModule = SequenceVariation
|
|||
|
||||
The first step in working with sequence variation is to identify (call)
|
||||
variations. SequenceVariation can directly call variants using the
|
||||
`Variant(::PairwiseAlignment)` constructor of the [`Variant`](@ref) type.
|
||||
`Haplotype(::PairwiseAlignment)` constructor of the [`Haplotype`](@ref) type.
|
||||
|
||||
```@repl call_variants
|
||||
using SequenceVariation, BioAlignments, BioSequences
|
||||
|
@ -22,8 +22,8 @@ bos_ovis_alignment =
|
|||
bos_human_alignment =
|
||||
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
|
||||
|
||||
bos_ovis_variant = Variant(bos_ovis_alignment)
|
||||
bos_human_variant = Variant(bos_human_alignment)
|
||||
bos_ovis_variant = Haplotype(bos_ovis_alignment)
|
||||
bos_human_variant = Haplotype(bos_human_alignment)
|
||||
```
|
||||
|
||||
## Sequence reconstruction
|
||||
|
|
|
@ -27,7 +27,7 @@ mutation(Variation(bovine_ins, "25ACA"))
|
|||
|
||||
## Extraction
|
||||
|
||||
Sequence variations may also be extracted wholesale from a [`Variant`](@ref)
|
||||
Sequence variations may also be extracted wholesale from a [`Haplotype`](@ref)
|
||||
using the [`variations`](@ref) function.
|
||||
|
||||
```@setup call_variants
|
||||
|
@ -42,8 +42,8 @@ bos_ovis_alignment =
|
|||
bos_human_alignment =
|
||||
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
|
||||
|
||||
bos_ovis_variant = Variant(bos_ovis_alignment)
|
||||
bos_human_variant = Variant(bos_human_alignment)
|
||||
bos_ovis_variant = Haplotype(bos_ovis_alignment)
|
||||
bos_human_variant = Haplotype(bos_human_alignment)
|
||||
```
|
||||
|
||||
```@repl call_variants
|
||||
|
|
|
@ -1,41 +1,43 @@
|
|||
"""
|
||||
Variant{S<:BioSequence,T<:BioSymbol}
|
||||
Haplotype{S<:BioSequence,T<:BioSymbol}
|
||||
|
||||
A set of variations within a given sequence that are all found together. Depending on the
|
||||
field, it might also be referred to as a "genotype," "haplotype," or "strain."
|
||||
|
||||
# Constructors
|
||||
|
||||
Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
Variant(
|
||||
Haplotype(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
Haplotype(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
Haplotype(
|
||||
aln::PairwiseAlignment{T,T}
|
||||
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
|
||||
|
||||
When constructing a `Variant` from a vector of [`Edit`](@ref)s or [`Variation`](@ref)s, the
|
||||
edits are applied sequentially from first to last position, therefore the vector must always
|
||||
be sorted by position. These edits are sorted automatically if constructing from an
|
||||
When constructing a `Haplotype` from a vector of [`Edit`](@ref)s or [`Variation`](@ref)s,
|
||||
the edits are applied sequentially from first to last position, therefore the vector must
|
||||
always be sorted by position. These edits are sorted automatically if constructing from an
|
||||
alignment.
|
||||
"""
|
||||
struct Variant{S<:BioSequence,T<:BioSymbol}
|
||||
struct Haplotype{S<:BioSequence,T<:BioSymbol}
|
||||
ref::S
|
||||
edits::Vector{Edit{S,T}}
|
||||
|
||||
Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits)
|
||||
Haplotype{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits)
|
||||
end
|
||||
|
||||
function Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
function Haplotype{S,T}(
|
||||
ref::S, edits::Vector{Edit{S,T}}
|
||||
) where {S<:BioSequence,T<:BioSymbol}
|
||||
sort!(edits; by=x -> x.pos)
|
||||
result = Variant{S,T}(ref, edits, Unsafe())
|
||||
result = Haplotype{S,T}(ref, edits, Unsafe())
|
||||
_is_valid(result) || error("TODO") # report what kind of error message?
|
||||
return result
|
||||
end
|
||||
|
||||
function Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
return Variant{S,T}(ref, edits)
|
||||
function Haplotype(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
return Haplotype{S,T}(ref, edits)
|
||||
end
|
||||
|
||||
function Base.show(io::IO, x::Variant)
|
||||
function Base.show(io::IO, x::Haplotype)
|
||||
n = length(x.edits)
|
||||
print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):")
|
||||
for i in x.edits
|
||||
|
@ -46,12 +48,12 @@ function Base.show(io::IO, x::Variant)
|
|||
end
|
||||
|
||||
"""
|
||||
is_valid(v::Variant)
|
||||
is_valid(v::Haplotype)
|
||||
|
||||
Validate `v`. `v` is invalid if any of its operations are out of bounds, or the same
|
||||
position is affected by multiple edits.
|
||||
"""
|
||||
function _is_valid(v::Variant)
|
||||
function _is_valid(v::Haplotype)
|
||||
isempty(v.ref) && return false
|
||||
valid_positions = 1:length(v.ref)
|
||||
last_was_insert = false
|
||||
|
@ -87,7 +89,7 @@ function _is_valid(v::Variant)
|
|||
return true
|
||||
end
|
||||
|
||||
function Variant(
|
||||
function Haplotype(
|
||||
aln::PairwiseAlignment{T,T}
|
||||
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
|
||||
ref = aln.b
|
||||
|
@ -141,30 +143,30 @@ function Variant(
|
|||
end
|
||||
end
|
||||
|
||||
return Variant(ref, edits)
|
||||
return Haplotype(ref, edits)
|
||||
end
|
||||
|
||||
"""
|
||||
_edits(v::Variant)
|
||||
_edits(v::Haplotype)
|
||||
|
||||
Gets the [`Edit`](@ref)s that comprise `v`
|
||||
"""
|
||||
_edits(v::Variant) = v.edits
|
||||
_edits(v::Haplotype) = v.edits
|
||||
|
||||
"""
|
||||
reference(v::Variant)
|
||||
reference(v::Haplotype)
|
||||
|
||||
Gets the reference sequence of `v`.
|
||||
"""
|
||||
reference(v::Variant) = v.ref
|
||||
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits
|
||||
reference(v::Haplotype) = v.ref
|
||||
Base.:(==)(x::Haplotype, y::Haplotype) = x.ref == y.ref && x.edits == y.edits
|
||||
|
||||
"""
|
||||
reconstruct!(seq::S, x::Variant{S}) where {S}
|
||||
reconstruct!(seq::S, x::Haplotype{S}) where {S}
|
||||
|
||||
Apply the edits in `x` to `seq` and return the mutated sequence
|
||||
"""
|
||||
function reconstruct!(seq::S, x::Variant{S}) where {S}
|
||||
function reconstruct!(seq::S, x::Haplotype{S}) where {S}
|
||||
len = length(x.ref) + sum(edit -> _lendiff(edit), _edits(x))
|
||||
resize!(seq, len % UInt)
|
||||
refpos = seqpos = 1
|
||||
|
|
|
@ -2,10 +2,10 @@ module SequenceVariation
|
|||
|
||||
"""
|
||||
Needs to be able to:
|
||||
* Given a sequence and a reference, create a `Variant` that unambiguously represents
|
||||
* Given a sequence and a reference, create a `Haplotype` that unambiguously represents
|
||||
the sequence
|
||||
|
||||
* Given a `Variant` and a new reference, translate the variant to the new reference.
|
||||
* Given a `Haplotype` and a new reference, translate the variant to the new reference.
|
||||
|
||||
* Given a mutation and a reference and a sequence, determine if the sequence has that
|
||||
mutation
|
||||
|
@ -26,9 +26,9 @@ using BioSequences: BioSequences, BioSequence, NucleotideSeq, LongSequence, isga
|
|||
using BioSymbols: BioSymbol
|
||||
|
||||
export Deletion
|
||||
export Haplotype
|
||||
export Insertion
|
||||
export Substitution
|
||||
export Variant
|
||||
export Variation
|
||||
export altbases
|
||||
export mutation
|
||||
|
|
|
@ -12,7 +12,7 @@ and built-in validation.
|
|||
Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
|
||||
|
||||
Generally speaking, the `Edit` constructor should be avoided to ensure corectness: use of
|
||||
[`variations(::Variant)`](@ref) is encouraged, instead.
|
||||
[`variations(::Haplotype)`](@ref) is encouraged, instead.
|
||||
|
||||
Constructing a `Variation` from an `AbstractString` will parse the from `edit` using the
|
||||
following syntax:
|
||||
|
@ -45,9 +45,9 @@ function Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
|
|||
return Variation{S,T}(ref, e)
|
||||
end
|
||||
|
||||
function Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
function Haplotype(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
edits = _edit.(vars)
|
||||
return Variant{S,T}(ref, edits)
|
||||
return Haplotype{S,T}(ref, edits)
|
||||
end
|
||||
|
||||
"""
|
||||
|
@ -108,7 +108,7 @@ function Base.show(io::IO, x::Variation)
|
|||
end
|
||||
end
|
||||
|
||||
function Base.in(v::Variation, var::Variant)
|
||||
function Base.in(v::Variation, var::Haplotype)
|
||||
if v.ref != var.ref
|
||||
error("References must be equal")
|
||||
end
|
||||
|
@ -171,11 +171,11 @@ function translate(var::Variation{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}
|
|||
end
|
||||
|
||||
"""
|
||||
variations(v::Variant{S,T}) where {S,T}
|
||||
variations(v::Haplotype{S,T}) where {S,T}
|
||||
|
||||
Converts the [`Edit`](@ref)s of `v` into a vector of [`Variation`](@ref)s.
|
||||
"""
|
||||
function variations(v::Variant{S,T}) where {S,T}
|
||||
function variations(v::Haplotype{S,T}) where {S,T}
|
||||
vs = Vector{Variation{S,T}}(undef, length(_edits(v)))
|
||||
for (i, e) in enumerate(_edits(v))
|
||||
vs[i] = Variation{S,T}(reference(v), e)
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
"""
|
||||
Needs to be able to:
|
||||
* Given a sequence and a reference, create a `Variant` that unambiguously represents
|
||||
* Given a sequence and a reference, create a `Haplotype` that unambiguously represents
|
||||
the sequence
|
||||
|
||||
* Given a `Variant` and a new reference, translate the variant to the new reference.
|
||||
* Given a `Haplotype` and a new reference, translate the variant to the new reference.
|
||||
|
||||
* Given a mutation and a reference and a sequence, determine if the sequence has that
|
||||
mutation
|
||||
|
@ -34,12 +34,12 @@ const DNA_MODEL = BioAlignments.AffineGapScoreModel(EDNAFULL; gap_open=-25, gap_
|
|||
align(a::BioSequence, b::BioSequence) = pairalign(GlobalAlignment(), a, b, DNA_MODEL).aln
|
||||
seq1 = ungap!(dna"--ATGCGTGTTAGCAAC--TTATCGCG")
|
||||
seq2 = ungap!(dna"TGATGCGTGT-AGCAACACTTATAGCG")
|
||||
var = Variant(align(seq1, seq2))
|
||||
var = Haplotype(align(seq1, seq2))
|
||||
|
||||
@testset "VariantRoundtrip" begin
|
||||
@testset "HaplotypeRoundtrip" begin
|
||||
for v in variations(var)
|
||||
@test v in var
|
||||
@test v in Variant(seq2, [v])
|
||||
@test v in Haplotype(seq2, [v])
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -51,7 +51,7 @@ end
|
|||
read02 = AlignedSequence(mutseq[3:12], Alignment("10M", 1, 3))
|
||||
aln01 = PairwiseAlignment(read01, refseq)
|
||||
aln02 = PairwiseAlignment(read02, refseq)
|
||||
@test Variant(aln01).edits == Variant(aln02).edits
|
||||
@test Haplotype(aln01).edits == Haplotype(aln02).edits
|
||||
end
|
||||
|
||||
@testset "VariationParsing" begin
|
||||
|
@ -72,7 +72,7 @@ end
|
|||
|
||||
read = AlignedSequence(mutseq[1:10], Alignment("10M", 1, 1))
|
||||
aln = PairwiseAlignment(read, refseq)
|
||||
var = Variant(aln)
|
||||
var = Haplotype(aln)
|
||||
|
||||
sub = Variation(refseq, "A4T")
|
||||
@test first(variations(var)) == sub
|
||||
|
@ -108,31 +108,31 @@ end
|
|||
@test altbases(Variation(dna"ATCGA", "1C")) == dna"CA"
|
||||
end
|
||||
|
||||
@testset "SoftclipVariant" begin
|
||||
@testset "SoftclipHaplotype" begin
|
||||
refseq = dna"GATTACA"
|
||||
mutseq = dna"GATTACAAAA"
|
||||
|
||||
refvar = Variant(refseq, SequenceVariation.Edit{typeof(refseq),eltype(refseq)}[])
|
||||
refvar = Haplotype(refseq, SequenceVariation.Edit{typeof(refseq),eltype(refseq)}[])
|
||||
|
||||
# Test for ending soft clip
|
||||
@test Variant(
|
||||
@test Haplotype(
|
||||
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S", 1, 1)), refseq)
|
||||
) == refvar
|
||||
|
||||
# Test for ending soft+hard clip
|
||||
@test Variant(
|
||||
@test Haplotype(
|
||||
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S2H", 1, 1)), refseq)
|
||||
) == refvar
|
||||
|
||||
# Test that ending insertions are still valid
|
||||
@test length(
|
||||
Variant(
|
||||
Haplotype(
|
||||
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3I", 1, 1)), refseq)
|
||||
).edits,
|
||||
) == 1
|
||||
|
||||
# Test that out-of-bounds bases are still caught
|
||||
@test_throws BoundsError Variant(
|
||||
@test_throws BoundsError Haplotype(
|
||||
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3X", 1, 1)), refseq)
|
||||
)
|
||||
end
|
||||
|
|
Loading…
Reference in a new issue