Compare commits

...

6 commits

10 changed files with 95 additions and 91 deletions

View file

@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Code now follows [Blue style](https://github.com/invenia/BlueStyle) ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28))
- :bomb: [BREAKING] Public and private API defined based on Blue style guidelines ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28))
- :bomb: [BREAKING] Renamed type `Variant` to `Haplotype` ([#20](https://github.com/BioJulia/SequenceVariation.jl/issues/20)/[#29](https://github.com/BioJulia/SequenceVariation.jl/pull/29))
### Removed

View file

@ -14,7 +14,7 @@ makedocs(;
modules=[SequenceVariation],
pages=[
"Home" => "index.md",
"Working with variants" => "variants.md",
"Working with haplotypes" => "haplotypes.md",
"Working with variations" => "variations.md",
"Comparing variations" => "compare.md",
"API Reference" => "api.md",

View file

@ -18,8 +18,8 @@ Insertion
## Variants
```@docs
Variant
reference(::Variant)
Haplotype
reference(::Haplotype)
variations
reconstruct!
```
@ -49,7 +49,7 @@ _lendiff
```@docs
_edits
_is_valid(::Variant)
_is_valid(::Haplotype)
```
### Variations

View file

@ -4,9 +4,9 @@ CurrentModule = SequenceVariation
# Comparing variations in sequences
## Checking for variations in a known variant
## Checking for variations in a known haplotype
Looking for a known [`Variation`](@ref) within a [`Variant`](@ref) is
Looking for a known [`Variation`](@ref) within a [`Haplotype`](@ref) is
efficiently accomplished using the `in` operator.
```@setup call_variants
@ -21,27 +21,27 @@ bos_ovis_alignment =
bos_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
bos_ovis_variant = Variant(bos_ovis_alignment)
bos_human_variant = Variant(bos_human_alignment)
bos_ovis_haplotype = Haplotype(bos_ovis_alignment)
bos_human_haplotype = Haplotype(bos_human_alignment)
```
```@example call_variants
println("\tOvis aires\tHomo sapiens")
for v in vcat(variations(bos_ovis_variant), variations(bos_human_variant))
is_sheep = v in bos_ovis_variant
is_human = v in bos_human_variant
for v in vcat(variations(bos_ovis_haplotype), variations(bos_human_haplotype))
is_sheep = v in bos_ovis_haplotype
is_human = v in bos_human_haplotype
println("$v\t$is_sheep\t\t$is_human")
end
```
## Constructing new variants based on other variations
## Constructing new haplotypes based on other variations
New variants can be constructed using variations. This might be useful to pool
variations found on different reads or to filter variations from a variant
that aren't validated by another variant.
New haplotypes can be constructed using variations. This might be useful to pool
variations found on different reads or to filter variations from a haplotype
that aren't validated by another haplotype.
```@repl call_variants
sheeple = vcat(variations(bos_ovis_variant), variations(bos_human_variant));
Variant(bovine, sheeple)
sheeple = vcat(variations(bos_ovis_haplotype), variations(bos_human_haplotype));
Haplotype(bovine, sheeple)
reconstruct!(bovine, ans)
```

View file

@ -2,13 +2,14 @@
CurrentModule = SequenceVariation
```
# Working with variants
# Working with haplotypes
## Calling variants
The first step in working with sequence variation is to identify (call)
variations. SequenceVariation can directly call variants using the
`Variant(::PairwiseAlignment)` constructor of the [`Variant`](@ref) type.
variations between two sequences. SequenceVariation can directly call variants
using the `Haplotype(::PairwiseAlignment)` constructor of the
[`Haplotype`](@ref) type.
```@repl call_variants
using SequenceVariation, BioAlignments, BioSequences
@ -22,19 +23,19 @@ bos_ovis_alignment =
bos_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
bos_ovis_variant = Variant(bos_ovis_alignment)
bos_human_variant = Variant(bos_human_alignment)
bos_ovis_haplotype = Haplotype(bos_ovis_alignment)
bos_human_haplotype = Haplotype(bos_human_alignment)
```
## Sequence reconstruction
If the alternate sequence of a variant is no longer available (as is often the
If the alternate sequence of a haplotype is no longer available (as is often the
case when calling variants from alignment files), then the sequence can be
retrieved using the [`reconstruct!`](@ref) function.
```@repl call_variants
human2 = copy(bovine);
reconstruct!(human2, bos_human_variant)
reconstruct!(human2, bos_human_haplotype)
human2 == bovine
human2 == human
```

View file

@ -27,7 +27,7 @@ mutation(Variation(bovine_ins, "25ACA"))
## Extraction
Sequence variations may also be extracted wholesale from a [`Variant`](@ref)
Sequence variations may also be extracted wholesale from a [`Haplotype`](@ref)
using the [`variations`](@ref) function.
```@setup call_variants
@ -42,13 +42,13 @@ bos_ovis_alignment =
bos_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
bos_ovis_variant = Variant(bos_ovis_alignment)
bos_human_variant = Variant(bos_human_alignment)
bos_ovis_haplotype = Haplotype(bos_ovis_alignment)
bos_human_haplotype = Haplotype(bos_human_alignment)
```
```@repl call_variants
variations(bos_ovis_variant)
variations(bos_human_variant)
variations(bos_ovis_haplotype)
variations(bos_human_haplotype)
```
## Reference switching
@ -59,8 +59,8 @@ alignment between the new and old references using the [`translate`](@ref).
```@repl call_variants
ovis_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), ovine)
human_variation = first(variations(bos_ovis_variant))
human_variation = first(variations(bos_ovis_haplotype))
reference(ans) == bovine
SequenceVariation.translate(human_variation, ovis_human_alignment)
SequenceVariation.translate(human_variation, ovis_human_haplotype)
reference(ans) == bovine
```

View file

@ -1,41 +1,43 @@
"""
Variant{S<:BioSequence,T<:BioSymbol}
Haplotype{S<:BioSequence,T<:BioSymbol}
A set of variations within a given sequence that are all found together. Depending on the
field, it might also be referred to as a "genotype," "haplotype," or "strain."
field, it might also be referred to as a "genotype" or "strain."
# Constructors
Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
Variant(
Haplotype(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
Haplotype(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
Haplotype(
aln::PairwiseAlignment{T,T}
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
When constructing a `Variant` from a vector of [`Edit`](@ref)s or [`Variation`](@ref)s, the
edits are applied sequentially from first to last position, therefore the vector must always
be sorted by position. These edits are sorted automatically if constructing from an
When constructing a `Haplotype` from a vector of [`Edit`](@ref)s or [`Variation`](@ref)s,
the edits are applied sequentially from first to last position, therefore the vector must
always be sorted by position. These edits are sorted automatically if constructing from an
alignment.
"""
struct Variant{S<:BioSequence,T<:BioSymbol}
struct Haplotype{S<:BioSequence,T<:BioSymbol}
ref::S
edits::Vector{Edit{S,T}}
Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits)
Haplotype{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits)
end
function Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
function Haplotype{S,T}(
ref::S, edits::Vector{Edit{S,T}}
) where {S<:BioSequence,T<:BioSymbol}
sort!(edits; by=x -> x.pos)
result = Variant{S,T}(ref, edits, Unsafe())
result = Haplotype{S,T}(ref, edits, Unsafe())
_is_valid(result) || error("TODO") # report what kind of error message?
return result
end
function Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
return Variant{S,T}(ref, edits)
function Haplotype(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
return Haplotype{S,T}(ref, edits)
end
function Base.show(io::IO, x::Variant)
function Base.show(io::IO, x::Haplotype)
n = length(x.edits)
print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):")
for i in x.edits
@ -46,21 +48,21 @@ function Base.show(io::IO, x::Variant)
end
"""
is_valid(v::Variant)
is_valid(h::Haplotype)
Validate `v`. `v` is invalid if any of its operations are out of bounds, or the same
Validate `h`. `h` is invalid if any of its operations are out of bounds, or the same
position is affected by multiple edits.
"""
function _is_valid(v::Variant)
isempty(v.ref) && return false
valid_positions = 1:length(v.ref)
function _is_valid(h::Haplotype)
isempty(h.ref) && return false
valid_positions = 1:length(h.ref)
last_was_insert = false
for edit in v.edits
for edit in h.edits
pos = edit.pos
op = edit.x
# Sanity check: for this to be a valid variant, it must be comprised of valid
# variations
_is_valid(Variation(v.ref, edit)) || return false
_is_valid(Variation(h.ref, edit)) || return false
# For substitutions we simply do not allow another modification of the same base
if op isa Substitution
@ -87,7 +89,7 @@ function _is_valid(v::Variant)
return true
end
function Variant(
function Haplotype(
aln::PairwiseAlignment{T,T}
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
ref = aln.b
@ -141,30 +143,30 @@ function Variant(
end
end
return Variant(ref, edits)
return Haplotype(ref, edits)
end
"""
_edits(v::Variant)
_edits(h::Haplotype)
Gets the [`Edit`](@ref)s that comprise `v`
Gets the [`Edit`](@ref)s that comprise `h`
"""
_edits(v::Variant) = v.edits
_edits(h::Haplotype) = h.edits
"""
reference(v::Variant)
reference(h::Haplotype)
Gets the reference sequence of `v`.
Gets the reference sequence of `h`.
"""
reference(v::Variant) = v.ref
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits
reference(h::Haplotype) = h.ref
Base.:(==)(x::Haplotype, y::Haplotype) = x.ref == y.ref && x.edits == y.edits
"""
reconstruct!(seq::S, x::Variant{S}) where {S}
reconstruct!(seq::S, x::Haplotype{S}) where {S}
Apply the edits in `x` to `seq` and return the mutated sequence
"""
function reconstruct!(seq::S, x::Variant{S}) where {S}
function reconstruct!(seq::S, x::Haplotype{S}) where {S}
len = length(x.ref) + sum(edit -> _lendiff(edit), _edits(x))
resize!(seq, len % UInt)
refpos = seqpos = 1

View file

@ -2,10 +2,10 @@ module SequenceVariation
"""
Needs to be able to:
* Given a sequence and a reference, create a `Variant` that unambiguously represents
* Given a sequence and a reference, create a `Haplotype` that unambiguously represents
the sequence
* Given a `Variant` and a new reference, translate the variant to the new reference.
* Given a `Haplotype` and a new reference, translate the variant to the new reference.
* Given a mutation and a reference and a sequence, determine if the sequence has that
mutation
@ -26,9 +26,9 @@ using BioSequences: BioSequences, BioSequence, NucleotideSeq, LongSequence, isga
using BioSymbols: BioSymbol
export Deletion
export Haplotype
export Insertion
export Substitution
export Variant
export Variation
export altbases
export mutation
@ -45,7 +45,7 @@ struct Unsafe end
struct Inapplicable end
include("Edit.jl")
include("Variant.jl")
include("Haplotype.jl")
include("Variation.jl")
end # module

View file

@ -12,7 +12,7 @@ and built-in validation.
Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
Generally speaking, the `Edit` constructor should be avoided to ensure corectness: use of
[`variations(::Variant)`](@ref) is encouraged, instead.
[`variations(::Haplotype)`](@ref) is encouraged, instead.
Constructing a `Variation` from an `AbstractString` will parse the from `edit` using the
following syntax:
@ -45,9 +45,9 @@ function Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
return Variation{S,T}(ref, e)
end
function Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
function Haplotype(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
edits = _edit.(vars)
return Variant{S,T}(ref, edits)
return Haplotype{S,T}(ref, edits)
end
"""
@ -108,7 +108,7 @@ function Base.show(io::IO, x::Variation)
end
end
function Base.in(v::Variation, var::Variant)
function Base.in(v::Variation, var::Haplotype)
if v.ref != var.ref
error("References must be equal")
end
@ -171,14 +171,14 @@ function translate(var::Variation{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}
end
"""
variations(v::Variant{S,T}) where {S,T}
variations(h::Haplotype{S,T}) where {S,T}
Converts the [`Edit`](@ref)s of `v` into a vector of [`Variation`](@ref)s.
Converts the [`Edit`](@ref)s of `h` into a vector of [`Variation`](@ref)s.
"""
function variations(v::Variant{S,T}) where {S,T}
vs = Vector{Variation{S,T}}(undef, length(_edits(v)))
for (i, e) in enumerate(_edits(v))
vs[i] = Variation{S,T}(reference(v), e)
function variations(h::Haplotype{S,T}) where {S,T}
vs = Vector{Variation{S,T}}(undef, length(_edits(h)))
for (i, e) in enumerate(_edits(h))
vs[i] = Variation{S,T}(reference(h), e)
end
return vs
end

View file

@ -1,9 +1,9 @@
"""
Needs to be able to:
* Given a sequence and a reference, create a `Variant` that unambiguously represents
* Given a sequence and a reference, create a `Haplotype` that unambiguously represents
the sequence
* Given a `Variant` and a new reference, translate the variant to the new reference.
* Given a `Haplotype` and a new reference, translate the variant to the new reference.
* Given a mutation and a reference and a sequence, determine if the sequence has that
mutation
@ -34,12 +34,12 @@ const DNA_MODEL = BioAlignments.AffineGapScoreModel(EDNAFULL; gap_open=-25, gap_
align(a::BioSequence, b::BioSequence) = pairalign(GlobalAlignment(), a, b, DNA_MODEL).aln
seq1 = ungap!(dna"--ATGCGTGTTAGCAAC--TTATCGCG")
seq2 = ungap!(dna"TGATGCGTGT-AGCAACACTTATAGCG")
var = Variant(align(seq1, seq2))
var = Haplotype(align(seq1, seq2))
@testset "VariantRoundtrip" begin
@testset "HaplotypeRoundtrip" begin
for v in variations(var)
@test v in var
@test v in Variant(seq2, [v])
@test v in Haplotype(seq2, [v])
end
end
@ -51,7 +51,7 @@ end
read02 = AlignedSequence(mutseq[3:12], Alignment("10M", 1, 3))
aln01 = PairwiseAlignment(read01, refseq)
aln02 = PairwiseAlignment(read02, refseq)
@test Variant(aln01).edits == Variant(aln02).edits
@test Haplotype(aln01).edits == Haplotype(aln02).edits
end
@testset "VariationParsing" begin
@ -72,7 +72,7 @@ end
read = AlignedSequence(mutseq[1:10], Alignment("10M", 1, 1))
aln = PairwiseAlignment(read, refseq)
var = Variant(aln)
var = Haplotype(aln)
sub = Variation(refseq, "A4T")
@test first(variations(var)) == sub
@ -108,31 +108,31 @@ end
@test altbases(Variation(dna"ATCGA", "1C")) == dna"CA"
end
@testset "SoftclipVariant" begin
@testset "SoftclipHaplotype" begin
refseq = dna"GATTACA"
mutseq = dna"GATTACAAAA"
refvar = Variant(refseq, SequenceVariation.Edit{typeof(refseq),eltype(refseq)}[])
refvar = Haplotype(refseq, SequenceVariation.Edit{typeof(refseq),eltype(refseq)}[])
# Test for ending soft clip
@test Variant(
@test Haplotype(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S", 1, 1)), refseq)
) == refvar
# Test for ending soft+hard clip
@test Variant(
@test Haplotype(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S2H", 1, 1)), refseq)
) == refvar
# Test that ending insertions are still valid
@test length(
Variant(
Haplotype(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3I", 1, 1)), refseq)
).edits,
) == 1
# Test that out-of-bounds bases are still caught
@test_throws BoundsError Variant(
@test_throws BoundsError Haplotype(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3X", 1, 1)), refseq)
)
end