Compare commits

..

No commits in common. "8a71715cd9fa3826da930f0caa965c109810232f" and "3ef4a92ba268faa509547bb9774d070aba38cdd5" have entirely different histories.

10 changed files with 91 additions and 95 deletions

View file

@ -15,7 +15,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Code now follows [Blue style](https://github.com/invenia/BlueStyle) ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28))
- :bomb: [BREAKING] Public and private API defined based on Blue style guidelines ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28))
- :bomb: [BREAKING] Renamed type `Variant` to `Haplotype` ([#20](https://github.com/BioJulia/SequenceVariation.jl/issues/20)/[#29](https://github.com/BioJulia/SequenceVariation.jl/pull/29))
### Removed

View file

@ -14,7 +14,7 @@ makedocs(;
modules=[SequenceVariation],
pages=[
"Home" => "index.md",
"Working with haplotypes" => "haplotypes.md",
"Working with variants" => "variants.md",
"Working with variations" => "variations.md",
"Comparing variations" => "compare.md",
"API Reference" => "api.md",

View file

@ -18,8 +18,8 @@ Insertion
## Variants
```@docs
Haplotype
reference(::Haplotype)
Variant
reference(::Variant)
variations
reconstruct!
```
@ -49,7 +49,7 @@ _lendiff
```@docs
_edits
_is_valid(::Haplotype)
_is_valid(::Variant)
```
### Variations

View file

@ -4,9 +4,9 @@ CurrentModule = SequenceVariation
# Comparing variations in sequences
## Checking for variations in a known haplotype
## Checking for variations in a known variant
Looking for a known [`Variation`](@ref) within a [`Haplotype`](@ref) is
Looking for a known [`Variation`](@ref) within a [`Variant`](@ref) is
efficiently accomplished using the `in` operator.
```@setup call_variants
@ -21,27 +21,27 @@ bos_ovis_alignment =
bos_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
bos_ovis_haplotype = Haplotype(bos_ovis_alignment)
bos_human_haplotype = Haplotype(bos_human_alignment)
bos_ovis_variant = Variant(bos_ovis_alignment)
bos_human_variant = Variant(bos_human_alignment)
```
```@example call_variants
println("\tOvis aires\tHomo sapiens")
for v in vcat(variations(bos_ovis_haplotype), variations(bos_human_haplotype))
is_sheep = v in bos_ovis_haplotype
is_human = v in bos_human_haplotype
for v in vcat(variations(bos_ovis_variant), variations(bos_human_variant))
is_sheep = v in bos_ovis_variant
is_human = v in bos_human_variant
println("$v\t$is_sheep\t\t$is_human")
end
```
## Constructing new haplotypes based on other variations
## Constructing new variants based on other variations
New haplotypes can be constructed using variations. This might be useful to pool
variations found on different reads or to filter variations from a haplotype
that aren't validated by another haplotype.
New variants can be constructed using variations. This might be useful to pool
variations found on different reads or to filter variations from a variant
that aren't validated by another variant.
```@repl call_variants
sheeple = vcat(variations(bos_ovis_haplotype), variations(bos_human_haplotype));
Haplotype(bovine, sheeple)
sheeple = vcat(variations(bos_ovis_variant), variations(bos_human_variant));
Variant(bovine, sheeple)
reconstruct!(bovine, ans)
```

View file

@ -2,14 +2,13 @@
CurrentModule = SequenceVariation
```
# Working with haplotypes
# Working with variants
## Calling variants
The first step in working with sequence variation is to identify (call)
variations between two sequences. SequenceVariation can directly call variants
using the `Haplotype(::PairwiseAlignment)` constructor of the
[`Haplotype`](@ref) type.
variations. SequenceVariation can directly call variants using the
`Variant(::PairwiseAlignment)` constructor of the [`Variant`](@ref) type.
```@repl call_variants
using SequenceVariation, BioAlignments, BioSequences
@ -23,19 +22,19 @@ bos_ovis_alignment =
bos_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
bos_ovis_haplotype = Haplotype(bos_ovis_alignment)
bos_human_haplotype = Haplotype(bos_human_alignment)
bos_ovis_variant = Variant(bos_ovis_alignment)
bos_human_variant = Variant(bos_human_alignment)
```
## Sequence reconstruction
If the alternate sequence of a haplotype is no longer available (as is often the
If the alternate sequence of a variant is no longer available (as is often the
case when calling variants from alignment files), then the sequence can be
retrieved using the [`reconstruct!`](@ref) function.
```@repl call_variants
human2 = copy(bovine);
reconstruct!(human2, bos_human_haplotype)
reconstruct!(human2, bos_human_variant)
human2 == bovine
human2 == human
```

View file

@ -27,7 +27,7 @@ mutation(Variation(bovine_ins, "25ACA"))
## Extraction
Sequence variations may also be extracted wholesale from a [`Haplotype`](@ref)
Sequence variations may also be extracted wholesale from a [`Variant`](@ref)
using the [`variations`](@ref) function.
```@setup call_variants
@ -42,13 +42,13 @@ bos_ovis_alignment =
bos_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
bos_ovis_haplotype = Haplotype(bos_ovis_alignment)
bos_human_haplotype = Haplotype(bos_human_alignment)
bos_ovis_variant = Variant(bos_ovis_alignment)
bos_human_variant = Variant(bos_human_alignment)
```
```@repl call_variants
variations(bos_ovis_haplotype)
variations(bos_human_haplotype)
variations(bos_ovis_variant)
variations(bos_human_variant)
```
## Reference switching
@ -59,8 +59,8 @@ alignment between the new and old references using the [`translate`](@ref).
```@repl call_variants
ovis_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), ovine)
human_variation = first(variations(bos_ovis_haplotype))
human_variation = first(variations(bos_ovis_variant))
reference(ans) == bovine
SequenceVariation.translate(human_variation, ovis_human_haplotype)
SequenceVariation.translate(human_variation, ovis_human_alignment)
reference(ans) == bovine
```

View file

@ -2,10 +2,10 @@ module SequenceVariation
"""
Needs to be able to:
* Given a sequence and a reference, create a `Haplotype` that unambiguously represents
* Given a sequence and a reference, create a `Variant` that unambiguously represents
the sequence
* Given a `Haplotype` and a new reference, translate the variant to the new reference.
* Given a `Variant` and a new reference, translate the variant to the new reference.
* Given a mutation and a reference and a sequence, determine if the sequence has that
mutation
@ -26,9 +26,9 @@ using BioSequences: BioSequences, BioSequence, NucleotideSeq, LongSequence, isga
using BioSymbols: BioSymbol
export Deletion
export Haplotype
export Insertion
export Substitution
export Variant
export Variation
export altbases
export mutation
@ -45,7 +45,7 @@ struct Unsafe end
struct Inapplicable end
include("Edit.jl")
include("Haplotype.jl")
include("Variant.jl")
include("Variation.jl")
end # module

View file

@ -1,43 +1,41 @@
"""
Haplotype{S<:BioSequence,T<:BioSymbol}
Variant{S<:BioSequence,T<:BioSymbol}
A set of variations within a given sequence that are all found together. Depending on the
field, it might also be referred to as a "genotype" or "strain."
field, it might also be referred to as a "genotype," "haplotype," or "strain."
# Constructors
Haplotype(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
Haplotype(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
Haplotype(
Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
Variant(
aln::PairwiseAlignment{T,T}
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
When constructing a `Haplotype` from a vector of [`Edit`](@ref)s or [`Variation`](@ref)s,
the edits are applied sequentially from first to last position, therefore the vector must
always be sorted by position. These edits are sorted automatically if constructing from an
When constructing a `Variant` from a vector of [`Edit`](@ref)s or [`Variation`](@ref)s, the
edits are applied sequentially from first to last position, therefore the vector must always
be sorted by position. These edits are sorted automatically if constructing from an
alignment.
"""
struct Haplotype{S<:BioSequence,T<:BioSymbol}
struct Variant{S<:BioSequence,T<:BioSymbol}
ref::S
edits::Vector{Edit{S,T}}
Haplotype{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits)
Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits)
end
function Haplotype{S,T}(
ref::S, edits::Vector{Edit{S,T}}
) where {S<:BioSequence,T<:BioSymbol}
function Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
sort!(edits; by=x -> x.pos)
result = Haplotype{S,T}(ref, edits, Unsafe())
result = Variant{S,T}(ref, edits, Unsafe())
_is_valid(result) || error("TODO") # report what kind of error message?
return result
end
function Haplotype(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
return Haplotype{S,T}(ref, edits)
function Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
return Variant{S,T}(ref, edits)
end
function Base.show(io::IO, x::Haplotype)
function Base.show(io::IO, x::Variant)
n = length(x.edits)
print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):")
for i in x.edits
@ -48,21 +46,21 @@ function Base.show(io::IO, x::Haplotype)
end
"""
is_valid(h::Haplotype)
is_valid(v::Variant)
Validate `h`. `h` is invalid if any of its operations are out of bounds, or the same
Validate `v`. `v` is invalid if any of its operations are out of bounds, or the same
position is affected by multiple edits.
"""
function _is_valid(h::Haplotype)
isempty(h.ref) && return false
valid_positions = 1:length(h.ref)
function _is_valid(v::Variant)
isempty(v.ref) && return false
valid_positions = 1:length(v.ref)
last_was_insert = false
for edit in h.edits
for edit in v.edits
pos = edit.pos
op = edit.x
# Sanity check: for this to be a valid variant, it must be comprised of valid
# variations
_is_valid(Variation(h.ref, edit)) || return false
_is_valid(Variation(v.ref, edit)) || return false
# For substitutions we simply do not allow another modification of the same base
if op isa Substitution
@ -89,7 +87,7 @@ function _is_valid(h::Haplotype)
return true
end
function Haplotype(
function Variant(
aln::PairwiseAlignment{T,T}
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
ref = aln.b
@ -143,30 +141,30 @@ function Haplotype(
end
end
return Haplotype(ref, edits)
return Variant(ref, edits)
end
"""
_edits(h::Haplotype)
_edits(v::Variant)
Gets the [`Edit`](@ref)s that comprise `h`
Gets the [`Edit`](@ref)s that comprise `v`
"""
_edits(h::Haplotype) = h.edits
_edits(v::Variant) = v.edits
"""
reference(h::Haplotype)
reference(v::Variant)
Gets the reference sequence of `h`.
Gets the reference sequence of `v`.
"""
reference(h::Haplotype) = h.ref
Base.:(==)(x::Haplotype, y::Haplotype) = x.ref == y.ref && x.edits == y.edits
reference(v::Variant) = v.ref
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits
"""
reconstruct!(seq::S, x::Haplotype{S}) where {S}
reconstruct!(seq::S, x::Variant{S}) where {S}
Apply the edits in `x` to `seq` and return the mutated sequence
"""
function reconstruct!(seq::S, x::Haplotype{S}) where {S}
function reconstruct!(seq::S, x::Variant{S}) where {S}
len = length(x.ref) + sum(edit -> _lendiff(edit), _edits(x))
resize!(seq, len % UInt)
refpos = seqpos = 1

View file

@ -12,7 +12,7 @@ and built-in validation.
Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
Generally speaking, the `Edit` constructor should be avoided to ensure corectness: use of
[`variations(::Haplotype)`](@ref) is encouraged, instead.
[`variations(::Variant)`](@ref) is encouraged, instead.
Constructing a `Variation` from an `AbstractString` will parse the from `edit` using the
following syntax:
@ -45,9 +45,9 @@ function Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
return Variation{S,T}(ref, e)
end
function Haplotype(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
function Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
edits = _edit.(vars)
return Haplotype{S,T}(ref, edits)
return Variant{S,T}(ref, edits)
end
"""
@ -108,7 +108,7 @@ function Base.show(io::IO, x::Variation)
end
end
function Base.in(v::Variation, var::Haplotype)
function Base.in(v::Variation, var::Variant)
if v.ref != var.ref
error("References must be equal")
end
@ -171,14 +171,14 @@ function translate(var::Variation{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}
end
"""
variations(h::Haplotype{S,T}) where {S,T}
variations(v::Variant{S,T}) where {S,T}
Converts the [`Edit`](@ref)s of `h` into a vector of [`Variation`](@ref)s.
Converts the [`Edit`](@ref)s of `v` into a vector of [`Variation`](@ref)s.
"""
function variations(h::Haplotype{S,T}) where {S,T}
vs = Vector{Variation{S,T}}(undef, length(_edits(h)))
for (i, e) in enumerate(_edits(h))
vs[i] = Variation{S,T}(reference(h), e)
function variations(v::Variant{S,T}) where {S,T}
vs = Vector{Variation{S,T}}(undef, length(_edits(v)))
for (i, e) in enumerate(_edits(v))
vs[i] = Variation{S,T}(reference(v), e)
end
return vs
end

View file

@ -1,9 +1,9 @@
"""
Needs to be able to:
* Given a sequence and a reference, create a `Haplotype` that unambiguously represents
* Given a sequence and a reference, create a `Variant` that unambiguously represents
the sequence
* Given a `Haplotype` and a new reference, translate the variant to the new reference.
* Given a `Variant` and a new reference, translate the variant to the new reference.
* Given a mutation and a reference and a sequence, determine if the sequence has that
mutation
@ -34,12 +34,12 @@ const DNA_MODEL = BioAlignments.AffineGapScoreModel(EDNAFULL; gap_open=-25, gap_
align(a::BioSequence, b::BioSequence) = pairalign(GlobalAlignment(), a, b, DNA_MODEL).aln
seq1 = ungap!(dna"--ATGCGTGTTAGCAAC--TTATCGCG")
seq2 = ungap!(dna"TGATGCGTGT-AGCAACACTTATAGCG")
var = Haplotype(align(seq1, seq2))
var = Variant(align(seq1, seq2))
@testset "HaplotypeRoundtrip" begin
@testset "VariantRoundtrip" begin
for v in variations(var)
@test v in var
@test v in Haplotype(seq2, [v])
@test v in Variant(seq2, [v])
end
end
@ -51,7 +51,7 @@ end
read02 = AlignedSequence(mutseq[3:12], Alignment("10M", 1, 3))
aln01 = PairwiseAlignment(read01, refseq)
aln02 = PairwiseAlignment(read02, refseq)
@test Haplotype(aln01).edits == Haplotype(aln02).edits
@test Variant(aln01).edits == Variant(aln02).edits
end
@testset "VariationParsing" begin
@ -72,7 +72,7 @@ end
read = AlignedSequence(mutseq[1:10], Alignment("10M", 1, 1))
aln = PairwiseAlignment(read, refseq)
var = Haplotype(aln)
var = Variant(aln)
sub = Variation(refseq, "A4T")
@test first(variations(var)) == sub
@ -108,31 +108,31 @@ end
@test altbases(Variation(dna"ATCGA", "1C")) == dna"CA"
end
@testset "SoftclipHaplotype" begin
@testset "SoftclipVariant" begin
refseq = dna"GATTACA"
mutseq = dna"GATTACAAAA"
refvar = Haplotype(refseq, SequenceVariation.Edit{typeof(refseq),eltype(refseq)}[])
refvar = Variant(refseq, SequenceVariation.Edit{typeof(refseq),eltype(refseq)}[])
# Test for ending soft clip
@test Haplotype(
@test Variant(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S", 1, 1)), refseq)
) == refvar
# Test for ending soft+hard clip
@test Haplotype(
@test Variant(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S2H", 1, 1)), refseq)
) == refvar
# Test that ending insertions are still valid
@test length(
Haplotype(
Variant(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3I", 1, 1)), refseq)
).edits,
) == 1
# Test that out-of-bounds bases are still caught
@test_throws BoundsError Haplotype(
@test_throws BoundsError Variant(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3X", 1, 1)), refseq)
)
end