Compare commits

...

6 commits

10 changed files with 95 additions and 91 deletions

View file

@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Code now follows [Blue style](https://github.com/invenia/BlueStyle) ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28)) - Code now follows [Blue style](https://github.com/invenia/BlueStyle) ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28))
- :bomb: [BREAKING] Public and private API defined based on Blue style guidelines ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28)) - :bomb: [BREAKING] Public and private API defined based on Blue style guidelines ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28))
- :bomb: [BREAKING] Renamed type `Variant` to `Haplotype` ([#20](https://github.com/BioJulia/SequenceVariation.jl/issues/20)/[#29](https://github.com/BioJulia/SequenceVariation.jl/pull/29))
### Removed ### Removed

View file

@ -14,7 +14,7 @@ makedocs(;
modules=[SequenceVariation], modules=[SequenceVariation],
pages=[ pages=[
"Home" => "index.md", "Home" => "index.md",
"Working with variants" => "variants.md", "Working with haplotypes" => "haplotypes.md",
"Working with variations" => "variations.md", "Working with variations" => "variations.md",
"Comparing variations" => "compare.md", "Comparing variations" => "compare.md",
"API Reference" => "api.md", "API Reference" => "api.md",

View file

@ -18,8 +18,8 @@ Insertion
## Variants ## Variants
```@docs ```@docs
Variant Haplotype
reference(::Variant) reference(::Haplotype)
variations variations
reconstruct! reconstruct!
``` ```
@ -49,7 +49,7 @@ _lendiff
```@docs ```@docs
_edits _edits
_is_valid(::Variant) _is_valid(::Haplotype)
``` ```
### Variations ### Variations

View file

@ -4,9 +4,9 @@ CurrentModule = SequenceVariation
# Comparing variations in sequences # Comparing variations in sequences
## Checking for variations in a known variant ## Checking for variations in a known haplotype
Looking for a known [`Variation`](@ref) within a [`Variant`](@ref) is Looking for a known [`Variation`](@ref) within a [`Haplotype`](@ref) is
efficiently accomplished using the `in` operator. efficiently accomplished using the `in` operator.
```@setup call_variants ```@setup call_variants
@ -21,27 +21,27 @@ bos_ovis_alignment =
bos_human_alignment = bos_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine); PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
bos_ovis_variant = Variant(bos_ovis_alignment) bos_ovis_haplotype = Haplotype(bos_ovis_alignment)
bos_human_variant = Variant(bos_human_alignment) bos_human_haplotype = Haplotype(bos_human_alignment)
``` ```
```@example call_variants ```@example call_variants
println("\tOvis aires\tHomo sapiens") println("\tOvis aires\tHomo sapiens")
for v in vcat(variations(bos_ovis_variant), variations(bos_human_variant)) for v in vcat(variations(bos_ovis_haplotype), variations(bos_human_haplotype))
is_sheep = v in bos_ovis_variant is_sheep = v in bos_ovis_haplotype
is_human = v in bos_human_variant is_human = v in bos_human_haplotype
println("$v\t$is_sheep\t\t$is_human") println("$v\t$is_sheep\t\t$is_human")
end end
``` ```
## Constructing new variants based on other variations ## Constructing new haplotypes based on other variations
New variants can be constructed using variations. This might be useful to pool New haplotypes can be constructed using variations. This might be useful to pool
variations found on different reads or to filter variations from a variant variations found on different reads or to filter variations from a haplotype
that aren't validated by another variant. that aren't validated by another haplotype.
```@repl call_variants ```@repl call_variants
sheeple = vcat(variations(bos_ovis_variant), variations(bos_human_variant)); sheeple = vcat(variations(bos_ovis_haplotype), variations(bos_human_haplotype));
Variant(bovine, sheeple) Haplotype(bovine, sheeple)
reconstruct!(bovine, ans) reconstruct!(bovine, ans)
``` ```

View file

@ -2,13 +2,14 @@
CurrentModule = SequenceVariation CurrentModule = SequenceVariation
``` ```
# Working with variants # Working with haplotypes
## Calling variants ## Calling variants
The first step in working with sequence variation is to identify (call) The first step in working with sequence variation is to identify (call)
variations. SequenceVariation can directly call variants using the variations between two sequences. SequenceVariation can directly call variants
`Variant(::PairwiseAlignment)` constructor of the [`Variant`](@ref) type. using the `Haplotype(::PairwiseAlignment)` constructor of the
[`Haplotype`](@ref) type.
```@repl call_variants ```@repl call_variants
using SequenceVariation, BioAlignments, BioSequences using SequenceVariation, BioAlignments, BioSequences
@ -22,19 +23,19 @@ bos_ovis_alignment =
bos_human_alignment = bos_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine); PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
bos_ovis_variant = Variant(bos_ovis_alignment) bos_ovis_haplotype = Haplotype(bos_ovis_alignment)
bos_human_variant = Variant(bos_human_alignment) bos_human_haplotype = Haplotype(bos_human_alignment)
``` ```
## Sequence reconstruction ## Sequence reconstruction
If the alternate sequence of a variant is no longer available (as is often the If the alternate sequence of a haplotype is no longer available (as is often the
case when calling variants from alignment files), then the sequence can be case when calling variants from alignment files), then the sequence can be
retrieved using the [`reconstruct!`](@ref) function. retrieved using the [`reconstruct!`](@ref) function.
```@repl call_variants ```@repl call_variants
human2 = copy(bovine); human2 = copy(bovine);
reconstruct!(human2, bos_human_variant) reconstruct!(human2, bos_human_haplotype)
human2 == bovine human2 == bovine
human2 == human human2 == human
``` ```

View file

@ -27,7 +27,7 @@ mutation(Variation(bovine_ins, "25ACA"))
## Extraction ## Extraction
Sequence variations may also be extracted wholesale from a [`Variant`](@ref) Sequence variations may also be extracted wholesale from a [`Haplotype`](@ref)
using the [`variations`](@ref) function. using the [`variations`](@ref) function.
```@setup call_variants ```@setup call_variants
@ -42,13 +42,13 @@ bos_ovis_alignment =
bos_human_alignment = bos_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine); PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
bos_ovis_variant = Variant(bos_ovis_alignment) bos_ovis_haplotype = Haplotype(bos_ovis_alignment)
bos_human_variant = Variant(bos_human_alignment) bos_human_haplotype = Haplotype(bos_human_alignment)
``` ```
```@repl call_variants ```@repl call_variants
variations(bos_ovis_variant) variations(bos_ovis_haplotype)
variations(bos_human_variant) variations(bos_human_haplotype)
``` ```
## Reference switching ## Reference switching
@ -59,8 +59,8 @@ alignment between the new and old references using the [`translate`](@ref).
```@repl call_variants ```@repl call_variants
ovis_human_alignment = ovis_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), ovine) PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), ovine)
human_variation = first(variations(bos_ovis_variant)) human_variation = first(variations(bos_ovis_haplotype))
reference(ans) == bovine reference(ans) == bovine
SequenceVariation.translate(human_variation, ovis_human_alignment) SequenceVariation.translate(human_variation, ovis_human_haplotype)
reference(ans) == bovine reference(ans) == bovine
``` ```

View file

@ -1,41 +1,43 @@
""" """
Variant{S<:BioSequence,T<:BioSymbol} Haplotype{S<:BioSequence,T<:BioSymbol}
A set of variations within a given sequence that are all found together. Depending on the A set of variations within a given sequence that are all found together. Depending on the
field, it might also be referred to as a "genotype," "haplotype," or "strain." field, it might also be referred to as a "genotype" or "strain."
# Constructors # Constructors
Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol} Haplotype(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol} Haplotype(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
Variant( Haplotype(
aln::PairwiseAlignment{T,T} aln::PairwiseAlignment{T,T}
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}} ) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
When constructing a `Variant` from a vector of [`Edit`](@ref)s or [`Variation`](@ref)s, the When constructing a `Haplotype` from a vector of [`Edit`](@ref)s or [`Variation`](@ref)s,
edits are applied sequentially from first to last position, therefore the vector must always the edits are applied sequentially from first to last position, therefore the vector must
be sorted by position. These edits are sorted automatically if constructing from an always be sorted by position. These edits are sorted automatically if constructing from an
alignment. alignment.
""" """
struct Variant{S<:BioSequence,T<:BioSymbol} struct Haplotype{S<:BioSequence,T<:BioSymbol}
ref::S ref::S
edits::Vector{Edit{S,T}} edits::Vector{Edit{S,T}}
Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits) Haplotype{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits)
end end
function Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol} function Haplotype{S,T}(
ref::S, edits::Vector{Edit{S,T}}
) where {S<:BioSequence,T<:BioSymbol}
sort!(edits; by=x -> x.pos) sort!(edits; by=x -> x.pos)
result = Variant{S,T}(ref, edits, Unsafe()) result = Haplotype{S,T}(ref, edits, Unsafe())
_is_valid(result) || error("TODO") # report what kind of error message? _is_valid(result) || error("TODO") # report what kind of error message?
return result return result
end end
function Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol} function Haplotype(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
return Variant{S,T}(ref, edits) return Haplotype{S,T}(ref, edits)
end end
function Base.show(io::IO, x::Variant) function Base.show(io::IO, x::Haplotype)
n = length(x.edits) n = length(x.edits)
print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):") print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):")
for i in x.edits for i in x.edits
@ -46,21 +48,21 @@ function Base.show(io::IO, x::Variant)
end end
""" """
is_valid(v::Variant) is_valid(h::Haplotype)
Validate `v`. `v` is invalid if any of its operations are out of bounds, or the same Validate `h`. `h` is invalid if any of its operations are out of bounds, or the same
position is affected by multiple edits. position is affected by multiple edits.
""" """
function _is_valid(v::Variant) function _is_valid(h::Haplotype)
isempty(v.ref) && return false isempty(h.ref) && return false
valid_positions = 1:length(v.ref) valid_positions = 1:length(h.ref)
last_was_insert = false last_was_insert = false
for edit in v.edits for edit in h.edits
pos = edit.pos pos = edit.pos
op = edit.x op = edit.x
# Sanity check: for this to be a valid variant, it must be comprised of valid # Sanity check: for this to be a valid variant, it must be comprised of valid
# variations # variations
_is_valid(Variation(v.ref, edit)) || return false _is_valid(Variation(h.ref, edit)) || return false
# For substitutions we simply do not allow another modification of the same base # For substitutions we simply do not allow another modification of the same base
if op isa Substitution if op isa Substitution
@ -87,7 +89,7 @@ function _is_valid(v::Variant)
return true return true
end end
function Variant( function Haplotype(
aln::PairwiseAlignment{T,T} aln::PairwiseAlignment{T,T}
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}} ) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
ref = aln.b ref = aln.b
@ -141,30 +143,30 @@ function Variant(
end end
end end
return Variant(ref, edits) return Haplotype(ref, edits)
end end
""" """
_edits(v::Variant) _edits(h::Haplotype)
Gets the [`Edit`](@ref)s that comprise `v` Gets the [`Edit`](@ref)s that comprise `h`
""" """
_edits(v::Variant) = v.edits _edits(h::Haplotype) = h.edits
""" """
reference(v::Variant) reference(h::Haplotype)
Gets the reference sequence of `v`. Gets the reference sequence of `h`.
""" """
reference(v::Variant) = v.ref reference(h::Haplotype) = h.ref
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits Base.:(==)(x::Haplotype, y::Haplotype) = x.ref == y.ref && x.edits == y.edits
""" """
reconstruct!(seq::S, x::Variant{S}) where {S} reconstruct!(seq::S, x::Haplotype{S}) where {S}
Apply the edits in `x` to `seq` and return the mutated sequence Apply the edits in `x` to `seq` and return the mutated sequence
""" """
function reconstruct!(seq::S, x::Variant{S}) where {S} function reconstruct!(seq::S, x::Haplotype{S}) where {S}
len = length(x.ref) + sum(edit -> _lendiff(edit), _edits(x)) len = length(x.ref) + sum(edit -> _lendiff(edit), _edits(x))
resize!(seq, len % UInt) resize!(seq, len % UInt)
refpos = seqpos = 1 refpos = seqpos = 1

View file

@ -2,10 +2,10 @@ module SequenceVariation
""" """
Needs to be able to: Needs to be able to:
* Given a sequence and a reference, create a `Variant` that unambiguously represents * Given a sequence and a reference, create a `Haplotype` that unambiguously represents
the sequence the sequence
* Given a `Variant` and a new reference, translate the variant to the new reference. * Given a `Haplotype` and a new reference, translate the variant to the new reference.
* Given a mutation and a reference and a sequence, determine if the sequence has that * Given a mutation and a reference and a sequence, determine if the sequence has that
mutation mutation
@ -26,9 +26,9 @@ using BioSequences: BioSequences, BioSequence, NucleotideSeq, LongSequence, isga
using BioSymbols: BioSymbol using BioSymbols: BioSymbol
export Deletion export Deletion
export Haplotype
export Insertion export Insertion
export Substitution export Substitution
export Variant
export Variation export Variation
export altbases export altbases
export mutation export mutation
@ -45,7 +45,7 @@ struct Unsafe end
struct Inapplicable end struct Inapplicable end
include("Edit.jl") include("Edit.jl")
include("Variant.jl") include("Haplotype.jl")
include("Variation.jl") include("Variation.jl")
end # module end # module

View file

@ -12,7 +12,7 @@ and built-in validation.
Variation(ref::S, edit::AbstractString) where {S<:BioSequence} Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
Generally speaking, the `Edit` constructor should be avoided to ensure corectness: use of Generally speaking, the `Edit` constructor should be avoided to ensure corectness: use of
[`variations(::Variant)`](@ref) is encouraged, instead. [`variations(::Haplotype)`](@ref) is encouraged, instead.
Constructing a `Variation` from an `AbstractString` will parse the from `edit` using the Constructing a `Variation` from an `AbstractString` will parse the from `edit` using the
following syntax: following syntax:
@ -45,9 +45,9 @@ function Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
return Variation{S,T}(ref, e) return Variation{S,T}(ref, e)
end end
function Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol} function Haplotype(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
edits = _edit.(vars) edits = _edit.(vars)
return Variant{S,T}(ref, edits) return Haplotype{S,T}(ref, edits)
end end
""" """
@ -108,7 +108,7 @@ function Base.show(io::IO, x::Variation)
end end
end end
function Base.in(v::Variation, var::Variant) function Base.in(v::Variation, var::Haplotype)
if v.ref != var.ref if v.ref != var.ref
error("References must be equal") error("References must be equal")
end end
@ -171,14 +171,14 @@ function translate(var::Variation{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}
end end
""" """
variations(v::Variant{S,T}) where {S,T} variations(h::Haplotype{S,T}) where {S,T}
Converts the [`Edit`](@ref)s of `v` into a vector of [`Variation`](@ref)s. Converts the [`Edit`](@ref)s of `h` into a vector of [`Variation`](@ref)s.
""" """
function variations(v::Variant{S,T}) where {S,T} function variations(h::Haplotype{S,T}) where {S,T}
vs = Vector{Variation{S,T}}(undef, length(_edits(v))) vs = Vector{Variation{S,T}}(undef, length(_edits(h)))
for (i, e) in enumerate(_edits(v)) for (i, e) in enumerate(_edits(h))
vs[i] = Variation{S,T}(reference(v), e) vs[i] = Variation{S,T}(reference(h), e)
end end
return vs return vs
end end

View file

@ -1,9 +1,9 @@
""" """
Needs to be able to: Needs to be able to:
* Given a sequence and a reference, create a `Variant` that unambiguously represents * Given a sequence and a reference, create a `Haplotype` that unambiguously represents
the sequence the sequence
* Given a `Variant` and a new reference, translate the variant to the new reference. * Given a `Haplotype` and a new reference, translate the variant to the new reference.
* Given a mutation and a reference and a sequence, determine if the sequence has that * Given a mutation and a reference and a sequence, determine if the sequence has that
mutation mutation
@ -34,12 +34,12 @@ const DNA_MODEL = BioAlignments.AffineGapScoreModel(EDNAFULL; gap_open=-25, gap_
align(a::BioSequence, b::BioSequence) = pairalign(GlobalAlignment(), a, b, DNA_MODEL).aln align(a::BioSequence, b::BioSequence) = pairalign(GlobalAlignment(), a, b, DNA_MODEL).aln
seq1 = ungap!(dna"--ATGCGTGTTAGCAAC--TTATCGCG") seq1 = ungap!(dna"--ATGCGTGTTAGCAAC--TTATCGCG")
seq2 = ungap!(dna"TGATGCGTGT-AGCAACACTTATAGCG") seq2 = ungap!(dna"TGATGCGTGT-AGCAACACTTATAGCG")
var = Variant(align(seq1, seq2)) var = Haplotype(align(seq1, seq2))
@testset "VariantRoundtrip" begin @testset "HaplotypeRoundtrip" begin
for v in variations(var) for v in variations(var)
@test v in var @test v in var
@test v in Variant(seq2, [v]) @test v in Haplotype(seq2, [v])
end end
end end
@ -51,7 +51,7 @@ end
read02 = AlignedSequence(mutseq[3:12], Alignment("10M", 1, 3)) read02 = AlignedSequence(mutseq[3:12], Alignment("10M", 1, 3))
aln01 = PairwiseAlignment(read01, refseq) aln01 = PairwiseAlignment(read01, refseq)
aln02 = PairwiseAlignment(read02, refseq) aln02 = PairwiseAlignment(read02, refseq)
@test Variant(aln01).edits == Variant(aln02).edits @test Haplotype(aln01).edits == Haplotype(aln02).edits
end end
@testset "VariationParsing" begin @testset "VariationParsing" begin
@ -72,7 +72,7 @@ end
read = AlignedSequence(mutseq[1:10], Alignment("10M", 1, 1)) read = AlignedSequence(mutseq[1:10], Alignment("10M", 1, 1))
aln = PairwiseAlignment(read, refseq) aln = PairwiseAlignment(read, refseq)
var = Variant(aln) var = Haplotype(aln)
sub = Variation(refseq, "A4T") sub = Variation(refseq, "A4T")
@test first(variations(var)) == sub @test first(variations(var)) == sub
@ -108,31 +108,31 @@ end
@test altbases(Variation(dna"ATCGA", "1C")) == dna"CA" @test altbases(Variation(dna"ATCGA", "1C")) == dna"CA"
end end
@testset "SoftclipVariant" begin @testset "SoftclipHaplotype" begin
refseq = dna"GATTACA" refseq = dna"GATTACA"
mutseq = dna"GATTACAAAA" mutseq = dna"GATTACAAAA"
refvar = Variant(refseq, SequenceVariation.Edit{typeof(refseq),eltype(refseq)}[]) refvar = Haplotype(refseq, SequenceVariation.Edit{typeof(refseq),eltype(refseq)}[])
# Test for ending soft clip # Test for ending soft clip
@test Variant( @test Haplotype(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S", 1, 1)), refseq) PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S", 1, 1)), refseq)
) == refvar ) == refvar
# Test for ending soft+hard clip # Test for ending soft+hard clip
@test Variant( @test Haplotype(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S2H", 1, 1)), refseq) PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S2H", 1, 1)), refseq)
) == refvar ) == refvar
# Test that ending insertions are still valid # Test that ending insertions are still valid
@test length( @test length(
Variant( Haplotype(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3I", 1, 1)), refseq) PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3I", 1, 1)), refseq)
).edits, ).edits,
) == 1 ) == 1
# Test that out-of-bounds bases are still caught # Test that out-of-bounds bases are still caught
@test_throws BoundsError Variant( @test_throws BoundsError Haplotype(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3X", 1, 1)), refseq) PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3X", 1, 1)), refseq)
) )
end end