mirror of
https://github.com/MillironX/SequenceVariation.jl.git
synced 2024-11-24 06:19:54 +00:00
Compare commits
No commits in common. "8a71715cd9fa3826da930f0caa965c109810232f" and "3ef4a92ba268faa509547bb9774d070aba38cdd5" have entirely different histories.
8a71715cd9
...
3ef4a92ba2
10 changed files with 91 additions and 95 deletions
|
@ -15,7 +15,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
|
||||
- Code now follows [Blue style](https://github.com/invenia/BlueStyle) ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28))
|
||||
- :bomb: [BREAKING] Public and private API defined based on Blue style guidelines ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28))
|
||||
- :bomb: [BREAKING] Renamed type `Variant` to `Haplotype` ([#20](https://github.com/BioJulia/SequenceVariation.jl/issues/20)/[#29](https://github.com/BioJulia/SequenceVariation.jl/pull/29))
|
||||
|
||||
### Removed
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@ makedocs(;
|
|||
modules=[SequenceVariation],
|
||||
pages=[
|
||||
"Home" => "index.md",
|
||||
"Working with haplotypes" => "haplotypes.md",
|
||||
"Working with variants" => "variants.md",
|
||||
"Working with variations" => "variations.md",
|
||||
"Comparing variations" => "compare.md",
|
||||
"API Reference" => "api.md",
|
||||
|
|
|
@ -18,8 +18,8 @@ Insertion
|
|||
## Variants
|
||||
|
||||
```@docs
|
||||
Haplotype
|
||||
reference(::Haplotype)
|
||||
Variant
|
||||
reference(::Variant)
|
||||
variations
|
||||
reconstruct!
|
||||
```
|
||||
|
@ -49,7 +49,7 @@ _lendiff
|
|||
|
||||
```@docs
|
||||
_edits
|
||||
_is_valid(::Haplotype)
|
||||
_is_valid(::Variant)
|
||||
```
|
||||
|
||||
### Variations
|
||||
|
|
|
@ -4,9 +4,9 @@ CurrentModule = SequenceVariation
|
|||
|
||||
# Comparing variations in sequences
|
||||
|
||||
## Checking for variations in a known haplotype
|
||||
## Checking for variations in a known variant
|
||||
|
||||
Looking for a known [`Variation`](@ref) within a [`Haplotype`](@ref) is
|
||||
Looking for a known [`Variation`](@ref) within a [`Variant`](@ref) is
|
||||
efficiently accomplished using the `in` operator.
|
||||
|
||||
```@setup call_variants
|
||||
|
@ -21,27 +21,27 @@ bos_ovis_alignment =
|
|||
bos_human_alignment =
|
||||
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
|
||||
|
||||
bos_ovis_haplotype = Haplotype(bos_ovis_alignment)
|
||||
bos_human_haplotype = Haplotype(bos_human_alignment)
|
||||
bos_ovis_variant = Variant(bos_ovis_alignment)
|
||||
bos_human_variant = Variant(bos_human_alignment)
|
||||
```
|
||||
|
||||
```@example call_variants
|
||||
println("\tOvis aires\tHomo sapiens")
|
||||
for v in vcat(variations(bos_ovis_haplotype), variations(bos_human_haplotype))
|
||||
is_sheep = v in bos_ovis_haplotype
|
||||
is_human = v in bos_human_haplotype
|
||||
for v in vcat(variations(bos_ovis_variant), variations(bos_human_variant))
|
||||
is_sheep = v in bos_ovis_variant
|
||||
is_human = v in bos_human_variant
|
||||
println("$v\t$is_sheep\t\t$is_human")
|
||||
end
|
||||
```
|
||||
|
||||
## Constructing new haplotypes based on other variations
|
||||
## Constructing new variants based on other variations
|
||||
|
||||
New haplotypes can be constructed using variations. This might be useful to pool
|
||||
variations found on different reads or to filter variations from a haplotype
|
||||
that aren't validated by another haplotype.
|
||||
New variants can be constructed using variations. This might be useful to pool
|
||||
variations found on different reads or to filter variations from a variant
|
||||
that aren't validated by another variant.
|
||||
|
||||
```@repl call_variants
|
||||
sheeple = vcat(variations(bos_ovis_haplotype), variations(bos_human_haplotype));
|
||||
Haplotype(bovine, sheeple)
|
||||
sheeple = vcat(variations(bos_ovis_variant), variations(bos_human_variant));
|
||||
Variant(bovine, sheeple)
|
||||
reconstruct!(bovine, ans)
|
||||
```
|
||||
|
|
|
@ -2,14 +2,13 @@
|
|||
CurrentModule = SequenceVariation
|
||||
```
|
||||
|
||||
# Working with haplotypes
|
||||
# Working with variants
|
||||
|
||||
## Calling variants
|
||||
|
||||
The first step in working with sequence variation is to identify (call)
|
||||
variations between two sequences. SequenceVariation can directly call variants
|
||||
using the `Haplotype(::PairwiseAlignment)` constructor of the
|
||||
[`Haplotype`](@ref) type.
|
||||
variations. SequenceVariation can directly call variants using the
|
||||
`Variant(::PairwiseAlignment)` constructor of the [`Variant`](@ref) type.
|
||||
|
||||
```@repl call_variants
|
||||
using SequenceVariation, BioAlignments, BioSequences
|
||||
|
@ -23,19 +22,19 @@ bos_ovis_alignment =
|
|||
bos_human_alignment =
|
||||
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
|
||||
|
||||
bos_ovis_haplotype = Haplotype(bos_ovis_alignment)
|
||||
bos_human_haplotype = Haplotype(bos_human_alignment)
|
||||
bos_ovis_variant = Variant(bos_ovis_alignment)
|
||||
bos_human_variant = Variant(bos_human_alignment)
|
||||
```
|
||||
|
||||
## Sequence reconstruction
|
||||
|
||||
If the alternate sequence of a haplotype is no longer available (as is often the
|
||||
If the alternate sequence of a variant is no longer available (as is often the
|
||||
case when calling variants from alignment files), then the sequence can be
|
||||
retrieved using the [`reconstruct!`](@ref) function.
|
||||
|
||||
```@repl call_variants
|
||||
human2 = copy(bovine);
|
||||
reconstruct!(human2, bos_human_haplotype)
|
||||
reconstruct!(human2, bos_human_variant)
|
||||
human2 == bovine
|
||||
human2 == human
|
||||
```
|
|
@ -27,7 +27,7 @@ mutation(Variation(bovine_ins, "25ACA"))
|
|||
|
||||
## Extraction
|
||||
|
||||
Sequence variations may also be extracted wholesale from a [`Haplotype`](@ref)
|
||||
Sequence variations may also be extracted wholesale from a [`Variant`](@ref)
|
||||
using the [`variations`](@ref) function.
|
||||
|
||||
```@setup call_variants
|
||||
|
@ -42,13 +42,13 @@ bos_ovis_alignment =
|
|||
bos_human_alignment =
|
||||
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
|
||||
|
||||
bos_ovis_haplotype = Haplotype(bos_ovis_alignment)
|
||||
bos_human_haplotype = Haplotype(bos_human_alignment)
|
||||
bos_ovis_variant = Variant(bos_ovis_alignment)
|
||||
bos_human_variant = Variant(bos_human_alignment)
|
||||
```
|
||||
|
||||
```@repl call_variants
|
||||
variations(bos_ovis_haplotype)
|
||||
variations(bos_human_haplotype)
|
||||
variations(bos_ovis_variant)
|
||||
variations(bos_human_variant)
|
||||
```
|
||||
|
||||
## Reference switching
|
||||
|
@ -59,8 +59,8 @@ alignment between the new and old references using the [`translate`](@ref).
|
|||
```@repl call_variants
|
||||
ovis_human_alignment =
|
||||
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), ovine)
|
||||
human_variation = first(variations(bos_ovis_haplotype))
|
||||
human_variation = first(variations(bos_ovis_variant))
|
||||
reference(ans) == bovine
|
||||
SequenceVariation.translate(human_variation, ovis_human_haplotype)
|
||||
SequenceVariation.translate(human_variation, ovis_human_alignment)
|
||||
reference(ans) == bovine
|
||||
```
|
||||
|
|
|
@ -2,10 +2,10 @@ module SequenceVariation
|
|||
|
||||
"""
|
||||
Needs to be able to:
|
||||
* Given a sequence and a reference, create a `Haplotype` that unambiguously represents
|
||||
* Given a sequence and a reference, create a `Variant` that unambiguously represents
|
||||
the sequence
|
||||
|
||||
* Given a `Haplotype` and a new reference, translate the variant to the new reference.
|
||||
* Given a `Variant` and a new reference, translate the variant to the new reference.
|
||||
|
||||
* Given a mutation and a reference and a sequence, determine if the sequence has that
|
||||
mutation
|
||||
|
@ -26,9 +26,9 @@ using BioSequences: BioSequences, BioSequence, NucleotideSeq, LongSequence, isga
|
|||
using BioSymbols: BioSymbol
|
||||
|
||||
export Deletion
|
||||
export Haplotype
|
||||
export Insertion
|
||||
export Substitution
|
||||
export Variant
|
||||
export Variation
|
||||
export altbases
|
||||
export mutation
|
||||
|
@ -45,7 +45,7 @@ struct Unsafe end
|
|||
struct Inapplicable end
|
||||
|
||||
include("Edit.jl")
|
||||
include("Haplotype.jl")
|
||||
include("Variant.jl")
|
||||
include("Variation.jl")
|
||||
|
||||
end # module
|
||||
|
|
|
@ -1,43 +1,41 @@
|
|||
"""
|
||||
Haplotype{S<:BioSequence,T<:BioSymbol}
|
||||
Variant{S<:BioSequence,T<:BioSymbol}
|
||||
|
||||
A set of variations within a given sequence that are all found together. Depending on the
|
||||
field, it might also be referred to as a "genotype" or "strain."
|
||||
field, it might also be referred to as a "genotype," "haplotype," or "strain."
|
||||
|
||||
# Constructors
|
||||
|
||||
Haplotype(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
Haplotype(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
Haplotype(
|
||||
Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
Variant(
|
||||
aln::PairwiseAlignment{T,T}
|
||||
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
|
||||
|
||||
When constructing a `Haplotype` from a vector of [`Edit`](@ref)s or [`Variation`](@ref)s,
|
||||
the edits are applied sequentially from first to last position, therefore the vector must
|
||||
always be sorted by position. These edits are sorted automatically if constructing from an
|
||||
When constructing a `Variant` from a vector of [`Edit`](@ref)s or [`Variation`](@ref)s, the
|
||||
edits are applied sequentially from first to last position, therefore the vector must always
|
||||
be sorted by position. These edits are sorted automatically if constructing from an
|
||||
alignment.
|
||||
"""
|
||||
struct Haplotype{S<:BioSequence,T<:BioSymbol}
|
||||
struct Variant{S<:BioSequence,T<:BioSymbol}
|
||||
ref::S
|
||||
edits::Vector{Edit{S,T}}
|
||||
|
||||
Haplotype{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits)
|
||||
Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits)
|
||||
end
|
||||
|
||||
function Haplotype{S,T}(
|
||||
ref::S, edits::Vector{Edit{S,T}}
|
||||
) where {S<:BioSequence,T<:BioSymbol}
|
||||
function Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
sort!(edits; by=x -> x.pos)
|
||||
result = Haplotype{S,T}(ref, edits, Unsafe())
|
||||
result = Variant{S,T}(ref, edits, Unsafe())
|
||||
_is_valid(result) || error("TODO") # report what kind of error message?
|
||||
return result
|
||||
end
|
||||
|
||||
function Haplotype(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
return Haplotype{S,T}(ref, edits)
|
||||
function Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
return Variant{S,T}(ref, edits)
|
||||
end
|
||||
|
||||
function Base.show(io::IO, x::Haplotype)
|
||||
function Base.show(io::IO, x::Variant)
|
||||
n = length(x.edits)
|
||||
print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):")
|
||||
for i in x.edits
|
||||
|
@ -48,21 +46,21 @@ function Base.show(io::IO, x::Haplotype)
|
|||
end
|
||||
|
||||
"""
|
||||
is_valid(h::Haplotype)
|
||||
is_valid(v::Variant)
|
||||
|
||||
Validate `h`. `h` is invalid if any of its operations are out of bounds, or the same
|
||||
Validate `v`. `v` is invalid if any of its operations are out of bounds, or the same
|
||||
position is affected by multiple edits.
|
||||
"""
|
||||
function _is_valid(h::Haplotype)
|
||||
isempty(h.ref) && return false
|
||||
valid_positions = 1:length(h.ref)
|
||||
function _is_valid(v::Variant)
|
||||
isempty(v.ref) && return false
|
||||
valid_positions = 1:length(v.ref)
|
||||
last_was_insert = false
|
||||
for edit in h.edits
|
||||
for edit in v.edits
|
||||
pos = edit.pos
|
||||
op = edit.x
|
||||
# Sanity check: for this to be a valid variant, it must be comprised of valid
|
||||
# variations
|
||||
_is_valid(Variation(h.ref, edit)) || return false
|
||||
_is_valid(Variation(v.ref, edit)) || return false
|
||||
|
||||
# For substitutions we simply do not allow another modification of the same base
|
||||
if op isa Substitution
|
||||
|
@ -89,7 +87,7 @@ function _is_valid(h::Haplotype)
|
|||
return true
|
||||
end
|
||||
|
||||
function Haplotype(
|
||||
function Variant(
|
||||
aln::PairwiseAlignment{T,T}
|
||||
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
|
||||
ref = aln.b
|
||||
|
@ -143,30 +141,30 @@ function Haplotype(
|
|||
end
|
||||
end
|
||||
|
||||
return Haplotype(ref, edits)
|
||||
return Variant(ref, edits)
|
||||
end
|
||||
|
||||
"""
|
||||
_edits(h::Haplotype)
|
||||
_edits(v::Variant)
|
||||
|
||||
Gets the [`Edit`](@ref)s that comprise `h`
|
||||
Gets the [`Edit`](@ref)s that comprise `v`
|
||||
"""
|
||||
_edits(h::Haplotype) = h.edits
|
||||
_edits(v::Variant) = v.edits
|
||||
|
||||
"""
|
||||
reference(h::Haplotype)
|
||||
reference(v::Variant)
|
||||
|
||||
Gets the reference sequence of `h`.
|
||||
Gets the reference sequence of `v`.
|
||||
"""
|
||||
reference(h::Haplotype) = h.ref
|
||||
Base.:(==)(x::Haplotype, y::Haplotype) = x.ref == y.ref && x.edits == y.edits
|
||||
reference(v::Variant) = v.ref
|
||||
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits
|
||||
|
||||
"""
|
||||
reconstruct!(seq::S, x::Haplotype{S}) where {S}
|
||||
reconstruct!(seq::S, x::Variant{S}) where {S}
|
||||
|
||||
Apply the edits in `x` to `seq` and return the mutated sequence
|
||||
"""
|
||||
function reconstruct!(seq::S, x::Haplotype{S}) where {S}
|
||||
function reconstruct!(seq::S, x::Variant{S}) where {S}
|
||||
len = length(x.ref) + sum(edit -> _lendiff(edit), _edits(x))
|
||||
resize!(seq, len % UInt)
|
||||
refpos = seqpos = 1
|
|
@ -12,7 +12,7 @@ and built-in validation.
|
|||
Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
|
||||
|
||||
Generally speaking, the `Edit` constructor should be avoided to ensure corectness: use of
|
||||
[`variations(::Haplotype)`](@ref) is encouraged, instead.
|
||||
[`variations(::Variant)`](@ref) is encouraged, instead.
|
||||
|
||||
Constructing a `Variation` from an `AbstractString` will parse the from `edit` using the
|
||||
following syntax:
|
||||
|
@ -45,9 +45,9 @@ function Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
|
|||
return Variation{S,T}(ref, e)
|
||||
end
|
||||
|
||||
function Haplotype(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
function Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
edits = _edit.(vars)
|
||||
return Haplotype{S,T}(ref, edits)
|
||||
return Variant{S,T}(ref, edits)
|
||||
end
|
||||
|
||||
"""
|
||||
|
@ -108,7 +108,7 @@ function Base.show(io::IO, x::Variation)
|
|||
end
|
||||
end
|
||||
|
||||
function Base.in(v::Variation, var::Haplotype)
|
||||
function Base.in(v::Variation, var::Variant)
|
||||
if v.ref != var.ref
|
||||
error("References must be equal")
|
||||
end
|
||||
|
@ -171,14 +171,14 @@ function translate(var::Variation{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}
|
|||
end
|
||||
|
||||
"""
|
||||
variations(h::Haplotype{S,T}) where {S,T}
|
||||
variations(v::Variant{S,T}) where {S,T}
|
||||
|
||||
Converts the [`Edit`](@ref)s of `h` into a vector of [`Variation`](@ref)s.
|
||||
Converts the [`Edit`](@ref)s of `v` into a vector of [`Variation`](@ref)s.
|
||||
"""
|
||||
function variations(h::Haplotype{S,T}) where {S,T}
|
||||
vs = Vector{Variation{S,T}}(undef, length(_edits(h)))
|
||||
for (i, e) in enumerate(_edits(h))
|
||||
vs[i] = Variation{S,T}(reference(h), e)
|
||||
function variations(v::Variant{S,T}) where {S,T}
|
||||
vs = Vector{Variation{S,T}}(undef, length(_edits(v)))
|
||||
for (i, e) in enumerate(_edits(v))
|
||||
vs[i] = Variation{S,T}(reference(v), e)
|
||||
end
|
||||
return vs
|
||||
end
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
"""
|
||||
Needs to be able to:
|
||||
* Given a sequence and a reference, create a `Haplotype` that unambiguously represents
|
||||
* Given a sequence and a reference, create a `Variant` that unambiguously represents
|
||||
the sequence
|
||||
|
||||
* Given a `Haplotype` and a new reference, translate the variant to the new reference.
|
||||
* Given a `Variant` and a new reference, translate the variant to the new reference.
|
||||
|
||||
* Given a mutation and a reference and a sequence, determine if the sequence has that
|
||||
mutation
|
||||
|
@ -34,12 +34,12 @@ const DNA_MODEL = BioAlignments.AffineGapScoreModel(EDNAFULL; gap_open=-25, gap_
|
|||
align(a::BioSequence, b::BioSequence) = pairalign(GlobalAlignment(), a, b, DNA_MODEL).aln
|
||||
seq1 = ungap!(dna"--ATGCGTGTTAGCAAC--TTATCGCG")
|
||||
seq2 = ungap!(dna"TGATGCGTGT-AGCAACACTTATAGCG")
|
||||
var = Haplotype(align(seq1, seq2))
|
||||
var = Variant(align(seq1, seq2))
|
||||
|
||||
@testset "HaplotypeRoundtrip" begin
|
||||
@testset "VariantRoundtrip" begin
|
||||
for v in variations(var)
|
||||
@test v in var
|
||||
@test v in Haplotype(seq2, [v])
|
||||
@test v in Variant(seq2, [v])
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -51,7 +51,7 @@ end
|
|||
read02 = AlignedSequence(mutseq[3:12], Alignment("10M", 1, 3))
|
||||
aln01 = PairwiseAlignment(read01, refseq)
|
||||
aln02 = PairwiseAlignment(read02, refseq)
|
||||
@test Haplotype(aln01).edits == Haplotype(aln02).edits
|
||||
@test Variant(aln01).edits == Variant(aln02).edits
|
||||
end
|
||||
|
||||
@testset "VariationParsing" begin
|
||||
|
@ -72,7 +72,7 @@ end
|
|||
|
||||
read = AlignedSequence(mutseq[1:10], Alignment("10M", 1, 1))
|
||||
aln = PairwiseAlignment(read, refseq)
|
||||
var = Haplotype(aln)
|
||||
var = Variant(aln)
|
||||
|
||||
sub = Variation(refseq, "A4T")
|
||||
@test first(variations(var)) == sub
|
||||
|
@ -108,31 +108,31 @@ end
|
|||
@test altbases(Variation(dna"ATCGA", "1C")) == dna"CA"
|
||||
end
|
||||
|
||||
@testset "SoftclipHaplotype" begin
|
||||
@testset "SoftclipVariant" begin
|
||||
refseq = dna"GATTACA"
|
||||
mutseq = dna"GATTACAAAA"
|
||||
|
||||
refvar = Haplotype(refseq, SequenceVariation.Edit{typeof(refseq),eltype(refseq)}[])
|
||||
refvar = Variant(refseq, SequenceVariation.Edit{typeof(refseq),eltype(refseq)}[])
|
||||
|
||||
# Test for ending soft clip
|
||||
@test Haplotype(
|
||||
@test Variant(
|
||||
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S", 1, 1)), refseq)
|
||||
) == refvar
|
||||
|
||||
# Test for ending soft+hard clip
|
||||
@test Haplotype(
|
||||
@test Variant(
|
||||
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S2H", 1, 1)), refseq)
|
||||
) == refvar
|
||||
|
||||
# Test that ending insertions are still valid
|
||||
@test length(
|
||||
Haplotype(
|
||||
Variant(
|
||||
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3I", 1, 1)), refseq)
|
||||
).edits,
|
||||
) == 1
|
||||
|
||||
# Test that out-of-bounds bases are still caught
|
||||
@test_throws BoundsError Haplotype(
|
||||
@test_throws BoundsError Variant(
|
||||
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3X", 1, 1)), refseq)
|
||||
)
|
||||
end
|
||||
|
|
Loading…
Reference in a new issue