mirror of
https://github.com/MillironX/SequenceVariation.jl.git
synced 2024-11-22 21:39:55 +00:00
Compare commits
11 commits
a8b0372329
...
373e485802
Author | SHA1 | Date | |
---|---|---|---|
373e485802 | |||
6b2bd018e2 | |||
341faa45e3 | |||
8458f74053 | |||
2d16c0b7b6 | |||
db170730f0 | |||
4817c8016a | |||
95ef4fdc62 | |||
719a123992 | |||
e84f765357 | |||
c3a7788dbd |
7 changed files with 137 additions and 3 deletions
|
@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||||
|
|
||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
|
### Added
|
||||||
|
|
||||||
|
- `alignment` and `cigar` getters for `Haplotype`s ([#39](https://github.com/BioJulia/SequenceVariation.jl/issues/39)/[#42](https://github.com/BioJulia/SequenceVariation.jl/pull/42))
|
||||||
|
|
||||||
## [0.2.2] - 2023-01-28
|
## [0.2.2] - 2023-01-28
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
|
|
@ -15,13 +15,15 @@ Deletion
|
||||||
Insertion
|
Insertion
|
||||||
```
|
```
|
||||||
|
|
||||||
## Variants
|
## Haplotypes
|
||||||
|
|
||||||
```@docs
|
```@docs
|
||||||
Haplotype
|
Haplotype
|
||||||
reference(::Haplotype)
|
reference(::Haplotype)
|
||||||
variations
|
variations
|
||||||
reconstruct
|
reconstruct
|
||||||
|
BioAlignments.alignment
|
||||||
|
BioAlignment.cigar
|
||||||
translate(::Haplotype{S,T}, ::PairwiseAlignment{S,S}) where {S,T}
|
translate(::Haplotype{S,T}, ::PairwiseAlignment{S,S}) where {S,T}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
@ -39,6 +39,24 @@ human2 == bovine
|
||||||
human2 == human
|
human2 == human
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Alignment reconstruction
|
||||||
|
|
||||||
|
Just like [Sequence reconstruction](@ref), alignments can also be reconstructed
|
||||||
|
from `Haplotype`s using the extension of the [`BioAlignments.alignment`](@ref)
|
||||||
|
function.
|
||||||
|
|
||||||
|
```@repl call_variants
|
||||||
|
human_alignment = alignment(bos_human_haplotype)
|
||||||
|
human_alignment == bos_human_alignment
|
||||||
|
```
|
||||||
|
|
||||||
|
Alternatively, you can get the information in CIGAR format using the
|
||||||
|
extension of the [`BioAlignments.cigar`](@ref) function.
|
||||||
|
|
||||||
|
```@repl call_variants
|
||||||
|
cigar(bos_human_haplotype)
|
||||||
|
```
|
||||||
|
|
||||||
## Reference switching
|
## Reference switching
|
||||||
|
|
||||||
All variations within a haplotype can be mapped to a new reference sequence
|
All variations within a haplotype can be mapped to a new reference sequence
|
||||||
|
|
|
@ -221,6 +221,46 @@ function reconstruct(h::Haplotype)
|
||||||
return seq
|
return seq
|
||||||
end
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
cigar(hap::Haplotype{S,T}) where {S,T}
|
||||||
|
|
||||||
|
Constructs a CIGAR string representing the alignment of the sequence of `hap` to its
|
||||||
|
reference.
|
||||||
|
"""
|
||||||
|
function BioAlignments.cigar(hap::Haplotype{S,T}) where {S,T}
|
||||||
|
cigar_string = String[]
|
||||||
|
|
||||||
|
mismatch_vars = filter(var -> !isa(mutation(var), Substitution), variations(hap))
|
||||||
|
|
||||||
|
length(mismatch_vars) > 0 || return "$(length(reference(hap)))M"
|
||||||
|
|
||||||
|
lastvar = first(mismatch_vars)
|
||||||
|
|
||||||
|
leftposition(lastvar) > 1 && push!(cigar_string, "$(leftposition(lastvar))M")
|
||||||
|
|
||||||
|
for var in mismatch_vars
|
||||||
|
push!(cigar_string, _cigar_between(lastvar, var))
|
||||||
|
push!(cigar_string, _cigar(var))
|
||||||
|
lastvar = var
|
||||||
|
end #for
|
||||||
|
|
||||||
|
remaining_bases = length(reference(hap)) - rightposition(lastvar)
|
||||||
|
remaining_bases > 0 && push!(cigar_string, "$(remaining_bases)M")
|
||||||
|
|
||||||
|
return join(cigar_string, "")
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
alignment(hap::Haplotype)
|
||||||
|
|
||||||
|
Gets a `PairwiseAlignment` of the mutated sequence of `hap` mapped to its refernce sequence
|
||||||
|
"""
|
||||||
|
function BioAlignments.alignment(hap::Haplotype)
|
||||||
|
return PairwiseAlignment(
|
||||||
|
AlignedSequence(reconstruct(hap), Alignment(cigar(hap))), reference(hap)
|
||||||
|
)
|
||||||
|
end
|
||||||
|
|
||||||
"""
|
"""
|
||||||
translate(hap::Haplotype{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}
|
translate(hap::Haplotype{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}
|
||||||
|
|
||||||
|
|
|
@ -20,7 +20,15 @@ TODO now:
|
||||||
* Add tests
|
* Add tests
|
||||||
"""
|
"""
|
||||||
|
|
||||||
using BioAlignments: BioAlignments, PairwiseAlignment, OP_SOFT_CLIP, sequence
|
using BioAlignments:
|
||||||
|
BioAlignments,
|
||||||
|
Alignment,
|
||||||
|
AlignedSequence,
|
||||||
|
PairwiseAlignment,
|
||||||
|
OP_SOFT_CLIP,
|
||||||
|
alignment,
|
||||||
|
cigar,
|
||||||
|
sequence
|
||||||
using BioGenerics: BioGenerics, leftposition, rightposition
|
using BioGenerics: BioGenerics, leftposition, rightposition
|
||||||
using BioSequences: BioSequences, BioSequence, NucleotideSeq, LongSequence, isgap
|
using BioSequences: BioSequences, BioSequence, NucleotideSeq, LongSequence, isgap
|
||||||
using BioSymbols: BioSymbol
|
using BioSymbols: BioSymbol
|
||||||
|
|
|
@ -120,6 +120,41 @@ function Base.in(v::Variation, var::Haplotype)
|
||||||
return any(v.edit == edit for edit in var.edits)
|
return any(v.edit == edit for edit in var.edits)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
_cigar(var::Variation{S,T}) where {S,T}
|
||||||
|
|
||||||
|
Returns a CIGAR operation for `var`. Only supports insertions and deletions.
|
||||||
|
|
||||||
|
See also [`_cigar_between`](@ref)
|
||||||
|
"""
|
||||||
|
function _cigar(var::Variation{S,T}) where {S,T}
|
||||||
|
mut = mutation(var)
|
||||||
|
mut isa Union{Deletion,Insertion} ||
|
||||||
|
throw(ArgumentError("var must be an Insertion or Deletion"))
|
||||||
|
cigar_letter = mut isa Deletion ? 'D' : 'I'
|
||||||
|
return string(length(mut), cigar_letter)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
_cigar_between(x::Variation{S,T}, y::Variation{S,T}) where {S,T}
|
||||||
|
|
||||||
|
Returns a CIGAR operation for the (assumed) matching bases between `x` and `y`.
|
||||||
|
|
||||||
|
See also [`_cigar`](@ref)
|
||||||
|
"""
|
||||||
|
function _cigar_between(x::Variation{S,T}, y::Variation{S,T}) where {S,T}
|
||||||
|
x == y && return ""
|
||||||
|
match_length = leftposition(y) - rightposition(x)
|
||||||
|
if mutation(y) isa Insertion
|
||||||
|
match_length -= 1
|
||||||
|
end
|
||||||
|
if mutation(y) isa Deletion
|
||||||
|
match_length += 1
|
||||||
|
end
|
||||||
|
match_length > 0 || return ""
|
||||||
|
return "$(match_length)M"
|
||||||
|
end
|
||||||
|
|
||||||
"""
|
"""
|
||||||
translate(var::Variation{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}
|
translate(var::Variation{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}
|
||||||
|
|
||||||
|
|
|
@ -31,7 +31,9 @@ using BioAlignments:
|
||||||
pairalign,
|
pairalign,
|
||||||
Alignment,
|
Alignment,
|
||||||
AlignedSequence,
|
AlignedSequence,
|
||||||
PairwiseAlignment
|
PairwiseAlignment,
|
||||||
|
alignment,
|
||||||
|
cigar
|
||||||
using BioSequences: BioSequence, @dna_str, ungap!
|
using BioSequences: BioSequence, @dna_str, ungap!
|
||||||
using BioSymbols: DNA_A
|
using BioSymbols: DNA_A
|
||||||
using SequenceVariation
|
using SequenceVariation
|
||||||
|
@ -127,6 +129,31 @@ end
|
||||||
@test Variation(seq2, "A3T") < Variation(seq2, "T4A")
|
@test Variation(seq2, "A3T") < Variation(seq2, "T4A")
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@testset "CIGAR" begin
|
||||||
|
reference = dna"TGATGCGTGTAGCAACACTTATAGCG"
|
||||||
|
reference_genotype = Haplotype(
|
||||||
|
reference, Variation{typeof(reference),eltype(reference)}[]
|
||||||
|
)
|
||||||
|
genotype = Haplotype(
|
||||||
|
reference,
|
||||||
|
[
|
||||||
|
Variation(reference, "Δ1-2"),
|
||||||
|
Variation(reference, "10T"),
|
||||||
|
Variation(reference, "Δ17-18"),
|
||||||
|
Variation(reference, "A23C"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
@test cigar(reference_genotype) == "26M"
|
||||||
|
@test cigar(genotype) == "2D7M1I7M2D8M"
|
||||||
|
end
|
||||||
|
|
||||||
|
@testset "HaplotypeAlignment" begin
|
||||||
|
# This test is broken until we get a way to remove sequence info from alignments
|
||||||
|
# See: https://github.com/BioJulia/BioAlignments.jl/issues/90
|
||||||
|
@test_broken alignment(var) == align(seq1, seq2)
|
||||||
|
end
|
||||||
|
|
||||||
@testset "HaplotypeTranslation" begin
|
@testset "HaplotypeTranslation" begin
|
||||||
ref1 = seq2
|
ref1 = seq2
|
||||||
ref2 = seq3
|
ref2 = seq3
|
||||||
|
|
Loading…
Reference in a new issue