mirror of
https://github.com/MillironX/SequenceVariation.jl.git
synced 2024-11-22 05:19:55 +00:00
Compare commits
11 commits
a8b0372329
...
373e485802
Author | SHA1 | Date | |
---|---|---|---|
373e485802 | |||
6b2bd018e2 | |||
341faa45e3 | |||
8458f74053 | |||
2d16c0b7b6 | |||
db170730f0 | |||
4817c8016a | |||
95ef4fdc62 | |||
719a123992 | |||
e84f765357 | |||
c3a7788dbd |
7 changed files with 137 additions and 3 deletions
|
@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
|
||||
- `alignment` and `cigar` getters for `Haplotype`s ([#39](https://github.com/BioJulia/SequenceVariation.jl/issues/39)/[#42](https://github.com/BioJulia/SequenceVariation.jl/pull/42))
|
||||
|
||||
## [0.2.2] - 2023-01-28
|
||||
|
||||
### Fixed
|
||||
|
|
|
@ -15,13 +15,15 @@ Deletion
|
|||
Insertion
|
||||
```
|
||||
|
||||
## Variants
|
||||
## Haplotypes
|
||||
|
||||
```@docs
|
||||
Haplotype
|
||||
reference(::Haplotype)
|
||||
variations
|
||||
reconstruct
|
||||
BioAlignments.alignment
|
||||
BioAlignment.cigar
|
||||
translate(::Haplotype{S,T}, ::PairwiseAlignment{S,S}) where {S,T}
|
||||
```
|
||||
|
||||
|
|
|
@ -39,6 +39,24 @@ human2 == bovine
|
|||
human2 == human
|
||||
```
|
||||
|
||||
## Alignment reconstruction
|
||||
|
||||
Just like [Sequence reconstruction](@ref), alignments can also be reconstructed
|
||||
from `Haplotype`s using the extension of the [`BioAlignments.alignment`](@ref)
|
||||
function.
|
||||
|
||||
```@repl call_variants
|
||||
human_alignment = alignment(bos_human_haplotype)
|
||||
human_alignment == bos_human_alignment
|
||||
```
|
||||
|
||||
Alternatively, you can get the information in CIGAR format using the
|
||||
extension of the [`BioAlignments.cigar`](@ref) function.
|
||||
|
||||
```@repl call_variants
|
||||
cigar(bos_human_haplotype)
|
||||
```
|
||||
|
||||
## Reference switching
|
||||
|
||||
All variations within a haplotype can be mapped to a new reference sequence
|
||||
|
|
|
@ -221,6 +221,46 @@ function reconstruct(h::Haplotype)
|
|||
return seq
|
||||
end
|
||||
|
||||
"""
|
||||
cigar(hap::Haplotype{S,T}) where {S,T}
|
||||
|
||||
Constructs a CIGAR string representing the alignment of the sequence of `hap` to its
|
||||
reference.
|
||||
"""
|
||||
function BioAlignments.cigar(hap::Haplotype{S,T}) where {S,T}
|
||||
cigar_string = String[]
|
||||
|
||||
mismatch_vars = filter(var -> !isa(mutation(var), Substitution), variations(hap))
|
||||
|
||||
length(mismatch_vars) > 0 || return "$(length(reference(hap)))M"
|
||||
|
||||
lastvar = first(mismatch_vars)
|
||||
|
||||
leftposition(lastvar) > 1 && push!(cigar_string, "$(leftposition(lastvar))M")
|
||||
|
||||
for var in mismatch_vars
|
||||
push!(cigar_string, _cigar_between(lastvar, var))
|
||||
push!(cigar_string, _cigar(var))
|
||||
lastvar = var
|
||||
end #for
|
||||
|
||||
remaining_bases = length(reference(hap)) - rightposition(lastvar)
|
||||
remaining_bases > 0 && push!(cigar_string, "$(remaining_bases)M")
|
||||
|
||||
return join(cigar_string, "")
|
||||
end
|
||||
|
||||
"""
|
||||
alignment(hap::Haplotype)
|
||||
|
||||
Gets a `PairwiseAlignment` of the mutated sequence of `hap` mapped to its refernce sequence
|
||||
"""
|
||||
function BioAlignments.alignment(hap::Haplotype)
|
||||
return PairwiseAlignment(
|
||||
AlignedSequence(reconstruct(hap), Alignment(cigar(hap))), reference(hap)
|
||||
)
|
||||
end
|
||||
|
||||
"""
|
||||
translate(hap::Haplotype{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}
|
||||
|
||||
|
|
|
@ -20,7 +20,15 @@ TODO now:
|
|||
* Add tests
|
||||
"""
|
||||
|
||||
using BioAlignments: BioAlignments, PairwiseAlignment, OP_SOFT_CLIP, sequence
|
||||
using BioAlignments:
|
||||
BioAlignments,
|
||||
Alignment,
|
||||
AlignedSequence,
|
||||
PairwiseAlignment,
|
||||
OP_SOFT_CLIP,
|
||||
alignment,
|
||||
cigar,
|
||||
sequence
|
||||
using BioGenerics: BioGenerics, leftposition, rightposition
|
||||
using BioSequences: BioSequences, BioSequence, NucleotideSeq, LongSequence, isgap
|
||||
using BioSymbols: BioSymbol
|
||||
|
|
|
@ -120,6 +120,41 @@ function Base.in(v::Variation, var::Haplotype)
|
|||
return any(v.edit == edit for edit in var.edits)
|
||||
end
|
||||
|
||||
"""
|
||||
_cigar(var::Variation{S,T}) where {S,T}
|
||||
|
||||
Returns a CIGAR operation for `var`. Only supports insertions and deletions.
|
||||
|
||||
See also [`_cigar_between`](@ref)
|
||||
"""
|
||||
function _cigar(var::Variation{S,T}) where {S,T}
|
||||
mut = mutation(var)
|
||||
mut isa Union{Deletion,Insertion} ||
|
||||
throw(ArgumentError("var must be an Insertion or Deletion"))
|
||||
cigar_letter = mut isa Deletion ? 'D' : 'I'
|
||||
return string(length(mut), cigar_letter)
|
||||
end
|
||||
|
||||
"""
|
||||
_cigar_between(x::Variation{S,T}, y::Variation{S,T}) where {S,T}
|
||||
|
||||
Returns a CIGAR operation for the (assumed) matching bases between `x` and `y`.
|
||||
|
||||
See also [`_cigar`](@ref)
|
||||
"""
|
||||
function _cigar_between(x::Variation{S,T}, y::Variation{S,T}) where {S,T}
|
||||
x == y && return ""
|
||||
match_length = leftposition(y) - rightposition(x)
|
||||
if mutation(y) isa Insertion
|
||||
match_length -= 1
|
||||
end
|
||||
if mutation(y) isa Deletion
|
||||
match_length += 1
|
||||
end
|
||||
match_length > 0 || return ""
|
||||
return "$(match_length)M"
|
||||
end
|
||||
|
||||
"""
|
||||
translate(var::Variation{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}
|
||||
|
||||
|
|
|
@ -31,7 +31,9 @@ using BioAlignments:
|
|||
pairalign,
|
||||
Alignment,
|
||||
AlignedSequence,
|
||||
PairwiseAlignment
|
||||
PairwiseAlignment,
|
||||
alignment,
|
||||
cigar
|
||||
using BioSequences: BioSequence, @dna_str, ungap!
|
||||
using BioSymbols: DNA_A
|
||||
using SequenceVariation
|
||||
|
@ -127,6 +129,31 @@ end
|
|||
@test Variation(seq2, "A3T") < Variation(seq2, "T4A")
|
||||
end
|
||||
|
||||
@testset "CIGAR" begin
|
||||
reference = dna"TGATGCGTGTAGCAACACTTATAGCG"
|
||||
reference_genotype = Haplotype(
|
||||
reference, Variation{typeof(reference),eltype(reference)}[]
|
||||
)
|
||||
genotype = Haplotype(
|
||||
reference,
|
||||
[
|
||||
Variation(reference, "Δ1-2"),
|
||||
Variation(reference, "10T"),
|
||||
Variation(reference, "Δ17-18"),
|
||||
Variation(reference, "A23C"),
|
||||
],
|
||||
)
|
||||
|
||||
@test cigar(reference_genotype) == "26M"
|
||||
@test cigar(genotype) == "2D7M1I7M2D8M"
|
||||
end
|
||||
|
||||
@testset "HaplotypeAlignment" begin
|
||||
# This test is broken until we get a way to remove sequence info from alignments
|
||||
# See: https://github.com/BioJulia/BioAlignments.jl/issues/90
|
||||
@test_broken alignment(var) == align(seq1, seq2)
|
||||
end
|
||||
|
||||
@testset "HaplotypeTranslation" begin
|
||||
ref1 = seq2
|
||||
ref2 = seq3
|
||||
|
|
Loading…
Reference in a new issue