Compare commits

...

11 Commits

@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Added
- `alignment` and `cigar` getters for `Haplotype`s ([#39](https://github.com/BioJulia/SequenceVariation.jl/issues/39)/[#42](https://github.com/BioJulia/SequenceVariation.jl/pull/42))
## [0.2.2] - 2023-01-28
### Fixed

@ -15,13 +15,15 @@ Deletion
Insertion
```
## Variants
## Haplotypes
```@docs
Haplotype
reference(::Haplotype)
variations
reconstruct
BioAlignments.alignment
BioAlignment.cigar
translate(::Haplotype{S,T}, ::PairwiseAlignment{S,S}) where {S,T}
```

@ -39,6 +39,24 @@ human2 == bovine
human2 == human
```
## Alignment reconstruction
Just like [Sequence reconstruction](@ref), alignments can also be reconstructed
from `Haplotype`s using the extension of the [`BioAlignments.alignment`](@ref)
function.
```@repl call_variants
human_alignment = alignment(bos_human_haplotype)
human_alignment == bos_human_alignment
```
Alternatively, you can get the information in CIGAR format using the
extension of the [`BioAlignments.cigar`](@ref) function.
```@repl call_variants
cigar(bos_human_haplotype)
```
## Reference switching
All variations within a haplotype can be mapped to a new reference sequence

@ -221,6 +221,46 @@ function reconstruct(h::Haplotype)
return seq
end
"""
cigar(hap::Haplotype{S,T}) where {S,T}
Constructs a CIGAR string representing the alignment of the sequence of `hap` to its
reference.
"""
function BioAlignments.cigar(hap::Haplotype{S,T}) where {S,T}
cigar_string = String[]
mismatch_vars = filter(var -> !isa(mutation(var), Substitution), variations(hap))
length(mismatch_vars) > 0 || return "$(length(reference(hap)))M"
lastvar = first(mismatch_vars)
leftposition(lastvar) > 1 && push!(cigar_string, "$(leftposition(lastvar))M")
for var in mismatch_vars
push!(cigar_string, _cigar_between(lastvar, var))
push!(cigar_string, _cigar(var))
lastvar = var
end #for
remaining_bases = length(reference(hap)) - rightposition(lastvar)
remaining_bases > 0 && push!(cigar_string, "$(remaining_bases)M")
return join(cigar_string, "")
end
"""
alignment(hap::Haplotype)
Gets a `PairwiseAlignment` of the mutated sequence of `hap` mapped to its refernce sequence
"""
function BioAlignments.alignment(hap::Haplotype)
return PairwiseAlignment(
AlignedSequence(reconstruct(hap), Alignment(cigar(hap))), reference(hap)
)
end
"""
translate(hap::Haplotype{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}

@ -20,7 +20,15 @@ TODO now:
* Add tests
"""
using BioAlignments: BioAlignments, PairwiseAlignment, OP_SOFT_CLIP, sequence
using BioAlignments:
BioAlignments,
Alignment,
AlignedSequence,
PairwiseAlignment,
OP_SOFT_CLIP,
alignment,
cigar,
sequence
using BioGenerics: BioGenerics, leftposition, rightposition
using BioSequences: BioSequences, BioSequence, NucleotideSeq, LongSequence, isgap
using BioSymbols: BioSymbol

@ -120,6 +120,41 @@ function Base.in(v::Variation, var::Haplotype)
return any(v.edit == edit for edit in var.edits)
end
"""
_cigar(var::Variation{S,T}) where {S,T}
Returns a CIGAR operation for `var`. Only supports insertions and deletions.
See also [`_cigar_between`](@ref)
"""
function _cigar(var::Variation{S,T}) where {S,T}
mut = mutation(var)
mut isa Union{Deletion,Insertion} ||
throw(ArgumentError("var must be an Insertion or Deletion"))
cigar_letter = mut isa Deletion ? 'D' : 'I'
return string(length(mut), cigar_letter)
end
"""
_cigar_between(x::Variation{S,T}, y::Variation{S,T}) where {S,T}
Returns a CIGAR operation for the (assumed) matching bases between `x` and `y`.
See also [`_cigar`](@ref)
"""
function _cigar_between(x::Variation{S,T}, y::Variation{S,T}) where {S,T}
x == y && return ""
match_length = leftposition(y) - rightposition(x)
if mutation(y) isa Insertion
match_length -= 1
end
if mutation(y) isa Deletion
match_length += 1
end
match_length > 0 || return ""
return "$(match_length)M"
end
"""
translate(var::Variation{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}

@ -31,7 +31,9 @@ using BioAlignments:
pairalign,
Alignment,
AlignedSequence,
PairwiseAlignment
PairwiseAlignment,
alignment,
cigar
using BioSequences: BioSequence, @dna_str, ungap!
using BioSymbols: DNA_A
using SequenceVariation
@ -127,6 +129,31 @@ end
@test Variation(seq2, "A3T") < Variation(seq2, "T4A")
end
@testset "CIGAR" begin
reference = dna"TGATGCGTGTAGCAACACTTATAGCG"
reference_genotype = Haplotype(
reference, Variation{typeof(reference),eltype(reference)}[]
)
genotype = Haplotype(
reference,
[
Variation(reference, "Δ1-2"),
Variation(reference, "10T"),
Variation(reference, "Δ17-18"),
Variation(reference, "A23C"),
],
)
@test cigar(reference_genotype) == "26M"
@test cigar(genotype) == "2D7M1I7M2D8M"
end
@testset "HaplotypeAlignment" begin
# This test is broken until we get a way to remove sequence info from alignments
# See: https://github.com/BioJulia/BioAlignments.jl/issues/90
@test_broken alignment(var) == align(seq1, seq2)
end
@testset "HaplotypeTranslation" begin
ref1 = seq2
ref2 = seq3

Loading…
Cancel
Save