Compare commits

...

11 commits

Author SHA1 Message Date
373e485802
chore: Update CHANGELOG 2023-04-08 20:51:09 -05:00
6b2bd018e2
docs: Add examples of alignment getters 2023-04-08 20:44:21 -05:00
341faa45e3
docs: Add autodocs for cigar 2023-04-08 20:43:50 -05:00
8458f74053
docs: Add autodocs for alignment 2023-04-08 20:43:30 -05:00
2d16c0b7b6
docs: Rename 'Variants' header to 'Haplotypes' 2023-04-08 20:40:17 -05:00
db170730f0
test: Add test for alignment(::Haplotype) 2023-04-08 20:24:39 -05:00
4817c8016a
feat: Add alignment(::Haplotype) function 2023-04-08 20:24:35 -05:00
95ef4fdc62
test: Add tests for cigar(::Haplotype) 2023-04-08 20:23:39 -05:00
719a123992
feat: Add cigar(::Haplotype) function 2023-04-08 20:23:34 -05:00
e84f765357
feat: Add _cigar_between(::Variation, ::Variation) function
Add a function that can calculate the matching bases between two
(non-matching) Variations and return a matching (M) CIGAR operation
2023-04-08 20:19:49 -05:00
c3a7788dbd
feat: Add _cigar(::Variation) function
Add a function that can convert the information contained in an
Insertion or a Deletion into a CIGAR operator.
2023-04-08 20:19:46 -05:00
7 changed files with 137 additions and 3 deletions

View file

@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased] ## [Unreleased]
### Added
- `alignment` and `cigar` getters for `Haplotype`s ([#39](https://github.com/BioJulia/SequenceVariation.jl/issues/39)/[#42](https://github.com/BioJulia/SequenceVariation.jl/pull/42))
## [0.2.2] - 2023-01-28 ## [0.2.2] - 2023-01-28
### Fixed ### Fixed

View file

@ -15,13 +15,15 @@ Deletion
Insertion Insertion
``` ```
## Variants ## Haplotypes
```@docs ```@docs
Haplotype Haplotype
reference(::Haplotype) reference(::Haplotype)
variations variations
reconstruct reconstruct
BioAlignments.alignment
BioAlignment.cigar
translate(::Haplotype{S,T}, ::PairwiseAlignment{S,S}) where {S,T} translate(::Haplotype{S,T}, ::PairwiseAlignment{S,S}) where {S,T}
``` ```

View file

@ -39,6 +39,24 @@ human2 == bovine
human2 == human human2 == human
``` ```
## Alignment reconstruction
Just like [Sequence reconstruction](@ref), alignments can also be reconstructed
from `Haplotype`s using the extension of the [`BioAlignments.alignment`](@ref)
function.
```@repl call_variants
human_alignment = alignment(bos_human_haplotype)
human_alignment == bos_human_alignment
```
Alternatively, you can get the information in CIGAR format using the
extension of the [`BioAlignments.cigar`](@ref) function.
```@repl call_variants
cigar(bos_human_haplotype)
```
## Reference switching ## Reference switching
All variations within a haplotype can be mapped to a new reference sequence All variations within a haplotype can be mapped to a new reference sequence

View file

@ -221,6 +221,46 @@ function reconstruct(h::Haplotype)
return seq return seq
end end
"""
cigar(hap::Haplotype{S,T}) where {S,T}
Constructs a CIGAR string representing the alignment of the sequence of `hap` to its
reference.
"""
function BioAlignments.cigar(hap::Haplotype{S,T}) where {S,T}
cigar_string = String[]
mismatch_vars = filter(var -> !isa(mutation(var), Substitution), variations(hap))
length(mismatch_vars) > 0 || return "$(length(reference(hap)))M"
lastvar = first(mismatch_vars)
leftposition(lastvar) > 1 && push!(cigar_string, "$(leftposition(lastvar))M")
for var in mismatch_vars
push!(cigar_string, _cigar_between(lastvar, var))
push!(cigar_string, _cigar(var))
lastvar = var
end #for
remaining_bases = length(reference(hap)) - rightposition(lastvar)
remaining_bases > 0 && push!(cigar_string, "$(remaining_bases)M")
return join(cigar_string, "")
end
"""
alignment(hap::Haplotype)
Gets a `PairwiseAlignment` of the mutated sequence of `hap` mapped to its refernce sequence
"""
function BioAlignments.alignment(hap::Haplotype)
return PairwiseAlignment(
AlignedSequence(reconstruct(hap), Alignment(cigar(hap))), reference(hap)
)
end
""" """
translate(hap::Haplotype{S,T}, aln::PairwiseAlignment{S,S}) where {S,T} translate(hap::Haplotype{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}

View file

@ -20,7 +20,15 @@ TODO now:
* Add tests * Add tests
""" """
using BioAlignments: BioAlignments, PairwiseAlignment, OP_SOFT_CLIP, sequence using BioAlignments:
BioAlignments,
Alignment,
AlignedSequence,
PairwiseAlignment,
OP_SOFT_CLIP,
alignment,
cigar,
sequence
using BioGenerics: BioGenerics, leftposition, rightposition using BioGenerics: BioGenerics, leftposition, rightposition
using BioSequences: BioSequences, BioSequence, NucleotideSeq, LongSequence, isgap using BioSequences: BioSequences, BioSequence, NucleotideSeq, LongSequence, isgap
using BioSymbols: BioSymbol using BioSymbols: BioSymbol

View file

@ -120,6 +120,41 @@ function Base.in(v::Variation, var::Haplotype)
return any(v.edit == edit for edit in var.edits) return any(v.edit == edit for edit in var.edits)
end end
"""
_cigar(var::Variation{S,T}) where {S,T}
Returns a CIGAR operation for `var`. Only supports insertions and deletions.
See also [`_cigar_between`](@ref)
"""
function _cigar(var::Variation{S,T}) where {S,T}
mut = mutation(var)
mut isa Union{Deletion,Insertion} ||
throw(ArgumentError("var must be an Insertion or Deletion"))
cigar_letter = mut isa Deletion ? 'D' : 'I'
return string(length(mut), cigar_letter)
end
"""
_cigar_between(x::Variation{S,T}, y::Variation{S,T}) where {S,T}
Returns a CIGAR operation for the (assumed) matching bases between `x` and `y`.
See also [`_cigar`](@ref)
"""
function _cigar_between(x::Variation{S,T}, y::Variation{S,T}) where {S,T}
x == y && return ""
match_length = leftposition(y) - rightposition(x)
if mutation(y) isa Insertion
match_length -= 1
end
if mutation(y) isa Deletion
match_length += 1
end
match_length > 0 || return ""
return "$(match_length)M"
end
""" """
translate(var::Variation{S,T}, aln::PairwiseAlignment{S,S}) where {S,T} translate(var::Variation{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}

View file

@ -31,7 +31,9 @@ using BioAlignments:
pairalign, pairalign,
Alignment, Alignment,
AlignedSequence, AlignedSequence,
PairwiseAlignment PairwiseAlignment,
alignment,
cigar
using BioSequences: BioSequence, @dna_str, ungap! using BioSequences: BioSequence, @dna_str, ungap!
using BioSymbols: DNA_A using BioSymbols: DNA_A
using SequenceVariation using SequenceVariation
@ -127,6 +129,31 @@ end
@test Variation(seq2, "A3T") < Variation(seq2, "T4A") @test Variation(seq2, "A3T") < Variation(seq2, "T4A")
end end
@testset "CIGAR" begin
reference = dna"TGATGCGTGTAGCAACACTTATAGCG"
reference_genotype = Haplotype(
reference, Variation{typeof(reference),eltype(reference)}[]
)
genotype = Haplotype(
reference,
[
Variation(reference, "Δ1-2"),
Variation(reference, "10T"),
Variation(reference, "Δ17-18"),
Variation(reference, "A23C"),
],
)
@test cigar(reference_genotype) == "26M"
@test cigar(genotype) == "2D7M1I7M2D8M"
end
@testset "HaplotypeAlignment" begin
# This test is broken until we get a way to remove sequence info from alignments
# See: https://github.com/BioJulia/BioAlignments.jl/issues/90
@test_broken alignment(var) == align(seq1, seq2)
end
@testset "HaplotypeTranslation" begin @testset "HaplotypeTranslation" begin
ref1 = seq2 ref1 = seq2
ref2 = seq3 ref2 = seq3