Compare commits

...

12 commits

Author SHA1 Message Date
17f4cd6f39 Update CHANGELOG 2022-12-17 22:02:15 +00:00
128a5445ad Switch alignment validation to after edit construction 2022-12-17 22:02:15 +00:00
f9058c5cb3 Update ranges to allow end position insertions 2022-12-17 22:02:15 +00:00
5be4dce200 Add check for soft clips when construction Variants from alignment 2022-12-17 22:02:15 +00:00
d8435be115 Add Variation-level validation for Variants
Testing indicates that some Variants (particularly those with insertions
and/or clips at the send of the query sequence) will validate fine as a
Variant, but will fail validation as a Variation. This problem is
particularly annoying when constructing Variants in the REPL, as the
creation will complete successfully, but the show of the Variant will
error out. To fix this, leverage the existing validation of Variation to
check each Edit within a Variant upon construction.
2022-12-17 22:02:15 +00:00
91c3acc85e Add equality function for Variant 2022-12-17 22:02:15 +00:00
f9e76d60d6 Add test for ending substitutions being invalid 2022-12-17 22:02:15 +00:00
3b61ccefb5 Add test for ending insertion 2022-12-17 22:02:15 +00:00
abff6692d4 Add test for combined clips 2022-12-17 22:02:15 +00:00
dd00231840 Add test for soft clips 2022-12-17 22:02:15 +00:00
b45081a56e Update CHANGELOG for v0.1.3 2022-11-22 18:07:53 +00:00
00495d4e14 Bump version number in Project.toml 2022-11-22 18:07:53 +00:00
4 changed files with 47 additions and 12 deletions

View file

@ -7,13 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased] ## [Unreleased]
## [0.1.3] - 2022-11-22
## Changed ## Changed
- Variations getter now returns type-parameterized vector ([#23](https://github.com/BioJulia/SequenceVariation.jl/pull/23)) - Variations getter now returns type-parameterized vector ([#23](https://github.com/BioJulia/SequenceVariation.jl/pull/23))
### Fixed
- Soft clips at end of alignment causing invalid `Variant`s ([#25](https://github.com/BioJulia/SequenceVariation.jl/issues/25)/[#26](https://github.com/BioJulia/SequenceVariation.jl/pull/26))
## [0.1.2] - 2022-10-04 ## [0.1.2] - 2022-10-04
- ## Changed ## Changed
- Updated dependency compats ([#21](https://github.com/BioJulia/SequenceVariation.jl/pull/21)) - Updated dependency compats ([#21](https://github.com/BioJulia/SequenceVariation.jl/pull/21))
- BioAlignments: 2 -> 2,3 - BioAlignments: 2 -> 2,3
@ -37,7 +43,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `Variant` constructor to automatically detect mutations from a `BioAlignments.PairwiseAlignment` - `Variant` constructor to automatically detect mutations from a `BioAlignments.PairwiseAlignment`
- Methods to get reference and alternate bases from a `Variation` - Methods to get reference and alternate bases from a `Variation`
[unreleased]: https://github.com/BioJulia/SequenceVariation.jl/compare/v0.1.2...HEAD [unreleased]: https://github.com/BioJulia/SequenceVariation.jl/compare/v0.1.3...HEAD
[0.1.3]: https://github.com/BioJulia/SequenceVariation.jl/compare/v0.1.2...v0.1.3
[0.1.2]: https://github.com/BioJulia/SequenceVariation.jl/compare/v0.1.1...v0.1.2 [0.1.2]: https://github.com/BioJulia/SequenceVariation.jl/compare/v0.1.1...v0.1.2
[0.1.1]: https://github.com/BioJulia/SequenceVariation.jl/compare/v0.1.0...v0.1.1 [0.1.1]: https://github.com/BioJulia/SequenceVariation.jl/compare/v0.1.0...v0.1.1
[0.1.0]: https://github.com/BioJulia/SequenceVariation.jl/releases/tag/v0.1.0 [0.1.0]: https://github.com/BioJulia/SequenceVariation.jl/releases/tag/v0.1.0

View file

@ -1,7 +1,7 @@
name = "SequenceVariation" name = "SequenceVariation"
uuid = "eef6e190-9969-4f06-a38f-35a110a8fdc8" uuid = "eef6e190-9969-4f06-a38f-35a110a8fdc8"
authors = ["Jakob Nybo Nissen <jakobnybonissen@gmail.com>", "Thomas A. Christensen II <25492070+MillironX@users.noreply.github.com>"] authors = ["Jakob Nybo Nissen <jakobnybonissen@gmail.com>", "Thomas A. Christensen II <25492070+MillironX@users.noreply.github.com>"]
version = "0.1.2" version = "0.1.3"
[deps] [deps]
BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e" BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"

View file

@ -20,7 +20,7 @@ TODO now:
* Add tests * Add tests
""" """
using BioAlignments: BioAlignments, PairwiseAlignment using BioAlignments: BioAlignments, PairwiseAlignment, OP_SOFT_CLIP
using BioGenerics: BioGenerics, leftposition, rightposition using BioGenerics: BioGenerics, leftposition, rightposition
using BioSequences: BioSequences, BioSequence, NucleotideSeq, LongSequence, isgap using BioSequences: BioSequences, BioSequence, NucleotideSeq, LongSequence, isgap
using BioSymbols: BioSymbol using BioSymbols: BioSymbol
@ -233,6 +233,10 @@ function is_valid(v::Variant)
for edit in v.edits for edit in v.edits
pos = edit.pos pos = edit.pos
op = edit.x op = edit.x
# Sanity check: for this to be a valid variant, it must be comprised of valid
# variations
is_valid(Variation(v.ref, edit)) || return false
# For substitutions we simply do not allow another modification of the same base # For substitutions we simply do not allow another modification of the same base
if op isa Substitution if op isa Substitution
pos in valid_positions || return false pos in valid_positions || return false
@ -242,7 +246,7 @@ function is_valid(v::Variant)
# for next op. However, we cannot have two insertions at the same position, because # for next op. However, we cannot have two insertions at the same position, because
# then the order of them is ambiguous # then the order of them is ambiguous
elseif op isa Insertion elseif op isa Insertion
pos in (first(valid_positions)-1+last_was_insert:last(valid_positions)) || return false pos in (first(valid_positions)-1+last_was_insert:last(valid_positions)+1) || return false
last_was_insert = true last_was_insert = true
# Deletions obviously invalidate the reference bases that are deleted. # Deletions obviously invalidate the reference bases that are deleted.
elseif op isa Deletion elseif op isa Deletion
@ -259,7 +263,6 @@ function Variant(aln::PairwiseAlignment{T, T}) where {T <: LongSequence{<:Union{
ref = aln.b ref = aln.b
E = eltype(typeof(ref)) E = eltype(typeof(ref))
edits = Edit{T, E}[] edits = Edit{T, E}[]
result = Variant(ref, edits)
refpos = first(aln.a.aln.anchors).refpos refpos = first(aln.a.aln.anchors).refpos
seqpos = first(aln.a.aln.anchors).seqpos seqpos = first(aln.a.aln.anchors).seqpos
markpos = 0 markpos = 0
@ -296,18 +299,24 @@ function Variant(aln::PairwiseAlignment{T, T}) where {T <: LongSequence{<:Union{
end end
end end
# Check for clips at the end of the alignment
last_anchors = aln.a.aln.anchors[end-1:end]
# Final indel, if applicable # Final indel, if applicable
if !any(anchor -> anchor.op == OP_SOFT_CLIP, last_anchors)
if !iszero(n_gaps) if !iszero(n_gaps)
push!(edits, Edit{T, E}(Deletion(UInt(n_gaps)), UInt(markpos))) push!(edits, Edit{T, E}(Deletion(UInt(n_gaps)), UInt(markpos)))
elseif !iszero(n_ins) elseif !iszero(n_ins)
push!(edits, Edit{T, E}(Insertion(T(insertion_buffer)), UInt(markpos))) push!(edits, Edit{T, E}(Insertion(T(insertion_buffer)), UInt(markpos)))
end end
end
return result return Variant(ref, edits)
end end
edits(v::Variant) = v.edits edits(v::Variant) = v.edits
reference(v::Variant) = v.ref reference(v::Variant) = v.ref
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits
function lendiff(edit::Edit) function lendiff(edit::Edit)
x = edit.x x = edit.x
@ -389,7 +398,7 @@ function is_valid(v::Variation)
if op isa Substitution if op isa Substitution
return pos in eachindex(v.ref) return pos in eachindex(v.ref)
elseif op isa Insertion elseif op isa Insertion
return pos in 0:lastindex(v.ref) return pos in 0:lastindex(v.ref)+1
elseif op isa Deletion elseif op isa Deletion
return pos in 1:(lastindex(v.ref)-length(op) + 1) return pos in 1:(lastindex(v.ref)-length(op) + 1)
end end

View file

@ -106,3 +106,22 @@ end
@test refbases(Variation(dna"ATCGA", "1C")) == dna"A" @test refbases(Variation(dna"ATCGA", "1C")) == dna"A"
@test altbases(Variation(dna"ATCGA", "1C")) == dna"CA" @test altbases(Variation(dna"ATCGA", "1C")) == dna"CA"
end end
@testset "SoftclipVariant" begin
refseq = dna"GATTACA"
mutseq = dna"GATTACAAAA"
refvar = Variant(refseq, SequenceVariation.Edit{typeof(refseq), eltype(refseq)}[])
# Test for ending soft clip
@test Variant(PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S", 1, 1)), refseq)) == refvar
# Test for ending soft+hard clip
@test Variant(PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S2H", 1, 1)), refseq)) == refvar
# Test that ending insertions are still valid
@test length(Variant(PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3I", 1, 1)), refseq)).edits) == 1
# Test that out-of-bounds bases are still caught
@test_throws BoundsError Variant(PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3X", 1, 1)), refseq))
end