mirror of
https://github.com/MillironX/SequenceVariation.jl.git
synced 2024-11-23 05:49:55 +00:00
308 lines
9.4 KiB
Julia
308 lines
9.4 KiB
Julia
"""
|
|
Needs to be able to:
|
|
* Given a sequence and a reference, create a `Haplotype` that unambiguously represents
|
|
the sequence
|
|
|
|
* Given a `Haplotype` and a new reference, translate the variant to the new reference.
|
|
|
|
* Given a mutation and a reference and a sequence, determine if the sequence has that
|
|
mutation
|
|
|
|
TODO now:
|
|
* Create a string repr and parser for Edit, perhaps
|
|
* A243T for sub
|
|
* 119TAGGCTA for insertion
|
|
* TGAGCTA9 for deletion
|
|
* Create a parser + print/show for edit
|
|
* Play around with some NGS results rel. to picked reference.
|
|
* Is it easy to construct ref and variants? I.e. is API nice?
|
|
* Is it nice and easy to check if a mut is present?
|
|
*
|
|
|
|
* Implement "reference switching".
|
|
* Add tests
|
|
"""
|
|
|
|
using Aqua
|
|
using BioAlignments:
|
|
AffineGapScoreModel,
|
|
GlobalAlignment,
|
|
EDNAFULL,
|
|
pairalign,
|
|
Alignment,
|
|
AlignedSequence,
|
|
PairwiseAlignment,
|
|
cigar
|
|
using BioSequences: BioSequence, @dna_str, ungap!
|
|
using BioSymbols: DNA_A
|
|
using SequenceVariation
|
|
using Test
|
|
|
|
const DNA_MODEL = AffineGapScoreModel(EDNAFULL; gap_open=-25, gap_extend=-2)
|
|
|
|
align(a::BioSequence, b::BioSequence) = pairalign(GlobalAlignment(), a, b, DNA_MODEL).aln
|
|
seq1 = ungap!(dna"--ATGCGTGTTAGCAAC--TTATCGCG")
|
|
seq2 = ungap!(dna"TGATGCGTGT-AGCAACACTTATAGCG")
|
|
seq3 = ungap!(dna"-GATGCGTGT-AGCAACACTTATCGC-")
|
|
var = Haplotype(align(seq1, seq2))
|
|
|
|
const SEQ = typeof(seq1)
|
|
const BSE = eltype(seq1)
|
|
|
|
@testset "EditSorting" begin
|
|
S = typeof(seq1)
|
|
T = eltype(seq1)
|
|
@test SequenceVariation.Edit{S,T}(Deletion(1), 1) <
|
|
SequenceVariation.Edit{S,T}(Deletion(1), 2)
|
|
@test SequenceVariation.Edit{S,T}(Deletion(1), 1) <
|
|
SequenceVariation.Edit{S,T}(Deletion(2), 1)
|
|
end
|
|
|
|
@testset "HaplotypeValidation" begin
|
|
# Test that we can't use an empty reference
|
|
@test_throws ErrorException Haplotype(
|
|
dna"", [SequenceVariation.Edit{SEQ,BSE}(Insertion{SEQ}(dna"A"), 0)]
|
|
)
|
|
|
|
# Test that substitutions cannot share the same position
|
|
@test_throws ErrorException Haplotype(
|
|
seq2, [Variation(seq2, "T2A"), Variation(seq2, "T2C")]
|
|
)
|
|
@test_throws ErrorException Haplotype(
|
|
seq2, [Variation(seq2, "G2A"), Variation(seq2, "G2T")]
|
|
)
|
|
@test_throws ErrorException Haplotype(
|
|
seq2, [Variation(seq2, "G26A"), Variation(seq2, "G26T")]
|
|
)
|
|
|
|
# Test that insertions cannot share the same position
|
|
@test_throws ErrorException Haplotype(
|
|
seq2, [Variation(seq2, "0AAA"), Variation(seq2, "0TTT")]
|
|
)
|
|
@test_throws ErrorException Haplotype(
|
|
seq2, [Variation(seq2, "2AAA"), Variation(seq2, "2TTT")]
|
|
)
|
|
@test_throws ErrorException Haplotype(
|
|
seq2, [Variation(seq2, "26AAA"), Variation(seq2, "26TTT")]
|
|
)
|
|
|
|
# Test that deletions invalidate further operations within the deleted positions
|
|
@test_throws ErrorException Haplotype(
|
|
seq2, [Variation(seq2, "Δ2-2"), Variation(seq2, "G2A")]
|
|
)
|
|
@test_throws ErrorException Haplotype(
|
|
seq2, [Variation(seq2, "G2A"), Variation(seq2, "Δ2-2")]
|
|
)
|
|
@test_throws ErrorException Haplotype(
|
|
seq2, [Variation(seq2, "Δ2-6"), Variation(seq2, "Δ3-5")]
|
|
)
|
|
|
|
# Test that a complicated (but valid) Haplotype still checks out
|
|
@test first(
|
|
SequenceVariation._is_valid(
|
|
Haplotype(
|
|
seq2,
|
|
[
|
|
Variation(seq2, "0AAA"),
|
|
Variation(seq2, "T2A"),
|
|
Variation(seq2, "Δ5-8"),
|
|
Variation(seq2, "G26A"),
|
|
],
|
|
),
|
|
),
|
|
)
|
|
end
|
|
|
|
@testset "HaplotypeRoundtrip" begin
|
|
for v in variations(var)
|
|
@test v in var
|
|
@test v in Haplotype(seq2, [v])
|
|
end
|
|
end
|
|
|
|
@testset "HaplotypeReconstruction" begin
|
|
@test reconstruct(var) == seq1
|
|
end
|
|
|
|
@testset "VariationSorting" begin
|
|
@test Variation(seq2, "A3T") < Variation(seq2, "T4A")
|
|
end
|
|
|
|
@testset "CIGAR" begin
|
|
reference = dna"TGATGCGTGTAGCAACACTTATAGCG"
|
|
reference_genotype = Haplotype(
|
|
reference, Variation{typeof(reference),eltype(reference)}[]
|
|
)
|
|
genotype = Haplotype(
|
|
reference,
|
|
[
|
|
Variation(reference, "Δ1-2"),
|
|
Variation(reference, "10T"),
|
|
Variation(reference, "Δ17-18"),
|
|
Variation(reference, "A23C"),
|
|
],
|
|
)
|
|
|
|
@test cigar(reference_genotype) == "26M"
|
|
@test cigar(genotype) == "2D7M1I7M2D8M"
|
|
end
|
|
|
|
@testset "HaplotypeTranslation" begin
|
|
ref1 = seq2
|
|
ref2 = seq3
|
|
alt = seq1
|
|
|
|
@test all(
|
|
v -> v in Haplotype(align(alt, ref2)), variations(translate(var, align(ref2, ref1)))
|
|
)
|
|
end
|
|
|
|
@testset "VariationPosition" begin
|
|
refseq = dna"ACAACTTTATCT"
|
|
mutseq = dna"ACATCTTTATCT"
|
|
|
|
read01 = AlignedSequence(mutseq[1:10], Alignment("10M", 1, 1))
|
|
read02 = AlignedSequence(mutseq[3:12], Alignment("10M", 1, 3))
|
|
aln01 = PairwiseAlignment(read01, refseq)
|
|
aln02 = PairwiseAlignment(read02, refseq)
|
|
@test Haplotype(aln01).edits == Haplotype(aln02).edits
|
|
end
|
|
|
|
@testset "VariationParsing" begin
|
|
refseq = dna"ACAACTTTATCT"
|
|
|
|
sub = Variation(refseq, "A4T")
|
|
del = Variation(refseq, "Δ4-5")
|
|
ins = Variation(refseq, "4TT")
|
|
|
|
@test mutation(sub) isa Substitution
|
|
@test mutation(del) isa Deletion
|
|
@test mutation(ins) isa Insertion
|
|
end
|
|
|
|
@testset "VariationRetrieval" begin
|
|
refseq = dna"ACAACTTTATCT"
|
|
mutseq = dna"ACATCTTTATCT"
|
|
|
|
read = AlignedSequence(mutseq[1:10], Alignment("10M", 1, 1))
|
|
aln = PairwiseAlignment(read, refseq)
|
|
var = Haplotype(aln)
|
|
|
|
sub = Variation(refseq, "A4T")
|
|
@test first(variations(var)) == sub
|
|
end
|
|
|
|
@testset "VariationTranslation" begin
|
|
ref1 = seq2
|
|
ref2 = seq3
|
|
ref3 = copy(ref1)
|
|
ref3[1] = DNA_A
|
|
alt = seq1
|
|
|
|
ref2_on_ref1 = align(ref2, ref1)
|
|
alt_on_ref1 = align(alt, ref1)
|
|
ref1_on_alt = align(ref1, alt)
|
|
ref3_on_ref1 = align(ref3, ref1)
|
|
|
|
# Test insertion at position zero
|
|
@test translate(Variation(ref1, "0CAT"), ref2_on_ref1) ==
|
|
SequenceVariation.Inapplicable()
|
|
@test translate(Variation(ref1, "0CAT"), ref3_on_ref1) == Variation(ref3, "0CAT")
|
|
|
|
# Test substitution on deletion
|
|
@test isnothing(translate(Variation(ref1, "A17T"), ref1_on_alt))
|
|
|
|
# Test substitution to itself
|
|
@test isnothing(translate(Variation(ref1, "A23C"), ref2_on_ref1))
|
|
|
|
# Test simple substitution
|
|
@test translate(Variation(ref1, "T10A"), ref2_on_ref1) == Variation(ref2, "T9A")
|
|
|
|
# Test simple deletion
|
|
@test translate(Variation(ref1, "Δ17-18"), ref2_on_ref1) == Variation(ref2, "Δ16-17")
|
|
|
|
# Test deletion on deletion
|
|
@test isnothing(translate(Variation(ref1, "Δ17-17"), alt_on_ref1))
|
|
|
|
# Test simple insertion
|
|
@test translate(Variation(ref1, "6CAT"), ref2_on_ref1) == Variation(ref2, "5CAT")
|
|
|
|
# Test two insertions at the same position
|
|
@test translate(Variation(ref1, "10G"), alt_on_ref1) == SequenceVariation.Inapplicable()
|
|
|
|
# Test insertion on deletion
|
|
@test translate(Variation(ref1, "17CAT"), alt_on_ref1) ==
|
|
SequenceVariation.Inapplicable()
|
|
end
|
|
|
|
@testset "VariationBases" begin
|
|
# Test substition bases
|
|
@test refbases(Variation(dna"ATCGA", "C3G")) == dna"C"
|
|
@test altbases(Variation(dna"ATCGA", "C3G")) == dna"G"
|
|
|
|
# Test single deletion bases
|
|
@test refbases(Variation(dna"ATCGA", "Δ3-3")) == dna"TC"
|
|
@test altbases(Variation(dna"ATCGA", "Δ3-3")) == dna"T"
|
|
|
|
# Test multiple deletion bases
|
|
@test refbases(Variation(dna"ATCGA", "Δ3-4")) == dna"TCG"
|
|
@test altbases(Variation(dna"ATCGA", "Δ3-4")) == dna"T"
|
|
|
|
# Test first position deletion
|
|
@test refbases(Variation(dna"ATCGA", "Δ1-1")) == dna"AT"
|
|
@test altbases(Variation(dna"ATCGA", "Δ1-1")) == dna"T"
|
|
|
|
# Test single insertion bases
|
|
@test refbases(Variation(dna"ATCGA", "3A")) == dna"C"
|
|
@test altbases(Variation(dna"ATCGA", "3A")) == dna"CA"
|
|
|
|
# Test multiple insertion bases
|
|
@test refbases(Variation(dna"ATCGA", "3TAG")) == dna"C"
|
|
@test altbases(Variation(dna"ATCGA", "3TAG")) == dna"CTAG"
|
|
|
|
# Test first position insertion
|
|
@test refbases(Variation(dna"ATCGA", "1C")) == dna"A"
|
|
@test altbases(Variation(dna"ATCGA", "1C")) == dna"CA"
|
|
end
|
|
|
|
@testset "SoftclipHaplotype" begin
|
|
refseq = dna"GATTACA"
|
|
mutseq = dna"GATTACAAAA"
|
|
|
|
refvar = Haplotype(refseq, SequenceVariation.Edit{typeof(refseq),eltype(refseq)}[])
|
|
|
|
# Test for ending soft clip
|
|
@test Haplotype(
|
|
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S", 1, 1)), refseq)
|
|
) == refvar
|
|
|
|
# Test for ending soft+hard clip
|
|
@test Haplotype(
|
|
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S2H", 1, 1)), refseq)
|
|
) == refvar
|
|
|
|
# Test that ending insertions are still valid
|
|
@test length(
|
|
Haplotype(
|
|
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3I", 1, 1)), refseq)
|
|
).edits,
|
|
) == 1
|
|
|
|
# Test that out-of-bounds bases are still caught
|
|
@test_throws BoundsError Haplotype(
|
|
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3X", 1, 1)), refseq)
|
|
)
|
|
end
|
|
|
|
@testset "Aqua" begin
|
|
Aqua.test_ambiguities(SequenceVariation; recursive=false)
|
|
# TODO: Refactor `Edit` so that this test doesn't fail
|
|
# TODO: This test _should_ be set to @test_fails, but Aqua's syntax doesn't allow that
|
|
# Aqua.test_unbound_args(SequenceVariation)
|
|
Aqua.test_undefined_exports(SequenceVariation)
|
|
Aqua.test_piracy(SequenceVariation)
|
|
Aqua.test_project_extras(SequenceVariation)
|
|
Aqua.test_stale_deps(SequenceVariation)
|
|
Aqua.test_deps_compat(SequenceVariation)
|
|
Aqua.test_project_toml_formatting(SequenceVariation)
|
|
end
|