Compare commits

..

38 commits

Author SHA1 Message Date
3ef4a92ba2 Update CHANGELOG 2023-01-04 11:56:45 -06:00
8ae09aa9a1 Add linting workflow for PRs 2023-01-04 11:56:45 -06:00
d29764d2bc Apply Blue style to tests file 2023-01-04 11:56:45 -06:00
fb9ab68cca Apply Blue style to docs file 2023-01-04 11:56:45 -06:00
cb53defe9a Add Aqua tests 2023-01-04 11:56:45 -06:00
1942090cd6 Remove Windows and Mac CI tests
There is no platform-specific code in this package, so this just slows down
CI with no benefit
2023-01-04 11:56:45 -06:00
6510ee3fc0 Add tutorial-type documentation 2023-01-04 11:56:45 -06:00
134ee771b9 Correct package install directions in docs 2023-01-04 11:56:45 -06:00
6f7110caec Organize API docs 2023-01-04 11:56:45 -06:00
2a7d61e43d Add docstrings for Variation functions 2023-01-04 11:56:45 -06:00
673481b40f Add docstring for Variation 2023-01-04 11:56:45 -06:00
ea25ab87dc Add docstrings for Variant methods 2023-01-04 11:56:45 -06:00
f49220782b Add docstring for Variant 2023-01-04 11:56:45 -06:00
f1baf27865 Add docstring for lendiff(::Edit) 2023-01-04 11:56:45 -06:00
82ba838614 Add docstring for mutation(::Edit) 2023-01-04 11:56:45 -06:00
1f2b116518 Mark is_valid(::Variation) as private function 2023-01-04 11:56:45 -06:00
386acba8ab Mark edits(::Variant) as private API 2023-01-04 11:56:45 -06:00
a573693737 Mark edit(::Variation) as private function 2023-01-04 11:56:45 -06:00
2f4f5fb5ae Mark is_valid(::Variant) as private function 2023-01-04 11:56:45 -06:00
2719c438f3 Mark lendiff(::Edit) as private API 2023-01-04 11:56:45 -06:00
e8457e4241 Mark mutation(::Edit) as private API 2023-01-04 11:56:45 -06:00
cb1c429610 Update getter usage in translate function 2023-01-04 11:56:45 -06:00
ef4460bbbf Replace field accession with getter function usage 2023-01-04 11:56:45 -06:00
dd405c5f4b Refactor _lendiff to use multiple dispatch 2023-01-04 11:56:45 -06:00
cab3029bc6 Refactor Edit length to use dispatch instead of type checking 2023-01-04 11:56:45 -06:00
22d460b15a Remove unused argument names from base getters for edits 2023-01-04 11:56:45 -06:00
da2cbbd528 Add equals functionality to Deletion 2023-01-04 11:56:45 -06:00
f08d6eb991 Add length method for Substitution 2023-01-04 11:56:45 -06:00
3b2730b5c1 Add export for translate and reconstruct! 2023-01-04 11:56:45 -06:00
25c63906dc Move exports to top of file
Per Blue style, exports should be one per line, and appear immediately
after using statements. Reorganize module file to comply
2023-01-04 11:56:45 -06:00
9f9d0899bd Move Insertion-related code to edits/Insertion.jl 2023-01-04 11:56:45 -06:00
b36ab69c72 Move Deletion-related code to edits/Deletion.jl 2023-01-04 11:56:45 -06:00
ba9f0e8fb1 Move Substitution-related code to edits/Substitution.jl 2023-01-04 11:56:45 -06:00
8c2dd271f3 Move Variation-related code to Variation.jl 2023-01-04 11:56:45 -06:00
37965ac7eb Move Variant-related code to Variant.jl 2023-01-04 11:56:45 -06:00
3f631ace8f Move Edit-related code to Edit.jl 2023-01-04 11:56:45 -06:00
a6ad47a568 Declare Blue style formatting for all new documents 2023-01-04 11:56:45 -06:00
fb5b7dfacf Remove commented-out code 2023-01-04 11:56:45 -06:00
21 changed files with 898 additions and 531 deletions

1
.JuliaFormatter.toml Normal file
View file

@ -0,0 +1 @@
style = "blue"

View file

@ -19,8 +19,6 @@ jobs:
- x64
os:
- ubuntu-latest
- windows-latest
- macOS-latest
steps:
- name: Checkout repository

29
.github/workflows/linting.yml vendored Normal file
View file

@ -0,0 +1,29 @@
name: Code format checker
on:
push:
branches:
- master
pull_request:
jobs:
lint:
name: JuliaFormatter
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: julia-actions/setup-julia@v1
with:
version: "1"
- run: |
julia -e 'using Pkg; Pkg.add(PackageSpec(name="JuliaFormatter"))'
julia -e 'using JuliaFormatter; format(".", verbose=true)'
- run: |
julia -e '
out = Cmd(`git diff --name-only`) |> read |> String
if out == ""
exit(0)
else
@error "Some files have not been formatted !!!"
write(stdout, out)
exit(1)
end'

View file

@ -7,21 +7,34 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### [0.1.4] - 2022-12-17
### Added
- Tutorial-type documentation ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28))
### Changed
- Code now follows [Blue style](https://github.com/invenia/BlueStyle) ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28))
- :bomb: [BREAKING] Public and private API defined based on Blue style guidelines ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28))
### Removed
- Windows and MacOS CI tests ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28))
## [0.1.4] - 2022-12-17
### Fixed
- Soft clips at end of alignment causing invalid `Variant`s ([#25](https://github.com/BioJulia/SequenceVariation.jl/issues/25)/[#26](https://github.com/BioJulia/SequenceVariation.jl/pull/26))
### [0.1.3] - 2022-11-22
## [0.1.3] - 2022-11-22
## Changed
### Changed
- Variations getter now returns type-parameterized vector ([#23](https://github.com/BioJulia/SequenceVariation.jl/pull/23))
## [0.1.2] - 2022-10-04
## Changed
### Changed
- Updated dependency compats ([#21](https://github.com/BioJulia/SequenceVariation.jl/pull/21))
- BioAlignments: 2 -> 2,3

View file

@ -10,6 +10,7 @@ BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
[compat]
Aqua = "0.6.0"
BioAlignments = "2,3"
BioGenerics = "0.1"
BioSequences = "2,3"
@ -17,9 +18,10 @@ BioSymbols = "4,5"
julia = "1.6"
[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[targets]
test = ["Test", "BioSequences", "BioAlignments"]
test = ["Aqua", "Test", "BioSequences", "BioAlignments"]

View file

@ -30,6 +30,7 @@ SequenceVariation is tested against Julia `1.X` on Linux, OS X, and Windows.
[![Unit Tests](https://github.com/BioJulia/SequenceVariation.jl/actions/workflows/UnitTests.yml/badge.svg?branch=master)](https://github.com/BioJulia/SequenceVariation.jl/actions/workflows/UnitTests.yml)
[![Documentation](https://github.com/BioJulia/SequenceVariation.jl/workflows/Documentation/badge.svg?branch=master)](https://github.com/BioJulia/SequenceVariation.jl/actions?query=workflow%3ADocumentation+branch%3Amaster)
[![codecov](https://codecov.io/gh/BioJulia/SequenceVariation.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/BioJulia/SequenceVariation.jl)
[![Aqua QA](https://raw.githubusercontent.com/JuliaTesting/Aqua.jl/master/badge.svg)](https://github.com/JuliaTesting/Aqua.jl)
## Contributing

View file

@ -1,6 +1,16 @@
[deps]
BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
SequenceVariation = "eef6e190-9969-4f06-a38f-35a110a8fdc8"
[compat]
BioAlignments = "3"
BioSequences = "3"
BioSymbols = "5"
Documenter = "0.27"
Revise = "3.4"
julia = "1.6"

View file

@ -1,22 +1,31 @@
using Pkg
using Documenter
using SequenceVariation
using Revise
# see https://github.com/tlienart/LiveServer.jl/issues/140#issuecomment-1271591251
Revise.revise()
makedocs(;
checkdocs = :exports,
linkcheck = true,
sitename = "SequenceVariation.jl",
format = Documenter.HTML(),
modules = [SequenceVariation],
pages = [
checkdocs=:exports,
linkcheck=true,
sitename="SequenceVariation.jl",
format=Documenter.HTML(),
modules=[SequenceVariation],
pages=[
"Home" => "index.md",
"Working with variants" => "variants.md",
"Working with variations" => "variations.md",
"Comparing variations" => "compare.md",
"API Reference" => "api.md",
],
authors = replace(join(Pkg.TOML.parsefile("Project.toml")["authors"], ", "), r" <.*?>" => "" ) * ", The BioJulia Organisation, and other contributors."
authors=replace(
join(Pkg.TOML.parsefile("Project.toml")["authors"], ", "), r" <.*?>" => ""
) * ", The BioJulia Organisation, and other contributors.",
)
deploydocs(
repo = "github.com/BioJulia/SequenceVariation.jl.git",
devbranch = "master",
push_preview = true,
deploydocs(;
repo="github.com/BioJulia/SequenceVariation.jl.git",
devbranch="master",
push_preview=true,
)

View file

@ -7,6 +7,54 @@ end
# API Reference
```@autodocs
Modules = [SequenceVariation]
## Edits
```@docs
Substitution
Deletion
Insertion
```
## Variants
```@docs
Variant
reference(::Variant)
variations
reconstruct!
```
## Variations
```@docs
Variation
reference(::Variation)
mutation
translate
refbases
altbases
```
## Private API
### Edits
```@docs
Edit
_mutation
_lendiff
```
### Variants
```@docs
_edits
_is_valid(::Variant)
```
### Variations
```@docs
_edit
_is_valid(::Variation)
```

47
docs/src/compare.md Normal file
View file

@ -0,0 +1,47 @@
```@meta
CurrentModule = SequenceVariation
```
# Comparing variations in sequences
## Checking for variations in a known variant
Looking for a known [`Variation`](@ref) within a [`Variant`](@ref) is
efficiently accomplished using the `in` operator.
```@setup call_variants
using SequenceVariation, BioAlignments, BioSequences
bovine = dna"GACCGGCTGCATTCGAGGCTGCCAGCAAGCAG";
ovine = dna"GACCGGCTGCATTCGAGGCTGTCAGCAAACAG";
human = dna"GACAGGCTGCATCAGAAGAGGCCATCAAGCAG";
bos_ovis_alignment =
PairwiseAlignment(AlignedSequence(ovine, Alignment("32M", 1, 1)), bovine);
bos_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
bos_ovis_variant = Variant(bos_ovis_alignment)
bos_human_variant = Variant(bos_human_alignment)
```
```@example call_variants
println("\tOvis aires\tHomo sapiens")
for v in vcat(variations(bos_ovis_variant), variations(bos_human_variant))
is_sheep = v in bos_ovis_variant
is_human = v in bos_human_variant
println("$v\t$is_sheep\t\t$is_human")
end
```
## Constructing new variants based on other variations
New variants can be constructed using variations. This might be useful to pool
variations found on different reads or to filter variations from a variant
that aren't validated by another variant.
```@repl call_variants
sheeple = vcat(variations(bos_ovis_variant), variations(bos_human_variant));
Variant(bovine, sheeple)
reconstruct!(bovine, ans)
```

View file

@ -18,7 +18,7 @@ You can install SequenceVariation from the [Julia REPL](https://docs.julialang.o
Press `]` to enter [pkg mode](https://docs.julialang.org/en/v1/stdlib/Pkg/), and enter the following:
```julia
add SequenceVariation.jl
add SequenceVariation
```
## Testing
@ -30,6 +30,7 @@ SequenceVariation is tested against Julia `1.X` on Linux, OS X, and Windows.
[![Unit Tests](https://github.com/BioJulia/SequenceVariation.jl/actions/workflows/UnitTests.yml/badge.svg?branch=master)](https://github.com/BioJulia/SequenceVariation.jl/actions/workflows/UnitTests.yml)
[![Documentation](https://github.com/BioJulia/SequenceVariation.jl/workflows/Documentation/badge.svg?branch=master)](https://github.com/BioJulia/SequenceVariation.jl/actions?query=workflow%3ADocumentation+branch%3Amaster)
[![codecov](https://codecov.io/gh/BioJulia/SequenceVariation.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/BioJulia/SequenceVariation.jl)
[![Aqua QA](https://raw.githubusercontent.com/JuliaTesting/Aqua.jl/master/badge.svg)](https://github.com/JuliaTesting/Aqua.jl)
## Contributing

40
docs/src/variants.md Normal file
View file

@ -0,0 +1,40 @@
```@meta
CurrentModule = SequenceVariation
```
# Working with variants
## Calling variants
The first step in working with sequence variation is to identify (call)
variations. SequenceVariation can directly call variants using the
`Variant(::PairwiseAlignment)` constructor of the [`Variant`](@ref) type.
```@repl call_variants
using SequenceVariation, BioAlignments, BioSequences
bovine = dna"GACCGGCTGCATTCGAGGCTGCCAGCAAGCAG";
ovine = dna"GACCGGCTGCATTCGAGGCTGTCAGCAAACAG";
human = dna"GACAGGCTGCATCAGAAGAGGCCATCAAGCAG";
bos_ovis_alignment =
PairwiseAlignment(AlignedSequence(ovine, Alignment("32M", 1, 1)), bovine);
bos_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
bos_ovis_variant = Variant(bos_ovis_alignment)
bos_human_variant = Variant(bos_human_alignment)
```
## Sequence reconstruction
If the alternate sequence of a variant is no longer available (as is often the
case when calling variants from alignment files), then the sequence can be
retrieved using the [`reconstruct!`](@ref) function.
```@repl call_variants
human2 = copy(bovine);
reconstruct!(human2, bos_human_variant)
human2 == bovine
human2 == human
```

66
docs/src/variations.md Normal file
View file

@ -0,0 +1,66 @@
```@meta
CurrentModule = SequenceVariation
```
# Working with individual variations
## Construction
Individual [`Variation`](@ref)s can be made using a reference sequence and
string syntax
| Variation type | Syntax | Interpretation | Example |
|:--- |:--- |:--- |:--- |
| Substitutions | `<REF><POS><ALT>` | `<ALT>` is substituted for `<REF>` in position `<POS>` | `"G16C"` |
| Deletions | `Δ<START>-<END>` | All bases (inclusive) between `<START>` and `<END>` are deleted. It is valid to have `<START>` equal `<END>`: that is a deletion of one base. | `"Δ1-2"` |
| Insertions | `<POS><ALT>` | `<ALT>` is inserted between positions `<POS>` and `<POS>+1` | `"11T"` |
```@repl
using BioSequences: @dna_str
using SequenceVariation
bovine_ins = dna"GACCGGCTGCATTCGAGGCTGCCAGCAAGCAG"
Variation(bovine_ins, "C4A")
mutation(ans)
typeof(mutation(Variation(bovine_ins, "Δ13-14")))
mutation(Variation(bovine_ins, "25ACA"))
```
## Extraction
Sequence variations may also be extracted wholesale from a [`Variant`](@ref)
using the [`variations`](@ref) function.
```@setup call_variants
using SequenceVariation, BioAlignments, BioSequences
bovine = dna"GACCGGCTGCATTCGAGGCTGCCAGCAAGCAG";
ovine = dna"GACCGGCTGCATTCGAGGCTGTCAGCAAACAG";
human = dna"GACAGGCTGCATCAGAAGAGGCCATCAAGCAG";
bos_ovis_alignment =
PairwiseAlignment(AlignedSequence(ovine, Alignment("32M", 1, 1)), bovine);
bos_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
bos_ovis_variant = Variant(bos_ovis_alignment)
bos_human_variant = Variant(bos_human_alignment)
```
```@repl call_variants
variations(bos_ovis_variant)
variations(bos_human_variant)
```
## Reference switching
An individual variation can be mapped to a new reference sequence given an
alignment between the new and old references using the [`translate`](@ref).
```@repl call_variants
ovis_human_alignment =
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), ovine)
human_variation = first(variations(bos_ovis_variant))
reference(ans) == bovine
SequenceVariation.translate(human_variation, ovis_human_alignment)
reference(ans) == bovine
```

75
src/Edit.jl Normal file
View file

@ -0,0 +1,75 @@
include("edits/Substitution.jl")
include("edits/Deletion.jl")
include("edits/Insertion.jl")
"""
Edit{S <: BioSequence, T <: BioSymbol}
An edit of either `Substitution{T}`, `Insertion{S}` or `Deletion` at a position.
If deletion: Deletion of length L at ref pos `pos:pos+L-1`
If insertion: Insertion of length L b/w ref pos `pos:pos+1`
"""
struct Edit{S<:BioSequence,T<:BioSymbol}
x::Union{Substitution{T},Deletion,Insertion{S}}
pos::UInt
end
Base.length(e::Edit) = length(_mutation(e))
Base.:(==)(e1::Edit, e2::Edit) = e1.pos == e2.pos && e1.x == e2.x
Base.hash(x::Edit, h::UInt) = hash(Edit, hash((x.x, x.pos), h))
function Base.parse(::Type{T}, s::AbstractString) where {T<:Edit{Se,Sy}} where {Se,Sy}
return parse(T, String(s))
end
function Base.parse(::Type{<:Edit{Se,Sy}}, s::Union{String,SubString{String}}) where {Se,Sy}
# Either "Δ1-2", "11T" or "G16C"
if (m = match(r"^Δ(\d+)-(\d+)$", s); m) !== nothing
pos = parse(UInt, m[1])
stop = parse(UInt, m[2])
stop pos || throw(ArgumentError("Non-positive deletion length: \"" * s * "\""))
Edit{Se,Sy}(Deletion(stop - pos + 1), pos)
elseif (m = match(r"^(\d+)([A-Za-z]+)$", s); m) !== nothing
pos = parse(UInt, m[1])
seq = Se(m[2])
Edit{Se,Sy}(Insertion(seq), pos)
elseif (m = match(r"^[A-Za-z](\d+)([A-Za-z])$", s); m) !== nothing
pos = parse(UInt, m[1])
sym = Sy(first(m[2]))
Edit{Se,Sy}(Substitution(sym), pos)
else
throw(ArgumentError("Failed to parse edit \"" * s * '"'))
end
end
"""
_mutation(e::Edit)
Returns the underlying [`Substitution`](@ref), [`Insertion`](@ref), or [`Deletion`](@ref) of
`e`.
"""
_mutation(e::Edit) = e.x
BioGenerics.leftposition(e::Edit) = e.pos
function BioGenerics.rightposition(e::Edit)
if _mutation(e) isa Substitution
return leftposition(e)
elseif _mutation(e) isa Insertion
return leftposition(e) + 1
elseif _mutation(e) isa Deletion
return leftposition(e) + length(e) - 1
else
error("Unknown mutation type $(typeof(_mutation(e)))")
end
end
"""
_lendiff(edit::Edit)
Gets the number of bases that `edit` adds to the reference sequence
"""
function _lendiff(edit::Edit)
x = _mutation(edit)
# Each edit type has logic for its length, we just need to know what direction to go
multiplier = x isa Substitution ? 0 : (x isa Deletion ? -1 : 1)
return length(x) * multiplier
end

View file

@ -20,519 +20,32 @@ TODO now:
* Add tests
"""
using BioAlignments: BioAlignments, PairwiseAlignment, OP_SOFT_CLIP
using BioAlignments: BioAlignments, PairwiseAlignment, OP_SOFT_CLIP, sequence
using BioGenerics: BioGenerics, leftposition, rightposition
using BioSequences: BioSequences, BioSequence, NucleotideSeq, LongSequence, isgap
using BioSymbols: BioSymbol
export Deletion
export Insertion
export Substitution
export Variant
export Variation
export altbases
export mutation
export reconstruct!
export refbases
export reference
export translate
export variations
const BA = BioAlignments
const BS = BioSequences
#=
import Automa
import Automa.RegExp: @re_str
=#
struct Unsafe end
struct Inapplicable end
#=
const CONTEXT = Automa.CodeGenContext(
vars=Automa.Variables(:p, :p_end, :p_eof, :ts, :te, :cs, :data, :mem, :byte),
generator=:goto,
checkbounds=false
)
const BYTES = Union{String, SubString{String}, Vector{UInt8}}
=#
"""
Substitution
Represents the presence of a `T` at a given position. The position is stored
outside this struct.
"""
struct Substitution{T <: BioSymbol}
x::T
end
Base.:(==)(x::Substitution, y::Substitution) = x.x == y.x
Base.hash(x::Substitution, h::UInt) = hash(Substitution, hash(x.x, h))
"""
Deletion
Represents the deletion of N symbols. The location of the deletion is stored
outside this struct
"""
struct Deletion
len::UInt
function Deletion(len::UInt)
iszero(len) && error("Deletion must be at least 1 symbol")
new(len)
end
end
Deletion(x::Integer) = Deletion(convert(UInt, x))
Base.length(x::Deletion) = Int(x.len)
Base.hash(x::Deletion, h::UInt) = hash(Deletion, hash(x.len, h))
"""
Insertion{S <: BioSequence}
Represents the insertion of a `S` into a sequence. The location of the insertion
is stored outside the struct.
"""
struct Insertion{S <: BioSequence}
seq::S
function Insertion{S}(x::S) where {S <: BioSequence}
isempty(x) && error("Insertion must be at least 1 symbol")
new(x)
end
end
Insertion(s::BioSequence) = Insertion{typeof(s)}(s)
Base.length(x::Insertion) = length(x.seq)
Base.:(==)(x::Insertion, y::Insertion) = x.seq == y.seq
Base.hash(x::Insertion, h::UInt) = hash(Insertion, hash(x.seq, h))
"""
Edit{S <: BioSequence, T <: BioSymbol}
An edit of either `Substitution{T}`, `Insertion{S}` or `Deletion` at a position.
If deletion: Deletion of length L at ref pos `pos:pos+L-1`
If insertion: Insertion of length L b/w ref pos `pos:pos+1`
"""
struct Edit{S <: BioSequence, T <: BioSymbol}
x::Union{Substitution{T}, Deletion, Insertion{S}}
pos::UInt
end
Base.:(==)(e1::Edit, e2::Edit) = e1.pos == e2.pos && e1.x == e2.x
Base.hash(x::Edit, h::UInt) = hash(Edit, hash((x.x, x.pos), h))
Base.length(e::Edit) = e isa Substitution ? 1 : length(mutation(e))
function Base.parse(::Type{T}, s::AbstractString) where {T <: Edit{Se, Sy}} where {Se, Sy}
parse(T, String(s))
end
function Base.parse(::Type{<:Edit{Se, Sy}}, s::Union{String, SubString{String}}) where {Se, Sy}
# Either "Δ1-2", "11T" or "G16C"
if (m = match(r"^Δ(\d+)-(\d+)$", s); m) !== nothing
pos = parse(UInt, m[1])
stop = parse(UInt, m[2])
stop pos || throw(ArgumentError("Non-positive deletion length: \"" * s * "\""))
Edit{Se, Sy}(Deletion(stop - pos + 1), pos)
elseif (m = match(r"^(\d+)([A-Za-z]+)$", s); m) !== nothing
pos = parse(UInt, m[1])
seq = Se(m[2])
Edit{Se, Sy}(Insertion(seq), pos)
elseif (m = match(r"^[A-Za-z](\d+)([A-Za-z])$", s); m) !== nothing
pos = parse(UInt, m[1])
sym = Sy(first(m[2]))
Edit{Se, Sy}(Substitution(sym), pos)
else
throw(ArgumentError("Failed to parse edit \"" * s * '"'))
end
end
mutation(e::Edit) = e.x
BioGenerics.leftposition(e::Edit) = e.pos
function BioGenerics.rightposition(e::Edit)
if mutation(e) isa Substitution
return leftposition(e)
elseif mutation(e) isa Insertion
return leftposition(e) + 1
elseif mutation(e) isa Deletion
return leftposition(e) + length(e) - 1
else
error("Unknown mutation type $(typeof(mutation(e)))")
end
end
#=
@noinline throw_parse_error(T, p::Integer) = error("Failed to parse $T at byte $p")
# Parse substitution
let
machine = let
biosymbol = re"[A-Za-z]"
number = re"[0-9]+"
biosymbol.actions[:enter] = [:enter]
number.actions[:all] = [:digit]
Automa.compile(biosymbol * number * biosymbol)
end
actions = Dict(
:enter => quote
symbol = T(Char(byte))
if firstsymbol === nothing
firstsymbol = symbol
else
lastsymbol = symbol
end
end,
:digit => :(num = UInt(10)*num + (byte - 0x30) % UInt),
)
@eval begin
function Base.parse(::Type{Edit{S, T}}, data::BYTES) where {S, T}
$(Automa.generate_init_code(CONTEXT, machine))
p_eof = p_end = sizeof(data)
firstsymbol = lastsymbol = nothing
num = UInt(0)
$(Automa.generate_exec_code(CONTEXT, machine, actions))
iszero(cs) || throw_parse_error(Edit{S, T}, p)
if firstsymbol == lastsymbol
error("First symbol and last symbol are identical")
end
return Edit{S, T}(Substitution{T}(lastsymbol), num)
end
end
end
=#
# Edits are applied sequentially from first to last pos.
# The vector must always be sorted by pos.
struct Variant{S <: BioSequence, T <: BioSymbol}
ref::S
edits::Vector{Edit{S, T}}
Variant{S, T}(ref::S, edits::Vector{Edit{S, T}}, ::Unsafe) where {S, T} = new(ref, edits)
end
function Variant{S,T}(ref::S, edits::Vector{Edit{S, T}}) where {S<:BioSequence, T<:BioSymbol}
sort!(edits, by=x -> x.pos)
result = Variant{S, T}(ref, edits, Unsafe())
is_valid(result) || error("TODO") # report what kind of error message?
return result
end
function Variant(ref::S, edits::Vector{Edit{S, T}}) where {S<:BioSequence, T<:BioSymbol}
Variant{S, T}(ref, edits)
end
function Base.show(io::IO, x::Variant)
n = length(x.edits)
print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):")
for i in x.edits
v = Variation(x.ref, i)
print(io, "\n ")
show(io, v)
end
end
# Validate:
# A sequence is invalid if any of its operations are out of bounds, or the same position
# is affected by multiple edits.
function is_valid(v::Variant)
isempty(v.ref) && return false
valid_positions = 1:length(v.ref)
last_was_insert = false
for edit in v.edits
pos = edit.pos
op = edit.x
# Sanity check: for this to be a valid variant, it must be comprised of valid
# variations
is_valid(Variation(v.ref, edit)) || return false
# For substitutions we simply do not allow another modification of the same base
if op isa Substitution
pos in valid_positions || return false
valid_positions = first(valid_positions) + 1 : last(valid_positions)
last_was_insert = false
# Insertions affect 0 reference bases, so it does not modify the valid positions
# for next op. However, we cannot have two insertions at the same position, because
# then the order of them is ambiguous
elseif op isa Insertion
pos in (first(valid_positions)-1+last_was_insert:last(valid_positions)+1) || return false
last_was_insert = true
# Deletions obviously invalidate the reference bases that are deleted.
elseif op isa Deletion
len = length(op)
pos in (first(valid_positions):last(valid_positions)-len+1) || return false
valid_positions = first(valid_positions) + len : last(valid_positions)
last_was_insert = false
end
end
return true
end
function Variant(aln::PairwiseAlignment{T, T}) where {T <: LongSequence{<:Union{BS.AminoAcidAlphabet, BS.NucleicAcidAlphabet}}}
ref = aln.b
E = eltype(typeof(ref))
edits = Edit{T, E}[]
refpos = first(aln.a.aln.anchors).refpos
seqpos = first(aln.a.aln.anchors).seqpos
markpos = 0
n_gaps = n_ins = 0
insertion_buffer = E[]
for (seqi, refi) in aln
isgap(refi) || (refpos += 1)
isgap(seqi) || (seqpos += 1)
# Check for deletions
if isgap(seqi)
iszero(n_gaps) && (markpos = refpos)
n_gaps += 1
elseif !iszero(n_gaps)
push!(edits, Edit{T, E}(Deletion(UInt(n_gaps)), UInt(markpos)))
n_gaps = 0
end
# Check for insertions
if isgap(refi)
iszero(n_ins) && (markpos = refpos + 1)
push!(insertion_buffer, seqi)
n_ins += 1
elseif !iszero(n_ins)
seq = T(insertion_buffer)
push!(edits, Edit{T, E}(Insertion(seq), UInt(markpos)))
empty!(insertion_buffer)
n_ins = 0
end
# Substitutions
if !isgap(refi) && !isgap(seqi) && seqi != refi
push!(edits, Edit{T, E}(Substitution{E}(seqi), UInt(refpos)))
end
end
# Check for clips at the end of the alignment
last_anchors = aln.a.aln.anchors[end-1:end]
# Final indel, if applicable
if !any(anchor -> anchor.op == OP_SOFT_CLIP, last_anchors)
if !iszero(n_gaps)
push!(edits, Edit{T, E}(Deletion(UInt(n_gaps)), UInt(markpos)))
elseif !iszero(n_ins)
push!(edits, Edit{T, E}(Insertion(T(insertion_buffer)), UInt(markpos)))
end
end
return Variant(ref, edits)
end
edits(v::Variant) = v.edits
reference(v::Variant) = v.ref
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits
function lendiff(edit::Edit)
x = edit.x
x isa Substitution ? 0 : (x isa Deletion ? -length(x) : length(x.x))
end
function reconstruct!(seq::S, x::Variant{S}) where S
len = length(x.ref) + sum(edit -> lendiff(edit), x.edits)
resize!(seq, len % UInt)
refpos = seqpos = 1
for edit in x.edits
while refpos < edit.pos
seq[seqpos] = x.ref[refpos]
refpos += 1
seqpos += 1
end
editx = edit.x
if editx isa Substitution
seq[seqpos] = editx.x
seqpos += 1
refpos += 1
elseif editx isa Deletion
refpos += editx.len
elseif editx isa Insertion
for i in editx.x
seq[seqpos] = i
seqpos += 1
end
end
end
while seqpos length(seq)
seq[seqpos] = x.ref[refpos]
refpos += 1
seqpos += 1
end
seq
end
struct Variation{S <: BioSequence, T <: BioSymbol}
ref::S
edit::Edit{S, T}
function Variation{S, T}(ref::S, e::Edit{S, T}, ::Unsafe) where {S <: BioSequence, T <: BioSymbol}
new(ref, e)
end
end
function Variation{S, T}(ref::S, e::Edit{S, T}) where {S <: BioSequence, T <: BioSymbol}
v = Variation{S, T}(ref, e, Unsafe())
is_valid(v) ? v : throw(ArgumentError("Invalid variant"))
end
Variation(ref::S, edit::Edit{S, T}) where {S, T} = Variation{S, T}(ref, edit)
function Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
T = eltype(ref)
e = parse(Edit{S,T}, edit)
return Variation{S,T}(ref, e)
end
function Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence, T<:BioSymbol}
edits = edit.(vars)
return Variant{S, T}(ref, edits)
end
reference(v::Variation) = v.ref
edit(v::Variation) = v.edit
mutation(v::Variation) = mutation(edit(v))
BioGenerics.leftposition(v::Variation) = leftposition(edit(v))
BioGenerics.rightposition(v::Variation) = rightposition(edit(v))
Base.:(==)(x::Variation, y::Variation) = x.ref == y.ref && x.edit == y.edit
Base.hash(x::Variation, h::UInt) = hash(Variation, hash((x.ref, x.edit), h))
function is_valid(v::Variation)
isempty(v.ref) && return false
op = v.edit.x
pos = v.edit.pos
if op isa Substitution
return pos in eachindex(v.ref)
elseif op isa Insertion
return pos in 0:lastindex(v.ref)+1
elseif op isa Deletion
return pos in 1:(lastindex(v.ref)-length(op) + 1)
end
end
function Base.show(io::IO, x::Variation)
content = x.edit.x
pos = x.edit.pos
if content isa Substitution
print(io, x.ref[pos], pos, content.x)
elseif content isa Deletion
print(io, 'Δ', pos, '-', pos + content.len - 1)
elseif content isa Insertion
print(io, pos, content.seq)
else
print(io, pos, content.x)
end
end
function Base.in(v::Variation, var::Variant)
if v.ref != var.ref
error("References must be equal")
end
any(v.edit == edit for edit in var.edits)
end
function translate(var::Variation{S, T}, aln::PairwiseAlignment{S, S}) where {S, T}
kind = var.edit.x
pos = var.edit.pos
seq, ref = aln.seq, aln.b
# Special case: Insertions may have a pos of 0, which cannot be mapped to
# the seq using ref2seq
if iszero(pos)
(s, r), _ = iterate(aln)
(isgap(s) | isgap(r)) && return Inapplicable()
return Variation{S, T}(seq, Edit{S, T}(Insertion(var.edit.x), 0))
end
(seqpos, op) = BA.ref2seq(aln, pos)
if kind isa Substitution
# If it's a substitution, return nothing if it maps to a deleted
# position, or substitutes to same base.
op in (BA.OP_MATCH, BA.OP_SEQ_MATCH, BA.OP_SEQ_MISMATCH) || return nothing
seq[seqpos] == kind.x && return nothing
edit = Edit{S, T}(kind, seqpos)
return Variation{S, T}(seq, edit, Unsafe())
elseif kind isa Deletion
# If it's a deletion, return nothing if the deleted part is already missing
# from the new reference.
(stop, op2) = BA.ref2seq(aln, pos + length(kind) - 1)
start = seqpos + op == BA.OP_DELETE
start < stop && return nothing
edit = Edit{S, T}(Deletion(stop - start + 1), start)
return Variation{S, T}(seq, edit, Unsafe())
else
# If it maps directly to a symbol, just insert
if op in (BA.OP_MATCH, BA.OP_SEQ_MATCH, BA.OP_SEQ_MISMATCH)
# This happens if there is already an insertion at the position
if pos != lastindex(ref) && first(ref2seq(aln, pos+1)) != seqpos + 1
return Inapplicable()
else
edit = Edit{S, T}(Insertion(var.edit.x), seqpos)
return Variation{S, T}(seq, edit, Unsafe())
end
# Alternatively, it can map to a deletion. In that case, it become really
# tricky to talk about the "same" insertion.
else
return Inapplicable()
end
end
end
function variations(v::Variant{S,T}) where {S,T}
vs = Vector{Variation{S,T}}(undef, length(edits(v)))
for (i, e) in enumerate(edits(v))
vs[i] = Variation{S,T}(reference(v), e)
end
return vs
end
function _refbases(s::Substitution, reference::S, pos::UInt) where S <: BioSequence
return S([reference[pos]])
end
function _altbases(s::Substitution, reference::S, pos::UInt) where S <: BioSequence
return S([s.x])
end
function _refbases(d::Deletion, reference::S, pos::UInt) where S <: BioSequence
if pos == 1
return S(reference[UnitRange{Int}(pos, pos+length(d))])
else
return S(reference[UnitRange{Int}(pos-1, pos+length(d)-1)])
end
end
function _altbases(d::Deletion, reference::S, pos::UInt) where S <: BioSequence
if pos == 1
return S([reference[pos+1]])
else
return S([reference[pos-1]])
end
end
function _refbases(i::Insertion, reference::S, pos::UInt) where S <: BioSequence
return S([reference[pos]])
end
function _altbases(i::Insertion, reference::S, pos::UInt) where S <: BioSequence
if pos == 1
return S([i.seq..., reference[pos]])
else
return S([reference[pos], i.seq...])
end
end
function refbases(v::Variation)
return _refbases(mutation(v), reference(v), leftposition(v))
end
function altbases(v::Variation)
return _altbases(mutation(v), reference(v), leftposition(v))
end
export Insertion,
Deletion,
Substitution,
Variant,
Variation,
reference,
mutation,
variations,
refbases,
altbases
include("Edit.jl")
include("Variant.jl")
include("Variation.jl")
end # module

197
src/Variant.jl Normal file
View file

@ -0,0 +1,197 @@
"""
Variant{S<:BioSequence,T<:BioSymbol}
A set of variations within a given sequence that are all found together. Depending on the
field, it might also be referred to as a "genotype," "haplotype," or "strain."
# Constructors
Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
Variant(
aln::PairwiseAlignment{T,T}
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
When constructing a `Variant` from a vector of [`Edit`](@ref)s or [`Variation`](@ref)s, the
edits are applied sequentially from first to last position, therefore the vector must always
be sorted by position. These edits are sorted automatically if constructing from an
alignment.
"""
struct Variant{S<:BioSequence,T<:BioSymbol}
ref::S
edits::Vector{Edit{S,T}}
Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits)
end
function Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
sort!(edits; by=x -> x.pos)
result = Variant{S,T}(ref, edits, Unsafe())
_is_valid(result) || error("TODO") # report what kind of error message?
return result
end
function Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
return Variant{S,T}(ref, edits)
end
function Base.show(io::IO, x::Variant)
n = length(x.edits)
print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):")
for i in x.edits
v = Variation(x.ref, i)
print(io, "\n ")
show(io, v)
end
end
"""
is_valid(v::Variant)
Validate `v`. `v` is invalid if any of its operations are out of bounds, or the same
position is affected by multiple edits.
"""
function _is_valid(v::Variant)
isempty(v.ref) && return false
valid_positions = 1:length(v.ref)
last_was_insert = false
for edit in v.edits
pos = edit.pos
op = edit.x
# Sanity check: for this to be a valid variant, it must be comprised of valid
# variations
_is_valid(Variation(v.ref, edit)) || return false
# For substitutions we simply do not allow another modification of the same base
if op isa Substitution
pos in valid_positions || return false
valid_positions = (first(valid_positions) + 1):last(valid_positions)
last_was_insert = false
# Insertions affect 0 reference bases, so it does not modify the valid positions
# for next op. However, we cannot have two insertions at the same position, because
# then the order of them is ambiguous
elseif op isa Insertion
pos in
((first(valid_positions) - 1 + last_was_insert):(last(valid_positions) + 1)) ||
return false
last_was_insert = true
# Deletions obviously invalidate the reference bases that are deleted.
elseif op isa Deletion
len = length(op)
pos in (first(valid_positions):(last(valid_positions) - len + 1)) ||
return false
valid_positions = (first(valid_positions) + len):last(valid_positions)
last_was_insert = false
end
end
return true
end
function Variant(
aln::PairwiseAlignment{T,T}
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
ref = aln.b
E = eltype(typeof(ref))
edits = Edit{T,E}[]
refpos = first(aln.a.aln.anchors).refpos
seqpos = first(aln.a.aln.anchors).seqpos
markpos = 0
n_gaps = n_ins = 0
insertion_buffer = E[]
for (seqi, refi) in aln
isgap(refi) || (refpos += 1)
isgap(seqi) || (seqpos += 1)
# Check for deletions
if isgap(seqi)
iszero(n_gaps) && (markpos = refpos)
n_gaps += 1
elseif !iszero(n_gaps)
push!(edits, Edit{T,E}(Deletion(UInt(n_gaps)), UInt(markpos)))
n_gaps = 0
end
# Check for insertions
if isgap(refi)
iszero(n_ins) && (markpos = refpos + 1)
push!(insertion_buffer, seqi)
n_ins += 1
elseif !iszero(n_ins)
seq = T(insertion_buffer)
push!(edits, Edit{T,E}(Insertion(seq), UInt(markpos)))
empty!(insertion_buffer)
n_ins = 0
end
# Substitutions
if !isgap(refi) && !isgap(seqi) && seqi != refi
push!(edits, Edit{T,E}(Substitution{E}(seqi), UInt(refpos)))
end
end
# Check for clips at the end of the alignment
last_anchors = aln.a.aln.anchors[(end - 1):end]
# Final indel, if applicable
if !any(anchor -> anchor.op == OP_SOFT_CLIP, last_anchors)
if !iszero(n_gaps)
push!(edits, Edit{T,E}(Deletion(UInt(n_gaps)), UInt(markpos)))
elseif !iszero(n_ins)
push!(edits, Edit{T,E}(Insertion(T(insertion_buffer)), UInt(markpos)))
end
end
return Variant(ref, edits)
end
"""
_edits(v::Variant)
Gets the [`Edit`](@ref)s that comprise `v`
"""
_edits(v::Variant) = v.edits
"""
reference(v::Variant)
Gets the reference sequence of `v`.
"""
reference(v::Variant) = v.ref
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits
"""
reconstruct!(seq::S, x::Variant{S}) where {S}
Apply the edits in `x` to `seq` and return the mutated sequence
"""
function reconstruct!(seq::S, x::Variant{S}) where {S}
len = length(x.ref) + sum(edit -> _lendiff(edit), _edits(x))
resize!(seq, len % UInt)
refpos = seqpos = 1
for edit in x.edits
while refpos < edit.pos
seq[seqpos] = x.ref[refpos]
refpos += 1
seqpos += 1
end
editx = edit.x
if editx isa Substitution
seq[seqpos] = editx.x
seqpos += 1
refpos += 1
elseif editx isa Deletion
refpos += editx.len
elseif editx isa Insertion
for i in editx.x
seq[seqpos] = i
seqpos += 1
end
end
end
while seqpos length(seq)
seq[seqpos] = x.ref[refpos]
refpos += 1
seqpos += 1
end
return seq
end

206
src/Variation.jl Normal file
View file

@ -0,0 +1,206 @@
"""
Variation{S<:BioSequence,T<:BioSymbol}
A single change to a biological sequence. A general wrapper that can represent a
sequence-specific [`Substitution`](@ref), [`Deletion`](@ref) or [`Insertion`](@ref).
`Variation` is more robust than [`Edit`](@ref), due to inclusion of the reference sequence
and built-in validation.
# Constructors
Variation(ref::S, e::Edit{S,T}) where {S<:BioSequence,T<:BioSymbol}
Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
Generally speaking, the `Edit` constructor should be avoided to ensure corectness: use of
[`variations(::Variant)`](@ref) is encouraged, instead.
Constructing a `Variation` from an `AbstractString` will parse the from `edit` using the
following syntax:
- Substitution: `"<REFBASE><POS><ALTBASE>"`, e.g. `"G16C"`
- Deletion: `"Δ<STARTPOS>-<ENDPOS>"`, e.g. `"Δ1-2"`
- Insertion: `"<POS><ALTBASES>"`, e.g. `"11T"`
"""
struct Variation{S<:BioSequence,T<:BioSymbol}
ref::S
edit::Edit{S,T}
function Variation{S,T}(
ref::S, e::Edit{S,T}, ::Unsafe
) where {S<:BioSequence,T<:BioSymbol}
return new(ref, e)
end
end
function Variation{S,T}(ref::S, e::Edit{S,T}) where {S<:BioSequence,T<:BioSymbol}
v = Variation{S,T}(ref, e, Unsafe())
return _is_valid(v) ? v : throw(ArgumentError("Invalid variant"))
end
Variation(ref::S, edit::Edit{S,T}) where {S,T} = Variation{S,T}(ref, edit)
function Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
T = eltype(ref)
e = parse(Edit{S,T}, edit)
return Variation{S,T}(ref, e)
end
function Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
edits = _edit.(vars)
return Variant{S,T}(ref, edits)
end
"""
reference(v::Variation)
Gets the reference sequence of `v`
"""
reference(v::Variation) = v.ref
"""
_edit(v::Variation)
Gets the underlying [`Edit`](@ref) of `v`
"""
_edit(v::Variation) = v.edit
"""
mutation(v::Variation)
Gets the underlying [`Substitution`](@ref), [`Insertion`](@ref), or [`Deletion`](@ref) of
`v`.
"""
mutation(v::Variation) = _mutation(_edit(v))
BioGenerics.leftposition(v::Variation) = leftposition(_edit(v))
BioGenerics.rightposition(v::Variation) = rightposition(_edit(v))
Base.:(==)(x::Variation, y::Variation) = x.ref == y.ref && x.edit == y.edit
Base.hash(x::Variation, h::UInt) = hash(Variation, hash((x.ref, x.edit), h))
"""
_is_valid(v::Variation)
Validate `v`. `v` is invalid if its opertation is out of bounds.
"""
function _is_valid(v::Variation)
isempty(v.ref) && return false
op = v.edit.x
pos = v.edit.pos
if op isa Substitution
return pos in eachindex(v.ref)
elseif op isa Insertion
return pos in 0:(lastindex(v.ref) + 1)
elseif op isa Deletion
return pos in 1:(lastindex(v.ref) - length(op) + 1)
end
end
function Base.show(io::IO, x::Variation)
content = x.edit.x
pos = x.edit.pos
if content isa Substitution
print(io, x.ref[pos], pos, content.x)
elseif content isa Deletion
print(io, 'Δ', pos, '-', pos + content.len - 1)
elseif content isa Insertion
print(io, pos, content.seq)
else
print(io, pos, content.x)
end
end
function Base.in(v::Variation, var::Variant)
if v.ref != var.ref
error("References must be equal")
end
return any(v.edit == edit for edit in var.edits)
end
"""
translate(var::Variation{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}
Convert the difference in `var` to a new reference sequence based upon `aln`. `aln` is the
alignment of the old reference (`aln.b`) and the new reference sequence (`aln.seq`). Returns
the new [`Variation`](@ref).
"""
function translate(var::Variation{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}
kind = mutation(var)
pos = leftposition(var)
seq = sequence(aln)
ref = aln.b
# Special case: Insertions may have a pos of 0, which cannot be mapped to
# the seq using ref2seq
if iszero(pos)
(s, r), _ = iterate(aln)
(isgap(s) | isgap(r)) && return Inapplicable()
return Variation{S,T}(seq, Edit{S,T}(Insertion(var.edit.x), 0))
end
(seqpos, op) = BA.ref2seq(aln, pos)
if kind isa Substitution
# If it's a substitution, return nothing if it maps to a deleted
# position, or substitutes to same base.
op in (BA.OP_MATCH, BA.OP_SEQ_MATCH, BA.OP_SEQ_MISMATCH) || return nothing
seq[seqpos] == kind.x && return nothing
edit = Edit{S,T}(kind, seqpos)
return Variation{S,T}(seq, edit, Unsafe())
elseif kind isa Deletion
# If it's a deletion, return nothing if the deleted part is already missing
# from the new reference.
(stop, op2) = BA.ref2seq(aln, pos + length(kind) - 1)
start = seqpos + op == BA.OP_DELETE
start < stop && return nothing
edit = Edit{S,T}(Deletion(stop - start + 1), start)
return Variation{S,T}(seq, edit, Unsafe())
else
# If it maps directly to a symbol, just insert
if op in (BA.OP_MATCH, BA.OP_SEQ_MATCH, BA.OP_SEQ_MISMATCH)
# This happens if there is already an insertion at the position
if pos != lastindex(ref) && first(ref2seq(aln, pos + 1)) != seqpos + 1
return Inapplicable()
else
edit = Edit{S,T}(Insertion(var.edit.x), seqpos)
return Variation{S,T}(seq, edit, Unsafe())
end
# Alternatively, it can map to a deletion. In that case, it become really
# tricky to talk about the "same" insertion.
else
return Inapplicable()
end
end
end
"""
variations(v::Variant{S,T}) where {S,T}
Converts the [`Edit`](@ref)s of `v` into a vector of [`Variation`](@ref)s.
"""
function variations(v::Variant{S,T}) where {S,T}
vs = Vector{Variation{S,T}}(undef, length(_edits(v)))
for (i, e) in enumerate(_edits(v))
vs[i] = Variation{S,T}(reference(v), e)
end
return vs
end
"""
refbases(v::Variation)
Get the reference bases of `v`. Note that for deletions, `refbases` also returns the base
_before_ the deletion in accordance with the `REF` field of the
[VCF v4 specification](https://samtools.github.io/hts-specs/VCFv4.3.pdf).
"""
function refbases(v::Variation)
return _refbases(mutation(v), reference(v), leftposition(v))
end
"""
altbases(v::Variation)
Get the alternate bases of `v`. Note that for insertions, `altbases` also returns the base
_before_ the insertion in accordance with the `ALT` field of the
[VCF v4 specification](https://samtools.github.io/hts-specs/VCFv4.3.pdf).
"""
function altbases(v::Variation)
return _altbases(mutation(v), reference(v), leftposition(v))
end

35
src/edits/Deletion.jl Normal file
View file

@ -0,0 +1,35 @@
"""
Deletion
Represents the deletion of N symbols. The location of the deletion is stored
outside this struct
"""
struct Deletion
len::UInt
function Deletion(len::UInt)
iszero(len) && error("Deletion must be at least 1 symbol")
return new(len)
end
end
Deletion(x::Integer) = Deletion(convert(UInt, x))
Base.length(x::Deletion) = Int(x.len)
Base.:(==)(x::Substitution, y::Substitution) = length(x) == length(y)
Base.hash(x::Deletion, h::UInt) = hash(Deletion, hash(x.len, h))
function _refbases(d::Deletion, reference::S, pos::UInt) where {S<:BioSequence}
if pos == 1
return S(reference[UnitRange{Int}(pos, pos + length(d))])
else
return S(reference[UnitRange{Int}(pos - 1, pos + length(d) - 1)])
end
end
function _altbases(::Deletion, reference::S, pos::UInt) where {S<:BioSequence}
if pos == 1
return S([reference[pos + 1]])
else
return S([reference[pos - 1]])
end
end

31
src/edits/Insertion.jl Normal file
View file

@ -0,0 +1,31 @@
"""
Insertion{S <: BioSequence}
Represents the insertion of a `S` into a sequence. The location of the insertion
is stored outside the struct.
"""
struct Insertion{S<:BioSequence}
seq::S
function Insertion{S}(x::S) where {S<:BioSequence}
isempty(x) && error("Insertion must be at least 1 symbol")
return new(x)
end
end
Insertion(s::BioSequence) = Insertion{typeof(s)}(s)
Base.length(x::Insertion) = length(x.seq)
Base.:(==)(x::Insertion, y::Insertion) = x.seq == y.seq
Base.hash(x::Insertion, h::UInt) = hash(Insertion, hash(x.seq, h))
function _refbases(::Insertion, reference::S, pos::UInt) where {S<:BioSequence}
return S([reference[pos]])
end
function _altbases(i::Insertion, reference::S, pos::UInt) where {S<:BioSequence}
if pos == 1
return S([i.seq..., reference[pos]])
else
return S([reference[pos], i.seq...])
end
end

21
src/edits/Substitution.jl Normal file
View file

@ -0,0 +1,21 @@
"""
Substitution
Represents the presence of a `T` at a given position. The position is stored
outside this struct.
"""
struct Substitution{T<:BioSymbol}
x::T
end
Base.length(::Substitution) = 1
Base.:(==)(x::Substitution, y::Substitution) = x.x == y.x
Base.hash(x::Substitution, h::UInt) = hash(Substitution, hash(x.x, h))
function _refbases(::Substitution, reference::S, pos::UInt) where {S<:BioSequence}
return S([reference[pos]])
end
function _altbases(s::Substitution, ::S, pos::UInt) where {S<:BioSequence}
return S([s.x])
end

View file

@ -23,12 +23,13 @@ TODO now:
* Add tests
"""
using Aqua
using BioAlignments
using BioSequences
using SequenceVariation
using Test
const DNA_MODEL = BioAlignments.AffineGapScoreModel(EDNAFULL, gap_open=-25, gap_extend=-2)
const DNA_MODEL = BioAlignments.AffineGapScoreModel(EDNAFULL; gap_open=-25, gap_extend=-2)
align(a::BioSequence, b::BioSequence) = pairalign(GlobalAlignment(), a, b, DNA_MODEL).aln
seq1 = ungap!(dna"--ATGCGTGTTAGCAAC--TTATCGCG")
@ -111,17 +112,40 @@ end
refseq = dna"GATTACA"
mutseq = dna"GATTACAAAA"
refvar = Variant(refseq, SequenceVariation.Edit{typeof(refseq), eltype(refseq)}[])
refvar = Variant(refseq, SequenceVariation.Edit{typeof(refseq),eltype(refseq)}[])
# Test for ending soft clip
@test Variant(PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S", 1, 1)), refseq)) == refvar
@test Variant(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S", 1, 1)), refseq)
) == refvar
# Test for ending soft+hard clip
@test Variant(PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S2H", 1, 1)), refseq)) == refvar
@test Variant(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S2H", 1, 1)), refseq)
) == refvar
# Test that ending insertions are still valid
@test length(Variant(PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3I", 1, 1)), refseq)).edits) == 1
@test length(
Variant(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3I", 1, 1)), refseq)
).edits,
) == 1
# Test that out-of-bounds bases are still caught
@test_throws BoundsError Variant(PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3X", 1, 1)), refseq))
@test_throws BoundsError Variant(
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3X", 1, 1)), refseq)
)
end
@testset "Aqua" begin
Aqua.test_ambiguities(SequenceVariation; recursive=false)
# TODO: Refactor `Edit` so that this test doesn't fail
# TODO: This test _should_ be set to @test_fails, but Aqua's syntax doesn't allow that
# Aqua.test_unbound_args(SequenceVariation)
Aqua.test_undefined_exports(SequenceVariation)
Aqua.test_piracy(SequenceVariation)
Aqua.test_project_extras(SequenceVariation)
Aqua.test_stale_deps(SequenceVariation)
Aqua.test_deps_compat(SequenceVariation)
Aqua.test_project_toml_formatting(SequenceVariation)
end