mirror of
https://github.com/MillironX/SequenceVariation.jl.git
synced 2024-11-22 13:29:56 +00:00
Compare commits
No commits in common. "3ef4a92ba268faa509547bb9774d070aba38cdd5" and "73d2781ee945ebc64f6b5599ef0838f4cab585e9" have entirely different histories.
3ef4a92ba2
...
73d2781ee9
21 changed files with 531 additions and 898 deletions
|
@ -1 +0,0 @@
|
|||
style = "blue"
|
2
.github/workflows/UnitTests.yml
vendored
2
.github/workflows/UnitTests.yml
vendored
|
@ -19,6 +19,8 @@ jobs:
|
|||
- x64
|
||||
os:
|
||||
- ubuntu-latest
|
||||
- windows-latest
|
||||
- macOS-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
|
|
29
.github/workflows/linting.yml
vendored
29
.github/workflows/linting.yml
vendored
|
@ -1,29 +0,0 @@
|
|||
name: Code format checker
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
lint:
|
||||
name: JuliaFormatter
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: julia-actions/setup-julia@v1
|
||||
with:
|
||||
version: "1"
|
||||
- run: |
|
||||
julia -e 'using Pkg; Pkg.add(PackageSpec(name="JuliaFormatter"))'
|
||||
julia -e 'using JuliaFormatter; format(".", verbose=true)'
|
||||
- run: |
|
||||
julia -e '
|
||||
out = Cmd(`git diff --name-only`) |> read |> String
|
||||
if out == ""
|
||||
exit(0)
|
||||
else
|
||||
@error "Some files have not been formatted !!!"
|
||||
write(stdout, out)
|
||||
exit(1)
|
||||
end'
|
21
CHANGELOG.md
21
CHANGELOG.md
|
@ -7,34 +7,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
|
||||
- Tutorial-type documentation ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28))
|
||||
|
||||
### Changed
|
||||
|
||||
- Code now follows [Blue style](https://github.com/invenia/BlueStyle) ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28))
|
||||
- :bomb: [BREAKING] Public and private API defined based on Blue style guidelines ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28))
|
||||
|
||||
### Removed
|
||||
|
||||
- Windows and MacOS CI tests ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28))
|
||||
|
||||
## [0.1.4] - 2022-12-17
|
||||
### [0.1.4] - 2022-12-17
|
||||
|
||||
### Fixed
|
||||
|
||||
- Soft clips at end of alignment causing invalid `Variant`s ([#25](https://github.com/BioJulia/SequenceVariation.jl/issues/25)/[#26](https://github.com/BioJulia/SequenceVariation.jl/pull/26))
|
||||
|
||||
## [0.1.3] - 2022-11-22
|
||||
### [0.1.3] - 2022-11-22
|
||||
|
||||
### Changed
|
||||
## Changed
|
||||
|
||||
- Variations getter now returns type-parameterized vector ([#23](https://github.com/BioJulia/SequenceVariation.jl/pull/23))
|
||||
|
||||
## [0.1.2] - 2022-10-04
|
||||
|
||||
### Changed
|
||||
## Changed
|
||||
|
||||
- Updated dependency compats ([#21](https://github.com/BioJulia/SequenceVariation.jl/pull/21))
|
||||
- BioAlignments: 2 -> 2,3
|
||||
|
|
|
@ -10,7 +10,6 @@ BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
|
|||
BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
|
||||
|
||||
[compat]
|
||||
Aqua = "0.6.0"
|
||||
BioAlignments = "2,3"
|
||||
BioGenerics = "0.1"
|
||||
BioSequences = "2,3"
|
||||
|
@ -18,10 +17,9 @@ BioSymbols = "4,5"
|
|||
julia = "1.6"
|
||||
|
||||
[extras]
|
||||
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
|
||||
BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"
|
||||
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
|
||||
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
|
||||
|
||||
[targets]
|
||||
test = ["Aqua", "Test", "BioSequences", "BioAlignments"]
|
||||
test = ["Test", "BioSequences", "BioAlignments"]
|
||||
|
|
|
@ -30,7 +30,6 @@ SequenceVariation is tested against Julia `1.X` on Linux, OS X, and Windows.
|
|||
[![Unit Tests](https://github.com/BioJulia/SequenceVariation.jl/actions/workflows/UnitTests.yml/badge.svg?branch=master)](https://github.com/BioJulia/SequenceVariation.jl/actions/workflows/UnitTests.yml)
|
||||
[![Documentation](https://github.com/BioJulia/SequenceVariation.jl/workflows/Documentation/badge.svg?branch=master)](https://github.com/BioJulia/SequenceVariation.jl/actions?query=workflow%3ADocumentation+branch%3Amaster)
|
||||
[![codecov](https://codecov.io/gh/BioJulia/SequenceVariation.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/BioJulia/SequenceVariation.jl)
|
||||
[![Aqua QA](https://raw.githubusercontent.com/JuliaTesting/Aqua.jl/master/badge.svg)](https://github.com/JuliaTesting/Aqua.jl)
|
||||
|
||||
## Contributing
|
||||
|
||||
|
|
|
@ -1,16 +1,6 @@
|
|||
[deps]
|
||||
BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"
|
||||
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
|
||||
BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
|
||||
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
|
||||
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
|
||||
Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
|
||||
SequenceVariation = "eef6e190-9969-4f06-a38f-35a110a8fdc8"
|
||||
|
||||
[compat]
|
||||
BioAlignments = "3"
|
||||
BioSequences = "3"
|
||||
BioSymbols = "5"
|
||||
Documenter = "0.27"
|
||||
Revise = "3.4"
|
||||
julia = "1.6"
|
||||
|
|
31
docs/make.jl
31
docs/make.jl
|
@ -1,31 +1,22 @@
|
|||
using Pkg
|
||||
using Documenter
|
||||
using SequenceVariation
|
||||
using Revise
|
||||
|
||||
# see https://github.com/tlienart/LiveServer.jl/issues/140#issuecomment-1271591251
|
||||
Revise.revise()
|
||||
|
||||
makedocs(;
|
||||
checkdocs=:exports,
|
||||
linkcheck=true,
|
||||
sitename="SequenceVariation.jl",
|
||||
format=Documenter.HTML(),
|
||||
modules=[SequenceVariation],
|
||||
pages=[
|
||||
checkdocs = :exports,
|
||||
linkcheck = true,
|
||||
sitename = "SequenceVariation.jl",
|
||||
format = Documenter.HTML(),
|
||||
modules = [SequenceVariation],
|
||||
pages = [
|
||||
"Home" => "index.md",
|
||||
"Working with variants" => "variants.md",
|
||||
"Working with variations" => "variations.md",
|
||||
"Comparing variations" => "compare.md",
|
||||
"API Reference" => "api.md",
|
||||
],
|
||||
authors=replace(
|
||||
join(Pkg.TOML.parsefile("Project.toml")["authors"], ", "), r" <.*?>" => ""
|
||||
) * ", The BioJulia Organisation, and other contributors.",
|
||||
authors = replace(join(Pkg.TOML.parsefile("Project.toml")["authors"], ", "), r" <.*?>" => "" ) * ", The BioJulia Organisation, and other contributors."
|
||||
)
|
||||
|
||||
deploydocs(;
|
||||
repo="github.com/BioJulia/SequenceVariation.jl.git",
|
||||
devbranch="master",
|
||||
push_preview=true,
|
||||
deploydocs(
|
||||
repo = "github.com/BioJulia/SequenceVariation.jl.git",
|
||||
devbranch = "master",
|
||||
push_preview = true,
|
||||
)
|
||||
|
|
|
@ -7,54 +7,6 @@ end
|
|||
|
||||
# API Reference
|
||||
|
||||
## Edits
|
||||
|
||||
```@docs
|
||||
Substitution
|
||||
Deletion
|
||||
Insertion
|
||||
```
|
||||
|
||||
## Variants
|
||||
|
||||
```@docs
|
||||
Variant
|
||||
reference(::Variant)
|
||||
variations
|
||||
reconstruct!
|
||||
```
|
||||
|
||||
## Variations
|
||||
|
||||
```@docs
|
||||
Variation
|
||||
reference(::Variation)
|
||||
mutation
|
||||
translate
|
||||
refbases
|
||||
altbases
|
||||
```
|
||||
|
||||
## Private API
|
||||
|
||||
### Edits
|
||||
|
||||
```@docs
|
||||
Edit
|
||||
_mutation
|
||||
_lendiff
|
||||
```
|
||||
|
||||
### Variants
|
||||
|
||||
```@docs
|
||||
_edits
|
||||
_is_valid(::Variant)
|
||||
```
|
||||
|
||||
### Variations
|
||||
|
||||
```@docs
|
||||
_edit
|
||||
_is_valid(::Variation)
|
||||
```@autodocs
|
||||
Modules = [SequenceVariation]
|
||||
```
|
||||
|
|
|
@ -1,47 +0,0 @@
|
|||
```@meta
|
||||
CurrentModule = SequenceVariation
|
||||
```
|
||||
|
||||
# Comparing variations in sequences
|
||||
|
||||
## Checking for variations in a known variant
|
||||
|
||||
Looking for a known [`Variation`](@ref) within a [`Variant`](@ref) is
|
||||
efficiently accomplished using the `in` operator.
|
||||
|
||||
```@setup call_variants
|
||||
using SequenceVariation, BioAlignments, BioSequences
|
||||
|
||||
bovine = dna"GACCGGCTGCATTCGAGGCTGCCAGCAAGCAG";
|
||||
ovine = dna"GACCGGCTGCATTCGAGGCTGTCAGCAAACAG";
|
||||
human = dna"GACAGGCTGCATCAGAAGAGGCCATCAAGCAG";
|
||||
|
||||
bos_ovis_alignment =
|
||||
PairwiseAlignment(AlignedSequence(ovine, Alignment("32M", 1, 1)), bovine);
|
||||
bos_human_alignment =
|
||||
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
|
||||
|
||||
bos_ovis_variant = Variant(bos_ovis_alignment)
|
||||
bos_human_variant = Variant(bos_human_alignment)
|
||||
```
|
||||
|
||||
```@example call_variants
|
||||
println("\tOvis aires\tHomo sapiens")
|
||||
for v in vcat(variations(bos_ovis_variant), variations(bos_human_variant))
|
||||
is_sheep = v in bos_ovis_variant
|
||||
is_human = v in bos_human_variant
|
||||
println("$v\t$is_sheep\t\t$is_human")
|
||||
end
|
||||
```
|
||||
|
||||
## Constructing new variants based on other variations
|
||||
|
||||
New variants can be constructed using variations. This might be useful to pool
|
||||
variations found on different reads or to filter variations from a variant
|
||||
that aren't validated by another variant.
|
||||
|
||||
```@repl call_variants
|
||||
sheeple = vcat(variations(bos_ovis_variant), variations(bos_human_variant));
|
||||
Variant(bovine, sheeple)
|
||||
reconstruct!(bovine, ans)
|
||||
```
|
|
@ -18,7 +18,7 @@ You can install SequenceVariation from the [Julia REPL](https://docs.julialang.o
|
|||
Press `]` to enter [pkg mode](https://docs.julialang.org/en/v1/stdlib/Pkg/), and enter the following:
|
||||
|
||||
```julia
|
||||
add SequenceVariation
|
||||
add SequenceVariation.jl
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
@ -30,7 +30,6 @@ SequenceVariation is tested against Julia `1.X` on Linux, OS X, and Windows.
|
|||
[![Unit Tests](https://github.com/BioJulia/SequenceVariation.jl/actions/workflows/UnitTests.yml/badge.svg?branch=master)](https://github.com/BioJulia/SequenceVariation.jl/actions/workflows/UnitTests.yml)
|
||||
[![Documentation](https://github.com/BioJulia/SequenceVariation.jl/workflows/Documentation/badge.svg?branch=master)](https://github.com/BioJulia/SequenceVariation.jl/actions?query=workflow%3ADocumentation+branch%3Amaster)
|
||||
[![codecov](https://codecov.io/gh/BioJulia/SequenceVariation.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/BioJulia/SequenceVariation.jl)
|
||||
[![Aqua QA](https://raw.githubusercontent.com/JuliaTesting/Aqua.jl/master/badge.svg)](https://github.com/JuliaTesting/Aqua.jl)
|
||||
|
||||
## Contributing
|
||||
|
||||
|
|
|
@ -1,40 +0,0 @@
|
|||
```@meta
|
||||
CurrentModule = SequenceVariation
|
||||
```
|
||||
|
||||
# Working with variants
|
||||
|
||||
## Calling variants
|
||||
|
||||
The first step in working with sequence variation is to identify (call)
|
||||
variations. SequenceVariation can directly call variants using the
|
||||
`Variant(::PairwiseAlignment)` constructor of the [`Variant`](@ref) type.
|
||||
|
||||
```@repl call_variants
|
||||
using SequenceVariation, BioAlignments, BioSequences
|
||||
|
||||
bovine = dna"GACCGGCTGCATTCGAGGCTGCCAGCAAGCAG";
|
||||
ovine = dna"GACCGGCTGCATTCGAGGCTGTCAGCAAACAG";
|
||||
human = dna"GACAGGCTGCATCAGAAGAGGCCATCAAGCAG";
|
||||
|
||||
bos_ovis_alignment =
|
||||
PairwiseAlignment(AlignedSequence(ovine, Alignment("32M", 1, 1)), bovine);
|
||||
bos_human_alignment =
|
||||
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
|
||||
|
||||
bos_ovis_variant = Variant(bos_ovis_alignment)
|
||||
bos_human_variant = Variant(bos_human_alignment)
|
||||
```
|
||||
|
||||
## Sequence reconstruction
|
||||
|
||||
If the alternate sequence of a variant is no longer available (as is often the
|
||||
case when calling variants from alignment files), then the sequence can be
|
||||
retrieved using the [`reconstruct!`](@ref) function.
|
||||
|
||||
```@repl call_variants
|
||||
human2 = copy(bovine);
|
||||
reconstruct!(human2, bos_human_variant)
|
||||
human2 == bovine
|
||||
human2 == human
|
||||
```
|
|
@ -1,66 +0,0 @@
|
|||
```@meta
|
||||
CurrentModule = SequenceVariation
|
||||
```
|
||||
|
||||
# Working with individual variations
|
||||
|
||||
## Construction
|
||||
|
||||
Individual [`Variation`](@ref)s can be made using a reference sequence and
|
||||
string syntax
|
||||
|
||||
| Variation type | Syntax | Interpretation | Example |
|
||||
|:--- |:--- |:--- |:--- |
|
||||
| Substitutions | `<REF><POS><ALT>` | `<ALT>` is substituted for `<REF>` in position `<POS>` | `"G16C"` |
|
||||
| Deletions | `Δ<START>-<END>` | All bases (inclusive) between `<START>` and `<END>` are deleted. It is valid to have `<START>` equal `<END>`: that is a deletion of one base. | `"Δ1-2"` |
|
||||
| Insertions | `<POS><ALT>` | `<ALT>` is inserted between positions `<POS>` and `<POS>+1` | `"11T"` |
|
||||
|
||||
```@repl
|
||||
using BioSequences: @dna_str
|
||||
using SequenceVariation
|
||||
bovine_ins = dna"GACCGGCTGCATTCGAGGCTGCCAGCAAGCAG"
|
||||
Variation(bovine_ins, "C4A")
|
||||
mutation(ans)
|
||||
typeof(mutation(Variation(bovine_ins, "Δ13-14")))
|
||||
mutation(Variation(bovine_ins, "25ACA"))
|
||||
```
|
||||
|
||||
## Extraction
|
||||
|
||||
Sequence variations may also be extracted wholesale from a [`Variant`](@ref)
|
||||
using the [`variations`](@ref) function.
|
||||
|
||||
```@setup call_variants
|
||||
using SequenceVariation, BioAlignments, BioSequences
|
||||
|
||||
bovine = dna"GACCGGCTGCATTCGAGGCTGCCAGCAAGCAG";
|
||||
ovine = dna"GACCGGCTGCATTCGAGGCTGTCAGCAAACAG";
|
||||
human = dna"GACAGGCTGCATCAGAAGAGGCCATCAAGCAG";
|
||||
|
||||
bos_ovis_alignment =
|
||||
PairwiseAlignment(AlignedSequence(ovine, Alignment("32M", 1, 1)), bovine);
|
||||
bos_human_alignment =
|
||||
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
|
||||
|
||||
bos_ovis_variant = Variant(bos_ovis_alignment)
|
||||
bos_human_variant = Variant(bos_human_alignment)
|
||||
```
|
||||
|
||||
```@repl call_variants
|
||||
variations(bos_ovis_variant)
|
||||
variations(bos_human_variant)
|
||||
```
|
||||
|
||||
## Reference switching
|
||||
|
||||
An individual variation can be mapped to a new reference sequence given an
|
||||
alignment between the new and old references using the [`translate`](@ref).
|
||||
|
||||
```@repl call_variants
|
||||
ovis_human_alignment =
|
||||
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), ovine)
|
||||
human_variation = first(variations(bos_ovis_variant))
|
||||
reference(ans) == bovine
|
||||
SequenceVariation.translate(human_variation, ovis_human_alignment)
|
||||
reference(ans) == bovine
|
||||
```
|
75
src/Edit.jl
75
src/Edit.jl
|
@ -1,75 +0,0 @@
|
|||
include("edits/Substitution.jl")
|
||||
include("edits/Deletion.jl")
|
||||
include("edits/Insertion.jl")
|
||||
|
||||
"""
|
||||
Edit{S <: BioSequence, T <: BioSymbol}
|
||||
|
||||
An edit of either `Substitution{T}`, `Insertion{S}` or `Deletion` at a position.
|
||||
If deletion: Deletion of length L at ref pos `pos:pos+L-1`
|
||||
If insertion: Insertion of length L b/w ref pos `pos:pos+1`
|
||||
"""
|
||||
struct Edit{S<:BioSequence,T<:BioSymbol}
|
||||
x::Union{Substitution{T},Deletion,Insertion{S}}
|
||||
pos::UInt
|
||||
end
|
||||
|
||||
Base.length(e::Edit) = length(_mutation(e))
|
||||
Base.:(==)(e1::Edit, e2::Edit) = e1.pos == e2.pos && e1.x == e2.x
|
||||
Base.hash(x::Edit, h::UInt) = hash(Edit, hash((x.x, x.pos), h))
|
||||
|
||||
function Base.parse(::Type{T}, s::AbstractString) where {T<:Edit{Se,Sy}} where {Se,Sy}
|
||||
return parse(T, String(s))
|
||||
end
|
||||
|
||||
function Base.parse(::Type{<:Edit{Se,Sy}}, s::Union{String,SubString{String}}) where {Se,Sy}
|
||||
# Either "Δ1-2", "11T" or "G16C"
|
||||
if (m = match(r"^Δ(\d+)-(\d+)$", s); m) !== nothing
|
||||
pos = parse(UInt, m[1])
|
||||
stop = parse(UInt, m[2])
|
||||
stop ≥ pos || throw(ArgumentError("Non-positive deletion length: \"" * s * "\""))
|
||||
Edit{Se,Sy}(Deletion(stop - pos + 1), pos)
|
||||
elseif (m = match(r"^(\d+)([A-Za-z]+)$", s); m) !== nothing
|
||||
pos = parse(UInt, m[1])
|
||||
seq = Se(m[2])
|
||||
Edit{Se,Sy}(Insertion(seq), pos)
|
||||
elseif (m = match(r"^[A-Za-z](\d+)([A-Za-z])$", s); m) !== nothing
|
||||
pos = parse(UInt, m[1])
|
||||
sym = Sy(first(m[2]))
|
||||
Edit{Se,Sy}(Substitution(sym), pos)
|
||||
else
|
||||
throw(ArgumentError("Failed to parse edit \"" * s * '"'))
|
||||
end
|
||||
end
|
||||
|
||||
"""
|
||||
_mutation(e::Edit)
|
||||
|
||||
Returns the underlying [`Substitution`](@ref), [`Insertion`](@ref), or [`Deletion`](@ref) of
|
||||
`e`.
|
||||
"""
|
||||
_mutation(e::Edit) = e.x
|
||||
BioGenerics.leftposition(e::Edit) = e.pos
|
||||
function BioGenerics.rightposition(e::Edit)
|
||||
if _mutation(e) isa Substitution
|
||||
return leftposition(e)
|
||||
elseif _mutation(e) isa Insertion
|
||||
return leftposition(e) + 1
|
||||
elseif _mutation(e) isa Deletion
|
||||
return leftposition(e) + length(e) - 1
|
||||
else
|
||||
error("Unknown mutation type $(typeof(_mutation(e)))")
|
||||
end
|
||||
end
|
||||
|
||||
"""
|
||||
_lendiff(edit::Edit)
|
||||
|
||||
Gets the number of bases that `edit` adds to the reference sequence
|
||||
"""
|
||||
function _lendiff(edit::Edit)
|
||||
x = _mutation(edit)
|
||||
# Each edit type has logic for its length, we just need to know what direction to go
|
||||
multiplier = x isa Substitution ? 0 : (x isa Deletion ? -1 : 1)
|
||||
return length(x) * multiplier
|
||||
end
|
|
@ -20,32 +20,519 @@ TODO now:
|
|||
* Add tests
|
||||
"""
|
||||
|
||||
using BioAlignments: BioAlignments, PairwiseAlignment, OP_SOFT_CLIP, sequence
|
||||
using BioAlignments: BioAlignments, PairwiseAlignment, OP_SOFT_CLIP
|
||||
using BioGenerics: BioGenerics, leftposition, rightposition
|
||||
using BioSequences: BioSequences, BioSequence, NucleotideSeq, LongSequence, isgap
|
||||
using BioSymbols: BioSymbol
|
||||
|
||||
export Deletion
|
||||
export Insertion
|
||||
export Substitution
|
||||
export Variant
|
||||
export Variation
|
||||
export altbases
|
||||
export mutation
|
||||
export reconstruct!
|
||||
export refbases
|
||||
export reference
|
||||
export translate
|
||||
export variations
|
||||
|
||||
const BA = BioAlignments
|
||||
const BS = BioSequences
|
||||
|
||||
#=
|
||||
import Automa
|
||||
import Automa.RegExp: @re_str
|
||||
=#
|
||||
|
||||
struct Unsafe end
|
||||
struct Inapplicable end
|
||||
|
||||
include("Edit.jl")
|
||||
include("Variant.jl")
|
||||
include("Variation.jl")
|
||||
#=
|
||||
const CONTEXT = Automa.CodeGenContext(
|
||||
vars=Automa.Variables(:p, :p_end, :p_eof, :ts, :te, :cs, :data, :mem, :byte),
|
||||
generator=:goto,
|
||||
checkbounds=false
|
||||
)
|
||||
|
||||
|
||||
const BYTES = Union{String, SubString{String}, Vector{UInt8}}
|
||||
=#
|
||||
|
||||
"""
|
||||
Substitution
|
||||
|
||||
Represents the presence of a `T` at a given position. The position is stored
|
||||
outside this struct.
|
||||
"""
|
||||
struct Substitution{T <: BioSymbol}
|
||||
x::T
|
||||
end
|
||||
Base.:(==)(x::Substitution, y::Substitution) = x.x == y.x
|
||||
Base.hash(x::Substitution, h::UInt) = hash(Substitution, hash(x.x, h))
|
||||
|
||||
"""
|
||||
Deletion
|
||||
|
||||
Represents the deletion of N symbols. The location of the deletion is stored
|
||||
outside this struct
|
||||
"""
|
||||
struct Deletion
|
||||
len::UInt
|
||||
|
||||
function Deletion(len::UInt)
|
||||
iszero(len) && error("Deletion must be at least 1 symbol")
|
||||
new(len)
|
||||
end
|
||||
end
|
||||
Deletion(x::Integer) = Deletion(convert(UInt, x))
|
||||
Base.length(x::Deletion) = Int(x.len)
|
||||
Base.hash(x::Deletion, h::UInt) = hash(Deletion, hash(x.len, h))
|
||||
|
||||
"""
|
||||
Insertion{S <: BioSequence}
|
||||
|
||||
Represents the insertion of a `S` into a sequence. The location of the insertion
|
||||
is stored outside the struct.
|
||||
"""
|
||||
struct Insertion{S <: BioSequence}
|
||||
seq::S
|
||||
|
||||
function Insertion{S}(x::S) where {S <: BioSequence}
|
||||
isempty(x) && error("Insertion must be at least 1 symbol")
|
||||
new(x)
|
||||
end
|
||||
end
|
||||
Insertion(s::BioSequence) = Insertion{typeof(s)}(s)
|
||||
Base.length(x::Insertion) = length(x.seq)
|
||||
Base.:(==)(x::Insertion, y::Insertion) = x.seq == y.seq
|
||||
Base.hash(x::Insertion, h::UInt) = hash(Insertion, hash(x.seq, h))
|
||||
|
||||
"""
|
||||
Edit{S <: BioSequence, T <: BioSymbol}
|
||||
|
||||
An edit of either `Substitution{T}`, `Insertion{S}` or `Deletion` at a position.
|
||||
If deletion: Deletion of length L at ref pos `pos:pos+L-1`
|
||||
If insertion: Insertion of length L b/w ref pos `pos:pos+1`
|
||||
"""
|
||||
struct Edit{S <: BioSequence, T <: BioSymbol}
|
||||
x::Union{Substitution{T}, Deletion, Insertion{S}}
|
||||
pos::UInt
|
||||
end
|
||||
Base.:(==)(e1::Edit, e2::Edit) = e1.pos == e2.pos && e1.x == e2.x
|
||||
Base.hash(x::Edit, h::UInt) = hash(Edit, hash((x.x, x.pos), h))
|
||||
Base.length(e::Edit) = e isa Substitution ? 1 : length(mutation(e))
|
||||
|
||||
function Base.parse(::Type{T}, s::AbstractString) where {T <: Edit{Se, Sy}} where {Se, Sy}
|
||||
parse(T, String(s))
|
||||
end
|
||||
|
||||
function Base.parse(::Type{<:Edit{Se, Sy}}, s::Union{String, SubString{String}}) where {Se, Sy}
|
||||
# Either "Δ1-2", "11T" or "G16C"
|
||||
if (m = match(r"^Δ(\d+)-(\d+)$", s); m) !== nothing
|
||||
pos = parse(UInt, m[1])
|
||||
stop = parse(UInt, m[2])
|
||||
stop ≥ pos || throw(ArgumentError("Non-positive deletion length: \"" * s * "\""))
|
||||
Edit{Se, Sy}(Deletion(stop - pos + 1), pos)
|
||||
elseif (m = match(r"^(\d+)([A-Za-z]+)$", s); m) !== nothing
|
||||
pos = parse(UInt, m[1])
|
||||
seq = Se(m[2])
|
||||
Edit{Se, Sy}(Insertion(seq), pos)
|
||||
elseif (m = match(r"^[A-Za-z](\d+)([A-Za-z])$", s); m) !== nothing
|
||||
pos = parse(UInt, m[1])
|
||||
sym = Sy(first(m[2]))
|
||||
Edit{Se, Sy}(Substitution(sym), pos)
|
||||
else
|
||||
throw(ArgumentError("Failed to parse edit \"" * s * '"'))
|
||||
end
|
||||
end
|
||||
|
||||
mutation(e::Edit) = e.x
|
||||
BioGenerics.leftposition(e::Edit) = e.pos
|
||||
function BioGenerics.rightposition(e::Edit)
|
||||
if mutation(e) isa Substitution
|
||||
return leftposition(e)
|
||||
elseif mutation(e) isa Insertion
|
||||
return leftposition(e) + 1
|
||||
elseif mutation(e) isa Deletion
|
||||
return leftposition(e) + length(e) - 1
|
||||
else
|
||||
error("Unknown mutation type $(typeof(mutation(e)))")
|
||||
end
|
||||
end
|
||||
|
||||
#=
|
||||
@noinline throw_parse_error(T, p::Integer) = error("Failed to parse $T at byte $p")
|
||||
|
||||
# Parse substitution
|
||||
let
|
||||
machine = let
|
||||
biosymbol = re"[A-Za-z]"
|
||||
number = re"[0-9]+"
|
||||
|
||||
biosymbol.actions[:enter] = [:enter]
|
||||
number.actions[:all] = [:digit]
|
||||
|
||||
Automa.compile(biosymbol * number * biosymbol)
|
||||
end
|
||||
actions = Dict(
|
||||
:enter => quote
|
||||
symbol = T(Char(byte))
|
||||
if firstsymbol === nothing
|
||||
firstsymbol = symbol
|
||||
else
|
||||
lastsymbol = symbol
|
||||
end
|
||||
end,
|
||||
:digit => :(num = UInt(10)*num + (byte - 0x30) % UInt),
|
||||
)
|
||||
@eval begin
|
||||
function Base.parse(::Type{Edit{S, T}}, data::BYTES) where {S, T}
|
||||
$(Automa.generate_init_code(CONTEXT, machine))
|
||||
p_eof = p_end = sizeof(data)
|
||||
firstsymbol = lastsymbol = nothing
|
||||
num = UInt(0)
|
||||
$(Automa.generate_exec_code(CONTEXT, machine, actions))
|
||||
iszero(cs) || throw_parse_error(Edit{S, T}, p)
|
||||
if firstsymbol == lastsymbol
|
||||
error("First symbol and last symbol are identical")
|
||||
end
|
||||
return Edit{S, T}(Substitution{T}(lastsymbol), num)
|
||||
end
|
||||
end
|
||||
end
|
||||
=#
|
||||
|
||||
# Edits are applied sequentially from first to last pos.
|
||||
# The vector must always be sorted by pos.
|
||||
struct Variant{S <: BioSequence, T <: BioSymbol}
|
||||
ref::S
|
||||
edits::Vector{Edit{S, T}}
|
||||
|
||||
Variant{S, T}(ref::S, edits::Vector{Edit{S, T}}, ::Unsafe) where {S, T} = new(ref, edits)
|
||||
end
|
||||
|
||||
function Variant{S,T}(ref::S, edits::Vector{Edit{S, T}}) where {S<:BioSequence, T<:BioSymbol}
|
||||
sort!(edits, by=x -> x.pos)
|
||||
result = Variant{S, T}(ref, edits, Unsafe())
|
||||
is_valid(result) || error("TODO") # report what kind of error message?
|
||||
return result
|
||||
end
|
||||
|
||||
function Variant(ref::S, edits::Vector{Edit{S, T}}) where {S<:BioSequence, T<:BioSymbol}
|
||||
Variant{S, T}(ref, edits)
|
||||
end
|
||||
|
||||
|
||||
|
||||
function Base.show(io::IO, x::Variant)
|
||||
n = length(x.edits)
|
||||
print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):")
|
||||
for i in x.edits
|
||||
v = Variation(x.ref, i)
|
||||
print(io, "\n ")
|
||||
show(io, v)
|
||||
end
|
||||
end
|
||||
|
||||
# Validate:
|
||||
# A sequence is invalid if any of its operations are out of bounds, or the same position
|
||||
# is affected by multiple edits.
|
||||
function is_valid(v::Variant)
|
||||
isempty(v.ref) && return false
|
||||
valid_positions = 1:length(v.ref)
|
||||
last_was_insert = false
|
||||
for edit in v.edits
|
||||
pos = edit.pos
|
||||
op = edit.x
|
||||
# Sanity check: for this to be a valid variant, it must be comprised of valid
|
||||
# variations
|
||||
is_valid(Variation(v.ref, edit)) || return false
|
||||
|
||||
# For substitutions we simply do not allow another modification of the same base
|
||||
if op isa Substitution
|
||||
pos in valid_positions || return false
|
||||
valid_positions = first(valid_positions) + 1 : last(valid_positions)
|
||||
last_was_insert = false
|
||||
# Insertions affect 0 reference bases, so it does not modify the valid positions
|
||||
# for next op. However, we cannot have two insertions at the same position, because
|
||||
# then the order of them is ambiguous
|
||||
elseif op isa Insertion
|
||||
pos in (first(valid_positions)-1+last_was_insert:last(valid_positions)+1) || return false
|
||||
last_was_insert = true
|
||||
# Deletions obviously invalidate the reference bases that are deleted.
|
||||
elseif op isa Deletion
|
||||
len = length(op)
|
||||
pos in (first(valid_positions):last(valid_positions)-len+1) || return false
|
||||
valid_positions = first(valid_positions) + len : last(valid_positions)
|
||||
last_was_insert = false
|
||||
end
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
function Variant(aln::PairwiseAlignment{T, T}) where {T <: LongSequence{<:Union{BS.AminoAcidAlphabet, BS.NucleicAcidAlphabet}}}
|
||||
ref = aln.b
|
||||
E = eltype(typeof(ref))
|
||||
edits = Edit{T, E}[]
|
||||
refpos = first(aln.a.aln.anchors).refpos
|
||||
seqpos = first(aln.a.aln.anchors).seqpos
|
||||
markpos = 0
|
||||
n_gaps = n_ins = 0
|
||||
insertion_buffer = E[]
|
||||
for (seqi, refi) in aln
|
||||
isgap(refi) || (refpos += 1)
|
||||
isgap(seqi) || (seqpos += 1)
|
||||
|
||||
# Check for deletions
|
||||
if isgap(seqi)
|
||||
iszero(n_gaps) && (markpos = refpos)
|
||||
n_gaps += 1
|
||||
elseif !iszero(n_gaps)
|
||||
push!(edits, Edit{T, E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
||||
n_gaps = 0
|
||||
end
|
||||
|
||||
# Check for insertions
|
||||
if isgap(refi)
|
||||
iszero(n_ins) && (markpos = refpos + 1)
|
||||
push!(insertion_buffer, seqi)
|
||||
n_ins += 1
|
||||
elseif !iszero(n_ins)
|
||||
seq = T(insertion_buffer)
|
||||
push!(edits, Edit{T, E}(Insertion(seq), UInt(markpos)))
|
||||
empty!(insertion_buffer)
|
||||
n_ins = 0
|
||||
end
|
||||
|
||||
# Substitutions
|
||||
if !isgap(refi) && !isgap(seqi) && seqi != refi
|
||||
push!(edits, Edit{T, E}(Substitution{E}(seqi), UInt(refpos)))
|
||||
end
|
||||
end
|
||||
|
||||
# Check for clips at the end of the alignment
|
||||
last_anchors = aln.a.aln.anchors[end-1:end]
|
||||
|
||||
# Final indel, if applicable
|
||||
if !any(anchor -> anchor.op == OP_SOFT_CLIP, last_anchors)
|
||||
if !iszero(n_gaps)
|
||||
push!(edits, Edit{T, E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
||||
elseif !iszero(n_ins)
|
||||
push!(edits, Edit{T, E}(Insertion(T(insertion_buffer)), UInt(markpos)))
|
||||
end
|
||||
end
|
||||
|
||||
return Variant(ref, edits)
|
||||
end
|
||||
|
||||
edits(v::Variant) = v.edits
|
||||
reference(v::Variant) = v.ref
|
||||
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits
|
||||
|
||||
function lendiff(edit::Edit)
|
||||
x = edit.x
|
||||
x isa Substitution ? 0 : (x isa Deletion ? -length(x) : length(x.x))
|
||||
end
|
||||
|
||||
function reconstruct!(seq::S, x::Variant{S}) where S
|
||||
len = length(x.ref) + sum(edit -> lendiff(edit), x.edits)
|
||||
resize!(seq, len % UInt)
|
||||
refpos = seqpos = 1
|
||||
for edit in x.edits
|
||||
while refpos < edit.pos
|
||||
seq[seqpos] = x.ref[refpos]
|
||||
refpos += 1
|
||||
seqpos += 1
|
||||
end
|
||||
editx = edit.x
|
||||
if editx isa Substitution
|
||||
seq[seqpos] = editx.x
|
||||
seqpos += 1
|
||||
refpos += 1
|
||||
elseif editx isa Deletion
|
||||
refpos += editx.len
|
||||
elseif editx isa Insertion
|
||||
for i in editx.x
|
||||
seq[seqpos] = i
|
||||
seqpos += 1
|
||||
end
|
||||
end
|
||||
end
|
||||
while seqpos ≤ length(seq)
|
||||
seq[seqpos] = x.ref[refpos]
|
||||
refpos += 1
|
||||
seqpos += 1
|
||||
end
|
||||
seq
|
||||
end
|
||||
|
||||
struct Variation{S <: BioSequence, T <: BioSymbol}
|
||||
ref::S
|
||||
edit::Edit{S, T}
|
||||
|
||||
function Variation{S, T}(ref::S, e::Edit{S, T}, ::Unsafe) where {S <: BioSequence, T <: BioSymbol}
|
||||
new(ref, e)
|
||||
end
|
||||
end
|
||||
|
||||
function Variation{S, T}(ref::S, e::Edit{S, T}) where {S <: BioSequence, T <: BioSymbol}
|
||||
v = Variation{S, T}(ref, e, Unsafe())
|
||||
is_valid(v) ? v : throw(ArgumentError("Invalid variant"))
|
||||
end
|
||||
|
||||
Variation(ref::S, edit::Edit{S, T}) where {S, T} = Variation{S, T}(ref, edit)
|
||||
|
||||
function Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
|
||||
T = eltype(ref)
|
||||
|
||||
e = parse(Edit{S,T}, edit)
|
||||
return Variation{S,T}(ref, e)
|
||||
end
|
||||
|
||||
function Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence, T<:BioSymbol}
|
||||
edits = edit.(vars)
|
||||
return Variant{S, T}(ref, edits)
|
||||
end
|
||||
|
||||
reference(v::Variation) = v.ref
|
||||
edit(v::Variation) = v.edit
|
||||
mutation(v::Variation) = mutation(edit(v))
|
||||
BioGenerics.leftposition(v::Variation) = leftposition(edit(v))
|
||||
BioGenerics.rightposition(v::Variation) = rightposition(edit(v))
|
||||
Base.:(==)(x::Variation, y::Variation) = x.ref == y.ref && x.edit == y.edit
|
||||
Base.hash(x::Variation, h::UInt) = hash(Variation, hash((x.ref, x.edit), h))
|
||||
|
||||
function is_valid(v::Variation)
|
||||
isempty(v.ref) && return false
|
||||
op = v.edit.x
|
||||
pos = v.edit.pos
|
||||
if op isa Substitution
|
||||
return pos in eachindex(v.ref)
|
||||
elseif op isa Insertion
|
||||
return pos in 0:lastindex(v.ref)+1
|
||||
elseif op isa Deletion
|
||||
return pos in 1:(lastindex(v.ref)-length(op) + 1)
|
||||
end
|
||||
end
|
||||
|
||||
function Base.show(io::IO, x::Variation)
|
||||
content = x.edit.x
|
||||
pos = x.edit.pos
|
||||
if content isa Substitution
|
||||
print(io, x.ref[pos], pos, content.x)
|
||||
elseif content isa Deletion
|
||||
print(io, 'Δ', pos, '-', pos + content.len - 1)
|
||||
elseif content isa Insertion
|
||||
print(io, pos, content.seq)
|
||||
else
|
||||
print(io, pos, content.x)
|
||||
end
|
||||
end
|
||||
|
||||
function Base.in(v::Variation, var::Variant)
|
||||
if v.ref != var.ref
|
||||
error("References must be equal")
|
||||
end
|
||||
any(v.edit == edit for edit in var.edits)
|
||||
end
|
||||
|
||||
function translate(var::Variation{S, T}, aln::PairwiseAlignment{S, S}) where {S, T}
|
||||
kind = var.edit.x
|
||||
pos = var.edit.pos
|
||||
seq, ref = aln.seq, aln.b
|
||||
|
||||
# Special case: Insertions may have a pos of 0, which cannot be mapped to
|
||||
# the seq using ref2seq
|
||||
if iszero(pos)
|
||||
(s, r), _ = iterate(aln)
|
||||
(isgap(s) | isgap(r)) && return Inapplicable()
|
||||
return Variation{S, T}(seq, Edit{S, T}(Insertion(var.edit.x), 0))
|
||||
end
|
||||
|
||||
(seqpos, op) = BA.ref2seq(aln, pos)
|
||||
if kind isa Substitution
|
||||
# If it's a substitution, return nothing if it maps to a deleted
|
||||
# position, or substitutes to same base.
|
||||
op in (BA.OP_MATCH, BA.OP_SEQ_MATCH, BA.OP_SEQ_MISMATCH) || return nothing
|
||||
seq[seqpos] == kind.x && return nothing
|
||||
edit = Edit{S, T}(kind, seqpos)
|
||||
return Variation{S, T}(seq, edit, Unsafe())
|
||||
elseif kind isa Deletion
|
||||
# If it's a deletion, return nothing if the deleted part is already missing
|
||||
# from the new reference.
|
||||
(stop, op2) = BA.ref2seq(aln, pos + length(kind) - 1)
|
||||
start = seqpos + op == BA.OP_DELETE
|
||||
start < stop && return nothing
|
||||
edit = Edit{S, T}(Deletion(stop - start + 1), start)
|
||||
return Variation{S, T}(seq, edit, Unsafe())
|
||||
else
|
||||
# If it maps directly to a symbol, just insert
|
||||
if op in (BA.OP_MATCH, BA.OP_SEQ_MATCH, BA.OP_SEQ_MISMATCH)
|
||||
# This happens if there is already an insertion at the position
|
||||
if pos != lastindex(ref) && first(ref2seq(aln, pos+1)) != seqpos + 1
|
||||
return Inapplicable()
|
||||
else
|
||||
edit = Edit{S, T}(Insertion(var.edit.x), seqpos)
|
||||
return Variation{S, T}(seq, edit, Unsafe())
|
||||
end
|
||||
# Alternatively, it can map to a deletion. In that case, it become really
|
||||
# tricky to talk about the "same" insertion.
|
||||
else
|
||||
return Inapplicable()
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
function variations(v::Variant{S,T}) where {S,T}
|
||||
vs = Vector{Variation{S,T}}(undef, length(edits(v)))
|
||||
for (i, e) in enumerate(edits(v))
|
||||
vs[i] = Variation{S,T}(reference(v), e)
|
||||
end
|
||||
return vs
|
||||
end
|
||||
|
||||
function _refbases(s::Substitution, reference::S, pos::UInt) where S <: BioSequence
|
||||
return S([reference[pos]])
|
||||
end
|
||||
|
||||
function _altbases(s::Substitution, reference::S, pos::UInt) where S <: BioSequence
|
||||
return S([s.x])
|
||||
end
|
||||
|
||||
function _refbases(d::Deletion, reference::S, pos::UInt) where S <: BioSequence
|
||||
if pos == 1
|
||||
return S(reference[UnitRange{Int}(pos, pos+length(d))])
|
||||
else
|
||||
return S(reference[UnitRange{Int}(pos-1, pos+length(d)-1)])
|
||||
end
|
||||
end
|
||||
|
||||
function _altbases(d::Deletion, reference::S, pos::UInt) where S <: BioSequence
|
||||
if pos == 1
|
||||
return S([reference[pos+1]])
|
||||
else
|
||||
return S([reference[pos-1]])
|
||||
end
|
||||
end
|
||||
|
||||
function _refbases(i::Insertion, reference::S, pos::UInt) where S <: BioSequence
|
||||
return S([reference[pos]])
|
||||
end
|
||||
|
||||
function _altbases(i::Insertion, reference::S, pos::UInt) where S <: BioSequence
|
||||
if pos == 1
|
||||
return S([i.seq..., reference[pos]])
|
||||
else
|
||||
return S([reference[pos], i.seq...])
|
||||
end
|
||||
end
|
||||
|
||||
function refbases(v::Variation)
|
||||
return _refbases(mutation(v), reference(v), leftposition(v))
|
||||
end
|
||||
|
||||
function altbases(v::Variation)
|
||||
return _altbases(mutation(v), reference(v), leftposition(v))
|
||||
end
|
||||
|
||||
export Insertion,
|
||||
Deletion,
|
||||
Substitution,
|
||||
Variant,
|
||||
Variation,
|
||||
reference,
|
||||
mutation,
|
||||
variations,
|
||||
refbases,
|
||||
altbases
|
||||
|
||||
end # module
|
||||
|
|
197
src/Variant.jl
197
src/Variant.jl
|
@ -1,197 +0,0 @@
|
|||
"""
|
||||
Variant{S<:BioSequence,T<:BioSymbol}
|
||||
|
||||
A set of variations within a given sequence that are all found together. Depending on the
|
||||
field, it might also be referred to as a "genotype," "haplotype," or "strain."
|
||||
|
||||
# Constructors
|
||||
|
||||
Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
Variant(
|
||||
aln::PairwiseAlignment{T,T}
|
||||
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
|
||||
|
||||
When constructing a `Variant` from a vector of [`Edit`](@ref)s or [`Variation`](@ref)s, the
|
||||
edits are applied sequentially from first to last position, therefore the vector must always
|
||||
be sorted by position. These edits are sorted automatically if constructing from an
|
||||
alignment.
|
||||
"""
|
||||
struct Variant{S<:BioSequence,T<:BioSymbol}
|
||||
ref::S
|
||||
edits::Vector{Edit{S,T}}
|
||||
|
||||
Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits)
|
||||
end
|
||||
|
||||
function Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
sort!(edits; by=x -> x.pos)
|
||||
result = Variant{S,T}(ref, edits, Unsafe())
|
||||
_is_valid(result) || error("TODO") # report what kind of error message?
|
||||
return result
|
||||
end
|
||||
|
||||
function Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
return Variant{S,T}(ref, edits)
|
||||
end
|
||||
|
||||
function Base.show(io::IO, x::Variant)
|
||||
n = length(x.edits)
|
||||
print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):")
|
||||
for i in x.edits
|
||||
v = Variation(x.ref, i)
|
||||
print(io, "\n ")
|
||||
show(io, v)
|
||||
end
|
||||
end
|
||||
|
||||
"""
|
||||
is_valid(v::Variant)
|
||||
|
||||
Validate `v`. `v` is invalid if any of its operations are out of bounds, or the same
|
||||
position is affected by multiple edits.
|
||||
"""
|
||||
function _is_valid(v::Variant)
|
||||
isempty(v.ref) && return false
|
||||
valid_positions = 1:length(v.ref)
|
||||
last_was_insert = false
|
||||
for edit in v.edits
|
||||
pos = edit.pos
|
||||
op = edit.x
|
||||
# Sanity check: for this to be a valid variant, it must be comprised of valid
|
||||
# variations
|
||||
_is_valid(Variation(v.ref, edit)) || return false
|
||||
|
||||
# For substitutions we simply do not allow another modification of the same base
|
||||
if op isa Substitution
|
||||
pos in valid_positions || return false
|
||||
valid_positions = (first(valid_positions) + 1):last(valid_positions)
|
||||
last_was_insert = false
|
||||
# Insertions affect 0 reference bases, so it does not modify the valid positions
|
||||
# for next op. However, we cannot have two insertions at the same position, because
|
||||
# then the order of them is ambiguous
|
||||
elseif op isa Insertion
|
||||
pos in
|
||||
((first(valid_positions) - 1 + last_was_insert):(last(valid_positions) + 1)) ||
|
||||
return false
|
||||
last_was_insert = true
|
||||
# Deletions obviously invalidate the reference bases that are deleted.
|
||||
elseif op isa Deletion
|
||||
len = length(op)
|
||||
pos in (first(valid_positions):(last(valid_positions) - len + 1)) ||
|
||||
return false
|
||||
valid_positions = (first(valid_positions) + len):last(valid_positions)
|
||||
last_was_insert = false
|
||||
end
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
function Variant(
|
||||
aln::PairwiseAlignment{T,T}
|
||||
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
|
||||
ref = aln.b
|
||||
E = eltype(typeof(ref))
|
||||
edits = Edit{T,E}[]
|
||||
refpos = first(aln.a.aln.anchors).refpos
|
||||
seqpos = first(aln.a.aln.anchors).seqpos
|
||||
markpos = 0
|
||||
n_gaps = n_ins = 0
|
||||
insertion_buffer = E[]
|
||||
for (seqi, refi) in aln
|
||||
isgap(refi) || (refpos += 1)
|
||||
isgap(seqi) || (seqpos += 1)
|
||||
|
||||
# Check for deletions
|
||||
if isgap(seqi)
|
||||
iszero(n_gaps) && (markpos = refpos)
|
||||
n_gaps += 1
|
||||
elseif !iszero(n_gaps)
|
||||
push!(edits, Edit{T,E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
||||
n_gaps = 0
|
||||
end
|
||||
|
||||
# Check for insertions
|
||||
if isgap(refi)
|
||||
iszero(n_ins) && (markpos = refpos + 1)
|
||||
push!(insertion_buffer, seqi)
|
||||
n_ins += 1
|
||||
elseif !iszero(n_ins)
|
||||
seq = T(insertion_buffer)
|
||||
push!(edits, Edit{T,E}(Insertion(seq), UInt(markpos)))
|
||||
empty!(insertion_buffer)
|
||||
n_ins = 0
|
||||
end
|
||||
|
||||
# Substitutions
|
||||
if !isgap(refi) && !isgap(seqi) && seqi != refi
|
||||
push!(edits, Edit{T,E}(Substitution{E}(seqi), UInt(refpos)))
|
||||
end
|
||||
end
|
||||
|
||||
# Check for clips at the end of the alignment
|
||||
last_anchors = aln.a.aln.anchors[(end - 1):end]
|
||||
|
||||
# Final indel, if applicable
|
||||
if !any(anchor -> anchor.op == OP_SOFT_CLIP, last_anchors)
|
||||
if !iszero(n_gaps)
|
||||
push!(edits, Edit{T,E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
||||
elseif !iszero(n_ins)
|
||||
push!(edits, Edit{T,E}(Insertion(T(insertion_buffer)), UInt(markpos)))
|
||||
end
|
||||
end
|
||||
|
||||
return Variant(ref, edits)
|
||||
end
|
||||
|
||||
"""
|
||||
_edits(v::Variant)
|
||||
|
||||
Gets the [`Edit`](@ref)s that comprise `v`
|
||||
"""
|
||||
_edits(v::Variant) = v.edits
|
||||
|
||||
"""
|
||||
reference(v::Variant)
|
||||
|
||||
Gets the reference sequence of `v`.
|
||||
"""
|
||||
reference(v::Variant) = v.ref
|
||||
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits
|
||||
|
||||
"""
|
||||
reconstruct!(seq::S, x::Variant{S}) where {S}
|
||||
|
||||
Apply the edits in `x` to `seq` and return the mutated sequence
|
||||
"""
|
||||
function reconstruct!(seq::S, x::Variant{S}) where {S}
|
||||
len = length(x.ref) + sum(edit -> _lendiff(edit), _edits(x))
|
||||
resize!(seq, len % UInt)
|
||||
refpos = seqpos = 1
|
||||
for edit in x.edits
|
||||
while refpos < edit.pos
|
||||
seq[seqpos] = x.ref[refpos]
|
||||
refpos += 1
|
||||
seqpos += 1
|
||||
end
|
||||
editx = edit.x
|
||||
if editx isa Substitution
|
||||
seq[seqpos] = editx.x
|
||||
seqpos += 1
|
||||
refpos += 1
|
||||
elseif editx isa Deletion
|
||||
refpos += editx.len
|
||||
elseif editx isa Insertion
|
||||
for i in editx.x
|
||||
seq[seqpos] = i
|
||||
seqpos += 1
|
||||
end
|
||||
end
|
||||
end
|
||||
while seqpos ≤ length(seq)
|
||||
seq[seqpos] = x.ref[refpos]
|
||||
refpos += 1
|
||||
seqpos += 1
|
||||
end
|
||||
return seq
|
||||
end
|
206
src/Variation.jl
206
src/Variation.jl
|
@ -1,206 +0,0 @@
|
|||
"""
|
||||
Variation{S<:BioSequence,T<:BioSymbol}
|
||||
|
||||
A single change to a biological sequence. A general wrapper that can represent a
|
||||
sequence-specific [`Substitution`](@ref), [`Deletion`](@ref) or [`Insertion`](@ref).
|
||||
`Variation` is more robust than [`Edit`](@ref), due to inclusion of the reference sequence
|
||||
and built-in validation.
|
||||
|
||||
# Constructors
|
||||
|
||||
Variation(ref::S, e::Edit{S,T}) where {S<:BioSequence,T<:BioSymbol}
|
||||
Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
|
||||
|
||||
Generally speaking, the `Edit` constructor should be avoided to ensure corectness: use of
|
||||
[`variations(::Variant)`](@ref) is encouraged, instead.
|
||||
|
||||
Constructing a `Variation` from an `AbstractString` will parse the from `edit` using the
|
||||
following syntax:
|
||||
- Substitution: `"<REFBASE><POS><ALTBASE>"`, e.g. `"G16C"`
|
||||
- Deletion: `"Δ<STARTPOS>-<ENDPOS>"`, e.g. `"Δ1-2"`
|
||||
- Insertion: `"<POS><ALTBASES>"`, e.g. `"11T"`
|
||||
"""
|
||||
struct Variation{S<:BioSequence,T<:BioSymbol}
|
||||
ref::S
|
||||
edit::Edit{S,T}
|
||||
|
||||
function Variation{S,T}(
|
||||
ref::S, e::Edit{S,T}, ::Unsafe
|
||||
) where {S<:BioSequence,T<:BioSymbol}
|
||||
return new(ref, e)
|
||||
end
|
||||
end
|
||||
|
||||
function Variation{S,T}(ref::S, e::Edit{S,T}) where {S<:BioSequence,T<:BioSymbol}
|
||||
v = Variation{S,T}(ref, e, Unsafe())
|
||||
return _is_valid(v) ? v : throw(ArgumentError("Invalid variant"))
|
||||
end
|
||||
|
||||
Variation(ref::S, edit::Edit{S,T}) where {S,T} = Variation{S,T}(ref, edit)
|
||||
|
||||
function Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
|
||||
T = eltype(ref)
|
||||
|
||||
e = parse(Edit{S,T}, edit)
|
||||
return Variation{S,T}(ref, e)
|
||||
end
|
||||
|
||||
function Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
||||
edits = _edit.(vars)
|
||||
return Variant{S,T}(ref, edits)
|
||||
end
|
||||
|
||||
"""
|
||||
reference(v::Variation)
|
||||
|
||||
Gets the reference sequence of `v`
|
||||
"""
|
||||
reference(v::Variation) = v.ref
|
||||
|
||||
"""
|
||||
_edit(v::Variation)
|
||||
|
||||
Gets the underlying [`Edit`](@ref) of `v`
|
||||
"""
|
||||
_edit(v::Variation) = v.edit
|
||||
|
||||
"""
|
||||
mutation(v::Variation)
|
||||
|
||||
Gets the underlying [`Substitution`](@ref), [`Insertion`](@ref), or [`Deletion`](@ref) of
|
||||
`v`.
|
||||
"""
|
||||
mutation(v::Variation) = _mutation(_edit(v))
|
||||
BioGenerics.leftposition(v::Variation) = leftposition(_edit(v))
|
||||
BioGenerics.rightposition(v::Variation) = rightposition(_edit(v))
|
||||
Base.:(==)(x::Variation, y::Variation) = x.ref == y.ref && x.edit == y.edit
|
||||
Base.hash(x::Variation, h::UInt) = hash(Variation, hash((x.ref, x.edit), h))
|
||||
|
||||
"""
|
||||
_is_valid(v::Variation)
|
||||
|
||||
Validate `v`. `v` is invalid if its opertation is out of bounds.
|
||||
"""
|
||||
function _is_valid(v::Variation)
|
||||
isempty(v.ref) && return false
|
||||
op = v.edit.x
|
||||
pos = v.edit.pos
|
||||
if op isa Substitution
|
||||
return pos in eachindex(v.ref)
|
||||
elseif op isa Insertion
|
||||
return pos in 0:(lastindex(v.ref) + 1)
|
||||
elseif op isa Deletion
|
||||
return pos in 1:(lastindex(v.ref) - length(op) + 1)
|
||||
end
|
||||
end
|
||||
|
||||
function Base.show(io::IO, x::Variation)
|
||||
content = x.edit.x
|
||||
pos = x.edit.pos
|
||||
if content isa Substitution
|
||||
print(io, x.ref[pos], pos, content.x)
|
||||
elseif content isa Deletion
|
||||
print(io, 'Δ', pos, '-', pos + content.len - 1)
|
||||
elseif content isa Insertion
|
||||
print(io, pos, content.seq)
|
||||
else
|
||||
print(io, pos, content.x)
|
||||
end
|
||||
end
|
||||
|
||||
function Base.in(v::Variation, var::Variant)
|
||||
if v.ref != var.ref
|
||||
error("References must be equal")
|
||||
end
|
||||
return any(v.edit == edit for edit in var.edits)
|
||||
end
|
||||
|
||||
"""
|
||||
translate(var::Variation{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}
|
||||
|
||||
Convert the difference in `var` to a new reference sequence based upon `aln`. `aln` is the
|
||||
alignment of the old reference (`aln.b`) and the new reference sequence (`aln.seq`). Returns
|
||||
the new [`Variation`](@ref).
|
||||
"""
|
||||
function translate(var::Variation{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}
|
||||
kind = mutation(var)
|
||||
pos = leftposition(var)
|
||||
seq = sequence(aln)
|
||||
ref = aln.b
|
||||
|
||||
# Special case: Insertions may have a pos of 0, which cannot be mapped to
|
||||
# the seq using ref2seq
|
||||
if iszero(pos)
|
||||
(s, r), _ = iterate(aln)
|
||||
(isgap(s) | isgap(r)) && return Inapplicable()
|
||||
return Variation{S,T}(seq, Edit{S,T}(Insertion(var.edit.x), 0))
|
||||
end
|
||||
|
||||
(seqpos, op) = BA.ref2seq(aln, pos)
|
||||
if kind isa Substitution
|
||||
# If it's a substitution, return nothing if it maps to a deleted
|
||||
# position, or substitutes to same base.
|
||||
op in (BA.OP_MATCH, BA.OP_SEQ_MATCH, BA.OP_SEQ_MISMATCH) || return nothing
|
||||
seq[seqpos] == kind.x && return nothing
|
||||
edit = Edit{S,T}(kind, seqpos)
|
||||
return Variation{S,T}(seq, edit, Unsafe())
|
||||
elseif kind isa Deletion
|
||||
# If it's a deletion, return nothing if the deleted part is already missing
|
||||
# from the new reference.
|
||||
(stop, op2) = BA.ref2seq(aln, pos + length(kind) - 1)
|
||||
start = seqpos + op == BA.OP_DELETE
|
||||
start < stop && return nothing
|
||||
edit = Edit{S,T}(Deletion(stop - start + 1), start)
|
||||
return Variation{S,T}(seq, edit, Unsafe())
|
||||
else
|
||||
# If it maps directly to a symbol, just insert
|
||||
if op in (BA.OP_MATCH, BA.OP_SEQ_MATCH, BA.OP_SEQ_MISMATCH)
|
||||
# This happens if there is already an insertion at the position
|
||||
if pos != lastindex(ref) && first(ref2seq(aln, pos + 1)) != seqpos + 1
|
||||
return Inapplicable()
|
||||
else
|
||||
edit = Edit{S,T}(Insertion(var.edit.x), seqpos)
|
||||
return Variation{S,T}(seq, edit, Unsafe())
|
||||
end
|
||||
# Alternatively, it can map to a deletion. In that case, it become really
|
||||
# tricky to talk about the "same" insertion.
|
||||
else
|
||||
return Inapplicable()
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
"""
|
||||
variations(v::Variant{S,T}) where {S,T}
|
||||
|
||||
Converts the [`Edit`](@ref)s of `v` into a vector of [`Variation`](@ref)s.
|
||||
"""
|
||||
function variations(v::Variant{S,T}) where {S,T}
|
||||
vs = Vector{Variation{S,T}}(undef, length(_edits(v)))
|
||||
for (i, e) in enumerate(_edits(v))
|
||||
vs[i] = Variation{S,T}(reference(v), e)
|
||||
end
|
||||
return vs
|
||||
end
|
||||
|
||||
"""
|
||||
refbases(v::Variation)
|
||||
|
||||
Get the reference bases of `v`. Note that for deletions, `refbases` also returns the base
|
||||
_before_ the deletion in accordance with the `REF` field of the
|
||||
[VCF v4 specification](https://samtools.github.io/hts-specs/VCFv4.3.pdf).
|
||||
"""
|
||||
function refbases(v::Variation)
|
||||
return _refbases(mutation(v), reference(v), leftposition(v))
|
||||
end
|
||||
|
||||
"""
|
||||
altbases(v::Variation)
|
||||
|
||||
Get the alternate bases of `v`. Note that for insertions, `altbases` also returns the base
|
||||
_before_ the insertion in accordance with the `ALT` field of the
|
||||
[VCF v4 specification](https://samtools.github.io/hts-specs/VCFv4.3.pdf).
|
||||
"""
|
||||
function altbases(v::Variation)
|
||||
return _altbases(mutation(v), reference(v), leftposition(v))
|
||||
end
|
|
@ -1,35 +0,0 @@
|
|||
"""
|
||||
Deletion
|
||||
|
||||
Represents the deletion of N symbols. The location of the deletion is stored
|
||||
outside this struct
|
||||
"""
|
||||
struct Deletion
|
||||
len::UInt
|
||||
|
||||
function Deletion(len::UInt)
|
||||
iszero(len) && error("Deletion must be at least 1 symbol")
|
||||
return new(len)
|
||||
end
|
||||
end
|
||||
Deletion(x::Integer) = Deletion(convert(UInt, x))
|
||||
|
||||
Base.length(x::Deletion) = Int(x.len)
|
||||
Base.:(==)(x::Substitution, y::Substitution) = length(x) == length(y)
|
||||
Base.hash(x::Deletion, h::UInt) = hash(Deletion, hash(x.len, h))
|
||||
|
||||
function _refbases(d::Deletion, reference::S, pos::UInt) where {S<:BioSequence}
|
||||
if pos == 1
|
||||
return S(reference[UnitRange{Int}(pos, pos + length(d))])
|
||||
else
|
||||
return S(reference[UnitRange{Int}(pos - 1, pos + length(d) - 1)])
|
||||
end
|
||||
end
|
||||
|
||||
function _altbases(::Deletion, reference::S, pos::UInt) where {S<:BioSequence}
|
||||
if pos == 1
|
||||
return S([reference[pos + 1]])
|
||||
else
|
||||
return S([reference[pos - 1]])
|
||||
end
|
||||
end
|
|
@ -1,31 +0,0 @@
|
|||
"""
|
||||
Insertion{S <: BioSequence}
|
||||
|
||||
Represents the insertion of a `S` into a sequence. The location of the insertion
|
||||
is stored outside the struct.
|
||||
"""
|
||||
struct Insertion{S<:BioSequence}
|
||||
seq::S
|
||||
|
||||
function Insertion{S}(x::S) where {S<:BioSequence}
|
||||
isempty(x) && error("Insertion must be at least 1 symbol")
|
||||
return new(x)
|
||||
end
|
||||
end
|
||||
Insertion(s::BioSequence) = Insertion{typeof(s)}(s)
|
||||
|
||||
Base.length(x::Insertion) = length(x.seq)
|
||||
Base.:(==)(x::Insertion, y::Insertion) = x.seq == y.seq
|
||||
Base.hash(x::Insertion, h::UInt) = hash(Insertion, hash(x.seq, h))
|
||||
|
||||
function _refbases(::Insertion, reference::S, pos::UInt) where {S<:BioSequence}
|
||||
return S([reference[pos]])
|
||||
end
|
||||
|
||||
function _altbases(i::Insertion, reference::S, pos::UInt) where {S<:BioSequence}
|
||||
if pos == 1
|
||||
return S([i.seq..., reference[pos]])
|
||||
else
|
||||
return S([reference[pos], i.seq...])
|
||||
end
|
||||
end
|
|
@ -1,21 +0,0 @@
|
|||
"""
|
||||
Substitution
|
||||
|
||||
Represents the presence of a `T` at a given position. The position is stored
|
||||
outside this struct.
|
||||
"""
|
||||
struct Substitution{T<:BioSymbol}
|
||||
x::T
|
||||
end
|
||||
|
||||
Base.length(::Substitution) = 1
|
||||
Base.:(==)(x::Substitution, y::Substitution) = x.x == y.x
|
||||
Base.hash(x::Substitution, h::UInt) = hash(Substitution, hash(x.x, h))
|
||||
|
||||
function _refbases(::Substitution, reference::S, pos::UInt) where {S<:BioSequence}
|
||||
return S([reference[pos]])
|
||||
end
|
||||
|
||||
function _altbases(s::Substitution, ::S, pos::UInt) where {S<:BioSequence}
|
||||
return S([s.x])
|
||||
end
|
|
@ -23,13 +23,12 @@ TODO now:
|
|||
* Add tests
|
||||
"""
|
||||
|
||||
using Aqua
|
||||
using BioAlignments
|
||||
using BioSequences
|
||||
using SequenceVariation
|
||||
using Test
|
||||
|
||||
const DNA_MODEL = BioAlignments.AffineGapScoreModel(EDNAFULL; gap_open=-25, gap_extend=-2)
|
||||
const DNA_MODEL = BioAlignments.AffineGapScoreModel(EDNAFULL, gap_open=-25, gap_extend=-2)
|
||||
|
||||
align(a::BioSequence, b::BioSequence) = pairalign(GlobalAlignment(), a, b, DNA_MODEL).aln
|
||||
seq1 = ungap!(dna"--ATGCGTGTTAGCAAC--TTATCGCG")
|
||||
|
@ -112,40 +111,17 @@ end
|
|||
refseq = dna"GATTACA"
|
||||
mutseq = dna"GATTACAAAA"
|
||||
|
||||
refvar = Variant(refseq, SequenceVariation.Edit{typeof(refseq),eltype(refseq)}[])
|
||||
refvar = Variant(refseq, SequenceVariation.Edit{typeof(refseq), eltype(refseq)}[])
|
||||
|
||||
# Test for ending soft clip
|
||||
@test Variant(
|
||||
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S", 1, 1)), refseq)
|
||||
) == refvar
|
||||
@test Variant(PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S", 1, 1)), refseq)) == refvar
|
||||
|
||||
# Test for ending soft+hard clip
|
||||
@test Variant(
|
||||
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S2H", 1, 1)), refseq)
|
||||
) == refvar
|
||||
@test Variant(PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S2H", 1, 1)), refseq)) == refvar
|
||||
|
||||
# Test that ending insertions are still valid
|
||||
@test length(
|
||||
Variant(
|
||||
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3I", 1, 1)), refseq)
|
||||
).edits,
|
||||
) == 1
|
||||
@test length(Variant(PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3I", 1, 1)), refseq)).edits) == 1
|
||||
|
||||
# Test that out-of-bounds bases are still caught
|
||||
@test_throws BoundsError Variant(
|
||||
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3X", 1, 1)), refseq)
|
||||
)
|
||||
end
|
||||
|
||||
@testset "Aqua" begin
|
||||
Aqua.test_ambiguities(SequenceVariation; recursive=false)
|
||||
# TODO: Refactor `Edit` so that this test doesn't fail
|
||||
# TODO: This test _should_ be set to @test_fails, but Aqua's syntax doesn't allow that
|
||||
# Aqua.test_unbound_args(SequenceVariation)
|
||||
Aqua.test_undefined_exports(SequenceVariation)
|
||||
Aqua.test_piracy(SequenceVariation)
|
||||
Aqua.test_project_extras(SequenceVariation)
|
||||
Aqua.test_stale_deps(SequenceVariation)
|
||||
Aqua.test_deps_compat(SequenceVariation)
|
||||
Aqua.test_project_toml_formatting(SequenceVariation)
|
||||
@test_throws BoundsError Variant(PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3X", 1, 1)), refseq))
|
||||
end
|
||||
|
|
Loading…
Reference in a new issue