mirror of
https://github.com/MillironX/SequenceVariation.jl.git
synced 2024-11-22 13:29:56 +00:00
Compare commits
No commits in common. "3ef4a92ba268faa509547bb9774d070aba38cdd5" and "73d2781ee945ebc64f6b5599ef0838f4cab585e9" have entirely different histories.
3ef4a92ba2
...
73d2781ee9
21 changed files with 531 additions and 898 deletions
|
@ -1 +0,0 @@
|
||||||
style = "blue"
|
|
2
.github/workflows/UnitTests.yml
vendored
2
.github/workflows/UnitTests.yml
vendored
|
@ -19,6 +19,8 @@ jobs:
|
||||||
- x64
|
- x64
|
||||||
os:
|
os:
|
||||||
- ubuntu-latest
|
- ubuntu-latest
|
||||||
|
- windows-latest
|
||||||
|
- macOS-latest
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
|
|
29
.github/workflows/linting.yml
vendored
29
.github/workflows/linting.yml
vendored
|
@ -1,29 +0,0 @@
|
||||||
name: Code format checker
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- master
|
|
||||||
pull_request:
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
lint:
|
|
||||||
name: JuliaFormatter
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v3
|
|
||||||
- uses: julia-actions/setup-julia@v1
|
|
||||||
with:
|
|
||||||
version: "1"
|
|
||||||
- run: |
|
|
||||||
julia -e 'using Pkg; Pkg.add(PackageSpec(name="JuliaFormatter"))'
|
|
||||||
julia -e 'using JuliaFormatter; format(".", verbose=true)'
|
|
||||||
- run: |
|
|
||||||
julia -e '
|
|
||||||
out = Cmd(`git diff --name-only`) |> read |> String
|
|
||||||
if out == ""
|
|
||||||
exit(0)
|
|
||||||
else
|
|
||||||
@error "Some files have not been formatted !!!"
|
|
||||||
write(stdout, out)
|
|
||||||
exit(1)
|
|
||||||
end'
|
|
21
CHANGELOG.md
21
CHANGELOG.md
|
@ -7,34 +7,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||||
|
|
||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
### Added
|
### [0.1.4] - 2022-12-17
|
||||||
|
|
||||||
- Tutorial-type documentation ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28))
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
|
|
||||||
- Code now follows [Blue style](https://github.com/invenia/BlueStyle) ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28))
|
|
||||||
- :bomb: [BREAKING] Public and private API defined based on Blue style guidelines ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28))
|
|
||||||
|
|
||||||
### Removed
|
|
||||||
|
|
||||||
- Windows and MacOS CI tests ([#28](https://github.com/BioJulia/SequenceVariation.jl/pull/28))
|
|
||||||
|
|
||||||
## [0.1.4] - 2022-12-17
|
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
|
||||||
- Soft clips at end of alignment causing invalid `Variant`s ([#25](https://github.com/BioJulia/SequenceVariation.jl/issues/25)/[#26](https://github.com/BioJulia/SequenceVariation.jl/pull/26))
|
- Soft clips at end of alignment causing invalid `Variant`s ([#25](https://github.com/BioJulia/SequenceVariation.jl/issues/25)/[#26](https://github.com/BioJulia/SequenceVariation.jl/pull/26))
|
||||||
|
|
||||||
## [0.1.3] - 2022-11-22
|
### [0.1.3] - 2022-11-22
|
||||||
|
|
||||||
### Changed
|
## Changed
|
||||||
|
|
||||||
- Variations getter now returns type-parameterized vector ([#23](https://github.com/BioJulia/SequenceVariation.jl/pull/23))
|
- Variations getter now returns type-parameterized vector ([#23](https://github.com/BioJulia/SequenceVariation.jl/pull/23))
|
||||||
|
|
||||||
## [0.1.2] - 2022-10-04
|
## [0.1.2] - 2022-10-04
|
||||||
|
|
||||||
### Changed
|
## Changed
|
||||||
|
|
||||||
- Updated dependency compats ([#21](https://github.com/BioJulia/SequenceVariation.jl/pull/21))
|
- Updated dependency compats ([#21](https://github.com/BioJulia/SequenceVariation.jl/pull/21))
|
||||||
- BioAlignments: 2 -> 2,3
|
- BioAlignments: 2 -> 2,3
|
||||||
|
|
|
@ -10,7 +10,6 @@ BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
|
||||||
BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
|
BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
|
||||||
|
|
||||||
[compat]
|
[compat]
|
||||||
Aqua = "0.6.0"
|
|
||||||
BioAlignments = "2,3"
|
BioAlignments = "2,3"
|
||||||
BioGenerics = "0.1"
|
BioGenerics = "0.1"
|
||||||
BioSequences = "2,3"
|
BioSequences = "2,3"
|
||||||
|
@ -18,10 +17,9 @@ BioSymbols = "4,5"
|
||||||
julia = "1.6"
|
julia = "1.6"
|
||||||
|
|
||||||
[extras]
|
[extras]
|
||||||
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
|
|
||||||
BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"
|
BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"
|
||||||
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
|
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
|
||||||
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
|
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
|
||||||
|
|
||||||
[targets]
|
[targets]
|
||||||
test = ["Aqua", "Test", "BioSequences", "BioAlignments"]
|
test = ["Test", "BioSequences", "BioAlignments"]
|
||||||
|
|
|
@ -30,7 +30,6 @@ SequenceVariation is tested against Julia `1.X` on Linux, OS X, and Windows.
|
||||||
[![Unit Tests](https://github.com/BioJulia/SequenceVariation.jl/actions/workflows/UnitTests.yml/badge.svg?branch=master)](https://github.com/BioJulia/SequenceVariation.jl/actions/workflows/UnitTests.yml)
|
[![Unit Tests](https://github.com/BioJulia/SequenceVariation.jl/actions/workflows/UnitTests.yml/badge.svg?branch=master)](https://github.com/BioJulia/SequenceVariation.jl/actions/workflows/UnitTests.yml)
|
||||||
[![Documentation](https://github.com/BioJulia/SequenceVariation.jl/workflows/Documentation/badge.svg?branch=master)](https://github.com/BioJulia/SequenceVariation.jl/actions?query=workflow%3ADocumentation+branch%3Amaster)
|
[![Documentation](https://github.com/BioJulia/SequenceVariation.jl/workflows/Documentation/badge.svg?branch=master)](https://github.com/BioJulia/SequenceVariation.jl/actions?query=workflow%3ADocumentation+branch%3Amaster)
|
||||||
[![codecov](https://codecov.io/gh/BioJulia/SequenceVariation.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/BioJulia/SequenceVariation.jl)
|
[![codecov](https://codecov.io/gh/BioJulia/SequenceVariation.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/BioJulia/SequenceVariation.jl)
|
||||||
[![Aqua QA](https://raw.githubusercontent.com/JuliaTesting/Aqua.jl/master/badge.svg)](https://github.com/JuliaTesting/Aqua.jl)
|
|
||||||
|
|
||||||
## Contributing
|
## Contributing
|
||||||
|
|
||||||
|
|
|
@ -1,16 +1,6 @@
|
||||||
[deps]
|
[deps]
|
||||||
BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"
|
|
||||||
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
|
|
||||||
BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
|
|
||||||
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
|
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
|
||||||
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
|
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
|
||||||
Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
|
|
||||||
SequenceVariation = "eef6e190-9969-4f06-a38f-35a110a8fdc8"
|
|
||||||
|
|
||||||
[compat]
|
[compat]
|
||||||
BioAlignments = "3"
|
|
||||||
BioSequences = "3"
|
|
||||||
BioSymbols = "5"
|
|
||||||
Documenter = "0.27"
|
Documenter = "0.27"
|
||||||
Revise = "3.4"
|
|
||||||
julia = "1.6"
|
|
||||||
|
|
31
docs/make.jl
31
docs/make.jl
|
@ -1,31 +1,22 @@
|
||||||
using Pkg
|
using Pkg
|
||||||
using Documenter
|
using Documenter
|
||||||
using SequenceVariation
|
using SequenceVariation
|
||||||
using Revise
|
|
||||||
|
|
||||||
# see https://github.com/tlienart/LiveServer.jl/issues/140#issuecomment-1271591251
|
|
||||||
Revise.revise()
|
|
||||||
|
|
||||||
makedocs(;
|
makedocs(;
|
||||||
checkdocs=:exports,
|
checkdocs = :exports,
|
||||||
linkcheck=true,
|
linkcheck = true,
|
||||||
sitename="SequenceVariation.jl",
|
sitename = "SequenceVariation.jl",
|
||||||
format=Documenter.HTML(),
|
format = Documenter.HTML(),
|
||||||
modules=[SequenceVariation],
|
modules = [SequenceVariation],
|
||||||
pages=[
|
pages = [
|
||||||
"Home" => "index.md",
|
"Home" => "index.md",
|
||||||
"Working with variants" => "variants.md",
|
|
||||||
"Working with variations" => "variations.md",
|
|
||||||
"Comparing variations" => "compare.md",
|
|
||||||
"API Reference" => "api.md",
|
"API Reference" => "api.md",
|
||||||
],
|
],
|
||||||
authors=replace(
|
authors = replace(join(Pkg.TOML.parsefile("Project.toml")["authors"], ", "), r" <.*?>" => "" ) * ", The BioJulia Organisation, and other contributors."
|
||||||
join(Pkg.TOML.parsefile("Project.toml")["authors"], ", "), r" <.*?>" => ""
|
|
||||||
) * ", The BioJulia Organisation, and other contributors.",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
deploydocs(;
|
deploydocs(
|
||||||
repo="github.com/BioJulia/SequenceVariation.jl.git",
|
repo = "github.com/BioJulia/SequenceVariation.jl.git",
|
||||||
devbranch="master",
|
devbranch = "master",
|
||||||
push_preview=true,
|
push_preview = true,
|
||||||
)
|
)
|
||||||
|
|
|
@ -7,54 +7,6 @@ end
|
||||||
|
|
||||||
# API Reference
|
# API Reference
|
||||||
|
|
||||||
## Edits
|
```@autodocs
|
||||||
|
Modules = [SequenceVariation]
|
||||||
```@docs
|
|
||||||
Substitution
|
|
||||||
Deletion
|
|
||||||
Insertion
|
|
||||||
```
|
|
||||||
|
|
||||||
## Variants
|
|
||||||
|
|
||||||
```@docs
|
|
||||||
Variant
|
|
||||||
reference(::Variant)
|
|
||||||
variations
|
|
||||||
reconstruct!
|
|
||||||
```
|
|
||||||
|
|
||||||
## Variations
|
|
||||||
|
|
||||||
```@docs
|
|
||||||
Variation
|
|
||||||
reference(::Variation)
|
|
||||||
mutation
|
|
||||||
translate
|
|
||||||
refbases
|
|
||||||
altbases
|
|
||||||
```
|
|
||||||
|
|
||||||
## Private API
|
|
||||||
|
|
||||||
### Edits
|
|
||||||
|
|
||||||
```@docs
|
|
||||||
Edit
|
|
||||||
_mutation
|
|
||||||
_lendiff
|
|
||||||
```
|
|
||||||
|
|
||||||
### Variants
|
|
||||||
|
|
||||||
```@docs
|
|
||||||
_edits
|
|
||||||
_is_valid(::Variant)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Variations
|
|
||||||
|
|
||||||
```@docs
|
|
||||||
_edit
|
|
||||||
_is_valid(::Variation)
|
|
||||||
```
|
```
|
||||||
|
|
|
@ -1,47 +0,0 @@
|
||||||
```@meta
|
|
||||||
CurrentModule = SequenceVariation
|
|
||||||
```
|
|
||||||
|
|
||||||
# Comparing variations in sequences
|
|
||||||
|
|
||||||
## Checking for variations in a known variant
|
|
||||||
|
|
||||||
Looking for a known [`Variation`](@ref) within a [`Variant`](@ref) is
|
|
||||||
efficiently accomplished using the `in` operator.
|
|
||||||
|
|
||||||
```@setup call_variants
|
|
||||||
using SequenceVariation, BioAlignments, BioSequences
|
|
||||||
|
|
||||||
bovine = dna"GACCGGCTGCATTCGAGGCTGCCAGCAAGCAG";
|
|
||||||
ovine = dna"GACCGGCTGCATTCGAGGCTGTCAGCAAACAG";
|
|
||||||
human = dna"GACAGGCTGCATCAGAAGAGGCCATCAAGCAG";
|
|
||||||
|
|
||||||
bos_ovis_alignment =
|
|
||||||
PairwiseAlignment(AlignedSequence(ovine, Alignment("32M", 1, 1)), bovine);
|
|
||||||
bos_human_alignment =
|
|
||||||
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
|
|
||||||
|
|
||||||
bos_ovis_variant = Variant(bos_ovis_alignment)
|
|
||||||
bos_human_variant = Variant(bos_human_alignment)
|
|
||||||
```
|
|
||||||
|
|
||||||
```@example call_variants
|
|
||||||
println("\tOvis aires\tHomo sapiens")
|
|
||||||
for v in vcat(variations(bos_ovis_variant), variations(bos_human_variant))
|
|
||||||
is_sheep = v in bos_ovis_variant
|
|
||||||
is_human = v in bos_human_variant
|
|
||||||
println("$v\t$is_sheep\t\t$is_human")
|
|
||||||
end
|
|
||||||
```
|
|
||||||
|
|
||||||
## Constructing new variants based on other variations
|
|
||||||
|
|
||||||
New variants can be constructed using variations. This might be useful to pool
|
|
||||||
variations found on different reads or to filter variations from a variant
|
|
||||||
that aren't validated by another variant.
|
|
||||||
|
|
||||||
```@repl call_variants
|
|
||||||
sheeple = vcat(variations(bos_ovis_variant), variations(bos_human_variant));
|
|
||||||
Variant(bovine, sheeple)
|
|
||||||
reconstruct!(bovine, ans)
|
|
||||||
```
|
|
|
@ -18,7 +18,7 @@ You can install SequenceVariation from the [Julia REPL](https://docs.julialang.o
|
||||||
Press `]` to enter [pkg mode](https://docs.julialang.org/en/v1/stdlib/Pkg/), and enter the following:
|
Press `]` to enter [pkg mode](https://docs.julialang.org/en/v1/stdlib/Pkg/), and enter the following:
|
||||||
|
|
||||||
```julia
|
```julia
|
||||||
add SequenceVariation
|
add SequenceVariation.jl
|
||||||
```
|
```
|
||||||
|
|
||||||
## Testing
|
## Testing
|
||||||
|
@ -30,7 +30,6 @@ SequenceVariation is tested against Julia `1.X` on Linux, OS X, and Windows.
|
||||||
[![Unit Tests](https://github.com/BioJulia/SequenceVariation.jl/actions/workflows/UnitTests.yml/badge.svg?branch=master)](https://github.com/BioJulia/SequenceVariation.jl/actions/workflows/UnitTests.yml)
|
[![Unit Tests](https://github.com/BioJulia/SequenceVariation.jl/actions/workflows/UnitTests.yml/badge.svg?branch=master)](https://github.com/BioJulia/SequenceVariation.jl/actions/workflows/UnitTests.yml)
|
||||||
[![Documentation](https://github.com/BioJulia/SequenceVariation.jl/workflows/Documentation/badge.svg?branch=master)](https://github.com/BioJulia/SequenceVariation.jl/actions?query=workflow%3ADocumentation+branch%3Amaster)
|
[![Documentation](https://github.com/BioJulia/SequenceVariation.jl/workflows/Documentation/badge.svg?branch=master)](https://github.com/BioJulia/SequenceVariation.jl/actions?query=workflow%3ADocumentation+branch%3Amaster)
|
||||||
[![codecov](https://codecov.io/gh/BioJulia/SequenceVariation.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/BioJulia/SequenceVariation.jl)
|
[![codecov](https://codecov.io/gh/BioJulia/SequenceVariation.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/BioJulia/SequenceVariation.jl)
|
||||||
[![Aqua QA](https://raw.githubusercontent.com/JuliaTesting/Aqua.jl/master/badge.svg)](https://github.com/JuliaTesting/Aqua.jl)
|
|
||||||
|
|
||||||
## Contributing
|
## Contributing
|
||||||
|
|
||||||
|
|
|
@ -1,40 +0,0 @@
|
||||||
```@meta
|
|
||||||
CurrentModule = SequenceVariation
|
|
||||||
```
|
|
||||||
|
|
||||||
# Working with variants
|
|
||||||
|
|
||||||
## Calling variants
|
|
||||||
|
|
||||||
The first step in working with sequence variation is to identify (call)
|
|
||||||
variations. SequenceVariation can directly call variants using the
|
|
||||||
`Variant(::PairwiseAlignment)` constructor of the [`Variant`](@ref) type.
|
|
||||||
|
|
||||||
```@repl call_variants
|
|
||||||
using SequenceVariation, BioAlignments, BioSequences
|
|
||||||
|
|
||||||
bovine = dna"GACCGGCTGCATTCGAGGCTGCCAGCAAGCAG";
|
|
||||||
ovine = dna"GACCGGCTGCATTCGAGGCTGTCAGCAAACAG";
|
|
||||||
human = dna"GACAGGCTGCATCAGAAGAGGCCATCAAGCAG";
|
|
||||||
|
|
||||||
bos_ovis_alignment =
|
|
||||||
PairwiseAlignment(AlignedSequence(ovine, Alignment("32M", 1, 1)), bovine);
|
|
||||||
bos_human_alignment =
|
|
||||||
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
|
|
||||||
|
|
||||||
bos_ovis_variant = Variant(bos_ovis_alignment)
|
|
||||||
bos_human_variant = Variant(bos_human_alignment)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Sequence reconstruction
|
|
||||||
|
|
||||||
If the alternate sequence of a variant is no longer available (as is often the
|
|
||||||
case when calling variants from alignment files), then the sequence can be
|
|
||||||
retrieved using the [`reconstruct!`](@ref) function.
|
|
||||||
|
|
||||||
```@repl call_variants
|
|
||||||
human2 = copy(bovine);
|
|
||||||
reconstruct!(human2, bos_human_variant)
|
|
||||||
human2 == bovine
|
|
||||||
human2 == human
|
|
||||||
```
|
|
|
@ -1,66 +0,0 @@
|
||||||
```@meta
|
|
||||||
CurrentModule = SequenceVariation
|
|
||||||
```
|
|
||||||
|
|
||||||
# Working with individual variations
|
|
||||||
|
|
||||||
## Construction
|
|
||||||
|
|
||||||
Individual [`Variation`](@ref)s can be made using a reference sequence and
|
|
||||||
string syntax
|
|
||||||
|
|
||||||
| Variation type | Syntax | Interpretation | Example |
|
|
||||||
|:--- |:--- |:--- |:--- |
|
|
||||||
| Substitutions | `<REF><POS><ALT>` | `<ALT>` is substituted for `<REF>` in position `<POS>` | `"G16C"` |
|
|
||||||
| Deletions | `Δ<START>-<END>` | All bases (inclusive) between `<START>` and `<END>` are deleted. It is valid to have `<START>` equal `<END>`: that is a deletion of one base. | `"Δ1-2"` |
|
|
||||||
| Insertions | `<POS><ALT>` | `<ALT>` is inserted between positions `<POS>` and `<POS>+1` | `"11T"` |
|
|
||||||
|
|
||||||
```@repl
|
|
||||||
using BioSequences: @dna_str
|
|
||||||
using SequenceVariation
|
|
||||||
bovine_ins = dna"GACCGGCTGCATTCGAGGCTGCCAGCAAGCAG"
|
|
||||||
Variation(bovine_ins, "C4A")
|
|
||||||
mutation(ans)
|
|
||||||
typeof(mutation(Variation(bovine_ins, "Δ13-14")))
|
|
||||||
mutation(Variation(bovine_ins, "25ACA"))
|
|
||||||
```
|
|
||||||
|
|
||||||
## Extraction
|
|
||||||
|
|
||||||
Sequence variations may also be extracted wholesale from a [`Variant`](@ref)
|
|
||||||
using the [`variations`](@ref) function.
|
|
||||||
|
|
||||||
```@setup call_variants
|
|
||||||
using SequenceVariation, BioAlignments, BioSequences
|
|
||||||
|
|
||||||
bovine = dna"GACCGGCTGCATTCGAGGCTGCCAGCAAGCAG";
|
|
||||||
ovine = dna"GACCGGCTGCATTCGAGGCTGTCAGCAAACAG";
|
|
||||||
human = dna"GACAGGCTGCATCAGAAGAGGCCATCAAGCAG";
|
|
||||||
|
|
||||||
bos_ovis_alignment =
|
|
||||||
PairwiseAlignment(AlignedSequence(ovine, Alignment("32M", 1, 1)), bovine);
|
|
||||||
bos_human_alignment =
|
|
||||||
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), bovine);
|
|
||||||
|
|
||||||
bos_ovis_variant = Variant(bos_ovis_alignment)
|
|
||||||
bos_human_variant = Variant(bos_human_alignment)
|
|
||||||
```
|
|
||||||
|
|
||||||
```@repl call_variants
|
|
||||||
variations(bos_ovis_variant)
|
|
||||||
variations(bos_human_variant)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Reference switching
|
|
||||||
|
|
||||||
An individual variation can be mapped to a new reference sequence given an
|
|
||||||
alignment between the new and old references using the [`translate`](@ref).
|
|
||||||
|
|
||||||
```@repl call_variants
|
|
||||||
ovis_human_alignment =
|
|
||||||
PairwiseAlignment(AlignedSequence(human, Alignment("32M", 1, 1)), ovine)
|
|
||||||
human_variation = first(variations(bos_ovis_variant))
|
|
||||||
reference(ans) == bovine
|
|
||||||
SequenceVariation.translate(human_variation, ovis_human_alignment)
|
|
||||||
reference(ans) == bovine
|
|
||||||
```
|
|
75
src/Edit.jl
75
src/Edit.jl
|
@ -1,75 +0,0 @@
|
||||||
include("edits/Substitution.jl")
|
|
||||||
include("edits/Deletion.jl")
|
|
||||||
include("edits/Insertion.jl")
|
|
||||||
|
|
||||||
"""
|
|
||||||
Edit{S <: BioSequence, T <: BioSymbol}
|
|
||||||
|
|
||||||
An edit of either `Substitution{T}`, `Insertion{S}` or `Deletion` at a position.
|
|
||||||
If deletion: Deletion of length L at ref pos `pos:pos+L-1`
|
|
||||||
If insertion: Insertion of length L b/w ref pos `pos:pos+1`
|
|
||||||
"""
|
|
||||||
struct Edit{S<:BioSequence,T<:BioSymbol}
|
|
||||||
x::Union{Substitution{T},Deletion,Insertion{S}}
|
|
||||||
pos::UInt
|
|
||||||
end
|
|
||||||
|
|
||||||
Base.length(e::Edit) = length(_mutation(e))
|
|
||||||
Base.:(==)(e1::Edit, e2::Edit) = e1.pos == e2.pos && e1.x == e2.x
|
|
||||||
Base.hash(x::Edit, h::UInt) = hash(Edit, hash((x.x, x.pos), h))
|
|
||||||
|
|
||||||
function Base.parse(::Type{T}, s::AbstractString) where {T<:Edit{Se,Sy}} where {Se,Sy}
|
|
||||||
return parse(T, String(s))
|
|
||||||
end
|
|
||||||
|
|
||||||
function Base.parse(::Type{<:Edit{Se,Sy}}, s::Union{String,SubString{String}}) where {Se,Sy}
|
|
||||||
# Either "Δ1-2", "11T" or "G16C"
|
|
||||||
if (m = match(r"^Δ(\d+)-(\d+)$", s); m) !== nothing
|
|
||||||
pos = parse(UInt, m[1])
|
|
||||||
stop = parse(UInt, m[2])
|
|
||||||
stop ≥ pos || throw(ArgumentError("Non-positive deletion length: \"" * s * "\""))
|
|
||||||
Edit{Se,Sy}(Deletion(stop - pos + 1), pos)
|
|
||||||
elseif (m = match(r"^(\d+)([A-Za-z]+)$", s); m) !== nothing
|
|
||||||
pos = parse(UInt, m[1])
|
|
||||||
seq = Se(m[2])
|
|
||||||
Edit{Se,Sy}(Insertion(seq), pos)
|
|
||||||
elseif (m = match(r"^[A-Za-z](\d+)([A-Za-z])$", s); m) !== nothing
|
|
||||||
pos = parse(UInt, m[1])
|
|
||||||
sym = Sy(first(m[2]))
|
|
||||||
Edit{Se,Sy}(Substitution(sym), pos)
|
|
||||||
else
|
|
||||||
throw(ArgumentError("Failed to parse edit \"" * s * '"'))
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
"""
|
|
||||||
_mutation(e::Edit)
|
|
||||||
|
|
||||||
Returns the underlying [`Substitution`](@ref), [`Insertion`](@ref), or [`Deletion`](@ref) of
|
|
||||||
`e`.
|
|
||||||
"""
|
|
||||||
_mutation(e::Edit) = e.x
|
|
||||||
BioGenerics.leftposition(e::Edit) = e.pos
|
|
||||||
function BioGenerics.rightposition(e::Edit)
|
|
||||||
if _mutation(e) isa Substitution
|
|
||||||
return leftposition(e)
|
|
||||||
elseif _mutation(e) isa Insertion
|
|
||||||
return leftposition(e) + 1
|
|
||||||
elseif _mutation(e) isa Deletion
|
|
||||||
return leftposition(e) + length(e) - 1
|
|
||||||
else
|
|
||||||
error("Unknown mutation type $(typeof(_mutation(e)))")
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
"""
|
|
||||||
_lendiff(edit::Edit)
|
|
||||||
|
|
||||||
Gets the number of bases that `edit` adds to the reference sequence
|
|
||||||
"""
|
|
||||||
function _lendiff(edit::Edit)
|
|
||||||
x = _mutation(edit)
|
|
||||||
# Each edit type has logic for its length, we just need to know what direction to go
|
|
||||||
multiplier = x isa Substitution ? 0 : (x isa Deletion ? -1 : 1)
|
|
||||||
return length(x) * multiplier
|
|
||||||
end
|
|
|
@ -20,32 +20,519 @@ TODO now:
|
||||||
* Add tests
|
* Add tests
|
||||||
"""
|
"""
|
||||||
|
|
||||||
using BioAlignments: BioAlignments, PairwiseAlignment, OP_SOFT_CLIP, sequence
|
using BioAlignments: BioAlignments, PairwiseAlignment, OP_SOFT_CLIP
|
||||||
using BioGenerics: BioGenerics, leftposition, rightposition
|
using BioGenerics: BioGenerics, leftposition, rightposition
|
||||||
using BioSequences: BioSequences, BioSequence, NucleotideSeq, LongSequence, isgap
|
using BioSequences: BioSequences, BioSequence, NucleotideSeq, LongSequence, isgap
|
||||||
using BioSymbols: BioSymbol
|
using BioSymbols: BioSymbol
|
||||||
|
|
||||||
export Deletion
|
|
||||||
export Insertion
|
|
||||||
export Substitution
|
|
||||||
export Variant
|
|
||||||
export Variation
|
|
||||||
export altbases
|
|
||||||
export mutation
|
|
||||||
export reconstruct!
|
|
||||||
export refbases
|
|
||||||
export reference
|
|
||||||
export translate
|
|
||||||
export variations
|
|
||||||
|
|
||||||
const BA = BioAlignments
|
const BA = BioAlignments
|
||||||
const BS = BioSequences
|
const BS = BioSequences
|
||||||
|
|
||||||
|
#=
|
||||||
|
import Automa
|
||||||
|
import Automa.RegExp: @re_str
|
||||||
|
=#
|
||||||
|
|
||||||
struct Unsafe end
|
struct Unsafe end
|
||||||
struct Inapplicable end
|
struct Inapplicable end
|
||||||
|
|
||||||
include("Edit.jl")
|
#=
|
||||||
include("Variant.jl")
|
const CONTEXT = Automa.CodeGenContext(
|
||||||
include("Variation.jl")
|
vars=Automa.Variables(:p, :p_end, :p_eof, :ts, :te, :cs, :data, :mem, :byte),
|
||||||
|
generator=:goto,
|
||||||
|
checkbounds=false
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
const BYTES = Union{String, SubString{String}, Vector{UInt8}}
|
||||||
|
=#
|
||||||
|
|
||||||
|
"""
|
||||||
|
Substitution
|
||||||
|
|
||||||
|
Represents the presence of a `T` at a given position. The position is stored
|
||||||
|
outside this struct.
|
||||||
|
"""
|
||||||
|
struct Substitution{T <: BioSymbol}
|
||||||
|
x::T
|
||||||
|
end
|
||||||
|
Base.:(==)(x::Substitution, y::Substitution) = x.x == y.x
|
||||||
|
Base.hash(x::Substitution, h::UInt) = hash(Substitution, hash(x.x, h))
|
||||||
|
|
||||||
|
"""
|
||||||
|
Deletion
|
||||||
|
|
||||||
|
Represents the deletion of N symbols. The location of the deletion is stored
|
||||||
|
outside this struct
|
||||||
|
"""
|
||||||
|
struct Deletion
|
||||||
|
len::UInt
|
||||||
|
|
||||||
|
function Deletion(len::UInt)
|
||||||
|
iszero(len) && error("Deletion must be at least 1 symbol")
|
||||||
|
new(len)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
Deletion(x::Integer) = Deletion(convert(UInt, x))
|
||||||
|
Base.length(x::Deletion) = Int(x.len)
|
||||||
|
Base.hash(x::Deletion, h::UInt) = hash(Deletion, hash(x.len, h))
|
||||||
|
|
||||||
|
"""
|
||||||
|
Insertion{S <: BioSequence}
|
||||||
|
|
||||||
|
Represents the insertion of a `S` into a sequence. The location of the insertion
|
||||||
|
is stored outside the struct.
|
||||||
|
"""
|
||||||
|
struct Insertion{S <: BioSequence}
|
||||||
|
seq::S
|
||||||
|
|
||||||
|
function Insertion{S}(x::S) where {S <: BioSequence}
|
||||||
|
isempty(x) && error("Insertion must be at least 1 symbol")
|
||||||
|
new(x)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
Insertion(s::BioSequence) = Insertion{typeof(s)}(s)
|
||||||
|
Base.length(x::Insertion) = length(x.seq)
|
||||||
|
Base.:(==)(x::Insertion, y::Insertion) = x.seq == y.seq
|
||||||
|
Base.hash(x::Insertion, h::UInt) = hash(Insertion, hash(x.seq, h))
|
||||||
|
|
||||||
|
"""
|
||||||
|
Edit{S <: BioSequence, T <: BioSymbol}
|
||||||
|
|
||||||
|
An edit of either `Substitution{T}`, `Insertion{S}` or `Deletion` at a position.
|
||||||
|
If deletion: Deletion of length L at ref pos `pos:pos+L-1`
|
||||||
|
If insertion: Insertion of length L b/w ref pos `pos:pos+1`
|
||||||
|
"""
|
||||||
|
struct Edit{S <: BioSequence, T <: BioSymbol}
|
||||||
|
x::Union{Substitution{T}, Deletion, Insertion{S}}
|
||||||
|
pos::UInt
|
||||||
|
end
|
||||||
|
Base.:(==)(e1::Edit, e2::Edit) = e1.pos == e2.pos && e1.x == e2.x
|
||||||
|
Base.hash(x::Edit, h::UInt) = hash(Edit, hash((x.x, x.pos), h))
|
||||||
|
Base.length(e::Edit) = e isa Substitution ? 1 : length(mutation(e))
|
||||||
|
|
||||||
|
function Base.parse(::Type{T}, s::AbstractString) where {T <: Edit{Se, Sy}} where {Se, Sy}
|
||||||
|
parse(T, String(s))
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.parse(::Type{<:Edit{Se, Sy}}, s::Union{String, SubString{String}}) where {Se, Sy}
|
||||||
|
# Either "Δ1-2", "11T" or "G16C"
|
||||||
|
if (m = match(r"^Δ(\d+)-(\d+)$", s); m) !== nothing
|
||||||
|
pos = parse(UInt, m[1])
|
||||||
|
stop = parse(UInt, m[2])
|
||||||
|
stop ≥ pos || throw(ArgumentError("Non-positive deletion length: \"" * s * "\""))
|
||||||
|
Edit{Se, Sy}(Deletion(stop - pos + 1), pos)
|
||||||
|
elseif (m = match(r"^(\d+)([A-Za-z]+)$", s); m) !== nothing
|
||||||
|
pos = parse(UInt, m[1])
|
||||||
|
seq = Se(m[2])
|
||||||
|
Edit{Se, Sy}(Insertion(seq), pos)
|
||||||
|
elseif (m = match(r"^[A-Za-z](\d+)([A-Za-z])$", s); m) !== nothing
|
||||||
|
pos = parse(UInt, m[1])
|
||||||
|
sym = Sy(first(m[2]))
|
||||||
|
Edit{Se, Sy}(Substitution(sym), pos)
|
||||||
|
else
|
||||||
|
throw(ArgumentError("Failed to parse edit \"" * s * '"'))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
mutation(e::Edit) = e.x
|
||||||
|
BioGenerics.leftposition(e::Edit) = e.pos
|
||||||
|
function BioGenerics.rightposition(e::Edit)
|
||||||
|
if mutation(e) isa Substitution
|
||||||
|
return leftposition(e)
|
||||||
|
elseif mutation(e) isa Insertion
|
||||||
|
return leftposition(e) + 1
|
||||||
|
elseif mutation(e) isa Deletion
|
||||||
|
return leftposition(e) + length(e) - 1
|
||||||
|
else
|
||||||
|
error("Unknown mutation type $(typeof(mutation(e)))")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
#=
|
||||||
|
@noinline throw_parse_error(T, p::Integer) = error("Failed to parse $T at byte $p")
|
||||||
|
|
||||||
|
# Parse substitution
|
||||||
|
let
|
||||||
|
machine = let
|
||||||
|
biosymbol = re"[A-Za-z]"
|
||||||
|
number = re"[0-9]+"
|
||||||
|
|
||||||
|
biosymbol.actions[:enter] = [:enter]
|
||||||
|
number.actions[:all] = [:digit]
|
||||||
|
|
||||||
|
Automa.compile(biosymbol * number * biosymbol)
|
||||||
|
end
|
||||||
|
actions = Dict(
|
||||||
|
:enter => quote
|
||||||
|
symbol = T(Char(byte))
|
||||||
|
if firstsymbol === nothing
|
||||||
|
firstsymbol = symbol
|
||||||
|
else
|
||||||
|
lastsymbol = symbol
|
||||||
|
end
|
||||||
|
end,
|
||||||
|
:digit => :(num = UInt(10)*num + (byte - 0x30) % UInt),
|
||||||
|
)
|
||||||
|
@eval begin
|
||||||
|
function Base.parse(::Type{Edit{S, T}}, data::BYTES) where {S, T}
|
||||||
|
$(Automa.generate_init_code(CONTEXT, machine))
|
||||||
|
p_eof = p_end = sizeof(data)
|
||||||
|
firstsymbol = lastsymbol = nothing
|
||||||
|
num = UInt(0)
|
||||||
|
$(Automa.generate_exec_code(CONTEXT, machine, actions))
|
||||||
|
iszero(cs) || throw_parse_error(Edit{S, T}, p)
|
||||||
|
if firstsymbol == lastsymbol
|
||||||
|
error("First symbol and last symbol are identical")
|
||||||
|
end
|
||||||
|
return Edit{S, T}(Substitution{T}(lastsymbol), num)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
=#
|
||||||
|
|
||||||
|
# Edits are applied sequentially from first to last pos.
|
||||||
|
# The vector must always be sorted by pos.
|
||||||
|
struct Variant{S <: BioSequence, T <: BioSymbol}
|
||||||
|
ref::S
|
||||||
|
edits::Vector{Edit{S, T}}
|
||||||
|
|
||||||
|
Variant{S, T}(ref::S, edits::Vector{Edit{S, T}}, ::Unsafe) where {S, T} = new(ref, edits)
|
||||||
|
end
|
||||||
|
|
||||||
|
function Variant{S,T}(ref::S, edits::Vector{Edit{S, T}}) where {S<:BioSequence, T<:BioSymbol}
|
||||||
|
sort!(edits, by=x -> x.pos)
|
||||||
|
result = Variant{S, T}(ref, edits, Unsafe())
|
||||||
|
is_valid(result) || error("TODO") # report what kind of error message?
|
||||||
|
return result
|
||||||
|
end
|
||||||
|
|
||||||
|
function Variant(ref::S, edits::Vector{Edit{S, T}}) where {S<:BioSequence, T<:BioSymbol}
|
||||||
|
Variant{S, T}(ref, edits)
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
function Base.show(io::IO, x::Variant)
|
||||||
|
n = length(x.edits)
|
||||||
|
print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):")
|
||||||
|
for i in x.edits
|
||||||
|
v = Variation(x.ref, i)
|
||||||
|
print(io, "\n ")
|
||||||
|
show(io, v)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Validate:
|
||||||
|
# A sequence is invalid if any of its operations are out of bounds, or the same position
|
||||||
|
# is affected by multiple edits.
|
||||||
|
function is_valid(v::Variant)
|
||||||
|
isempty(v.ref) && return false
|
||||||
|
valid_positions = 1:length(v.ref)
|
||||||
|
last_was_insert = false
|
||||||
|
for edit in v.edits
|
||||||
|
pos = edit.pos
|
||||||
|
op = edit.x
|
||||||
|
# Sanity check: for this to be a valid variant, it must be comprised of valid
|
||||||
|
# variations
|
||||||
|
is_valid(Variation(v.ref, edit)) || return false
|
||||||
|
|
||||||
|
# For substitutions we simply do not allow another modification of the same base
|
||||||
|
if op isa Substitution
|
||||||
|
pos in valid_positions || return false
|
||||||
|
valid_positions = first(valid_positions) + 1 : last(valid_positions)
|
||||||
|
last_was_insert = false
|
||||||
|
# Insertions affect 0 reference bases, so it does not modify the valid positions
|
||||||
|
# for next op. However, we cannot have two insertions at the same position, because
|
||||||
|
# then the order of them is ambiguous
|
||||||
|
elseif op isa Insertion
|
||||||
|
pos in (first(valid_positions)-1+last_was_insert:last(valid_positions)+1) || return false
|
||||||
|
last_was_insert = true
|
||||||
|
# Deletions obviously invalidate the reference bases that are deleted.
|
||||||
|
elseif op isa Deletion
|
||||||
|
len = length(op)
|
||||||
|
pos in (first(valid_positions):last(valid_positions)-len+1) || return false
|
||||||
|
valid_positions = first(valid_positions) + len : last(valid_positions)
|
||||||
|
last_was_insert = false
|
||||||
|
end
|
||||||
|
end
|
||||||
|
return true
|
||||||
|
end
|
||||||
|
|
||||||
|
function Variant(aln::PairwiseAlignment{T, T}) where {T <: LongSequence{<:Union{BS.AminoAcidAlphabet, BS.NucleicAcidAlphabet}}}
|
||||||
|
ref = aln.b
|
||||||
|
E = eltype(typeof(ref))
|
||||||
|
edits = Edit{T, E}[]
|
||||||
|
refpos = first(aln.a.aln.anchors).refpos
|
||||||
|
seqpos = first(aln.a.aln.anchors).seqpos
|
||||||
|
markpos = 0
|
||||||
|
n_gaps = n_ins = 0
|
||||||
|
insertion_buffer = E[]
|
||||||
|
for (seqi, refi) in aln
|
||||||
|
isgap(refi) || (refpos += 1)
|
||||||
|
isgap(seqi) || (seqpos += 1)
|
||||||
|
|
||||||
|
# Check for deletions
|
||||||
|
if isgap(seqi)
|
||||||
|
iszero(n_gaps) && (markpos = refpos)
|
||||||
|
n_gaps += 1
|
||||||
|
elseif !iszero(n_gaps)
|
||||||
|
push!(edits, Edit{T, E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
||||||
|
n_gaps = 0
|
||||||
|
end
|
||||||
|
|
||||||
|
# Check for insertions
|
||||||
|
if isgap(refi)
|
||||||
|
iszero(n_ins) && (markpos = refpos + 1)
|
||||||
|
push!(insertion_buffer, seqi)
|
||||||
|
n_ins += 1
|
||||||
|
elseif !iszero(n_ins)
|
||||||
|
seq = T(insertion_buffer)
|
||||||
|
push!(edits, Edit{T, E}(Insertion(seq), UInt(markpos)))
|
||||||
|
empty!(insertion_buffer)
|
||||||
|
n_ins = 0
|
||||||
|
end
|
||||||
|
|
||||||
|
# Substitutions
|
||||||
|
if !isgap(refi) && !isgap(seqi) && seqi != refi
|
||||||
|
push!(edits, Edit{T, E}(Substitution{E}(seqi), UInt(refpos)))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Check for clips at the end of the alignment
|
||||||
|
last_anchors = aln.a.aln.anchors[end-1:end]
|
||||||
|
|
||||||
|
# Final indel, if applicable
|
||||||
|
if !any(anchor -> anchor.op == OP_SOFT_CLIP, last_anchors)
|
||||||
|
if !iszero(n_gaps)
|
||||||
|
push!(edits, Edit{T, E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
||||||
|
elseif !iszero(n_ins)
|
||||||
|
push!(edits, Edit{T, E}(Insertion(T(insertion_buffer)), UInt(markpos)))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
return Variant(ref, edits)
|
||||||
|
end
|
||||||
|
|
||||||
|
edits(v::Variant) = v.edits
|
||||||
|
reference(v::Variant) = v.ref
|
||||||
|
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits
|
||||||
|
|
||||||
|
function lendiff(edit::Edit)
|
||||||
|
x = edit.x
|
||||||
|
x isa Substitution ? 0 : (x isa Deletion ? -length(x) : length(x.x))
|
||||||
|
end
|
||||||
|
|
||||||
|
function reconstruct!(seq::S, x::Variant{S}) where S
|
||||||
|
len = length(x.ref) + sum(edit -> lendiff(edit), x.edits)
|
||||||
|
resize!(seq, len % UInt)
|
||||||
|
refpos = seqpos = 1
|
||||||
|
for edit in x.edits
|
||||||
|
while refpos < edit.pos
|
||||||
|
seq[seqpos] = x.ref[refpos]
|
||||||
|
refpos += 1
|
||||||
|
seqpos += 1
|
||||||
|
end
|
||||||
|
editx = edit.x
|
||||||
|
if editx isa Substitution
|
||||||
|
seq[seqpos] = editx.x
|
||||||
|
seqpos += 1
|
||||||
|
refpos += 1
|
||||||
|
elseif editx isa Deletion
|
||||||
|
refpos += editx.len
|
||||||
|
elseif editx isa Insertion
|
||||||
|
for i in editx.x
|
||||||
|
seq[seqpos] = i
|
||||||
|
seqpos += 1
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
while seqpos ≤ length(seq)
|
||||||
|
seq[seqpos] = x.ref[refpos]
|
||||||
|
refpos += 1
|
||||||
|
seqpos += 1
|
||||||
|
end
|
||||||
|
seq
|
||||||
|
end
|
||||||
|
|
||||||
|
struct Variation{S <: BioSequence, T <: BioSymbol}
|
||||||
|
ref::S
|
||||||
|
edit::Edit{S, T}
|
||||||
|
|
||||||
|
function Variation{S, T}(ref::S, e::Edit{S, T}, ::Unsafe) where {S <: BioSequence, T <: BioSymbol}
|
||||||
|
new(ref, e)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function Variation{S, T}(ref::S, e::Edit{S, T}) where {S <: BioSequence, T <: BioSymbol}
|
||||||
|
v = Variation{S, T}(ref, e, Unsafe())
|
||||||
|
is_valid(v) ? v : throw(ArgumentError("Invalid variant"))
|
||||||
|
end
|
||||||
|
|
||||||
|
Variation(ref::S, edit::Edit{S, T}) where {S, T} = Variation{S, T}(ref, edit)
|
||||||
|
|
||||||
|
function Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
|
||||||
|
T = eltype(ref)
|
||||||
|
|
||||||
|
e = parse(Edit{S,T}, edit)
|
||||||
|
return Variation{S,T}(ref, e)
|
||||||
|
end
|
||||||
|
|
||||||
|
function Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence, T<:BioSymbol}
|
||||||
|
edits = edit.(vars)
|
||||||
|
return Variant{S, T}(ref, edits)
|
||||||
|
end
|
||||||
|
|
||||||
|
reference(v::Variation) = v.ref
|
||||||
|
edit(v::Variation) = v.edit
|
||||||
|
mutation(v::Variation) = mutation(edit(v))
|
||||||
|
BioGenerics.leftposition(v::Variation) = leftposition(edit(v))
|
||||||
|
BioGenerics.rightposition(v::Variation) = rightposition(edit(v))
|
||||||
|
Base.:(==)(x::Variation, y::Variation) = x.ref == y.ref && x.edit == y.edit
|
||||||
|
Base.hash(x::Variation, h::UInt) = hash(Variation, hash((x.ref, x.edit), h))
|
||||||
|
|
||||||
|
function is_valid(v::Variation)
|
||||||
|
isempty(v.ref) && return false
|
||||||
|
op = v.edit.x
|
||||||
|
pos = v.edit.pos
|
||||||
|
if op isa Substitution
|
||||||
|
return pos in eachindex(v.ref)
|
||||||
|
elseif op isa Insertion
|
||||||
|
return pos in 0:lastindex(v.ref)+1
|
||||||
|
elseif op isa Deletion
|
||||||
|
return pos in 1:(lastindex(v.ref)-length(op) + 1)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.show(io::IO, x::Variation)
|
||||||
|
content = x.edit.x
|
||||||
|
pos = x.edit.pos
|
||||||
|
if content isa Substitution
|
||||||
|
print(io, x.ref[pos], pos, content.x)
|
||||||
|
elseif content isa Deletion
|
||||||
|
print(io, 'Δ', pos, '-', pos + content.len - 1)
|
||||||
|
elseif content isa Insertion
|
||||||
|
print(io, pos, content.seq)
|
||||||
|
else
|
||||||
|
print(io, pos, content.x)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.in(v::Variation, var::Variant)
|
||||||
|
if v.ref != var.ref
|
||||||
|
error("References must be equal")
|
||||||
|
end
|
||||||
|
any(v.edit == edit for edit in var.edits)
|
||||||
|
end
|
||||||
|
|
||||||
|
function translate(var::Variation{S, T}, aln::PairwiseAlignment{S, S}) where {S, T}
|
||||||
|
kind = var.edit.x
|
||||||
|
pos = var.edit.pos
|
||||||
|
seq, ref = aln.seq, aln.b
|
||||||
|
|
||||||
|
# Special case: Insertions may have a pos of 0, which cannot be mapped to
|
||||||
|
# the seq using ref2seq
|
||||||
|
if iszero(pos)
|
||||||
|
(s, r), _ = iterate(aln)
|
||||||
|
(isgap(s) | isgap(r)) && return Inapplicable()
|
||||||
|
return Variation{S, T}(seq, Edit{S, T}(Insertion(var.edit.x), 0))
|
||||||
|
end
|
||||||
|
|
||||||
|
(seqpos, op) = BA.ref2seq(aln, pos)
|
||||||
|
if kind isa Substitution
|
||||||
|
# If it's a substitution, return nothing if it maps to a deleted
|
||||||
|
# position, or substitutes to same base.
|
||||||
|
op in (BA.OP_MATCH, BA.OP_SEQ_MATCH, BA.OP_SEQ_MISMATCH) || return nothing
|
||||||
|
seq[seqpos] == kind.x && return nothing
|
||||||
|
edit = Edit{S, T}(kind, seqpos)
|
||||||
|
return Variation{S, T}(seq, edit, Unsafe())
|
||||||
|
elseif kind isa Deletion
|
||||||
|
# If it's a deletion, return nothing if the deleted part is already missing
|
||||||
|
# from the new reference.
|
||||||
|
(stop, op2) = BA.ref2seq(aln, pos + length(kind) - 1)
|
||||||
|
start = seqpos + op == BA.OP_DELETE
|
||||||
|
start < stop && return nothing
|
||||||
|
edit = Edit{S, T}(Deletion(stop - start + 1), start)
|
||||||
|
return Variation{S, T}(seq, edit, Unsafe())
|
||||||
|
else
|
||||||
|
# If it maps directly to a symbol, just insert
|
||||||
|
if op in (BA.OP_MATCH, BA.OP_SEQ_MATCH, BA.OP_SEQ_MISMATCH)
|
||||||
|
# This happens if there is already an insertion at the position
|
||||||
|
if pos != lastindex(ref) && first(ref2seq(aln, pos+1)) != seqpos + 1
|
||||||
|
return Inapplicable()
|
||||||
|
else
|
||||||
|
edit = Edit{S, T}(Insertion(var.edit.x), seqpos)
|
||||||
|
return Variation{S, T}(seq, edit, Unsafe())
|
||||||
|
end
|
||||||
|
# Alternatively, it can map to a deletion. In that case, it become really
|
||||||
|
# tricky to talk about the "same" insertion.
|
||||||
|
else
|
||||||
|
return Inapplicable()
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function variations(v::Variant{S,T}) where {S,T}
|
||||||
|
vs = Vector{Variation{S,T}}(undef, length(edits(v)))
|
||||||
|
for (i, e) in enumerate(edits(v))
|
||||||
|
vs[i] = Variation{S,T}(reference(v), e)
|
||||||
|
end
|
||||||
|
return vs
|
||||||
|
end
|
||||||
|
|
||||||
|
function _refbases(s::Substitution, reference::S, pos::UInt) where S <: BioSequence
|
||||||
|
return S([reference[pos]])
|
||||||
|
end
|
||||||
|
|
||||||
|
function _altbases(s::Substitution, reference::S, pos::UInt) where S <: BioSequence
|
||||||
|
return S([s.x])
|
||||||
|
end
|
||||||
|
|
||||||
|
function _refbases(d::Deletion, reference::S, pos::UInt) where S <: BioSequence
|
||||||
|
if pos == 1
|
||||||
|
return S(reference[UnitRange{Int}(pos, pos+length(d))])
|
||||||
|
else
|
||||||
|
return S(reference[UnitRange{Int}(pos-1, pos+length(d)-1)])
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function _altbases(d::Deletion, reference::S, pos::UInt) where S <: BioSequence
|
||||||
|
if pos == 1
|
||||||
|
return S([reference[pos+1]])
|
||||||
|
else
|
||||||
|
return S([reference[pos-1]])
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function _refbases(i::Insertion, reference::S, pos::UInt) where S <: BioSequence
|
||||||
|
return S([reference[pos]])
|
||||||
|
end
|
||||||
|
|
||||||
|
function _altbases(i::Insertion, reference::S, pos::UInt) where S <: BioSequence
|
||||||
|
if pos == 1
|
||||||
|
return S([i.seq..., reference[pos]])
|
||||||
|
else
|
||||||
|
return S([reference[pos], i.seq...])
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function refbases(v::Variation)
|
||||||
|
return _refbases(mutation(v), reference(v), leftposition(v))
|
||||||
|
end
|
||||||
|
|
||||||
|
function altbases(v::Variation)
|
||||||
|
return _altbases(mutation(v), reference(v), leftposition(v))
|
||||||
|
end
|
||||||
|
|
||||||
|
export Insertion,
|
||||||
|
Deletion,
|
||||||
|
Substitution,
|
||||||
|
Variant,
|
||||||
|
Variation,
|
||||||
|
reference,
|
||||||
|
mutation,
|
||||||
|
variations,
|
||||||
|
refbases,
|
||||||
|
altbases
|
||||||
|
|
||||||
end # module
|
end # module
|
||||||
|
|
197
src/Variant.jl
197
src/Variant.jl
|
@ -1,197 +0,0 @@
|
||||||
"""
|
|
||||||
Variant{S<:BioSequence,T<:BioSymbol}
|
|
||||||
|
|
||||||
A set of variations within a given sequence that are all found together. Depending on the
|
|
||||||
field, it might also be referred to as a "genotype," "haplotype," or "strain."
|
|
||||||
|
|
||||||
# Constructors
|
|
||||||
|
|
||||||
Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
|
||||||
Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
|
||||||
Variant(
|
|
||||||
aln::PairwiseAlignment{T,T}
|
|
||||||
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
|
|
||||||
|
|
||||||
When constructing a `Variant` from a vector of [`Edit`](@ref)s or [`Variation`](@ref)s, the
|
|
||||||
edits are applied sequentially from first to last position, therefore the vector must always
|
|
||||||
be sorted by position. These edits are sorted automatically if constructing from an
|
|
||||||
alignment.
|
|
||||||
"""
|
|
||||||
struct Variant{S<:BioSequence,T<:BioSymbol}
|
|
||||||
ref::S
|
|
||||||
edits::Vector{Edit{S,T}}
|
|
||||||
|
|
||||||
Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}, ::Unsafe) where {S,T} = new(ref, edits)
|
|
||||||
end
|
|
||||||
|
|
||||||
function Variant{S,T}(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
|
||||||
sort!(edits; by=x -> x.pos)
|
|
||||||
result = Variant{S,T}(ref, edits, Unsafe())
|
|
||||||
_is_valid(result) || error("TODO") # report what kind of error message?
|
|
||||||
return result
|
|
||||||
end
|
|
||||||
|
|
||||||
function Variant(ref::S, edits::Vector{Edit{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
|
||||||
return Variant{S,T}(ref, edits)
|
|
||||||
end
|
|
||||||
|
|
||||||
function Base.show(io::IO, x::Variant)
|
|
||||||
n = length(x.edits)
|
|
||||||
print(io, summary(x), " with $n edit$(n > 1 ? "s" : ""):")
|
|
||||||
for i in x.edits
|
|
||||||
v = Variation(x.ref, i)
|
|
||||||
print(io, "\n ")
|
|
||||||
show(io, v)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
"""
|
|
||||||
is_valid(v::Variant)
|
|
||||||
|
|
||||||
Validate `v`. `v` is invalid if any of its operations are out of bounds, or the same
|
|
||||||
position is affected by multiple edits.
|
|
||||||
"""
|
|
||||||
function _is_valid(v::Variant)
|
|
||||||
isempty(v.ref) && return false
|
|
||||||
valid_positions = 1:length(v.ref)
|
|
||||||
last_was_insert = false
|
|
||||||
for edit in v.edits
|
|
||||||
pos = edit.pos
|
|
||||||
op = edit.x
|
|
||||||
# Sanity check: for this to be a valid variant, it must be comprised of valid
|
|
||||||
# variations
|
|
||||||
_is_valid(Variation(v.ref, edit)) || return false
|
|
||||||
|
|
||||||
# For substitutions we simply do not allow another modification of the same base
|
|
||||||
if op isa Substitution
|
|
||||||
pos in valid_positions || return false
|
|
||||||
valid_positions = (first(valid_positions) + 1):last(valid_positions)
|
|
||||||
last_was_insert = false
|
|
||||||
# Insertions affect 0 reference bases, so it does not modify the valid positions
|
|
||||||
# for next op. However, we cannot have two insertions at the same position, because
|
|
||||||
# then the order of them is ambiguous
|
|
||||||
elseif op isa Insertion
|
|
||||||
pos in
|
|
||||||
((first(valid_positions) - 1 + last_was_insert):(last(valid_positions) + 1)) ||
|
|
||||||
return false
|
|
||||||
last_was_insert = true
|
|
||||||
# Deletions obviously invalidate the reference bases that are deleted.
|
|
||||||
elseif op isa Deletion
|
|
||||||
len = length(op)
|
|
||||||
pos in (first(valid_positions):(last(valid_positions) - len + 1)) ||
|
|
||||||
return false
|
|
||||||
valid_positions = (first(valid_positions) + len):last(valid_positions)
|
|
||||||
last_was_insert = false
|
|
||||||
end
|
|
||||||
end
|
|
||||||
return true
|
|
||||||
end
|
|
||||||
|
|
||||||
function Variant(
|
|
||||||
aln::PairwiseAlignment{T,T}
|
|
||||||
) where {T<:LongSequence{<:Union{BS.AminoAcidAlphabet,BS.NucleicAcidAlphabet}}}
|
|
||||||
ref = aln.b
|
|
||||||
E = eltype(typeof(ref))
|
|
||||||
edits = Edit{T,E}[]
|
|
||||||
refpos = first(aln.a.aln.anchors).refpos
|
|
||||||
seqpos = first(aln.a.aln.anchors).seqpos
|
|
||||||
markpos = 0
|
|
||||||
n_gaps = n_ins = 0
|
|
||||||
insertion_buffer = E[]
|
|
||||||
for (seqi, refi) in aln
|
|
||||||
isgap(refi) || (refpos += 1)
|
|
||||||
isgap(seqi) || (seqpos += 1)
|
|
||||||
|
|
||||||
# Check for deletions
|
|
||||||
if isgap(seqi)
|
|
||||||
iszero(n_gaps) && (markpos = refpos)
|
|
||||||
n_gaps += 1
|
|
||||||
elseif !iszero(n_gaps)
|
|
||||||
push!(edits, Edit{T,E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
|
||||||
n_gaps = 0
|
|
||||||
end
|
|
||||||
|
|
||||||
# Check for insertions
|
|
||||||
if isgap(refi)
|
|
||||||
iszero(n_ins) && (markpos = refpos + 1)
|
|
||||||
push!(insertion_buffer, seqi)
|
|
||||||
n_ins += 1
|
|
||||||
elseif !iszero(n_ins)
|
|
||||||
seq = T(insertion_buffer)
|
|
||||||
push!(edits, Edit{T,E}(Insertion(seq), UInt(markpos)))
|
|
||||||
empty!(insertion_buffer)
|
|
||||||
n_ins = 0
|
|
||||||
end
|
|
||||||
|
|
||||||
# Substitutions
|
|
||||||
if !isgap(refi) && !isgap(seqi) && seqi != refi
|
|
||||||
push!(edits, Edit{T,E}(Substitution{E}(seqi), UInt(refpos)))
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
# Check for clips at the end of the alignment
|
|
||||||
last_anchors = aln.a.aln.anchors[(end - 1):end]
|
|
||||||
|
|
||||||
# Final indel, if applicable
|
|
||||||
if !any(anchor -> anchor.op == OP_SOFT_CLIP, last_anchors)
|
|
||||||
if !iszero(n_gaps)
|
|
||||||
push!(edits, Edit{T,E}(Deletion(UInt(n_gaps)), UInt(markpos)))
|
|
||||||
elseif !iszero(n_ins)
|
|
||||||
push!(edits, Edit{T,E}(Insertion(T(insertion_buffer)), UInt(markpos)))
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
return Variant(ref, edits)
|
|
||||||
end
|
|
||||||
|
|
||||||
"""
|
|
||||||
_edits(v::Variant)
|
|
||||||
|
|
||||||
Gets the [`Edit`](@ref)s that comprise `v`
|
|
||||||
"""
|
|
||||||
_edits(v::Variant) = v.edits
|
|
||||||
|
|
||||||
"""
|
|
||||||
reference(v::Variant)
|
|
||||||
|
|
||||||
Gets the reference sequence of `v`.
|
|
||||||
"""
|
|
||||||
reference(v::Variant) = v.ref
|
|
||||||
Base.:(==)(x::Variant, y::Variant) = x.ref == y.ref && x.edits == y.edits
|
|
||||||
|
|
||||||
"""
|
|
||||||
reconstruct!(seq::S, x::Variant{S}) where {S}
|
|
||||||
|
|
||||||
Apply the edits in `x` to `seq` and return the mutated sequence
|
|
||||||
"""
|
|
||||||
function reconstruct!(seq::S, x::Variant{S}) where {S}
|
|
||||||
len = length(x.ref) + sum(edit -> _lendiff(edit), _edits(x))
|
|
||||||
resize!(seq, len % UInt)
|
|
||||||
refpos = seqpos = 1
|
|
||||||
for edit in x.edits
|
|
||||||
while refpos < edit.pos
|
|
||||||
seq[seqpos] = x.ref[refpos]
|
|
||||||
refpos += 1
|
|
||||||
seqpos += 1
|
|
||||||
end
|
|
||||||
editx = edit.x
|
|
||||||
if editx isa Substitution
|
|
||||||
seq[seqpos] = editx.x
|
|
||||||
seqpos += 1
|
|
||||||
refpos += 1
|
|
||||||
elseif editx isa Deletion
|
|
||||||
refpos += editx.len
|
|
||||||
elseif editx isa Insertion
|
|
||||||
for i in editx.x
|
|
||||||
seq[seqpos] = i
|
|
||||||
seqpos += 1
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
while seqpos ≤ length(seq)
|
|
||||||
seq[seqpos] = x.ref[refpos]
|
|
||||||
refpos += 1
|
|
||||||
seqpos += 1
|
|
||||||
end
|
|
||||||
return seq
|
|
||||||
end
|
|
206
src/Variation.jl
206
src/Variation.jl
|
@ -1,206 +0,0 @@
|
||||||
"""
|
|
||||||
Variation{S<:BioSequence,T<:BioSymbol}
|
|
||||||
|
|
||||||
A single change to a biological sequence. A general wrapper that can represent a
|
|
||||||
sequence-specific [`Substitution`](@ref), [`Deletion`](@ref) or [`Insertion`](@ref).
|
|
||||||
`Variation` is more robust than [`Edit`](@ref), due to inclusion of the reference sequence
|
|
||||||
and built-in validation.
|
|
||||||
|
|
||||||
# Constructors
|
|
||||||
|
|
||||||
Variation(ref::S, e::Edit{S,T}) where {S<:BioSequence,T<:BioSymbol}
|
|
||||||
Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
|
|
||||||
|
|
||||||
Generally speaking, the `Edit` constructor should be avoided to ensure corectness: use of
|
|
||||||
[`variations(::Variant)`](@ref) is encouraged, instead.
|
|
||||||
|
|
||||||
Constructing a `Variation` from an `AbstractString` will parse the from `edit` using the
|
|
||||||
following syntax:
|
|
||||||
- Substitution: `"<REFBASE><POS><ALTBASE>"`, e.g. `"G16C"`
|
|
||||||
- Deletion: `"Δ<STARTPOS>-<ENDPOS>"`, e.g. `"Δ1-2"`
|
|
||||||
- Insertion: `"<POS><ALTBASES>"`, e.g. `"11T"`
|
|
||||||
"""
|
|
||||||
struct Variation{S<:BioSequence,T<:BioSymbol}
|
|
||||||
ref::S
|
|
||||||
edit::Edit{S,T}
|
|
||||||
|
|
||||||
function Variation{S,T}(
|
|
||||||
ref::S, e::Edit{S,T}, ::Unsafe
|
|
||||||
) where {S<:BioSequence,T<:BioSymbol}
|
|
||||||
return new(ref, e)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
function Variation{S,T}(ref::S, e::Edit{S,T}) where {S<:BioSequence,T<:BioSymbol}
|
|
||||||
v = Variation{S,T}(ref, e, Unsafe())
|
|
||||||
return _is_valid(v) ? v : throw(ArgumentError("Invalid variant"))
|
|
||||||
end
|
|
||||||
|
|
||||||
Variation(ref::S, edit::Edit{S,T}) where {S,T} = Variation{S,T}(ref, edit)
|
|
||||||
|
|
||||||
function Variation(ref::S, edit::AbstractString) where {S<:BioSequence}
|
|
||||||
T = eltype(ref)
|
|
||||||
|
|
||||||
e = parse(Edit{S,T}, edit)
|
|
||||||
return Variation{S,T}(ref, e)
|
|
||||||
end
|
|
||||||
|
|
||||||
function Variant(ref::S, vars::Vector{Variation{S,T}}) where {S<:BioSequence,T<:BioSymbol}
|
|
||||||
edits = _edit.(vars)
|
|
||||||
return Variant{S,T}(ref, edits)
|
|
||||||
end
|
|
||||||
|
|
||||||
"""
|
|
||||||
reference(v::Variation)
|
|
||||||
|
|
||||||
Gets the reference sequence of `v`
|
|
||||||
"""
|
|
||||||
reference(v::Variation) = v.ref
|
|
||||||
|
|
||||||
"""
|
|
||||||
_edit(v::Variation)
|
|
||||||
|
|
||||||
Gets the underlying [`Edit`](@ref) of `v`
|
|
||||||
"""
|
|
||||||
_edit(v::Variation) = v.edit
|
|
||||||
|
|
||||||
"""
|
|
||||||
mutation(v::Variation)
|
|
||||||
|
|
||||||
Gets the underlying [`Substitution`](@ref), [`Insertion`](@ref), or [`Deletion`](@ref) of
|
|
||||||
`v`.
|
|
||||||
"""
|
|
||||||
mutation(v::Variation) = _mutation(_edit(v))
|
|
||||||
BioGenerics.leftposition(v::Variation) = leftposition(_edit(v))
|
|
||||||
BioGenerics.rightposition(v::Variation) = rightposition(_edit(v))
|
|
||||||
Base.:(==)(x::Variation, y::Variation) = x.ref == y.ref && x.edit == y.edit
|
|
||||||
Base.hash(x::Variation, h::UInt) = hash(Variation, hash((x.ref, x.edit), h))
|
|
||||||
|
|
||||||
"""
|
|
||||||
_is_valid(v::Variation)
|
|
||||||
|
|
||||||
Validate `v`. `v` is invalid if its opertation is out of bounds.
|
|
||||||
"""
|
|
||||||
function _is_valid(v::Variation)
|
|
||||||
isempty(v.ref) && return false
|
|
||||||
op = v.edit.x
|
|
||||||
pos = v.edit.pos
|
|
||||||
if op isa Substitution
|
|
||||||
return pos in eachindex(v.ref)
|
|
||||||
elseif op isa Insertion
|
|
||||||
return pos in 0:(lastindex(v.ref) + 1)
|
|
||||||
elseif op isa Deletion
|
|
||||||
return pos in 1:(lastindex(v.ref) - length(op) + 1)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
function Base.show(io::IO, x::Variation)
|
|
||||||
content = x.edit.x
|
|
||||||
pos = x.edit.pos
|
|
||||||
if content isa Substitution
|
|
||||||
print(io, x.ref[pos], pos, content.x)
|
|
||||||
elseif content isa Deletion
|
|
||||||
print(io, 'Δ', pos, '-', pos + content.len - 1)
|
|
||||||
elseif content isa Insertion
|
|
||||||
print(io, pos, content.seq)
|
|
||||||
else
|
|
||||||
print(io, pos, content.x)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
function Base.in(v::Variation, var::Variant)
|
|
||||||
if v.ref != var.ref
|
|
||||||
error("References must be equal")
|
|
||||||
end
|
|
||||||
return any(v.edit == edit for edit in var.edits)
|
|
||||||
end
|
|
||||||
|
|
||||||
"""
|
|
||||||
translate(var::Variation{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}
|
|
||||||
|
|
||||||
Convert the difference in `var` to a new reference sequence based upon `aln`. `aln` is the
|
|
||||||
alignment of the old reference (`aln.b`) and the new reference sequence (`aln.seq`). Returns
|
|
||||||
the new [`Variation`](@ref).
|
|
||||||
"""
|
|
||||||
function translate(var::Variation{S,T}, aln::PairwiseAlignment{S,S}) where {S,T}
|
|
||||||
kind = mutation(var)
|
|
||||||
pos = leftposition(var)
|
|
||||||
seq = sequence(aln)
|
|
||||||
ref = aln.b
|
|
||||||
|
|
||||||
# Special case: Insertions may have a pos of 0, which cannot be mapped to
|
|
||||||
# the seq using ref2seq
|
|
||||||
if iszero(pos)
|
|
||||||
(s, r), _ = iterate(aln)
|
|
||||||
(isgap(s) | isgap(r)) && return Inapplicable()
|
|
||||||
return Variation{S,T}(seq, Edit{S,T}(Insertion(var.edit.x), 0))
|
|
||||||
end
|
|
||||||
|
|
||||||
(seqpos, op) = BA.ref2seq(aln, pos)
|
|
||||||
if kind isa Substitution
|
|
||||||
# If it's a substitution, return nothing if it maps to a deleted
|
|
||||||
# position, or substitutes to same base.
|
|
||||||
op in (BA.OP_MATCH, BA.OP_SEQ_MATCH, BA.OP_SEQ_MISMATCH) || return nothing
|
|
||||||
seq[seqpos] == kind.x && return nothing
|
|
||||||
edit = Edit{S,T}(kind, seqpos)
|
|
||||||
return Variation{S,T}(seq, edit, Unsafe())
|
|
||||||
elseif kind isa Deletion
|
|
||||||
# If it's a deletion, return nothing if the deleted part is already missing
|
|
||||||
# from the new reference.
|
|
||||||
(stop, op2) = BA.ref2seq(aln, pos + length(kind) - 1)
|
|
||||||
start = seqpos + op == BA.OP_DELETE
|
|
||||||
start < stop && return nothing
|
|
||||||
edit = Edit{S,T}(Deletion(stop - start + 1), start)
|
|
||||||
return Variation{S,T}(seq, edit, Unsafe())
|
|
||||||
else
|
|
||||||
# If it maps directly to a symbol, just insert
|
|
||||||
if op in (BA.OP_MATCH, BA.OP_SEQ_MATCH, BA.OP_SEQ_MISMATCH)
|
|
||||||
# This happens if there is already an insertion at the position
|
|
||||||
if pos != lastindex(ref) && first(ref2seq(aln, pos + 1)) != seqpos + 1
|
|
||||||
return Inapplicable()
|
|
||||||
else
|
|
||||||
edit = Edit{S,T}(Insertion(var.edit.x), seqpos)
|
|
||||||
return Variation{S,T}(seq, edit, Unsafe())
|
|
||||||
end
|
|
||||||
# Alternatively, it can map to a deletion. In that case, it become really
|
|
||||||
# tricky to talk about the "same" insertion.
|
|
||||||
else
|
|
||||||
return Inapplicable()
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
"""
|
|
||||||
variations(v::Variant{S,T}) where {S,T}
|
|
||||||
|
|
||||||
Converts the [`Edit`](@ref)s of `v` into a vector of [`Variation`](@ref)s.
|
|
||||||
"""
|
|
||||||
function variations(v::Variant{S,T}) where {S,T}
|
|
||||||
vs = Vector{Variation{S,T}}(undef, length(_edits(v)))
|
|
||||||
for (i, e) in enumerate(_edits(v))
|
|
||||||
vs[i] = Variation{S,T}(reference(v), e)
|
|
||||||
end
|
|
||||||
return vs
|
|
||||||
end
|
|
||||||
|
|
||||||
"""
|
|
||||||
refbases(v::Variation)
|
|
||||||
|
|
||||||
Get the reference bases of `v`. Note that for deletions, `refbases` also returns the base
|
|
||||||
_before_ the deletion in accordance with the `REF` field of the
|
|
||||||
[VCF v4 specification](https://samtools.github.io/hts-specs/VCFv4.3.pdf).
|
|
||||||
"""
|
|
||||||
function refbases(v::Variation)
|
|
||||||
return _refbases(mutation(v), reference(v), leftposition(v))
|
|
||||||
end
|
|
||||||
|
|
||||||
"""
|
|
||||||
altbases(v::Variation)
|
|
||||||
|
|
||||||
Get the alternate bases of `v`. Note that for insertions, `altbases` also returns the base
|
|
||||||
_before_ the insertion in accordance with the `ALT` field of the
|
|
||||||
[VCF v4 specification](https://samtools.github.io/hts-specs/VCFv4.3.pdf).
|
|
||||||
"""
|
|
||||||
function altbases(v::Variation)
|
|
||||||
return _altbases(mutation(v), reference(v), leftposition(v))
|
|
||||||
end
|
|
|
@ -1,35 +0,0 @@
|
||||||
"""
|
|
||||||
Deletion
|
|
||||||
|
|
||||||
Represents the deletion of N symbols. The location of the deletion is stored
|
|
||||||
outside this struct
|
|
||||||
"""
|
|
||||||
struct Deletion
|
|
||||||
len::UInt
|
|
||||||
|
|
||||||
function Deletion(len::UInt)
|
|
||||||
iszero(len) && error("Deletion must be at least 1 symbol")
|
|
||||||
return new(len)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
Deletion(x::Integer) = Deletion(convert(UInt, x))
|
|
||||||
|
|
||||||
Base.length(x::Deletion) = Int(x.len)
|
|
||||||
Base.:(==)(x::Substitution, y::Substitution) = length(x) == length(y)
|
|
||||||
Base.hash(x::Deletion, h::UInt) = hash(Deletion, hash(x.len, h))
|
|
||||||
|
|
||||||
function _refbases(d::Deletion, reference::S, pos::UInt) where {S<:BioSequence}
|
|
||||||
if pos == 1
|
|
||||||
return S(reference[UnitRange{Int}(pos, pos + length(d))])
|
|
||||||
else
|
|
||||||
return S(reference[UnitRange{Int}(pos - 1, pos + length(d) - 1)])
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
function _altbases(::Deletion, reference::S, pos::UInt) where {S<:BioSequence}
|
|
||||||
if pos == 1
|
|
||||||
return S([reference[pos + 1]])
|
|
||||||
else
|
|
||||||
return S([reference[pos - 1]])
|
|
||||||
end
|
|
||||||
end
|
|
|
@ -1,31 +0,0 @@
|
||||||
"""
|
|
||||||
Insertion{S <: BioSequence}
|
|
||||||
|
|
||||||
Represents the insertion of a `S` into a sequence. The location of the insertion
|
|
||||||
is stored outside the struct.
|
|
||||||
"""
|
|
||||||
struct Insertion{S<:BioSequence}
|
|
||||||
seq::S
|
|
||||||
|
|
||||||
function Insertion{S}(x::S) where {S<:BioSequence}
|
|
||||||
isempty(x) && error("Insertion must be at least 1 symbol")
|
|
||||||
return new(x)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
Insertion(s::BioSequence) = Insertion{typeof(s)}(s)
|
|
||||||
|
|
||||||
Base.length(x::Insertion) = length(x.seq)
|
|
||||||
Base.:(==)(x::Insertion, y::Insertion) = x.seq == y.seq
|
|
||||||
Base.hash(x::Insertion, h::UInt) = hash(Insertion, hash(x.seq, h))
|
|
||||||
|
|
||||||
function _refbases(::Insertion, reference::S, pos::UInt) where {S<:BioSequence}
|
|
||||||
return S([reference[pos]])
|
|
||||||
end
|
|
||||||
|
|
||||||
function _altbases(i::Insertion, reference::S, pos::UInt) where {S<:BioSequence}
|
|
||||||
if pos == 1
|
|
||||||
return S([i.seq..., reference[pos]])
|
|
||||||
else
|
|
||||||
return S([reference[pos], i.seq...])
|
|
||||||
end
|
|
||||||
end
|
|
|
@ -1,21 +0,0 @@
|
||||||
"""
|
|
||||||
Substitution
|
|
||||||
|
|
||||||
Represents the presence of a `T` at a given position. The position is stored
|
|
||||||
outside this struct.
|
|
||||||
"""
|
|
||||||
struct Substitution{T<:BioSymbol}
|
|
||||||
x::T
|
|
||||||
end
|
|
||||||
|
|
||||||
Base.length(::Substitution) = 1
|
|
||||||
Base.:(==)(x::Substitution, y::Substitution) = x.x == y.x
|
|
||||||
Base.hash(x::Substitution, h::UInt) = hash(Substitution, hash(x.x, h))
|
|
||||||
|
|
||||||
function _refbases(::Substitution, reference::S, pos::UInt) where {S<:BioSequence}
|
|
||||||
return S([reference[pos]])
|
|
||||||
end
|
|
||||||
|
|
||||||
function _altbases(s::Substitution, ::S, pos::UInt) where {S<:BioSequence}
|
|
||||||
return S([s.x])
|
|
||||||
end
|
|
|
@ -23,13 +23,12 @@ TODO now:
|
||||||
* Add tests
|
* Add tests
|
||||||
"""
|
"""
|
||||||
|
|
||||||
using Aqua
|
|
||||||
using BioAlignments
|
using BioAlignments
|
||||||
using BioSequences
|
using BioSequences
|
||||||
using SequenceVariation
|
using SequenceVariation
|
||||||
using Test
|
using Test
|
||||||
|
|
||||||
const DNA_MODEL = BioAlignments.AffineGapScoreModel(EDNAFULL; gap_open=-25, gap_extend=-2)
|
const DNA_MODEL = BioAlignments.AffineGapScoreModel(EDNAFULL, gap_open=-25, gap_extend=-2)
|
||||||
|
|
||||||
align(a::BioSequence, b::BioSequence) = pairalign(GlobalAlignment(), a, b, DNA_MODEL).aln
|
align(a::BioSequence, b::BioSequence) = pairalign(GlobalAlignment(), a, b, DNA_MODEL).aln
|
||||||
seq1 = ungap!(dna"--ATGCGTGTTAGCAAC--TTATCGCG")
|
seq1 = ungap!(dna"--ATGCGTGTTAGCAAC--TTATCGCG")
|
||||||
|
@ -112,40 +111,17 @@ end
|
||||||
refseq = dna"GATTACA"
|
refseq = dna"GATTACA"
|
||||||
mutseq = dna"GATTACAAAA"
|
mutseq = dna"GATTACAAAA"
|
||||||
|
|
||||||
refvar = Variant(refseq, SequenceVariation.Edit{typeof(refseq),eltype(refseq)}[])
|
refvar = Variant(refseq, SequenceVariation.Edit{typeof(refseq), eltype(refseq)}[])
|
||||||
|
|
||||||
# Test for ending soft clip
|
# Test for ending soft clip
|
||||||
@test Variant(
|
@test Variant(PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S", 1, 1)), refseq)) == refvar
|
||||||
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S", 1, 1)), refseq)
|
|
||||||
) == refvar
|
|
||||||
|
|
||||||
# Test for ending soft+hard clip
|
# Test for ending soft+hard clip
|
||||||
@test Variant(
|
@test Variant(PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S2H", 1, 1)), refseq)) == refvar
|
||||||
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3S2H", 1, 1)), refseq)
|
|
||||||
) == refvar
|
|
||||||
|
|
||||||
# Test that ending insertions are still valid
|
# Test that ending insertions are still valid
|
||||||
@test length(
|
@test length(Variant(PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3I", 1, 1)), refseq)).edits) == 1
|
||||||
Variant(
|
|
||||||
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3I", 1, 1)), refseq)
|
|
||||||
).edits,
|
|
||||||
) == 1
|
|
||||||
|
|
||||||
# Test that out-of-bounds bases are still caught
|
# Test that out-of-bounds bases are still caught
|
||||||
@test_throws BoundsError Variant(
|
@test_throws BoundsError Variant(PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3X", 1, 1)), refseq))
|
||||||
PairwiseAlignment(AlignedSequence(mutseq, Alignment("7=3X", 1, 1)), refseq)
|
|
||||||
)
|
|
||||||
end
|
|
||||||
|
|
||||||
@testset "Aqua" begin
|
|
||||||
Aqua.test_ambiguities(SequenceVariation; recursive=false)
|
|
||||||
# TODO: Refactor `Edit` so that this test doesn't fail
|
|
||||||
# TODO: This test _should_ be set to @test_fails, but Aqua's syntax doesn't allow that
|
|
||||||
# Aqua.test_unbound_args(SequenceVariation)
|
|
||||||
Aqua.test_undefined_exports(SequenceVariation)
|
|
||||||
Aqua.test_piracy(SequenceVariation)
|
|
||||||
Aqua.test_project_extras(SequenceVariation)
|
|
||||||
Aqua.test_stale_deps(SequenceVariation)
|
|
||||||
Aqua.test_deps_compat(SequenceVariation)
|
|
||||||
Aqua.test_project_toml_formatting(SequenceVariation)
|
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in a new issue