mirror of
https://github.com/MillironX/XAM.jl.git
synced 2024-11-23 10:19:56 +00:00
Merge branch 'release/v0.1.0'
This commit is contained in:
commit
109f25bd52
35 changed files with 3676 additions and 2 deletions
1
.codecov.yml
Normal file
1
.codecov.yml
Normal file
|
@ -0,0 +1 @@
|
||||||
|
comment: false
|
1
.github/FUNDING.yml
vendored
Normal file
1
.github/FUNDING.yml
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
open_collective: biojulia
|
40
.github/ISSUE_TEMPLATE.md
vendored
Normal file
40
.github/ISSUE_TEMPLATE.md
vendored
Normal file
|
@ -0,0 +1,40 @@
|
||||||
|
<!--- Provide a general summary of the issue in the Title above -->
|
||||||
|
|
||||||
|
> _This template is rather extensive. Fill out all that you can, if are a new contributor or you're unsure about any section, leave it unchanged and a reviewer will help you_ :smile:. _This template is simply a tool to help everyone remember the BioJulia guidelines, if you feel anything in this template is not relevant, simply delete it._
|
||||||
|
|
||||||
|
## Expected Behavior
|
||||||
|
<!--- If you're describing a bug, tell us what you expect to happen -->
|
||||||
|
<!--- If you're suggesting a change/improvement, tell us how it should work -->
|
||||||
|
|
||||||
|
## Current Behavior
|
||||||
|
<!--- If describing a bug, tell us what happens instead of the expected behavior -->
|
||||||
|
<!--- If suggesting a change/improvement, explain the difference from current behavior -->
|
||||||
|
|
||||||
|
## Possible Solution / Implementation
|
||||||
|
<!--- If describing a bug, suggest a fix/reason for the bug (optional) -->
|
||||||
|
<!--- If you're suggesting a change/improvement, suggest ideas how to implement the addition or change -->
|
||||||
|
|
||||||
|
## Steps to Reproduce (for bugs)
|
||||||
|
<!--- You may include copy/pasteable snippets or a list of steps to reproduce the bug -->
|
||||||
|
1.
|
||||||
|
2.
|
||||||
|
3.
|
||||||
|
4.
|
||||||
|
|
||||||
|
<!--- Optionally, provide a link to a live example -->
|
||||||
|
<!--- You can use [this tool](https://www.cockos.com/licecap/) -->
|
||||||
|
<!--- ...Or [this tool](https://github.com/colinkeenan/silentcast) -->
|
||||||
|
<!--- ...Or [this tool](https://github.com/GNOME/byzanz) on Linux -->
|
||||||
|
|
||||||
|
## Context
|
||||||
|
<!--- How has this issue affected you? What are you trying to accomplish? -->
|
||||||
|
<!--- Providing context helps us come up with a solution that is most useful in the real world -->
|
||||||
|
|
||||||
|
## Your Environment
|
||||||
|
<!--- Include as many relevant details about the environment you experienced the bug in -->
|
||||||
|
- Package Version used:
|
||||||
|
- Julia Version used:
|
||||||
|
- Operating System and version (desktop or mobile):
|
||||||
|
- Link to your project:
|
||||||
|
|
||||||
|
<!-- Can you list installed packages here? -->
|
47
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
47
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
# A clear and descriptive title (No issue numbers please)
|
||||||
|
|
||||||
|
> _This template is rather extensive. Fill out all that you can, if are a new contributor or you're unsure about any section, leave it unchanged and a reviewer will help you_ :smile:. _This template is simply a tool to help everyone remember the BioJulia guidelines, if you feel anything in this template is not relevant, simply delete it._
|
||||||
|
|
||||||
|
## Types of changes
|
||||||
|
|
||||||
|
This PR implements the following changes:
|
||||||
|
_(Please tick any or all of the following that are applicable)_
|
||||||
|
|
||||||
|
* [ ] :sparkles: New feature (A non-breaking change which adds functionality).
|
||||||
|
* [ ] :bug: Bug fix (A non-breaking change, which fixes an issue).
|
||||||
|
* [ ] :boom: Breaking change (fix or feature that would cause existing functionality to change).
|
||||||
|
|
||||||
|
## :clipboard: Additional detail
|
||||||
|
|
||||||
|
- If you have implemented new features or behaviour
|
||||||
|
- **Provide a description of the addition** in as many details as possible.
|
||||||
|
|
||||||
|
- **Provide justification of the addition**.
|
||||||
|
|
||||||
|
- **Provide a runnable example of use of your addition**. This lets reviewers
|
||||||
|
and others try out the feature before it is merged or makes it's way to release.
|
||||||
|
|
||||||
|
- If you have changed current behaviour...
|
||||||
|
- **Describe the behaviour prior to you changes**
|
||||||
|
|
||||||
|
- **Describe the behaviour after your changes** and justify why you have made the changes,
|
||||||
|
Please describe any breakages you anticipate as a result of these changes.
|
||||||
|
|
||||||
|
- **Does your change alter APIs or existing exposed methods/types?**
|
||||||
|
If so, this may cause dependency issues and breakages, so the maintainer
|
||||||
|
will need to consider this when versioning the next release.
|
||||||
|
|
||||||
|
- If you are implementing changes that are intended to increase performance, you
|
||||||
|
should provide the results of a simple performance benchmark exercise
|
||||||
|
demonstrating the improvement. Especially if the changes make code less legible.
|
||||||
|
|
||||||
|
## :ballot_box_with_check: Checklist
|
||||||
|
|
||||||
|
- [ ] :art: The changes implemented is consistent with the [julia style guide](https://docs.julialang.org/en/stable/manual/style-guide/).
|
||||||
|
- [ ] :blue_book: I have updated and added relevant docstrings, in a manner consistent with the [documentation styleguide](https://docs.julialang.org/en/stable/manual/documentation/).
|
||||||
|
- [ ] :blue_book: I have added or updated relevant user and developer manuals/documentation in `docs/src/`.
|
||||||
|
- [ ] :ok: There are unit tests that cover the code changes I have made.
|
||||||
|
- [ ] :ok: The unit tests cover my code changes AND they pass.
|
||||||
|
- [ ] :pencil: I have added an entry to the `[UNRELEASED]` section of the manually curated `CHANGELOG.md` file for this repository.
|
||||||
|
- [ ] :ok: All changes should be compatible with the latest stable version of Julia.
|
||||||
|
- [ ] :thought_balloon: I have commented liberally for any complex pieces of internal code.
|
38
.github/workflows/CompatHelper.yml
vendored
Normal file
38
.github/workflows/CompatHelper.yml
vendored
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
name: CompatHelper
|
||||||
|
|
||||||
|
on:
|
||||||
|
schedule:
|
||||||
|
- cron: '00 * * * *'
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
CompatHelper:
|
||||||
|
runs-on: ${{ matrix.os }}
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
julia-version: [1.3.0]
|
||||||
|
julia-arch: [x86]
|
||||||
|
os: [ubuntu-latest]
|
||||||
|
steps:
|
||||||
|
- uses: julia-actions/setup-julia@latest
|
||||||
|
with:
|
||||||
|
version: ${{ matrix.julia-version }}
|
||||||
|
- name: Add CompatHelper
|
||||||
|
run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
|
||||||
|
- name: CompatHelper.main
|
||||||
|
env:
|
||||||
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
run: julia -e '
|
||||||
|
using CompatHelper, Pkg;
|
||||||
|
my_registries = [
|
||||||
|
Pkg.RegistrySpec(
|
||||||
|
name = "BioJuliaRegistry",
|
||||||
|
uuid = "ccbd2cc2-2954-11e9-1ccf-f3e7900901ca",
|
||||||
|
url = "https://github.com/BioJulia/BioJuliaRegistry.git"
|
||||||
|
),
|
||||||
|
Pkg.RegistrySpec(
|
||||||
|
name = "General",
|
||||||
|
uuid = "23338594-aafe-5451-b93e-139f81909106",
|
||||||
|
url = "https://github.com/JuliaRegistries/General.git"
|
||||||
|
)
|
||||||
|
];
|
||||||
|
CompatHelper.main(; registries = my_registries, master_branch = "master");'
|
34
.github/workflows/Documentation.yml
vendored
Normal file
34
.github/workflows/Documentation.yml
vendored
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
name: Documentation
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- 'master'
|
||||||
|
- 'develop'
|
||||||
|
- 'release/.*'
|
||||||
|
tags: '*'
|
||||||
|
pull_request:
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build:
|
||||||
|
runs-on: ${{ matrix.os }}
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
julia-version: [1.3.0]
|
||||||
|
julia-arch: [x86]
|
||||||
|
os: [ubuntu-latest]
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v1.0.0
|
||||||
|
- uses: julia-actions/setup-julia@latest
|
||||||
|
with:
|
||||||
|
version: ${{ matrix.julia-version }}
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
julia ci_prep.jl;
|
||||||
|
julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
|
||||||
|
- name: Build and deploy
|
||||||
|
env:
|
||||||
|
# https://github.com/JuliaDocs/Documenter.jl/issues/1177
|
||||||
|
# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # For authentication with GitHub Actions token
|
||||||
|
DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} # For authentication with SSH deploy key
|
||||||
|
run: julia --project=docs/ --color=yes docs/make.jl
|
13
.github/workflows/TagBot.yml
vendored
Normal file
13
.github/workflows/TagBot.yml
vendored
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
name: TagBot
|
||||||
|
on:
|
||||||
|
schedule:
|
||||||
|
- cron: '0 * * * *'
|
||||||
|
jobs:
|
||||||
|
TagBot:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: JuliaRegistries/TagBot@v1
|
||||||
|
with:
|
||||||
|
token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
ssh: ${{ secrets.TAGBOT_KEY }}
|
||||||
|
registry: BioJulia/BioJuliaRegistry
|
22
.github/workflows/UnitTests.yml
vendored
Normal file
22
.github/workflows/UnitTests.yml
vendored
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
name: Unit tests
|
||||||
|
|
||||||
|
on: [push, pull_request]
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
test:
|
||||||
|
runs-on: ${{ matrix.os }}
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
julia-version: ['1.1', '1.2', '1.3']
|
||||||
|
julia-arch: [x64]
|
||||||
|
os: [ubuntu-latest, windows-latest, macOS-latest]
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v1.0.0
|
||||||
|
- uses: julia-actions/setup-julia@v1
|
||||||
|
with:
|
||||||
|
version: ${{ matrix.julia-version }}
|
||||||
|
arch: ${{ matrix.julia-arch }}
|
||||||
|
- name: Install dependencies
|
||||||
|
run: julia ci_prep.jl
|
||||||
|
- uses: julia-actions/julia-runtest@master
|
33
Project.toml
Normal file
33
Project.toml
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
name = "XAM"
|
||||||
|
uuid = "d759349c-bcba-11e9-07c2-5b90f8f05f7c"
|
||||||
|
authors = ["Kenta Sato <bicycle1885@gmail.com>", "Ben J. Ward <ward9250@gmail.com>", "Ciarán O'Mara <Ciaran.OMara@utas.edu.au>"]
|
||||||
|
version = "0.1.0"
|
||||||
|
|
||||||
|
[deps]
|
||||||
|
Automa = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b"
|
||||||
|
BGZFStreams = "28d598bf-9b8f-59f1-b38c-5a06b4a0f5e6"
|
||||||
|
BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"
|
||||||
|
BioCore = "37cfa864-2cd6-5c12-ad9e-b6597d696c81"
|
||||||
|
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
|
||||||
|
BufferedStreams = "e1450e63-4bb3-523b-b2a4-4ffa8c0fd77d"
|
||||||
|
GenomicFeatures = "899a7d2d-5c61-547b-bef9-6698a8d05446"
|
||||||
|
Indexes = "4ffb77ac-cb80-11e8-1b35-4b78cc642f6d"
|
||||||
|
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
|
||||||
|
|
||||||
|
[compat]
|
||||||
|
Automa = "0.7, 0.8"
|
||||||
|
BGZFStreams = "0.3"
|
||||||
|
BioAlignments = "2"
|
||||||
|
BioCore = "2"
|
||||||
|
BioSequences = "2"
|
||||||
|
BufferedStreams = "1"
|
||||||
|
GenomicFeatures = "2"
|
||||||
|
Indexes = "0.1"
|
||||||
|
julia = "1.1"
|
||||||
|
|
||||||
|
[extras]
|
||||||
|
FormatSpecimens = "3372ea36-2a1a-11e9-3eb7-996970b6ffbd"
|
||||||
|
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
|
||||||
|
|
||||||
|
[targets]
|
||||||
|
test = ["FormatSpecimens", "Test"]
|
77
README.md
77
README.md
|
@ -1,3 +1,76 @@
|
||||||
# XAM.jl
|
# <img src="./docs/src/assets/logo.svg" width="30%" align="right" /> XAM.jl
|
||||||
|
|
||||||
In development: minimal working separation of SAM and BAM from BioAlignments.
|
[![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active)
|
||||||
|
[![Latest Release](https://img.shields.io/github/release/BioJulia/XAM.jl.svg)](https://github.com/BioJulia/XAM.jl/releases/latest)
|
||||||
|
[![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/XAM.jl/blob/master/LICENSE)
|
||||||
|
[![Stable documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/XAM.jl/stable)
|
||||||
|
[![Latest documentation](https://img.shields.io/badge/docs-dev-blue.svg)](https://biojulia.github.io/XAM.jl/dev/)
|
||||||
|
[![Join the chat at https://gitter.im/BioJulia/XAM.jl](https://badges.gitter.im/BioJulia/XAM.jl.svg)](https://gitter.im/BioJulia/XAM.jl)
|
||||||
|
|
||||||
|
> This project follows the [semver](http://semver.org) pro forma and uses the [git-flow branching model](https://nvie.com/posts/a-successful-git-branching-model/ "original
|
||||||
|
blog post").
|
||||||
|
|
||||||
|
## Description
|
||||||
|
XAM provides I/O and utilities for manipulating SAM and BAM formatted alignment map files.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
The latest version of XAM is made available to install through BioJulia's package registry.
|
||||||
|
By default, Julia's package manager only includes the "General" package registry.
|
||||||
|
|
||||||
|
To add the BioJulia registry from the [Julia REPL](https://docs.julialang.org/en/v1/manual/getting-started/), press `]` to enter [pkg mode](https://docs.julialang.org/en/v1/stdlib/Pkg/), then enter the following command:
|
||||||
|
```julia
|
||||||
|
registry add https://github.com/BioJulia/BioJuliaRegistry.git
|
||||||
|
```
|
||||||
|
|
||||||
|
Once the registry is added to your configuration, you can install XAM while in [pkg mode](https://docs.julialang.org/en/v1/stdlib/Pkg/) with the following:
|
||||||
|
```julia
|
||||||
|
add XAM
|
||||||
|
```
|
||||||
|
|
||||||
|
If you are interested in the cutting edge of the development, please check out the [develop branch](https://github.com/BioJulia/XAM.jl/tree/develop) to try new features before release.
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
XAM is tested against Julia `1.X` on Linux, OS X, and Windows.
|
||||||
|
|
||||||
|
**Latest build status:**
|
||||||
|
|
||||||
|
[![Unit tests](https://github.com/BioJulia/XAM.jl/workflows/Unit%20tests/badge.svg?branch=master)](https://github.com/BioJulia/XAM.jl/actions?query=workflow%3A%22Unit+tests%22+branch%3Amaster)
|
||||||
|
[![Documentation](https://github.com/BioJulia/XAM.jl/workflows/Documentation/badge.svg?branch=master)](https://github.com/BioJulia/XAM.jl/actions?query=workflow%3ADocumentation+branch%3Amaster)
|
||||||
|
[![codecov](https://codecov.io/gh/BioJulia/XAM.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/BioJulia/XAM.jl)
|
||||||
|
|
||||||
|
## Contributing
|
||||||
|
We appreciate [contributions](https://github.com/BioJulia/XAM.jl/graphs/contributors) from users including reporting bugs, fixing issues, improving performance and adding new features.
|
||||||
|
|
||||||
|
Take a look at the [contributing files](https://github.com/BioJulia/Contributing) detailed contributor and maintainer guidelines, and code of conduct.
|
||||||
|
|
||||||
|
### Financial contributions
|
||||||
|
We also welcome financial contributions in full transparency on our [open collective](https://opencollective.com/biojulia).
|
||||||
|
Anyone can file an expense.
|
||||||
|
If the expense makes sense for the development the core contributors and the person who filed the expense will be reimbursed.
|
||||||
|
|
||||||
|
|
||||||
|
## Backers & Sponsors
|
||||||
|
Thank you to all our backers and sponsors!
|
||||||
|
|
||||||
|
Love our work and community? [Become a backer](https://opencollective.com/biojulia#backer).
|
||||||
|
|
||||||
|
[![backers](https://opencollective.com/biojulia/backers.svg?width=890)](https://opencollective.com/biojulia#backers)
|
||||||
|
|
||||||
|
Does your company use BioJulia?
|
||||||
|
Help keep BioJulia feature rich and healthy by [sponsoring the project](https://opencollective.com/biojulia#sponsor).
|
||||||
|
Your logo will show up here with a link to your website.
|
||||||
|
|
||||||
|
[![](https://opencollective.com/biojulia/sponsor/0/avatar.svg)](https://opencollective.com/biojulia/sponsor/0/website)
|
||||||
|
[![](https://opencollective.com/biojulia/sponsor/1/avatar.svg)](https://opencollective.com/biojulia/sponsor/1/website)
|
||||||
|
[![](https://opencollective.com/biojulia/sponsor/2/avatar.svg)](https://opencollective.com/biojulia/sponsor/2/website)
|
||||||
|
[![](https://opencollective.com/biojulia/sponsor/3/avatar.svg)](https://opencollective.com/biojulia/sponsor/3/website)
|
||||||
|
[![](https://opencollective.com/biojulia/sponsor/4/avatar.svg)](https://opencollective.com/biojulia/sponsor/4/website)
|
||||||
|
[![](https://opencollective.com/biojulia/sponsor/5/avatar.svg)](https://opencollective.com/biojulia/sponsor/5/website)
|
||||||
|
[![](https://opencollective.com/biojulia/sponsor/6/avatar.svg)](https://opencollective.com/biojulia/sponsor/6/website)
|
||||||
|
[![](https://opencollective.com/biojulia/sponsor/7/avatar.svg)](https://opencollective.com/biojulia/sponsor/7/website)
|
||||||
|
[![](https://opencollective.com/biojulia/sponsor/8/avatar.svg)](https://opencollective.com/biojulia/sponsor/8/website)
|
||||||
|
[![](https://opencollective.com/biojulia/sponsor/9/avatar.svg)](https://opencollective.com/biojulia/sponsor/9/website)
|
||||||
|
|
||||||
|
|
||||||
|
## Questions?
|
||||||
|
If you have a question about contributing or using BioJulia software, come on over and chat to us on [Gitter](https://gitter.im/BioJulia/General), or you can try the [Bio category of the Julia discourse site](https://discourse.julialang.org/c/domain/bio).
|
||||||
|
|
3
ci_prep.jl
Normal file
3
ci_prep.jl
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
using Pkg.Registry
|
||||||
|
Registry.add(Registry.RegistrySpec(url = "https://github.com/BioJulia/BioJuliaRegistry.git"))
|
||||||
|
Registry.add(Registry.RegistrySpec(url = "https://github.com/JuliaRegistries/General.git"))
|
2
coverage/Project.toml
Normal file
2
coverage/Project.toml
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
[deps]
|
||||||
|
Coverage = "a2441757-f6aa-5fb2-8edb-039e3f45d037"
|
11
coverage/coverage.jl
Normal file
11
coverage/coverage.jl
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
get(ENV, "TRAVIS_OS_NAME", "") == "linux" || exit()
|
||||||
|
get(ENV, "TRAVIS_JULIA_VERSION", "") == "1.3" || exit()
|
||||||
|
|
||||||
|
using Pkg
|
||||||
|
Pkg.instantiate()
|
||||||
|
|
||||||
|
using Coverage
|
||||||
|
|
||||||
|
cd(joinpath(@__DIR__, "..")) do
|
||||||
|
Codecov.submit(Codecov.process_folder())
|
||||||
|
end
|
6
docs/Project.toml
Normal file
6
docs/Project.toml
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
[deps]
|
||||||
|
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
|
||||||
|
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
|
||||||
|
|
||||||
|
[compat]
|
||||||
|
Documenter = "0.24"
|
21
docs/make.jl
Normal file
21
docs/make.jl
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
using Pkg
|
||||||
|
using Documenter, XAM
|
||||||
|
|
||||||
|
makedocs(
|
||||||
|
format = Documenter.HTML(
|
||||||
|
edit_link = "develop"
|
||||||
|
),
|
||||||
|
modules = [XAM, XAM.SAM, XAM.BAM],
|
||||||
|
sitename = "XAM.jl",
|
||||||
|
pages = [
|
||||||
|
"Home" => "index.md",
|
||||||
|
"SAM and BAM" => "man/hts-files.md",
|
||||||
|
"API Reference" => "man/api.md"
|
||||||
|
],
|
||||||
|
authors = replace(join(Pkg.TOML.parsefile("Project.toml")["authors"], ", "), r" <.*?>" => "" ) * ", The BioJulia Organisation, and other contributors."
|
||||||
|
)
|
||||||
|
deploydocs(
|
||||||
|
repo = "github.com/BioJulia/XAM.jl.git",
|
||||||
|
devbranch = "develop",
|
||||||
|
push_preview = true
|
||||||
|
)
|
26
docs/src/assets/logo.svg
Normal file
26
docs/src/assets/logo.svg
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 136.48101 51.472999">
|
||||||
|
<g id="bubble-helix">
|
||||||
|
<circle cx="5.9792" cy="20.3" r="5.9792" fill="#945bb0"/>
|
||||||
|
<circle cx="21.317" cy="39.5" r="4.9393" fill="#4266d5"/>
|
||||||
|
<circle cx="16.378" cy="11.7" r="4.9393" fill="#c93d39"/>
|
||||||
|
<circle cx="35.095" cy="35.9" r="5.9792" fill="#945bb0"/>
|
||||||
|
<circle cx="47.184" cy="23.8" r="7.409" fill="#4266d5"/>
|
||||||
|
<circle cx="28.986" cy="9.7" r="4.5494" fill="#3b972e"/>
|
||||||
|
<circle cx="89.428" cy="46.5" r="4.9393" fill="#c93d39"/>
|
||||||
|
<circle cx="71.62" cy="5.6" r="5.5892" fill="#945bb0"/>
|
||||||
|
<circle cx="121.01" cy="15.5" r="5.5892" fill="#c93d39"/>
|
||||||
|
<circle cx="59.662" cy="30.3" r="2.4697" fill="#4266d5"/>
|
||||||
|
<circle cx="103.6" cy="19.9" r="3.2496" fill="#c93d39"/>
|
||||||
|
<circle cx="123.61" cy="36.5" r="2.7296" fill="#4266d5"/>
|
||||||
|
<circle cx="127.64" cy="5.2" r="3.8995" fill="#3b972e"/>
|
||||||
|
<circle cx="132.58" cy="39.2" r="3.8995" fill="#945bb0"/>
|
||||||
|
<circle cx="67.591" cy="37.4" r="3.3795" fill="#3b972e"/>
|
||||||
|
<circle cx="95.017" cy="13.9" r="4.2894" fill="#4266d5"/>
|
||||||
|
<circle cx="77.729" cy="42.9" r="3.8995" fill="#945bb0"/>
|
||||||
|
<circle cx="102.56" cy="42.0" r="5.8492" fill="#4266d5"/>
|
||||||
|
<circle cx="113.6" cy="28.8" r="6.7591" fill="#3b972e"/>
|
||||||
|
<circle cx="57.582" cy="11.0" r="6.3691" fill="#c93d39"/>
|
||||||
|
<circle cx="38.735" cy="14.8" r="3.3795" fill="#c93d39"/>
|
||||||
|
<circle cx="84.618" cy="7.9" r="5.3293" fill="#3b972e"/>
|
||||||
|
</g>
|
||||||
|
</svg>
|
After Width: | Height: | Size: 1.4 KiB |
77
docs/src/index.md
Normal file
77
docs/src/index.md
Normal file
|
@ -0,0 +1,77 @@
|
||||||
|
# XAM.jl
|
||||||
|
|
||||||
|
[![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active)
|
||||||
|
[![Latest Release](https://img.shields.io/github/release/BioJulia/XAM.jl.svg)](https://github.com/BioJulia/XAM.jl/releases/latest)
|
||||||
|
[![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/XAM.jl/blob/master/LICENSE)
|
||||||
|
[![Stable documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/XAM.jl/stable)
|
||||||
|
[![Latest documentation](https://img.shields.io/badge/docs-dev-blue.svg)](https://biojulia.github.io/XAM.jl/dev/)
|
||||||
|
[![Join the chat at https://gitter.im/BioJulia/XAM.jl](https://badges.gitter.im/BioJulia/XAM.jl.svg)](https://gitter.im/BioJulia/XAM.jl)
|
||||||
|
|
||||||
|
> This project follows the [semver](http://semver.org) pro forma and uses the [git-flow branching model](https://nvie.com/posts/a-successful-git-branching-model/).
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
XAM provides I/O and utilities for manipulating SAM and BAM formatted alignment map files.
|
||||||
|
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
The latest version of XAM is made available to install through BioJulia's package registry.
|
||||||
|
By default, Julia's package manager only includes the "General" package registry.
|
||||||
|
|
||||||
|
To add the BioJulia registry from the [Julia REPL](https://docs.julialang.org/en/v1/manual/getting-started/), press `]` to enter [pkg mode](https://docs.julialang.org/en/v1/stdlib/Pkg/), then enter the following command:
|
||||||
|
```julia
|
||||||
|
registry add https://github.com/BioJulia/BioJuliaRegistry.git
|
||||||
|
```
|
||||||
|
|
||||||
|
Once the registry is added to your configuration, you can install XAM while in [pkg mode](https://docs.julialang.org/en/v1/stdlib/Pkg/) with the following:
|
||||||
|
```julia
|
||||||
|
add XAM
|
||||||
|
```
|
||||||
|
|
||||||
|
If you are interested in the cutting edge of the development, please check out the [develop branch](https://github.com/BioJulia/XAM.jl/tree/develop) to try new features before release.
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
XAM is tested against Julia `1.X` on Linux, OS X, and Windows.
|
||||||
|
|
||||||
|
**Latest build status:**
|
||||||
|
|
||||||
|
[![Unit tests](https://github.com/BioJulia/XAM.jl/workflows/Unit%20tests/badge.svg?branch=master)](https://github.com/BioJulia/XAM.jl/actions?query=workflow%3A%22Unit+tests%22+branch%3Amaster)
|
||||||
|
[![Documentation](https://github.com/BioJulia/XAM.jl/workflows/Documentation/badge.svg?branch=master)](https://github.com/BioJulia/XAM.jl/actions?query=workflow%3ADocumentation+branch%3Amaster)
|
||||||
|
[![codecov](https://codecov.io/gh/BioJulia/XAM.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/BioJulia/XAM.jl)
|
||||||
|
|
||||||
|
## Contributing
|
||||||
|
We appreciate [contributions](https://github.com/BioJulia/XAM.jl/graphs/contributors) from users including reporting bugs, fixing issues, improving performance and adding new features.
|
||||||
|
|
||||||
|
Take a look at the [contributing files](https://github.com/BioJulia/Contributing) detailed contributor and maintainer guidelines, and code of conduct.
|
||||||
|
|
||||||
|
### Financial contributions
|
||||||
|
We also welcome financial contributions in full transparency on our [open collective](https://opencollective.com/biojulia).
|
||||||
|
Anyone can file an expense.
|
||||||
|
If the expense makes sense for the development the core contributors and the person who filed the expense will be reimbursed.
|
||||||
|
|
||||||
|
|
||||||
|
## Backers & Sponsors
|
||||||
|
Thank you to all our backers and sponsors!
|
||||||
|
|
||||||
|
Love our work and community? [Become a backer](https://opencollective.com/biojulia#backer).
|
||||||
|
|
||||||
|
[![backers](https://opencollective.com/biojulia/backers.svg?width=890)](https://opencollective.com/biojulia#backers)
|
||||||
|
|
||||||
|
Does your company use BioJulia?
|
||||||
|
Help keep BioJulia feature rich and healthy by [sponsoring the project](https://opencollective.com/biojulia#sponsor).
|
||||||
|
Your logo will show up here with a link to your website.
|
||||||
|
|
||||||
|
[![](https://opencollective.com/biojulia/sponsor/0/avatar.svg)](https://opencollective.com/biojulia/sponsor/0/website)
|
||||||
|
[![](https://opencollective.com/biojulia/sponsor/1/avatar.svg)](https://opencollective.com/biojulia/sponsor/1/website)
|
||||||
|
[![](https://opencollective.com/biojulia/sponsor/2/avatar.svg)](https://opencollective.com/biojulia/sponsor/2/website)
|
||||||
|
[![](https://opencollective.com/biojulia/sponsor/3/avatar.svg)](https://opencollective.com/biojulia/sponsor/3/website)
|
||||||
|
[![](https://opencollective.com/biojulia/sponsor/4/avatar.svg)](https://opencollective.com/biojulia/sponsor/4/website)
|
||||||
|
[![](https://opencollective.com/biojulia/sponsor/5/avatar.svg)](https://opencollective.com/biojulia/sponsor/5/website)
|
||||||
|
[![](https://opencollective.com/biojulia/sponsor/6/avatar.svg)](https://opencollective.com/biojulia/sponsor/6/website)
|
||||||
|
[![](https://opencollective.com/biojulia/sponsor/7/avatar.svg)](https://opencollective.com/biojulia/sponsor/7/website)
|
||||||
|
[![](https://opencollective.com/biojulia/sponsor/8/avatar.svg)](https://opencollective.com/biojulia/sponsor/8/website)
|
||||||
|
[![](https://opencollective.com/biojulia/sponsor/9/avatar.svg)](https://opencollective.com/biojulia/sponsor/9/website)
|
||||||
|
|
||||||
|
|
||||||
|
## Questions?
|
||||||
|
If you have a question about contributing or using BioJulia software, come on over and chat to us on [Gitter](https://gitter.im/BioJulia/General), or you can try the [Bio category of the Julia discourse site](https://discourse.julialang.org/c/domain/bio).
|
38
docs/src/man/api.md
Normal file
38
docs/src/man/api.md
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
```@meta
|
||||||
|
CurrentModule = XAM
|
||||||
|
DocTestSetup = quote
|
||||||
|
using XAM
|
||||||
|
end
|
||||||
|
```
|
||||||
|
|
||||||
|
# API Reference
|
||||||
|
|
||||||
|
## SAM API
|
||||||
|
|
||||||
|
### Public
|
||||||
|
|
||||||
|
```@autodocs
|
||||||
|
Modules = [XAM.SAM]
|
||||||
|
private = false
|
||||||
|
```
|
||||||
|
|
||||||
|
### Internal
|
||||||
|
```@autodocs
|
||||||
|
Modules = [XAM.SAM]
|
||||||
|
public = false
|
||||||
|
```
|
||||||
|
|
||||||
|
## BAM API
|
||||||
|
|
||||||
|
### Public
|
||||||
|
|
||||||
|
```@autodocs
|
||||||
|
Modules = [XAM.BAM]
|
||||||
|
private = false
|
||||||
|
```
|
||||||
|
|
||||||
|
### Internal
|
||||||
|
```@autodocs
|
||||||
|
Modules = [XAM.BAM]
|
||||||
|
public = false
|
||||||
|
```
|
304
docs/src/man/hts-files.md
Normal file
304
docs/src/man/hts-files.md
Normal file
|
@ -0,0 +1,304 @@
|
||||||
|
# SAM and BAM
|
||||||
|
|
||||||
|
## Introduction
|
||||||
|
|
||||||
|
The `XAM` package offers high-performance tools for SAM and BAM file formats, which are the most popular file formats.
|
||||||
|
|
||||||
|
If you have questions about the SAM and BAM formats or any of the terminology used when discussing these formats, see the published [specification](https://samtools.github.io/hts-specs/SAMv1.pdf), which is maintained by the [samtools group](https://samtools.github.io/).
|
||||||
|
|
||||||
|
A very very simple SAM file looks like the following:
|
||||||
|
|
||||||
|
```
|
||||||
|
@HD VN:1.6 SO:coordinate
|
||||||
|
@SQ SN:ref LN:45
|
||||||
|
r001 99 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTG *
|
||||||
|
r002 0 ref 9 30 3S6M1P1I4M * 0 0 AAAAGATAAGGATA *
|
||||||
|
r003 0 ref 9 30 5S6M * 0 0 GCCTAAGCTAA * SA:Z:ref,29,-,6H5M,17,0;
|
||||||
|
r004 0 ref 16 30 6M14N5M * 0 0 ATAGCTTCAGC *
|
||||||
|
r003 2064 ref 29 17 6H5M * 0 0 TAGGC * SA:Z:ref,9,+,5S6M,30,1;
|
||||||
|
r001 147 ref 37 30 9M = 7 -39 CAGCGGCAT * NM:i:1
|
||||||
|
```
|
||||||
|
|
||||||
|
Where the first two lines are part of the "header", and the following lines are "records".
|
||||||
|
Each record describes how a read aligns to some reference sequence.
|
||||||
|
Sometimes one record describes one read, but there are other cases like chimeric reads and split alignments, where multiple records apply to one read.
|
||||||
|
In the example above, `r003` is a chimeric read, and `r004` is a split alignment, and `r001` are mate pair reads.
|
||||||
|
Again, we refer you to the official [specification](https://samtools.github.io/hts-specs/SAMv1.pdf) for more details.
|
||||||
|
|
||||||
|
A BAM file stores this same information but in a binary and compressible format that does not make for pretty printing here!
|
||||||
|
|
||||||
|
## Reading SAM and BAM files
|
||||||
|
|
||||||
|
A typical script iterating over all records in a file looks like below:
|
||||||
|
|
||||||
|
```julia
|
||||||
|
using XAM
|
||||||
|
|
||||||
|
# Open a BAM file.
|
||||||
|
reader = open(BAM.Reader, "data.bam")
|
||||||
|
|
||||||
|
# Iterate over BAM records.
|
||||||
|
for record in reader
|
||||||
|
# `record` is a BAM.Record object.
|
||||||
|
if BAM.ismapped(record)
|
||||||
|
# Print the mapped position.
|
||||||
|
println(BAM.refname(record), ':', BAM.position(record))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Close the BAM file.
|
||||||
|
close(reader)
|
||||||
|
```
|
||||||
|
|
||||||
|
The size of a BAM file is often extremely large.
|
||||||
|
The iterator interface demonstrated above allocates an object for each record and that may be a bottleneck of reading data from a BAM file.
|
||||||
|
In-place reading reuses a pre-allocated object for every record and less memory allocation happens in reading:
|
||||||
|
|
||||||
|
```julia
|
||||||
|
reader = open(BAM.Reader, "data.bam")
|
||||||
|
record = BAM.Record()
|
||||||
|
while !eof(reader)
|
||||||
|
read!(reader, record)
|
||||||
|
# do something
|
||||||
|
end
|
||||||
|
```
|
||||||
|
|
||||||
|
## SAM and BAM Headers
|
||||||
|
|
||||||
|
Both `SAM.Reader` and `BAM.Reader` implement the `header` function, which returns a `SAM.Header` object.
|
||||||
|
To extract certain information out of the headers, you can use the `find` method on the header to extract information according to SAM/BAM tag.
|
||||||
|
Again we refer you to the [specification](https://samtools.github.io/hts-specs/SAMv1.pdf) for full details of all the different tags that can occur in headers, and what they mean.
|
||||||
|
|
||||||
|
Below is an example of extracting all the info about the reference sequences from the BAM header.
|
||||||
|
In SAM/BAM, any description of a reference sequence is stored in the header, under a tag denoted `SQ` (think `reference SeQuence`!).
|
||||||
|
|
||||||
|
```jlcon
|
||||||
|
julia> reader = open(SAM.Reader, "data.sam");
|
||||||
|
|
||||||
|
julia> find(header(reader), "SQ")
|
||||||
|
7-element Array{Bio.Align.SAM.MetaInfo,1}:
|
||||||
|
Bio.Align.SAM.MetaInfo:
|
||||||
|
tag: SQ
|
||||||
|
value: SN=Chr1 LN=30427671
|
||||||
|
Bio.Align.SAM.MetaInfo:
|
||||||
|
tag: SQ
|
||||||
|
value: SN=Chr2 LN=19698289
|
||||||
|
Bio.Align.SAM.MetaInfo:
|
||||||
|
tag: SQ
|
||||||
|
value: SN=Chr3 LN=23459830
|
||||||
|
Bio.Align.SAM.MetaInfo:
|
||||||
|
tag: SQ
|
||||||
|
value: SN=Chr4 LN=18585056
|
||||||
|
Bio.Align.SAM.MetaInfo:
|
||||||
|
tag: SQ
|
||||||
|
value: SN=Chr5 LN=26975502
|
||||||
|
Bio.Align.SAM.MetaInfo:
|
||||||
|
tag: SQ
|
||||||
|
value: SN=chloroplast LN=154478
|
||||||
|
Bio.Align.SAM.MetaInfo:
|
||||||
|
tag: SQ
|
||||||
|
value: SN=mitochondria LN=366924
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
In the above we can see there were 7 sequences in the reference: 5 chromosomes, one chloroplast sequence, and one mitochondrial sequence.
|
||||||
|
|
||||||
|
## SAM and BAM Records
|
||||||
|
|
||||||
|
### SAM.Record
|
||||||
|
The `XAM` package supports the following accessors for `SAM.Record` types.
|
||||||
|
|
||||||
|
```@docs
|
||||||
|
XAM.SAM.flag
|
||||||
|
XAM.SAM.ismapped
|
||||||
|
XAM.SAM.isprimary
|
||||||
|
XAM.SAM.refname
|
||||||
|
XAM.SAM.position
|
||||||
|
XAM.SAM.rightposition
|
||||||
|
XAM.SAM.isnextmapped
|
||||||
|
XAM.SAM.nextrefname
|
||||||
|
XAM.SAM.nextposition
|
||||||
|
XAM.SAM.mappingquality
|
||||||
|
XAM.SAM.cigar
|
||||||
|
XAM.SAM.alignment
|
||||||
|
XAM.SAM.alignlength
|
||||||
|
XAM.SAM.tempname
|
||||||
|
XAM.SAM.templength
|
||||||
|
XAM.SAM.sequence
|
||||||
|
XAM.SAM.seqlength
|
||||||
|
XAM.SAM.quality
|
||||||
|
XAM.SAM.auxdata
|
||||||
|
```
|
||||||
|
|
||||||
|
### BAM.Record
|
||||||
|
The `XAM` package supports the following accessors for `BAM.Record` types.
|
||||||
|
|
||||||
|
```@docs
|
||||||
|
XAM.BAM.flag
|
||||||
|
XAM.BAM.ismapped
|
||||||
|
XAM.BAM.isprimary
|
||||||
|
XAM.BAM.refid
|
||||||
|
XAM.BAM.refname
|
||||||
|
XAM.BAM.reflen
|
||||||
|
XAM.BAM.position
|
||||||
|
XAM.BAM.rightposition
|
||||||
|
XAM.BAM.isnextmapped
|
||||||
|
XAM.BAM.nextrefid
|
||||||
|
XAM.BAM.nextrefname
|
||||||
|
XAM.BAM.nextposition
|
||||||
|
XAM.BAM.mappingquality
|
||||||
|
XAM.BAM.cigar
|
||||||
|
XAM.BAM.alignment
|
||||||
|
XAM.BAM.alignlength
|
||||||
|
XAM.BAM.tempname
|
||||||
|
XAM.BAM.templength
|
||||||
|
XAM.BAM.sequence
|
||||||
|
XAM.BAM.seqlength
|
||||||
|
XAM.BAM.quality
|
||||||
|
XAM.BAM.auxdata
|
||||||
|
```
|
||||||
|
|
||||||
|
## Accessing auxiliary data
|
||||||
|
|
||||||
|
SAM and BAM records support the storing of optional data fields associated with tags.
|
||||||
|
|
||||||
|
Tagged auxiliary data follows a format of `TAG:TYPE:VALUE`.
|
||||||
|
`TAG` is a two-letter string, and each tag can only appear once per record.
|
||||||
|
`TYPE` is a single case-sensetive letter which defined the format of `VALUE`.
|
||||||
|
|
||||||
|
| Type | Description |
|
||||||
|
|------|-----------------------------------|
|
||||||
|
| 'A' | Printable character |
|
||||||
|
| 'i' | Signed integer |
|
||||||
|
| 'f' | Single-precision floating number |
|
||||||
|
| 'Z' | Printable string, including space |
|
||||||
|
| 'H' | Byte array in Hex format |
|
||||||
|
| 'B' | Integer of numeric array |
|
||||||
|
|
||||||
|
For more information about these tags and their types we refer you to the [SAM/BAM specification](https://samtools.github.io/hts-specs/SAMv1.pdf) and the additional [optional fields specification](https://samtools.github.io/hts-specs/SAMtags.pdf) document.
|
||||||
|
|
||||||
|
There are some tags that are reserved, predefined standard tags, for specific uses.
|
||||||
|
|
||||||
|
To access optional fields stored in tags, you use `getindex` indexing syntax on the record object.
|
||||||
|
Note that accessing optional tag fields will result in type instability in Julia.
|
||||||
|
This is because the type of the optional data is not known until run-time, as the tag is being read.
|
||||||
|
This can have a significant impact on performance.
|
||||||
|
To limit this, if the user knows the type of a value in advance, specifying it as a type annotation will alleviate the problem:
|
||||||
|
|
||||||
|
Below is an example of looping over records in a bam file and using indexing syntax to get the data stored in the "NM" tag.
|
||||||
|
Note the `UInt8` type assertion to alleviate type instability.
|
||||||
|
|
||||||
|
```julia
|
||||||
|
for record in open(BAM.Reader, "data.bam")
|
||||||
|
nm = record["NM"]::UInt8
|
||||||
|
# do something
|
||||||
|
end
|
||||||
|
```
|
||||||
|
|
||||||
|
## Getting records in a range
|
||||||
|
|
||||||
|
The `XAM` package supports the BAI index to fetch records in a specific range from a BAM file.
|
||||||
|
[Samtools](https://samtools.github.io/) provides `index` subcommand to create an index file (.bai) from a sorted BAM file.
|
||||||
|
|
||||||
|
```console
|
||||||
|
$ samtools index -b SRR1238088.sort.bam
|
||||||
|
$ ls SRR1238088.sort.bam*
|
||||||
|
SRR1238088.sort.bam SRR1238088.sort.bam.bai
|
||||||
|
```
|
||||||
|
|
||||||
|
The method `eachoverlap(reader, chrom, range)` returns an iterator of BAM records overlapping the query interval:
|
||||||
|
|
||||||
|
```julia
|
||||||
|
reader = open(BAM.Reader, "SRR1238088.sort.bam", index="SRR1238088.sort.bam.bai")
|
||||||
|
for record in eachoverlap(reader, "Chr2", 10000:11000)
|
||||||
|
# `record` is a BAM.Record object
|
||||||
|
# ...
|
||||||
|
end
|
||||||
|
close(reader)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Getting records overlapping genomic features
|
||||||
|
|
||||||
|
The `eachoverlap` method also accepts the `Interval` type defined in [GenomicFeatures.jl](https://github.com/BioJulia/GenomicFeatures.jl).
|
||||||
|
|
||||||
|
This allows you to do things like first read in the genomic features from a GFF3 file, and then for each feature, iterate over all the BAM records that overlap with that feature.
|
||||||
|
|
||||||
|
```julia
|
||||||
|
using GenomicFeatures
|
||||||
|
using GFF3
|
||||||
|
using XAM
|
||||||
|
|
||||||
|
# Load genomic features from a GFF3 file.
|
||||||
|
features = open(collect, GFF3.Reader, "TAIR10_GFF3_genes.gff")
|
||||||
|
|
||||||
|
# Keep mRNA features.
|
||||||
|
filter!(x -> GFF3.featuretype(x) == "mRNA", features)
|
||||||
|
|
||||||
|
# Open a BAM file and iterate over records overlapping mRNA transcripts.
|
||||||
|
reader = open(BAM.Reader, "SRR1238088.sort.bam", index = "SRR1238088.sort.bam.bai")
|
||||||
|
for feature in features
|
||||||
|
for record in eachoverlap(reader, feature)
|
||||||
|
# `record` overlaps `feature`.
|
||||||
|
# ...
|
||||||
|
end
|
||||||
|
end
|
||||||
|
close(reader)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Writing files
|
||||||
|
|
||||||
|
In order to write a BAM or SAM file, you must first create a `SAM.Header`.
|
||||||
|
|
||||||
|
A `SAM.Header` is constructed from a vector of `SAM.MetaInfo` objects.
|
||||||
|
|
||||||
|
For example, to create the following simple header:
|
||||||
|
|
||||||
|
```
|
||||||
|
@HD VN:1.6 SO:coordinate
|
||||||
|
@SQ SN:ref LN:45
|
||||||
|
```
|
||||||
|
|
||||||
|
```julia
|
||||||
|
julia> a = SAM.MetaInfo("HD", ["VN" => 1.6, "SO" => "coordinate"])
|
||||||
|
SAM.MetaInfo:
|
||||||
|
tag: HD
|
||||||
|
value: VN=1.6 SO=coordinate
|
||||||
|
|
||||||
|
julia> b = SAM.MetaInfo("SQ", ["SN" => "ref", "LN" => 45])
|
||||||
|
SAM.MetaInfo:
|
||||||
|
tag: SQ
|
||||||
|
value: SN=ref LN=45
|
||||||
|
|
||||||
|
julia> h = SAM.Header([a, b])
|
||||||
|
SAM.Header(SAM.MetaInfo[SAM.MetaInfo:
|
||||||
|
tag: HD
|
||||||
|
value: VN=1.6 SO=coordinate, SAM.MetaInfo:
|
||||||
|
tag: SQ
|
||||||
|
value: SN=ref LN=45])
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
Then to create the writer for a SAM file, construct a `SAM.Writer` using the header and an `IO` type:
|
||||||
|
|
||||||
|
```julia
|
||||||
|
julia> samw = SAM.Writer(open("my-data.sam", "w"), h)
|
||||||
|
SAM.Writer(IOStream(<file my-data.sam>))
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
To make a BAM Writer is slightly different, as you need to use a specific stream type from the (https://github.com/BioJulia/BGZFStreams.jl)[https://github.com/BioJulia/BGZFStreams.jl] package:
|
||||||
|
|
||||||
|
```julia
|
||||||
|
julia> using BGZFStreams
|
||||||
|
|
||||||
|
julia> bamw = BAM.Writer(BGZFStream(open("my-data.bam", "w"), "w"))
|
||||||
|
BAM.Writer(BGZFStreams.BGZFStream{IOStream}(<mode=write>))
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
Once you have a BAM or SAM writer, you can use the `write` method to write `BAM.Record`s or `SAM.Record`s to file:
|
||||||
|
|
||||||
|
```julia
|
||||||
|
julia> write(bamw, rec) # Here rec is a `BAM.Record`
|
||||||
|
330780
|
||||||
|
```
|
13
src/XAM.jl
Normal file
13
src/XAM.jl
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
module XAM
|
||||||
|
|
||||||
|
export
|
||||||
|
SAM,
|
||||||
|
BAM
|
||||||
|
|
||||||
|
include("sam/sam.jl")
|
||||||
|
include("bam/bam.jl")
|
||||||
|
|
||||||
|
using .SAM
|
||||||
|
using .BAM
|
||||||
|
|
||||||
|
end # module
|
153
src/bam/auxdata.jl
Normal file
153
src/bam/auxdata.jl
Normal file
|
@ -0,0 +1,153 @@
|
||||||
|
# BAM Auxiliary Data
|
||||||
|
# ==================
|
||||||
|
|
||||||
|
struct AuxData <: AbstractDict{String,Any}
|
||||||
|
data::Vector{UInt8}
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.getindex(aux::AuxData, tag::AbstractString)
|
||||||
|
checkauxtag(tag)
|
||||||
|
return getauxvalue(aux.data, 1, length(aux.data), UInt8(tag[1]), UInt8(tag[2]))
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.length(aux::AuxData)
|
||||||
|
data = aux.data
|
||||||
|
p = 1
|
||||||
|
len = 0
|
||||||
|
while p ≤ length(data)
|
||||||
|
len += 1
|
||||||
|
p = next_tag_position(data, p)
|
||||||
|
end
|
||||||
|
return len
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.iterate(aux::AuxData, pos=1)
|
||||||
|
if pos > length(aux.data)
|
||||||
|
return nothing
|
||||||
|
end
|
||||||
|
|
||||||
|
data = aux.data
|
||||||
|
@label doit
|
||||||
|
t1 = data[pos]
|
||||||
|
t2 = data[pos+1]
|
||||||
|
pos, typ = loadauxtype(data, pos + 2)
|
||||||
|
pos, value = loadauxvalue(data, pos, typ)
|
||||||
|
if t1 == t2 == 0xff
|
||||||
|
@goto doit
|
||||||
|
end
|
||||||
|
return Pair{String,Any}(String([t1, t2]), value), pos
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
# Internals
|
||||||
|
# ---------
|
||||||
|
|
||||||
|
function checkauxtag(tag::AbstractString)
|
||||||
|
if sizeof(tag) != 2
|
||||||
|
throw(ArgumentError("tag length must be 2"))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function getauxvalue(data::Vector{UInt8}, start::Int, stop::Int, t1::UInt8, t2::UInt8)
|
||||||
|
pos = findauxtag(data, start, stop, t1, t2)
|
||||||
|
if pos == 0
|
||||||
|
throw(KeyError(String([t1, t2])))
|
||||||
|
end
|
||||||
|
pos, T = loadauxtype(data, pos + 2)
|
||||||
|
_, val = loadauxvalue(data, pos, T)
|
||||||
|
return val
|
||||||
|
end
|
||||||
|
|
||||||
|
function loadauxtype(data::Vector{UInt8}, p::Int)
|
||||||
|
function auxtype(b)
|
||||||
|
return (
|
||||||
|
b == UInt8('A') ? Char :
|
||||||
|
b == UInt8('c') ? Int8 :
|
||||||
|
b == UInt8('C') ? UInt8 :
|
||||||
|
b == UInt8('s') ? Int16 :
|
||||||
|
b == UInt8('S') ? UInt16 :
|
||||||
|
b == UInt8('i') ? Int32 :
|
||||||
|
b == UInt8('I') ? UInt32 :
|
||||||
|
b == UInt8('f') ? Float32 :
|
||||||
|
b == UInt8('Z') ? String :
|
||||||
|
error("invalid type tag: '$(Char(b))'"))
|
||||||
|
end
|
||||||
|
t = data[p]
|
||||||
|
if t == UInt8('B')
|
||||||
|
return p + 2, Vector{auxtype(data[p+1])}
|
||||||
|
else
|
||||||
|
return p + 1, auxtype(t)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{T}) where T
|
||||||
|
return p + sizeof(T), unsafe_load(Ptr{T}(pointer(data, p)))
|
||||||
|
end
|
||||||
|
|
||||||
|
function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{Char})
|
||||||
|
return p + 1, Char(unsafe_load(pointer(data, p)))
|
||||||
|
end
|
||||||
|
|
||||||
|
function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{Vector{T}}) where T
|
||||||
|
n = unsafe_load(Ptr{Int32}(pointer(data, p)))
|
||||||
|
p += 4
|
||||||
|
xs = Vector{T}(undef, n)
|
||||||
|
unsafe_copyto!(pointer(xs), Ptr{T}(pointer(data, p)), n)
|
||||||
|
return p + n * sizeof(T), xs
|
||||||
|
end
|
||||||
|
|
||||||
|
function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{String})
|
||||||
|
dataptr = pointer(data, p)
|
||||||
|
endptr = ccall(:memchr, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), dataptr, '\0', length(data) - p + 1)
|
||||||
|
q::Int = p + (endptr - dataptr) - 1
|
||||||
|
return q + 2, String(data[p:q])
|
||||||
|
end
|
||||||
|
|
||||||
|
function findauxtag(data::Vector{UInt8}, start::Int, stop::Int, t1::UInt8, t2::UInt8)
|
||||||
|
pos = start
|
||||||
|
while pos ≤ stop && !(data[pos] == t1 && data[pos+1] == t2)
|
||||||
|
pos = next_tag_position(data, pos)
|
||||||
|
end
|
||||||
|
if pos > stop
|
||||||
|
return 0
|
||||||
|
else
|
||||||
|
return pos
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Find the starting position of a next tag in `data` after `p`.
|
||||||
|
# `(data[p], data[p+1])` is supposed to be a current tag.
|
||||||
|
function next_tag_position(data::Vector{UInt8}, p::Int)
|
||||||
|
typ = Char(data[p+2])
|
||||||
|
p += 3
|
||||||
|
if typ == 'A'
|
||||||
|
p += 1
|
||||||
|
elseif typ == 'c' || typ == 'C'
|
||||||
|
p += 1
|
||||||
|
elseif typ == 's' || typ == 'S'
|
||||||
|
p += 2
|
||||||
|
elseif typ == 'i' || typ == 'I'
|
||||||
|
p += 4
|
||||||
|
elseif typ == 'f'
|
||||||
|
p += 4
|
||||||
|
elseif typ == 'd'
|
||||||
|
p += 8
|
||||||
|
elseif typ == 'Z' || typ == 'H'
|
||||||
|
while data[p] != 0x00 # NULL-terminalted string
|
||||||
|
p += 1
|
||||||
|
end
|
||||||
|
p += 1
|
||||||
|
elseif typ == 'B'
|
||||||
|
eltyp = Char(data[p])
|
||||||
|
elsize = eltyp == 'c' || eltyp == 'C' ? 1 :
|
||||||
|
eltyp == 's' || eltyp == 'S' ? 2 :
|
||||||
|
eltyp == 'i' || eltye == 'I' || eltyp == 'f' ? 4 :
|
||||||
|
error("invalid type tag: '$(Char(eltyp))'")
|
||||||
|
p += 1
|
||||||
|
n = unsafe_load(Ptr{Int32}(pointer(data, p)))
|
||||||
|
p += 4 + elsize * n
|
||||||
|
else
|
||||||
|
error("invalid type tag: '$(Char(typ))'")
|
||||||
|
end
|
||||||
|
return p
|
||||||
|
end
|
55
src/bam/bai.jl
Normal file
55
src/bam/bai.jl
Normal file
|
@ -0,0 +1,55 @@
|
||||||
|
# BAI
|
||||||
|
# ===
|
||||||
|
#
|
||||||
|
# Index for BAM files.
|
||||||
|
|
||||||
|
|
||||||
|
# An index type for the BAM file format.
|
||||||
|
struct BAI
|
||||||
|
# BGZF file index
|
||||||
|
index::Indexes.BGZFIndex
|
||||||
|
|
||||||
|
# number of unmapped reads
|
||||||
|
n_no_coors::Union{Nothing, Int}
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
BAI(filename::AbstractString)
|
||||||
|
|
||||||
|
Load a BAI index from `filename`.
|
||||||
|
"""
|
||||||
|
function BAI(filename::AbstractString)
|
||||||
|
return open(read_bai, filename)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
BAI(input::IO)
|
||||||
|
|
||||||
|
Load a BAI index from `input`.
|
||||||
|
"""
|
||||||
|
function BAI(input::IO)
|
||||||
|
return read_bai(input)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Read a BAI object from `input`.
|
||||||
|
function read_bai(input::IO)
|
||||||
|
# check magic bytes
|
||||||
|
B = read(input, UInt8)
|
||||||
|
A = read(input, UInt8)
|
||||||
|
I = read(input, UInt8)
|
||||||
|
x = read(input, UInt8)
|
||||||
|
if B != UInt8('B') || A != UInt8('A') || I != UInt8('I') || x != 0x01
|
||||||
|
error("input is not a valid BAI file")
|
||||||
|
end
|
||||||
|
|
||||||
|
# read contents
|
||||||
|
n_refs = read(input, Int32)
|
||||||
|
index = Indexes.read_bgzfindex(input, n_refs)
|
||||||
|
if !eof(input)
|
||||||
|
n_no_coors = read(input, UInt64)
|
||||||
|
else
|
||||||
|
n_no_coors = nothing
|
||||||
|
end
|
||||||
|
|
||||||
|
return BAI(index, n_no_coors)
|
||||||
|
end
|
26
src/bam/bam.jl
Normal file
26
src/bam/bam.jl
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
# BAM File Format
|
||||||
|
# ===============
|
||||||
|
|
||||||
|
module BAM
|
||||||
|
|
||||||
|
using BioCore
|
||||||
|
using GenomicFeatures
|
||||||
|
using XAM.SAM
|
||||||
|
|
||||||
|
import BGZFStreams
|
||||||
|
import BioAlignments
|
||||||
|
import Indexes
|
||||||
|
import BioSequences
|
||||||
|
import BioCore: isfilled, header
|
||||||
|
|
||||||
|
import GenomicFeatures: eachoverlap
|
||||||
|
|
||||||
|
|
||||||
|
include("bai.jl")
|
||||||
|
include("auxdata.jl")
|
||||||
|
include("reader.jl")
|
||||||
|
include("record.jl")
|
||||||
|
include("writer.jl")
|
||||||
|
include("overlap.jl")
|
||||||
|
|
||||||
|
end
|
95
src/bam/overlap.jl
Normal file
95
src/bam/overlap.jl
Normal file
|
@ -0,0 +1,95 @@
|
||||||
|
# BAM Overlap
|
||||||
|
# ===========
|
||||||
|
|
||||||
|
struct OverlapIterator{T}
|
||||||
|
reader::Reader{T}
|
||||||
|
refname::String
|
||||||
|
interval::UnitRange{Int}
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.IteratorSize(::Type{OverlapIterator{T}}) where T
|
||||||
|
return Base.SizeUnknown()
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.eltype(::Type{OverlapIterator{T}}) where T
|
||||||
|
return Record
|
||||||
|
end
|
||||||
|
|
||||||
|
function GenomicFeatures.eachoverlap(reader::Reader, interval::Interval)
|
||||||
|
return GenomicFeatures.eachoverlap(reader, interval.seqname, interval.first:interval.last)
|
||||||
|
end
|
||||||
|
|
||||||
|
function GenomicFeatures.eachoverlap(reader::Reader, interval)
|
||||||
|
return GenomicFeatures.eachoverlap(reader, convert(Interval, interval))
|
||||||
|
end
|
||||||
|
|
||||||
|
function GenomicFeatures.eachoverlap(reader::Reader, refname::AbstractString, interval::UnitRange)
|
||||||
|
return OverlapIterator(reader, String(refname), interval)
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
# Iterator
|
||||||
|
# --------
|
||||||
|
|
||||||
|
mutable struct OverlapIteratorState
|
||||||
|
# reference index
|
||||||
|
refindex::Int
|
||||||
|
|
||||||
|
# possibly overlapping chunks
|
||||||
|
chunks::Vector{Indexes.Chunk}
|
||||||
|
|
||||||
|
# current chunk index
|
||||||
|
chunkid::Int
|
||||||
|
|
||||||
|
# pre-allocated record
|
||||||
|
record::Record
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.iterate(iter::OverlapIterator)
|
||||||
|
refindex = findfirst(isequal(iter.refname), iter.reader.refseqnames)
|
||||||
|
if refindex == 0
|
||||||
|
throw(ArgumentError("sequence name $(iter.refname) is not found in the header"))
|
||||||
|
end
|
||||||
|
@assert iter.reader.index !== nothing
|
||||||
|
chunks = Indexes.overlapchunks(iter.reader.index.index, refindex, iter.interval)
|
||||||
|
if !isempty(chunks)
|
||||||
|
seek(iter.reader, first(chunks).start)
|
||||||
|
end
|
||||||
|
state = OverlapIteratorState(refindex, chunks, 1, Record())
|
||||||
|
return iterate(iter, state)
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.iterate(iter::OverlapIterator, state)
|
||||||
|
while state.chunkid ≤ lastindex(state.chunks)
|
||||||
|
chunk = state.chunks[state.chunkid]
|
||||||
|
while BGZFStreams.virtualoffset(iter.reader.stream) < chunk.stop
|
||||||
|
read!(iter.reader, state.record)
|
||||||
|
c = compare_intervals(state.record, (state.refindex, iter.interval))
|
||||||
|
if c == 0
|
||||||
|
return copy(state.record), state
|
||||||
|
elseif c > 0
|
||||||
|
# no more overlapping records in this chunk since records are sorted
|
||||||
|
break
|
||||||
|
end
|
||||||
|
end
|
||||||
|
state.chunkid += 1
|
||||||
|
if state.chunkid ≤ lastindex(state.chunks)
|
||||||
|
seek(iter.reader, state.chunks[state.chunkid].start)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
return nothing
|
||||||
|
end
|
||||||
|
|
||||||
|
function compare_intervals(record::Record, interval::Tuple{Int,UnitRange{Int}})
|
||||||
|
rid = refid(record)
|
||||||
|
if rid < interval[1] || (rid == interval[1] && rightposition(record) < first(interval[2]))
|
||||||
|
# strictly left
|
||||||
|
return -1
|
||||||
|
elseif rid > interval[1] || (rid == interval[1] && position(record) > last(interval[2]))
|
||||||
|
# strictly right
|
||||||
|
return +1
|
||||||
|
else
|
||||||
|
# overlapping
|
||||||
|
return 0
|
||||||
|
end
|
||||||
|
end
|
141
src/bam/reader.jl
Normal file
141
src/bam/reader.jl
Normal file
|
@ -0,0 +1,141 @@
|
||||||
|
# BAM Reader
|
||||||
|
# ==========
|
||||||
|
|
||||||
|
"""
|
||||||
|
BAM.Reader(input::IO; index=nothing)
|
||||||
|
|
||||||
|
Create a data reader of the BAM file format.
|
||||||
|
|
||||||
|
# Arguments
|
||||||
|
* `input`: data source
|
||||||
|
* `index=nothing`: filepath to a random access index (currently *bai* is supported)
|
||||||
|
"""
|
||||||
|
mutable struct Reader{T} <: BioCore.IO.AbstractReader
|
||||||
|
stream::BGZFStreams.BGZFStream{T}
|
||||||
|
header::SAM.Header
|
||||||
|
start_offset::BGZFStreams.VirtualOffset
|
||||||
|
refseqnames::Vector{String}
|
||||||
|
refseqlens::Vector{Int}
|
||||||
|
index::Union{Nothing, BAI}
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.eltype(::Type{Reader{T}}) where T
|
||||||
|
return Record
|
||||||
|
end
|
||||||
|
|
||||||
|
function BioCore.IO.stream(reader::Reader)
|
||||||
|
return reader.stream
|
||||||
|
end
|
||||||
|
|
||||||
|
function Reader(input::IO; index=nothing)
|
||||||
|
if isa(index, AbstractString)
|
||||||
|
index = BAI(index)
|
||||||
|
else
|
||||||
|
if index != nothing
|
||||||
|
error("unrecognizable index argument")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
reader = init_bam_reader(input)
|
||||||
|
reader.index = index
|
||||||
|
return reader
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.show(io::IO, reader::Reader)
|
||||||
|
println(io, summary(reader), ":")
|
||||||
|
print(io, " number of contigs: ", length(reader.refseqnames))
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
header(reader::Reader; fillSQ::Bool=false)::SAM.Header
|
||||||
|
|
||||||
|
Get the header of `reader`.
|
||||||
|
|
||||||
|
If `fillSQ` is `true`, this function fills missing "SQ" metainfo in the header.
|
||||||
|
"""
|
||||||
|
function header(reader::Reader; fillSQ::Bool=false)::SAM.Header
|
||||||
|
header = reader.header
|
||||||
|
if fillSQ
|
||||||
|
if !isempty(findall(reader.header, "SQ"))
|
||||||
|
throw(ArgumentError("SAM header already has SQ records"))
|
||||||
|
end
|
||||||
|
header = copy(header)
|
||||||
|
for (name, len) in zip(reader.refseqnames, reader.refseqlens)
|
||||||
|
push!(header, SAM.MetaInfo("SQ", ["SN" => name, "LN" => len]))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
return header
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.seek(reader::Reader, voffset::BGZFStreams.VirtualOffset)
|
||||||
|
seek(reader.stream, voffset)
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.seekstart(reader::Reader)
|
||||||
|
seek(reader.stream, reader.start_offset)
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.iterate(reader::Reader, rec=Record())
|
||||||
|
if eof(reader)
|
||||||
|
return nothing
|
||||||
|
end
|
||||||
|
read!(reader, rec)
|
||||||
|
return copy(rec), rec
|
||||||
|
end
|
||||||
|
|
||||||
|
# Initialize a BAM reader by reading the header section.
|
||||||
|
function init_bam_reader(input::BGZFStreams.BGZFStream)
|
||||||
|
# magic bytes
|
||||||
|
B = read(input, UInt8)
|
||||||
|
A = read(input, UInt8)
|
||||||
|
M = read(input, UInt8)
|
||||||
|
x = read(input, UInt8)
|
||||||
|
if B != UInt8('B') || A != UInt8('A') || M != UInt8('M') || x != 0x01
|
||||||
|
error("input was not a valid BAM file")
|
||||||
|
end
|
||||||
|
|
||||||
|
# SAM header
|
||||||
|
textlen = read(input, Int32)
|
||||||
|
samreader = SAM.Reader(IOBuffer(read(input, textlen)))
|
||||||
|
|
||||||
|
# reference sequences
|
||||||
|
refseqnames = String[]
|
||||||
|
refseqlens = Int[]
|
||||||
|
n_refs = read(input, Int32)
|
||||||
|
for _ in 1:n_refs
|
||||||
|
namelen = read(input, Int32)
|
||||||
|
data = read(input, namelen)
|
||||||
|
seqname = unsafe_string(pointer(data))
|
||||||
|
seqlen = read(input, Int32)
|
||||||
|
push!(refseqnames, seqname)
|
||||||
|
push!(refseqlens, seqlen)
|
||||||
|
end
|
||||||
|
|
||||||
|
voffset = isa(input.io, Base.AbstractPipe) ?
|
||||||
|
BGZFStreams.VirtualOffset(0, 0) :
|
||||||
|
BGZFStreams.virtualoffset(input)
|
||||||
|
return Reader(
|
||||||
|
input,
|
||||||
|
samreader.header,
|
||||||
|
voffset,
|
||||||
|
refseqnames,
|
||||||
|
refseqlens,
|
||||||
|
nothing)
|
||||||
|
end
|
||||||
|
|
||||||
|
function init_bam_reader(input::IO)
|
||||||
|
return init_bam_reader(BGZFStreams.BGZFStream(input))
|
||||||
|
end
|
||||||
|
|
||||||
|
function _read!(reader::Reader, record)
|
||||||
|
unsafe_read(
|
||||||
|
reader.stream,
|
||||||
|
pointer_from_objref(record),
|
||||||
|
FIXED_FIELDS_BYTES)
|
||||||
|
dsize = data_size(record)
|
||||||
|
if length(record.data) < dsize
|
||||||
|
resize!(record.data, dsize)
|
||||||
|
end
|
||||||
|
unsafe_read(reader.stream, pointer(record.data), dsize)
|
||||||
|
record.reader = reader
|
||||||
|
return record
|
||||||
|
end
|
634
src/bam/record.jl
Normal file
634
src/bam/record.jl
Normal file
|
@ -0,0 +1,634 @@
|
||||||
|
# BAM Record
|
||||||
|
# ==========
|
||||||
|
|
||||||
|
"""
|
||||||
|
BAM.Record()
|
||||||
|
|
||||||
|
Create an unfilled BAM record.
|
||||||
|
"""
|
||||||
|
mutable struct Record
|
||||||
|
# fixed-length fields (see BMA specs for the details)
|
||||||
|
block_size::Int32
|
||||||
|
refid::Int32
|
||||||
|
pos::Int32
|
||||||
|
bin_mq_nl::UInt32
|
||||||
|
flag_nc::UInt32
|
||||||
|
l_seq::Int32
|
||||||
|
next_refid::Int32
|
||||||
|
next_pos::Int32
|
||||||
|
tlen::Int32
|
||||||
|
# variable length data
|
||||||
|
data::Vector{UInt8}
|
||||||
|
reader::Reader
|
||||||
|
|
||||||
|
function Record()
|
||||||
|
return new(0, 0, 0, 0, 0, 0, 0, 0, 0, UInt8[])
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# the data size of fixed-length fields (block_size-tlen)
|
||||||
|
const FIXED_FIELDS_BYTES = 36
|
||||||
|
|
||||||
|
function Record(data::Vector{UInt8})
|
||||||
|
return convert(Record, data)
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.convert(::Type{Record}, data::Vector{UInt8})
|
||||||
|
length(data) < FIXED_FIELDS_BYTES && throw(ArgumentError("data too short"))
|
||||||
|
record = Record()
|
||||||
|
dst_pointer = Ptr{UInt8}(pointer_from_objref(record))
|
||||||
|
unsafe_copyto!(dst_pointer, pointer(data), FIXED_FIELDS_BYTES)
|
||||||
|
dsize = data_size(record)
|
||||||
|
resize!(record.data, dsize)
|
||||||
|
length(data) < dsize + FIXED_FIELDS_BYTES && throw(ArgumentError("data too short"))
|
||||||
|
unsafe_copyto!(record.data, 1, data, FIXED_FIELDS_BYTES + 1, dsize)
|
||||||
|
return record
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.copy(record::Record)
|
||||||
|
copy = Record()
|
||||||
|
copy.block_size = record.block_size
|
||||||
|
copy.refid = record.refid
|
||||||
|
copy.pos = record.pos
|
||||||
|
copy.bin_mq_nl = record.bin_mq_nl
|
||||||
|
copy.flag_nc = record.flag_nc
|
||||||
|
copy.l_seq = record.l_seq
|
||||||
|
copy.next_refid = record.next_refid
|
||||||
|
copy.next_pos = record.next_pos
|
||||||
|
copy.tlen = record.tlen
|
||||||
|
copy.data = record.data[1:data_size(record)]
|
||||||
|
if isdefined(record, :reader)
|
||||||
|
copy.reader = record.reader
|
||||||
|
end
|
||||||
|
return copy
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.show(io::IO, record::Record)
|
||||||
|
print(io, summary(record), ':')
|
||||||
|
if isfilled(record)
|
||||||
|
println(io)
|
||||||
|
println(io, " template name: ", tempname(record))
|
||||||
|
println(io, " flag: ", flag(record))
|
||||||
|
println(io, " reference ID: ", refid(record))
|
||||||
|
println(io, " position: ", position(record))
|
||||||
|
println(io, " mapping quality: ", mappingquality(record))
|
||||||
|
println(io, " CIGAR: ", cigar(record))
|
||||||
|
println(io, " next reference ID: ", nextrefid(record))
|
||||||
|
println(io, " next position: ", nextposition(record))
|
||||||
|
println(io, " template length: ", templength(record))
|
||||||
|
println(io, " sequence: ", sequence(record))
|
||||||
|
# TODO: pretty print base quality
|
||||||
|
println(io, " base quality: ", quality(record))
|
||||||
|
print(io, " auxiliary data:")
|
||||||
|
for field in keys(auxdata(record))
|
||||||
|
print(io, ' ', field, '=', record[field])
|
||||||
|
end
|
||||||
|
else
|
||||||
|
print(io, " <not filled>")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.read!(reader::Reader, record::Record)
|
||||||
|
return _read!(reader, record)
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
# Accessor Fuctions
|
||||||
|
# -----------------
|
||||||
|
|
||||||
|
"""
|
||||||
|
flag(record::Record)::UInt16
|
||||||
|
|
||||||
|
Get the bitwise flag of `record`.
|
||||||
|
"""
|
||||||
|
function flag(record::Record)::UInt16
|
||||||
|
checkfilled(record)
|
||||||
|
return UInt16(record.flag_nc >> 16)
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasflag(record::Record)
|
||||||
|
return isfilled(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
ismapped(record::Record)::Bool
|
||||||
|
|
||||||
|
Test if `record` is mapped.
|
||||||
|
"""
|
||||||
|
function ismapped(record::Record)::Bool
|
||||||
|
return flag(record) & SAM.FLAG_UNMAP == 0
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
isprimary(record::Record)::Bool
|
||||||
|
|
||||||
|
Test if `record` is a primary line of the read.
|
||||||
|
|
||||||
|
This is equivalent to `flag(record) & 0x900 == 0`.
|
||||||
|
"""
|
||||||
|
function isprimary(record::Record)::Bool
|
||||||
|
return flag(record) & 0x900 == 0
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
ispositivestrand(record::Record)::Bool
|
||||||
|
|
||||||
|
Test if `record` is aligned to the positive strand.
|
||||||
|
|
||||||
|
This is equivalent to `flag(record) & 0x10 == 0`.
|
||||||
|
"""
|
||||||
|
function ispositivestrand(record::Record)::Bool
|
||||||
|
flag(record) & 0x10 == 0
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
refid(record::Record)::Int
|
||||||
|
|
||||||
|
Get the reference sequence ID of `record`.
|
||||||
|
|
||||||
|
The ID is 1-based (i.e. the first sequence is 1) and is 0 for a record without a mapping position.
|
||||||
|
|
||||||
|
See also: `BAM.rname`
|
||||||
|
"""
|
||||||
|
function refid(record::Record)::Int
|
||||||
|
checkfilled(record)
|
||||||
|
return record.refid + 1
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasrefid(record::Record)
|
||||||
|
return isfilled(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
function checked_refid(record::Record)
|
||||||
|
id = refid(record)
|
||||||
|
if id == 0
|
||||||
|
throw(ArgumentError("record is not mapped"))
|
||||||
|
elseif !isdefined(record, :reader)
|
||||||
|
throw(ArgumentError("reader is not defined"))
|
||||||
|
end
|
||||||
|
return id
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
refname(record::Record)::String
|
||||||
|
|
||||||
|
Get the reference sequence name of `record`.
|
||||||
|
|
||||||
|
See also: `BAM.refid`
|
||||||
|
"""
|
||||||
|
function refname(record::Record)::String
|
||||||
|
checkfilled(record)
|
||||||
|
id = checked_refid(record)
|
||||||
|
return record.reader.refseqnames[id]
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
reflen(record::Record)::Int
|
||||||
|
|
||||||
|
Get the length of the reference sequence this record applies to.
|
||||||
|
"""
|
||||||
|
function reflen(record::Record)::Int
|
||||||
|
checkfilled(record)
|
||||||
|
id = checked_refid(record)
|
||||||
|
return record.reader.refseqlens[id]
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasrefname(record::Record)
|
||||||
|
return hasrefid(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
position(record::Record)::Int
|
||||||
|
|
||||||
|
Get the 1-based leftmost mapping position of `record`.
|
||||||
|
"""
|
||||||
|
function position(record::Record)::Int
|
||||||
|
checkfilled(record)
|
||||||
|
return record.pos + 1
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasposition(record::Record)
|
||||||
|
return isfilled(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
rightposition(record::Record)::Int
|
||||||
|
|
||||||
|
Get the 1-based rightmost mapping position of `record`.
|
||||||
|
"""
|
||||||
|
function rightposition(record::Record)::Int
|
||||||
|
checkfilled(record)
|
||||||
|
return Int32(position(record) + alignlength(record) - 1)
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasrightposition(record::Record)
|
||||||
|
return isfilled(record) && ismapped(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
isnextmapped(record::Record)::Bool
|
||||||
|
|
||||||
|
Test if the mate/next read of `record` is mapped.
|
||||||
|
"""
|
||||||
|
function isnextmapped(record::Record)::Bool
|
||||||
|
return isfilled(record) && (flag(record) & FLAG_MUNMAP == 0)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
nextrefid(record::Record)::Int
|
||||||
|
|
||||||
|
Get the next/mate reference sequence ID of `record`.
|
||||||
|
"""
|
||||||
|
function nextrefid(record::Record)::Int
|
||||||
|
checkfilled(record)
|
||||||
|
return record.next_refid + 1
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasnextrefid(record::Record)
|
||||||
|
return isfilled(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
nextrefname(record::Record)::String
|
||||||
|
|
||||||
|
Get the reference name of the mate/next read of `record`.
|
||||||
|
"""
|
||||||
|
function nextrefname(record::Record)::String
|
||||||
|
checkfilled(record)
|
||||||
|
id = nextrefid(record)
|
||||||
|
if id == 0
|
||||||
|
throw(ArgumentError("next record is not mapped"))
|
||||||
|
elseif !isdefined(record, :reader)
|
||||||
|
throw(ArgumentError("reader is not defined"))
|
||||||
|
end
|
||||||
|
return record.reader.refseqnames[id]
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasnextrefname(record::Record)
|
||||||
|
return isfilled(record) && isnextmapped(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
nextposition(record::Record)::Int
|
||||||
|
|
||||||
|
Get the 1-based leftmost mapping position of the next/mate read of `record`.
|
||||||
|
"""
|
||||||
|
function nextposition(record::Record)::Int
|
||||||
|
checkfilled(record)
|
||||||
|
return record.next_pos + 1
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasnextposition(record::Record)
|
||||||
|
return isfilled(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
mappingquality(record::Record)::UInt8
|
||||||
|
|
||||||
|
Get the mapping quality of `record`.
|
||||||
|
"""
|
||||||
|
function mappingquality(record::Record)::UInt8
|
||||||
|
return UInt8((record.bin_mq_nl >> 8) & 0xff)
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasmappingquality(record::Record)
|
||||||
|
return isfilled(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
n_cigar_op(record::Record, checkCG::Bool = true)
|
||||||
|
|
||||||
|
Return the number of operations in the CIGAR string of `record`.
|
||||||
|
|
||||||
|
Note that in the BAM specification, the field called `cigar` typically stores the cigar string of the record.
|
||||||
|
However, this is not always true, sometimes the true cigar is very long, and due to some constraints of the BAM format, the actual cigar string is stored in an extra tag: `CG:B,I`, and the `cigar` field stores a pseudo-cigar string.
|
||||||
|
|
||||||
|
Calling this method with `checkCG` set to `true` (default) this method will always yield the number of operations in the true cigar string, because this is probably what you want, the vast majority of the time.
|
||||||
|
|
||||||
|
If you have a record that stores the true cigar in a `CG:B,I` tag, but you still want to get the number of operations in the `cigar` field of the BAM record, then set `checkCG` to `false`.
|
||||||
|
"""
|
||||||
|
function n_cigar_op(record::Record, checkCG::Bool = true)
|
||||||
|
return cigar_position(record, checkCG)[2]
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
cigar(record::Record)::String
|
||||||
|
|
||||||
|
Get the CIGAR string of `record`.
|
||||||
|
|
||||||
|
Note that in the BAM specification, the field called `cigar` typically stores the cigar string of the record.
|
||||||
|
However, this is not always true, sometimes the true cigar is very long, and due to some constraints of the BAM format, the actual cigar string is stored in an extra tag: `CG:B,I`, and the `cigar` field stores a pseudo-cigar string.
|
||||||
|
|
||||||
|
Calling this method with `checkCG` set to `true` (default) this method will always yield the true cigar string, because this is probably what you want the vast majority of the time.
|
||||||
|
|
||||||
|
If you have a record that stores the true cigar in a `CG:B,I` tag, but you still want to access the pseudo-cigar that is stored in the `cigar` field of the BAM record, then you can set checkCG to `false`.
|
||||||
|
|
||||||
|
See also `BAM.cigar_rle`.
|
||||||
|
"""
|
||||||
|
function cigar(record::Record, checkCG::Bool = true)::String
|
||||||
|
buf = IOBuffer()
|
||||||
|
for (op, len) in zip(cigar_rle(record, checkCG)...)
|
||||||
|
print(buf, len, convert(Char, op))
|
||||||
|
end
|
||||||
|
return String(take!(buf))
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
cigar_rle(record::Record, checkCG::Bool = true)::Tuple{Vector{BioAlignments.Operation},Vector{Int}}
|
||||||
|
|
||||||
|
Get a run-length encoded tuple `(ops, lens)` of the CIGAR string in `record`.
|
||||||
|
|
||||||
|
Note that in the BAM specification, the field called `cigar` typically stores the cigar string of the record.
|
||||||
|
However, this is not always true, sometimes the true cigar is very long, and due to some constraints of the BAM format, the actual cigar string is stored in an extra tag: `CG:B,I`, and the `cigar` field stores a pseudo-cigar string.
|
||||||
|
|
||||||
|
Calling this method with `checkCG` set to `true` (default) this method will always yield the true cigar string, because this is probably what you want the vast majority of the time.
|
||||||
|
|
||||||
|
If you have a record that stores the true cigar in a `CG:B,I` tag, but you still want to access the pseudo-cigar that is stored in the `cigar` field of the BAM record, then you can set checkCG to `false`.
|
||||||
|
|
||||||
|
See also `BAM.cigar`.
|
||||||
|
"""
|
||||||
|
function cigar_rle(record::Record, checkCG::Bool = true)::Tuple{Vector{BioAlignments.Operation},Vector{Int}}
|
||||||
|
checkfilled(record)
|
||||||
|
idx, nops = cigar_position(record, checkCG)
|
||||||
|
ops, lens = extract_cigar_rle(record.data, idx, nops)
|
||||||
|
return ops, lens
|
||||||
|
end
|
||||||
|
|
||||||
|
function extract_cigar_rle(data::Vector{UInt8}, offset, n)
|
||||||
|
ops = Vector{BioAlignments.Operation}()
|
||||||
|
lens = Vector{Int}()
|
||||||
|
for i in offset:4:offset + (n - 1) * 4
|
||||||
|
x = unsafe_load(Ptr{UInt32}(pointer(data, i)))
|
||||||
|
op = BioAlignments.Operation(x & 0x0F)
|
||||||
|
push!(ops, op)
|
||||||
|
push!(lens, x >> 4)
|
||||||
|
end
|
||||||
|
return ops, lens
|
||||||
|
end
|
||||||
|
|
||||||
|
function cigar_position(record::Record, checkCG::Bool = true)::Tuple{Int, Int}
|
||||||
|
cigaridx, nops = seqname_length(record) + 1, record.flag_nc & 0xFFFF
|
||||||
|
if !checkCG
|
||||||
|
return cigaridx, nops
|
||||||
|
end
|
||||||
|
if nops != 2
|
||||||
|
return cigaridx, nops
|
||||||
|
end
|
||||||
|
x = unsafe_load(Ptr{UInt32}(pointer(record.data, cigaridx)))
|
||||||
|
if x != UInt32(seqlength(record) << 4 | 4)
|
||||||
|
return cigaridx, nops
|
||||||
|
end
|
||||||
|
start = auxdata_position(record)
|
||||||
|
stop = data_size(record)
|
||||||
|
tagidx = findauxtag(record.data, start, stop, UInt8('C'), UInt8('G'))
|
||||||
|
if tagidx == 0
|
||||||
|
return cigaridx, nops
|
||||||
|
end
|
||||||
|
# Tag exists, validate type is BI.
|
||||||
|
typ = unsafe_load(Ptr{UInt16}(pointer(record.data, tagidx += 2)))
|
||||||
|
if typ != (UInt16('I') << 8 | UInt16('B'))
|
||||||
|
return cigaridx, nops
|
||||||
|
end
|
||||||
|
# If got this far, the CG tag is valid and contains the cigar.
|
||||||
|
# Get the true n_cigar_ops, and return it and the idx of the first
|
||||||
|
nops = UInt32(unsafe_load(Ptr{Int32}(pointer(record.data, tagidx += 2))))
|
||||||
|
tagidx += 4
|
||||||
|
return tagidx, nops
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
alignment(record::Record)::BioAlignments.Alignment
|
||||||
|
|
||||||
|
Get the alignment of `record`.
|
||||||
|
"""
|
||||||
|
function alignment(record::Record)::BioAlignments.Alignment
|
||||||
|
checkfilled(record)
|
||||||
|
if !ismapped(record)
|
||||||
|
return BioAlignments.Alignment(BioAlignments.AlignmentAnchor[])
|
||||||
|
end
|
||||||
|
seqpos = 0
|
||||||
|
refpos = position(record) - 1
|
||||||
|
anchors = [BioAlignments.AlignmentAnchor(seqpos, refpos, BioAlignments.OP_START)]
|
||||||
|
for (op, len) in zip(cigar_rle(record)...)
|
||||||
|
if BioAlignments.ismatchop(op)
|
||||||
|
seqpos += len
|
||||||
|
refpos += len
|
||||||
|
elseif BioAlignments.isinsertop(op)
|
||||||
|
seqpos += len
|
||||||
|
elseif BioAlignments.isdeleteop(op)
|
||||||
|
refpos += len
|
||||||
|
else
|
||||||
|
error("operation $(op) is not supported")
|
||||||
|
end
|
||||||
|
push!(anchors, BioAlignments.AlignmentAnchor(seqpos, refpos, op))
|
||||||
|
end
|
||||||
|
return BioAlignments.Alignment(anchors)
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasalignment(record::Record)
|
||||||
|
return ismapped(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
alignlength(record::Record)::Int
|
||||||
|
|
||||||
|
Get the alignment length of `record`.
|
||||||
|
"""
|
||||||
|
function alignlength(record::Record)::Int
|
||||||
|
offset = seqname_length(record)
|
||||||
|
length::Int = 0
|
||||||
|
for i in offset + 1:4:offset + n_cigar_op(record, false) * 4
|
||||||
|
x = unsafe_load(Ptr{UInt32}(pointer(record.data, i)))
|
||||||
|
op = BioAlignments.Operation(x & 0x0F)
|
||||||
|
if BioAlignments.ismatchop(op) || BioAlignments.isdeleteop(op)
|
||||||
|
length += x >> 4
|
||||||
|
end
|
||||||
|
end
|
||||||
|
return length
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
tempname(record::Record)::String
|
||||||
|
|
||||||
|
Get the query template name of `record`.
|
||||||
|
"""
|
||||||
|
function tempname(record::Record)::String
|
||||||
|
checkfilled(record)
|
||||||
|
# drop the last NUL character
|
||||||
|
return unsafe_string(pointer(record.data), max(seqname_length(record) - 1, 0))
|
||||||
|
end
|
||||||
|
|
||||||
|
function hastempname(record::Record)
|
||||||
|
return isfilled(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
templength(record::Record)::Int
|
||||||
|
|
||||||
|
Get the template length of `record`.
|
||||||
|
"""
|
||||||
|
function templength(record::Record)::Int
|
||||||
|
checkfilled(record)
|
||||||
|
return record.tlen
|
||||||
|
end
|
||||||
|
|
||||||
|
function hastemplength(record::Record)
|
||||||
|
return isfilled(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
sequence(record::Record)::BioSequences.LongDNASeq
|
||||||
|
|
||||||
|
Get the segment sequence of `record`.
|
||||||
|
"""
|
||||||
|
function sequence(record::Record)::BioSequences.LongDNASeq
|
||||||
|
checkfilled(record)
|
||||||
|
seqlen = seqlength(record)
|
||||||
|
data = Vector{UInt64}(undef, cld(seqlen, 16))
|
||||||
|
src::Ptr{UInt64} = pointer(record.data, seqname_length(record) + n_cigar_op(record, false) * 4 + 1)
|
||||||
|
for i in 1:lastindex(data)
|
||||||
|
# copy data flipping high and low nybble
|
||||||
|
x = unsafe_load(src, i)
|
||||||
|
data[i] = (x & 0x0f0f0f0f0f0f0f0f) << 4 | (x & 0xf0f0f0f0f0f0f0f0) >> 4
|
||||||
|
end
|
||||||
|
return BioSequences.LongDNASeq(data, 1:seqlen, false)
|
||||||
|
end
|
||||||
|
|
||||||
|
function hassequence(record::Record)
|
||||||
|
return isfilled(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
seqlength(record::Record)::Int
|
||||||
|
|
||||||
|
Get the sequence length of `record`.
|
||||||
|
"""
|
||||||
|
function seqlength(record::Record)::Int
|
||||||
|
checkfilled(record)
|
||||||
|
return record.l_seq % Int
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasseqlength(record::Record)
|
||||||
|
return isfilled(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
quality(record::Record)::Vector{UInt8}
|
||||||
|
|
||||||
|
Get the base quality of `record`.
|
||||||
|
"""
|
||||||
|
function quality(record::Record)::Vector{UInt8}
|
||||||
|
checkfilled(record)
|
||||||
|
seqlen = seqlength(record)
|
||||||
|
offset = seqname_length(record) + n_cigar_op(record, false) * 4 + cld(seqlen, 2)
|
||||||
|
return [reinterpret(Int8, record.data[i+offset]) for i in 1:seqlen]
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasquality(record::Record)
|
||||||
|
return isfilled(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
auxdata(record::Record)::BAM.AuxData
|
||||||
|
|
||||||
|
Get the auxiliary data of `record`.
|
||||||
|
"""
|
||||||
|
function auxdata(record::Record)
|
||||||
|
checkfilled(record)
|
||||||
|
return AuxData(record.data[auxdata_position(record):data_size(record)])
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasauxdata(record::Record)
|
||||||
|
return isfilled(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.getindex(record::Record, tag::AbstractString)
|
||||||
|
checkauxtag(tag)
|
||||||
|
start = auxdata_position(record)
|
||||||
|
stop = data_size(record)
|
||||||
|
return getauxvalue(record.data, start, stop, UInt8(tag[1]), UInt8(tag[2]))
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.haskey(record::Record, tag::AbstractString)
|
||||||
|
checkauxtag(tag)
|
||||||
|
start = auxdata_position(record)
|
||||||
|
stop = data_size(record)
|
||||||
|
return findauxtag(record.data, start, stop, UInt8(tag[1]), UInt8(tag[2])) > 0
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.keys(record::Record)
|
||||||
|
return collect(keys(auxdata(record)))
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.values(record::Record)
|
||||||
|
return [record[key] for key in keys(record)]
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
# BioCore Methods
|
||||||
|
# -----------
|
||||||
|
|
||||||
|
function BioCore.isfilled(record::Record)
|
||||||
|
return record.block_size != 0
|
||||||
|
end
|
||||||
|
|
||||||
|
function BioCore.seqname(record::Record)
|
||||||
|
return tempname(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
function BioCore.hasseqname(record::Record)
|
||||||
|
return hastempname(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
function BioCore.sequence(record::Record)
|
||||||
|
return sequence(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
function BioCore.hassequence(record::Record)
|
||||||
|
return hassequence(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
function BioCore.leftposition(record::Record)
|
||||||
|
return position(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
function BioCore.hasleftposition(record::Record)
|
||||||
|
return hasposition(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
function BioCore.rightposition(record::Record)
|
||||||
|
return rightposition(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
function BioCore.hasrightposition(record::Record)
|
||||||
|
return hasrightposition(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
# Helper Functions
|
||||||
|
# ----------------
|
||||||
|
|
||||||
|
# Return the size of the `.data` field.
|
||||||
|
function data_size(record::Record)
|
||||||
|
if isfilled(record)
|
||||||
|
return record.block_size - FIXED_FIELDS_BYTES + sizeof(record.block_size)
|
||||||
|
else
|
||||||
|
return 0
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function checkfilled(record::Record)
|
||||||
|
if !isfilled(record)
|
||||||
|
throw(ArgumentError("unfilled BAM record"))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function auxdata_position(record::Record)
|
||||||
|
seqlen = seqlength(record)
|
||||||
|
return seqname_length(record) + n_cigar_op(record, false) * 4 + cld(seqlen, 2) + seqlen + 1
|
||||||
|
end
|
||||||
|
|
||||||
|
# Return the length of the read name.
|
||||||
|
function seqname_length(record::Record)
|
||||||
|
return record.bin_mq_nl & 0xff
|
||||||
|
end
|
62
src/bam/writer.jl
Normal file
62
src/bam/writer.jl
Normal file
|
@ -0,0 +1,62 @@
|
||||||
|
# BAM Writer
|
||||||
|
# ==========
|
||||||
|
|
||||||
|
"""
|
||||||
|
BAM.Writer(output::BGZFStream, header::SAM.Header)
|
||||||
|
|
||||||
|
Create a data writer of the BAM file format.
|
||||||
|
|
||||||
|
# Arguments
|
||||||
|
* `output`: data sink
|
||||||
|
* `header`: SAM header object
|
||||||
|
"""
|
||||||
|
mutable struct Writer <: BioCore.IO.AbstractWriter
|
||||||
|
stream::BGZFStreams.BGZFStream
|
||||||
|
end
|
||||||
|
|
||||||
|
function Writer(stream::BGZFStreams.BGZFStream, header::SAM.Header)
|
||||||
|
refseqnames = String[]
|
||||||
|
refseqlens = Int[]
|
||||||
|
for metainfo in findall(header, "SQ")
|
||||||
|
push!(refseqnames, metainfo["SN"])
|
||||||
|
push!(refseqlens, parse(Int, metainfo["LN"]))
|
||||||
|
end
|
||||||
|
write_header(stream, header, refseqnames, refseqlens)
|
||||||
|
return Writer(stream)
|
||||||
|
end
|
||||||
|
|
||||||
|
function BioCore.IO.stream(writer::Writer)
|
||||||
|
return writer.stream
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.write(writer::Writer, record::Record)
|
||||||
|
n = 0
|
||||||
|
n += unsafe_write(writer.stream, pointer_from_objref(record), FIXED_FIELDS_BYTES)
|
||||||
|
n += unsafe_write(writer.stream, pointer(record.data), data_size(record))
|
||||||
|
return n
|
||||||
|
end
|
||||||
|
|
||||||
|
function write_header(stream, header, refseqnames, refseqlens)
|
||||||
|
@assert length(refseqnames) == length(refseqlens)
|
||||||
|
n = 0
|
||||||
|
|
||||||
|
# magic bytes
|
||||||
|
n += write(stream, "BAM\1")
|
||||||
|
|
||||||
|
# SAM header
|
||||||
|
buf = IOBuffer()
|
||||||
|
l = write(SAM.Writer(buf), header)
|
||||||
|
n += write(stream, Int32(l))
|
||||||
|
n += write(stream, take!(buf))
|
||||||
|
|
||||||
|
# reference sequences
|
||||||
|
n += write(stream, Int32(length(refseqnames)))
|
||||||
|
for (seqname, seqlen) in zip(refseqnames, refseqlens)
|
||||||
|
namelen = length(seqname)
|
||||||
|
n += write(stream, Int32(namelen + 1))
|
||||||
|
n += write(stream, seqname, '\0')
|
||||||
|
n += write(stream, Int32(seqlen))
|
||||||
|
end
|
||||||
|
|
||||||
|
return n
|
||||||
|
end
|
26
src/sam/flags.jl
Normal file
26
src/sam/flags.jl
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
# SAM Flags
|
||||||
|
# =========
|
||||||
|
#
|
||||||
|
# Bitwise flags (or FLAG).
|
||||||
|
|
||||||
|
for (name, bits, doc) in [
|
||||||
|
(:PAIRED, UInt16(0x001), "the read is paired in sequencing, no matter whether it is mapped in a pair"),
|
||||||
|
(:PROPER_PAIR, UInt16(0x002), "the read is mapped in a proper pair" ),
|
||||||
|
(:UNMAP, UInt16(0x004), "the read itself is unmapped; conflictive with SAM.FLAG_PROPER_PAIR" ),
|
||||||
|
(:MUNMAP, UInt16(0x008), "the mate is unmapped" ),
|
||||||
|
(:REVERSE, UInt16(0x010), "the read is mapped to the reverse strand" ),
|
||||||
|
(:MREVERSE, UInt16(0x020), "the mate is mapped to the reverse strand" ),
|
||||||
|
(:READ1, UInt16(0x040), "this is read1" ),
|
||||||
|
(:READ2, UInt16(0x080), "this is read2" ),
|
||||||
|
(:SECONDARY, UInt16(0x100), "not primary alignment" ),
|
||||||
|
(:QCFAIL, UInt16(0x200), "QC failure" ),
|
||||||
|
(:DUP, UInt16(0x400), "optical or PCR duplicate" ),
|
||||||
|
(:SUPPLEMENTARY, UInt16(0x800), "supplementary alignment" ),]
|
||||||
|
@assert bits isa UInt16
|
||||||
|
sym = Symbol("FLAG_", name)
|
||||||
|
doc = string(@sprintf("0x%04x: ", bits), doc)
|
||||||
|
@eval begin
|
||||||
|
@doc $(doc) const $(sym) = $(bits)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
53
src/sam/header.jl
Normal file
53
src/sam/header.jl
Normal file
|
@ -0,0 +1,53 @@
|
||||||
|
# SAM Header
|
||||||
|
# ==========
|
||||||
|
|
||||||
|
struct Header
|
||||||
|
metainfo::Vector{MetaInfo}
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
SAM.Header()
|
||||||
|
|
||||||
|
Create an empty header.
|
||||||
|
"""
|
||||||
|
function Header()
|
||||||
|
return Header(MetaInfo[])
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.copy(header::Header)
|
||||||
|
return Header(header.metainfo)
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.eltype(::Type{Header})
|
||||||
|
return MetaInfo
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.length(header::Header)
|
||||||
|
return length(header.metainfo)
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.iterate(header::Header, i=1)
|
||||||
|
if i > length(header.metainfo)
|
||||||
|
return nothing
|
||||||
|
end
|
||||||
|
return header.metainfo[i], i + 1
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
find(header::Header, key::AbstractString)::Vector{MetaInfo}
|
||||||
|
|
||||||
|
Find metainfo objects satisfying `SAM.tag(metainfo) == key`.
|
||||||
|
"""
|
||||||
|
function Base.findall(header::Header, key::AbstractString)::Vector{MetaInfo}
|
||||||
|
return filter(m -> isequalkey(m, key), header.metainfo)
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.pushfirst!(header::Header, metainfo::MetaInfo)
|
||||||
|
pushfirst!(header.metainfo, metainfo)
|
||||||
|
return header
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.push!(header::Header, metainfo::MetaInfo)
|
||||||
|
push!(header.metainfo, metainfo)
|
||||||
|
return header
|
||||||
|
end
|
235
src/sam/metainfo.jl
Normal file
235
src/sam/metainfo.jl
Normal file
|
@ -0,0 +1,235 @@
|
||||||
|
# SAM Meta-Information
|
||||||
|
# ====================
|
||||||
|
|
||||||
|
mutable struct MetaInfo
|
||||||
|
# data and filled range
|
||||||
|
data::Vector{UInt8}
|
||||||
|
filled::UnitRange{Int}
|
||||||
|
# indexes
|
||||||
|
tag::UnitRange{Int}
|
||||||
|
val::UnitRange{Int}
|
||||||
|
dictkey::Vector{UnitRange{Int}}
|
||||||
|
dictval::Vector{UnitRange{Int}}
|
||||||
|
end
|
||||||
|
|
||||||
|
function MetaInfo(data::Vector{UInt8}=UInt8[])
|
||||||
|
metainfo = MetaInfo(data, 1:0, 1:0, 1:0, UnitRange{Int}[], UnitRange{Int}[])
|
||||||
|
if !isempty(data)
|
||||||
|
index!(metainfo)
|
||||||
|
end
|
||||||
|
return metainfo
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
MetaInfo(str::AbstractString)
|
||||||
|
|
||||||
|
Create a SAM metainfo from `str`.
|
||||||
|
|
||||||
|
# Examples
|
||||||
|
|
||||||
|
julia> SAM.MetaInfo("@CO\tsome comment")
|
||||||
|
BioAlignments.SAM.MetaInfo:
|
||||||
|
tag: CO
|
||||||
|
value: some comment
|
||||||
|
|
||||||
|
julia> SAM.MetaInfo("@SQ\tSN:chr1\tLN:12345")
|
||||||
|
BioAlignments.SAM.MetaInfo:
|
||||||
|
tag: SQ
|
||||||
|
value: SN=chr1 LN=12345
|
||||||
|
|
||||||
|
"""
|
||||||
|
function MetaInfo(str::AbstractString)
|
||||||
|
return MetaInfo(Vector{UInt8}(str))
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
MetaInfo(tag::AbstractString, value)
|
||||||
|
|
||||||
|
Create a SAM metainfo with `tag` and `value`.
|
||||||
|
|
||||||
|
`tag` is a two-byte ASCII string. If `tag` is `"CO"`, `value` must be a string; otherwise, `value` is an iterable object with key and value pairs.
|
||||||
|
|
||||||
|
# Examples
|
||||||
|
|
||||||
|
julia> SAM.MetaInfo("CO", "some comment")
|
||||||
|
BioAlignments.SAM.MetaInfo:
|
||||||
|
tag: CO
|
||||||
|
value: some comment
|
||||||
|
|
||||||
|
julia> string(ans)
|
||||||
|
"@CO\tsome comment"
|
||||||
|
|
||||||
|
julia> SAM.MetaInfo("SQ", ["SN" => "chr1", "LN" => 12345])
|
||||||
|
BioAlignments.SAM.MetaInfo:
|
||||||
|
tag: SQ
|
||||||
|
value: SN=chr1 LN=12345
|
||||||
|
|
||||||
|
julia> string(ans)
|
||||||
|
"@SQ\tSN:chr1\tLN:12345"
|
||||||
|
|
||||||
|
"""
|
||||||
|
function MetaInfo(tag::AbstractString, value)
|
||||||
|
buf = IOBuffer()
|
||||||
|
if tag == "CO" # comment
|
||||||
|
if !isa(value, AbstractString)
|
||||||
|
throw(ArgumentError("value must be a string"))
|
||||||
|
end
|
||||||
|
write(buf, "@CO\t", value)
|
||||||
|
elseif occursin(r"[A-Z][A-Z]", tag)
|
||||||
|
print(buf, '@', tag)
|
||||||
|
for (key, val) in value
|
||||||
|
print(buf, '\t', key, ':', val)
|
||||||
|
end
|
||||||
|
else
|
||||||
|
throw(ArgumentError("tag must match r\"[A-Z][A-Z]\""))
|
||||||
|
end
|
||||||
|
return MetaInfo(take!(buf))
|
||||||
|
end
|
||||||
|
|
||||||
|
function initialize!(metainfo::MetaInfo)
|
||||||
|
metainfo.filled = 1:0
|
||||||
|
metainfo.tag = 1:0
|
||||||
|
metainfo.val = 1:0
|
||||||
|
empty!(metainfo.dictkey)
|
||||||
|
empty!(metainfo.dictval)
|
||||||
|
return metainfo
|
||||||
|
end
|
||||||
|
|
||||||
|
function isfilled(metainfo::MetaInfo)
|
||||||
|
return !isempty(metainfo.filled)
|
||||||
|
end
|
||||||
|
|
||||||
|
function datarange(metainfo::MetaInfo)
|
||||||
|
return metainfo.filled
|
||||||
|
end
|
||||||
|
|
||||||
|
function checkfilled(metainfo::MetaInfo)
|
||||||
|
if !isfilled(metainfo)
|
||||||
|
throw(ArgumentError("unfilled SAM metainfo"))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function isequalkey(metainfo::MetaInfo, key::AbstractString)
|
||||||
|
if !isfilled(metainfo) || sizeof(key) != 2
|
||||||
|
return false
|
||||||
|
end
|
||||||
|
k1, k2 = UInt8(key[1]), UInt8(key[2])
|
||||||
|
return metainfo.data[metainfo.tag[1]] == k1 && metainfo.data[metainfo.tag[2]] == k2
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.show(io::IO, metainfo::MetaInfo)
|
||||||
|
print(io, summary(metainfo), ':')
|
||||||
|
if isfilled(metainfo)
|
||||||
|
println(io)
|
||||||
|
println(io, " tag: ", tag(metainfo))
|
||||||
|
print(io, " value:")
|
||||||
|
if !iscomment(metainfo)
|
||||||
|
for (key, val) in zip(keys(metainfo), values(metainfo))
|
||||||
|
print(io, ' ', key, '=', val)
|
||||||
|
end
|
||||||
|
else
|
||||||
|
print(io, ' ', value(metainfo))
|
||||||
|
end
|
||||||
|
else
|
||||||
|
print(io, " <not filled>")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.print(io::IO, metainfo::MetaInfo)
|
||||||
|
write(io, metainfo)
|
||||||
|
return nothing
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.write(io::IO, metainfo::MetaInfo)
|
||||||
|
checkfilled(metainfo)
|
||||||
|
r = datarange(metainfo)
|
||||||
|
return unsafe_write(io, pointer(metainfo.data, first(r)), length(r))
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
# Accessor Functions
|
||||||
|
# ------------------
|
||||||
|
|
||||||
|
"""
|
||||||
|
iscomment(metainfo::MetaInfo)::Bool
|
||||||
|
|
||||||
|
Test if `metainfo` is a comment (i.e. its tag is "CO").
|
||||||
|
"""
|
||||||
|
function iscomment(metainfo::MetaInfo)::Bool
|
||||||
|
return isequalkey(metainfo, "CO")
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
tag(metainfo::MetaInfo)::String
|
||||||
|
|
||||||
|
Get the tag of `metainfo`.
|
||||||
|
"""
|
||||||
|
function tag(metainfo::MetaInfo)::String
|
||||||
|
checkfilled(metainfo)
|
||||||
|
return String(metainfo.data[metainfo.tag])
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
value(metainfo::MetaInfo)::String
|
||||||
|
|
||||||
|
Get the value of `metainfo` as a string.
|
||||||
|
"""
|
||||||
|
function value(metainfo::MetaInfo)::String
|
||||||
|
checkfilled(metainfo)
|
||||||
|
return String(metainfo.data[metainfo.val])
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
keyvalues(metainfo::MetaInfo)::Vector{Pair{String,String}}
|
||||||
|
|
||||||
|
Get the values of `metainfo` as string pairs.
|
||||||
|
"""
|
||||||
|
function keyvalues(metainfo::MetaInfo)::Vector{Pair{String,String}}
|
||||||
|
checkfilled(metainfo)
|
||||||
|
if iscomment(metainfo)
|
||||||
|
throw(ArgumentError("not a dictionary"))
|
||||||
|
end
|
||||||
|
return Pair{String,String}[key => val for (key, val) in zip(keys(metainfo), values(metainfo))]
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.keys(metainfo::MetaInfo)
|
||||||
|
checkfilled(metainfo)
|
||||||
|
if iscomment(metainfo)
|
||||||
|
throw(ArgumentError("not a dictionary"))
|
||||||
|
end
|
||||||
|
return [String(metainfo.data[r]) for r in metainfo.dictkey]
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.values(metainfo::MetaInfo)
|
||||||
|
checkfilled(metainfo)
|
||||||
|
if iscomment(metainfo)
|
||||||
|
throw(ArgumentError("not a dictionary"))
|
||||||
|
end
|
||||||
|
return [String(metainfo.data[r]) for r in metainfo.dictval]
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.haskey(metainfo::MetaInfo, key::AbstractString)
|
||||||
|
return findkey(metainfo, key) > 0
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.getindex(metainfo::MetaInfo, key::AbstractString)
|
||||||
|
i = findkey(metainfo, key)
|
||||||
|
if i == 0
|
||||||
|
throw(KeyError(key))
|
||||||
|
end
|
||||||
|
return String(metainfo.data[metainfo.dictval[i]])
|
||||||
|
end
|
||||||
|
|
||||||
|
function findkey(metainfo::MetaInfo, key::AbstractString)
|
||||||
|
checkfilled(metainfo)
|
||||||
|
if sizeof(key) != 2
|
||||||
|
return 0
|
||||||
|
end
|
||||||
|
t1, t2 = UInt8(key[1]), UInt8(key[2])
|
||||||
|
for (i, k) in enumerate(metainfo.dictkey)
|
||||||
|
if metainfo.data[first(k)] == t1 && metainfo.data[first(k)+1] == t2
|
||||||
|
return i
|
||||||
|
end
|
||||||
|
end
|
||||||
|
return 0
|
||||||
|
end
|
261
src/sam/reader.jl
Normal file
261
src/sam/reader.jl
Normal file
|
@ -0,0 +1,261 @@
|
||||||
|
# SAM Reader
|
||||||
|
# =========
|
||||||
|
|
||||||
|
mutable struct Reader <: BioCore.IO.AbstractReader
|
||||||
|
state::BioCore.Ragel.State
|
||||||
|
header::Header
|
||||||
|
|
||||||
|
function Reader(input::BufferedStreams.BufferedInputStream)
|
||||||
|
reader = new(BioCore.Ragel.State(sam_header_machine.start_state, input), Header())
|
||||||
|
readheader!(reader)
|
||||||
|
reader.state.cs = sam_body_machine.start_state
|
||||||
|
return reader
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
SAM.Reader(input::IO)
|
||||||
|
|
||||||
|
Create a data reader of the SAM file format.
|
||||||
|
|
||||||
|
# Arguments
|
||||||
|
* `input`: data source
|
||||||
|
"""
|
||||||
|
function Reader(input::IO)
|
||||||
|
return Reader(BufferedStreams.BufferedInputStream(input))
|
||||||
|
end
|
||||||
|
|
||||||
|
function BioCore.IO.stream(reader::Reader)
|
||||||
|
return reader.state.stream
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
header(reader::Reader)::Header
|
||||||
|
|
||||||
|
Get the header of `reader`.
|
||||||
|
"""
|
||||||
|
function header(reader::Reader)::Header
|
||||||
|
return reader.header
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.eltype(::Type{Reader})
|
||||||
|
return Record
|
||||||
|
end
|
||||||
|
|
||||||
|
# file = header . body
|
||||||
|
# header = metainfo*
|
||||||
|
# body = record*
|
||||||
|
isinteractive() && info("compiling SAM")
|
||||||
|
const sam_metainfo_machine, sam_record_machine, sam_header_machine, sam_body_machine = (function ()
|
||||||
|
cat = Automa.RegExp.cat
|
||||||
|
rep = Automa.RegExp.rep
|
||||||
|
alt = Automa.RegExp.alt
|
||||||
|
opt = Automa.RegExp.opt
|
||||||
|
any = Automa.RegExp.any
|
||||||
|
|
||||||
|
metainfo = let
|
||||||
|
tag = re"[A-Z][A-Z]" \ cat("CO")
|
||||||
|
tag.actions[:enter] = [:mark1]
|
||||||
|
tag.actions[:exit] = [:metainfo_tag]
|
||||||
|
|
||||||
|
dict = let
|
||||||
|
key = re"[A-Za-z][A-Za-z0-9]"
|
||||||
|
key.actions[:enter] = [:mark2]
|
||||||
|
key.actions[:exit] = [:metainfo_dict_key]
|
||||||
|
val = re"[ -~]+"
|
||||||
|
val.actions[:enter] = [:mark2]
|
||||||
|
val.actions[:exit] = [:metainfo_dict_val]
|
||||||
|
keyval = cat(key, ':', val)
|
||||||
|
|
||||||
|
cat(keyval, rep(cat('\t', keyval)))
|
||||||
|
end
|
||||||
|
dict.actions[:enter] = [:mark1]
|
||||||
|
dict.actions[:exit] = [:metainfo_val]
|
||||||
|
|
||||||
|
co = cat("CO")
|
||||||
|
co.actions[:enter] = [:mark1]
|
||||||
|
co.actions[:exit] = [:metainfo_tag]
|
||||||
|
|
||||||
|
comment = re"[^\r\n]*"
|
||||||
|
comment.actions[:enter] = [:mark1]
|
||||||
|
comment.actions[:exit] = [:metainfo_val]
|
||||||
|
|
||||||
|
cat('@', alt(cat(tag, '\t', dict), cat(co, '\t', comment)))
|
||||||
|
end
|
||||||
|
metainfo.actions[:enter] = [:anchor]
|
||||||
|
metainfo.actions[:exit] = [:metainfo]
|
||||||
|
|
||||||
|
record = let
|
||||||
|
qname = re"[!-?A-~]+"
|
||||||
|
qname.actions[:enter] = [:mark]
|
||||||
|
qname.actions[:exit] = [:record_qname]
|
||||||
|
|
||||||
|
flag = re"[0-9]+"
|
||||||
|
flag.actions[:enter] = [:mark]
|
||||||
|
flag.actions[:exit] = [:record_flag]
|
||||||
|
|
||||||
|
rname = re"\*|[!-()+-<>-~][!-~]*"
|
||||||
|
rname.actions[:enter] = [:mark]
|
||||||
|
rname.actions[:exit] = [:record_rname]
|
||||||
|
|
||||||
|
pos = re"[0-9]+"
|
||||||
|
pos.actions[:enter] = [:mark]
|
||||||
|
pos.actions[:exit] = [:record_pos]
|
||||||
|
|
||||||
|
mapq = re"[0-9]+"
|
||||||
|
mapq.actions[:enter] = [:mark]
|
||||||
|
mapq.actions[:exit] = [:record_mapq]
|
||||||
|
|
||||||
|
cigar = re"\*|([0-9]+[MIDNSHPX=])+"
|
||||||
|
cigar.actions[:enter] = [:mark]
|
||||||
|
cigar.actions[:exit] = [:record_cigar]
|
||||||
|
|
||||||
|
rnext = re"\*|=|[!-()+-<>-~][!-~]*"
|
||||||
|
rnext.actions[:enter] = [:mark]
|
||||||
|
rnext.actions[:exit] = [:record_rnext]
|
||||||
|
|
||||||
|
pnext = re"[0-9]+"
|
||||||
|
pnext.actions[:enter] = [:mark]
|
||||||
|
pnext.actions[:exit] = [:record_pnext]
|
||||||
|
|
||||||
|
tlen = re"[-+]?[0-9]+"
|
||||||
|
tlen.actions[:enter] = [:mark]
|
||||||
|
tlen.actions[:exit] = [:record_tlen]
|
||||||
|
|
||||||
|
seq = re"\*|[A-Za-z=.]+"
|
||||||
|
seq.actions[:enter] = [:mark]
|
||||||
|
seq.actions[:exit] = [:record_seq]
|
||||||
|
|
||||||
|
qual = re"[!-~]+"
|
||||||
|
qual.actions[:enter] = [:mark]
|
||||||
|
qual.actions[:exit] = [:record_qual]
|
||||||
|
|
||||||
|
field = let
|
||||||
|
tag = re"[A-Za-z][A-Za-z0-9]"
|
||||||
|
val = alt(
|
||||||
|
re"A:[!-~]",
|
||||||
|
re"i:[-+]?[0-9]+",
|
||||||
|
re"f:[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?",
|
||||||
|
re"Z:[ !-~]*",
|
||||||
|
re"H:([0-9A-F][0-9A-F])*",
|
||||||
|
re"B:[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+")
|
||||||
|
|
||||||
|
cat(tag, ':', val)
|
||||||
|
end
|
||||||
|
field.actions[:enter] = [:mark]
|
||||||
|
field.actions[:exit] = [:record_field]
|
||||||
|
|
||||||
|
cat(
|
||||||
|
qname, '\t',
|
||||||
|
flag, '\t',
|
||||||
|
rname, '\t',
|
||||||
|
pos, '\t',
|
||||||
|
mapq, '\t',
|
||||||
|
cigar, '\t',
|
||||||
|
rnext, '\t',
|
||||||
|
pnext, '\t',
|
||||||
|
tlen, '\t',
|
||||||
|
seq, '\t',
|
||||||
|
qual,
|
||||||
|
rep(cat('\t', field)))
|
||||||
|
end
|
||||||
|
record.actions[:enter] = [:anchor]
|
||||||
|
record.actions[:exit] = [:record]
|
||||||
|
|
||||||
|
newline = let
|
||||||
|
lf = re"\n"
|
||||||
|
lf.actions[:enter] = [:countline]
|
||||||
|
|
||||||
|
cat(re"\r?", lf)
|
||||||
|
end
|
||||||
|
|
||||||
|
header′ = rep(cat(metainfo, newline))
|
||||||
|
header′.actions[:exit] = [:header]
|
||||||
|
header = cat(header′, opt(any() \ cat('@'))) # look ahead
|
||||||
|
|
||||||
|
body = rep(cat(record, newline))
|
||||||
|
|
||||||
|
return map(Automa.compile, (metainfo, record, header, body))
|
||||||
|
end)()
|
||||||
|
|
||||||
|
const sam_metainfo_actions = Dict(
|
||||||
|
:metainfo_tag => :(record.tag = (mark1:p-1) .- offset),
|
||||||
|
:metainfo_val => :(record.val = (mark1:p-1) .- offset),
|
||||||
|
:metainfo_dict_key => :(push!(record.dictkey, (mark2:p-1) .- offset)),
|
||||||
|
:metainfo_dict_val => :(push!(record.dictval, (mark2:p-1) .- offset)),
|
||||||
|
:metainfo => quote
|
||||||
|
BioCore.ReaderHelper.resize_and_copy!(record.data, data, offset+1:p-1)
|
||||||
|
record.filled = (offset+1:p-1) .- offset
|
||||||
|
end,
|
||||||
|
:anchor => :(),
|
||||||
|
:mark1 => :(mark1 = p),
|
||||||
|
:mark2 => :(mark2 = p))
|
||||||
|
eval(
|
||||||
|
BioCore.ReaderHelper.generate_index_function(
|
||||||
|
MetaInfo,
|
||||||
|
sam_metainfo_machine,
|
||||||
|
:(mark1 = mark2 = offset = 0),
|
||||||
|
sam_metainfo_actions))
|
||||||
|
eval(
|
||||||
|
BioCore.ReaderHelper.generate_readheader_function(
|
||||||
|
Reader,
|
||||||
|
MetaInfo,
|
||||||
|
sam_header_machine,
|
||||||
|
:(mark1 = mark2 = offset = 0),
|
||||||
|
merge(sam_metainfo_actions, Dict(
|
||||||
|
:metainfo => quote
|
||||||
|
BioCore.ReaderHelper.resize_and_copy!(record.data, data, BioCore.ReaderHelper.upanchor!(stream):p-1)
|
||||||
|
record.filled = (offset+1:p-1) .- offset
|
||||||
|
@assert isfilled(record)
|
||||||
|
push!(reader.header.metainfo, record)
|
||||||
|
BioCore.ReaderHelper.ensure_margin!(stream)
|
||||||
|
record = MetaInfo()
|
||||||
|
end,
|
||||||
|
:header => :(finish_header = true; @escape),
|
||||||
|
:countline => :(linenum += 1),
|
||||||
|
:anchor => :(BioCore.ReaderHelper.anchor!(stream, p); offset = p - 1))),
|
||||||
|
quote
|
||||||
|
if !eof(stream)
|
||||||
|
stream.position -= 1 # cancel look-ahead
|
||||||
|
end
|
||||||
|
end))
|
||||||
|
|
||||||
|
const sam_record_actions = Dict(
|
||||||
|
:record_qname => :(record.qname = (mark:p-1) .- offset),
|
||||||
|
:record_flag => :(record.flag = (mark:p-1) .- offset),
|
||||||
|
:record_rname => :(record.rname = (mark:p-1) .- offset),
|
||||||
|
:record_pos => :(record.pos = (mark:p-1) .- offset),
|
||||||
|
:record_mapq => :(record.mapq = (mark:p-1) .- offset),
|
||||||
|
:record_cigar => :(record.cigar = (mark:p-1) .- offset),
|
||||||
|
:record_rnext => :(record.rnext = (mark:p-1) .- offset),
|
||||||
|
:record_pnext => :(record.pnext = (mark:p-1) .- offset),
|
||||||
|
:record_tlen => :(record.tlen = (mark:p-1) .- offset),
|
||||||
|
:record_seq => :(record.seq = (mark:p-1) .- offset),
|
||||||
|
:record_qual => :(record.qual = (mark:p-1) .- offset),
|
||||||
|
:record_field => :(push!(record.fields, (mark:p-1) .- offset)),
|
||||||
|
:record => quote
|
||||||
|
BioCore.ReaderHelper.resize_and_copy!(record.data, data, 1:p-1)
|
||||||
|
record.filled = (offset+1:p-1) .- offset
|
||||||
|
end,
|
||||||
|
:anchor => :(),
|
||||||
|
:mark => :(mark = p))
|
||||||
|
eval(
|
||||||
|
BioCore.ReaderHelper.generate_index_function(
|
||||||
|
Record,
|
||||||
|
sam_record_machine,
|
||||||
|
:(mark = offset = 0),
|
||||||
|
sam_record_actions))
|
||||||
|
eval(
|
||||||
|
BioCore.ReaderHelper.generate_read_function(
|
||||||
|
Reader,
|
||||||
|
sam_body_machine,
|
||||||
|
:(mark = offset = 0),
|
||||||
|
merge(sam_record_actions, Dict(
|
||||||
|
:record => quote
|
||||||
|
BioCore.ReaderHelper.resize_and_copy!(record.data, data, BioCore.ReaderHelper.upanchor!(stream):p-1)
|
||||||
|
record.filled = (offset+1:p-1) .- offset
|
||||||
|
found_record = true
|
||||||
|
@escape
|
||||||
|
end,
|
||||||
|
:countline => :(linenum += 1),
|
||||||
|
:anchor => :(BioCore.ReaderHelper.anchor!(stream, p); offset = p - 1)))))
|
624
src/sam/record.jl
Normal file
624
src/sam/record.jl
Normal file
|
@ -0,0 +1,624 @@
|
||||||
|
# SAM Record
|
||||||
|
# ==========
|
||||||
|
|
||||||
|
mutable struct Record
|
||||||
|
# data and filled range
|
||||||
|
data::Vector{UInt8}
|
||||||
|
filled::UnitRange{Int}
|
||||||
|
# indexes
|
||||||
|
qname::UnitRange{Int}
|
||||||
|
flag::UnitRange{Int}
|
||||||
|
rname::UnitRange{Int}
|
||||||
|
pos::UnitRange{Int}
|
||||||
|
mapq::UnitRange{Int}
|
||||||
|
cigar::UnitRange{Int}
|
||||||
|
rnext::UnitRange{Int}
|
||||||
|
pnext::UnitRange{Int}
|
||||||
|
tlen::UnitRange{Int}
|
||||||
|
seq::UnitRange{Int}
|
||||||
|
qual::UnitRange{Int}
|
||||||
|
fields::Vector{UnitRange{Int}}
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
SAM.Record()
|
||||||
|
|
||||||
|
Create an unfilled SAM record.
|
||||||
|
"""
|
||||||
|
function Record()
|
||||||
|
return Record(
|
||||||
|
UInt8[], 1:0,
|
||||||
|
# qname-mapq
|
||||||
|
1:0, 1:0, 1:0, 1:0, 1:0,
|
||||||
|
# cigar-seq
|
||||||
|
1:0, 1:0, 1:0, 1:0, 1:0,
|
||||||
|
# qual and fields
|
||||||
|
1:0, UnitRange{Int}[])
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
SAM.Record(data::Vector{UInt8})
|
||||||
|
|
||||||
|
Create a SAM record from `data`.
|
||||||
|
This function verifies the format and indexes fields for accessors.
|
||||||
|
Note that the ownership of `data` is transferred to a new record object.
|
||||||
|
"""
|
||||||
|
function Record(data::Vector{UInt8})
|
||||||
|
return convert(Record, data)
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.convert(::Type{Record}, data::Vector{UInt8})
|
||||||
|
record = Record(
|
||||||
|
data, 1:0,
|
||||||
|
# qname-mapq
|
||||||
|
1:0, 1:0, 1:0, 1:0, 1:0,
|
||||||
|
# cigar-seq
|
||||||
|
1:0, 1:0, 1:0, 1:0, 1:0,
|
||||||
|
# qual and fields
|
||||||
|
1:0, UnitRange{Int}[])
|
||||||
|
index!(record)
|
||||||
|
return record
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
SAM.Record(str::AbstractString)
|
||||||
|
|
||||||
|
Create a SAM record from `str`.
|
||||||
|
This function verifies the format and indexes fields for accessors.
|
||||||
|
"""
|
||||||
|
function Record(str::AbstractString)
|
||||||
|
return convert(Record, str)
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.convert(::Type{Record}, str::AbstractString)
|
||||||
|
return Record(Vector{UInt8}(str))
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.show(io::IO, record::Record)
|
||||||
|
print(io, summary(record), ':')
|
||||||
|
if isfilled(record)
|
||||||
|
println(io)
|
||||||
|
println(io, " template name: ", hastempname(record) ? tempname(record) : "<missing>")
|
||||||
|
println(io, " flag: ", hasflag(record) ? flag(record) : "<missing>")
|
||||||
|
println(io, " reference: ", hasrefname(record) ? refname(record) : "<missing>")
|
||||||
|
println(io, " position: ", hasposition(record) ? position(record) : "<missing>")
|
||||||
|
println(io, " mapping quality: ", hasmappingquality(record) ? mappingquality(record) : "<missing>")
|
||||||
|
println(io, " CIGAR: ", hascigar(record) ? cigar(record) : "<missing>")
|
||||||
|
println(io, " next reference: ", hasnextrefname(record) ? nextrefname(record) : "<missing>")
|
||||||
|
println(io, " next position: ", hasnextposition(record) ? nextposition(record) : "<missing>")
|
||||||
|
println(io, " template length: ", hastemplength(record) ? templength(record) : "<missing>")
|
||||||
|
println(io, " sequence: ", hassequence(record) ? sequence(String, record) : "<missing>")
|
||||||
|
println(io, " base quality: ", hasquality(record) ? quality(String, record) : "<missing>")
|
||||||
|
print(io, " auxiliary data:")
|
||||||
|
for field in record.fields
|
||||||
|
print(io, ' ', String(record.data[field]))
|
||||||
|
end
|
||||||
|
else
|
||||||
|
print(io, " <not filled>")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.print(io::IO, record::Record)
|
||||||
|
write(io, record)
|
||||||
|
return nothing
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.write(io::IO, record::Record)
|
||||||
|
checkfilled(record)
|
||||||
|
return unsafe_write(io, pointer(record.data, first(record.filled)), length(record.filled))
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.copy(record::Record)
|
||||||
|
return Record(
|
||||||
|
copy(record.data),
|
||||||
|
record.filled,
|
||||||
|
record.qname,
|
||||||
|
record.flag,
|
||||||
|
record.rname,
|
||||||
|
record.pos,
|
||||||
|
record.mapq,
|
||||||
|
record.cigar,
|
||||||
|
record.rnext,
|
||||||
|
record.pnext,
|
||||||
|
record.tlen,
|
||||||
|
record.seq,
|
||||||
|
record.qual,
|
||||||
|
copy(record.fields))
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
# Accessor Functions
|
||||||
|
# ------------------
|
||||||
|
|
||||||
|
"""
|
||||||
|
flag(record::Record)::UInt16
|
||||||
|
|
||||||
|
Get the bitwise flag of `record`.
|
||||||
|
"""
|
||||||
|
function flag(record::Record)::UInt16
|
||||||
|
checkfilled(record)
|
||||||
|
return unsafe_parse_decimal(UInt16, record.data, record.flag)
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasflag(record::Record)
|
||||||
|
return isfilled(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
ismapped(record::Record)::Bool
|
||||||
|
|
||||||
|
Test if `record` is mapped.
|
||||||
|
"""
|
||||||
|
function ismapped(record::Record)::Bool
|
||||||
|
return isfilled(record) && (flag(record) & FLAG_UNMAP == 0)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
isprimary(record::Record)::Bool
|
||||||
|
|
||||||
|
Test if `record` is a primary line of the read.
|
||||||
|
|
||||||
|
This is equivalent to `flag(record) & 0x900 == 0`.
|
||||||
|
"""
|
||||||
|
function isprimary(record::Record)::Bool
|
||||||
|
return flag(record) & 0x900 == 0
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
refname(record::Record)::String
|
||||||
|
|
||||||
|
Get the reference sequence name of `record`.
|
||||||
|
"""
|
||||||
|
function refname(record::Record)
|
||||||
|
checkfilled(record)
|
||||||
|
if ismissing(record, record.rname)
|
||||||
|
missingerror(:refname)
|
||||||
|
end
|
||||||
|
return String(record.data[record.rname])
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasrefname(record::Record)
|
||||||
|
return isfilled(record) && !ismissing(record, record.rname)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
position(record::Record)::Int
|
||||||
|
|
||||||
|
Get the 1-based leftmost mapping position of `record`.
|
||||||
|
"""
|
||||||
|
function position(record::Record)::Int
|
||||||
|
checkfilled(record)
|
||||||
|
pos = unsafe_parse_decimal(Int, record.data, record.pos)
|
||||||
|
if pos == 0
|
||||||
|
missingerror(:position)
|
||||||
|
end
|
||||||
|
return pos
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasposition(record::Record)
|
||||||
|
return isfilled(record) && (length(record.pos) != 1 || record.data[first(record.pos)] != UInt8('0'))
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
rightposition(record::Record)::Int
|
||||||
|
|
||||||
|
Get the 1-based rightmost mapping position of `record`.
|
||||||
|
"""
|
||||||
|
function rightposition(record::Record)
|
||||||
|
return position(record) + alignlength(record) - 1
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasrightposition(record::Record)
|
||||||
|
return hasposition(record) && hasalignment(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
isnextmapped(record::Record)::Bool
|
||||||
|
|
||||||
|
Test if the mate/next read of `record` is mapped.
|
||||||
|
"""
|
||||||
|
function isnextmapped(record::Record)::Bool
|
||||||
|
return isfilled(record) && (flag(record) & FLAG_MUNMAP == 0)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
nextrefname(record::Record)::String
|
||||||
|
|
||||||
|
Get the reference name of the mate/next read of `record`.
|
||||||
|
"""
|
||||||
|
function nextrefname(record::Record)::String
|
||||||
|
checkfilled(record)
|
||||||
|
if ismissing(record, record.rnext)
|
||||||
|
missingerror(:nextrefname)
|
||||||
|
end
|
||||||
|
return String(record.data[record.rnext])
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasnextrefname(record::Record)
|
||||||
|
return isfilled(record) && !ismissing(record, record.rnext)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
nextposition(record::Record)::Int
|
||||||
|
|
||||||
|
Get the position of the mate/next read of `record`.
|
||||||
|
"""
|
||||||
|
function nextposition(record::Record)::Int
|
||||||
|
checkfilled(record)
|
||||||
|
pos = unsafe_parse_decimal(Int, record.data, record.pnext)
|
||||||
|
if pos == 0
|
||||||
|
missingerror(:nextposition)
|
||||||
|
end
|
||||||
|
return pos
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasnextposition(record::Record)
|
||||||
|
return isfilled(record) && (length(record.pnext) != 1 || record.data[first(record.pnext)] != UInt8('0'))
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
mappingquality(record::Record)::UInt8
|
||||||
|
|
||||||
|
Get the mapping quality of `record`.
|
||||||
|
"""
|
||||||
|
function mappingquality(record::Record)::UInt8
|
||||||
|
checkfilled(record)
|
||||||
|
qual = unsafe_parse_decimal(UInt8, record.data, record.mapq)
|
||||||
|
if qual == 0xff
|
||||||
|
missingerror(:mappingquality)
|
||||||
|
end
|
||||||
|
return qual
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasmappingquality(record::Record)
|
||||||
|
return isfilled(record) && unsafe_parse_decimal(UInt8, record.data, record.mapq) != 0xff
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
cigar(record::Record)::String
|
||||||
|
|
||||||
|
Get the CIGAR string of `record`.
|
||||||
|
"""
|
||||||
|
function cigar(record::Record)::String
|
||||||
|
checkfilled(record)
|
||||||
|
if ismissing(record, record.cigar)
|
||||||
|
missingerror(:cigar)
|
||||||
|
end
|
||||||
|
return String(record.data[record.cigar])
|
||||||
|
end
|
||||||
|
|
||||||
|
function hascigar(record::Record)
|
||||||
|
return isfilled(record) && !ismissing(record, record.cigar)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
alignment(record::Record)::BioAlignments.Alignment
|
||||||
|
|
||||||
|
Get the alignment of `record`.
|
||||||
|
"""
|
||||||
|
function alignment(record::Record)::BioAlignments.Alignment
|
||||||
|
if ismapped(record)
|
||||||
|
return BioAlignments.Alignment(cigar(record), 1, position(record))
|
||||||
|
else
|
||||||
|
return BioAlignments.Alignment(BioAlignments.AlignmentAnchor[])
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasalignment(record::Record)
|
||||||
|
return isfilled(record) && hascigar(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
alignlength(record::Record)::Int
|
||||||
|
|
||||||
|
Get the alignment length of `record`.
|
||||||
|
"""
|
||||||
|
function alignlength(record::Record)::Int
|
||||||
|
if length(record.cigar) == 1 && record.data[first(record.cigar)] == UInt8('*')
|
||||||
|
return 0
|
||||||
|
end
|
||||||
|
ret::Int = 0
|
||||||
|
len = 0 # operation length
|
||||||
|
for i in record.cigar
|
||||||
|
c = record.data[i]
|
||||||
|
if c ∈ UInt8('0'):UInt8('9')
|
||||||
|
len = len * 10 + (c - UInt8('0'))
|
||||||
|
else
|
||||||
|
op = convert(BioAlignments.Operation, Char(c))
|
||||||
|
if BioAlignments.ismatchop(op) || BioAlignments.isdeleteop(op)
|
||||||
|
ret += len
|
||||||
|
len = 0
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
return ret
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
tempname(record::Record)::String
|
||||||
|
|
||||||
|
Get the query template name of `record`.
|
||||||
|
"""
|
||||||
|
function tempname(record::Record)::String
|
||||||
|
checkfilled(record)
|
||||||
|
if ismissing(record, record.qname)
|
||||||
|
missingerror(:tempname)
|
||||||
|
end
|
||||||
|
return String(record.data[record.qname])
|
||||||
|
end
|
||||||
|
|
||||||
|
function hastempname(record::Record)
|
||||||
|
return isfilled(record) && !ismissing(record, record.qname)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
templength(record::Record)::Int
|
||||||
|
|
||||||
|
Get the template length of `record`.
|
||||||
|
"""
|
||||||
|
function templength(record::Record)::Int
|
||||||
|
checkfilled(record)
|
||||||
|
len = unsafe_parse_decimal(Int, record.data, record.tlen)
|
||||||
|
if len == 0
|
||||||
|
missingerror(:tlen)
|
||||||
|
end
|
||||||
|
return len
|
||||||
|
end
|
||||||
|
|
||||||
|
function hastemplength(record::Record)
|
||||||
|
return isfilled(record) && (length(record.tlen) != 1 || record.data[first(record.tlen)] != UInt8('0'))
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
sequence(record::Record)::BioSequences.LongDNASeq
|
||||||
|
|
||||||
|
Get the segment sequence of `record`.
|
||||||
|
"""
|
||||||
|
function sequence(record::Record)::BioSequences.LongDNASeq
|
||||||
|
checkfilled(record)
|
||||||
|
if ismissing(record, record.seq)
|
||||||
|
missingerror(:sequence)
|
||||||
|
end
|
||||||
|
seqlen = length(record.seq)
|
||||||
|
ret = BioSequences.LongDNASeq(seqlen)
|
||||||
|
BioSequences.encode_copy!(ret, 1, record.data, first(record.seq), seqlen)
|
||||||
|
return ret
|
||||||
|
end
|
||||||
|
|
||||||
|
function hassequence(record::Record)
|
||||||
|
return isfilled(record) && !ismissing(record, record.seq)
|
||||||
|
end
|
||||||
|
"""
|
||||||
|
sequence(::Type{String}, record::Record)::String
|
||||||
|
|
||||||
|
Get the segment sequence of `record` as `String`.
|
||||||
|
"""
|
||||||
|
function sequence(::Type{String}, record::Record)::String
|
||||||
|
checkfilled(record)
|
||||||
|
return String(record.data[record.seq])
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
seqlength(record::Record)::Int
|
||||||
|
|
||||||
|
Get the sequence length of `record`.
|
||||||
|
"""
|
||||||
|
function seqlength(record::Record)::Int
|
||||||
|
checkfilled(record)
|
||||||
|
if ismissing(record, record.seq)
|
||||||
|
missingerror(:seq)
|
||||||
|
end
|
||||||
|
return length(record.seq)
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasseqlength(record::Record)
|
||||||
|
return isfilled(record) && !ismissing(record, record.seq)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
quality(record::Record)::Vector{UInt8}
|
||||||
|
|
||||||
|
Get the Phred-scaled base quality of `record`.
|
||||||
|
"""
|
||||||
|
function quality(record::Record)::Vector{UInt8}
|
||||||
|
checkfilled(record)
|
||||||
|
if ismissing(record, record.qual)
|
||||||
|
missingerror(:quality)
|
||||||
|
end
|
||||||
|
qual = record.data[record.qual]
|
||||||
|
for i in 1:lastindex(qual)
|
||||||
|
@inbounds qual[i] -= 33
|
||||||
|
end
|
||||||
|
return qual
|
||||||
|
end
|
||||||
|
|
||||||
|
function hasquality(record::Record)
|
||||||
|
return isfilled(record) && !ismissing(record, record.qual)
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
quality(::Type{String}, record::Record)::String
|
||||||
|
|
||||||
|
Get the ASCII-encoded base quality of `record`.
|
||||||
|
"""
|
||||||
|
function quality(::Type{String}, record::Record)::String
|
||||||
|
checkfilled(record)
|
||||||
|
return String(record.data[record.qual])
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
auxdata(record::Record)::Dict{String,Any}
|
||||||
|
|
||||||
|
Get the auxiliary data (optional fields) of `record`.
|
||||||
|
"""
|
||||||
|
function auxdata(record::Record)::Dict{String,Any}
|
||||||
|
checkfilled(record)
|
||||||
|
return Dict(k => record[k] for k in keys(record))
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.haskey(record::Record, tag::AbstractString)
|
||||||
|
return findauxtag(record, tag) > 0
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.getindex(record::Record, tag::AbstractString)
|
||||||
|
i = findauxtag(record, tag)
|
||||||
|
if i == 0
|
||||||
|
throw(KeyError(tag))
|
||||||
|
end
|
||||||
|
field = record.fields[i]
|
||||||
|
# data type
|
||||||
|
typ = record.data[first(field)+3]
|
||||||
|
lo = first(field) + 5
|
||||||
|
if i == lastindex(record.fields)
|
||||||
|
hi = last(field)
|
||||||
|
else
|
||||||
|
hi = first(record.fields[i+1]) - 2
|
||||||
|
end
|
||||||
|
if typ == UInt8('A')
|
||||||
|
@assert lo == hi
|
||||||
|
return Char(record.data[lo])
|
||||||
|
elseif typ == UInt8('i')
|
||||||
|
return unsafe_parse_decimal(Int, record.data, lo:hi)
|
||||||
|
elseif typ == UInt8('f')
|
||||||
|
# TODO: Call a C function directly for speed?
|
||||||
|
return parse(Float32, SubString(record.data[lo:hi]))
|
||||||
|
elseif typ == UInt8('Z')
|
||||||
|
return String(record.data[lo:hi])
|
||||||
|
elseif typ == UInt8('H')
|
||||||
|
return parse_hexarray(record.data, lo:hi)
|
||||||
|
elseif typ == UInt8('B')
|
||||||
|
return parse_typedarray(record.data, lo:hi)
|
||||||
|
else
|
||||||
|
throw(ArgumentError("type code '$(Char(typ))' is not defined"))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.keys(record::Record)
|
||||||
|
checkfilled(record)
|
||||||
|
return [String(record.data[first(f):first(f)+1]) for f in record.fields]
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.values(record::Record)
|
||||||
|
return [record[k] for k in keys(record)]
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
# Bio Methods
|
||||||
|
# -----------
|
||||||
|
|
||||||
|
function BioCore.isfilled(record::Record)
|
||||||
|
return !isempty(record.filled)
|
||||||
|
end
|
||||||
|
|
||||||
|
function BioCore.seqname(record::Record)
|
||||||
|
return tempname(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
function BioCore.hasseqname(record::Record)
|
||||||
|
return hastempname(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
function BioCore.sequence(record::Record)
|
||||||
|
return sequence(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
function BioCore.hassequence(record::Record)
|
||||||
|
return hassequence(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
function BioCore.rightposition(record::Record)
|
||||||
|
return rightposition(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
function BioCore.hasrightposition(record::Record)
|
||||||
|
return hasrightposition(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
function BioCore.leftposition(record::Record)
|
||||||
|
return position(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
function BioCore.hasleftposition(record::Record)
|
||||||
|
return hasposition(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
# Helper Functions
|
||||||
|
# ----------------
|
||||||
|
|
||||||
|
function initialize!(record::Record)
|
||||||
|
record.filled = 1:0
|
||||||
|
record.qname = 1:0
|
||||||
|
record.flag = 1:0
|
||||||
|
record.rname = 1:0
|
||||||
|
record.pos = 1:0
|
||||||
|
record.mapq = 1:0
|
||||||
|
record.cigar = 1:0
|
||||||
|
record.rnext = 1:0
|
||||||
|
record.pnext = 1:0
|
||||||
|
record.tlen = 1:0
|
||||||
|
record.seq = 1:0
|
||||||
|
record.qual = 1:0
|
||||||
|
empty!(record.fields)
|
||||||
|
return record
|
||||||
|
end
|
||||||
|
|
||||||
|
function checkfilled(record::Record)
|
||||||
|
if !isfilled(record)
|
||||||
|
throw(ArgumentError("unfilled SAM record"))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function findauxtag(record::Record, tag::AbstractString)
|
||||||
|
checkfilled(record)
|
||||||
|
if sizeof(tag) != 2
|
||||||
|
return 0
|
||||||
|
end
|
||||||
|
t1, t2 = UInt8(tag[1]), UInt8(tag[2])
|
||||||
|
for (i, field) in enumerate(record.fields)
|
||||||
|
p = first(field)
|
||||||
|
if record.data[p] == t1 && record.data[p+1] == t2
|
||||||
|
return i
|
||||||
|
end
|
||||||
|
end
|
||||||
|
return 0
|
||||||
|
end
|
||||||
|
|
||||||
|
function parse_hexarray(data::Vector{UInt8}, range::UnitRange{Int})
|
||||||
|
@assert iseven(length(range))
|
||||||
|
ret = Vector{UInt8}(length(range) >> 1)
|
||||||
|
byte2hex(b) = b ∈ 0x30:0x39 ? (b - 0x30) : b ∈ 0x41:0x46 ? (b - 0x41 + 0x0A) : error("not in [0-9A-F]")
|
||||||
|
j = 1
|
||||||
|
for i in first(range):2:last(range)-1
|
||||||
|
ret[j] = (byte2hex(data[range[i]]) << 4) | byte2hex(data[range[i+1]])
|
||||||
|
j += 1
|
||||||
|
end
|
||||||
|
return ret
|
||||||
|
end
|
||||||
|
|
||||||
|
function parse_typedarray(data::Vector{UInt8}, range::UnitRange{Int})
|
||||||
|
# format: [cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+
|
||||||
|
t = data[first(range)]
|
||||||
|
xs = split(String(data[first(range)+2:last(range)]))
|
||||||
|
if t == UInt8('c')
|
||||||
|
return [parse(Int8, x) for x in xs]
|
||||||
|
elseif t == UInt8('C')
|
||||||
|
return [parse(UInt8, x) for x in xs]
|
||||||
|
elseif t == UInt8('s')
|
||||||
|
return [parse(Int16, x) for x in xs]
|
||||||
|
elseif t == UInt8('S')
|
||||||
|
return [parse(UInt16, x) for x in xs]
|
||||||
|
elseif t == UInt8('i')
|
||||||
|
return [parse(Int32, x) for x in xs]
|
||||||
|
elseif t == UInt8('I')
|
||||||
|
return [parse(UInt32, x) for x in xs]
|
||||||
|
elseif t == UInt8('f')
|
||||||
|
return [parse(Float32, x) for x in xs]
|
||||||
|
else
|
||||||
|
throw(ArgumentError("type code '$(Char(t))' is not defined"))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function ismissing(record::Record, range::UnitRange{Int})
|
||||||
|
return length(range) == 1 && record.data[first(range)] == UInt8('*')
|
||||||
|
end
|
25
src/sam/sam.jl
Normal file
25
src/sam/sam.jl
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
# SAM File Format
|
||||||
|
# ===============
|
||||||
|
|
||||||
|
module SAM
|
||||||
|
|
||||||
|
using BioCore
|
||||||
|
|
||||||
|
import Automa
|
||||||
|
import Automa.RegExp: @re_str
|
||||||
|
import BioAlignments
|
||||||
|
import BioCore.Exceptions: missingerror
|
||||||
|
import BioCore.RecordHelper: unsafe_parse_decimal
|
||||||
|
import BioCore: isfilled, header
|
||||||
|
import BioSequences
|
||||||
|
import BufferedStreams
|
||||||
|
using Printf: @sprintf
|
||||||
|
|
||||||
|
include("flags.jl")
|
||||||
|
include("metainfo.jl")
|
||||||
|
include("record.jl")
|
||||||
|
include("header.jl")
|
||||||
|
include("reader.jl")
|
||||||
|
include("writer.jl")
|
||||||
|
|
||||||
|
end
|
43
src/sam/writer.jl
Normal file
43
src/sam/writer.jl
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
# SAM Writer
|
||||||
|
# ==========
|
||||||
|
|
||||||
|
"""
|
||||||
|
Writer(output::IO, header::Header=Header())
|
||||||
|
|
||||||
|
Create a data writer of the SAM file format.
|
||||||
|
|
||||||
|
# Arguments
|
||||||
|
* `output`: data sink
|
||||||
|
* `header=Header()`: SAM header object
|
||||||
|
"""
|
||||||
|
mutable struct Writer <: BioCore.IO.AbstractWriter
|
||||||
|
stream::IO
|
||||||
|
|
||||||
|
function Writer(output::IO, header::Header=Header())
|
||||||
|
writer = new(output)
|
||||||
|
write(writer, header)
|
||||||
|
return writer
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
function BioCore.IO.stream(writer::Writer)
|
||||||
|
return writer.stream
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.write(writer::Writer, header::Header)
|
||||||
|
n = 0
|
||||||
|
for metainfo in header
|
||||||
|
n += write(writer, metainfo)
|
||||||
|
end
|
||||||
|
return n
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.write(writer::Writer, metainfo::MetaInfo)
|
||||||
|
checkfilled(metainfo)
|
||||||
|
return write(writer.stream, metainfo, '\n')
|
||||||
|
end
|
||||||
|
|
||||||
|
function Base.write(writer::Writer, record::Record)
|
||||||
|
checkfilled(record)
|
||||||
|
return write(writer.stream, record, '\n')
|
||||||
|
end
|
438
test/runtests.jl
Normal file
438
test/runtests.jl
Normal file
|
@ -0,0 +1,438 @@
|
||||||
|
using Test
|
||||||
|
using GenomicFeatures
|
||||||
|
using XAM
|
||||||
|
import BioAlignments: Alignment, AlignmentAnchor, OP_START, OP_MATCH, OP_DELETE
|
||||||
|
using FormatSpecimens
|
||||||
|
import BGZFStreams: BGZFStream
|
||||||
|
import BioCore.Exceptions: MissingFieldException
|
||||||
|
import BioSequences: @dna_str, @aa_str
|
||||||
|
|
||||||
|
|
||||||
|
import BioCore:
|
||||||
|
header,
|
||||||
|
isfilled,
|
||||||
|
seqname,
|
||||||
|
hasseqname,
|
||||||
|
sequence,
|
||||||
|
hassequence,
|
||||||
|
leftposition,
|
||||||
|
rightposition,
|
||||||
|
hasleftposition,
|
||||||
|
hasrightposition
|
||||||
|
|
||||||
|
# Generate a random range within `range`.
|
||||||
|
function randrange(range)
|
||||||
|
x = rand(range)
|
||||||
|
y = rand(range)
|
||||||
|
if x < y
|
||||||
|
return x:y
|
||||||
|
else
|
||||||
|
return y:x
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
@testset "SAM" begin
|
||||||
|
samdir = path_of_format("SAM")
|
||||||
|
|
||||||
|
@testset "MetaInfo" begin
|
||||||
|
metainfo = SAM.MetaInfo()
|
||||||
|
@test !isfilled(metainfo)
|
||||||
|
@test occursin("not filled", repr(metainfo))
|
||||||
|
|
||||||
|
metainfo = SAM.MetaInfo("CO", "some comment (parens)")
|
||||||
|
@test isfilled(metainfo)
|
||||||
|
@test string(metainfo) == "@CO\tsome comment (parens)"
|
||||||
|
@test occursin("CO", repr(metainfo))
|
||||||
|
@test SAM.tag(metainfo) == "CO"
|
||||||
|
@test SAM.value(metainfo) == "some comment (parens)"
|
||||||
|
@test_throws ArgumentError keys(metainfo)
|
||||||
|
@test_throws ArgumentError values(metainfo)
|
||||||
|
|
||||||
|
metainfo = SAM.MetaInfo("HD", ["VN" => "1.0", "SO" => "coordinate"])
|
||||||
|
@test isfilled(metainfo)
|
||||||
|
@test string(metainfo) == "@HD\tVN:1.0\tSO:coordinate"
|
||||||
|
@test occursin("HD", repr(metainfo))
|
||||||
|
@test SAM.tag(metainfo) == "HD"
|
||||||
|
@test SAM.value(metainfo) == "VN:1.0\tSO:coordinate"
|
||||||
|
@test keys(metainfo) == ["VN", "SO"]
|
||||||
|
@test values(metainfo) == ["1.0", "coordinate"]
|
||||||
|
@test SAM.keyvalues(metainfo) == ["VN" => "1.0", "SO" => "coordinate"]
|
||||||
|
@test haskey(metainfo, "VN")
|
||||||
|
@test haskey(metainfo, "SO")
|
||||||
|
@test !haskey(metainfo, "GO")
|
||||||
|
@test metainfo["VN"] == "1.0"
|
||||||
|
@test metainfo["SO"] == "coordinate"
|
||||||
|
@test_throws KeyError metainfo["GO"]
|
||||||
|
end
|
||||||
|
|
||||||
|
@testset "Header" begin
|
||||||
|
header = SAM.Header()
|
||||||
|
@test isempty(header)
|
||||||
|
push!(header, SAM.MetaInfo("@HD\tVN:1.0\tSO:coordinate"))
|
||||||
|
@test !isempty(header)
|
||||||
|
@test length(header) == 1
|
||||||
|
push!(header, SAM.MetaInfo("@CO\tsome comment"))
|
||||||
|
@test length(header) == 2
|
||||||
|
@test isa(collect(header), Vector{SAM.MetaInfo})
|
||||||
|
end
|
||||||
|
|
||||||
|
@testset "Record" begin
|
||||||
|
record = SAM.Record()
|
||||||
|
@test !isfilled(record)
|
||||||
|
@test !SAM.ismapped(record)
|
||||||
|
@test repr(record) == "XAM.SAM.Record: <not filled>"
|
||||||
|
@test_throws ArgumentError SAM.flag(record)
|
||||||
|
|
||||||
|
record = SAM.Record("r001\t99\tchr1\t7\t30\t8M2I4M1D3M\t=\t37\t39\tTTAGATAAAGGATACTG\t*")
|
||||||
|
@test isfilled(record)
|
||||||
|
@test occursin(r"^XAM.SAM.Record:\n", repr(record))
|
||||||
|
@test SAM.ismapped(record)
|
||||||
|
@test SAM.isprimary(record)
|
||||||
|
@test SAM.hastempname(record)
|
||||||
|
@test SAM.tempname(record) == "r001"
|
||||||
|
@test SAM.hasflag(record)
|
||||||
|
@test SAM.flag(record) === UInt16(99)
|
||||||
|
@test SAM.hasrefname(record)
|
||||||
|
@test SAM.refname(record) == "chr1"
|
||||||
|
@test SAM.hasposition(record)
|
||||||
|
@test SAM.position(record) === 7
|
||||||
|
@test SAM.hasmappingquality(record)
|
||||||
|
@test SAM.mappingquality(record) === UInt8(30)
|
||||||
|
@test SAM.hascigar(record)
|
||||||
|
@test SAM.cigar(record) == "8M2I4M1D3M"
|
||||||
|
@test SAM.hasnextrefname(record)
|
||||||
|
@test SAM.nextrefname(record) == "="
|
||||||
|
@test SAM.hasnextposition(record)
|
||||||
|
@test SAM.nextposition(record) === 37
|
||||||
|
@test SAM.hastemplength(record)
|
||||||
|
@test SAM.templength(record) === 39
|
||||||
|
@test SAM.hassequence(record)
|
||||||
|
@test SAM.sequence(record) == dna"TTAGATAAAGGATACTG"
|
||||||
|
@test !SAM.hasquality(record)
|
||||||
|
@test_throws MissingFieldException SAM.quality(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
@testset "Reader" begin
|
||||||
|
reader = open(SAM.Reader, joinpath(samdir, "ce#1.sam"))
|
||||||
|
@test isa(reader, SAM.Reader)
|
||||||
|
@test eltype(reader) === SAM.Record
|
||||||
|
|
||||||
|
# header
|
||||||
|
h = header(reader)
|
||||||
|
@test string.(findall(header(reader), "SQ")) == ["@SQ\tSN:CHROMOSOME_I\tLN:1009800"]
|
||||||
|
|
||||||
|
# first record
|
||||||
|
record = SAM.Record()
|
||||||
|
read!(reader, record)
|
||||||
|
@test SAM.ismapped(record)
|
||||||
|
@test SAM.refname(record) == "CHROMOSOME_I"
|
||||||
|
@test SAM.position(record) == leftposition(record) == 2
|
||||||
|
@test SAM.rightposition(record) == rightposition(record) == 102
|
||||||
|
@test SAM.tempname(record) == seqname(record) == "SRR065390.14978392"
|
||||||
|
@test SAM.sequence(record) == sequence(record) == dna"CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA"
|
||||||
|
@test SAM.sequence(String, record) == "CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA"
|
||||||
|
@test SAM.seqlength(record) == 100
|
||||||
|
@test SAM.quality(record) == (b"#############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC" .- 33)
|
||||||
|
@test SAM.quality(String, record) == "#############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"
|
||||||
|
@test SAM.flag(record) == 16
|
||||||
|
@test SAM.cigar(record) == "27M1D73M"
|
||||||
|
@test SAM.alignment(record) == Alignment([
|
||||||
|
AlignmentAnchor( 0, 1, OP_START),
|
||||||
|
AlignmentAnchor( 27, 28, OP_MATCH),
|
||||||
|
AlignmentAnchor( 27, 29, OP_DELETE),
|
||||||
|
AlignmentAnchor(100, 102, OP_MATCH)])
|
||||||
|
@test record["XG"] == 1
|
||||||
|
@test record["XM"] == 5
|
||||||
|
@test record["XN"] == 0
|
||||||
|
@test record["XO"] == 1
|
||||||
|
@test record["AS"] == -18
|
||||||
|
@test record["XS"] == -18
|
||||||
|
@test record["YT"] == "UU"
|
||||||
|
@test eof(reader)
|
||||||
|
close(reader)
|
||||||
|
|
||||||
|
# iterator
|
||||||
|
@test length(collect(open(SAM.Reader, joinpath(samdir, "ce#1.sam")))) == 1
|
||||||
|
@test length(collect(open(SAM.Reader, joinpath(samdir, "ce#2.sam")))) == 2
|
||||||
|
|
||||||
|
# IOStream
|
||||||
|
@test length(collect(SAM.Reader(open(joinpath(samdir, "ce#1.sam"))))) == 1
|
||||||
|
@test length(collect(SAM.Reader(open(joinpath(samdir, "ce#2.sam"))))) == 2
|
||||||
|
end
|
||||||
|
|
||||||
|
@testset "Round trip" begin
|
||||||
|
function compare_records(xs, ys)
|
||||||
|
if length(xs) != length(ys)
|
||||||
|
return false
|
||||||
|
end
|
||||||
|
for (x, y) in zip(xs, ys)
|
||||||
|
if x.data[x.filled] != y.data[y.filled]
|
||||||
|
return false
|
||||||
|
end
|
||||||
|
end
|
||||||
|
return true
|
||||||
|
end
|
||||||
|
for specimen in list_valid_specimens("SAM")
|
||||||
|
filepath = joinpath(samdir, filename(specimen))
|
||||||
|
mktemp() do path, io
|
||||||
|
# copy
|
||||||
|
reader = open(SAM.Reader, filepath)
|
||||||
|
writer = SAM.Writer(io, header(reader))
|
||||||
|
records = SAM.Record[]
|
||||||
|
for record in reader
|
||||||
|
push!(records, record)
|
||||||
|
write(writer, record)
|
||||||
|
end
|
||||||
|
close(reader)
|
||||||
|
close(writer)
|
||||||
|
@test compare_records(open(collect, SAM.Reader, path), records)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
@testset "BAM" begin
|
||||||
|
bamdir = path_of_format("BAM")
|
||||||
|
|
||||||
|
@testset "AuxData" begin
|
||||||
|
auxdata = BAM.AuxData(UInt8[])
|
||||||
|
@test isempty(auxdata)
|
||||||
|
|
||||||
|
buf = IOBuffer()
|
||||||
|
write(buf, "NM", UInt8('s'), Int16(1))
|
||||||
|
auxdata = BAM.AuxData(take!(buf))
|
||||||
|
@test length(auxdata) == 1
|
||||||
|
@test auxdata["NM"] === Int16(1)
|
||||||
|
@test collect(auxdata) == ["NM" => Int16(1)]
|
||||||
|
|
||||||
|
buf = IOBuffer()
|
||||||
|
write(buf, "AS", UInt8('c'), Int8(-18))
|
||||||
|
write(buf, "NM", UInt8('s'), Int16(1))
|
||||||
|
write(buf, "XA", UInt8('f'), Float32(3.14))
|
||||||
|
write(buf, "XB", UInt8('Z'), "some text\0")
|
||||||
|
write(buf, "XC", UInt8('B'), UInt8('i'), Int32(3), Int32[10, -5, 8])
|
||||||
|
auxdata = BAM.AuxData(take!(buf))
|
||||||
|
@test length(auxdata) == 5
|
||||||
|
@test auxdata["AS"] === Int8(-18)
|
||||||
|
@test auxdata["NM"] === Int16(1)
|
||||||
|
@test auxdata["XA"] === Float32(3.14)
|
||||||
|
@test auxdata["XB"] == "some text"
|
||||||
|
@test auxdata["XC"] == Int32[10, -5, 8]
|
||||||
|
@test convert(Dict{String,Any}, auxdata) == Dict(
|
||||||
|
"AS" => Int8(-18),
|
||||||
|
"NM" => Int16(1),
|
||||||
|
"XA" => Float32(3.14),
|
||||||
|
"XB" => "some text",
|
||||||
|
"XC" => Int32[10, -5, 8])
|
||||||
|
end
|
||||||
|
|
||||||
|
@testset "Record" begin
|
||||||
|
record = BAM.Record()
|
||||||
|
@test !isfilled(record)
|
||||||
|
@test repr(record) == "XAM.BAM.Record: <not filled>"
|
||||||
|
@test_throws ArgumentError BAM.flag(record)
|
||||||
|
end
|
||||||
|
|
||||||
|
@testset "Reader" begin
|
||||||
|
reader = open(BAM.Reader, joinpath(bamdir, "ce#1.bam"))
|
||||||
|
@test isa(reader, BAM.Reader)
|
||||||
|
@test eltype(reader) === BAM.Record
|
||||||
|
@test startswith(repr(reader), "XAM.BAM.Reader{IOStream}:")
|
||||||
|
|
||||||
|
# header
|
||||||
|
h = header(reader)
|
||||||
|
@test isa(h, SAM.Header)
|
||||||
|
|
||||||
|
# first record
|
||||||
|
record = BAM.Record()
|
||||||
|
read!(reader, record)
|
||||||
|
@test BAM.ismapped(record)
|
||||||
|
@test BAM.isprimary(record)
|
||||||
|
@test ! BAM.ispositivestrand(record)
|
||||||
|
@test BAM.refname(record) == "CHROMOSOME_I"
|
||||||
|
@test BAM.refid(record) === 1
|
||||||
|
@test BAM.hasnextrefid(record)
|
||||||
|
@test BAM.nextrefid(record) === 0
|
||||||
|
@test BAM.hasposition(record) === hasleftposition(record) === true
|
||||||
|
@test BAM.position(record) === leftposition(record) === 2
|
||||||
|
@test BAM.hasnextposition(record)
|
||||||
|
@test BAM.nextposition(record) === 0
|
||||||
|
@test rightposition(record) == 102
|
||||||
|
@test BAM.hastempname(record) === hasseqname(record) === true
|
||||||
|
@test BAM.tempname(record) == seqname(record) == "SRR065390.14978392"
|
||||||
|
@test BAM.hassequence(record) === hassequence(record) === true
|
||||||
|
@test BAM.sequence(record) == sequence(record) == dna"""
|
||||||
|
CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCT
|
||||||
|
AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA
|
||||||
|
"""
|
||||||
|
@test BAM.seqlength(record) === 100
|
||||||
|
@test BAM.hasquality(record)
|
||||||
|
@test eltype(BAM.quality(record)) == UInt8
|
||||||
|
@test BAM.quality(record) == [Int(x) - 33 for x in "#############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"]
|
||||||
|
@test BAM.flag(record) === UInt16(16)
|
||||||
|
@test BAM.cigar(record) == "27M1D73M"
|
||||||
|
@test BAM.alignment(record) == Alignment([
|
||||||
|
AlignmentAnchor( 0, 1, OP_START),
|
||||||
|
AlignmentAnchor( 27, 28, OP_MATCH),
|
||||||
|
AlignmentAnchor( 27, 29, OP_DELETE),
|
||||||
|
AlignmentAnchor(100, 102, OP_MATCH)])
|
||||||
|
@test record["XG"] == 1
|
||||||
|
@test record["XM"] == 5
|
||||||
|
@test record["XN"] == 0
|
||||||
|
@test record["XO"] == 1
|
||||||
|
@test record["AS"] == -18
|
||||||
|
@test record["XS"] == -18
|
||||||
|
@test record["YT"] == "UU"
|
||||||
|
@test keys(record) == ["XG","XM","XN","XO","AS","XS","YT"]
|
||||||
|
@test values(record) == [1, 5, 0, 1, -18, -18, "UU"]
|
||||||
|
@test eof(reader)
|
||||||
|
close(reader)
|
||||||
|
|
||||||
|
# Test conversion from byte array to record
|
||||||
|
dsize = BAM.data_size(record)
|
||||||
|
array = Vector{UInt8}(undef, BAM.FIXED_FIELDS_BYTES + dsize)
|
||||||
|
GC.@preserve array record begin
|
||||||
|
ptr = Ptr{UInt8}(pointer_from_objref(record))
|
||||||
|
unsafe_copyto!(pointer(array), ptr, BAM.FIXED_FIELDS_BYTES)
|
||||||
|
unsafe_copyto!(array, BAM.FIXED_FIELDS_BYTES + 1, record.data, 1, dsize)
|
||||||
|
end
|
||||||
|
new_record = convert(BAM.Record, array)
|
||||||
|
@test record.bin_mq_nl == new_record.bin_mq_nl
|
||||||
|
@test record.block_size == new_record.block_size
|
||||||
|
@test record.flag_nc == new_record.flag_nc
|
||||||
|
@test record.l_seq == new_record.l_seq
|
||||||
|
@test record.next_refid == new_record.next_refid
|
||||||
|
@test record.next_pos == new_record.next_pos
|
||||||
|
@test record.refid == new_record.refid
|
||||||
|
@test record.pos == new_record.pos
|
||||||
|
@test record.tlen == new_record.tlen
|
||||||
|
@test record.data == new_record.data
|
||||||
|
|
||||||
|
# iterator
|
||||||
|
@test length(collect(open(BAM.Reader, joinpath(bamdir, "ce#1.bam")))) == 1
|
||||||
|
@test length(collect(open(BAM.Reader, joinpath(bamdir, "ce#2.bam")))) == 2
|
||||||
|
|
||||||
|
# IOStream
|
||||||
|
@test length(collect(BAM.Reader(open(joinpath(bamdir, "ce#1.bam"))))) == 1
|
||||||
|
@test length(collect(BAM.Reader(open(joinpath(bamdir, "ce#2.bam"))))) == 2
|
||||||
|
end
|
||||||
|
|
||||||
|
@testset "Read long CIGARs" begin
|
||||||
|
function get_cigar_lens(rec::BAM.Record)
|
||||||
|
cigar_ops, cigar_n = BAM.cigar_rle(rec)
|
||||||
|
field_ops, field_n = BAM.cigar_rle(rec, false)
|
||||||
|
cigar_l = length(cigar_ops)
|
||||||
|
field_l = length(field_ops)
|
||||||
|
return cigar_l, field_l
|
||||||
|
end
|
||||||
|
|
||||||
|
function check_cigar_vs_field(rec::BAM.Record)
|
||||||
|
cigar = BAM.cigar(rec)
|
||||||
|
field = BAM.cigar(rec, false)
|
||||||
|
cigar_l, field_l = get_cigar_lens(rec)
|
||||||
|
return cigar != field && cigar_l != field_l
|
||||||
|
end
|
||||||
|
|
||||||
|
function check_cigar_lens(rec::BAM.Record, field_len, cigar_len)
|
||||||
|
cigar_l, field_l = get_cigar_lens(rec)
|
||||||
|
return cigar_l == cigar_len && field_l == field_len
|
||||||
|
end
|
||||||
|
|
||||||
|
reader = open(BAM.Reader, joinpath(bamdir, "cigar-64k.bam"))
|
||||||
|
rec = BAM.Record()
|
||||||
|
read!(reader, rec)
|
||||||
|
@test !check_cigar_vs_field(rec)
|
||||||
|
read!(reader, rec)
|
||||||
|
@test check_cigar_vs_field(rec)
|
||||||
|
@test check_cigar_lens(rec, 2, 72091)
|
||||||
|
end
|
||||||
|
|
||||||
|
function compare_records(xs, ys)
|
||||||
|
if length(xs) != length(ys)
|
||||||
|
return false
|
||||||
|
end
|
||||||
|
for (x, y) in zip(xs, ys)
|
||||||
|
if !(
|
||||||
|
x.block_size == y.block_size &&
|
||||||
|
x.refid == y.refid &&
|
||||||
|
x.pos == y.pos &&
|
||||||
|
x.bin_mq_nl == y.bin_mq_nl &&
|
||||||
|
x.flag_nc == y.flag_nc &&
|
||||||
|
x.l_seq == y.l_seq &&
|
||||||
|
x.next_refid == y.next_refid &&
|
||||||
|
x.next_pos == y.next_pos &&
|
||||||
|
x.tlen == y.tlen &&
|
||||||
|
x.data[1:BAM.data_size(x)] == y.data[1:BAM.data_size(y)])
|
||||||
|
return false
|
||||||
|
end
|
||||||
|
end
|
||||||
|
return true
|
||||||
|
end
|
||||||
|
|
||||||
|
@testset "Round trip" begin
|
||||||
|
for specimen in list_valid_specimens("BAM")
|
||||||
|
filepath = joinpath(bamdir, filename(specimen))
|
||||||
|
mktemp() do path, _
|
||||||
|
# copy
|
||||||
|
if hastags(specimen) && in("bai", tags(specimen))
|
||||||
|
reader = open(BAM.Reader, filepath, index=filepath * ".bai")
|
||||||
|
else
|
||||||
|
reader = open(BAM.Reader, filepath)
|
||||||
|
end
|
||||||
|
writer = BAM.Writer(
|
||||||
|
BGZFStream(path, "w"), BAM.header(reader, fillSQ=isempty(findall(header(reader), "SQ"))))
|
||||||
|
records = BAM.Record[]
|
||||||
|
for record in reader
|
||||||
|
push!(records, record)
|
||||||
|
write(writer, record)
|
||||||
|
end
|
||||||
|
close(reader)
|
||||||
|
close(writer)
|
||||||
|
@test compare_records(open(collect, BAM.Reader, path), records)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
@testset "Random access" begin
|
||||||
|
filepath = joinpath(bamdir, "GSE25840_GSM424320_GM06985_gencode_spliced.head.bam")
|
||||||
|
reader = open(BAM.Reader, filepath, index=filepath * ".bai")
|
||||||
|
|
||||||
|
@test isa(eachoverlap(reader, "chr1", 1:100), BAM.OverlapIterator)
|
||||||
|
@test isa(eachoverlap(reader, GenomicFeatures.Interval("chr1", 1, 100)), BAM.OverlapIterator)
|
||||||
|
|
||||||
|
# expected values are counted using samtools
|
||||||
|
for (refname, interval, expected) in [
|
||||||
|
("chr1", 1_000:10000, 21),
|
||||||
|
("chr1", 8_000:10000, 20),
|
||||||
|
("chr1", 766_000:800_000, 142),
|
||||||
|
("chr1", 786_000:800_000, 1),
|
||||||
|
("chr1", 796_000:800_000, 0)]
|
||||||
|
intsect = eachoverlap(reader, refname, interval)
|
||||||
|
@test eltype(intsect) == BAM.Record
|
||||||
|
@test count(_ -> true, intsect) == expected
|
||||||
|
# check that the intersection iterator is stateless
|
||||||
|
@test count(_ -> true, intsect) == expected
|
||||||
|
end
|
||||||
|
|
||||||
|
# randomized tests
|
||||||
|
for n in 1:50
|
||||||
|
refindex = 1
|
||||||
|
refname = "chr1"
|
||||||
|
range = randrange(1:1_000_000)
|
||||||
|
seekstart(reader)
|
||||||
|
# linear scan
|
||||||
|
expected = filter(collect(reader)) do record
|
||||||
|
BAM.compare_intervals(record, (refindex, range)) == 0
|
||||||
|
end
|
||||||
|
# indexed scan
|
||||||
|
actual = collect(eachoverlap(reader, refname, range))
|
||||||
|
@test compare_records(actual, expected)
|
||||||
|
end
|
||||||
|
close(reader)
|
||||||
|
|
||||||
|
filepath = joinpath(bamdir, "R_12h_D06.uniq.q40.bam")
|
||||||
|
reader = open(BAM.Reader, filepath, index=filepath * ".bai")
|
||||||
|
@test isempty(collect(eachoverlap(reader, "chr19", 5823708:5846478)))
|
||||||
|
close(reader)
|
||||||
|
end
|
||||||
|
end
|
Loading…
Reference in a new issue