1
0
Fork 0
mirror of https://github.com/MillironX/XAM.jl.git synced 2024-12-23 21:28:18 +00:00

Merge branch 'release/v0.1.0'

This commit is contained in:
Ciarán O'Mara 2020-02-28 16:18:23 +11:00
commit 109f25bd52
35 changed files with 3676 additions and 2 deletions

1
.codecov.yml Normal file
View file

@ -0,0 +1 @@
comment: false

1
.github/FUNDING.yml vendored Normal file
View file

@ -0,0 +1 @@
open_collective: biojulia

40
.github/ISSUE_TEMPLATE.md vendored Normal file
View file

@ -0,0 +1,40 @@
<!--- Provide a general summary of the issue in the Title above -->
> _This template is rather extensive. Fill out all that you can, if are a new contributor or you're unsure about any section, leave it unchanged and a reviewer will help you_ :smile:. _This template is simply a tool to help everyone remember the BioJulia guidelines, if you feel anything in this template is not relevant, simply delete it._
## Expected Behavior
<!--- If you're describing a bug, tell us what you expect to happen -->
<!--- If you're suggesting a change/improvement, tell us how it should work -->
## Current Behavior
<!--- If describing a bug, tell us what happens instead of the expected behavior -->
<!--- If suggesting a change/improvement, explain the difference from current behavior -->
## Possible Solution / Implementation
<!--- If describing a bug, suggest a fix/reason for the bug (optional) -->
<!--- If you're suggesting a change/improvement, suggest ideas how to implement the addition or change -->
## Steps to Reproduce (for bugs)
<!--- You may include copy/pasteable snippets or a list of steps to reproduce the bug -->
1.
2.
3.
4.
<!--- Optionally, provide a link to a live example -->
<!--- You can use [this tool](https://www.cockos.com/licecap/) -->
<!--- ...Or [this tool](https://github.com/colinkeenan/silentcast) -->
<!--- ...Or [this tool](https://github.com/GNOME/byzanz) on Linux -->
## Context
<!--- How has this issue affected you? What are you trying to accomplish? -->
<!--- Providing context helps us come up with a solution that is most useful in the real world -->
## Your Environment
<!--- Include as many relevant details about the environment you experienced the bug in -->
- Package Version used:
- Julia Version used:
- Operating System and version (desktop or mobile):
- Link to your project:
<!-- Can you list installed packages here? -->

47
.github/PULL_REQUEST_TEMPLATE.md vendored Normal file
View file

@ -0,0 +1,47 @@
# A clear and descriptive title (No issue numbers please)
> _This template is rather extensive. Fill out all that you can, if are a new contributor or you're unsure about any section, leave it unchanged and a reviewer will help you_ :smile:. _This template is simply a tool to help everyone remember the BioJulia guidelines, if you feel anything in this template is not relevant, simply delete it._
## Types of changes
This PR implements the following changes:
_(Please tick any or all of the following that are applicable)_
* [ ] :sparkles: New feature (A non-breaking change which adds functionality).
* [ ] :bug: Bug fix (A non-breaking change, which fixes an issue).
* [ ] :boom: Breaking change (fix or feature that would cause existing functionality to change).
## :clipboard: Additional detail
- If you have implemented new features or behaviour
- **Provide a description of the addition** in as many details as possible.
- **Provide justification of the addition**.
- **Provide a runnable example of use of your addition**. This lets reviewers
and others try out the feature before it is merged or makes it's way to release.
- If you have changed current behaviour...
- **Describe the behaviour prior to you changes**
- **Describe the behaviour after your changes** and justify why you have made the changes,
Please describe any breakages you anticipate as a result of these changes.
- **Does your change alter APIs or existing exposed methods/types?**
If so, this may cause dependency issues and breakages, so the maintainer
will need to consider this when versioning the next release.
- If you are implementing changes that are intended to increase performance, you
should provide the results of a simple performance benchmark exercise
demonstrating the improvement. Especially if the changes make code less legible.
## :ballot_box_with_check: Checklist
- [ ] :art: The changes implemented is consistent with the [julia style guide](https://docs.julialang.org/en/stable/manual/style-guide/).
- [ ] :blue_book: I have updated and added relevant docstrings, in a manner consistent with the [documentation styleguide](https://docs.julialang.org/en/stable/manual/documentation/).
- [ ] :blue_book: I have added or updated relevant user and developer manuals/documentation in `docs/src/`.
- [ ] :ok: There are unit tests that cover the code changes I have made.
- [ ] :ok: The unit tests cover my code changes AND they pass.
- [ ] :pencil: I have added an entry to the `[UNRELEASED]` section of the manually curated `CHANGELOG.md` file for this repository.
- [ ] :ok: All changes should be compatible with the latest stable version of Julia.
- [ ] :thought_balloon: I have commented liberally for any complex pieces of internal code.

38
.github/workflows/CompatHelper.yml vendored Normal file
View file

@ -0,0 +1,38 @@
name: CompatHelper
on:
schedule:
- cron: '00 * * * *'
jobs:
CompatHelper:
runs-on: ${{ matrix.os }}
strategy:
matrix:
julia-version: [1.3.0]
julia-arch: [x86]
os: [ubuntu-latest]
steps:
- uses: julia-actions/setup-julia@latest
with:
version: ${{ matrix.julia-version }}
- name: Add CompatHelper
run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
- name: CompatHelper.main
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: julia -e '
using CompatHelper, Pkg;
my_registries = [
Pkg.RegistrySpec(
name = "BioJuliaRegistry",
uuid = "ccbd2cc2-2954-11e9-1ccf-f3e7900901ca",
url = "https://github.com/BioJulia/BioJuliaRegistry.git"
),
Pkg.RegistrySpec(
name = "General",
uuid = "23338594-aafe-5451-b93e-139f81909106",
url = "https://github.com/JuliaRegistries/General.git"
)
];
CompatHelper.main(; registries = my_registries, master_branch = "master");'

34
.github/workflows/Documentation.yml vendored Normal file
View file

@ -0,0 +1,34 @@
name: Documentation
on:
push:
branches:
- 'master'
- 'develop'
- 'release/.*'
tags: '*'
pull_request:
jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
matrix:
julia-version: [1.3.0]
julia-arch: [x86]
os: [ubuntu-latest]
steps:
- uses: actions/checkout@v1.0.0
- uses: julia-actions/setup-julia@latest
with:
version: ${{ matrix.julia-version }}
- name: Install dependencies
run: |
julia ci_prep.jl;
julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
- name: Build and deploy
env:
# https://github.com/JuliaDocs/Documenter.jl/issues/1177
# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # For authentication with GitHub Actions token
DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} # For authentication with SSH deploy key
run: julia --project=docs/ --color=yes docs/make.jl

13
.github/workflows/TagBot.yml vendored Normal file
View file

@ -0,0 +1,13 @@
name: TagBot
on:
schedule:
- cron: '0 * * * *'
jobs:
TagBot:
runs-on: ubuntu-latest
steps:
- uses: JuliaRegistries/TagBot@v1
with:
token: ${{ secrets.GITHUB_TOKEN }}
ssh: ${{ secrets.TAGBOT_KEY }}
registry: BioJulia/BioJuliaRegistry

22
.github/workflows/UnitTests.yml vendored Normal file
View file

@ -0,0 +1,22 @@
name: Unit tests
on: [push, pull_request]
jobs:
test:
runs-on: ${{ matrix.os }}
strategy:
matrix:
julia-version: ['1.1', '1.2', '1.3']
julia-arch: [x64]
os: [ubuntu-latest, windows-latest, macOS-latest]
steps:
- uses: actions/checkout@v1.0.0
- uses: julia-actions/setup-julia@v1
with:
version: ${{ matrix.julia-version }}
arch: ${{ matrix.julia-arch }}
- name: Install dependencies
run: julia ci_prep.jl
- uses: julia-actions/julia-runtest@master

33
Project.toml Normal file
View file

@ -0,0 +1,33 @@
name = "XAM"
uuid = "d759349c-bcba-11e9-07c2-5b90f8f05f7c"
authors = ["Kenta Sato <bicycle1885@gmail.com>", "Ben J. Ward <ward9250@gmail.com>", "Ciarán O'Mara <Ciaran.OMara@utas.edu.au>"]
version = "0.1.0"
[deps]
Automa = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b"
BGZFStreams = "28d598bf-9b8f-59f1-b38c-5a06b4a0f5e6"
BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e"
BioCore = "37cfa864-2cd6-5c12-ad9e-b6597d696c81"
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
BufferedStreams = "e1450e63-4bb3-523b-b2a4-4ffa8c0fd77d"
GenomicFeatures = "899a7d2d-5c61-547b-bef9-6698a8d05446"
Indexes = "4ffb77ac-cb80-11e8-1b35-4b78cc642f6d"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
[compat]
Automa = "0.7, 0.8"
BGZFStreams = "0.3"
BioAlignments = "2"
BioCore = "2"
BioSequences = "2"
BufferedStreams = "1"
GenomicFeatures = "2"
Indexes = "0.1"
julia = "1.1"
[extras]
FormatSpecimens = "3372ea36-2a1a-11e9-3eb7-996970b6ffbd"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[targets]
test = ["FormatSpecimens", "Test"]

View file

@ -1,3 +1,76 @@
# XAM.jl
# <img src="./docs/src/assets/logo.svg" width="30%" align="right" /> XAM.jl
In development: minimal working separation of SAM and BAM from BioAlignments.
[![Project Status: Active The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active)
[![Latest Release](https://img.shields.io/github/release/BioJulia/XAM.jl.svg)](https://github.com/BioJulia/XAM.jl/releases/latest)
[![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/XAM.jl/blob/master/LICENSE)
[![Stable documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/XAM.jl/stable)
[![Latest documentation](https://img.shields.io/badge/docs-dev-blue.svg)](https://biojulia.github.io/XAM.jl/dev/)
[![Join the chat at https://gitter.im/BioJulia/XAM.jl](https://badges.gitter.im/BioJulia/XAM.jl.svg)](https://gitter.im/BioJulia/XAM.jl)
> This project follows the [semver](http://semver.org) pro forma and uses the [git-flow branching model](https://nvie.com/posts/a-successful-git-branching-model/ "original
blog post").
## Description
XAM provides I/O and utilities for manipulating SAM and BAM formatted alignment map files.
## Installation
The latest version of XAM is made available to install through BioJulia's package registry.
By default, Julia's package manager only includes the "General" package registry.
To add the BioJulia registry from the [Julia REPL](https://docs.julialang.org/en/v1/manual/getting-started/), press `]` to enter [pkg mode](https://docs.julialang.org/en/v1/stdlib/Pkg/), then enter the following command:
```julia
registry add https://github.com/BioJulia/BioJuliaRegistry.git
```
Once the registry is added to your configuration, you can install XAM while in [pkg mode](https://docs.julialang.org/en/v1/stdlib/Pkg/) with the following:
```julia
add XAM
```
If you are interested in the cutting edge of the development, please check out the [develop branch](https://github.com/BioJulia/XAM.jl/tree/develop) to try new features before release.
## Testing
XAM is tested against Julia `1.X` on Linux, OS X, and Windows.
**Latest build status:**
[![Unit tests](https://github.com/BioJulia/XAM.jl/workflows/Unit%20tests/badge.svg?branch=master)](https://github.com/BioJulia/XAM.jl/actions?query=workflow%3A%22Unit+tests%22+branch%3Amaster)
[![Documentation](https://github.com/BioJulia/XAM.jl/workflows/Documentation/badge.svg?branch=master)](https://github.com/BioJulia/XAM.jl/actions?query=workflow%3ADocumentation+branch%3Amaster)
[![codecov](https://codecov.io/gh/BioJulia/XAM.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/BioJulia/XAM.jl)
## Contributing
We appreciate [contributions](https://github.com/BioJulia/XAM.jl/graphs/contributors) from users including reporting bugs, fixing issues, improving performance and adding new features.
Take a look at the [contributing files](https://github.com/BioJulia/Contributing) detailed contributor and maintainer guidelines, and code of conduct.
### Financial contributions
We also welcome financial contributions in full transparency on our [open collective](https://opencollective.com/biojulia).
Anyone can file an expense.
If the expense makes sense for the development the core contributors and the person who filed the expense will be reimbursed.
## Backers & Sponsors
Thank you to all our backers and sponsors!
Love our work and community? [Become a backer](https://opencollective.com/biojulia#backer).
[![backers](https://opencollective.com/biojulia/backers.svg?width=890)](https://opencollective.com/biojulia#backers)
Does your company use BioJulia?
Help keep BioJulia feature rich and healthy by [sponsoring the project](https://opencollective.com/biojulia#sponsor).
Your logo will show up here with a link to your website.
[![](https://opencollective.com/biojulia/sponsor/0/avatar.svg)](https://opencollective.com/biojulia/sponsor/0/website)
[![](https://opencollective.com/biojulia/sponsor/1/avatar.svg)](https://opencollective.com/biojulia/sponsor/1/website)
[![](https://opencollective.com/biojulia/sponsor/2/avatar.svg)](https://opencollective.com/biojulia/sponsor/2/website)
[![](https://opencollective.com/biojulia/sponsor/3/avatar.svg)](https://opencollective.com/biojulia/sponsor/3/website)
[![](https://opencollective.com/biojulia/sponsor/4/avatar.svg)](https://opencollective.com/biojulia/sponsor/4/website)
[![](https://opencollective.com/biojulia/sponsor/5/avatar.svg)](https://opencollective.com/biojulia/sponsor/5/website)
[![](https://opencollective.com/biojulia/sponsor/6/avatar.svg)](https://opencollective.com/biojulia/sponsor/6/website)
[![](https://opencollective.com/biojulia/sponsor/7/avatar.svg)](https://opencollective.com/biojulia/sponsor/7/website)
[![](https://opencollective.com/biojulia/sponsor/8/avatar.svg)](https://opencollective.com/biojulia/sponsor/8/website)
[![](https://opencollective.com/biojulia/sponsor/9/avatar.svg)](https://opencollective.com/biojulia/sponsor/9/website)
## Questions?
If you have a question about contributing or using BioJulia software, come on over and chat to us on [Gitter](https://gitter.im/BioJulia/General), or you can try the [Bio category of the Julia discourse site](https://discourse.julialang.org/c/domain/bio).

3
ci_prep.jl Normal file
View file

@ -0,0 +1,3 @@
using Pkg.Registry
Registry.add(Registry.RegistrySpec(url = "https://github.com/BioJulia/BioJuliaRegistry.git"))
Registry.add(Registry.RegistrySpec(url = "https://github.com/JuliaRegistries/General.git"))

2
coverage/Project.toml Normal file
View file

@ -0,0 +1,2 @@
[deps]
Coverage = "a2441757-f6aa-5fb2-8edb-039e3f45d037"

11
coverage/coverage.jl Normal file
View file

@ -0,0 +1,11 @@
get(ENV, "TRAVIS_OS_NAME", "") == "linux" || exit()
get(ENV, "TRAVIS_JULIA_VERSION", "") == "1.3" || exit()
using Pkg
Pkg.instantiate()
using Coverage
cd(joinpath(@__DIR__, "..")) do
Codecov.submit(Codecov.process_folder())
end

6
docs/Project.toml Normal file
View file

@ -0,0 +1,6 @@
[deps]
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
[compat]
Documenter = "0.24"

21
docs/make.jl Normal file
View file

@ -0,0 +1,21 @@
using Pkg
using Documenter, XAM
makedocs(
format = Documenter.HTML(
edit_link = "develop"
),
modules = [XAM, XAM.SAM, XAM.BAM],
sitename = "XAM.jl",
pages = [
"Home" => "index.md",
"SAM and BAM" => "man/hts-files.md",
"API Reference" => "man/api.md"
],
authors = replace(join(Pkg.TOML.parsefile("Project.toml")["authors"], ", "), r" <.*?>" => "" ) * ", The BioJulia Organisation, and other contributors."
)
deploydocs(
repo = "github.com/BioJulia/XAM.jl.git",
devbranch = "develop",
push_preview = true
)

26
docs/src/assets/logo.svg Normal file
View file

@ -0,0 +1,26 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 136.48101 51.472999">
<g id="bubble-helix">
<circle cx="5.9792" cy="20.3" r="5.9792" fill="#945bb0"/>
<circle cx="21.317" cy="39.5" r="4.9393" fill="#4266d5"/>
<circle cx="16.378" cy="11.7" r="4.9393" fill="#c93d39"/>
<circle cx="35.095" cy="35.9" r="5.9792" fill="#945bb0"/>
<circle cx="47.184" cy="23.8" r="7.409" fill="#4266d5"/>
<circle cx="28.986" cy="9.7" r="4.5494" fill="#3b972e"/>
<circle cx="89.428" cy="46.5" r="4.9393" fill="#c93d39"/>
<circle cx="71.62" cy="5.6" r="5.5892" fill="#945bb0"/>
<circle cx="121.01" cy="15.5" r="5.5892" fill="#c93d39"/>
<circle cx="59.662" cy="30.3" r="2.4697" fill="#4266d5"/>
<circle cx="103.6" cy="19.9" r="3.2496" fill="#c93d39"/>
<circle cx="123.61" cy="36.5" r="2.7296" fill="#4266d5"/>
<circle cx="127.64" cy="5.2" r="3.8995" fill="#3b972e"/>
<circle cx="132.58" cy="39.2" r="3.8995" fill="#945bb0"/>
<circle cx="67.591" cy="37.4" r="3.3795" fill="#3b972e"/>
<circle cx="95.017" cy="13.9" r="4.2894" fill="#4266d5"/>
<circle cx="77.729" cy="42.9" r="3.8995" fill="#945bb0"/>
<circle cx="102.56" cy="42.0" r="5.8492" fill="#4266d5"/>
<circle cx="113.6" cy="28.8" r="6.7591" fill="#3b972e"/>
<circle cx="57.582" cy="11.0" r="6.3691" fill="#c93d39"/>
<circle cx="38.735" cy="14.8" r="3.3795" fill="#c93d39"/>
<circle cx="84.618" cy="7.9" r="5.3293" fill="#3b972e"/>
</g>
</svg>

After

Width:  |  Height:  |  Size: 1.4 KiB

77
docs/src/index.md Normal file
View file

@ -0,0 +1,77 @@
# XAM.jl
[![Project Status: Active The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active)
[![Latest Release](https://img.shields.io/github/release/BioJulia/XAM.jl.svg)](https://github.com/BioJulia/XAM.jl/releases/latest)
[![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/XAM.jl/blob/master/LICENSE)
[![Stable documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/XAM.jl/stable)
[![Latest documentation](https://img.shields.io/badge/docs-dev-blue.svg)](https://biojulia.github.io/XAM.jl/dev/)
[![Join the chat at https://gitter.im/BioJulia/XAM.jl](https://badges.gitter.im/BioJulia/XAM.jl.svg)](https://gitter.im/BioJulia/XAM.jl)
> This project follows the [semver](http://semver.org) pro forma and uses the [git-flow branching model](https://nvie.com/posts/a-successful-git-branching-model/).
## Description
XAM provides I/O and utilities for manipulating SAM and BAM formatted alignment map files.
## Installation
The latest version of XAM is made available to install through BioJulia's package registry.
By default, Julia's package manager only includes the "General" package registry.
To add the BioJulia registry from the [Julia REPL](https://docs.julialang.org/en/v1/manual/getting-started/), press `]` to enter [pkg mode](https://docs.julialang.org/en/v1/stdlib/Pkg/), then enter the following command:
```julia
registry add https://github.com/BioJulia/BioJuliaRegistry.git
```
Once the registry is added to your configuration, you can install XAM while in [pkg mode](https://docs.julialang.org/en/v1/stdlib/Pkg/) with the following:
```julia
add XAM
```
If you are interested in the cutting edge of the development, please check out the [develop branch](https://github.com/BioJulia/XAM.jl/tree/develop) to try new features before release.
## Testing
XAM is tested against Julia `1.X` on Linux, OS X, and Windows.
**Latest build status:**
[![Unit tests](https://github.com/BioJulia/XAM.jl/workflows/Unit%20tests/badge.svg?branch=master)](https://github.com/BioJulia/XAM.jl/actions?query=workflow%3A%22Unit+tests%22+branch%3Amaster)
[![Documentation](https://github.com/BioJulia/XAM.jl/workflows/Documentation/badge.svg?branch=master)](https://github.com/BioJulia/XAM.jl/actions?query=workflow%3ADocumentation+branch%3Amaster)
[![codecov](https://codecov.io/gh/BioJulia/XAM.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/BioJulia/XAM.jl)
## Contributing
We appreciate [contributions](https://github.com/BioJulia/XAM.jl/graphs/contributors) from users including reporting bugs, fixing issues, improving performance and adding new features.
Take a look at the [contributing files](https://github.com/BioJulia/Contributing) detailed contributor and maintainer guidelines, and code of conduct.
### Financial contributions
We also welcome financial contributions in full transparency on our [open collective](https://opencollective.com/biojulia).
Anyone can file an expense.
If the expense makes sense for the development the core contributors and the person who filed the expense will be reimbursed.
## Backers & Sponsors
Thank you to all our backers and sponsors!
Love our work and community? [Become a backer](https://opencollective.com/biojulia#backer).
[![backers](https://opencollective.com/biojulia/backers.svg?width=890)](https://opencollective.com/biojulia#backers)
Does your company use BioJulia?
Help keep BioJulia feature rich and healthy by [sponsoring the project](https://opencollective.com/biojulia#sponsor).
Your logo will show up here with a link to your website.
[![](https://opencollective.com/biojulia/sponsor/0/avatar.svg)](https://opencollective.com/biojulia/sponsor/0/website)
[![](https://opencollective.com/biojulia/sponsor/1/avatar.svg)](https://opencollective.com/biojulia/sponsor/1/website)
[![](https://opencollective.com/biojulia/sponsor/2/avatar.svg)](https://opencollective.com/biojulia/sponsor/2/website)
[![](https://opencollective.com/biojulia/sponsor/3/avatar.svg)](https://opencollective.com/biojulia/sponsor/3/website)
[![](https://opencollective.com/biojulia/sponsor/4/avatar.svg)](https://opencollective.com/biojulia/sponsor/4/website)
[![](https://opencollective.com/biojulia/sponsor/5/avatar.svg)](https://opencollective.com/biojulia/sponsor/5/website)
[![](https://opencollective.com/biojulia/sponsor/6/avatar.svg)](https://opencollective.com/biojulia/sponsor/6/website)
[![](https://opencollective.com/biojulia/sponsor/7/avatar.svg)](https://opencollective.com/biojulia/sponsor/7/website)
[![](https://opencollective.com/biojulia/sponsor/8/avatar.svg)](https://opencollective.com/biojulia/sponsor/8/website)
[![](https://opencollective.com/biojulia/sponsor/9/avatar.svg)](https://opencollective.com/biojulia/sponsor/9/website)
## Questions?
If you have a question about contributing or using BioJulia software, come on over and chat to us on [Gitter](https://gitter.im/BioJulia/General), or you can try the [Bio category of the Julia discourse site](https://discourse.julialang.org/c/domain/bio).

38
docs/src/man/api.md Normal file
View file

@ -0,0 +1,38 @@
```@meta
CurrentModule = XAM
DocTestSetup = quote
using XAM
end
```
# API Reference
## SAM API
### Public
```@autodocs
Modules = [XAM.SAM]
private = false
```
### Internal
```@autodocs
Modules = [XAM.SAM]
public = false
```
## BAM API
### Public
```@autodocs
Modules = [XAM.BAM]
private = false
```
### Internal
```@autodocs
Modules = [XAM.BAM]
public = false
```

304
docs/src/man/hts-files.md Normal file
View file

@ -0,0 +1,304 @@
# SAM and BAM
## Introduction
The `XAM` package offers high-performance tools for SAM and BAM file formats, which are the most popular file formats.
If you have questions about the SAM and BAM formats or any of the terminology used when discussing these formats, see the published [specification](https://samtools.github.io/hts-specs/SAMv1.pdf), which is maintained by the [samtools group](https://samtools.github.io/).
A very very simple SAM file looks like the following:
```
@HD VN:1.6 SO:coordinate
@SQ SN:ref LN:45
r001 99 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTG *
r002 0 ref 9 30 3S6M1P1I4M * 0 0 AAAAGATAAGGATA *
r003 0 ref 9 30 5S6M * 0 0 GCCTAAGCTAA * SA:Z:ref,29,-,6H5M,17,0;
r004 0 ref 16 30 6M14N5M * 0 0 ATAGCTTCAGC *
r003 2064 ref 29 17 6H5M * 0 0 TAGGC * SA:Z:ref,9,+,5S6M,30,1;
r001 147 ref 37 30 9M = 7 -39 CAGCGGCAT * NM:i:1
```
Where the first two lines are part of the "header", and the following lines are "records".
Each record describes how a read aligns to some reference sequence.
Sometimes one record describes one read, but there are other cases like chimeric reads and split alignments, where multiple records apply to one read.
In the example above, `r003` is a chimeric read, and `r004` is a split alignment, and `r001` are mate pair reads.
Again, we refer you to the official [specification](https://samtools.github.io/hts-specs/SAMv1.pdf) for more details.
A BAM file stores this same information but in a binary and compressible format that does not make for pretty printing here!
## Reading SAM and BAM files
A typical script iterating over all records in a file looks like below:
```julia
using XAM
# Open a BAM file.
reader = open(BAM.Reader, "data.bam")
# Iterate over BAM records.
for record in reader
# `record` is a BAM.Record object.
if BAM.ismapped(record)
# Print the mapped position.
println(BAM.refname(record), ':', BAM.position(record))
end
end
# Close the BAM file.
close(reader)
```
The size of a BAM file is often extremely large.
The iterator interface demonstrated above allocates an object for each record and that may be a bottleneck of reading data from a BAM file.
In-place reading reuses a pre-allocated object for every record and less memory allocation happens in reading:
```julia
reader = open(BAM.Reader, "data.bam")
record = BAM.Record()
while !eof(reader)
read!(reader, record)
# do something
end
```
## SAM and BAM Headers
Both `SAM.Reader` and `BAM.Reader` implement the `header` function, which returns a `SAM.Header` object.
To extract certain information out of the headers, you can use the `find` method on the header to extract information according to SAM/BAM tag.
Again we refer you to the [specification](https://samtools.github.io/hts-specs/SAMv1.pdf) for full details of all the different tags that can occur in headers, and what they mean.
Below is an example of extracting all the info about the reference sequences from the BAM header.
In SAM/BAM, any description of a reference sequence is stored in the header, under a tag denoted `SQ` (think `reference SeQuence`!).
```jlcon
julia> reader = open(SAM.Reader, "data.sam");
julia> find(header(reader), "SQ")
7-element Array{Bio.Align.SAM.MetaInfo,1}:
Bio.Align.SAM.MetaInfo:
tag: SQ
value: SN=Chr1 LN=30427671
Bio.Align.SAM.MetaInfo:
tag: SQ
value: SN=Chr2 LN=19698289
Bio.Align.SAM.MetaInfo:
tag: SQ
value: SN=Chr3 LN=23459830
Bio.Align.SAM.MetaInfo:
tag: SQ
value: SN=Chr4 LN=18585056
Bio.Align.SAM.MetaInfo:
tag: SQ
value: SN=Chr5 LN=26975502
Bio.Align.SAM.MetaInfo:
tag: SQ
value: SN=chloroplast LN=154478
Bio.Align.SAM.MetaInfo:
tag: SQ
value: SN=mitochondria LN=366924
```
In the above we can see there were 7 sequences in the reference: 5 chromosomes, one chloroplast sequence, and one mitochondrial sequence.
## SAM and BAM Records
### SAM.Record
The `XAM` package supports the following accessors for `SAM.Record` types.
```@docs
XAM.SAM.flag
XAM.SAM.ismapped
XAM.SAM.isprimary
XAM.SAM.refname
XAM.SAM.position
XAM.SAM.rightposition
XAM.SAM.isnextmapped
XAM.SAM.nextrefname
XAM.SAM.nextposition
XAM.SAM.mappingquality
XAM.SAM.cigar
XAM.SAM.alignment
XAM.SAM.alignlength
XAM.SAM.tempname
XAM.SAM.templength
XAM.SAM.sequence
XAM.SAM.seqlength
XAM.SAM.quality
XAM.SAM.auxdata
```
### BAM.Record
The `XAM` package supports the following accessors for `BAM.Record` types.
```@docs
XAM.BAM.flag
XAM.BAM.ismapped
XAM.BAM.isprimary
XAM.BAM.refid
XAM.BAM.refname
XAM.BAM.reflen
XAM.BAM.position
XAM.BAM.rightposition
XAM.BAM.isnextmapped
XAM.BAM.nextrefid
XAM.BAM.nextrefname
XAM.BAM.nextposition
XAM.BAM.mappingquality
XAM.BAM.cigar
XAM.BAM.alignment
XAM.BAM.alignlength
XAM.BAM.tempname
XAM.BAM.templength
XAM.BAM.sequence
XAM.BAM.seqlength
XAM.BAM.quality
XAM.BAM.auxdata
```
## Accessing auxiliary data
SAM and BAM records support the storing of optional data fields associated with tags.
Tagged auxiliary data follows a format of `TAG:TYPE:VALUE`.
`TAG` is a two-letter string, and each tag can only appear once per record.
`TYPE` is a single case-sensetive letter which defined the format of `VALUE`.
| Type | Description |
|------|-----------------------------------|
| 'A' | Printable character |
| 'i' | Signed integer |
| 'f' | Single-precision floating number |
| 'Z' | Printable string, including space |
| 'H' | Byte array in Hex format |
| 'B' | Integer of numeric array |
For more information about these tags and their types we refer you to the [SAM/BAM specification](https://samtools.github.io/hts-specs/SAMv1.pdf) and the additional [optional fields specification](https://samtools.github.io/hts-specs/SAMtags.pdf) document.
There are some tags that are reserved, predefined standard tags, for specific uses.
To access optional fields stored in tags, you use `getindex` indexing syntax on the record object.
Note that accessing optional tag fields will result in type instability in Julia.
This is because the type of the optional data is not known until run-time, as the tag is being read.
This can have a significant impact on performance.
To limit this, if the user knows the type of a value in advance, specifying it as a type annotation will alleviate the problem:
Below is an example of looping over records in a bam file and using indexing syntax to get the data stored in the "NM" tag.
Note the `UInt8` type assertion to alleviate type instability.
```julia
for record in open(BAM.Reader, "data.bam")
nm = record["NM"]::UInt8
# do something
end
```
## Getting records in a range
The `XAM` package supports the BAI index to fetch records in a specific range from a BAM file.
[Samtools](https://samtools.github.io/) provides `index` subcommand to create an index file (.bai) from a sorted BAM file.
```console
$ samtools index -b SRR1238088.sort.bam
$ ls SRR1238088.sort.bam*
SRR1238088.sort.bam SRR1238088.sort.bam.bai
```
The method `eachoverlap(reader, chrom, range)` returns an iterator of BAM records overlapping the query interval:
```julia
reader = open(BAM.Reader, "SRR1238088.sort.bam", index="SRR1238088.sort.bam.bai")
for record in eachoverlap(reader, "Chr2", 10000:11000)
# `record` is a BAM.Record object
# ...
end
close(reader)
```
## Getting records overlapping genomic features
The `eachoverlap` method also accepts the `Interval` type defined in [GenomicFeatures.jl](https://github.com/BioJulia/GenomicFeatures.jl).
This allows you to do things like first read in the genomic features from a GFF3 file, and then for each feature, iterate over all the BAM records that overlap with that feature.
```julia
using GenomicFeatures
using GFF3
using XAM
# Load genomic features from a GFF3 file.
features = open(collect, GFF3.Reader, "TAIR10_GFF3_genes.gff")
# Keep mRNA features.
filter!(x -> GFF3.featuretype(x) == "mRNA", features)
# Open a BAM file and iterate over records overlapping mRNA transcripts.
reader = open(BAM.Reader, "SRR1238088.sort.bam", index = "SRR1238088.sort.bam.bai")
for feature in features
for record in eachoverlap(reader, feature)
# `record` overlaps `feature`.
# ...
end
end
close(reader)
```
## Writing files
In order to write a BAM or SAM file, you must first create a `SAM.Header`.
A `SAM.Header` is constructed from a vector of `SAM.MetaInfo` objects.
For example, to create the following simple header:
```
@HD VN:1.6 SO:coordinate
@SQ SN:ref LN:45
```
```julia
julia> a = SAM.MetaInfo("HD", ["VN" => 1.6, "SO" => "coordinate"])
SAM.MetaInfo:
tag: HD
value: VN=1.6 SO=coordinate
julia> b = SAM.MetaInfo("SQ", ["SN" => "ref", "LN" => 45])
SAM.MetaInfo:
tag: SQ
value: SN=ref LN=45
julia> h = SAM.Header([a, b])
SAM.Header(SAM.MetaInfo[SAM.MetaInfo:
tag: HD
value: VN=1.6 SO=coordinate, SAM.MetaInfo:
tag: SQ
value: SN=ref LN=45])
```
Then to create the writer for a SAM file, construct a `SAM.Writer` using the header and an `IO` type:
```julia
julia> samw = SAM.Writer(open("my-data.sam", "w"), h)
SAM.Writer(IOStream(<file my-data.sam>))
```
To make a BAM Writer is slightly different, as you need to use a specific stream type from the (https://github.com/BioJulia/BGZFStreams.jl)[https://github.com/BioJulia/BGZFStreams.jl] package:
```julia
julia> using BGZFStreams
julia> bamw = BAM.Writer(BGZFStream(open("my-data.bam", "w"), "w"))
BAM.Writer(BGZFStreams.BGZFStream{IOStream}(<mode=write>))
```
Once you have a BAM or SAM writer, you can use the `write` method to write `BAM.Record`s or `SAM.Record`s to file:
```julia
julia> write(bamw, rec) # Here rec is a `BAM.Record`
330780
```

13
src/XAM.jl Normal file
View file

@ -0,0 +1,13 @@
module XAM
export
SAM,
BAM
include("sam/sam.jl")
include("bam/bam.jl")
using .SAM
using .BAM
end # module

153
src/bam/auxdata.jl Normal file
View file

@ -0,0 +1,153 @@
# BAM Auxiliary Data
# ==================
struct AuxData <: AbstractDict{String,Any}
data::Vector{UInt8}
end
function Base.getindex(aux::AuxData, tag::AbstractString)
checkauxtag(tag)
return getauxvalue(aux.data, 1, length(aux.data), UInt8(tag[1]), UInt8(tag[2]))
end
function Base.length(aux::AuxData)
data = aux.data
p = 1
len = 0
while p length(data)
len += 1
p = next_tag_position(data, p)
end
return len
end
function Base.iterate(aux::AuxData, pos=1)
if pos > length(aux.data)
return nothing
end
data = aux.data
@label doit
t1 = data[pos]
t2 = data[pos+1]
pos, typ = loadauxtype(data, pos + 2)
pos, value = loadauxvalue(data, pos, typ)
if t1 == t2 == 0xff
@goto doit
end
return Pair{String,Any}(String([t1, t2]), value), pos
end
# Internals
# ---------
function checkauxtag(tag::AbstractString)
if sizeof(tag) != 2
throw(ArgumentError("tag length must be 2"))
end
end
function getauxvalue(data::Vector{UInt8}, start::Int, stop::Int, t1::UInt8, t2::UInt8)
pos = findauxtag(data, start, stop, t1, t2)
if pos == 0
throw(KeyError(String([t1, t2])))
end
pos, T = loadauxtype(data, pos + 2)
_, val = loadauxvalue(data, pos, T)
return val
end
function loadauxtype(data::Vector{UInt8}, p::Int)
function auxtype(b)
return (
b == UInt8('A') ? Char :
b == UInt8('c') ? Int8 :
b == UInt8('C') ? UInt8 :
b == UInt8('s') ? Int16 :
b == UInt8('S') ? UInt16 :
b == UInt8('i') ? Int32 :
b == UInt8('I') ? UInt32 :
b == UInt8('f') ? Float32 :
b == UInt8('Z') ? String :
error("invalid type tag: '$(Char(b))'"))
end
t = data[p]
if t == UInt8('B')
return p + 2, Vector{auxtype(data[p+1])}
else
return p + 1, auxtype(t)
end
end
function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{T}) where T
return p + sizeof(T), unsafe_load(Ptr{T}(pointer(data, p)))
end
function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{Char})
return p + 1, Char(unsafe_load(pointer(data, p)))
end
function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{Vector{T}}) where T
n = unsafe_load(Ptr{Int32}(pointer(data, p)))
p += 4
xs = Vector{T}(undef, n)
unsafe_copyto!(pointer(xs), Ptr{T}(pointer(data, p)), n)
return p + n * sizeof(T), xs
end
function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{String})
dataptr = pointer(data, p)
endptr = ccall(:memchr, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), dataptr, '\0', length(data) - p + 1)
q::Int = p + (endptr - dataptr) - 1
return q + 2, String(data[p:q])
end
function findauxtag(data::Vector{UInt8}, start::Int, stop::Int, t1::UInt8, t2::UInt8)
pos = start
while pos stop && !(data[pos] == t1 && data[pos+1] == t2)
pos = next_tag_position(data, pos)
end
if pos > stop
return 0
else
return pos
end
end
# Find the starting position of a next tag in `data` after `p`.
# `(data[p], data[p+1])` is supposed to be a current tag.
function next_tag_position(data::Vector{UInt8}, p::Int)
typ = Char(data[p+2])
p += 3
if typ == 'A'
p += 1
elseif typ == 'c' || typ == 'C'
p += 1
elseif typ == 's' || typ == 'S'
p += 2
elseif typ == 'i' || typ == 'I'
p += 4
elseif typ == 'f'
p += 4
elseif typ == 'd'
p += 8
elseif typ == 'Z' || typ == 'H'
while data[p] != 0x00 # NULL-terminalted string
p += 1
end
p += 1
elseif typ == 'B'
eltyp = Char(data[p])
elsize = eltyp == 'c' || eltyp == 'C' ? 1 :
eltyp == 's' || eltyp == 'S' ? 2 :
eltyp == 'i' || eltye == 'I' || eltyp == 'f' ? 4 :
error("invalid type tag: '$(Char(eltyp))'")
p += 1
n = unsafe_load(Ptr{Int32}(pointer(data, p)))
p += 4 + elsize * n
else
error("invalid type tag: '$(Char(typ))'")
end
return p
end

55
src/bam/bai.jl Normal file
View file

@ -0,0 +1,55 @@
# BAI
# ===
#
# Index for BAM files.
# An index type for the BAM file format.
struct BAI
# BGZF file index
index::Indexes.BGZFIndex
# number of unmapped reads
n_no_coors::Union{Nothing, Int}
end
"""
BAI(filename::AbstractString)
Load a BAI index from `filename`.
"""
function BAI(filename::AbstractString)
return open(read_bai, filename)
end
"""
BAI(input::IO)
Load a BAI index from `input`.
"""
function BAI(input::IO)
return read_bai(input)
end
# Read a BAI object from `input`.
function read_bai(input::IO)
# check magic bytes
B = read(input, UInt8)
A = read(input, UInt8)
I = read(input, UInt8)
x = read(input, UInt8)
if B != UInt8('B') || A != UInt8('A') || I != UInt8('I') || x != 0x01
error("input is not a valid BAI file")
end
# read contents
n_refs = read(input, Int32)
index = Indexes.read_bgzfindex(input, n_refs)
if !eof(input)
n_no_coors = read(input, UInt64)
else
n_no_coors = nothing
end
return BAI(index, n_no_coors)
end

26
src/bam/bam.jl Normal file
View file

@ -0,0 +1,26 @@
# BAM File Format
# ===============
module BAM
using BioCore
using GenomicFeatures
using XAM.SAM
import BGZFStreams
import BioAlignments
import Indexes
import BioSequences
import BioCore: isfilled, header
import GenomicFeatures: eachoverlap
include("bai.jl")
include("auxdata.jl")
include("reader.jl")
include("record.jl")
include("writer.jl")
include("overlap.jl")
end

95
src/bam/overlap.jl Normal file
View file

@ -0,0 +1,95 @@
# BAM Overlap
# ===========
struct OverlapIterator{T}
reader::Reader{T}
refname::String
interval::UnitRange{Int}
end
function Base.IteratorSize(::Type{OverlapIterator{T}}) where T
return Base.SizeUnknown()
end
function Base.eltype(::Type{OverlapIterator{T}}) where T
return Record
end
function GenomicFeatures.eachoverlap(reader::Reader, interval::Interval)
return GenomicFeatures.eachoverlap(reader, interval.seqname, interval.first:interval.last)
end
function GenomicFeatures.eachoverlap(reader::Reader, interval)
return GenomicFeatures.eachoverlap(reader, convert(Interval, interval))
end
function GenomicFeatures.eachoverlap(reader::Reader, refname::AbstractString, interval::UnitRange)
return OverlapIterator(reader, String(refname), interval)
end
# Iterator
# --------
mutable struct OverlapIteratorState
# reference index
refindex::Int
# possibly overlapping chunks
chunks::Vector{Indexes.Chunk}
# current chunk index
chunkid::Int
# pre-allocated record
record::Record
end
function Base.iterate(iter::OverlapIterator)
refindex = findfirst(isequal(iter.refname), iter.reader.refseqnames)
if refindex == 0
throw(ArgumentError("sequence name $(iter.refname) is not found in the header"))
end
@assert iter.reader.index !== nothing
chunks = Indexes.overlapchunks(iter.reader.index.index, refindex, iter.interval)
if !isempty(chunks)
seek(iter.reader, first(chunks).start)
end
state = OverlapIteratorState(refindex, chunks, 1, Record())
return iterate(iter, state)
end
function Base.iterate(iter::OverlapIterator, state)
while state.chunkid lastindex(state.chunks)
chunk = state.chunks[state.chunkid]
while BGZFStreams.virtualoffset(iter.reader.stream) < chunk.stop
read!(iter.reader, state.record)
c = compare_intervals(state.record, (state.refindex, iter.interval))
if c == 0
return copy(state.record), state
elseif c > 0
# no more overlapping records in this chunk since records are sorted
break
end
end
state.chunkid += 1
if state.chunkid lastindex(state.chunks)
seek(iter.reader, state.chunks[state.chunkid].start)
end
end
return nothing
end
function compare_intervals(record::Record, interval::Tuple{Int,UnitRange{Int}})
rid = refid(record)
if rid < interval[1] || (rid == interval[1] && rightposition(record) < first(interval[2]))
# strictly left
return -1
elseif rid > interval[1] || (rid == interval[1] && position(record) > last(interval[2]))
# strictly right
return +1
else
# overlapping
return 0
end
end

141
src/bam/reader.jl Normal file
View file

@ -0,0 +1,141 @@
# BAM Reader
# ==========
"""
BAM.Reader(input::IO; index=nothing)
Create a data reader of the BAM file format.
# Arguments
* `input`: data source
* `index=nothing`: filepath to a random access index (currently *bai* is supported)
"""
mutable struct Reader{T} <: BioCore.IO.AbstractReader
stream::BGZFStreams.BGZFStream{T}
header::SAM.Header
start_offset::BGZFStreams.VirtualOffset
refseqnames::Vector{String}
refseqlens::Vector{Int}
index::Union{Nothing, BAI}
end
function Base.eltype(::Type{Reader{T}}) where T
return Record
end
function BioCore.IO.stream(reader::Reader)
return reader.stream
end
function Reader(input::IO; index=nothing)
if isa(index, AbstractString)
index = BAI(index)
else
if index != nothing
error("unrecognizable index argument")
end
end
reader = init_bam_reader(input)
reader.index = index
return reader
end
function Base.show(io::IO, reader::Reader)
println(io, summary(reader), ":")
print(io, " number of contigs: ", length(reader.refseqnames))
end
"""
header(reader::Reader; fillSQ::Bool=false)::SAM.Header
Get the header of `reader`.
If `fillSQ` is `true`, this function fills missing "SQ" metainfo in the header.
"""
function header(reader::Reader; fillSQ::Bool=false)::SAM.Header
header = reader.header
if fillSQ
if !isempty(findall(reader.header, "SQ"))
throw(ArgumentError("SAM header already has SQ records"))
end
header = copy(header)
for (name, len) in zip(reader.refseqnames, reader.refseqlens)
push!(header, SAM.MetaInfo("SQ", ["SN" => name, "LN" => len]))
end
end
return header
end
function Base.seek(reader::Reader, voffset::BGZFStreams.VirtualOffset)
seek(reader.stream, voffset)
end
function Base.seekstart(reader::Reader)
seek(reader.stream, reader.start_offset)
end
function Base.iterate(reader::Reader, rec=Record())
if eof(reader)
return nothing
end
read!(reader, rec)
return copy(rec), rec
end
# Initialize a BAM reader by reading the header section.
function init_bam_reader(input::BGZFStreams.BGZFStream)
# magic bytes
B = read(input, UInt8)
A = read(input, UInt8)
M = read(input, UInt8)
x = read(input, UInt8)
if B != UInt8('B') || A != UInt8('A') || M != UInt8('M') || x != 0x01
error("input was not a valid BAM file")
end
# SAM header
textlen = read(input, Int32)
samreader = SAM.Reader(IOBuffer(read(input, textlen)))
# reference sequences
refseqnames = String[]
refseqlens = Int[]
n_refs = read(input, Int32)
for _ in 1:n_refs
namelen = read(input, Int32)
data = read(input, namelen)
seqname = unsafe_string(pointer(data))
seqlen = read(input, Int32)
push!(refseqnames, seqname)
push!(refseqlens, seqlen)
end
voffset = isa(input.io, Base.AbstractPipe) ?
BGZFStreams.VirtualOffset(0, 0) :
BGZFStreams.virtualoffset(input)
return Reader(
input,
samreader.header,
voffset,
refseqnames,
refseqlens,
nothing)
end
function init_bam_reader(input::IO)
return init_bam_reader(BGZFStreams.BGZFStream(input))
end
function _read!(reader::Reader, record)
unsafe_read(
reader.stream,
pointer_from_objref(record),
FIXED_FIELDS_BYTES)
dsize = data_size(record)
if length(record.data) < dsize
resize!(record.data, dsize)
end
unsafe_read(reader.stream, pointer(record.data), dsize)
record.reader = reader
return record
end

634
src/bam/record.jl Normal file
View file

@ -0,0 +1,634 @@
# BAM Record
# ==========
"""
BAM.Record()
Create an unfilled BAM record.
"""
mutable struct Record
# fixed-length fields (see BMA specs for the details)
block_size::Int32
refid::Int32
pos::Int32
bin_mq_nl::UInt32
flag_nc::UInt32
l_seq::Int32
next_refid::Int32
next_pos::Int32
tlen::Int32
# variable length data
data::Vector{UInt8}
reader::Reader
function Record()
return new(0, 0, 0, 0, 0, 0, 0, 0, 0, UInt8[])
end
end
# the data size of fixed-length fields (block_size-tlen)
const FIXED_FIELDS_BYTES = 36
function Record(data::Vector{UInt8})
return convert(Record, data)
end
function Base.convert(::Type{Record}, data::Vector{UInt8})
length(data) < FIXED_FIELDS_BYTES && throw(ArgumentError("data too short"))
record = Record()
dst_pointer = Ptr{UInt8}(pointer_from_objref(record))
unsafe_copyto!(dst_pointer, pointer(data), FIXED_FIELDS_BYTES)
dsize = data_size(record)
resize!(record.data, dsize)
length(data) < dsize + FIXED_FIELDS_BYTES && throw(ArgumentError("data too short"))
unsafe_copyto!(record.data, 1, data, FIXED_FIELDS_BYTES + 1, dsize)
return record
end
function Base.copy(record::Record)
copy = Record()
copy.block_size = record.block_size
copy.refid = record.refid
copy.pos = record.pos
copy.bin_mq_nl = record.bin_mq_nl
copy.flag_nc = record.flag_nc
copy.l_seq = record.l_seq
copy.next_refid = record.next_refid
copy.next_pos = record.next_pos
copy.tlen = record.tlen
copy.data = record.data[1:data_size(record)]
if isdefined(record, :reader)
copy.reader = record.reader
end
return copy
end
function Base.show(io::IO, record::Record)
print(io, summary(record), ':')
if isfilled(record)
println(io)
println(io, " template name: ", tempname(record))
println(io, " flag: ", flag(record))
println(io, " reference ID: ", refid(record))
println(io, " position: ", position(record))
println(io, " mapping quality: ", mappingquality(record))
println(io, " CIGAR: ", cigar(record))
println(io, " next reference ID: ", nextrefid(record))
println(io, " next position: ", nextposition(record))
println(io, " template length: ", templength(record))
println(io, " sequence: ", sequence(record))
# TODO: pretty print base quality
println(io, " base quality: ", quality(record))
print(io, " auxiliary data:")
for field in keys(auxdata(record))
print(io, ' ', field, '=', record[field])
end
else
print(io, " <not filled>")
end
end
function Base.read!(reader::Reader, record::Record)
return _read!(reader, record)
end
# Accessor Fuctions
# -----------------
"""
flag(record::Record)::UInt16
Get the bitwise flag of `record`.
"""
function flag(record::Record)::UInt16
checkfilled(record)
return UInt16(record.flag_nc >> 16)
end
function hasflag(record::Record)
return isfilled(record)
end
"""
ismapped(record::Record)::Bool
Test if `record` is mapped.
"""
function ismapped(record::Record)::Bool
return flag(record) & SAM.FLAG_UNMAP == 0
end
"""
isprimary(record::Record)::Bool
Test if `record` is a primary line of the read.
This is equivalent to `flag(record) & 0x900 == 0`.
"""
function isprimary(record::Record)::Bool
return flag(record) & 0x900 == 0
end
"""
ispositivestrand(record::Record)::Bool
Test if `record` is aligned to the positive strand.
This is equivalent to `flag(record) & 0x10 == 0`.
"""
function ispositivestrand(record::Record)::Bool
flag(record) & 0x10 == 0
end
"""
refid(record::Record)::Int
Get the reference sequence ID of `record`.
The ID is 1-based (i.e. the first sequence is 1) and is 0 for a record without a mapping position.
See also: `BAM.rname`
"""
function refid(record::Record)::Int
checkfilled(record)
return record.refid + 1
end
function hasrefid(record::Record)
return isfilled(record)
end
function checked_refid(record::Record)
id = refid(record)
if id == 0
throw(ArgumentError("record is not mapped"))
elseif !isdefined(record, :reader)
throw(ArgumentError("reader is not defined"))
end
return id
end
"""
refname(record::Record)::String
Get the reference sequence name of `record`.
See also: `BAM.refid`
"""
function refname(record::Record)::String
checkfilled(record)
id = checked_refid(record)
return record.reader.refseqnames[id]
end
"""
reflen(record::Record)::Int
Get the length of the reference sequence this record applies to.
"""
function reflen(record::Record)::Int
checkfilled(record)
id = checked_refid(record)
return record.reader.refseqlens[id]
end
function hasrefname(record::Record)
return hasrefid(record)
end
"""
position(record::Record)::Int
Get the 1-based leftmost mapping position of `record`.
"""
function position(record::Record)::Int
checkfilled(record)
return record.pos + 1
end
function hasposition(record::Record)
return isfilled(record)
end
"""
rightposition(record::Record)::Int
Get the 1-based rightmost mapping position of `record`.
"""
function rightposition(record::Record)::Int
checkfilled(record)
return Int32(position(record) + alignlength(record) - 1)
end
function hasrightposition(record::Record)
return isfilled(record) && ismapped(record)
end
"""
isnextmapped(record::Record)::Bool
Test if the mate/next read of `record` is mapped.
"""
function isnextmapped(record::Record)::Bool
return isfilled(record) && (flag(record) & FLAG_MUNMAP == 0)
end
"""
nextrefid(record::Record)::Int
Get the next/mate reference sequence ID of `record`.
"""
function nextrefid(record::Record)::Int
checkfilled(record)
return record.next_refid + 1
end
function hasnextrefid(record::Record)
return isfilled(record)
end
"""
nextrefname(record::Record)::String
Get the reference name of the mate/next read of `record`.
"""
function nextrefname(record::Record)::String
checkfilled(record)
id = nextrefid(record)
if id == 0
throw(ArgumentError("next record is not mapped"))
elseif !isdefined(record, :reader)
throw(ArgumentError("reader is not defined"))
end
return record.reader.refseqnames[id]
end
function hasnextrefname(record::Record)
return isfilled(record) && isnextmapped(record)
end
"""
nextposition(record::Record)::Int
Get the 1-based leftmost mapping position of the next/mate read of `record`.
"""
function nextposition(record::Record)::Int
checkfilled(record)
return record.next_pos + 1
end
function hasnextposition(record::Record)
return isfilled(record)
end
"""
mappingquality(record::Record)::UInt8
Get the mapping quality of `record`.
"""
function mappingquality(record::Record)::UInt8
return UInt8((record.bin_mq_nl >> 8) & 0xff)
end
function hasmappingquality(record::Record)
return isfilled(record)
end
"""
n_cigar_op(record::Record, checkCG::Bool = true)
Return the number of operations in the CIGAR string of `record`.
Note that in the BAM specification, the field called `cigar` typically stores the cigar string of the record.
However, this is not always true, sometimes the true cigar is very long, and due to some constraints of the BAM format, the actual cigar string is stored in an extra tag: `CG:B,I`, and the `cigar` field stores a pseudo-cigar string.
Calling this method with `checkCG` set to `true` (default) this method will always yield the number of operations in the true cigar string, because this is probably what you want, the vast majority of the time.
If you have a record that stores the true cigar in a `CG:B,I` tag, but you still want to get the number of operations in the `cigar` field of the BAM record, then set `checkCG` to `false`.
"""
function n_cigar_op(record::Record, checkCG::Bool = true)
return cigar_position(record, checkCG)[2]
end
"""
cigar(record::Record)::String
Get the CIGAR string of `record`.
Note that in the BAM specification, the field called `cigar` typically stores the cigar string of the record.
However, this is not always true, sometimes the true cigar is very long, and due to some constraints of the BAM format, the actual cigar string is stored in an extra tag: `CG:B,I`, and the `cigar` field stores a pseudo-cigar string.
Calling this method with `checkCG` set to `true` (default) this method will always yield the true cigar string, because this is probably what you want the vast majority of the time.
If you have a record that stores the true cigar in a `CG:B,I` tag, but you still want to access the pseudo-cigar that is stored in the `cigar` field of the BAM record, then you can set checkCG to `false`.
See also `BAM.cigar_rle`.
"""
function cigar(record::Record, checkCG::Bool = true)::String
buf = IOBuffer()
for (op, len) in zip(cigar_rle(record, checkCG)...)
print(buf, len, convert(Char, op))
end
return String(take!(buf))
end
"""
cigar_rle(record::Record, checkCG::Bool = true)::Tuple{Vector{BioAlignments.Operation},Vector{Int}}
Get a run-length encoded tuple `(ops, lens)` of the CIGAR string in `record`.
Note that in the BAM specification, the field called `cigar` typically stores the cigar string of the record.
However, this is not always true, sometimes the true cigar is very long, and due to some constraints of the BAM format, the actual cigar string is stored in an extra tag: `CG:B,I`, and the `cigar` field stores a pseudo-cigar string.
Calling this method with `checkCG` set to `true` (default) this method will always yield the true cigar string, because this is probably what you want the vast majority of the time.
If you have a record that stores the true cigar in a `CG:B,I` tag, but you still want to access the pseudo-cigar that is stored in the `cigar` field of the BAM record, then you can set checkCG to `false`.
See also `BAM.cigar`.
"""
function cigar_rle(record::Record, checkCG::Bool = true)::Tuple{Vector{BioAlignments.Operation},Vector{Int}}
checkfilled(record)
idx, nops = cigar_position(record, checkCG)
ops, lens = extract_cigar_rle(record.data, idx, nops)
return ops, lens
end
function extract_cigar_rle(data::Vector{UInt8}, offset, n)
ops = Vector{BioAlignments.Operation}()
lens = Vector{Int}()
for i in offset:4:offset + (n - 1) * 4
x = unsafe_load(Ptr{UInt32}(pointer(data, i)))
op = BioAlignments.Operation(x & 0x0F)
push!(ops, op)
push!(lens, x >> 4)
end
return ops, lens
end
function cigar_position(record::Record, checkCG::Bool = true)::Tuple{Int, Int}
cigaridx, nops = seqname_length(record) + 1, record.flag_nc & 0xFFFF
if !checkCG
return cigaridx, nops
end
if nops != 2
return cigaridx, nops
end
x = unsafe_load(Ptr{UInt32}(pointer(record.data, cigaridx)))
if x != UInt32(seqlength(record) << 4 | 4)
return cigaridx, nops
end
start = auxdata_position(record)
stop = data_size(record)
tagidx = findauxtag(record.data, start, stop, UInt8('C'), UInt8('G'))
if tagidx == 0
return cigaridx, nops
end
# Tag exists, validate type is BI.
typ = unsafe_load(Ptr{UInt16}(pointer(record.data, tagidx += 2)))
if typ != (UInt16('I') << 8 | UInt16('B'))
return cigaridx, nops
end
# If got this far, the CG tag is valid and contains the cigar.
# Get the true n_cigar_ops, and return it and the idx of the first
nops = UInt32(unsafe_load(Ptr{Int32}(pointer(record.data, tagidx += 2))))
tagidx += 4
return tagidx, nops
end
"""
alignment(record::Record)::BioAlignments.Alignment
Get the alignment of `record`.
"""
function alignment(record::Record)::BioAlignments.Alignment
checkfilled(record)
if !ismapped(record)
return BioAlignments.Alignment(BioAlignments.AlignmentAnchor[])
end
seqpos = 0
refpos = position(record) - 1
anchors = [BioAlignments.AlignmentAnchor(seqpos, refpos, BioAlignments.OP_START)]
for (op, len) in zip(cigar_rle(record)...)
if BioAlignments.ismatchop(op)
seqpos += len
refpos += len
elseif BioAlignments.isinsertop(op)
seqpos += len
elseif BioAlignments.isdeleteop(op)
refpos += len
else
error("operation $(op) is not supported")
end
push!(anchors, BioAlignments.AlignmentAnchor(seqpos, refpos, op))
end
return BioAlignments.Alignment(anchors)
end
function hasalignment(record::Record)
return ismapped(record)
end
"""
alignlength(record::Record)::Int
Get the alignment length of `record`.
"""
function alignlength(record::Record)::Int
offset = seqname_length(record)
length::Int = 0
for i in offset + 1:4:offset + n_cigar_op(record, false) * 4
x = unsafe_load(Ptr{UInt32}(pointer(record.data, i)))
op = BioAlignments.Operation(x & 0x0F)
if BioAlignments.ismatchop(op) || BioAlignments.isdeleteop(op)
length += x >> 4
end
end
return length
end
"""
tempname(record::Record)::String
Get the query template name of `record`.
"""
function tempname(record::Record)::String
checkfilled(record)
# drop the last NUL character
return unsafe_string(pointer(record.data), max(seqname_length(record) - 1, 0))
end
function hastempname(record::Record)
return isfilled(record)
end
"""
templength(record::Record)::Int
Get the template length of `record`.
"""
function templength(record::Record)::Int
checkfilled(record)
return record.tlen
end
function hastemplength(record::Record)
return isfilled(record)
end
"""
sequence(record::Record)::BioSequences.LongDNASeq
Get the segment sequence of `record`.
"""
function sequence(record::Record)::BioSequences.LongDNASeq
checkfilled(record)
seqlen = seqlength(record)
data = Vector{UInt64}(undef, cld(seqlen, 16))
src::Ptr{UInt64} = pointer(record.data, seqname_length(record) + n_cigar_op(record, false) * 4 + 1)
for i in 1:lastindex(data)
# copy data flipping high and low nybble
x = unsafe_load(src, i)
data[i] = (x & 0x0f0f0f0f0f0f0f0f) << 4 | (x & 0xf0f0f0f0f0f0f0f0) >> 4
end
return BioSequences.LongDNASeq(data, 1:seqlen, false)
end
function hassequence(record::Record)
return isfilled(record)
end
"""
seqlength(record::Record)::Int
Get the sequence length of `record`.
"""
function seqlength(record::Record)::Int
checkfilled(record)
return record.l_seq % Int
end
function hasseqlength(record::Record)
return isfilled(record)
end
"""
quality(record::Record)::Vector{UInt8}
Get the base quality of `record`.
"""
function quality(record::Record)::Vector{UInt8}
checkfilled(record)
seqlen = seqlength(record)
offset = seqname_length(record) + n_cigar_op(record, false) * 4 + cld(seqlen, 2)
return [reinterpret(Int8, record.data[i+offset]) for i in 1:seqlen]
end
function hasquality(record::Record)
return isfilled(record)
end
"""
auxdata(record::Record)::BAM.AuxData
Get the auxiliary data of `record`.
"""
function auxdata(record::Record)
checkfilled(record)
return AuxData(record.data[auxdata_position(record):data_size(record)])
end
function hasauxdata(record::Record)
return isfilled(record)
end
function Base.getindex(record::Record, tag::AbstractString)
checkauxtag(tag)
start = auxdata_position(record)
stop = data_size(record)
return getauxvalue(record.data, start, stop, UInt8(tag[1]), UInt8(tag[2]))
end
function Base.haskey(record::Record, tag::AbstractString)
checkauxtag(tag)
start = auxdata_position(record)
stop = data_size(record)
return findauxtag(record.data, start, stop, UInt8(tag[1]), UInt8(tag[2])) > 0
end
function Base.keys(record::Record)
return collect(keys(auxdata(record)))
end
function Base.values(record::Record)
return [record[key] for key in keys(record)]
end
# BioCore Methods
# -----------
function BioCore.isfilled(record::Record)
return record.block_size != 0
end
function BioCore.seqname(record::Record)
return tempname(record)
end
function BioCore.hasseqname(record::Record)
return hastempname(record)
end
function BioCore.sequence(record::Record)
return sequence(record)
end
function BioCore.hassequence(record::Record)
return hassequence(record)
end
function BioCore.leftposition(record::Record)
return position(record)
end
function BioCore.hasleftposition(record::Record)
return hasposition(record)
end
function BioCore.rightposition(record::Record)
return rightposition(record)
end
function BioCore.hasrightposition(record::Record)
return hasrightposition(record)
end
# Helper Functions
# ----------------
# Return the size of the `.data` field.
function data_size(record::Record)
if isfilled(record)
return record.block_size - FIXED_FIELDS_BYTES + sizeof(record.block_size)
else
return 0
end
end
function checkfilled(record::Record)
if !isfilled(record)
throw(ArgumentError("unfilled BAM record"))
end
end
function auxdata_position(record::Record)
seqlen = seqlength(record)
return seqname_length(record) + n_cigar_op(record, false) * 4 + cld(seqlen, 2) + seqlen + 1
end
# Return the length of the read name.
function seqname_length(record::Record)
return record.bin_mq_nl & 0xff
end

62
src/bam/writer.jl Normal file
View file

@ -0,0 +1,62 @@
# BAM Writer
# ==========
"""
BAM.Writer(output::BGZFStream, header::SAM.Header)
Create a data writer of the BAM file format.
# Arguments
* `output`: data sink
* `header`: SAM header object
"""
mutable struct Writer <: BioCore.IO.AbstractWriter
stream::BGZFStreams.BGZFStream
end
function Writer(stream::BGZFStreams.BGZFStream, header::SAM.Header)
refseqnames = String[]
refseqlens = Int[]
for metainfo in findall(header, "SQ")
push!(refseqnames, metainfo["SN"])
push!(refseqlens, parse(Int, metainfo["LN"]))
end
write_header(stream, header, refseqnames, refseqlens)
return Writer(stream)
end
function BioCore.IO.stream(writer::Writer)
return writer.stream
end
function Base.write(writer::Writer, record::Record)
n = 0
n += unsafe_write(writer.stream, pointer_from_objref(record), FIXED_FIELDS_BYTES)
n += unsafe_write(writer.stream, pointer(record.data), data_size(record))
return n
end
function write_header(stream, header, refseqnames, refseqlens)
@assert length(refseqnames) == length(refseqlens)
n = 0
# magic bytes
n += write(stream, "BAM\1")
# SAM header
buf = IOBuffer()
l = write(SAM.Writer(buf), header)
n += write(stream, Int32(l))
n += write(stream, take!(buf))
# reference sequences
n += write(stream, Int32(length(refseqnames)))
for (seqname, seqlen) in zip(refseqnames, refseqlens)
namelen = length(seqname)
n += write(stream, Int32(namelen + 1))
n += write(stream, seqname, '\0')
n += write(stream, Int32(seqlen))
end
return n
end

26
src/sam/flags.jl Normal file
View file

@ -0,0 +1,26 @@
# SAM Flags
# =========
#
# Bitwise flags (or FLAG).
for (name, bits, doc) in [
(:PAIRED, UInt16(0x001), "the read is paired in sequencing, no matter whether it is mapped in a pair"),
(:PROPER_PAIR, UInt16(0x002), "the read is mapped in a proper pair" ),
(:UNMAP, UInt16(0x004), "the read itself is unmapped; conflictive with SAM.FLAG_PROPER_PAIR" ),
(:MUNMAP, UInt16(0x008), "the mate is unmapped" ),
(:REVERSE, UInt16(0x010), "the read is mapped to the reverse strand" ),
(:MREVERSE, UInt16(0x020), "the mate is mapped to the reverse strand" ),
(:READ1, UInt16(0x040), "this is read1" ),
(:READ2, UInt16(0x080), "this is read2" ),
(:SECONDARY, UInt16(0x100), "not primary alignment" ),
(:QCFAIL, UInt16(0x200), "QC failure" ),
(:DUP, UInt16(0x400), "optical or PCR duplicate" ),
(:SUPPLEMENTARY, UInt16(0x800), "supplementary alignment" ),]
@assert bits isa UInt16
sym = Symbol("FLAG_", name)
doc = string(@sprintf("0x%04x: ", bits), doc)
@eval begin
@doc $(doc) const $(sym) = $(bits)
end
end

53
src/sam/header.jl Normal file
View file

@ -0,0 +1,53 @@
# SAM Header
# ==========
struct Header
metainfo::Vector{MetaInfo}
end
"""
SAM.Header()
Create an empty header.
"""
function Header()
return Header(MetaInfo[])
end
function Base.copy(header::Header)
return Header(header.metainfo)
end
function Base.eltype(::Type{Header})
return MetaInfo
end
function Base.length(header::Header)
return length(header.metainfo)
end
function Base.iterate(header::Header, i=1)
if i > length(header.metainfo)
return nothing
end
return header.metainfo[i], i + 1
end
"""
find(header::Header, key::AbstractString)::Vector{MetaInfo}
Find metainfo objects satisfying `SAM.tag(metainfo) == key`.
"""
function Base.findall(header::Header, key::AbstractString)::Vector{MetaInfo}
return filter(m -> isequalkey(m, key), header.metainfo)
end
function Base.pushfirst!(header::Header, metainfo::MetaInfo)
pushfirst!(header.metainfo, metainfo)
return header
end
function Base.push!(header::Header, metainfo::MetaInfo)
push!(header.metainfo, metainfo)
return header
end

235
src/sam/metainfo.jl Normal file
View file

@ -0,0 +1,235 @@
# SAM Meta-Information
# ====================
mutable struct MetaInfo
# data and filled range
data::Vector{UInt8}
filled::UnitRange{Int}
# indexes
tag::UnitRange{Int}
val::UnitRange{Int}
dictkey::Vector{UnitRange{Int}}
dictval::Vector{UnitRange{Int}}
end
function MetaInfo(data::Vector{UInt8}=UInt8[])
metainfo = MetaInfo(data, 1:0, 1:0, 1:0, UnitRange{Int}[], UnitRange{Int}[])
if !isempty(data)
index!(metainfo)
end
return metainfo
end
"""
MetaInfo(str::AbstractString)
Create a SAM metainfo from `str`.
# Examples
julia> SAM.MetaInfo("@CO\tsome comment")
BioAlignments.SAM.MetaInfo:
tag: CO
value: some comment
julia> SAM.MetaInfo("@SQ\tSN:chr1\tLN:12345")
BioAlignments.SAM.MetaInfo:
tag: SQ
value: SN=chr1 LN=12345
"""
function MetaInfo(str::AbstractString)
return MetaInfo(Vector{UInt8}(str))
end
"""
MetaInfo(tag::AbstractString, value)
Create a SAM metainfo with `tag` and `value`.
`tag` is a two-byte ASCII string. If `tag` is `"CO"`, `value` must be a string; otherwise, `value` is an iterable object with key and value pairs.
# Examples
julia> SAM.MetaInfo("CO", "some comment")
BioAlignments.SAM.MetaInfo:
tag: CO
value: some comment
julia> string(ans)
"@CO\tsome comment"
julia> SAM.MetaInfo("SQ", ["SN" => "chr1", "LN" => 12345])
BioAlignments.SAM.MetaInfo:
tag: SQ
value: SN=chr1 LN=12345
julia> string(ans)
"@SQ\tSN:chr1\tLN:12345"
"""
function MetaInfo(tag::AbstractString, value)
buf = IOBuffer()
if tag == "CO" # comment
if !isa(value, AbstractString)
throw(ArgumentError("value must be a string"))
end
write(buf, "@CO\t", value)
elseif occursin(r"[A-Z][A-Z]", tag)
print(buf, '@', tag)
for (key, val) in value
print(buf, '\t', key, ':', val)
end
else
throw(ArgumentError("tag must match r\"[A-Z][A-Z]\""))
end
return MetaInfo(take!(buf))
end
function initialize!(metainfo::MetaInfo)
metainfo.filled = 1:0
metainfo.tag = 1:0
metainfo.val = 1:0
empty!(metainfo.dictkey)
empty!(metainfo.dictval)
return metainfo
end
function isfilled(metainfo::MetaInfo)
return !isempty(metainfo.filled)
end
function datarange(metainfo::MetaInfo)
return metainfo.filled
end
function checkfilled(metainfo::MetaInfo)
if !isfilled(metainfo)
throw(ArgumentError("unfilled SAM metainfo"))
end
end
function isequalkey(metainfo::MetaInfo, key::AbstractString)
if !isfilled(metainfo) || sizeof(key) != 2
return false
end
k1, k2 = UInt8(key[1]), UInt8(key[2])
return metainfo.data[metainfo.tag[1]] == k1 && metainfo.data[metainfo.tag[2]] == k2
end
function Base.show(io::IO, metainfo::MetaInfo)
print(io, summary(metainfo), ':')
if isfilled(metainfo)
println(io)
println(io, " tag: ", tag(metainfo))
print(io, " value:")
if !iscomment(metainfo)
for (key, val) in zip(keys(metainfo), values(metainfo))
print(io, ' ', key, '=', val)
end
else
print(io, ' ', value(metainfo))
end
else
print(io, " <not filled>")
end
end
function Base.print(io::IO, metainfo::MetaInfo)
write(io, metainfo)
return nothing
end
function Base.write(io::IO, metainfo::MetaInfo)
checkfilled(metainfo)
r = datarange(metainfo)
return unsafe_write(io, pointer(metainfo.data, first(r)), length(r))
end
# Accessor Functions
# ------------------
"""
iscomment(metainfo::MetaInfo)::Bool
Test if `metainfo` is a comment (i.e. its tag is "CO").
"""
function iscomment(metainfo::MetaInfo)::Bool
return isequalkey(metainfo, "CO")
end
"""
tag(metainfo::MetaInfo)::String
Get the tag of `metainfo`.
"""
function tag(metainfo::MetaInfo)::String
checkfilled(metainfo)
return String(metainfo.data[metainfo.tag])
end
"""
value(metainfo::MetaInfo)::String
Get the value of `metainfo` as a string.
"""
function value(metainfo::MetaInfo)::String
checkfilled(metainfo)
return String(metainfo.data[metainfo.val])
end
"""
keyvalues(metainfo::MetaInfo)::Vector{Pair{String,String}}
Get the values of `metainfo` as string pairs.
"""
function keyvalues(metainfo::MetaInfo)::Vector{Pair{String,String}}
checkfilled(metainfo)
if iscomment(metainfo)
throw(ArgumentError("not a dictionary"))
end
return Pair{String,String}[key => val for (key, val) in zip(keys(metainfo), values(metainfo))]
end
function Base.keys(metainfo::MetaInfo)
checkfilled(metainfo)
if iscomment(metainfo)
throw(ArgumentError("not a dictionary"))
end
return [String(metainfo.data[r]) for r in metainfo.dictkey]
end
function Base.values(metainfo::MetaInfo)
checkfilled(metainfo)
if iscomment(metainfo)
throw(ArgumentError("not a dictionary"))
end
return [String(metainfo.data[r]) for r in metainfo.dictval]
end
function Base.haskey(metainfo::MetaInfo, key::AbstractString)
return findkey(metainfo, key) > 0
end
function Base.getindex(metainfo::MetaInfo, key::AbstractString)
i = findkey(metainfo, key)
if i == 0
throw(KeyError(key))
end
return String(metainfo.data[metainfo.dictval[i]])
end
function findkey(metainfo::MetaInfo, key::AbstractString)
checkfilled(metainfo)
if sizeof(key) != 2
return 0
end
t1, t2 = UInt8(key[1]), UInt8(key[2])
for (i, k) in enumerate(metainfo.dictkey)
if metainfo.data[first(k)] == t1 && metainfo.data[first(k)+1] == t2
return i
end
end
return 0
end

261
src/sam/reader.jl Normal file
View file

@ -0,0 +1,261 @@
# SAM Reader
# =========
mutable struct Reader <: BioCore.IO.AbstractReader
state::BioCore.Ragel.State
header::Header
function Reader(input::BufferedStreams.BufferedInputStream)
reader = new(BioCore.Ragel.State(sam_header_machine.start_state, input), Header())
readheader!(reader)
reader.state.cs = sam_body_machine.start_state
return reader
end
end
"""
SAM.Reader(input::IO)
Create a data reader of the SAM file format.
# Arguments
* `input`: data source
"""
function Reader(input::IO)
return Reader(BufferedStreams.BufferedInputStream(input))
end
function BioCore.IO.stream(reader::Reader)
return reader.state.stream
end
"""
header(reader::Reader)::Header
Get the header of `reader`.
"""
function header(reader::Reader)::Header
return reader.header
end
function Base.eltype(::Type{Reader})
return Record
end
# file = header . body
# header = metainfo*
# body = record*
isinteractive() && info("compiling SAM")
const sam_metainfo_machine, sam_record_machine, sam_header_machine, sam_body_machine = (function ()
cat = Automa.RegExp.cat
rep = Automa.RegExp.rep
alt = Automa.RegExp.alt
opt = Automa.RegExp.opt
any = Automa.RegExp.any
metainfo = let
tag = re"[A-Z][A-Z]" \ cat("CO")
tag.actions[:enter] = [:mark1]
tag.actions[:exit] = [:metainfo_tag]
dict = let
key = re"[A-Za-z][A-Za-z0-9]"
key.actions[:enter] = [:mark2]
key.actions[:exit] = [:metainfo_dict_key]
val = re"[ -~]+"
val.actions[:enter] = [:mark2]
val.actions[:exit] = [:metainfo_dict_val]
keyval = cat(key, ':', val)
cat(keyval, rep(cat('\t', keyval)))
end
dict.actions[:enter] = [:mark1]
dict.actions[:exit] = [:metainfo_val]
co = cat("CO")
co.actions[:enter] = [:mark1]
co.actions[:exit] = [:metainfo_tag]
comment = re"[^\r\n]*"
comment.actions[:enter] = [:mark1]
comment.actions[:exit] = [:metainfo_val]
cat('@', alt(cat(tag, '\t', dict), cat(co, '\t', comment)))
end
metainfo.actions[:enter] = [:anchor]
metainfo.actions[:exit] = [:metainfo]
record = let
qname = re"[!-?A-~]+"
qname.actions[:enter] = [:mark]
qname.actions[:exit] = [:record_qname]
flag = re"[0-9]+"
flag.actions[:enter] = [:mark]
flag.actions[:exit] = [:record_flag]
rname = re"\*|[!-()+-<>-~][!-~]*"
rname.actions[:enter] = [:mark]
rname.actions[:exit] = [:record_rname]
pos = re"[0-9]+"
pos.actions[:enter] = [:mark]
pos.actions[:exit] = [:record_pos]
mapq = re"[0-9]+"
mapq.actions[:enter] = [:mark]
mapq.actions[:exit] = [:record_mapq]
cigar = re"\*|([0-9]+[MIDNSHPX=])+"
cigar.actions[:enter] = [:mark]
cigar.actions[:exit] = [:record_cigar]
rnext = re"\*|=|[!-()+-<>-~][!-~]*"
rnext.actions[:enter] = [:mark]
rnext.actions[:exit] = [:record_rnext]
pnext = re"[0-9]+"
pnext.actions[:enter] = [:mark]
pnext.actions[:exit] = [:record_pnext]
tlen = re"[-+]?[0-9]+"
tlen.actions[:enter] = [:mark]
tlen.actions[:exit] = [:record_tlen]
seq = re"\*|[A-Za-z=.]+"
seq.actions[:enter] = [:mark]
seq.actions[:exit] = [:record_seq]
qual = re"[!-~]+"
qual.actions[:enter] = [:mark]
qual.actions[:exit] = [:record_qual]
field = let
tag = re"[A-Za-z][A-Za-z0-9]"
val = alt(
re"A:[!-~]",
re"i:[-+]?[0-9]+",
re"f:[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?",
re"Z:[ !-~]*",
re"H:([0-9A-F][0-9A-F])*",
re"B:[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+")
cat(tag, ':', val)
end
field.actions[:enter] = [:mark]
field.actions[:exit] = [:record_field]
cat(
qname, '\t',
flag, '\t',
rname, '\t',
pos, '\t',
mapq, '\t',
cigar, '\t',
rnext, '\t',
pnext, '\t',
tlen, '\t',
seq, '\t',
qual,
rep(cat('\t', field)))
end
record.actions[:enter] = [:anchor]
record.actions[:exit] = [:record]
newline = let
lf = re"\n"
lf.actions[:enter] = [:countline]
cat(re"\r?", lf)
end
header = rep(cat(metainfo, newline))
header.actions[:exit] = [:header]
header = cat(header, opt(any() \ cat('@'))) # look ahead
body = rep(cat(record, newline))
return map(Automa.compile, (metainfo, record, header, body))
end)()
const sam_metainfo_actions = Dict(
:metainfo_tag => :(record.tag = (mark1:p-1) .- offset),
:metainfo_val => :(record.val = (mark1:p-1) .- offset),
:metainfo_dict_key => :(push!(record.dictkey, (mark2:p-1) .- offset)),
:metainfo_dict_val => :(push!(record.dictval, (mark2:p-1) .- offset)),
:metainfo => quote
BioCore.ReaderHelper.resize_and_copy!(record.data, data, offset+1:p-1)
record.filled = (offset+1:p-1) .- offset
end,
:anchor => :(),
:mark1 => :(mark1 = p),
:mark2 => :(mark2 = p))
eval(
BioCore.ReaderHelper.generate_index_function(
MetaInfo,
sam_metainfo_machine,
:(mark1 = mark2 = offset = 0),
sam_metainfo_actions))
eval(
BioCore.ReaderHelper.generate_readheader_function(
Reader,
MetaInfo,
sam_header_machine,
:(mark1 = mark2 = offset = 0),
merge(sam_metainfo_actions, Dict(
:metainfo => quote
BioCore.ReaderHelper.resize_and_copy!(record.data, data, BioCore.ReaderHelper.upanchor!(stream):p-1)
record.filled = (offset+1:p-1) .- offset
@assert isfilled(record)
push!(reader.header.metainfo, record)
BioCore.ReaderHelper.ensure_margin!(stream)
record = MetaInfo()
end,
:header => :(finish_header = true; @escape),
:countline => :(linenum += 1),
:anchor => :(BioCore.ReaderHelper.anchor!(stream, p); offset = p - 1))),
quote
if !eof(stream)
stream.position -= 1 # cancel look-ahead
end
end))
const sam_record_actions = Dict(
:record_qname => :(record.qname = (mark:p-1) .- offset),
:record_flag => :(record.flag = (mark:p-1) .- offset),
:record_rname => :(record.rname = (mark:p-1) .- offset),
:record_pos => :(record.pos = (mark:p-1) .- offset),
:record_mapq => :(record.mapq = (mark:p-1) .- offset),
:record_cigar => :(record.cigar = (mark:p-1) .- offset),
:record_rnext => :(record.rnext = (mark:p-1) .- offset),
:record_pnext => :(record.pnext = (mark:p-1) .- offset),
:record_tlen => :(record.tlen = (mark:p-1) .- offset),
:record_seq => :(record.seq = (mark:p-1) .- offset),
:record_qual => :(record.qual = (mark:p-1) .- offset),
:record_field => :(push!(record.fields, (mark:p-1) .- offset)),
:record => quote
BioCore.ReaderHelper.resize_and_copy!(record.data, data, 1:p-1)
record.filled = (offset+1:p-1) .- offset
end,
:anchor => :(),
:mark => :(mark = p))
eval(
BioCore.ReaderHelper.generate_index_function(
Record,
sam_record_machine,
:(mark = offset = 0),
sam_record_actions))
eval(
BioCore.ReaderHelper.generate_read_function(
Reader,
sam_body_machine,
:(mark = offset = 0),
merge(sam_record_actions, Dict(
:record => quote
BioCore.ReaderHelper.resize_and_copy!(record.data, data, BioCore.ReaderHelper.upanchor!(stream):p-1)
record.filled = (offset+1:p-1) .- offset
found_record = true
@escape
end,
:countline => :(linenum += 1),
:anchor => :(BioCore.ReaderHelper.anchor!(stream, p); offset = p - 1)))))

624
src/sam/record.jl Normal file
View file

@ -0,0 +1,624 @@
# SAM Record
# ==========
mutable struct Record
# data and filled range
data::Vector{UInt8}
filled::UnitRange{Int}
# indexes
qname::UnitRange{Int}
flag::UnitRange{Int}
rname::UnitRange{Int}
pos::UnitRange{Int}
mapq::UnitRange{Int}
cigar::UnitRange{Int}
rnext::UnitRange{Int}
pnext::UnitRange{Int}
tlen::UnitRange{Int}
seq::UnitRange{Int}
qual::UnitRange{Int}
fields::Vector{UnitRange{Int}}
end
"""
SAM.Record()
Create an unfilled SAM record.
"""
function Record()
return Record(
UInt8[], 1:0,
# qname-mapq
1:0, 1:0, 1:0, 1:0, 1:0,
# cigar-seq
1:0, 1:0, 1:0, 1:0, 1:0,
# qual and fields
1:0, UnitRange{Int}[])
end
"""
SAM.Record(data::Vector{UInt8})
Create a SAM record from `data`.
This function verifies the format and indexes fields for accessors.
Note that the ownership of `data` is transferred to a new record object.
"""
function Record(data::Vector{UInt8})
return convert(Record, data)
end
function Base.convert(::Type{Record}, data::Vector{UInt8})
record = Record(
data, 1:0,
# qname-mapq
1:0, 1:0, 1:0, 1:0, 1:0,
# cigar-seq
1:0, 1:0, 1:0, 1:0, 1:0,
# qual and fields
1:0, UnitRange{Int}[])
index!(record)
return record
end
"""
SAM.Record(str::AbstractString)
Create a SAM record from `str`.
This function verifies the format and indexes fields for accessors.
"""
function Record(str::AbstractString)
return convert(Record, str)
end
function Base.convert(::Type{Record}, str::AbstractString)
return Record(Vector{UInt8}(str))
end
function Base.show(io::IO, record::Record)
print(io, summary(record), ':')
if isfilled(record)
println(io)
println(io, " template name: ", hastempname(record) ? tempname(record) : "<missing>")
println(io, " flag: ", hasflag(record) ? flag(record) : "<missing>")
println(io, " reference: ", hasrefname(record) ? refname(record) : "<missing>")
println(io, " position: ", hasposition(record) ? position(record) : "<missing>")
println(io, " mapping quality: ", hasmappingquality(record) ? mappingquality(record) : "<missing>")
println(io, " CIGAR: ", hascigar(record) ? cigar(record) : "<missing>")
println(io, " next reference: ", hasnextrefname(record) ? nextrefname(record) : "<missing>")
println(io, " next position: ", hasnextposition(record) ? nextposition(record) : "<missing>")
println(io, " template length: ", hastemplength(record) ? templength(record) : "<missing>")
println(io, " sequence: ", hassequence(record) ? sequence(String, record) : "<missing>")
println(io, " base quality: ", hasquality(record) ? quality(String, record) : "<missing>")
print(io, " auxiliary data:")
for field in record.fields
print(io, ' ', String(record.data[field]))
end
else
print(io, " <not filled>")
end
end
function Base.print(io::IO, record::Record)
write(io, record)
return nothing
end
function Base.write(io::IO, record::Record)
checkfilled(record)
return unsafe_write(io, pointer(record.data, first(record.filled)), length(record.filled))
end
function Base.copy(record::Record)
return Record(
copy(record.data),
record.filled,
record.qname,
record.flag,
record.rname,
record.pos,
record.mapq,
record.cigar,
record.rnext,
record.pnext,
record.tlen,
record.seq,
record.qual,
copy(record.fields))
end
# Accessor Functions
# ------------------
"""
flag(record::Record)::UInt16
Get the bitwise flag of `record`.
"""
function flag(record::Record)::UInt16
checkfilled(record)
return unsafe_parse_decimal(UInt16, record.data, record.flag)
end
function hasflag(record::Record)
return isfilled(record)
end
"""
ismapped(record::Record)::Bool
Test if `record` is mapped.
"""
function ismapped(record::Record)::Bool
return isfilled(record) && (flag(record) & FLAG_UNMAP == 0)
end
"""
isprimary(record::Record)::Bool
Test if `record` is a primary line of the read.
This is equivalent to `flag(record) & 0x900 == 0`.
"""
function isprimary(record::Record)::Bool
return flag(record) & 0x900 == 0
end
"""
refname(record::Record)::String
Get the reference sequence name of `record`.
"""
function refname(record::Record)
checkfilled(record)
if ismissing(record, record.rname)
missingerror(:refname)
end
return String(record.data[record.rname])
end
function hasrefname(record::Record)
return isfilled(record) && !ismissing(record, record.rname)
end
"""
position(record::Record)::Int
Get the 1-based leftmost mapping position of `record`.
"""
function position(record::Record)::Int
checkfilled(record)
pos = unsafe_parse_decimal(Int, record.data, record.pos)
if pos == 0
missingerror(:position)
end
return pos
end
function hasposition(record::Record)
return isfilled(record) && (length(record.pos) != 1 || record.data[first(record.pos)] != UInt8('0'))
end
"""
rightposition(record::Record)::Int
Get the 1-based rightmost mapping position of `record`.
"""
function rightposition(record::Record)
return position(record) + alignlength(record) - 1
end
function hasrightposition(record::Record)
return hasposition(record) && hasalignment(record)
end
"""
isnextmapped(record::Record)::Bool
Test if the mate/next read of `record` is mapped.
"""
function isnextmapped(record::Record)::Bool
return isfilled(record) && (flag(record) & FLAG_MUNMAP == 0)
end
"""
nextrefname(record::Record)::String
Get the reference name of the mate/next read of `record`.
"""
function nextrefname(record::Record)::String
checkfilled(record)
if ismissing(record, record.rnext)
missingerror(:nextrefname)
end
return String(record.data[record.rnext])
end
function hasnextrefname(record::Record)
return isfilled(record) && !ismissing(record, record.rnext)
end
"""
nextposition(record::Record)::Int
Get the position of the mate/next read of `record`.
"""
function nextposition(record::Record)::Int
checkfilled(record)
pos = unsafe_parse_decimal(Int, record.data, record.pnext)
if pos == 0
missingerror(:nextposition)
end
return pos
end
function hasnextposition(record::Record)
return isfilled(record) && (length(record.pnext) != 1 || record.data[first(record.pnext)] != UInt8('0'))
end
"""
mappingquality(record::Record)::UInt8
Get the mapping quality of `record`.
"""
function mappingquality(record::Record)::UInt8
checkfilled(record)
qual = unsafe_parse_decimal(UInt8, record.data, record.mapq)
if qual == 0xff
missingerror(:mappingquality)
end
return qual
end
function hasmappingquality(record::Record)
return isfilled(record) && unsafe_parse_decimal(UInt8, record.data, record.mapq) != 0xff
end
"""
cigar(record::Record)::String
Get the CIGAR string of `record`.
"""
function cigar(record::Record)::String
checkfilled(record)
if ismissing(record, record.cigar)
missingerror(:cigar)
end
return String(record.data[record.cigar])
end
function hascigar(record::Record)
return isfilled(record) && !ismissing(record, record.cigar)
end
"""
alignment(record::Record)::BioAlignments.Alignment
Get the alignment of `record`.
"""
function alignment(record::Record)::BioAlignments.Alignment
if ismapped(record)
return BioAlignments.Alignment(cigar(record), 1, position(record))
else
return BioAlignments.Alignment(BioAlignments.AlignmentAnchor[])
end
end
function hasalignment(record::Record)
return isfilled(record) && hascigar(record)
end
"""
alignlength(record::Record)::Int
Get the alignment length of `record`.
"""
function alignlength(record::Record)::Int
if length(record.cigar) == 1 && record.data[first(record.cigar)] == UInt8('*')
return 0
end
ret::Int = 0
len = 0 # operation length
for i in record.cigar
c = record.data[i]
if c UInt8('0'):UInt8('9')
len = len * 10 + (c - UInt8('0'))
else
op = convert(BioAlignments.Operation, Char(c))
if BioAlignments.ismatchop(op) || BioAlignments.isdeleteop(op)
ret += len
len = 0
end
end
end
return ret
end
"""
tempname(record::Record)::String
Get the query template name of `record`.
"""
function tempname(record::Record)::String
checkfilled(record)
if ismissing(record, record.qname)
missingerror(:tempname)
end
return String(record.data[record.qname])
end
function hastempname(record::Record)
return isfilled(record) && !ismissing(record, record.qname)
end
"""
templength(record::Record)::Int
Get the template length of `record`.
"""
function templength(record::Record)::Int
checkfilled(record)
len = unsafe_parse_decimal(Int, record.data, record.tlen)
if len == 0
missingerror(:tlen)
end
return len
end
function hastemplength(record::Record)
return isfilled(record) && (length(record.tlen) != 1 || record.data[first(record.tlen)] != UInt8('0'))
end
"""
sequence(record::Record)::BioSequences.LongDNASeq
Get the segment sequence of `record`.
"""
function sequence(record::Record)::BioSequences.LongDNASeq
checkfilled(record)
if ismissing(record, record.seq)
missingerror(:sequence)
end
seqlen = length(record.seq)
ret = BioSequences.LongDNASeq(seqlen)
BioSequences.encode_copy!(ret, 1, record.data, first(record.seq), seqlen)
return ret
end
function hassequence(record::Record)
return isfilled(record) && !ismissing(record, record.seq)
end
"""
sequence(::Type{String}, record::Record)::String
Get the segment sequence of `record` as `String`.
"""
function sequence(::Type{String}, record::Record)::String
checkfilled(record)
return String(record.data[record.seq])
end
"""
seqlength(record::Record)::Int
Get the sequence length of `record`.
"""
function seqlength(record::Record)::Int
checkfilled(record)
if ismissing(record, record.seq)
missingerror(:seq)
end
return length(record.seq)
end
function hasseqlength(record::Record)
return isfilled(record) && !ismissing(record, record.seq)
end
"""
quality(record::Record)::Vector{UInt8}
Get the Phred-scaled base quality of `record`.
"""
function quality(record::Record)::Vector{UInt8}
checkfilled(record)
if ismissing(record, record.qual)
missingerror(:quality)
end
qual = record.data[record.qual]
for i in 1:lastindex(qual)
@inbounds qual[i] -= 33
end
return qual
end
function hasquality(record::Record)
return isfilled(record) && !ismissing(record, record.qual)
end
"""
quality(::Type{String}, record::Record)::String
Get the ASCII-encoded base quality of `record`.
"""
function quality(::Type{String}, record::Record)::String
checkfilled(record)
return String(record.data[record.qual])
end
"""
auxdata(record::Record)::Dict{String,Any}
Get the auxiliary data (optional fields) of `record`.
"""
function auxdata(record::Record)::Dict{String,Any}
checkfilled(record)
return Dict(k => record[k] for k in keys(record))
end
function Base.haskey(record::Record, tag::AbstractString)
return findauxtag(record, tag) > 0
end
function Base.getindex(record::Record, tag::AbstractString)
i = findauxtag(record, tag)
if i == 0
throw(KeyError(tag))
end
field = record.fields[i]
# data type
typ = record.data[first(field)+3]
lo = first(field) + 5
if i == lastindex(record.fields)
hi = last(field)
else
hi = first(record.fields[i+1]) - 2
end
if typ == UInt8('A')
@assert lo == hi
return Char(record.data[lo])
elseif typ == UInt8('i')
return unsafe_parse_decimal(Int, record.data, lo:hi)
elseif typ == UInt8('f')
# TODO: Call a C function directly for speed?
return parse(Float32, SubString(record.data[lo:hi]))
elseif typ == UInt8('Z')
return String(record.data[lo:hi])
elseif typ == UInt8('H')
return parse_hexarray(record.data, lo:hi)
elseif typ == UInt8('B')
return parse_typedarray(record.data, lo:hi)
else
throw(ArgumentError("type code '$(Char(typ))' is not defined"))
end
end
function Base.keys(record::Record)
checkfilled(record)
return [String(record.data[first(f):first(f)+1]) for f in record.fields]
end
function Base.values(record::Record)
return [record[k] for k in keys(record)]
end
# Bio Methods
# -----------
function BioCore.isfilled(record::Record)
return !isempty(record.filled)
end
function BioCore.seqname(record::Record)
return tempname(record)
end
function BioCore.hasseqname(record::Record)
return hastempname(record)
end
function BioCore.sequence(record::Record)
return sequence(record)
end
function BioCore.hassequence(record::Record)
return hassequence(record)
end
function BioCore.rightposition(record::Record)
return rightposition(record)
end
function BioCore.hasrightposition(record::Record)
return hasrightposition(record)
end
function BioCore.leftposition(record::Record)
return position(record)
end
function BioCore.hasleftposition(record::Record)
return hasposition(record)
end
# Helper Functions
# ----------------
function initialize!(record::Record)
record.filled = 1:0
record.qname = 1:0
record.flag = 1:0
record.rname = 1:0
record.pos = 1:0
record.mapq = 1:0
record.cigar = 1:0
record.rnext = 1:0
record.pnext = 1:0
record.tlen = 1:0
record.seq = 1:0
record.qual = 1:0
empty!(record.fields)
return record
end
function checkfilled(record::Record)
if !isfilled(record)
throw(ArgumentError("unfilled SAM record"))
end
end
function findauxtag(record::Record, tag::AbstractString)
checkfilled(record)
if sizeof(tag) != 2
return 0
end
t1, t2 = UInt8(tag[1]), UInt8(tag[2])
for (i, field) in enumerate(record.fields)
p = first(field)
if record.data[p] == t1 && record.data[p+1] == t2
return i
end
end
return 0
end
function parse_hexarray(data::Vector{UInt8}, range::UnitRange{Int})
@assert iseven(length(range))
ret = Vector{UInt8}(length(range) >> 1)
byte2hex(b) = b 0x30:0x39 ? (b - 0x30) : b 0x41:0x46 ? (b - 0x41 + 0x0A) : error("not in [0-9A-F]")
j = 1
for i in first(range):2:last(range)-1
ret[j] = (byte2hex(data[range[i]]) << 4) | byte2hex(data[range[i+1]])
j += 1
end
return ret
end
function parse_typedarray(data::Vector{UInt8}, range::UnitRange{Int})
# format: [cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+
t = data[first(range)]
xs = split(String(data[first(range)+2:last(range)]))
if t == UInt8('c')
return [parse(Int8, x) for x in xs]
elseif t == UInt8('C')
return [parse(UInt8, x) for x in xs]
elseif t == UInt8('s')
return [parse(Int16, x) for x in xs]
elseif t == UInt8('S')
return [parse(UInt16, x) for x in xs]
elseif t == UInt8('i')
return [parse(Int32, x) for x in xs]
elseif t == UInt8('I')
return [parse(UInt32, x) for x in xs]
elseif t == UInt8('f')
return [parse(Float32, x) for x in xs]
else
throw(ArgumentError("type code '$(Char(t))' is not defined"))
end
end
function ismissing(record::Record, range::UnitRange{Int})
return length(range) == 1 && record.data[first(range)] == UInt8('*')
end

25
src/sam/sam.jl Normal file
View file

@ -0,0 +1,25 @@
# SAM File Format
# ===============
module SAM
using BioCore
import Automa
import Automa.RegExp: @re_str
import BioAlignments
import BioCore.Exceptions: missingerror
import BioCore.RecordHelper: unsafe_parse_decimal
import BioCore: isfilled, header
import BioSequences
import BufferedStreams
using Printf: @sprintf
include("flags.jl")
include("metainfo.jl")
include("record.jl")
include("header.jl")
include("reader.jl")
include("writer.jl")
end

43
src/sam/writer.jl Normal file
View file

@ -0,0 +1,43 @@
# SAM Writer
# ==========
"""
Writer(output::IO, header::Header=Header())
Create a data writer of the SAM file format.
# Arguments
* `output`: data sink
* `header=Header()`: SAM header object
"""
mutable struct Writer <: BioCore.IO.AbstractWriter
stream::IO
function Writer(output::IO, header::Header=Header())
writer = new(output)
write(writer, header)
return writer
end
end
function BioCore.IO.stream(writer::Writer)
return writer.stream
end
function Base.write(writer::Writer, header::Header)
n = 0
for metainfo in header
n += write(writer, metainfo)
end
return n
end
function Base.write(writer::Writer, metainfo::MetaInfo)
checkfilled(metainfo)
return write(writer.stream, metainfo, '\n')
end
function Base.write(writer::Writer, record::Record)
checkfilled(record)
return write(writer.stream, record, '\n')
end

438
test/runtests.jl Normal file
View file

@ -0,0 +1,438 @@
using Test
using GenomicFeatures
using XAM
import BioAlignments: Alignment, AlignmentAnchor, OP_START, OP_MATCH, OP_DELETE
using FormatSpecimens
import BGZFStreams: BGZFStream
import BioCore.Exceptions: MissingFieldException
import BioSequences: @dna_str, @aa_str
import BioCore:
header,
isfilled,
seqname,
hasseqname,
sequence,
hassequence,
leftposition,
rightposition,
hasleftposition,
hasrightposition
# Generate a random range within `range`.
function randrange(range)
x = rand(range)
y = rand(range)
if x < y
return x:y
else
return y:x
end
end
@testset "SAM" begin
samdir = path_of_format("SAM")
@testset "MetaInfo" begin
metainfo = SAM.MetaInfo()
@test !isfilled(metainfo)
@test occursin("not filled", repr(metainfo))
metainfo = SAM.MetaInfo("CO", "some comment (parens)")
@test isfilled(metainfo)
@test string(metainfo) == "@CO\tsome comment (parens)"
@test occursin("CO", repr(metainfo))
@test SAM.tag(metainfo) == "CO"
@test SAM.value(metainfo) == "some comment (parens)"
@test_throws ArgumentError keys(metainfo)
@test_throws ArgumentError values(metainfo)
metainfo = SAM.MetaInfo("HD", ["VN" => "1.0", "SO" => "coordinate"])
@test isfilled(metainfo)
@test string(metainfo) == "@HD\tVN:1.0\tSO:coordinate"
@test occursin("HD", repr(metainfo))
@test SAM.tag(metainfo) == "HD"
@test SAM.value(metainfo) == "VN:1.0\tSO:coordinate"
@test keys(metainfo) == ["VN", "SO"]
@test values(metainfo) == ["1.0", "coordinate"]
@test SAM.keyvalues(metainfo) == ["VN" => "1.0", "SO" => "coordinate"]
@test haskey(metainfo, "VN")
@test haskey(metainfo, "SO")
@test !haskey(metainfo, "GO")
@test metainfo["VN"] == "1.0"
@test metainfo["SO"] == "coordinate"
@test_throws KeyError metainfo["GO"]
end
@testset "Header" begin
header = SAM.Header()
@test isempty(header)
push!(header, SAM.MetaInfo("@HD\tVN:1.0\tSO:coordinate"))
@test !isempty(header)
@test length(header) == 1
push!(header, SAM.MetaInfo("@CO\tsome comment"))
@test length(header) == 2
@test isa(collect(header), Vector{SAM.MetaInfo})
end
@testset "Record" begin
record = SAM.Record()
@test !isfilled(record)
@test !SAM.ismapped(record)
@test repr(record) == "XAM.SAM.Record: <not filled>"
@test_throws ArgumentError SAM.flag(record)
record = SAM.Record("r001\t99\tchr1\t7\t30\t8M2I4M1D3M\t=\t37\t39\tTTAGATAAAGGATACTG\t*")
@test isfilled(record)
@test occursin(r"^XAM.SAM.Record:\n", repr(record))
@test SAM.ismapped(record)
@test SAM.isprimary(record)
@test SAM.hastempname(record)
@test SAM.tempname(record) == "r001"
@test SAM.hasflag(record)
@test SAM.flag(record) === UInt16(99)
@test SAM.hasrefname(record)
@test SAM.refname(record) == "chr1"
@test SAM.hasposition(record)
@test SAM.position(record) === 7
@test SAM.hasmappingquality(record)
@test SAM.mappingquality(record) === UInt8(30)
@test SAM.hascigar(record)
@test SAM.cigar(record) == "8M2I4M1D3M"
@test SAM.hasnextrefname(record)
@test SAM.nextrefname(record) == "="
@test SAM.hasnextposition(record)
@test SAM.nextposition(record) === 37
@test SAM.hastemplength(record)
@test SAM.templength(record) === 39
@test SAM.hassequence(record)
@test SAM.sequence(record) == dna"TTAGATAAAGGATACTG"
@test !SAM.hasquality(record)
@test_throws MissingFieldException SAM.quality(record)
end
@testset "Reader" begin
reader = open(SAM.Reader, joinpath(samdir, "ce#1.sam"))
@test isa(reader, SAM.Reader)
@test eltype(reader) === SAM.Record
# header
h = header(reader)
@test string.(findall(header(reader), "SQ")) == ["@SQ\tSN:CHROMOSOME_I\tLN:1009800"]
# first record
record = SAM.Record()
read!(reader, record)
@test SAM.ismapped(record)
@test SAM.refname(record) == "CHROMOSOME_I"
@test SAM.position(record) == leftposition(record) == 2
@test SAM.rightposition(record) == rightposition(record) == 102
@test SAM.tempname(record) == seqname(record) == "SRR065390.14978392"
@test SAM.sequence(record) == sequence(record) == dna"CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA"
@test SAM.sequence(String, record) == "CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA"
@test SAM.seqlength(record) == 100
@test SAM.quality(record) == (b"#############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC" .- 33)
@test SAM.quality(String, record) == "#############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"
@test SAM.flag(record) == 16
@test SAM.cigar(record) == "27M1D73M"
@test SAM.alignment(record) == Alignment([
AlignmentAnchor( 0, 1, OP_START),
AlignmentAnchor( 27, 28, OP_MATCH),
AlignmentAnchor( 27, 29, OP_DELETE),
AlignmentAnchor(100, 102, OP_MATCH)])
@test record["XG"] == 1
@test record["XM"] == 5
@test record["XN"] == 0
@test record["XO"] == 1
@test record["AS"] == -18
@test record["XS"] == -18
@test record["YT"] == "UU"
@test eof(reader)
close(reader)
# iterator
@test length(collect(open(SAM.Reader, joinpath(samdir, "ce#1.sam")))) == 1
@test length(collect(open(SAM.Reader, joinpath(samdir, "ce#2.sam")))) == 2
# IOStream
@test length(collect(SAM.Reader(open(joinpath(samdir, "ce#1.sam"))))) == 1
@test length(collect(SAM.Reader(open(joinpath(samdir, "ce#2.sam"))))) == 2
end
@testset "Round trip" begin
function compare_records(xs, ys)
if length(xs) != length(ys)
return false
end
for (x, y) in zip(xs, ys)
if x.data[x.filled] != y.data[y.filled]
return false
end
end
return true
end
for specimen in list_valid_specimens("SAM")
filepath = joinpath(samdir, filename(specimen))
mktemp() do path, io
# copy
reader = open(SAM.Reader, filepath)
writer = SAM.Writer(io, header(reader))
records = SAM.Record[]
for record in reader
push!(records, record)
write(writer, record)
end
close(reader)
close(writer)
@test compare_records(open(collect, SAM.Reader, path), records)
end
end
end
end
@testset "BAM" begin
bamdir = path_of_format("BAM")
@testset "AuxData" begin
auxdata = BAM.AuxData(UInt8[])
@test isempty(auxdata)
buf = IOBuffer()
write(buf, "NM", UInt8('s'), Int16(1))
auxdata = BAM.AuxData(take!(buf))
@test length(auxdata) == 1
@test auxdata["NM"] === Int16(1)
@test collect(auxdata) == ["NM" => Int16(1)]
buf = IOBuffer()
write(buf, "AS", UInt8('c'), Int8(-18))
write(buf, "NM", UInt8('s'), Int16(1))
write(buf, "XA", UInt8('f'), Float32(3.14))
write(buf, "XB", UInt8('Z'), "some text\0")
write(buf, "XC", UInt8('B'), UInt8('i'), Int32(3), Int32[10, -5, 8])
auxdata = BAM.AuxData(take!(buf))
@test length(auxdata) == 5
@test auxdata["AS"] === Int8(-18)
@test auxdata["NM"] === Int16(1)
@test auxdata["XA"] === Float32(3.14)
@test auxdata["XB"] == "some text"
@test auxdata["XC"] == Int32[10, -5, 8]
@test convert(Dict{String,Any}, auxdata) == Dict(
"AS" => Int8(-18),
"NM" => Int16(1),
"XA" => Float32(3.14),
"XB" => "some text",
"XC" => Int32[10, -5, 8])
end
@testset "Record" begin
record = BAM.Record()
@test !isfilled(record)
@test repr(record) == "XAM.BAM.Record: <not filled>"
@test_throws ArgumentError BAM.flag(record)
end
@testset "Reader" begin
reader = open(BAM.Reader, joinpath(bamdir, "ce#1.bam"))
@test isa(reader, BAM.Reader)
@test eltype(reader) === BAM.Record
@test startswith(repr(reader), "XAM.BAM.Reader{IOStream}:")
# header
h = header(reader)
@test isa(h, SAM.Header)
# first record
record = BAM.Record()
read!(reader, record)
@test BAM.ismapped(record)
@test BAM.isprimary(record)
@test ! BAM.ispositivestrand(record)
@test BAM.refname(record) == "CHROMOSOME_I"
@test BAM.refid(record) === 1
@test BAM.hasnextrefid(record)
@test BAM.nextrefid(record) === 0
@test BAM.hasposition(record) === hasleftposition(record) === true
@test BAM.position(record) === leftposition(record) === 2
@test BAM.hasnextposition(record)
@test BAM.nextposition(record) === 0
@test rightposition(record) == 102
@test BAM.hastempname(record) === hasseqname(record) === true
@test BAM.tempname(record) == seqname(record) == "SRR065390.14978392"
@test BAM.hassequence(record) === hassequence(record) === true
@test BAM.sequence(record) == sequence(record) == dna"""
CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCT
AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA
"""
@test BAM.seqlength(record) === 100
@test BAM.hasquality(record)
@test eltype(BAM.quality(record)) == UInt8
@test BAM.quality(record) == [Int(x) - 33 for x in "#############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"]
@test BAM.flag(record) === UInt16(16)
@test BAM.cigar(record) == "27M1D73M"
@test BAM.alignment(record) == Alignment([
AlignmentAnchor( 0, 1, OP_START),
AlignmentAnchor( 27, 28, OP_MATCH),
AlignmentAnchor( 27, 29, OP_DELETE),
AlignmentAnchor(100, 102, OP_MATCH)])
@test record["XG"] == 1
@test record["XM"] == 5
@test record["XN"] == 0
@test record["XO"] == 1
@test record["AS"] == -18
@test record["XS"] == -18
@test record["YT"] == "UU"
@test keys(record) == ["XG","XM","XN","XO","AS","XS","YT"]
@test values(record) == [1, 5, 0, 1, -18, -18, "UU"]
@test eof(reader)
close(reader)
# Test conversion from byte array to record
dsize = BAM.data_size(record)
array = Vector{UInt8}(undef, BAM.FIXED_FIELDS_BYTES + dsize)
GC.@preserve array record begin
ptr = Ptr{UInt8}(pointer_from_objref(record))
unsafe_copyto!(pointer(array), ptr, BAM.FIXED_FIELDS_BYTES)
unsafe_copyto!(array, BAM.FIXED_FIELDS_BYTES + 1, record.data, 1, dsize)
end
new_record = convert(BAM.Record, array)
@test record.bin_mq_nl == new_record.bin_mq_nl
@test record.block_size == new_record.block_size
@test record.flag_nc == new_record.flag_nc
@test record.l_seq == new_record.l_seq
@test record.next_refid == new_record.next_refid
@test record.next_pos == new_record.next_pos
@test record.refid == new_record.refid
@test record.pos == new_record.pos
@test record.tlen == new_record.tlen
@test record.data == new_record.data
# iterator
@test length(collect(open(BAM.Reader, joinpath(bamdir, "ce#1.bam")))) == 1
@test length(collect(open(BAM.Reader, joinpath(bamdir, "ce#2.bam")))) == 2
# IOStream
@test length(collect(BAM.Reader(open(joinpath(bamdir, "ce#1.bam"))))) == 1
@test length(collect(BAM.Reader(open(joinpath(bamdir, "ce#2.bam"))))) == 2
end
@testset "Read long CIGARs" begin
function get_cigar_lens(rec::BAM.Record)
cigar_ops, cigar_n = BAM.cigar_rle(rec)
field_ops, field_n = BAM.cigar_rle(rec, false)
cigar_l = length(cigar_ops)
field_l = length(field_ops)
return cigar_l, field_l
end
function check_cigar_vs_field(rec::BAM.Record)
cigar = BAM.cigar(rec)
field = BAM.cigar(rec, false)
cigar_l, field_l = get_cigar_lens(rec)
return cigar != field && cigar_l != field_l
end
function check_cigar_lens(rec::BAM.Record, field_len, cigar_len)
cigar_l, field_l = get_cigar_lens(rec)
return cigar_l == cigar_len && field_l == field_len
end
reader = open(BAM.Reader, joinpath(bamdir, "cigar-64k.bam"))
rec = BAM.Record()
read!(reader, rec)
@test !check_cigar_vs_field(rec)
read!(reader, rec)
@test check_cigar_vs_field(rec)
@test check_cigar_lens(rec, 2, 72091)
end
function compare_records(xs, ys)
if length(xs) != length(ys)
return false
end
for (x, y) in zip(xs, ys)
if !(
x.block_size == y.block_size &&
x.refid == y.refid &&
x.pos == y.pos &&
x.bin_mq_nl == y.bin_mq_nl &&
x.flag_nc == y.flag_nc &&
x.l_seq == y.l_seq &&
x.next_refid == y.next_refid &&
x.next_pos == y.next_pos &&
x.tlen == y.tlen &&
x.data[1:BAM.data_size(x)] == y.data[1:BAM.data_size(y)])
return false
end
end
return true
end
@testset "Round trip" begin
for specimen in list_valid_specimens("BAM")
filepath = joinpath(bamdir, filename(specimen))
mktemp() do path, _
# copy
if hastags(specimen) && in("bai", tags(specimen))
reader = open(BAM.Reader, filepath, index=filepath * ".bai")
else
reader = open(BAM.Reader, filepath)
end
writer = BAM.Writer(
BGZFStream(path, "w"), BAM.header(reader, fillSQ=isempty(findall(header(reader), "SQ"))))
records = BAM.Record[]
for record in reader
push!(records, record)
write(writer, record)
end
close(reader)
close(writer)
@test compare_records(open(collect, BAM.Reader, path), records)
end
end
end
@testset "Random access" begin
filepath = joinpath(bamdir, "GSE25840_GSM424320_GM06985_gencode_spliced.head.bam")
reader = open(BAM.Reader, filepath, index=filepath * ".bai")
@test isa(eachoverlap(reader, "chr1", 1:100), BAM.OverlapIterator)
@test isa(eachoverlap(reader, GenomicFeatures.Interval("chr1", 1, 100)), BAM.OverlapIterator)
# expected values are counted using samtools
for (refname, interval, expected) in [
("chr1", 1_000:10000, 21),
("chr1", 8_000:10000, 20),
("chr1", 766_000:800_000, 142),
("chr1", 786_000:800_000, 1),
("chr1", 796_000:800_000, 0)]
intsect = eachoverlap(reader, refname, interval)
@test eltype(intsect) == BAM.Record
@test count(_ -> true, intsect) == expected
# check that the intersection iterator is stateless
@test count(_ -> true, intsect) == expected
end
# randomized tests
for n in 1:50
refindex = 1
refname = "chr1"
range = randrange(1:1_000_000)
seekstart(reader)
# linear scan
expected = filter(collect(reader)) do record
BAM.compare_intervals(record, (refindex, range)) == 0
end
# indexed scan
actual = collect(eachoverlap(reader, refname, range))
@test compare_records(actual, expected)
end
close(reader)
filepath = joinpath(bamdir, "R_12h_D06.uniq.q40.bam")
reader = open(BAM.Reader, filepath, index=filepath * ".bai")
@test isempty(collect(eachoverlap(reader, "chr19", 5823708:5846478)))
close(reader)
end
end