New modules: ultra/index and ultra/align (#1830)

* Add ultra/index and ultra/align modules

* Correct tag and prefix

* Fix typos

* Remove SAMTOOLS SORT from test

* Update: Convert sam to bam

* Add tag to docker image

* Fix typo

* Add args2 for samtools
This commit is contained in:
Sébastien Guizard 2022-07-04 07:46:49 +01:00 committed by GitHub
parent 0e9fd9370a
commit 60c65fb386
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 329 additions and 0 deletions

View file

@ -0,0 +1,53 @@
// Aligns reads to a genome with `uLTRA align` using a pre-built uLTRA index,
// then converts the resulting SAM to BAM with `samtools sort`.
//
// Inputs:
//   meta, reads   - sample meta map and the reads file to align
//   genome        - reference genome passed to uLTRA
//   pickle, db    - index files staged into the task directory (uLTRA is
//                   pointed at `./` through --index)
// Outputs:
//   bam           - every `*.bam` produced in the task directory, paired with meta
//   versions      - versions.yml recording the uLTRA and samtools versions
process ULTRA_ALIGN {
    tag "${meta.id}"
    label 'process_high'

    conda (params.enable_conda ? "bioconda::ultra_bioinformatics=0.0.4 bioconda::samtools=1.15.1" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/mulled-v2-4b749ef583d6de806ddbf51c2d235ac8c14763c6:f63170074b42f54276c1f9b334e732a0f3bf28bd-0':
        'quay.io/biocontainers/mulled-v2-4b749ef583d6de806ddbf51c2d235ac8c14763c6:f63170074b42f54276c1f9b334e732a0f3bf28bd-0' }"

    input:
    tuple val(meta), path(reads)
    path genome
    tuple path(pickle), path(db)

    output:
    tuple val(meta), path("*.bam"), emit: bam
    path "versions.yml"           , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // Output basename; falls back to the sample id when ext.prefix is not set.
    def prefix = task.ext.prefix ?: "${meta.id}"
    // ext.args is forwarded to `uLTRA align`, ext.args2 to `samtools sort`.
    def args   = task.ext.args   ?: ''
    def args2  = task.ext.args2  ?: ''
    // The samtools step reads `${prefix}.sam` (expected to be written by uLTRA
    // into `./`) and that intermediate SAM is deleted once the BAM exists.
    """
    uLTRA \\
        align \\
        --t $task.cpus \\
        --prefix $prefix \\
        --index ./ \\
        $args \\
        $genome \\
        $reads \\
        ./

    samtools \\
        sort \\
        --threads $task.cpus \\
        -o ${prefix}.bam \\
        -O BAM \\
        $args2 \\
        ${prefix}.sam

    rm ${prefix}.sam

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        ultra: \$( uLTRA --version|sed 's/uLTRA //g' )
        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
    END_VERSIONS
    """
}

View file

@ -0,0 +1,58 @@
# Module metadata for ULTRA_ALIGN.
name: ultra_align
description: >-
  uLTRA aligner - A wrapper around minimap2 to improve small exon detection
  - Map reads on genome
keywords:
  - uLTRA
  - align
  - minimap2
  - long_read
  - isoseq
  - ont

# Upstream tool wrapped by this module.
tools:
  - ultra:
      description: Splice aligner of long transcriptomic reads to genome.
      homepage: 'https://github.com/ksahlin/uLTRA'
      documentation: 'https://github.com/ksahlin/uLTRA'
      tool_dev_url: 'https://github.com/ksahlin/uLTRA'
      doi: '10.1093/bioinformatics/btab540'
      licence: "['GNU GPLV3']"

# Input channels, in the order the entries are declared here.
input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - reads:
      type: file
      description: A fasta or fastq file of reads to align
      pattern: '*.{fa,fasta,fastq}'
  - genome:
      type: file
      description: A fasta file of reference genome
      pattern: '*.{fa,fasta}'
  - pickle:
      type: file
      description: Pickle files generated by uLTRA index
      pattern: '*.pickle'
  - db:
      type: file
      description: Database generated by uLTRA index
      pattern: '*.db'

# Output channels.
output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - versions:
      type: file
      description: File containing software versions
      pattern: versions.yml
  - bam:
      type: file
      description: The aligned reads in bam format
      pattern: '*.bam'

authors:
  - '@sguizard'

View file

@ -0,0 +1,37 @@
// Builds a uLTRA index from a genome fasta and a GTF annotation by running
// `uLTRA index`, writing the index into the task directory (`./`).
//
// Inputs:
//   fasta - reference genome passed to uLTRA
//   gtf   - annotation passed to uLTRA (also used as the task tag)
// Outputs:
//   index    - tuple of every `*.pickle` file and every `*.db` file found in
//              the task directory after the run
//   versions - versions.yml recording the uLTRA version
process ULTRA_INDEX {
    tag "$gtf"
    label 'process_low'

    // Conda pin kept in step with the container tag below (0.0.4.1) so both
    // execution modes resolve the same uLTRA release.
    conda (params.enable_conda ? "bioconda::ultra_bioinformatics=0.0.4.1" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/ultra_bioinformatics:0.0.4.1--pyh5e36f6f_0':
        'quay.io/biocontainers/ultra_bioinformatics:0.0.4.1--pyh5e36f6f_0' }"

    input:
    path fasta
    path gtf

    output:
    tuple path("*.pickle"), path("*.db"), emit: index
    path "versions.yml"                 , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // Extra command-line options for `uLTRA index`, supplied through ext.args.
    // No prefix is defined: the command below takes no output name, only the
    // output directory.
    def args = task.ext.args ?: ''
    """
    uLTRA \\
        index \\
        $args \\
        $fasta \\
        $gtf \\
        ./

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        ultra: \$( uLTRA --version|sed 's/uLTRA //g' )
    END_VERSIONS
    """
}

View file

@ -0,0 +1,44 @@
# Module metadata for ULTRA_INDEX.
name: "ultra_index"
description: uLTRA aligner - A wrapper around minimap2 to improve small exon detection - Index gtf file for reads alignment
keywords:
  - uLTRA
  - index
  - minimap2
  - long_read
  - isoseq
  - ont

# Upstream tool wrapped by this module.
tools:
  - "ultra":
      description: "Splice aligner of long transcriptomic reads to genome."
      homepage: "https://github.com/ksahlin/uLTRA"
      documentation: "https://github.com/ksahlin/uLTRA"
      tool_dev_url: "https://github.com/ksahlin/uLTRA"
      doi: "10.1093/bioinformatics/btab540"
      licence: "['GNU GPLV3']"

input:
  - fasta:
      type: file
      description: A fasta file of the genome to use as reference for mapping
      # No whitespace inside the braces: a space would become part of the
      # second alternative and the glob would no longer match "*.fa".
      pattern: "*.{fasta,fa}"
  - gtf:
      type: file
      description: An annotation file of the reference genome in GTF format
      pattern: "*.gtf"

output:
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
  - pickle:
      type: file
      description: Index files generated by uLTRA index
      pattern: "*.pickle"
  # Named "db" (not a second "pickle") so the entry matches the "*.db" file
  # it describes.
  - db:
      type: file
      description: database file generated by uLTRA index
      pattern: "*.db"

authors:
  - "@sguizard"

View file

@ -2159,6 +2159,14 @@ ucsc/wigtobigwig:
  - modules/ucsc/wigtobigwig/**
  - tests/modules/ucsc/wigtobigwig/**
# Path globs associated with each uLTRA module tag: the module sources and
# their tests.
ultra/align:
  - 'modules/ultra/align/**'
  - 'tests/modules/ultra/align/**'

ultra/index:
  - 'modules/ultra/index/**'
  - 'tests/modules/ultra/index/**'
ultra/pipeline:
  - modules/ultra/pipeline/**
  - tests/modules/ultra/pipeline/**

View file

@ -0,0 +1,24 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { GUNZIP } from '../../../../modules/gunzip/main.nf'
include { GFFREAD } from '../../../../modules/gffread/main.nf'
include { ULTRA_INDEX } from '../../../../modules/ultra/index/main.nf'
include { ULTRA_ALIGN } from '../../../../modules/ultra/align/main.nf'
// End-to-end test for ULTRA_ALIGN: decompress the reads, rewrite the
// annotation with GFFREAD, build the uLTRA index, then align.
workflow test_ultra_align {

    // Reference inputs taken from the shared test-data map.
    genome_fasta   = file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)
    annotation_gtf = file(params.test_data['homo_sapiens']['genome']['genome_gtf']  , checkIfExists: true)

    // [ meta, reads ] pair; the reads file is passed through GUNZIP before alignment.
    reads_input = [
        [ id:'test', single_end:false ],
        file(params.test_data['homo_sapiens']['pacbio']['hifi'], checkIfExists: true)
    ]

    GUNZIP ( reads_input )
    GFFREAD ( annotation_gtf )

    // The index is built from the GTF emitted by GFFREAD, not the raw annotation.
    ULTRA_INDEX ( genome_fasta, GFFREAD.out.gtf )
    ULTRA_ALIGN ( GUNZIP.out.gunzip, genome_fasta, ULTRA_INDEX.out.index )
}

View file

@ -0,0 +1,14 @@
// Test configuration for the ultra/align module test.
process {

    // Publish under <outdir>/<name>, where <name> is the last ':'-separated
    // component of the process name, cut at the first '_' and lower-cased
    // (e.g. ULTRA_INDEX -> ultra).
    publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }

    // Extra option handed to the ULTRA_INDEX process through ext.args.
    withName: ULTRA_INDEX {
        ext.args = '--disable_infer'
    }

    // gffread options, plus an output prefix derived from the input file's
    // base name with a "_sorted" suffix.
    withName: GFFREAD {
        ext.args   = '--sort-alpha --keep-genes -T'
        ext.prefix = { "${gff.baseName}_sorted" }
    }
}

View file

@ -0,0 +1,33 @@
# pytest-workflow case for the ultra/align module test.
- name: ultra align test_ultra_align
  command: >-
    nextflow run ./tests/modules/ultra/align
    -entry test_ultra_align
    -c ./tests/config/nextflow.config
    -c ./tests/modules/ultra/align/nextflow.config
  tags:
    - ultra/align
    - ultra
  files:
    # Upstream steps: content is pinned by checksum.
    - path: output/gffread/genome_sorted.gtf
      md5sum: 'c0b034860c679a354cd093109ed90437'
    - path: output/gunzip/test_hifi.fastq
      md5sum: '20e41c569d5828c1e87337e13a5185d3'
    # Index files: listed without a checksum.
    - path: output/ultra/all_splice_pairs_annotations.pickle
    - path: output/ultra/all_splice_sites_annotations.pickle
    - path: output/ultra/chr_to_id.pickle
    - path: output/ultra/database.db
    - path: output/ultra/exon_choordinates_to_id.pickle
    - path: output/ultra/flank_choordinates.pickle
    - path: output/ultra/gene_to_small_segments.pickle
    - path: output/ultra/id_to_chr.pickle
    - path: output/ultra/max_intron_chr.pickle
    - path: output/ultra/parts_to_segments.pickle
    - path: output/ultra/ref_exon_sequences.pickle
    - path: output/ultra/ref_flank_sequences.pickle
    - path: output/ultra/ref_part_sequences.pickle
    - path: output/ultra/ref_segment_sequences.pickle
    - path: output/ultra/refs_id_lengths.pickle
    - path: output/ultra/refs_lengths.pickle
    - path: output/ultra/segment_id_to_choordinates.pickle
    - path: output/ultra/segment_to_gene.pickle
    - path: output/ultra/segment_to_ref.pickle
    - path: output/ultra/splices_to_transcripts.pickle
    # Alignment result: content is pinned by checksum.
    - path: output/ultra/test.bam
      md5sum: 'b34c3631a899ba800602ff07b8183f87'
    - path: output/ultra/transcripts_to_splices.pickle

View file

@ -0,0 +1,15 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { ULTRA_INDEX } from '../../../../modules/ultra/index/main.nf'
include { GFFREAD } from '../../../../modules/gffread/main.nf'
// Test for ULTRA_INDEX: rewrite the annotation with GFFREAD, then build the
// uLTRA index from the genome and the GTF that GFFREAD emits.
workflow test_ultra_index {

    // Reference inputs taken from the shared test-data map.
    annotation_gtf = file(params.test_data['homo_sapiens']['genome']['genome_gtf']  , checkIfExists: true)
    genome_fasta   = file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)

    GFFREAD ( annotation_gtf )
    ULTRA_INDEX ( genome_fasta, GFFREAD.out.gtf )
}

View file

@ -0,0 +1,14 @@
// Test configuration for the ultra/index module test.
process {

    // Publish under <outdir>/<name>, where <name> is the last ':'-separated
    // component of the process name, cut at the first '_' and lower-cased
    // (e.g. ULTRA_INDEX -> ultra).
    publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }

    // Extra option handed to the ULTRA_INDEX process through ext.args.
    withName: ULTRA_INDEX {
        ext.args = '--disable_infer'
    }

    // gffread options, plus an output prefix derived from the input file's
    // base name with a "_sorted" suffix.
    withName: GFFREAD {
        ext.args   = '--sort-alpha --keep-genes -T'
        ext.prefix = { "${gff.baseName}_sorted" }
    }
}

View file

@ -0,0 +1,29 @@
# pytest-workflow case for the ultra/index module test.
- name: ultra index test_ultra_index
  command: >-
    nextflow run ./tests/modules/ultra/index
    -entry test_ultra_index
    -c ./tests/config/nextflow.config
    -c ./tests/modules/ultra/index/nextflow.config
  tags:
    - ultra
    - ultra/index
  files:
    # Upstream step: content is pinned by checksum.
    - path: output/gffread/genome_sorted.gtf
      md5sum: 'c0b034860c679a354cd093109ed90437'
    # Index files: listed without a checksum.
    - path: output/ultra/all_splice_pairs_annotations.pickle
    - path: output/ultra/all_splice_sites_annotations.pickle
    - path: output/ultra/chr_to_id.pickle
    - path: output/ultra/database.db
    - path: output/ultra/exon_choordinates_to_id.pickle
    - path: output/ultra/flank_choordinates.pickle
    - path: output/ultra/gene_to_small_segments.pickle
    - path: output/ultra/id_to_chr.pickle
    - path: output/ultra/max_intron_chr.pickle
    - path: output/ultra/parts_to_segments.pickle
    - path: output/ultra/ref_exon_sequences.pickle
    - path: output/ultra/ref_flank_sequences.pickle
    - path: output/ultra/ref_part_sequences.pickle
    - path: output/ultra/ref_segment_sequences.pickle
    - path: output/ultra/refs_id_lengths.pickle
    - path: output/ultra/refs_lengths.pickle
    - path: output/ultra/segment_id_to_choordinates.pickle
    - path: output/ultra/segment_to_gene.pickle
    - path: output/ultra/segment_to_ref.pickle
    - path: output/ultra/splices_to_transcripts.pickle
    - path: output/ultra/transcripts_to_splices.pickle