Merge pull request #31 from FelixKrueger/hisat2

Added HISAT2 module and test workflow
This commit is contained in:
Phil Ewels 2020-07-11 13:29:02 +02:00 committed by GitHub
commit 81b1d6081c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 131 additions and 0 deletions

58
tools/hisat2/main.nf Normal file
View file

@ -0,0 +1,58 @@
nextflow.preview.dsl=2

// Genome settings. The including workflow is expected to override this with a map that
// provides 'hisat2' (index basename passed to `hisat2 -x`) and 'name' (used in output file
// names); 'hisat2_splices' (known splice-site file) is optional.
params.genome = ''

/*
 * Aligns single-end or paired-end reads with HISAT2 and converts the alignments to BAM.
 *
 * Inputs:
 *   name, reads   - sample name and its read file(s); two or more files are treated as a
 *                   pair (first two used), anything else as single-end
 *   outdir        - base directory; results are published to <outdir>/hisat2
 *   hisat2_args   - extra command line options handed to hisat2 verbatim
 *   verbose       - when true, the supplied hisat2 arguments are printed
 *
 * Outputs:
 *   bam   - files matching *bam
 *   stats - files matching *stats.txt (hisat2's stderr)
 */
process HISAT2 {
    // depending on the genome used one might want/need to adjust the memory settings.
    // For the E. coli test data this is probably not required
    // label 'bigMem'
    // label 'multiCore'

    input:
        tuple val(name), path(reads)
        val (outdir)
        val (hisat2_args)
        val (verbose)

    output:
        path "*bam", emit: bam
        path "*stats.txt", emit: stats

    publishDir "$outdir/hisat2",
        mode: "copy", overwrite: true

    script:
        if (verbose){
            println ("[MODULE] HISAT2 ARGS: " + hisat2_args)
        }

        // Fail early with a readable message instead of an obscure Groovy error when the
        // including workflow did not pass a genome map (the default above is an empty string).
        def genome = params.genome
        if (!(genome instanceof Map) || !genome.containsKey("hisat2")) {
            throw new IllegalArgumentException(
                "[MODULE] HISAT2: params.genome must be a map containing the key 'hisat2' " +
                "(index basename), but got: '${genome}'"
            )
        }

        def cores = 4

        // Options we add are
        def hisat_options = hisat2_args + " --no-unal --no-softclip "

        def readString
        if (reads instanceof List && reads.size() > 1) {
            // paired-end: first two files are mate 1 and mate 2
            readString = "-1 " + reads[0] + " -2 " + reads[1]
            hisat_options = hisat_options + " --no-mixed --no-discordant"
        }
        else {
            // single-end; unwrap a one-element list so the file name is not rendered as "[file]"
            readString = "-U " + (reads instanceof List ? reads[0] : reads)
        }

        def index = genome["hisat2"]

        def splices = ''
        if (genome.containsKey("hisat2_splices")){
            splices = " --known-splicesite-infile " + genome["hisat2_splices"]
        }
        else{
            println ("No key 'hisat2_splices' was supplied. Skipping...")
        }

        def hisat_name = name + "_" + genome["name"]

        // hisat2 writes its summary to stderr, which becomes the stats file.
        // -F 268 = 4 (read unmapped) + 8 (mate unmapped) + 256 (secondary alignment), given as
        // one mask so the filter does not depend on how repeated -F options are combined.
        """
        hisat2 -p ${cores} ${hisat_options} -x ${index} ${splices} ${readString} 2>${hisat_name}_hisat2_stats.txt | samtools view -bS -F 268 - > ${hisat_name}_hisat2.bam
        """
}

37
tools/hisat2/meta.yml Normal file
View file

@ -0,0 +1,37 @@
# Module description for the HISAT2 alignment process.
name: HISAT2
description: Graph-based alignment of next generation sequencing reads to a population of genomes
keywords:
    - Alignment
    - Short reads
    - graph FM Index (GFM)
    - RNA-seq
tools:
    - hisat2:
        description: |
            HISAT2 is a fast and sensitive alignment program for mapping next-generation
            sequencing reads (whole-genome, transcriptome, and exome sequencing data)
            against the general human population (as well as against a single reference genome).
            Based on GCSA (an extension of BWT for a graph) it is designed and implemented as a
            graph FM index (GFM).
        homepage: http://daehwankimlab.github.io/hisat2/
        documentation: https://ccb.jhu.edu/software/hisat2/manual.shtml
input:
    -
        - sample_id:
            type: string
            description: Sample identifier
        - reads:
            type: file
            description: Input FastQ file, or pair of files
output:
    -
        - report:
            type: file
            description: mapping statistics report
            # quoted: a plain scalar may not start with '*' (YAML alias indicator)
            pattern: "*hisat2_stats.txt"
        - alignment:
            type: file
            description: alignment file in BAM format
            pattern: "*hisat2.bam"
authors:
    # quoted: '@' is a reserved indicator and may not start a plain scalar
    - "@FelixKrueger"

34
tools/hisat2/test/main.nf Executable file
View file

@ -0,0 +1,34 @@
#!/usr/bin/env nextflow
nextflow.preview.dsl=2

// Test workflow: runs the HISAT2 module on the E. coli test reads.

params.outdir = "."
params.genome = ""
params.hisat2_args = ''
// HISAT2 arguments should be supplied in the following format to work:
// --hisat2_args="--score-min L,0,-0.8"
params.verbose = false

// Directory that holds the HISAT2 indices; the index basename is <index_dir>/<genome>.
// Defaults to the repository's test-datasets folder (resolved against this script's
// directory so the path stays absolute inside task work directories) and can be
// overridden with --index_dir.
params.index_dir = "${baseDir}/../../../test-datasets/indices/hisat2/E_coli"

if (params.verbose){
    println ("[WORKFLOW] HISAT2 ARGS ARE: " + params.hisat2_args)
}

// Without a genome name the index path below would point at the bare directory.
if (!params.genome){
    exit 1, "No genome specified. Please supply the index basename with --genome"
}

// for other genomes this needs to be handled somehow to return all possible genomes
genomeValues = ["name" : params.genome]
genomeValues["hisat2"] = "${params.index_dir}/${params.genome}"

include '../main.nf' params(genome: genomeValues)

// size:-1 accepts any number of files per sample, so single-end and paired-end data both match
ch_read_files = Channel
    .fromFilePairs('../../../test-datasets/Ecoli*{1,2}.fastq.gz',size:-1)
    // .view() // to check whether the input channel works

workflow {
    main:
        HISAT2(ch_read_files, params.outdir, params.hisat2_args, params.verbose)
}

View file

@ -0,0 +1,2 @@
// Docker execution is left switched off; uncomment the next line to enable it.
// docker.enabled = true

// Directory that receives the published results.
params {
    outdir = './results'
}