Add tests for hisat2 (#366)

* initial commit hisat2/build

* initial commit hisat2/build

* changed names for hisat2

* fixed directory structure and args

* added splice site test data

* added splice site inputs

* replaced list with individual args

* fixed removed commas

* added test yml file

* updated hisat2 conda version

* added meta.yml

* added meta.yml description

* added meta.yml inputs

* added meta.yml outputs

* update conda version for hisat2

* removed trailing whitespace meta.yml

* fixed version number for containers

* added test data to test config

* updated for new test logic

* fix pytest issue?

* fix pytest issue

* fixed wrong tool in meta.yaml

* updated test.yaml name

* handle build bug for testing

* handle build bug for testing in yaml

* moved test folder to fix build bug

* use old hisat2 version to avoid conda giving inconsistent md5sum

* initial commit

* removed temp file

* added meta yaml

* add to pytest

* added tests

* added test yml

* add align meta yaml

* add hisat2 align to pytest

* remove need for splice data by calling process

* add hisat2 align se test

* add hisat2 align pe test

* update names hisat2 align

* update software pytest for using multiple modules

* remove splice site test data since using module instead

* remove splice site from config since using module instead

* fixed extra brace

* added hisat2 align test.yml

* removed md5sum for bam files

* updated build md5sums

* Apply suggestions from code review

Co-authored-by: Nicholas TODA <nicholas.toda@mnhn.fr>
Co-authored-by: Harshil Patel <drpatelh@users.noreply.github.com>
This commit is contained in:
Nicholas Toda 2021-03-25 22:08:46 +01:00 committed by GitHub
parent fead37d57a
commit e526eae472
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 295 additions and 1 deletions

View file

@ -0,0 +1,57 @@
# Module metadata for hisat2_align: describes the wrapped tool, the inputs the
# module takes (meta map, reads, index, splice sites) and the files it emits.
name: hisat2_align
description: Align RNA-Seq reads to a reference with HISAT2
keywords:
  - align
  - fasta
  - genome
  - reference
tools:
  - hisat2:
      description: HISAT2 is a fast and sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes as well as to a single reference genome.
      homepage: https://daehwankimlab.github.io/hisat2/
      documentation: https://daehwankimlab.github.io/hisat2/manual/
      doi: "10.1038/s41587-019-0201-4"
      licence: ['MIT']
input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - reads:
      type: file
      description: |
        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
        respectively.
  - index:
      type: file
      description: HISAT2 genome index file
      pattern: "*.ht2"
  # Text file of splice sites, as produced from the GTF by the
  # hisat2_extractsplicesites module.
  - splicesites:
      type: file
      description: Splice sites extracted from the gtf file
      pattern: "*.{txt}"
output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', single_end:false ]
  - bam:
      type: file
      description: Output BAM file containing read alignments
      pattern: "*.{bam}"
  - summary:
      type: file
      description: Alignment log
      pattern: "*.log"
  - version:
      type: file
      description: File containing software version
      pattern: "*.{version.txt}"
authors:
  - "@ntoda03"

View file

@ -0,0 +1,42 @@
# Module metadata for hisat2_build: describes the wrapped tool, the reference
# inputs (fasta, gtf, splice sites) and the index files the module emits.
name: hisat2_build
description: Builds HISAT2 index for reference genome
keywords:
  - build
  - index
  - fasta
  - genome
  - reference
tools:
  - hisat2:
      description: HISAT2 is a fast and sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes as well as to a single reference genome.
      homepage: https://daehwankimlab.github.io/hisat2/
      documentation: https://daehwankimlab.github.io/hisat2/manual/
      doi: "10.1038/s41587-019-0201-4"
      licence: ['MIT']
input:
  - fasta:
      type: file
      description: Reference fasta file
      pattern: "*.{fa,fasta,fna}"
  - gtf:
      type: file
      description: Reference gtf annotation file
      pattern: "*.{gtf}"
  # Text file of splice sites, as produced from the GTF by the
  # hisat2_extractsplicesites module.
  - splicesites:
      type: file
      description: Splice sites extracted from the gtf file
      pattern: "*.{txt}"
output:
  - version:
      type: file
      description: File containing software version
      pattern: "*.{version.txt}"
  - index:
      type: file
      description: HISAT2 genome index file
      pattern: "*.ht2"
authors:
  - "@ntoda03"

View file

@ -0,0 +1,34 @@
# Module metadata for hisat2_extractsplicesites: takes a GTF annotation and
# emits a splice-sites text file plus the software version file.
name: hisat2_extractsplicesites
description: Extracts splice sites from a gtf file
keywords:
  - splicing
  - gtf
  - genome
  - reference
tools:
  - hisat2:
      description: HISAT2 is a fast and sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes as well as to a single reference genome.
      homepage: https://daehwankimlab.github.io/hisat2/
      documentation: https://daehwankimlab.github.io/hisat2/manual/
      doi: "10.1038/s41587-019-0201-4"
      licence: ['MIT']
input:
  - gtf:
      type: file
      description: Reference gtf annotation file
      pattern: "*.{gtf}"
output:
  - version:
      type: file
      description: File containing software version
      pattern: "*.{version.txt}"
  - splicesites:
      type: file
      description: Splice sites extracted from the gtf file
      pattern: "*.{splice_sites.txt}"
authors:
  - "@ntoda03"

View file

@ -234,6 +234,21 @@ gunzip:
- software/gunzip/**
- tests/software/gunzip/**
# The align test workflows include the extractsplicesites and build modules
# alongside align, so all three module directories are listed here.
hisat2_align:
- software/hisat2/align/**
- software/hisat2/build/**
- software/hisat2/extractsplicesites/**
- tests/software/hisat2/align/**
# The build test workflow also includes extractsplicesites. Its test directory
# is named build_test (not build), matching the path used by the build test's
# nextflow command.
hisat2_build:
- software/hisat2/build/**
- software/hisat2/extractsplicesites/**
- tests/software/hisat2/build_test/**
# extractsplicesites is tested on its own and lists only its own paths.
hisat2_extractsplicesites:
- software/hisat2/extractsplicesites/**
- tests/software/hisat2/extractsplicesites/**
homer_annotatepeaks:
- software/homer/annotatepeaks/**
- tests/software/homer/annotatepeaks/**

View file

@ -0,0 +1,32 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { HISAT2_EXTRACTSPLICESITES } from '../../../../software/hisat2/extractsplicesites/main.nf' addParams( options: [:] )
include { HISAT2_BUILD } from '../../../../software/hisat2/build/main.nf' addParams( options: [:] )
include { HISAT2_ALIGN } from '../../../../software/hisat2/align/main.nf' addParams( options: [:] )
workflow test_hisat2_align_single_end {
    // Single-end sample: meta map plus a one-element list of FastQ files
    sample_meta = [ id:'test', single_end:true ]
    fastqs      = [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ]

    // Reference genome and its annotation
    genome_fasta = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
    genome_gtf   = file(params.test_data['sarscov2']['genome']['genome_gtf'], checkIfExists: true)

    // Splice sites are produced by the module itself and feed both build and align
    HISAT2_EXTRACTSPLICESITES ( genome_gtf )
    HISAT2_BUILD ( genome_fasta, genome_gtf, HISAT2_EXTRACTSPLICESITES.out.txt )
    HISAT2_ALIGN ( [ sample_meta, fastqs ], HISAT2_BUILD.out.index, HISAT2_EXTRACTSPLICESITES.out.txt )
}
workflow test_hisat2_align_paired_end {
    // Paired-end sample: meta map plus a two-element list of FastQ files (R1, R2)
    sample_meta = [ id:'test', single_end:false ]
    fastqs      = [
        file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true),
        file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true)
    ]

    // Reference genome and its annotation
    genome_fasta = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
    genome_gtf   = file(params.test_data['sarscov2']['genome']['genome_gtf'], checkIfExists: true)

    // Splice sites are produced by the module itself and feed both build and align
    HISAT2_EXTRACTSPLICESITES ( genome_gtf )
    HISAT2_BUILD ( genome_fasta, genome_gtf, HISAT2_EXTRACTSPLICESITES.out.txt )
    HISAT2_ALIGN ( [ sample_meta, fastqs ], HISAT2_BUILD.out.index, HISAT2_EXTRACTSPLICESITES.out.txt )
}

View file

@ -0,0 +1,57 @@
# Single-end alignment test case: runs the test_hisat2_align_single_end entry
# workflow and lists the output files expected from it.
- name: hisat2 align test_hisat2_align_single_end
command: nextflow run tests/software/hisat2/align -entry test_hisat2_align_single_end -c tests/config/nextflow.config
tags:
- hisat2_align_single_end
- hisat2_align
- hisat2
files:
- path: output/hisat2/test.hisat2.summary.log
md5sum: 7b8a9e61b7646da1089b041333c41a87
# NOTE(review): d41d8cd98f00b204e9800998ecf8427e is the MD5 of zero-byte input,
# so this entry asserts an empty splice-sites file — confirm that is expected
# for the test GTF.
- path: output/hisat2/genome.splice_sites.txt
md5sum: d41d8cd98f00b204e9800998ecf8427e
# No md5sum is given for the BAM, only its path (presumably because the BAM
# bytes are not reproducible between runs — confirm).
- path: output/hisat2/test.bam
# Index files written by the build step; these checksums are the same ones
# listed for the paired-end and build tests.
- path: output/index/hisat2/genome.5.ht2
md5sum: 91198831aaba993acac1734138c5f173
- path: output/index/hisat2/genome.7.ht2
md5sum: 9013eccd91ad614d7893c739275a394f
- path: output/index/hisat2/genome.1.ht2
md5sum: 3ea3dc41304941ad8d047e4d71b4899e
- path: output/index/hisat2/genome.2.ht2
md5sum: 47b153cd1319abc88dda532462651fcf
- path: output/index/hisat2/genome.6.ht2
md5sum: 265e1284ce85686516fae5d35540994a
- path: output/index/hisat2/genome.3.ht2
md5sum: 4ed93abba181d8dfab2e303e33114777
- path: output/index/hisat2/genome.8.ht2
md5sum: 33cdeccccebe80329f1fdbee7f5874cb
- path: output/index/hisat2/genome.4.ht2
md5sum: c25be5f8b0378abf7a58c8a880b87626
# Paired-end alignment test case: runs the test_hisat2_align_paired_end entry
# workflow and lists the output files expected from it.
- name: hisat2 align test_hisat2_align_paired_end
command: nextflow run tests/software/hisat2/align -entry test_hisat2_align_paired_end -c tests/config/nextflow.config
tags:
- hisat2_align_paired_end
- hisat2_align
- hisat2
files:
- path: output/hisat2/test.hisat2.summary.log
md5sum: 9839b31db795958cc4b70711a3414e9c
# NOTE(review): d41d8cd98f00b204e9800998ecf8427e is the MD5 of zero-byte input,
# so this entry asserts an empty splice-sites file — confirm that is expected
# for the test GTF.
- path: output/hisat2/genome.splice_sites.txt
md5sum: d41d8cd98f00b204e9800998ecf8427e
# No md5sum is given for the BAM, only its path (presumably because the BAM
# bytes are not reproducible between runs — confirm).
- path: output/hisat2/test.bam
# Index files written by the build step; these checksums are the same ones
# listed for the single-end and build tests.
- path: output/index/hisat2/genome.5.ht2
md5sum: 91198831aaba993acac1734138c5f173
- path: output/index/hisat2/genome.7.ht2
md5sum: 9013eccd91ad614d7893c739275a394f
- path: output/index/hisat2/genome.1.ht2
md5sum: 3ea3dc41304941ad8d047e4d71b4899e
- path: output/index/hisat2/genome.2.ht2
md5sum: 47b153cd1319abc88dda532462651fcf
- path: output/index/hisat2/genome.6.ht2
md5sum: 265e1284ce85686516fae5d35540994a
- path: output/index/hisat2/genome.3.ht2
md5sum: 4ed93abba181d8dfab2e303e33114777
- path: output/index/hisat2/genome.8.ht2
md5sum: 33cdeccccebe80329f1fdbee7f5874cb
- path: output/index/hisat2/genome.4.ht2
md5sum: c25be5f8b0378abf7a58c8a880b87626

View file

@ -0,0 +1,14 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { HISAT2_EXTRACTSPLICESITES } from '../../../../software/hisat2/extractsplicesites/main.nf' addParams( options: [:] )
include { HISAT2_BUILD } from '../../../../software/hisat2/build/main.nf' addParams( options: [:] )
workflow test_hisat2_build {
    // Reference genome and its annotation
    genome_fasta = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
    genome_gtf   = file(params.test_data['sarscov2']['genome']['genome_gtf'], checkIfExists: true)

    // Splice sites come from the extraction module rather than from static test data
    HISAT2_EXTRACTSPLICESITES ( genome_gtf )
    HISAT2_BUILD ( genome_fasta, genome_gtf, HISAT2_EXTRACTSPLICESITES.out.txt )
}

View file

@ -0,0 +1,24 @@
# Index build test case: runs the test_hisat2_build entry workflow from the
# build_test directory and lists the output files expected from it.
- name: hisat2 build test_hisat2_build
command: nextflow run tests/software/hisat2/build_test -entry test_hisat2_build -c tests/config/nextflow.config
tags:
- hisat2
- hisat2_build
files:
# NOTE(review): d41d8cd98f00b204e9800998ecf8427e is the MD5 of zero-byte input,
# so this entry asserts an empty splice-sites file — confirm that is expected
# for the test GTF.
- path: output/hisat2/genome.splice_sites.txt
md5sum: d41d8cd98f00b204e9800998ecf8427e
# The eight .ht2 index files written by the build step.
- path: output/index/hisat2/genome.5.ht2
md5sum: 91198831aaba993acac1734138c5f173
- path: output/index/hisat2/genome.7.ht2
md5sum: 9013eccd91ad614d7893c739275a394f
- path: output/index/hisat2/genome.1.ht2
md5sum: 3ea3dc41304941ad8d047e4d71b4899e
- path: output/index/hisat2/genome.2.ht2
md5sum: 47b153cd1319abc88dda532462651fcf
- path: output/index/hisat2/genome.6.ht2
md5sum: 265e1284ce85686516fae5d35540994a
- path: output/index/hisat2/genome.3.ht2
md5sum: 4ed93abba181d8dfab2e303e33114777
- path: output/index/hisat2/genome.8.ht2
md5sum: 33cdeccccebe80329f1fdbee7f5874cb
- path: output/index/hisat2/genome.4.ht2
md5sum: c25be5f8b0378abf7a58c8a880b87626

View file

@ -0,0 +1,11 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { HISAT2_EXTRACTSPLICESITES } from '../../../../software/hisat2/extractsplicesites/main.nf' addParams( options: [:] )
workflow test_hisat2_extractsplicesites {
    // Hand the test GTF annotation straight to the splice-site extraction module
    HISAT2_EXTRACTSPLICESITES (
        file(params.test_data['sarscov2']['genome']['genome_gtf'], checkIfExists: true)
    )
}

View file

@ -0,0 +1,8 @@
# Splice-site extraction test case: runs the test_hisat2_extractsplicesites
# entry workflow and lists the single output file expected from it.
- name: hisat2 extractsplicesites test_hisat2_extractsplicesites
command: nextflow run tests/software/hisat2/extractsplicesites -entry test_hisat2_extractsplicesites -c tests/config/nextflow.config
tags:
- hisat2
- hisat2_extractsplicesites
files:
# NOTE(review): d41d8cd98f00b204e9800998ecf8427e is the MD5 of zero-byte input,
# so this entry asserts an empty splice-sites file — confirm that is expected
# for the test GTF.
- path: output/hisat2/genome.splice_sites.txt
md5sum: d41d8cd98f00b204e9800998ecf8427e