Add tests for hisat2 (#366)

* initial commit hisat2/build * initial commit hisat2/build * changed names for hisat2 * fixed directory structure and args * added splice site test data * added splice site inputs * replaced list with individual args * fixed removed commas * added test yml file * updated hisat2 conda version * added meta.yml * added meta.yml description * added meta.yml inputs * added meta.yml outputs * update conda version for hisat2 * removed trailing whitespace meta.yml * fixed version number for containers * added test data to test config * updated for new test logic * fix pytest issue? * fix pytest issue * fixed wrong tool in meta.yaml * updated test.yaml name * handle build bug for testing * handle build bug for testing in yaml * moved test folder to fix build bug * use old hisat2 version to avoid conda giving inconsistent md5sum * initial commit * removed temp file * added meta yaml * add to pytest * added tests * added test yml * add align meta yaml * add hisat2 align to pytest * remove need for splice data by calling process * add hisat2 align se test * add hisat2 align pe test * update names hisat2 align * update software pytest for using multiple modules * remove splice site test data since using module instead * remove splice site from config since using module instead * fixed extra brace * added hisat2 align test.yml * removed md5sum for bam files * updated build md5sums * Apply suggestions from code review Co-authored-by: Nicholas TODA <nicholas.toda@mnhn.fr> Co-authored-by: Harshil Patel <drpatelh@users.noreply.github.com>
2024-12-22 02:58:17 +00:00 · 2021-03-25 22:08:46 +01:00 · 2021-03-25 22:08:46 +01:00 · e526eae472
commit e526eae472
parent fead37d57a
11 changed files with 295 additions and 1 deletions
--- a/software/hisat2/align/meta.yml
+++ b/software/hisat2/align/meta.yml
@ -0,0 +1,57 @@
+name: hisat2_align
+description: Align RNA-Seq reads to a reference with HISAT2
+keywords:
+  - align
+  - fasta
+  - genome
+  - reference
+
+tools:
+  - hisat2:
+      description: HISAT2 is a fast and sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes as well as to a single reference genome.
+      homepage: https://daehwankimlab.github.io/hisat2/
+      documentation: https://daehwankimlab.github.io/hisat2/manual/
+      doi: "10.1038/s41587-019-0201-4"
+      licence: ['MIT']
+
+input:  # the tests call the module as HISAT2_ALIGN ( [ meta, reads ], index, splicesites )
+  - meta:
+      type: map
+      description: |
+          Groovy Map containing sample information
+          e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: |
+          List of input FastQ files of size 1 and 2 for single-end and paired-end data,
+          respectively.
+  - index:
+      type: file
+      description: HISAT2 genome index file
+      pattern: "*.ht2"
+  - splicesites:
+      type: file
+      description: Splice sites extracted from the gtf file
+      pattern: "*.{txt}"
+
+output:
+  - meta:
+      type: map
+      description: |
+          Groovy Map containing sample information
+          e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: Output BAM file containing read alignments
+      pattern: "*.{bam}"
+  - summary:
+      type: file
+      description: Alignment log
+      pattern: "*.log"
+  - version:
+      type: file
+      description: File containing software version
+      pattern: "*.{version.txt}"
+
+authors:
+  - "@ntoda03"
--- a/software/hisat2/build/meta.yml
+++ b/software/hisat2/build/meta.yml
@ -0,0 +1,42 @@
+name: hisat2_build
+description: Builds HISAT2 index for reference genome
+keywords:
+  - build
+  - index
+  - fasta
+  - genome
+  - reference
+tools:
+  - hisat2:
+      description: HISAT2 is a fast and sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes as well as to a single reference genome.
+      homepage: https://daehwankimlab.github.io/hisat2/
+      documentation: https://daehwankimlab.github.io/hisat2/manual/
+      doi: "10.1038/s41587-019-0201-4"
+      licence: ['MIT']
+
+input:  # the tests call the module as HISAT2_BUILD ( fasta, gtf, splicesites )
+  - fasta:
+      type: file
+      description: Reference fasta file
+      pattern: "*.{fa,fasta,fna}"
+  - gtf:
+      type: file
+      description: Reference gtf annotation file
+      pattern: "*.{gtf}"
+  - splicesites:
+      type: file
+      description: Splice sites extracted from the gtf file
+      pattern: "*.{txt}"
+
+output:
+  - version:
+      type: file
+      description: File containing software version
+      pattern: "*.{version.txt}"
+  - index:
+      type: file
+      description: HISAT2 genome index file
+      pattern: "*.ht2"
+
+authors:
+  - "@ntoda03"
--- a/software/hisat2/extractsplicesites/meta.yml
+++ b/software/hisat2/extractsplicesites/meta.yml
@ -0,0 +1,34 @@
+name: hisat2_extractsplicesites
+description: Extracts splice sites from a gtf file
+keywords:
+  - splicing
+  - gtf
+  - genome
+  - reference
+
+tools:
+  - hisat2:
+      description: HISAT2 is a fast and sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes as well as to a single reference genome.
+      homepage: https://daehwankimlab.github.io/hisat2/
+      documentation: https://daehwankimlab.github.io/hisat2/manual/
+      doi: "10.1038/s41587-019-0201-4"
+      licence: ['MIT']
+
+input:  # the tests call the module as HISAT2_EXTRACTSPLICESITES ( gtf )
+  - gtf:
+      type: file
+      description: Reference gtf annotation file
+      pattern: "*.{gtf}"
+
+output:
+  - version:
+      type: file
+      description: File containing software version
+      pattern: "*.{version.txt}"
+  - splicesites:
+      type: file
+      description: Splice sites extracted from the gtf file
+      pattern: "*.{splice_sites.txt}"
+
+authors:
+  - "@ntoda03"
--- a/tests/config/pytest_software.yml
+++ b/tests/config/pytest_software.yml
@ -234,6 +234,21 @@ gunzip:
  - software/gunzip/**
  - tests/software/gunzip/**

+hisat2_align:
+  - software/hisat2/align/**
+  - software/hisat2/build/**  # the align tests build their index with HISAT2_BUILD
+  - software/hisat2/extractsplicesites/**  # ...and take splice sites from HISAT2_EXTRACTSPLICESITES
+  - tests/software/hisat2/align/**
+
+hisat2_build:
+  - software/hisat2/build/**
+  - software/hisat2/extractsplicesites/**  # the build test feeds HISAT2_BUILD from this module
+  - tests/software/hisat2/build_test/**
+
+hisat2_extractsplicesites:
+  - software/hisat2/extractsplicesites/**
+  - tests/software/hisat2/extractsplicesites/**
+
 homer_annotatepeaks:
  - software/homer/annotatepeaks/**
  - tests/software/homer/annotatepeaks/**
--- a/tests/config/test_data.config
+++ b/tests/config/test_data.config
@ -67,4 +67,4 @@ params {
            }
        }
    }
-}
+}
--- a/tests/software/hisat2/align/main.nf
+++ b/tests/software/hisat2/align/main.nf
@ -0,0 +1,32 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+include { HISAT2_EXTRACTSPLICESITES } from '../../../../software/hisat2/extractsplicesites/main.nf' addParams( options: [:] )
+include { HISAT2_BUILD } from '../../../../software/hisat2/build/main.nf' addParams( options: [:] )
+include { HISAT2_ALIGN } from '../../../../software/hisat2/align/main.nf' addParams( options: [:] )
+
+workflow test_hisat2_align_single_end {
+    // Single-end case: one FastQ file; the index and splice sites come from the HISAT2 modules called below
+    fastq_1      = file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true)
+    reads        = [ [ id:'test', single_end:true ], [ fastq_1 ] ] // [ meta map, [ reads ] ]
+    genome_fasta = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
+    genome_gtf   = file(params.test_data['sarscov2']['genome']['genome_gtf'], checkIfExists: true)
+
+    HISAT2_EXTRACTSPLICESITES ( genome_gtf )
+    HISAT2_BUILD ( genome_fasta, genome_gtf, HISAT2_EXTRACTSPLICESITES.out.txt )
+    HISAT2_ALIGN ( reads, HISAT2_BUILD.out.index, HISAT2_EXTRACTSPLICESITES.out.txt )
+}
+
+workflow test_hisat2_align_paired_end {
+    // Paired-end case: two FastQ mates; the index and splice sites come from the HISAT2 modules called below
+    fastq_1      = file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true)
+    fastq_2      = file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true)
+    reads        = [ [ id:'test', single_end:false ], [ fastq_1, fastq_2 ] ] // [ meta map, [ reads ] ]
+    genome_fasta = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
+    genome_gtf   = file(params.test_data['sarscov2']['genome']['genome_gtf'], checkIfExists: true)
+
+    HISAT2_EXTRACTSPLICESITES ( genome_gtf )
+    HISAT2_BUILD ( genome_fasta, genome_gtf, HISAT2_EXTRACTSPLICESITES.out.txt )
+    HISAT2_ALIGN ( reads, HISAT2_BUILD.out.index, HISAT2_EXTRACTSPLICESITES.out.txt )
+}
--- a/tests/software/hisat2/align/test.yml
+++ b/tests/software/hisat2/align/test.yml
@ -0,0 +1,57 @@
+- name: hisat2 align test_hisat2_align_single_end
+  command: nextflow run tests/software/hisat2/align -entry test_hisat2_align_single_end -c tests/config/nextflow.config
+  tags:
+    - hisat2
+    - hisat2_align
+    - hisat2_align_single_end
+  files:
+    - path: output/hisat2/genome.splice_sites.txt
+      md5sum: 'd41d8cd98f00b204e9800998ecf8427e'  # NOTE(review): MD5 of zero-byte input, so this only pins an empty file
+    - path: output/hisat2/test.bam  # no md5sum listed for the BAM
+    - path: output/hisat2/test.hisat2.summary.log
+      md5sum: '7b8a9e61b7646da1089b041333c41a87'
+    - path: output/index/hisat2/genome.1.ht2
+      md5sum: '3ea3dc41304941ad8d047e4d71b4899e'
+    - path: output/index/hisat2/genome.2.ht2
+      md5sum: '47b153cd1319abc88dda532462651fcf'
+    - path: output/index/hisat2/genome.3.ht2
+      md5sum: '4ed93abba181d8dfab2e303e33114777'
+    - path: output/index/hisat2/genome.4.ht2
+      md5sum: 'c25be5f8b0378abf7a58c8a880b87626'
+    - path: output/index/hisat2/genome.5.ht2
+      md5sum: '91198831aaba993acac1734138c5f173'
+    - path: output/index/hisat2/genome.6.ht2
+      md5sum: '265e1284ce85686516fae5d35540994a'
+    - path: output/index/hisat2/genome.7.ht2
+      md5sum: '9013eccd91ad614d7893c739275a394f'
+    - path: output/index/hisat2/genome.8.ht2
+      md5sum: '33cdeccccebe80329f1fdbee7f5874cb'
+
+- name: hisat2 align test_hisat2_align_paired_end
+  command: nextflow run tests/software/hisat2/align -entry test_hisat2_align_paired_end -c tests/config/nextflow.config
+  tags:
+    - hisat2
+    - hisat2_align
+    - hisat2_align_paired_end
+  files:
+    - path: output/hisat2/genome.splice_sites.txt
+      md5sum: 'd41d8cd98f00b204e9800998ecf8427e'  # NOTE(review): MD5 of zero-byte input, so this only pins an empty file
+    - path: output/hisat2/test.bam  # no md5sum listed for the BAM
+    - path: output/hisat2/test.hisat2.summary.log
+      md5sum: '9839b31db795958cc4b70711a3414e9c'
+    - path: output/index/hisat2/genome.1.ht2
+      md5sum: '3ea3dc41304941ad8d047e4d71b4899e'
+    - path: output/index/hisat2/genome.2.ht2
+      md5sum: '47b153cd1319abc88dda532462651fcf'
+    - path: output/index/hisat2/genome.3.ht2
+      md5sum: '4ed93abba181d8dfab2e303e33114777'
+    - path: output/index/hisat2/genome.4.ht2
+      md5sum: 'c25be5f8b0378abf7a58c8a880b87626'
+    - path: output/index/hisat2/genome.5.ht2
+      md5sum: '91198831aaba993acac1734138c5f173'
+    - path: output/index/hisat2/genome.6.ht2
+      md5sum: '265e1284ce85686516fae5d35540994a'
+    - path: output/index/hisat2/genome.7.ht2
+      md5sum: '9013eccd91ad614d7893c739275a394f'
+    - path: output/index/hisat2/genome.8.ht2
+      md5sum: '33cdeccccebe80329f1fdbee7f5874cb'
--- a/tests/software/hisat2/build_test/main.nf
+++ b/tests/software/hisat2/build_test/main.nf
@ -0,0 +1,14 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+include { HISAT2_EXTRACTSPLICESITES } from '../../../../software/hisat2/extractsplicesites/main.nf' addParams( options: [:] )
+include { HISAT2_BUILD } from '../../../../software/hisat2/build/main.nf' addParams( options: [:] )
+
+workflow test_hisat2_build {
+    // Build the index from the test genome, passing splice sites produced by HISAT2_EXTRACTSPLICESITES
+    genome_fasta = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
+    genome_gtf   = file(params.test_data['sarscov2']['genome']['genome_gtf'], checkIfExists: true)
+
+    HISAT2_EXTRACTSPLICESITES ( genome_gtf )
+    HISAT2_BUILD ( genome_fasta, genome_gtf, HISAT2_EXTRACTSPLICESITES.out.txt )
+}
--- a/tests/software/hisat2/build_test/test.yml
+++ b/tests/software/hisat2/build_test/test.yml
@ -0,0 +1,24 @@
+- name: hisat2 build test_hisat2_build
+  command: nextflow run tests/software/hisat2/build_test -entry test_hisat2_build -c tests/config/nextflow.config
+  tags:
+    - hisat2_build
+    - hisat2
+  files:
+    - path: output/hisat2/genome.splice_sites.txt
+      md5sum: 'd41d8cd98f00b204e9800998ecf8427e'  # NOTE(review): MD5 of zero-byte input, so this only pins an empty file
+    - path: output/index/hisat2/genome.1.ht2
+      md5sum: '3ea3dc41304941ad8d047e4d71b4899e'
+    - path: output/index/hisat2/genome.2.ht2
+      md5sum: '47b153cd1319abc88dda532462651fcf'
+    - path: output/index/hisat2/genome.3.ht2
+      md5sum: '4ed93abba181d8dfab2e303e33114777'
+    - path: output/index/hisat2/genome.4.ht2
+      md5sum: 'c25be5f8b0378abf7a58c8a880b87626'
+    - path: output/index/hisat2/genome.5.ht2
+      md5sum: '91198831aaba993acac1734138c5f173'
+    - path: output/index/hisat2/genome.6.ht2
+      md5sum: '265e1284ce85686516fae5d35540994a'
+    - path: output/index/hisat2/genome.7.ht2
+      md5sum: '9013eccd91ad614d7893c739275a394f'
+    - path: output/index/hisat2/genome.8.ht2
+      md5sum: '33cdeccccebe80329f1fdbee7f5874cb'
--- a/tests/software/hisat2/extractsplicesites/main.nf
+++ b/tests/software/hisat2/extractsplicesites/main.nf
@ -0,0 +1,11 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+include { HISAT2_EXTRACTSPLICESITES } from '../../../../software/hisat2/extractsplicesites/main.nf' addParams( options: [:] )
+
+workflow test_hisat2_extractsplicesites {
+    // Run the splice-site extraction on the test genome annotation
+    annotation_gtf = file(params.test_data['sarscov2']['genome']['genome_gtf'], checkIfExists: true)
+    HISAT2_EXTRACTSPLICESITES ( annotation_gtf )
+}
--- a/tests/software/hisat2/extractsplicesites/test.yml
+++ b/tests/software/hisat2/extractsplicesites/test.yml
@ -0,0 +1,8 @@
+- name: hisat2 extractsplicesites test_hisat2_extractsplicesites
+  command: nextflow run tests/software/hisat2/extractsplicesites -entry test_hisat2_extractsplicesites -c tests/config/nextflow.config
+  tags:
+    - hisat2_extractsplicesites
+    - hisat2
+  files:
+    - path: output/hisat2/genome.splice_sites.txt
+      md5sum: 'd41d8cd98f00b204e9800998ecf8427e'  # NOTE(review): MD5 of zero-byte input, so this only pins an empty file