New module: scramble (#2015)

* added scramble/clusteridentifier * linting * added cluster_analysis * added a comment to the mei ref * added reference comments * linting
2024-12-30 11:12:10 -05:00 · 2022-09-07 13:19:18 +02:00 · 2022-09-07 13:19:18 +02:00 · f2264c1052
commit f2264c1052
parent e726b1730d
12 changed files with 373 additions and 0 deletions
--- a/modules/scramble/clusteranalysis/main.nf
+++ b/modules/scramble/clusteranalysis/main.nf
@ -0,0 +1,53 @@
+// Runs SCRAMble's cluster analysis script (SCRAMble.R) on a soft-clipped cluster
+// file and emits whichever of the MEI table, predicted-deletion table and VCF
+// the tool writes for the options passed through `task.ext.args`.
+//
+// Inputs:
+//   meta, clusters - sample meta map and the cluster file to analyse
+//   fasta          - optional reference fasta (pass [] to omit); forwarded as --ref
+//                    and used to build the BLAST database when --eval-dels is set
+//   mei_ref        - optional MEI reference fasta (pass [] to use the file
+//                    bundled inside the container)
+process SCRAMBLE_CLUSTERANALYSIS {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda (params.enable_conda ? "bioconda::scramble=1.0.1" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/scramble:1.0.1--h779adbc_1':
+        'quay.io/biocontainers/scramble:1.0.1--h779adbc_1' }"
+
+    input:
+    tuple val(meta), path(clusters)
+    path fasta
+    path mei_ref
+
+    output:
+    // All result files are optional: which ones exist depends on the options in ext.args
+    tuple val(meta), path("*_MEIs.txt")                 , optional:true, emit: meis_tab
+    tuple val(meta), path("*_PredictedDeletions.txt")   , optional:true, emit: dels_tab
+    tuple val(meta), path("*.vcf")                      , optional:true, emit: vcf
+    path "versions.yml"                                 , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def VERSION = '1.0.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+
+    // Deletion evaluation builds a BLAST database from the reference, so fail
+    // early with a clear message instead of rendering `makeblastdb -in ` with an
+    // empty file name when no fasta was supplied.
+    def eval_dels = args.contains("--eval-dels")
+    if (eval_dels && !fasta) {
+        error "SCRAMBLE_CLUSTERANALYSIS: a reference fasta is required when '--eval-dels' is set in ext.args"
+    }
+
+    // The BLAST database is written next to the staged fasta, using the fasta name as its prefix
+    def blastdb = eval_dels ? "makeblastdb -in ${fasta} -parse_seqids -title ${fasta} -dbtype nucl -out ${fasta}" : ""
+    def reference = fasta ? "--ref `pwd`/${fasta}" : ""
+
+    // The default file for the MEI reference is a file that's inside the container
+    def mei_reference = mei_ref ? "`pwd`/${mei_ref}" : "/usr/local/share/scramble/resources/MEI_consensus_seqs.fa"
+
+    // makeblastdb is only reported in versions.yml when it is actually run
+    def blastdb_version = eval_dels ? "makeblastdb: \$(echo \$(makeblastdb -version 2>&1) | head -n 1 | sed 's/^makeblastdb: //; s/+ Package.*\$//')" : ""
+    """
+    ${blastdb}
+
+    Rscript --vanilla /usr/local/share/scramble/bin/SCRAMble.R \\
+        --install-dir /usr/local/share/scramble/bin \\
+        ${args} \\
+        --cluster-file `pwd`/${clusters} \\
+        ${reference} \\
+        --mei-refs ${mei_reference} \\
+        --out-name `pwd`/${prefix}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        scramble: ${VERSION}
+        ${blastdb_version}
+    END_VERSIONS
+    """
+}
--- a/modules/scramble/clusteranalysis/meta.yml
+++ b/modules/scramble/clusteranalysis/meta.yml
@ -0,0 +1,58 @@
+# Module metadata for SCRAMBLE_CLUSTERANALYSIS: describes the tool, the three
+# process inputs (clusters, fasta, mei_ref) and the emitted output channels.
+name: "scramble_clusteranalysis"
+description: The Cluster Analysis tool of Scramble analyses and interprets the soft-clipped clusters found by `cluster_identifier`
+keywords:
+  - soft-clipped clusters
+  - scramble
+tools:
+  - "scramble":
+      description: "Soft Clipped Read Alignment Mapper"
+      homepage: "https://github.com/GeneDx/scramble"
+      documentation: "https://github.com/GeneDx/scramble"
+      tool_dev_url: "https://github.com/GeneDx/scramble"
+      doi: ""
+      # A real YAML sequence rather than a quoted string that merely looks like a list
+      licence: ["CC"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - clusters:
+      type: file
+      description: Tab-delimited text file containing soft-clipped clusters. Has to be generated using scramble/clusteridentifier
+      pattern: "*clusters.txt"
+  - fasta:
+      type: file
+      description: Optional fasta reference file. This file is needed to create a VCF file and to evaluate predicted deletions.
+      pattern: "*.{fasta,fa}"
+  - mei_ref:
+      type: file
+      description: Optional fasta file containing the MEI reference. This file should only be supplied in special occasions where the default isn't correct
+      pattern: "*.{fasta,fa}"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - meis_tab:
+      type: file
+      description: Tab-delimited text file containing MEI calls
+      pattern: "*_MEIs.txt"
+  - dels_tab:
+      type: file
+      description: Tab-delimited text file containing predicted deletions
+      pattern: "*_PredictedDeletions.txt"
+  - vcf:
+      type: file
+      description: A VCF file containing the MEI calls and/or the predicted deletions (depending on the given arguments)
+      pattern: "*.vcf"
+
+authors:
+  - "@nvnieuwk"
--- a/modules/scramble/clusteridentifier/main.nf
+++ b/modules/scramble/clusteridentifier/main.nf
@ -0,0 +1,48 @@
+// Runs SCRAMble's `cluster_identifier` on an alignment file and captures its
+// standard output as `<prefix>.clusters.txt`.
+//
+// Inputs:
+//   meta, input, input_index - sample meta map, the alignment file and its index.
+//                              The index is staged but never named on the command
+//                              line (presumably the tool finds it next to the
+//                              alignment file - confirm).
+//   fasta                    - optional reference fasta (pass [] to omit); when
+//                              given, an md5 reference cache is built from it.
+process SCRAMBLE_CLUSTERIDENTIFIER {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda (params.enable_conda ? "bioconda::scramble=1.0.1" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/scramble:1.0.1--h779adbc_1':
+        'quay.io/biocontainers/scramble:1.0.1--h779adbc_1' }"
+
+    input:
+    tuple val(meta), path(input), path(input_index)
+    path fasta
+
+    output:
+    tuple val(meta), path("*.clusters.txt") , emit: clusters
+    path "versions.yml"                     , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def VERSION = '1.0.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+
+    // The tool offers no option to specify the reference file for CRAM input.
+    // It only looks in the CRAM header for the reference location, which often
+    // cannot be resolved because the data was created on another machine.
+    // Workaround: build an md5 cache of the supplied fasta with samtools'
+    // seq_cache_populate.pl and point the REF_PATH environment variable at it,
+    // so that the tool resolves the correct reference.
+    // Upstream issue: https://github.com/GeneDx/scramble/issues/27
+    // The reference code is a placeholder until this issue has been fixed.
+    // NOTE(review): the helper script is downloaded at run time from the samtools
+    // `master` branch, so this step needs network access and is not pinned to a
+    // fixed version of the script.
+    def reference = fasta ? "wget https://raw.githubusercontent.com/samtools/samtools/master/misc/seq_cache_populate.pl && perl seq_cache_populate.pl -root ./md5_ref ${fasta} && export REF_PATH=`pwd`/md5_ref/%2s/%2s/%s" : ""
+    """
+    ${reference}
+
+    cluster_identifier \\
+        ${args} \\
+        ${input} \\
+        > ${prefix}.clusters.txt
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        scramble: ${VERSION}
+    END_VERSIONS
+    """
+}
--- a/modules/scramble/clusteridentifier/meta.yml
+++ b/modules/scramble/clusteridentifier/meta.yml
@ -0,0 +1,51 @@
+# Module metadata for SCRAMBLE_CLUSTERIDENTIFIER: describes the tool, the
+# alignment/index/reference inputs and the emitted clusters file.
+name: "scramble_clusteridentifier"
+description: The cluster_identifier tool of Scramble identifies soft clipped clusters
+keywords:
+  - bam
+  - cram
+  - soft-clipped clusters
+tools:
+  - "scramble":
+      description: "Soft Clipped Read Alignment Mapper"
+      homepage: "https://github.com/GeneDx/scramble"
+      documentation: "https://github.com/GeneDx/scramble"
+      tool_dev_url: "https://github.com/GeneDx/scramble"
+      doi: ""
+      # A real YAML sequence rather than a quoted string that merely looks like a list
+      licence: ["CC"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - input:
+      type: file
+      description: BAM/CRAM file
+      pattern: "*.{bam,cram}"
+  - input_index:
+      type: file
+      description: Index of the BAM/CRAM file
+      pattern: "*.{bai,crai}"
+  - fasta:
+      type: file
+      description: The reference FASTA file (mandatory when using CRAM files)
+      pattern: "*.{fasta,fa}"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - clusters:
+      type: file
+      description: Tab-delimited file containing the soft-clipped clusters
+      pattern: "*.clusters.txt"
+
+authors:
+  - "@nvnieuwk"
--- a/tests/config/pytest_modules.yml
+++ b/tests/config/pytest_modules.yml
@ -2047,6 +2047,14 @@ scoary:
  - modules/scoary/**
  - tests/modules/scoary/**

+scramble/clusteranalysis:
+  - modules/scramble/clusteranalysis/**
+  - tests/modules/scramble/clusteranalysis/**
+
+scramble/clusteridentifier:
+  - modules/scramble/clusteridentifier/**
+  - tests/modules/scramble/clusteridentifier/**
+
 seacr/callpeak:
  - modules/seacr/callpeak/**
  - tests/modules/seacr/callpeak/**
--- a/tests/config/test_data.config
+++ b/tests/config/test_data.config
@ -368,6 +368,15 @@ params {
                genemodel2                      = "${test_data_dir}/genomics/homo_sapiens/pacbio/bed/alz.ccs.fl.NEB_5p--NEB_Clontech_3p.flnc.clustered.singletons.merged.aligned_tc.2.bed"
                filelist                        = "${test_data_dir}/genomics/homo_sapiens/pacbio/txt/filelist.txt"
            }
+            'scramble' {
+                fasta                           = "${test_data_dir}/genomics/homo_sapiens/scramble/test.fa"
+                fasta_fai                       = "${test_data_dir}/genomics/homo_sapiens/scramble/test.fa.fai"
+                bam                             = "${test_data_dir}/genomics/homo_sapiens/scramble/test.bam"
+                bam_bai                         = "${test_data_dir}/genomics/homo_sapiens/scramble/test.bam.bai"
+                cram                            = "${test_data_dir}/genomics/homo_sapiens/scramble/test.cram"
+                cram_crai                       = "${test_data_dir}/genomics/homo_sapiens/scramble/test.cram.crai"
+                bed                             = "${test_data_dir}/genomics/homo_sapiens/scramble/test.bed"
+            }
        }
        'bacteroides_fragilis' {
            'genome' {
--- a/tests/modules/scramble/clusteranalysis/main.nf
+++ b/tests/modules/scramble/clusteranalysis/main.nf
@ -0,0 +1,54 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+include { SCRAMBLE_CLUSTERANALYSIS   } from '../../../../modules/scramble/clusteranalysis/main.nf'
+include { SCRAMBLE_CLUSTERIDENTIFIER } from '../../../../modules/scramble/clusteridentifier/main.nf'
+
+// Test entry: BAM input with no reference fasta and no custom MEI reference.
+// Clusters are identified first and then passed on to the cluster analysis.
+workflow test_scramble_clusteranalysis {
+
+    // Exactly three elements, matching `tuple val(meta), path(input), path(input_index)`
+    input = [
+        [ id:'test', single_end:false ], // meta map
+        file(params.test_data['homo_sapiens']['scramble']['bam'], checkIfExists: true),
+        file(params.test_data['homo_sapiens']['scramble']['bam_bai'], checkIfExists: true)
+    ]
+
+    // Empty lists stand in for the optional path inputs
+    fasta = []
+    mei_ref = []
+
+    SCRAMBLE_CLUSTERIDENTIFIER(
+        input,
+        fasta
+    )
+
+    SCRAMBLE_CLUSTERANALYSIS (
+        SCRAMBLE_CLUSTERIDENTIFIER.out.clusters,
+        fasta,
+        mei_ref
+    )
+}
+
+// Test entry: CRAM input with a reference fasta and no custom MEI reference.
+// The same fasta is given to both the cluster identifier and the cluster analysis.
+workflow test_scramble_clusteranalysis_fasta {
+
+    // Exactly three elements, matching `tuple val(meta), path(input), path(input_index)`
+    input = [
+        [ id:'test', single_end:false ], // meta map
+        file(params.test_data['homo_sapiens']['scramble']['cram'], checkIfExists: true),
+        file(params.test_data['homo_sapiens']['scramble']['cram_crai'], checkIfExists: true)
+    ]
+
+    fasta = file(params.test_data['homo_sapiens']['scramble']['fasta'], checkIfExists: true)
+    // Empty list stands in for the optional MEI reference
+    mei_ref = []
+
+    SCRAMBLE_CLUSTERIDENTIFIER(
+        input,
+        fasta
+    )
+
+    SCRAMBLE_CLUSTERANALYSIS (
+        SCRAMBLE_CLUSTERIDENTIFIER.out.clusters,
+        fasta,
+        mei_ref
+    )
+}
--- a/tests/modules/scramble/clusteranalysis/nextflow.config
+++ b/tests/modules/scramble/clusteranalysis/nextflow.config
@ -0,0 +1,12 @@
+// Test configuration for the scramble/clusteranalysis module tests.
+process {
+
+    // Publish into a directory named after the first '_'-separated token of the
+    // last ':'-separated part of the process name, lower-cased
+    // (e.g. "...:SCRAMBLE_CLUSTERANALYSIS" -> "scramble").
+    publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
+    
+    // Test without a reference: only evaluate MEIs
+    withName: "test_scramble_clusteranalysis:SCRAMBLE_CLUSTERANALYSIS" {
+        ext.args = "--eval-meis"
+    }
+
+    // Test with a reference fasta: evaluate both MEIs and deletions
+    withName: "test_scramble_clusteranalysis_fasta:SCRAMBLE_CLUSTERANALYSIS" {
+        ext.args = "--eval-meis --eval-dels"
+    }
+}
--- a/tests/modules/scramble/clusteranalysis/test.yml
+++ b/tests/modules/scramble/clusteranalysis/test.yml
@ -0,0 +1,25 @@
+# Expected outputs for the scramble/clusteranalysis test workflows.
+# Test without a reference: clusters file and MEI table are compared by md5sum.
+- name: scramble clusteranalysis test_scramble_clusteranalysis
+  command: nextflow run ./tests/modules/scramble/clusteranalysis -entry test_scramble_clusteranalysis -c ./tests/config/nextflow.config  -c ./tests/modules/scramble/clusteranalysis/nextflow.config
+  tags:
+    - scramble/clusteranalysis
+    - scramble
+  files:
+    - path: output/scramble/test.clusters.txt
+      md5sum: 9b2777a44bfbcff8fac1bf67c3985f1f
+    - path: output/scramble/test_MEIs.txt
+      md5sum: a14c40c7e5f3630defde68ae1de51bca
+
+# Test with a reference fasta: additionally expects the VCF and the predicted deletions table.
+- name: scramble clusteranalysis test_scramble_clusteranalysis_fasta
+  command: nextflow run ./tests/modules/scramble/clusteranalysis -entry test_scramble_clusteranalysis_fasta -c ./tests/config/nextflow.config  -c ./tests/modules/scramble/clusteranalysis/nextflow.config
+  tags:
+    - scramble/clusteranalysis
+    - scramble
+  files:
+    - path: output/scramble/test.clusters.txt
+      md5sum: 9b2777a44bfbcff8fac1bf67c3985f1f
+    # The VCF is checked by content instead of md5sum
+    # (presumably its full content is not stable between runs - confirm)
+    - path: output/scramble/test.vcf
+      contains: [fileformat=VCFv4.2]
+    - path: output/scramble/test_MEIs.txt
+      md5sum: a14c40c7e5f3630defde68ae1de51bca
+    - path: output/scramble/test_PredictedDeletions.txt
+      md5sum: 1fa0d3d0a58fdf81bd259b3c71774ba8
--- a/tests/modules/scramble/clusteridentifier/main.nf
+++ b/tests/modules/scramble/clusteridentifier/main.nf
@ -0,0 +1,33 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+include { SCRAMBLE_CLUSTERIDENTIFIER } from '../../../../modules/scramble/clusteridentifier/main.nf'
+
+// Test entry: identify soft-clipped clusters from a BAM file without a reference fasta.
+workflow test_scramble_clusteridentifier_bam {
+
+    // Exactly three elements, matching `tuple val(meta), path(input), path(input_index)`
+    input = [
+        [ id:'test', single_end:false ], // meta map
+        file(params.test_data['homo_sapiens']['scramble']['bam'], checkIfExists: true),
+        file(params.test_data['homo_sapiens']['scramble']['bam_bai'], checkIfExists: true)
+    ]
+
+    // Empty list stands in for the optional reference fasta
+    fasta = []
+
+    SCRAMBLE_CLUSTERIDENTIFIER ( input, fasta )
+}
+
+// Test entry: identify soft-clipped clusters from a CRAM file, supplying the reference fasta.
+workflow test_scramble_clusteridentifier_cram {
+
+    // Exactly three elements, matching `tuple val(meta), path(input), path(input_index)`
+    input = [
+        [ id:'test', single_end:false ], // meta map
+        file(params.test_data['homo_sapiens']['scramble']['cram'], checkIfExists: true),
+        file(params.test_data['homo_sapiens']['scramble']['cram_crai'], checkIfExists: true)
+    ]
+
+    fasta = file(params.test_data['homo_sapiens']['scramble']['fasta'], checkIfExists: true)
+
+    SCRAMBLE_CLUSTERIDENTIFIER ( input, fasta )
+}
--- a/tests/modules/scramble/clusteridentifier/nextflow.config
+++ b/tests/modules/scramble/clusteridentifier/nextflow.config
@ -0,0 +1,5 @@
+// Test configuration for the scramble/clusteridentifier module tests.
+process {
+
+    // Publish into a directory named after the first '_'-separated token of the
+    // last ':'-separated part of the process name, lower-cased
+    // (e.g. "...:SCRAMBLE_CLUSTERIDENTIFIER" -> "scramble").
+    publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
+    
+}
--- a/tests/modules/scramble/clusteridentifier/test.yml
+++ b/tests/modules/scramble/clusteridentifier/test.yml
@ -0,0 +1,17 @@
+# Expected outputs for the scramble/clusteridentifier test workflows.
+# Both entries expect a clusters file with the same md5sum.
+- name: scramble clusteridentifier test_scramble_clusteridentifier_bam
+  command: nextflow run ./tests/modules/scramble/clusteridentifier -entry test_scramble_clusteridentifier_bam -c ./tests/config/nextflow.config -c ./tests/modules/scramble/clusteridentifier/nextflow.config
+  tags:
+    - scramble/clusteridentifier
+    - scramble
+  files:
+    - path: output/scramble/test.clusters.txt
+      md5sum: 9b2777a44bfbcff8fac1bf67c3985f1f
+
+# CRAM variant; the workflow passes the reference fasta to the module.
+- name: scramble clusteridentifier test_scramble_clusteridentifier_cram
+  command: nextflow run ./tests/modules/scramble/clusteridentifier -entry test_scramble_clusteridentifier_cram -c ./tests/config/nextflow.config -c ./tests/modules/scramble/clusteridentifier/nextflow.config
+  tags:
+    - scramble/clusteridentifier
+    - scramble
+  files:
+    - path: output/scramble/test.clusters.txt
+      md5sum: 9b2777a44bfbcff8fac1bf67c3985f1f