feat: Import HapLink validation script

2023-06-08 17:55:57 -05:00 · 2023-06-08 17:55:57 -05:00 · 7743bf31c6
commit 7743bf31c6
2 changed files with 247 additions and 0 deletions
--- a/main.nf
+++ b/main.nf
@ -0,0 +1,243 @@
+#!/usr/bin/env nextflow
+
+workflow {
+
+    Channel
+        .fromPath("*.fastq.gz")
+        .map { file -> tuple(file.simpleName, file) }
+        .set { ch_input }
+
+    EFETCH()
+    EFETCH
+        .out
+        .set { ch_reference }
+
+    NANOFILT( ch_input )
+    NANOFILT
+        .out
+        .set { ch_reads }
+
+    MINIMAP2( ch_reads, ch_reference )
+    MINIMAP2
+        .out
+        .set { ch_alignments }
+
+    HAPLINK_VARIANTS( ch_alignments, ch_reference )
+    HAPLINK_VARIANTS
+        .out
+        .set { ch_variants }
+
+    ch_alignments
+        .join( ch_variants )
+        .set { ch_haplotype_calling }
+
+
+    HAPLINK_RAW_HAPLOTYPES(
+        ch_haplotype_calling,
+        ch_reference
+    )
+    HAPLINK_RAW_HAPLOTYPES
+        .out
+    .map{ [ it[0], 'raw', it[1] ] }
+    .set{ ch_raw_haplotypes }
+
+    HAPLINK_ML_HAPLOTYPES(
+        ch_haplotype_calling,
+        ch_reference
+    )
+    HAPLINK_ML_HAPLOTYPES
+        .out
+        .map{ [ it[0], 'ml', it[1] ] }
+        .set{ ch_ml_haplotypes }
+
+    ch_raw_haplotypes
+        .mix(ch_ml_haplotypes)
+    .set{ ch_all_haplotypes }
+
+    HAPLINK_SEQUENCES(
+        ch_all_haplotypes,
+        ch_reference
+    )
+}
+
+process EFETCH {
+    cpus 1
+    memory '256.MB'
+    container 'quay.io/biocontainers/entrez-direct:16.2--he881be0_1'
+
+    publishDir "results", mode: 'copy'
+
+    output:
+    path 'idv4.fasta'
+
+    script:
+    """
+    esearch \\
+            -db nucleotide \\
+            -query "NC_036618.1" \\
+        | efetch \\
+            -format fasta \\
+        > idv4.fasta
+    """
+}
+
+process NANOFILT {
+    cpus 1
+    memory '8.GB'
+    container 'quay.io/biocontainers/nanofilt:2.8.0--py_0'
+
+    input:
+    tuple val(prefix), path(reads)
+
+    output:
+    tuple val(prefix), path("*_trimmed.fastq.gz")
+
+    script:
+    """
+    gzip \\
+            -cdf "${reads}" \\
+        | NanoFilt \\
+            --logfile "trimmed/${prefix}.nanofilt.log" \\
+            --length 100 \\
+            --quality 7 \\
+            --headcrop 30 \\
+            --tailcrop 30 \\
+            --minGC 0.1 \\
+            --maxGC 0.9 \\
+        | gzip \\
+        > "${prefix}_trimmed.fastq.gz"
+    """
+}
+
+process MINIMAP2 {
+    cpus 4
+    memory '8.GB'
+    container 'quay.io/biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0'
+
+    input:
+    tuple val(prefix), path(reads)
+    path reference
+
+    publishDir "results", mode: 'copy'
+
+    output:
+    tuple val(prefix), path("*.bam"), path("*.bam.bai")
+    script:
+    """
+    minimap2 \\
+            -x map-ont \\
+            --MD \\
+            --eqx \\
+            -t ${task.cpus} \\
+            -a \\
+            "${reference}" \\
+            "${reads}" \\
+    | samtools sort \\
+    | samtools view \\
+            -@ ${task.cpus} \\
+            -b \\
+            -h \\
+            -o "${prefix}.bam"
+    samtools index "${prefix}.bam"
+    """
+}
+
+process HAPLINK_VARIANTS {
+    cpus 2
+    memory '12.GB'
+
+    input:
+    tuple val(prefix), path(bam), path(bai)
+    path reference
+
+    output:
+    tuple val(prefix), path("*.vcf")
+
+    publishDir "results", mode: 'copy'
+
+    script:
+    """
+    export JULIA_NUM_THREADS=${task.cpus}
+    haplink variants \\
+            "${reference}" \\
+            "${bam}" \\
+        > "${prefix}.vcf"
+    """
+}
+
+process HAPLINK_RAW_HAPLOTYPES {
+    cpus 2
+    memory '12.GB'
+
+    input:
+    tuple val(prefix), path(bam), path(bai), path(vcf)
+    path reference
+
+    output:
+    tuple val(prefix), path("*.yaml")
+
+    publishDir "results/raw-haplotypes", mode: 'copy'
+
+    script:
+    """
+    export JULIA_NUM_THREADS=${task.cpus}
+    haplink haplotypes \\
+            "${reference}" \\
+            "${vcf}" \\
+            "${bam}" \\
+        --frequency 0.01 \\
+        > "${prefix}.yaml"
+    """
+}
+
+process HAPLINK_ML_HAPLOTYPES {
+    cpus 8
+    memory '12.GB'
+
+    input:
+    tuple val(prefix), path(bam), path(bai), path(vcf)
+    path reference
+
+    output:
+    tuple val(prefix), path("*.yaml")
+
+    publishDir "results/ml-haplotypes", mode: 'copy'
+
+    script:
+    """
+    export JULIA_NUM_THREADS=${task.cpus}
+    haplink haplotypes \\
+            "${reference}" \\
+            "${vcf}" \\
+            "${bam}" \\
+        --simulated-reads \\
+        --overlap-min 20 \\
+        --overlap-max 8000 \\
+        --frequency 0.01 \\
+        > "${prefix}.yaml"
+    """
+}
+
+process HAPLINK_SEQUENCES {
+    cpus 1
+    memory '6.GB'
+
+    input:
+    tuple val(prefix), val(method), path(yaml)
+    path reference
+
+    output:
+    tuple val(prefix), val(method), path("*.fasta")
+
+    publishDir "results/${method}-haplotypes", mode: 'copy'
+
+    script:
+    """
+    export JULIA_NUM_THREADS=${task.cpus}
+    haplink sequences \\
+            "${reference}" \\
+            "${yaml}" \\
+            --prefix "${prefix}" \\
+        > "${prefix}.fasta"
+    """
+}
--- a/nextflow.config
+++ b/nextflow.config
@ -0,0 +1,4 @@
+process {
+    errorStrategy = 'finish'
+    time = '7d'
+}