From 7743bf31c6992e2378c957508623b3cce23104c5 Mon Sep 17 00:00:00 2001
From: "Thomas A. Christensen II" <25492070+MillironX@users.noreply.github.com>
Date: Thu, 8 Jun 2023 17:55:57 -0500
Subject: [PATCH] feat: Import HapLink validation script

---
 main.nf         | 254 ++++++++++++++++++++++++++++++++++++++++++++++++
 nextflow.config |   4 +
 2 files changed, 258 insertions(+)
 create mode 100755 main.nf
 create mode 100644 nextflow.config

diff --git a/main.nf b/main.nf
new file mode 100755
index 0000000..847d5dc
--- /dev/null
+++ b/main.nf
@@ -0,0 +1,254 @@
+#!/usr/bin/env nextflow
+
+// HapLink validation workflow: filter the reads, align them to the fetched
+// reference, call variants, call haplotypes with two methods ('raw' and 'ml'),
+// then turn every haplotype call set into FASTA sequences.
+workflow {
+
+    // One [prefix, reads] tuple per *.fastq.gz file; prefix is its simpleName
+    Channel
+        .fromPath("*.fastq.gz")
+        .map { reads -> tuple(reads.simpleName, reads) }
+        .set { ch_input }
+
+    EFETCH()
+    EFETCH
+        .out
+        .set { ch_reference }
+
+    NANOFILT( ch_input )
+    NANOFILT
+        .out
+        .set { ch_reads }
+
+    MINIMAP2( ch_reads, ch_reference )
+    MINIMAP2
+        .out
+        .set { ch_alignments }
+
+    HAPLINK_VARIANTS( ch_alignments, ch_reference )
+    HAPLINK_VARIANTS
+        .out
+        .set { ch_variants }
+
+    // Pair each alignment with the variant calls that share its prefix
+    ch_alignments
+        .join( ch_variants )
+        .set { ch_haplotype_calling }
+
+    HAPLINK_RAW_HAPLOTYPES(
+        ch_haplotype_calling,
+        ch_reference
+    )
+    // Label every result with its method so both sets can share one channel
+    HAPLINK_RAW_HAPLOTYPES
+        .out
+        .map { prefix, yaml -> [ prefix, 'raw', yaml ] }
+        .set { ch_raw_haplotypes }
+
+    HAPLINK_ML_HAPLOTYPES(
+        ch_haplotype_calling,
+        ch_reference
+    )
+    HAPLINK_ML_HAPLOTYPES
+        .out
+        .map { prefix, yaml -> [ prefix, 'ml', yaml ] }
+        .set { ch_ml_haplotypes }
+
+    ch_raw_haplotypes
+        .mix(ch_ml_haplotypes)
+        .set { ch_all_haplotypes }
+
+    HAPLINK_SEQUENCES(
+        ch_all_haplotypes,
+        ch_reference
+    )
+}
+
+// Fetch nucleotide record NC_036618.1 as FASTA; it is used as the reference.
+process EFETCH {
+    cpus 1
+    memory 256.MB
+    container 'quay.io/biocontainers/entrez-direct:16.2--he881be0_1'
+    publishDir "results", mode: 'copy'
+
+    output:
+    path 'idv4.fasta'
+
+    script:
+    """
+    esearch \\
+        -db nucleotide \\
+        -query "NC_036618.1" \\
+        | efetch \\
+        -format fasta \\
+        > idv4.fasta
+    """
+}
+
+// Filter and crop one gzipped FASTQ file with NanoFilt, then re-compress it.
+process NANOFILT {
+    cpus 1
+    memory 8.GB
+    container 'quay.io/biocontainers/nanofilt:2.8.0--py_0'
+
+    input:
+    tuple val(prefix), path(reads)
+
+    output:
+    tuple val(prefix), path("*_trimmed.fastq.gz")
+
+    script:
+    """
+    # --logfile points into trimmed/, which nothing else in this script creates
+    mkdir -p trimmed
+
+    gzip \\
+        -cdf "${reads}" \\
+        | NanoFilt \\
+        --logfile "trimmed/${prefix}.nanofilt.log" \\
+        --length 100 \\
+        --quality 7 \\
+        --headcrop 30 \\
+        --tailcrop 30 \\
+        --minGC 0.1 \\
+        --maxGC 0.9 \\
+        | gzip \\
+        > "${prefix}_trimmed.fastq.gz"
+    """
+}
+
+// Map reads to the reference with minimap2 and write a sorted, indexed BAM.
+process MINIMAP2 {
+    cpus 4
+    memory 8.GB
+    container 'quay.io/biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0'
+    publishDir "results", mode: 'copy'
+
+    input:
+    tuple val(prefix), path(reads)
+    path reference
+
+    output:
+    tuple val(prefix), path("*.bam"), path("*.bam.bai")
+
+    script:
+    """
+    minimap2 \\
+        -x map-ont \\
+        --MD \\
+        --eqx \\
+        -t ${task.cpus} \\
+        -a \\
+        "${reference}" \\
+        "${reads}" \\
+        | samtools sort \\
+        | samtools view \\
+        -@ ${task.cpus} \\
+        -b \\
+        -h \\
+        -o "${prefix}.bam"
+    samtools index "${prefix}.bam"
+    """
+}
+
+// Call variants for one alignment with "haplink variants" into <prefix>.vcf.
+process HAPLINK_VARIANTS {
+    cpus 2
+    memory 12.GB
+    publishDir "results", mode: 'copy'
+
+    input:
+    tuple val(prefix), path(bam), path(bai)
+    path reference
+
+    output:
+    tuple val(prefix), path("*.vcf")
+
+    script:
+    """
+    export JULIA_NUM_THREADS=${task.cpus}
+    haplink variants \\
+        "${reference}" \\
+        "${bam}" \\
+        > "${prefix}.vcf"
+    """
+}
+
+// Call haplotypes from the alignment and its VCF ('raw' method in the workflow).
+process HAPLINK_RAW_HAPLOTYPES {
+    cpus 2
+    memory 12.GB
+    publishDir "results/raw-haplotypes", mode: 'copy'
+
+    input:
+    tuple val(prefix), path(bam), path(bai), path(vcf)
+    path reference
+
+    output:
+    tuple val(prefix), path("*.yaml")
+
+    script:
+    """
+    export JULIA_NUM_THREADS=${task.cpus}
+    haplink haplotypes \\
+        "${reference}" \\
+        "${vcf}" \\
+        "${bam}" \\
+        --frequency 0.01 \\
+        > "${prefix}.yaml"
+    """
+}
+
+// Call haplotypes with --simulated-reads ('ml' method in the workflow).
+process HAPLINK_ML_HAPLOTYPES {
+    cpus 8
+    memory 12.GB
+    publishDir "results/ml-haplotypes", mode: 'copy'
+
+    input:
+    tuple val(prefix), path(bam), path(bai), path(vcf)
+    path reference
+
+    output:
+    tuple val(prefix), path("*.yaml")
+
+    script:
+    """
+    export JULIA_NUM_THREADS=${task.cpus}
+    haplink haplotypes \\
+        "${reference}" \\
+        "${vcf}" \\
+        "${bam}" \\
+        --simulated-reads \\
+        --overlap-min 20 \\
+        --overlap-max 8000 \\
+        --frequency 0.01 \\
+        > "${prefix}.yaml"
+    """
+}
+
+// Turn one haplotype call set into FASTA with "haplink sequences".
+process HAPLINK_SEQUENCES {
+    cpus 1
+    memory 6.GB
+    // method is 'raw' or 'ml', so this is the matching haplotype results folder
+    publishDir "results/${method}-haplotypes", mode: 'copy'
+
+    input:
+    tuple val(prefix), val(method), path(yaml)
+    path reference
+
+    output:
+    tuple val(prefix), val(method), path("*.fasta")
+
+    script:
+    """
+    export JULIA_NUM_THREADS=${task.cpus}
+    haplink sequences \\
+        "${reference}" \\
+        "${yaml}" \\
+        --prefix "${prefix}" \\
+        > "${prefix}.fasta"
+    """
+}
diff --git a/nextflow.config b/nextflow.config
new file mode 100644
index 0000000..abcd8d3
--- /dev/null
+++ b/nextflow.config
@@ -0,0 +1,4 @@
+process {
+    errorStrategy = 'finish'
+    time = '7d'
+}