feat: Import HapLink validation script

This commit is contained in:
Thomas A. Christensen II 2023-06-08 17:55:57 -05:00
commit 7743bf31c6
Signed by: millironx
GPG key ID: 09335146883990B9
2 changed files with 247 additions and 0 deletions

243
main.nf Executable file
View file

@ -0,0 +1,243 @@
#!/usr/bin/env nextflow
workflow {
Channel
.fromPath("*.fastq.gz")
.map { file -> tuple(file.simpleName, file) }
.set { ch_input }
EFETCH()
EFETCH
.out
.set { ch_reference }
NANOFILT( ch_input )
NANOFILT
.out
.set { ch_reads }
MINIMAP2( ch_reads, ch_reference )
MINIMAP2
.out
.set { ch_alignments }
HAPLINK_VARIANTS( ch_alignments, ch_reference )
HAPLINK_VARIANTS
.out
.set { ch_variants }
ch_alignments
.join( ch_variants )
.set { ch_haplotype_calling }
HAPLINK_RAW_HAPLOTYPES(
ch_haplotype_calling,
ch_reference
)
HAPLINK_RAW_HAPLOTYPES
.out
.map{ [ it[0], 'raw', it[1] ] }
.set{ ch_raw_haplotypes }
HAPLINK_ML_HAPLOTYPES(
ch_haplotype_calling,
ch_reference
)
HAPLINK_ML_HAPLOTYPES
.out
.map{ [ it[0], 'ml', it[1] ] }
.set{ ch_ml_haplotypes }
ch_raw_haplotypes
.mix(ch_ml_haplotypes)
.set{ ch_all_haplotypes }
HAPLINK_SEQUENCES(
ch_all_haplotypes,
ch_reference
)
}
process EFETCH {
cpus 1
memory '256.MB'
container 'quay.io/biocontainers/entrez-direct:16.2--he881be0_1'
publishDir "results", mode: 'copy'
output:
path 'idv4.fasta'
script:
"""
esearch \\
-db nucleotide \\
-query "NC_036618.1" \\
| efetch \\
-format fasta \\
> idv4.fasta
"""
}
process NANOFILT {
cpus 1
memory '8.GB'
container 'quay.io/biocontainers/nanofilt:2.8.0--py_0'
input:
tuple val(prefix), path(reads)
output:
tuple val(prefix), path("*_trimmed.fastq.gz")
script:
"""
gzip \\
-cdf "${reads}" \\
| NanoFilt \\
--logfile "trimmed/${prefix}.nanofilt.log" \\
--length 100 \\
--quality 7 \\
--headcrop 30 \\
--tailcrop 30 \\
--minGC 0.1 \\
--maxGC 0.9 \\
| gzip \\
> "${prefix}_trimmed.fastq.gz"
"""
}
process MINIMAP2 {
cpus 4
memory '8.GB'
container 'quay.io/biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0'
input:
tuple val(prefix), path(reads)
path reference
publishDir "results", mode: 'copy'
output:
tuple val(prefix), path("*.bam"), path("*.bam.bai")
script:
"""
minimap2 \\
-x map-ont \\
--MD \\
--eqx \\
-t ${task.cpus} \\
-a \\
"${reference}" \\
"${reads}" \\
| samtools sort \\
| samtools view \\
-@ ${task.cpus} \\
-b \\
-h \\
-o "${prefix}.bam"
samtools index "${prefix}.bam"
"""
}
process HAPLINK_VARIANTS {
cpus 2
memory '12.GB'
input:
tuple val(prefix), path(bam), path(bai)
path reference
output:
tuple val(prefix), path("*.vcf")
publishDir "results", mode: 'copy'
script:
"""
export JULIA_NUM_THREADS=${task.cpus}
haplink variants \\
"${reference}" \\
"${bam}" \\
> "${prefix}.vcf"
"""
}
process HAPLINK_RAW_HAPLOTYPES {
cpus 2
memory '12.GB'
input:
tuple val(prefix), path(bam), path(bai), path(vcf)
path reference
output:
tuple val(prefix), path("*.yaml")
publishDir "results/raw-haplotypes", mode: 'copy'
script:
"""
export JULIA_NUM_THREADS=${task.cpus}
haplink haplotypes \\
"${reference}" \\
"${vcf}" \\
"${bam}" \\
--frequency 0.01 \\
> "${prefix}.yaml"
"""
}
process HAPLINK_ML_HAPLOTYPES {
cpus 8
memory '12.GB'
input:
tuple val(prefix), path(bam), path(bai), path(vcf)
path reference
output:
tuple val(prefix), path("*.yaml")
publishDir "results/ml-haplotypes", mode: 'copy'
script:
"""
export JULIA_NUM_THREADS=${task.cpus}
haplink haplotypes \\
"${reference}" \\
"${vcf}" \\
"${bam}" \\
--simulated-reads \\
--overlap-min 20 \\
--overlap-max 8000 \\
--frequency 0.01 \\
> "${prefix}.yaml"
"""
}
process HAPLINK_SEQUENCES {
cpus 1
memory '6.GB'
input:
tuple val(prefix), val(method), path(yaml)
path reference
output:
tuple val(prefix), val(method), path("*.fasta")
publishDir "results/${method}-haplotypes", mode: 'copy'
script:
"""
export JULIA_NUM_THREADS=${task.cpus}
haplink sequences \\
"${reference}" \\
"${yaml}" \\
--prefix "${prefix}" \\
> "${prefix}.fasta"
"""
}

4
nextflow.config Normal file
View file

@ -0,0 +1,4 @@
process {
errorStrategy = 'finish'
time = '7d'
}