New module: NGSCheckMate (#1290)

Adds the NGSCheckMate `ncm` mode, which works on BAM and VCF files to check that (human) samples match as expected.

Co-authored-by: Simon Pearce <simon.pearce@cruk.manchester.ac.uk>
Co-authored-by: Mahesh Binzer-Panchal <mahesh.binzer-panchal@nbis.se>
This commit is contained in:
Simon Pearce 2022-03-11 09:02:10 +00:00 committed by GitHub
parent 62da45b0e1
commit 79a9d5e1ea
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 236 additions and 0 deletions

View file

@ -0,0 +1,49 @@
// Runs NGSCheckMate's `ncm.py` over a set of staged BAM or VCF files and
// collects the report files it writes to the task directory.
//
// Inputs:
//   files   - the BAM/VCF files to compare (gzipped VCFs are decompressed first)
//   snp_bed - BED file passed to `ncm.py -bed`
//   fasta   - reference genome, exported to ncm.py through NCM_REF
// Outputs: pdf, corr_matrix, matched, all, optional vcfs, and versions.yml.
process NGSCHECKMATE_NCM {
    label 'process_low'

    conda (params.enable_conda ? "bioconda::ngscheckmate=1.0.0" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/ngscheckmate:1.0.0--py27r41hdfd78af_3':
        'quay.io/biocontainers/ngscheckmate:1.0.0--py27r41hdfd78af_3' }"

    input:
    path files
    path snp_bed
    path fasta

    output:
    path "*.pdf"            , emit: pdf
    path "*_corr_matrix.txt", emit: corr_matrix
    path "*_matched.txt"    , emit: matched
    path "*_all.txt"        , emit: all
    path "*.vcf"            , emit: vcfs, optional: true
    path "versions.yml"     , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "output"
    // True when at least one staged input is a gzipped VCF; it is interpolated
    // below as the shell builtin `true` / `false`.
    def unzip = files.any { it.toString().endsWith(".vcf.gz") }
    """
    # ncm.py is pointed at the working directory (-d .), so gzipped VCFs are
    # decompressed next to the staged inputs before it runs.
    if $unzip
    then
        for VCFGZ in *.vcf.gz; do
            gunzip -cdf "\$VCFGZ" > "\$( basename "\$VCFGZ" .gz )"
        done
    fi

    NCM_REF="./"${fasta} ncm.py -d . -bed ${snp_bed} -O . -N ${prefix} $args

    # Remove only the VCFs decompressed above, so they are not picked up by
    # the optional "*.vcf" output and other staged files are left untouched.
    if $unzip
    then
        for VCFGZ in *.vcf.gz; do
            rm -f "\$( basename "\$VCFGZ" .gz )"
        done
    fi

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        ngscheckmate: \$(ncm.py --help | sed "7!d;s/ *Ensuring Sample Identity v//g")
    END_VERSIONS
    """
}

View file

@ -0,0 +1,64 @@
# Module metadata for the NGSCheckMate `ncm` process: tool details, then the
# input and output channels.
name: ngscheckmate_ncm
description: Determining whether sequencing data comes from the same individual by using SNP matching. Designed for humans on vcf or bam files.
keywords:
  - ngscheckmate
  - matching
  - snp
tools:
  - ngscheckmate:
      description: NGSCheckMate is a software package for identifying next generation sequencing (NGS) data files from the same individual, including matching between DNA and RNA.
      homepage: https://github.com/parklab/NGSCheckMate
      documentation: https://github.com/parklab/NGSCheckMate
      tool_dev_url: https://github.com/parklab/NGSCheckMate
      # Bare DOI, without the malformed "doi:/" prefix.
      doi: "10.1093/nar/gkx193"
      licence: ['MIT']

input:
  - files:
      type: file
      description: VCF or BAM files for each sample, in a merged channel (possibly gzipped). BAM files require an index too.
      pattern: "*.{vcf,vcf.gz,bam,bai}"
  - snp_bed:
      type: file
      description: BED file containing the SNPs to analyse
      pattern: "*.{bed}"
  - fasta:
      type: file
      description: fasta file for the genome, only used in the bam mode
      # Previously a copy of the BED pattern; this input is a FASTA file.
      pattern: "*.{fasta,fa}"

output:
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
  - pdf:
      type: file
      description: A pdf containing a dendrogram showing how the samples match up
      pattern: "*.{pdf}"
  - corr_matrix:
      type: file
      description: A text file containing the correlation matrix between each sample
      pattern: "*corr_matrix.txt"
  - matched:
      type: file
      description: A txt file containing only the samples that match with each other
      pattern: "*matched.txt"
  - all:
      type: file
      description: A txt file containing all the sample comparisons, whether they match or not
      pattern: "*all.txt"
  - vcfs:
      type: file
      description: If ran in bam mode, vcf files for each sample giving the SNP calls
      pattern: "*.vcf"

authors:
  - "@sppearce"

View file

@ -1145,6 +1145,10 @@ ngmaster:
- modules/ngmaster/**
- tests/modules/ngmaster/**
# Path globs associated with the ngscheckmate/ncm tag: the module sources and its tests.
ngscheckmate/ncm:
- modules/ngscheckmate/ncm/**
- tests/modules/ngscheckmate/ncm/**
nucmer:
- modules/nucmer/**
- tests/modules/nucmer/**

View file

@ -0,0 +1,63 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { NGSCHECKMATE_NCM as NGSCHECKMATE_NCM_BAM} from '../../../../modules/ngscheckmate/ncm/main.nf'
include { NGSCHECKMATE_NCM as NGSCHECKMATE_NCM_VCF} from '../../../../modules/ngscheckmate/ncm/main.nf'
include { BEDTOOLS_MAKEWINDOWS } from '../../../../modules/bedtools/makewindows/main.nf'
include { BCFTOOLS_MPILEUP } from '../../../../modules/bcftools/mpileup/main.nf'
include { BCFTOOLS_MPILEUP as BCFTOOLS_MPILEUP2 } from '../../../../modules/bcftools/mpileup/main.nf'
// Runs NGSCHECKMATE_NCM_BAM on two sarscov2 BAM files, using a BED produced by
// BEDTOOLS_MAKEWINDOWS as the SNP list.
workflow test_ngscheckmate_ncm_bam {

    // Two BAM files and their indexes, passed together as one flat list.
    bam_files = [
        file(params.test_data['sarscov2']['illumina']['test_paired_end_methylated_sorted_bam'], checkIfExists: true),
        file(params.test_data['sarscov2']['illumina']['test_paired_end_methylated_sorted_bam_bai'], checkIfExists: true),
        file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true),
        file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true)
    ]

    reference = [
        file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
    ]

    bed_input = [
        [ id:'test' ],
        file(params.test_data['sarscov2']['genome']['test_bed'], checkIfExists: true)
    ]

    // Keep only the second element of each emitted `tab` item and print it
    // before handing it to the module under test.
    BEDTOOLS_MAKEWINDOWS(bed_input, true)
        .tab
        .map { row -> row[1] }
        .view()
        .set { ch_snp_bed }

    NGSCHECKMATE_NCM_BAM(bam_files, ch_snp_bed, reference)
}
// Runs NGSCHECKMATE_NCM_VCF on two VCFs produced by two aliased
// BCFTOOLS_MPILEUP runs, using a BED from BEDTOOLS_MAKEWINDOWS as the SNP list.
workflow test_ngscheckmate_ncm_vcf {

    // Both samples point at the same BAM file; only the meta ids differ.
    sample_one = [
        [ id:'test1' ], // meta map
        [ file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) ]
    ]
    sample_two = [
        [ id:'test2' ], // meta map
        [ file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) ]
    ]

    reference = [
        file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
    ]

    bed_input = [
        [ id:'test' ],
        file(params.test_data['sarscov2']['genome']['test_bed'], checkIfExists: true)
    ]

    BCFTOOLS_MPILEUP ( sample_one, reference, false )
    BCFTOOLS_MPILEUP2 ( sample_two, reference, false )

    // Pair the two vcf outputs and keep elements 1 and 3 of each combined
    // item, which drops elements 0 and 2.
    BCFTOOLS_MPILEUP2.out.vcf
        .combine( BCFTOOLS_MPILEUP.out.vcf )
        .map { combined -> [ combined[1], combined[3] ] }
        .set { ch_vcfs }

    // Keep only the second element of each emitted `tab` item and print it.
    BEDTOOLS_MAKEWINDOWS( bed_input, true )
        .tab
        .map { row -> row[1] }
        .view()
        .set { ch_snp_bed }

    NGSCHECKMATE_NCM_VCF(ch_vcfs, ch_snp_bed, reference)
}

View file

@ -0,0 +1,27 @@
// Per-process settings for the ngscheckmate/ncm module tests.
process {

    // Publish under <outdir>/<name>, where <name> is the last ':'-separated
    // part of the process name, cut at its first '_' and lowercased.
    publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }

    // The two aliases of the module under test differ only in the flag they pass.
    withName: 'NGSCHECKMATE_NCM_BAM' {
        ext.args = '-B'
    }

    withName: 'NGSCHECKMATE_NCM_VCF' {
        ext.args = '-V'
    }

    withName: 'BEDTOOLS_MAKEWINDOWS' {
        ext.args = '-w 1'
    }

    // Both mpileup aliases share identical arguments.
    withName: 'BCFTOOLS_MPILEUP' {
        ext.args2 = '--no-version --ploidy 1 --multiallelic-caller'
        ext.args3 = '--no-version'
    }

    withName: 'BCFTOOLS_MPILEUP2' {
        ext.args2 = '--no-version --ploidy 1 --multiallelic-caller'
        ext.args3 = '--no-version'
    }
}

View file

@ -0,0 +1,29 @@
# BAM entry point: expected output files and their checksums.
- name: ngscheckmate ncm test_ngscheckmate_ncm_bam
  command: nextflow run tests/modules/ngscheckmate/ncm -entry test_ngscheckmate_ncm_bam -c tests/config/nextflow.config
  tags:
    - ngscheckmate/ncm
    - ngscheckmate
  files:
    # output_all.txt and output_matched.txt share the same checksum here.
    - path: output/ngscheckmate/output_all.txt
      md5sum: 'f71a712c3f6ecf64dd526365212f1b7c'
    - path: output/ngscheckmate/output_corr_matrix.txt
      md5sum: '6777377aa9ae3d57f841b12896318db0'
    - path: output/ngscheckmate/output_matched.txt
      md5sum: 'f71a712c3f6ecf64dd526365212f1b7c'
    - path: output/ngscheckmate/versions.yml
      md5sum: 'fbb2bebd65b4f4e1e93c6bf5c08a6829'
# VCF entry point: expected output files and their checksums.
- name: ngscheckmate ncm test_ngscheckmate_ncm_vcf
  command: nextflow run tests/modules/ngscheckmate/ncm -entry test_ngscheckmate_ncm_vcf -c tests/config/nextflow.config
  tags:
    - ngscheckmate/ncm
    - ngscheckmate
  files:
    # output_all.txt and output_matched.txt share the same checksum here.
    - path: output/ngscheckmate/output_all.txt
      md5sum: 'fd74956dcac279b6f58e82ea73e344f8'
    - path: output/ngscheckmate/output_corr_matrix.txt
      md5sum: '0c86bdad2721c470fe6be119f291c8e5'
    - path: output/ngscheckmate/output_matched.txt
      md5sum: 'fd74956dcac279b6f58e82ea73e344f8'
    - path: output/ngscheckmate/versions.yml
      md5sum: 'f06910b83dde194a47870c553cefe193'