Merge pull request #1705 from jtangrot/vsearch_usearch_global

Vsearch usearch global
This commit is contained in:
Jeanette Tångrot 2022-05-30 14:28:01 +02:00 committed by GitHub
commit 3bb32b2def
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 209 additions and 0 deletions

View file

@ -0,0 +1,67 @@
//
// VSEARCH_USEARCHGLOBAL: run `vsearch --usearch_global` to compare the query
// sequences against a reference database, writing a single result file
// whose format is chosen by `outoption`.
//
// Inputs:
//   meta, queryfasta : sample map (meta.id is used for tag/prefix) and query FASTA
//   db               : reference database file
//   idcutoff         : value passed to `--id`
//   outoption        : name of the vsearch output option to use (see map below);
//                      anything else falls back to `alnout` with a warning
//   user_columns     : if non-empty, passed to `--userfields`
//
// Outputs: exactly one of the optional result channels receives a file
// (depending on `outoption`), plus `versions.yml`.
//
process VSEARCH_USEARCHGLOBAL {
    tag "${meta.id}"
    label 'process_low'

    conda (params.enable_conda ? "bioconda::vsearch=2.21.1" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/vsearch:2.21.1--h95f258a_0':
        'quay.io/biocontainers/vsearch:2.21.1--h95f258a_0' }"

    input:
    tuple val(meta), path(queryfasta)
    path db
    val idcutoff
    val outoption
    val user_columns

    output:
    // All result channels are optional because only the file matching
    // the selected output option is produced by a given task.
    tuple val(meta), path('*.aln')    , optional: true, emit: aln
    tuple val(meta), path('*.biom')   , optional: true, emit: biom
    tuple val(meta), path('*.lca')    , optional: true, emit: lca
    tuple val(meta), path('*.mothur') , optional: true, emit: mothur
    tuple val(meta), path('*.otu')    , optional: true, emit: otu
    tuple val(meta), path('*.sam')    , optional: true, emit: sam
    tuple val(meta), path('*.tsv')    , optional: true, emit: tsv
    tuple val(meta), path('*.txt')    , optional: true, emit: txt
    tuple val(meta), path('*.uc')     , optional: true, emit: uc
    path "versions.yml"               , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    def columns = user_columns ? "--userfields ${user_columns}" : ''

    // Supported output options mapped to the extension of the file written.
    // The vsearch flag is the option name prefixed with `--`.
    def extensions = [
        alnout           : 'aln',
        biomout          : 'biom',
        blast6out        : 'txt',
        mothur_shared_out: 'mothur',
        otutabout        : 'otu',
        samout           : 'sam',
        uc               : 'uc',
        userout          : 'tsv',
        lcaout           : 'lca'
    ]

    // Compare as a plain String so that GString values still match the keys.
    def requested = outoption?.toString()
    def selected = extensions.containsKey(requested) ? requested : 'alnout'
    if (selected != requested) {
        log.warn("Unknown output file format provided (${outoption}): selecting pairwise alignments (alnout)")
    }

    // Declared explicitly (previously implicit, undeclared variables).
    def outfmt = "--${selected}"
    def out_ext = extensions[selected]
    """
    vsearch \\
        --usearch_global $queryfasta \\
        --db $db \\
        --id $idcutoff \\
        --threads $task.cpus \\
        $args \\
        ${columns} \\
        ${outfmt} ${prefix}.${out_ext}

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        vsearch: \$(vsearch --version 2>&1 | head -n 1 | sed 's/vsearch //g' | sed 's/,.*//g' | sed 's/^v//' | sed 's/_.*//')
    END_VERSIONS
    """
}

View file

@ -0,0 +1,83 @@
# Module metadata for VSEARCH_USEARCHGLOBAL: describes the wrapped tool and
# every input and output channel of the process.
name: "vsearch_usearchglobal"
description: Compare target sequences to fasta-formatted query sequences using global pairwise alignment.
keywords:
  - vsearch
  - usearch
  - alignment
  - fasta
tools:
  - "vsearch":
      description: "VSEARCH is a versatile open-source tool for microbiome analysis, including chimera detection, clustering, dereplication and rereplication, extraction, FASTA/FASTQ/SFF file processing, masking, orienting, pair-wise alignment, restriction site cutting, searching, shuffling, sorting, subsampling, and taxonomic classification of amplicon sequences for metagenomics, genomics, and population genetics. (USEARCH alternative)"
      homepage: "https://github.com/torognes/vsearch"
      documentation: "None"
      tool_dev_url: "https://github.com/torognes/vsearch"
      # Bare DOI, without a "doi:" prefix inside the value.
      doi: "10.7717/peerj.2584"
      # List of licences (a YAML sequence, not a quoted list literal).
      licence: ["GPL v3-or-later OR BSD-2-clause"]

input:
  - meta:
      type: map
      description: Groovy Map containing sample information e.g. [ id:'test' ]
  - queryfasta:
      type: file
      description: Query sequences in FASTA format
      pattern: "*.{fasta,fa,fna,faa}"
  - db:
      type: file
      description: Reference database file in FASTA or UDB format
      pattern: "*"
  - idcutoff:
      type: float
      description: Reject the sequence match if the pairwise identity is lower than the given id cutoff value (value ranging from 0.0 to 1.0 included)
  - outoption:
      type: string
      description: Specify the type of output file to be generated by selecting one of the vsearch output file options
      pattern: "alnout|biomout|blast6out|mothur_shared_out|otutabout|samout|uc|userout|lcaout"
  - user_columns:
      type: string
      description: If using the `userout` option, specify which columns to include in output, with fields separated with `+` (e.g. query+target+id). See USEARCH manual for valid options. For other output options, use an empty string.

# Only the output matching the selected `outoption` is produced by a run;
# `versions` is always produced.
output:
  - aln:
      type: file
      description: Results in pairwise alignment format
      pattern: "*.{aln}"
  - biom:
      type: file
      description: Results in an OTU table in the biom version 1.0 file format
      pattern: "*.{biom}"
  - lca:
      type: file
      description: Last common ancestor (LCA) information about the hits of each query in tab-separated format
      pattern: "*.{lca}"
  - mothur:
      type: file
      description: Results in an OTU table in the mothur shared tab-separated plain text file format
      pattern: "*.{mothur}"
  - otu:
      type: file
      description: Results in an OTU table in the classic tab-separated plain text format
      pattern: "*.{otu}"
  - sam:
      type: file
      description: Results written in sam format
      pattern: "*.{sam}"
  - tsv:
      type: file
      description: Results in tab-separated output, columns defined by user
      pattern: "*.{tsv}"
  - txt:
      type: file
      description: Tab delimited results in blast-like tabular format
      pattern: "*.{txt}"
  - uc:
      type: file
      description: Tab delimited results in a uclust-like format with 10 columns
      pattern: "*.{uc}"
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"

authors:
  - "@jtangrot"

View file

@ -2052,6 +2052,10 @@ vcftools:
- modules/vcftools/**
- tests/modules/vcftools/**
# Path globs associated with the vsearch/usearchglobal tag: the module
# sources and its tests.
vsearch/usearchglobal:
- modules/vsearch/usearchglobal/**
- tests/modules/vsearch/usearchglobal/**
yara/index:
- modules/yara/index/**
- tests/modules/yara/index/**

View file

@ -0,0 +1,25 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { VSEARCH_USEARCHGLOBAL } from '../../../../modules/vsearch/usearchglobal/main.nf'
//
// Exercises the fallback branch of the module: the output option is not a
// recognised vsearch option, and no user columns are supplied.
//
workflow test_vsearch_usearchglobal {

    // [ meta, query FASTA ] tuple expected as the first input of the process.
    input = [
        [id:'test'],
        file(params.test_data['sarscov2']['genome']['transcriptome_fasta'], checkIfExists: true)
    ]
    reference = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)

    VSEARCH_USEARCHGLOBAL (
        input,
        reference,
        0.985,      // identity cutoff
        "xcfert",   // Nonsense text to check default case.
        ""          // no user-defined columns
    )
}
//
// Exercises the `userout` output option with an explicit list of columns.
//
workflow test_vsearch_usearchglobal_userout {

    // [ meta, query FASTA ] tuple expected as the first input of the process.
    input = [
        [id:'test'],
        file(params.test_data['sarscov2']['genome']['transcriptome_fasta'], checkIfExists: true)
    ]
    reference = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)

    VSEARCH_USEARCHGLOBAL (
        input,
        reference,
        0.985,              // identity cutoff
        "userout",          // output option under test
        "query+target+id"   // columns, separated by `+`
    )
}

View file

@ -0,0 +1,4 @@
process {

    // Publish into a directory named after the first `_`-separated token of
    // the simple process name (the part after the last `:`), lower-cased.
    publishDir = {
        def simple_name = task.process.tokenize(':')[-1]
        def first_token = simple_name.tokenize('_')[0].toLowerCase()
        "${params.outdir}/${first_token}"
    }

}

View file

@ -0,0 +1,26 @@
# Test specifications for the vsearch/usearchglobal module: each entry runs
# one test workflow and checks the published result file.

# Run with an unrecognised output option: the expected result is a pairwise
# alignment file (test.aln), checked by content rather than checksum.
- name: vsearch usearchglobal test_vsearch_usearchglobal
command: nextflow run ./tests/modules/vsearch/usearchglobal -entry test_vsearch_usearchglobal -c ./tests/config/nextflow.config -c ./tests/modules/vsearch/usearchglobal/nextflow.config
tags:
- vsearch/usearchglobal
- vsearch
files:
- path: output/vsearch/test.aln
contains:
# Command line expected in the alignment output.
- "vsearch --usearch_global transcriptome.fasta --db genome.fasta --id 0.985 --threads 2 --alnout test.aln"
- "Query >lcl|MT192765.1_cds_QIK50427.1_2"
- "%Id TLen Target"
- "100% 29829 MT192765.1"
- "Query 3822nt >lcl|MT192765.1_cds_QIK50427.1_2"
- "Target 29829nt >MT192765.1"
- "Qry 21249 + CAACAGAGTTGTTATTTCTAGTGATGTTCTTGTTAACAACTAA 21291"
- "Tgt 21506 + CAACAGAGTTGTTATTTCTAGTGATGTTCTTGTTAACAACTAA 21548"
- "21291 cols, 21290 ids (100.0%), 1 gaps (0.0%)"
# Run with the `userout` option: the expected result is a tab-separated file
# (test.tsv), checked by md5 checksum.
- name: vsearch usearchglobal test_vsearch_usearchglobal_userout
command: nextflow run ./tests/modules/vsearch/usearchglobal -entry test_vsearch_usearchglobal_userout -c ./tests/config/nextflow.config -c ./tests/modules/vsearch/usearchglobal/nextflow.config
tags:
- vsearch/usearchglobal
- vsearch
files:
- path: output/vsearch/test.tsv
md5sum: b6cc50f7c8d18cb82e74dab70ed4baab