Add docs and tests for markduplicates

This commit is contained in:
drpatelh 2020-08-07 16:30:28 +01:00
parent cb85acddf0
commit 3547b64eab
6 changed files with 119 additions and 0 deletions

View file

@ -0,0 +1,67 @@
# Module descriptor for the Picard MarkDuplicates wrapper.
name: picard_markduplicates
description: Locate and tag duplicate reads in a BAM file
keywords:
    - markduplicates
    - pcr
    - duplicates
    - bam
    - sam
    - cram
# Underlying software wrapped by this module.
tools:
    - picard:
        description: |
            A set of command line tools (in Java) for manipulating high-throughput sequencing (HTS)
            data and formats such as SAM/BAM/CRAM and VCF.
        homepage: https://broadinstitute.github.io/picard/
        documentation: https://broadinstitute.github.io/picard/
# Parameters the module reads from the `params` scope.
params:
    - outdir:
        type: string
        description: |
            The pipeline's output directory. By default, the module will
            output files into `$params.outdir/<SOFTWARE>`
    - publish_dir_mode:
        type: string
        description: |
            Value for the Nextflow `publishDir` mode parameter.
            Available: symlink, rellink, link, copy, copyNoFollow, move.
    - conda:
        type: boolean
        description: |
            Run the module with Conda using the software specified
            via the `conda` directive
# Inputs accepted by the module.
input:
    - meta:
        type: map
        description: |
            Groovy Map containing sample information
            e.g. [ id:'test', single_end:false ]
    - bam:
        type: file
        description: BAM file
        pattern: "*.{bam}"
    - options:
        type: map
        description: |
            Groovy Map containing module options for passing command-line arguments and
            output file paths.
# Outputs produced by the module.
output:
    - meta:
        type: map
        description: |
            Groovy Map containing sample information
            e.g. [ id:'test', single_end:false ]
    - bam:
        type: file
        description: BAM file with duplicate reads marked/removed
        pattern: "*.{bam}"
    - metrics:
        type: file
        description: Duplicate metrics file generated by picard
        pattern: "*.{metrics.txt}"
    - version:
        type: file
        description: File containing software version
        pattern: "*.{version.txt}"
authors:
    - "@drpatelh"

View file

@ -0,0 +1 @@
../../../../../tests/data/bam/test.paired_end.sorted.bam

View file

@ -0,0 +1,18 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { PICARD_MARKDUPLICATES } from '../main.nf'
// Smoke test: runs PICARD_MARKDUPLICATES once on the paired-end sorted BAM
// located under this test's input/ directory.
workflow test {
    // Input tuple: [ meta map, BAM file ]. `checkIfExists: true` makes the run
    // fail immediately if the fixture is missing instead of passing a dangling path.
    def input = [
        [ id:'test', single_end:false ], // meta map
        file("${baseDir}/input/test.paired_end.sorted.bam", checkIfExists: true)
    ]

    // Second argument is the module options map; an empty map is passed here.
    PICARD_MARKDUPLICATES ( input, [:] )
}
// Implicit entry workflow: delegates straight to the named `test` workflow.
workflow {
    test()
}

View file

@ -0,0 +1,20 @@
// Default parameter values for the test run.
params.outdir           = "output/"
params.publish_dir_mode = "copy"
params.conda            = false

// Execution profiles, one per software backend.
profiles {
    // Switches the `conda` parameter on.
    conda {
        params.conda = true
    }

    // Enables Docker and passes `-u <uid>:<gid>` of the invoking user to the container.
    docker {
        docker.enabled    = true
        docker.runOptions = '-u \$(id -u):\$(id -g)'
    }

    // Enables Singularity with automatic mounts.
    singularity {
        singularity.enabled    = true
        singularity.autoMounts = true
    }
}

View file

@ -0,0 +1,13 @@
## htsjdk.samtools.metrics.StringHeader
# MarkDuplicates INPUT=[test.paired_end.sorted.bam] OUTPUT=test.bam METRICS_FILE=test.MarkDuplicates.metrics.txt MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 TAG_DUPLICATE_SET_MEMBERS=false REMOVE_SEQUENCING_DUPLICATES=false TAGGING_POLICY=DontTag CLEAR_DT=true DUPLEX_UMI=false ADD_PG_TAG_TO_READS=true REMOVE_DUPLICATES=false ASSUME_SORTED=false DUPLICATE_SCORING_STRATEGY=SUM_OF_BASE_QUALITIES PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates READ_NAME_REGEX=<optimized capture of last three ':' separated fields as numeric values> OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 MAX_OPTICAL_DUPLICATE_SET_SIZE=300000 VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false
## htsjdk.samtools.metrics.StringHeader
# Started on: Fri Aug 07 15:11:32 GMT 2020
## METRICS CLASS picard.sam.DuplicationMetrics
LIBRARY UNPAIRED_READS_EXAMINED READ_PAIRS_EXAMINED SECONDARY_OR_SUPPLEMENTARY_RDS UNMAPPED_READS UNPAIRED_READ_DUPLICATES READ_PAIR_DUPLICATES READ_PAIR_OPTICAL_DUPLICATES PERCENT_DUPLICATION ESTIMATED_LIBRARY_SIZE
Unknown Library 0 10000 0 0 0 0 0 0
## HISTOGRAM java.lang.Double
set_size all_sets non_optical_sets
1.0 10000 10000