diff --git a/tools/umi_tools/Dockerfile b/tools/umi_tools/Dockerfile index 31e2066b..c17ebd98 100644 --- a/tools/umi_tools/Dockerfile +++ b/tools/umi_tools/Dockerfile @@ -11,4 +11,5 @@ RUN apt-get update \ # Install conda packages COPY environment.yml / -RUN conda env create -f /environment.yml && conda clean -a \ No newline at end of file +RUN conda env create -f /environment.yml && conda clean -a +ENV PATH /opt/conda/envs/nfcore-module-umitools/bin:$PATH \ No newline at end of file diff --git a/tools/umi_tools/dedup/dedup.yml b/tools/umi_tools/dedup.yml similarity index 100% rename from tools/umi_tools/dedup/dedup.yml rename to tools/umi_tools/dedup.yml diff --git a/tools/umi_tools/dedup/main.nf b/tools/umi_tools/dedup/main.nf deleted file mode 100644 index 6d92cfce..00000000 --- a/tools/umi_tools/dedup/main.nf +++ /dev/null @@ -1,372 +0,0 @@ -#!/usr/bin/env nextflow - -// Specify DSL2 -nextflow.preview.dsl = 2 - -// Local default params -params.internal_outdir = 'results' -params.internal_process_name = 'dedup' - -// Check for internal parameter overrides -nfUtils.check_internal_overrides(module_name, params) - -/*-------------------------------------------------> DEDUP OPTIONS <-----------------------------------------------------*/ - -/*----------------------------------------------------------------------------------------------------------------------------- -EXTRACT BARCODE OPTION -> --extract-umi-method --------------------------------------------------------------------------------------------------------------------------------*/ - -//Default extract barcode -> Barcodes are contained at the end of the read separated as specified with --umi-separator option -//Enter the character that corresponds to the UMI separator -params.internal_umi_separator = ':' - -/*-----------------------------------------------------------------------------------------------------------------------------*/ - -//Barcodes contained in a tag(s) -> --extract-umi-method=tag -params.internal_extract_method_tag = false - -// IF params.internal_extract_method_tag = true -> choose between the 6 options below -//Umi tag options -//--umi-tag=[TAG], --umi-tag-split=[SPLIT], --umi-tag-delimiter=[DELIMITER] -//Select tag, split ot delimiter -params.internal_umi_tag_active = false -params.internal_umi_tag_split_active = false -params.internal_umi_tag_delim_active = false -//If one set to true, insert tag, split or delimiter in the param below -params.internal_umi_tag = '' - - -//Cell tag options -//--umi-tag=[TAG], --umi-tag-split=[SPLIT], --umi-tag-delimiter=[DELIMITER] -//Select tag, split ot delimiter -params.internal_cell_tag_active = false -params.internal_cell_tag_split_active = false -params.internal_cell_tag_delim_active = false -//If one set to true, insert tag, split or delimiter in the param below -params.internal_cell_tag = '' - -/*----------------------------------------------------------------------------------------------------------------------------- -OUTPUT OPTION -> --stdout or -S --------------------------------------------------------------------------------------------------------------------------------*/ -//Insert output file name -params.internal_output_file_name = '' -//Activate this option to use sample_id in the output file names -> e.g. sample1.dedup.bam -params.internal_output_sampleid = true - -/*----------------------------------------------------------------------------------------------------------------------------- -OUTPUT STATS OPTION -> --output-stats=[PREFIX] --------------------------------------------------------------------------------------------------------------------------------*/ -//Enter the prefix -params.internal_output_stats = '' -//Activate this option to use sample_id as the prefix -> e.g. sample1_edit_distance -params.internal_output_stats_sampleid = true - -/*----------------------------------------------------------------------------------------------------------------------------- -GROUPING METHOD OPTION -> --method=[method] --------------------------------------------------------------------------------------------------------------------------------*/ -//What method to use to identify group of reads with the same (or similar) UMI(s) -//Default method is directional -//Choose a grouping method: unique, percentile, cluster or adjacency - -//Reads group share the exact same UMI -params.internal_grouping_unique = false - -//Reads group share the exact same UMI. UMIs with counts < 1% of the median counts for UMIs at the same position are ignored. -params.internal_grouping_percentile = false - -//Identify clusters of connected UMIs (based on hamming distance threshold). Each network is a read group -params.internal_grouping_cluster = false - -//Cluster UMIs as above. For each cluster, select the node (UMI) with the highest counts. -params.internal_grouping_adjacency = false - -/*-----------------------------------------------------------------------------------------------------------------------------*/ -//Additional grouping method options - -//--edit-threshold-distance -//For the adjacency and cluster methods, the threshold for the edit distance to connect two UMIs in the network can be increased. The default value of 1 works best unless the UMI is very long (>14bp). -params.internal_edit_threshold_distance = '' - -//--spliced-is-unique -//Causes two reads that start in the same position on the same strand and having the same UMI to be considered unique if one is spliced and the other is not. -params.internal_unique_spliced = false - -//--soft-clip-threshold -//By setting this option, you can treat reads with at least this many bases soft-clipped at the 3’ end as spliced. Default=4 -params.internal_soft_clip_threshold = '' - -//--read-length -//Use the read length as a criteria when deduping, for e.g sRNA-Seq. -params.internal_read_length = true - -/*----------------------------------------------------------------------------------------------------------------------------- -SINGLE-CELL RNA-SEQ OPTIONS --------------------------------------------------------------------------------------------------------------------------------*/ -//--per-gene -//Reads will be grouped together if they have the same gene. -//MUST SUPPLY --per-contig OR --gene-tag -params.internal_per_gene = false - -//--gene-tag -//Deduplicate per gene. The gene information is encoded in the bam read tag specified -// USE WITH --per-gene -params.internal_gene_tag = false - -//--assigned-status-tag -//BAM tag which describes whether a read is assigned to a gene. Defaults to the same value as given for --gene-tag -params.internal_assigned_status_tag = '' - -//--skip-tags-regex -//Use in conjunction with the --assigned-status-tag option to skip any reads where the tag matches this regex. -//Default ("^[__|Unassigned]") matches anything which starts with “__” or “Unassigned”: -params.internal_skip_tags_regex = '' - -//--per-contig -//Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same contig will be considered to have the same alignment position. -params.internal_per_contig = false - -//--gene-transcript-map -//File mapping genes to transcripts (tab separated) -params.internal_gene_transcript_map = false - -//--per-cell -//Reads will only be grouped together if they have the same cell barcode. Can be combined with --per-gene. -params.internal_per_cell = false - -/*----------------------------------------------------------------------------------------------------------------------------- -SAM/BAM OPTIONS -> --method=[method] --------------------------------------------------------------------------------------------------------------------------------*/ - -//--mapping-quality -//Minimium mapping quality (MAPQ) for a read to be retained. Default is 0. -params.internal_mapping_quality = 0 - -//--unmapped-reads -> How should unmapped reads be handled -// Activate one of those three options: discard (default), use, output -params.internal_unmapped_reads_discard = true -params.internal_unmapped_reads_use = false -params.internal_unmapped_reads_output = false - - -//--chimeric-pairs -> How should chimeric pairs be handled -// Activate one of those three options: discard, use (default), output -params.internal_chimeric_pairs_discard = false -params.internal_chimeric_pairs_use = false -params.internal_chimeric_pairs_output = false - - -//--unpaired-reads -> How should unpaired reads be handled -// Activate one of those three options: discard, use (default), output -params.internal_unpaired_reads_discard = false -params.internal_unpaired_reads_use = false -params.internal_unpaired_reads_output = false - -//--ignore-umi -//Ignore the UMI and group reads using mapping coordinates only -//CANNOT BE USED WITH --output-stats -params.internal_ignore_umi = false - -//--subset -//Only consider a fraction of the reads, chosen at random. This is useful for doing saturation analyses. -params.internal_subset = '' - -//--chrom -//Only consider a single chromosome. This is useful for debugging/testing purposes -params.internal_chrom = false - -// Input/output BAM options - -// Parameters for input and output in SAM format instead of BAM -params.internal_in_sam = false -params.internal_out_sam = false - -//BAM is paired end - output both read pairs. This will also force the use of the template length to determine reads with the same mapping coordinates. -//--paired -params.internal_paired_end = false - -/*----------------------------------------------------------------------------------------------------------------------------- -GROUP/DEDUP OPTIONS --------------------------------------------------------------------------------------------------------------------------------*/ - -//--no-sort-output -params.internal_no_sort_output = false - -//--buffer-whole-contig -params.internal_buffer_whole_contig = false - -/*-----------------------------------------------------------------------------------------------------------------------------*/ - - - - -// dedup reusable component -process dedup { - publishDir "umi_tools/dedup/${params.internal_outdir}/${params.internal_process_name}", - mode: "copy", overwrite: true - - container 'luslab/nf-modules-umitools:latest' - - input: - tuple val(sample_id), path(bai), path(bam) - - output: - tuple val(sample_id), path(bam), emit: dedupBam - - script: - - //Initializing dedup arguments - dedup_pre_args = "umi_tools dedup " - dedup_post_args = '' - - //Method used to extract barcodes - if (params.internal_umi_separator != ''){ - dedup_pre_args += "--umi-separator=\"$params.internal_umi_separator\" " - } - - //Input args - dedup_pre_args += "-I " - - //Output_stats option - if (params.internal_output_stats_sampleid){ - dedup_post_args += "--output-stats=$sample_id " - } else { - if (params.internal_output_stats != ''){ - dedup_post_args += "--output-stats=$params.internal_output_stats " - } - } - - //Output/ stdout option - if (params.internal_output_sampleid){ - dedup_post_args += "-S ${sample_id}.dedup.bam " - }else { - if (params.internal_output_file_name != ''){ - dedup_post_args += "-S $params.internal_output_file_name " - } - } - - //Grouping method option - if (params.internal_grouping_unique){ - dedup_post_args += "--method=unique " - } - if (params.internal_grouping_percentile){ - dedup_post_args += "--method=percentile " - } - if (params.internal_grouping_cluster){ - dedup_post_args += "--method=cluster " - } - if (params.internal_grouping_adjacency){ - dedup_post_args += "--method=adjacency " - } - - //Additional grouping method options - if (params.internal_edit_threshold_distance != ''){ - dedup_post_args += "--edit-threshold-distance=$params.internal_edit_threshold_distance " - } - if (params.internal_unique_spliced){ - dedup_post_args += "--spliced-is-unique " - } - if (params.internal_soft_clip_threshold != ''){ - dedup_post_args += "--soft-clip-threshold=$params.internal_soft_clip_threshold " - } - if (params.internal_read_length){ - dedup_post_args += "--read-length " - } - - //Single-cell RNA-seq options - if (params.internal_per_gene){ - dedup_post_args += "--per-gene " - } - if (params.internal_gene_tag){ - dedup_post_args += "--gene-tag " - } - if (params.internal_assigned_status_tag != ''){ - dedup_post_args += "--assigned-status-tag=$params.internal_assigned_status_tag " - } - if (params.internal_skip_tags_regex != ''){ - dedup_post_args += "--skip-tags-regex=$params.internal_skip_tags_regex " - } - if (params.internal_per_contig){ - dedup_post_args += "--per-contig " - } - if (params.internal_gene_transcript_map){ - dedup_post_args += "--gene-transcript-map " - } - if (params.internal_per_cell){ - dedup_post_args += "--per-cell " - } - - // SAM/BAM options - // Unmapped reads options - if (params.internal_unmapped_reads_discard){ - dedup_post_args += "--unmapped-reads " - } - if (params.internal_unmapped_reads_use){ - dedup_post_args += "--unmapped-reads=use " - } - if (params.internal_unmapped_reads_output){ - dedup_post_args += "--unmapped-reads=output " - } - - // Chimeric pairs options - if (params.internal_chimeric_pairs_discard){ - dedup_post_args += "--chimeric-pairs=discard " - } - if (params.internal_chimeric_pairs_use){ - dedup_post_args += "--chimeric-pairs " - } - if (params.internal_chimeric_pairs_output){ - dedup_post_args += "--chimeric-pairs=output " - } - - // Unpaired reads options - if (params.internal_unpaired_reads_discard){ - dedup_post_args += "--unpaired-reads=discard " - } - if (params.internal_unpaired_reads_use){ - dedup_post_args += "--unpaired-reads " - } - if (params.internal_unpaired_reads_output){ - dedup_post_args += "--unpaired-reads=output " - } - - // Additional SAM/BAM options - if (params.internal_mapping_quality != 0){ - dedup_post_args += "--mapping-quality=$params.internal_mapping_quality " - } - if (params.internal_ignore_umi){ - dedup_post_args += "--ignore-umi " - } - if (params.internal_subset != ''){ - dedup_post_args += "--subset=$params.internal_subset " - } - if (params.internal_chrom){ - dedup_post_args += "--chrom " - } - - // Input/output BAM options - if (params.internal_in_sam){ - dedup_post_args += "--in-sam " - } - if (params.internal_out_sam){ - dedup_post_args += "--out-sam " - } - if (params.internal_paired_end){ - dedup_post_args += "--paired " - } - - //Group dedup options - if (params.internal_no_sort_output){ - dedup_post_args += "--no-sort-output " - } - if (params.internal_buffer_whole_contig){ - dedup_post_args += "--buffer-whole-contig " - } - - // Displays the umi_tools command line to check for mistakes - println dedup_pre_args - println dedup_post_args - - """ - $dedup_pre_args $bam $dedup_post_args - """ -} \ No newline at end of file diff --git a/tools/umi_tools/dedup/test/test.nf b/tools/umi_tools/dedup/test/test.nf deleted file mode 100644 index 3534454c..00000000 --- a/tools/umi_tools/dedup/test/test.nf +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env nextflow - -// Define DSL2 -nextflow.preview.dsl=2 - -// Log -log.info ("Starting test for BAM deduplication") - -/* Module inclusions ---------------------------------------------------------------------------------------*/ - -include dedup from '../main.nf' - -/*------------------------------------------------------------------------------------*/ -/* Define input channels ---------------------------------------------------------------------------------------*/ - -testMetaDataBam = [ - ['Sample1', "$baseDir/input/sample1.bam"], - ['Sample2', "$baseDir/input/sample2.bam"], - ['Sample3', "$baseDir/input/sample3.bam"], - ['Sample4', "$baseDir/input/sample4.bam"], - ['Sample5', "$baseDir/input/sample5.bam"], - ['Sample6', "$baseDir/input/sample6.bam"] -] - -testMetaDataBai = [ - ['Sample1', "$baseDir/input/sample1.bai"], - ['Sample2', "$baseDir/input/sample2.bai"], - ['Sample3', "$baseDir/input/sample3.bai"], - ['Sample4', "$baseDir/input/sample4.bai"], - ['Sample5', "$baseDir/input/sample5.bai"], - ['Sample6', "$baseDir/input/sample6.bai"] -] - -// Create channels of test data - -//Bam input channel -Channel - .from(testMetaDataBam) - .map { row -> [ row[0], file(row[1], checkIfExists: true) ]} - .set {ch_test_meta_bam} - -//BamBai input channel -Channel - .from(testMetaDataBai) - .map { row -> [ row[0], file(row[1], checkIfExists: true) ] } - .join(ch_test_meta_bam) - .set {ch_test_meta_bambai} - -// Run workflow -workflow { - - // Run dedup - dedup ( ch_test_meta_bambai ) - - // Collect file names and view output - dedup.out.dedupBam | view -} \ No newline at end of file diff --git a/tools/umi_tools/main.nf b/tools/umi_tools/main.nf new file mode 100644 index 00000000..1da8a49f --- /dev/null +++ b/tools/umi_tools/main.nf @@ -0,0 +1,48 @@ +#!/usr/bin/env nextflow + +// Specify DSL2 +nextflow.preview.dsl = 2 + +// Local default params +params.internal_process_name = 'umitools_dedup' + +// Process definition +process umitools_dedup { + publishDir "${params.outdir}/${params.internal_process_name}", + mode: "copy", overwrite: true + + container 'luslab/nf-modules-umitools:latest' + + input: + tuple val(sample_id), path(bai), path(bam) + + output: + tuple val(sample_id), path(bam), emit: dedupBam + + shell: + + // Init + error_message = "" + args_exist = "false" + internal_prog = "umi_tools dedup " + internal_args = "" + + // Check main args string exists and strip whitespace + if(params.umitools_dedup_args) { + internal_args = params.umitools_dedup_args + internal_args = internal_args.trim() + " " + args_exist = "true" + } + else { + error_message = "params.umitools_dedup_args does not exist, please define to run this module." + } + + """ + if ${args_exist}; then + ${internal_prog}${internal_args}-I $bam + else + echo "${error_message}" 1>&2 + exit 1 + fi + """ +} \ No newline at end of file diff --git a/tools/umi_tools/nextflow.config b/tools/umi_tools/nextflow.config deleted file mode 100644 index ad72e828..00000000 --- a/tools/umi_tools/nextflow.config +++ /dev/null @@ -1,8 +0,0 @@ -/* - * ------------------------------------------------- - * luslab/nfcore-mod-umi-tools Nextflow config file - * ------------------------------------------------- - * Config options for running individual module test patterns - */ - -docker.enabled = true \ No newline at end of file diff --git a/tools/umi_tools/test/main.nf b/tools/umi_tools/test/main.nf new file mode 100644 index 00000000..a4b013a8 --- /dev/null +++ b/tools/umi_tools/test/main.nf @@ -0,0 +1,77 @@ +#!/usr/bin/env nextflow + +// Define DSL2 +nextflow.preview.dsl=2 + +// Log +log.info ("Starting test for umi_tools dedup...") + +/*------------------------------------------------------------------------------------*/ +/* Define params +--------------------------------------------------------------------------------------*/ + +params.umitools_dedup_args = '--umi-separator=":"' +params.verbose = true + +/*------------------------------------------------------------------------------------*/ +/* Module inclusions +--------------------------------------------------------------------------------------*/ + +include umitools_dedup from '../main.nf' + +/*------------------------------------------------------------------------------------*/ +/* Define input channels +--------------------------------------------------------------------------------------*/ + + //fileName=`basename $bam` + // sampleName="\${fileName%.Aligned.sortedByCoord.out.bam}" + // umi_tools dedup --umi-separator=":" -I $bam -S \${sampleName}.dedup.bam --output-stats=\${sampleName} + +// Meta data +testMetaDataBam = [ + ['sample1', "$baseDir/input/sample1.bam"], + ['sample2', "$baseDir/input/sample2.bam"], + ['sample3', "$baseDir/input/sample3.bam"], + ['sample4', "$baseDir/input/sample4.bam"], + ['sample5', "$baseDir/input/sample5.bam"], + ['sample6', "$baseDir/input/sample6.bam"] +] + +testMetaDataBai = [ + ['sample1', "$baseDir/input/sample1.bai"], + ['sample2', "$baseDir/input/sample2.bai"], + ['sample3', "$baseDir/input/sample3.bai"], + ['sample4', "$baseDir/input/sample4.bai"], + ['sample5', "$baseDir/input/sample5.bai"], + ['sample6', "$baseDir/input/sample6.bai"] +] + +//Bam input channel +Channel + .from(testMetaDataBam) + .map { row -> [ row[0], file(row[1], checkIfExists: true) ]} + .set {ch_test_meta_bam} + +//BamBai input channel +Channel + .from(testMetaDataBai) + .map { row -> [ row[0], file(row[1], checkIfExists: true) ] } + .join(ch_test_meta_bam) + .set {ch_test_meta_bambai} + +/*------------------------------------------------------------------------------------*/ +/* Run tests +--------------------------------------------------------------------------------------*/ + +workflow { + // Logging + if (params.verbose){ + println ("[MODULE] umi_tools/dedup ARGS: " + params.umitools_dedup_args) + } + + // Run dedup + umitools_dedup ( ch_test_meta_bambai ) + + // Collect file names and view output + umitools_dedup.out.dedupBam | view +} \ No newline at end of file diff --git a/tools/umi_tools/test/nextflow.config b/tools/umi_tools/test/nextflow.config new file mode 100644 index 00000000..22f424b4 --- /dev/null +++ b/tools/umi_tools/test/nextflow.config @@ -0,0 +1,2 @@ +params.outdir = './results' +docker.enabled = true \ No newline at end of file