Debugging run merging

2024-11-25 05:59:55 +00:00 · 2022-03-27 09:30:23 +02:00 · 2022-03-27 09:30:23 +02:00 · 8dc9e583ad
commit 8dc9e583ad
parent e6e8ed7cc9
5 changed files with 32 additions and 21 deletions
--- a/conf/modules.config
+++ b/conf/modules.config
@ -65,7 +65,7 @@ process {
    withName: FASTP {
        ext.prefix = { "${meta.id}_${meta.run_accession}" }
        ext.args   = [
-            // collapsing options
+            // collapsing options - option to retain singletons
            params.shortread_clipmerge_excludeunmerged ? '' : "--include_unmerged",
            // trimming options
            params.shortread_clipmerge_skiptrim ? "--disable_adapter_trimming" : "",
@ -105,7 +105,7 @@ process {
            pattern: '*.{rma6,tab,text,sam,log}'
        ]
        ext.args = { "${meta.db_params}" }
-        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
+        ext.prefix = { "${meta.id}-${meta.db_name}" }
    }

    withName: KRAKEN2_KRAKEN2 {
@ -115,7 +115,7 @@ process {
            pattern: '*.{fastq.gz,txt}'
        ]
        ext.args = { "${meta.db_params}" }
-        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
+        ext.prefix = { "${meta.id}-${meta.db_name}" }
    }

    withName: CUSTOM_DUMPSOFTWAREVERSIONS {
--- a/modules/nf-core/modules/kraken2/kraken2/main.nf
+++ b/modules/nf-core/modules/kraken2/kraken2/main.nf
@ -32,12 +32,13 @@ process KRAKEN2_KRAKEN2 {
        --threads $task.cpus \\
        --unclassified-out $unclassified \\
        --classified-out $classified \\
+        $args \\
        --report ${prefix}.kraken2.report.txt \\
        --gzip-compressed \\
        $paired \\
-        $args \\
        $reads
        
+
    pigz -p $task.cpus *.fastq

    cat <<-END_VERSIONS > versions.yml
--- a/subworkflows/local/shortread_fastp.nf
+++ b/subworkflows/local/shortread_fastp.nf
@ -7,7 +7,7 @@ include { FASTP as FASTP_PAIRED       } from '../../modules/nf-core/modules/fast

 workflow SHORTREAD_FASTP {
    take:
-    reads // file: /path/to/samplesheet.csv
+    reads // [[meta], [reads]]

    main:
    ch_versions = Channel.empty()
@ -24,9 +24,11 @@ workflow SHORTREAD_FASTP {
    ch_input_for_fastp.paired.dump(tag: "input_fastp_paired")

    FASTP_SINGLE ( ch_input_for_fastp.single, false, false )
+    // Last parameter here turns on merging of PE data
    FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs )

    if ( params.shortread_clipmerge_mergepairs ) {
+        // TODO update to replace meta suffix
        ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged
                                    .mix( FASTP_SINGLE.out.reads )
                                    .map {
--- a/subworkflows/local/shortread_preprocessing.nf
+++ b/subworkflows/local/shortread_preprocessing.nf
@ -14,13 +14,6 @@ workflow SHORTREAD_PREPROCESSING {
    ch_versions       = Channel.empty()
    ch_multiqc_files  = Channel.empty()

-    //
-    // STEP: Read clipping and merging
-    //
-    // TODO give option to clip only and retain pairs
-    // TODO give option to retain singletons (probably fastp option likely)
-    // TODO move to subworkflow
-
    if ( params.shortread_clipmerge_tool == "fastp" ) {
        ch_processed_reads = SHORTREAD_FASTP ( reads ).reads
        ch_versions        =  ch_versions.mix( SHORTREAD_FASTP.out.versions )
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@ -120,25 +120,40 @@ workflow TAXPROFILER {
        MODULE: PERFORM SHORT READ RUN MERGING
    */

-    // TODO: Check not necessary for long reads too?
-    // TODO: source of clash - combined should only occur when
-    // files ARE to be combined. SE/unmerged (see not below)
+    // Remove run accession to allow grouping by sample. Will only merge
+    // if pairing type is the same.
+
+    // TODO The current branch logic is problematic: when a single file is not wrapped in a list, it is split at
+    // `/`, so the resulting list has >= 2 elements and a merge is attempted, which then breaks kraken downstream
+    // e.g. `home jfellows Documents git nf-core taxprofiler testing work 68 9a2c8362add37832a776058d280bb7 2612_se.merged.fastq.gz`
+    // So theoretically we need to force this into a list (but that results in the "can't access meta.id" error due to an incorrect input format).
+    // Second issue: is `>= 2` actually sufficient? What about merging two paired-end files? Need to check whether the input channel is formatted correctly for this case.
    ch_processed_for_combine = ch_shortreads_preprocessed
        .dump(tag: "prep_for_combine_grouping")
        .map {
            meta, reads ->
            def meta_new = meta.clone()
-            //meta_new['run_accession'] = 'combined'
+
+            // remove run accession to allow group by sample
+            meta_new.remove('run_accession')
+
+            // update id to prevent file name clashes when unable to group
+            // unmerged PE and SE runs of same sample
+            def type = meta_new['single_end'] ? "_se" : "_pe"
+            meta_new['id'] = meta['id'] + type
+
            [ meta_new, reads ]
        }
        .groupTuple ( by: 0 )
+        .dump(tag: "files_for_cat_fastq_branch")
        .branch{
-            combine: it[1].size() >= 2
-            skip: it[1].size() < 2
+            combine: it[1] && it[1].size() > 1
+            skip: true
        }

    // NOTE: this does not allow CATing of SE & PE runs of same sample
    // when --shortread_clipmerge_mergepairs is false
+    ch_processed_for_combine.combine.dump(tag: "input_into_cat_fastq")
    CAT_FASTQ ( ch_processed_for_combine.combine )

    ch_reads_for_profiling = ch_processed_for_combine.skip