Debugging run merging

2024-11-25 16:59:54 +00:00 · 2022-03-27 09:30:23 +02:00 · 2022-03-27 09:30:23 +02:00 · 8dc9e583ad
commit 8dc9e583ad
parent e6e8ed7cc9
5 changed files with 32 additions and 21 deletions
--- a/conf/modules.config
+++ b/conf/modules.config
@ -65,7 +65,7 @@ process {
    withName: FASTP {
        ext.prefix = { "${meta.id}_${meta.run_accession}" }
        ext.args   = [
-            // collapsing options
+            // collapsing options - option to retain singletons
            params.shortread_clipmerge_excludeunmerged ? '' : "--include_unmerged",
            // trimming options
            params.shortread_clipmerge_skiptrim ? "--disable_adapter_trimming" : "",
@ -105,7 +105,7 @@ process {
            pattern: '*.{rma6,tab,text,sam,log}'
        ]
        ext.args = { "${meta.db_params}" }
-        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
+        ext.prefix = { "${meta.id}-${meta.db_name}" }
    }
    withName: KRAKEN2_KRAKEN2 {
@ -115,7 +115,7 @@ process {
            pattern: '*.{fastq.gz,txt}'
        ]
        ext.args = { "${meta.db_params}" }
-        ext.prefix = { "${meta.id}-${meta.run_accession}-${meta.db_name}" }
+        ext.prefix = { "${meta.id}-${meta.db_name}" }
    }
    withName: CUSTOM_DUMPSOFTWAREVERSIONS {
--- a/modules/nf-core/modules/kraken2/kraken2/main.nf
+++ b/modules/nf-core/modules/kraken2/kraken2/main.nf
@ -32,12 +32,13 @@ process KRAKEN2_KRAKEN2 {
        --threads $task.cpus \\
        --unclassified-out $unclassified \\
        --classified-out $classified \\
        $args \\
        --report ${prefix}.kraken2.report.txt \\
        --gzip-compressed \\
        $paired \\
        $args \\
        $reads
    pigz -p $task.cpus *.fastq
    cat <<-END_VERSIONS > versions.yml
--- a/subworkflows/local/shortread_fastp.nf
+++ b/subworkflows/local/shortread_fastp.nf
@ -7,7 +7,7 @@ include { FASTP as FASTP_PAIRED       } from '../../modules/nf-core/modules/fast
 workflow SHORTREAD_FASTP {
    take:
-    reads // file: /path/to/samplesheet.csv
+    reads // [[meta], [reads]]
    main:
    ch_versions = Channel.empty()
@ -24,16 +24,18 @@ workflow SHORTREAD_FASTP {
    ch_input_for_fastp.paired.dump(tag: "input_fastp_paired")
    FASTP_SINGLE ( ch_input_for_fastp.single, false, false )
    // Last parameter here turns on merging of PE data
    FASTP_PAIRED ( ch_input_for_fastp.paired, false, params.shortread_clipmerge_mergepairs )
    if ( params.shortread_clipmerge_mergepairs ) {
        // TODO update to replace meta suffix
        ch_fastp_reads_prepped = FASTP_PAIRED.out.reads_merged
                                    .mix( FASTP_SINGLE.out.reads )
                                    .map {
                                        meta, reads ->
-                                        def meta_new = meta.clone()
+                                            def meta_new = meta.clone()
-                                        meta_new['single_end'] = 1
+                                            meta_new['single_end'] = 1
-                                        [ meta_new, reads ]
+                                            [ meta_new, reads ]
                                        }
    } else {
        ch_fastp_reads_prepped = FASTP_PAIRED.out.reads
--- a/subworkflows/local/shortread_preprocessing.nf
+++ b/subworkflows/local/shortread_preprocessing.nf
@ -14,13 +14,6 @@ workflow SHORTREAD_PREPROCESSING {
    ch_versions       = Channel.empty()
    ch_multiqc_files  = Channel.empty()
    //
    // STEP: Read clipping and merging
    //
    // TODO give option to clip only and retain pairs
    // TODO give option to retain singletons (probably a fastp option)
    // TODO move to subworkflow
    if ( params.shortread_clipmerge_tool == "fastp" ) {
        ch_processed_reads = SHORTREAD_FASTP ( reads ).reads
        ch_versions        =  ch_versions.mix( SHORTREAD_FASTP.out.versions )
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@ -120,25 +120,40 @@ workflow TAXPROFILER {
        MODULE: PERFORM SHORT READ RUN MERGING
    */
-    // TODO: Check not necessary for long reads too?
+    // Remove run accession to allow grouping by sample. Will only merge
-    // TODO: source of clash - combined should only occur when
+    // if pairing type is the same.
-    // files ARE to be combined. SE/unmerged (see not below)
+
    // TODO Current Branch system currently problematic - when single file not in a list, splits at
    // `/` so makes list >= 2, so tries to merge, but then breaks kraken downstream
    // e.g. `home jfellows Documents git nf-core taxprofiler testing work 68 9a2c8362add37832a776058d280bb7 2612_se.merged.fastq.gz`
    // So theoretically need to force this into a list (but that results in the "can't access meta.id" error, as the input format is then incorrect)
    // But second issue: `>= 2` is only MAYBE sufficient, because what if merging two paired-end files? Need to check if the input channel is formatted correctly for this.
    ch_processed_for_combine = ch_shortreads_preprocessed
        .dump(tag: "prep_for_combine_grouping")
        .map {
            meta, reads ->
            def meta_new = meta.clone()
-            //meta_new['run_accession'] = 'combined'
+
            // remove run accession to allow group by sample
            meta_new.remove('run_accession')
            // update id to prevent file name clashes when unable to group
            // unmerged PE and SE runs of same sample
            def type = meta_new['single_end'] ? "_se" : "_pe"
            meta_new['id'] = meta['id'] + type
            [ meta_new, reads ]
        }
        .groupTuple ( by: 0 )
        .dump(tag: "files_for_cat_fastq_branch")
        .branch{
-            combine: it[1].size() >= 2
+            combine: it[1] && it[1].size() > 1
-            skip: it[1].size() < 2
+            skip: true
        }
    // NOTE: this does not allow CATing of SE & PE runs of same sample
    // when --shortread_clipmerge_mergepairs is false
    ch_processed_for_combine.combine.dump(tag: "input_into_cat_fastq")
    CAT_FASTQ ( ch_processed_for_combine.combine )
    ch_reads_for_profiling = ch_processed_for_combine.skip