From d6244b42f596fa26d2ecba4ce862755821ed9da8 Mon Sep 17 00:00:00 2001
From: Lasse Folkersen <lassefolkersen@gmail.com>
Date: Tue, 15 Mar 2022 11:18:43 +0100
Subject: [PATCH] ASCAT (#1332)

* First commit

* putting correct links for singularity and docker containers (just had to search for bioconda+ascat to find them, and then put them in like the rest of the nf-core tools had it)

* adding first try of relevant commands (not working yet, just took their basic pipeline example)

* test commit

* remove test

* starting up work with module after 3.0.0 upgrade

* add ascat.prepareHTS statement

* add location of docker for new mulled alleleCounter+ASCAT container

* first full run with ASCAT on HG00154.mapped.ILLUMINA.bwa.GBR.low_coverage.20101123.bam

* add notes on dropbox download

* use a newer pytest_modules.yml

* add output

* trying to align with current Sarek output

* adding in FH comments

* busy clearing up arguments and testing. Still WIP

* first working run, in nextflow, with sarek-like output. Still needs more work on input arguments

* cleaning up before writing up findings

* testing with putting in arguments in args

* draft for solution 3 style for arguments

* one more test added

* adding FH map

* finished testing maps for args

* wrap-up cram/crai test successfully

* updates to address ability to put in ref.fasta argument for cram running

* adding remaining import-HTS commands in as args, and removing the chr21/chr22 only testing to test-nextflow.config

* first test with auto-downloading the s3-data (when not given as an argument)

* removing download-logic for supporting files, documenting in meta.yml, fixing ref_fasta bug

* adding mulled singularity container

* removing tests

* fix left padding lint issue

* lint failure in meta.yml

* more linting errors

* add when argument

* adding stub functionality

* add stub run

* correct md5sum for versions.yml

* more testing with -runstub

* stub code in pure bash - not mixed with R

* reformat version.yml

* get rid of absolute paths in test.yml

* correct wrong md5sum

* adding allelecount conda link

* rename normal_bam to input_bam etc

* let the pipeline dev worry about matching the right loci and allele files

* dont hardcode default genomebuild

* adding download instruction comment

* add doi

* fix conda addition bug

* add args documentation

* test new indent

* new test with meta.yml indentation

* retry with new meta.yml

* retry with new meta.yml - now with empty lines around

* retry with new meta.yml - remove trailing whitespace

* trying to fix found quote character that cannot start any token error

* try with one empty line above triple-quote and no empty line below

* trying with pipe character

* checking if its the ending triple quote

* one more try with meta.yml

* test update bioconda versioning for linting failure

* test update bioconda versioning for linting failure 2

* testing allelecounter version error on conda

Co-authored-by: @lassefolkersen
Co-authored-by: @FriederikeHanssen
---
 modules/ascat/main.nf               | 155 ++++++++++++++++++++++++++++
 modules/ascat/meta.yml              |  92 +++++++++++++++++
 tests/config/pytest_modules.yml     |   4 +
 tests/modules/ascat/main.nf         |  64 ++++++++++++
 tests/modules/ascat/nextflow.config |  39 +++++++
 tests/modules/ascat/test.yml        |  25 +++++
 6 files changed, 379 insertions(+)
 create mode 100644 modules/ascat/main.nf
 create mode 100644 modules/ascat/meta.yml
 create mode 100644 tests/modules/ascat/main.nf
 create mode 100644 tests/modules/ascat/nextflow.config
 create mode 100644 tests/modules/ascat/test.yml

diff --git a/modules/ascat/main.nf b/modules/ascat/main.nf
new file mode 100644
index 00000000..1d2bd96f
--- /dev/null
+++ b/modules/ascat/main.nf
@@ -0,0 +1,155 @@
+process ASCAT {
+    tag "$meta.id"
+    label 'process_medium'
+    // Conda pins use the "name=version" form; ASCAT is installed together with alleleCounter (cancerit-allelecount).
+    conda (params.enable_conda ? "bioconda::ascat=3.0.0 bioconda::cancerit-allelecount=4.3.0": null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-c278c7398beb73294d78639a864352abef2931ce:dfe5aaa885de434adb2b490b68972c5840c6d761-0':
+        'quay.io/biocontainers/mulled-v2-c278c7398beb73294d78639a864352abef2931ce:dfe5aaa885de434adb2b490b68972c5840c6d761-0' }"
+
+    input:
+    tuple val(meta), path(input_normal), path(index_normal), path(input_tumor), path(index_tumor)
+    path(allele_files)
+    path(loci_files)
+
+    output:
+    tuple val(meta), path("*png"),               emit: png
+    tuple val(meta), path("*cnvs.txt"),          emit: cnvs
+    tuple val(meta), path("*purityploidy.txt"),  emit: purityploidy
+    tuple val(meta), path("*segments.txt"),      emit: segments
+    path "versions.yml",                         emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+    // NOTE: task.ext.args is read as a Groovy map of named options (see meta.yml); default to an empty map when unset.
+    script:
+    def args           = task.ext.args        ?: [:]
+    def prefix         = task.ext.prefix      ?: "${meta.id}"
+    def gender         = args.gender          ?  "$args.gender" :        "NULL"
+    def genomeVersion  = args.genomeVersion   ?  "$args.genomeVersion" : "NULL"
+    def purity         = args.purity          ?  "$args.purity" :        "NULL"
+    def ploidy         = args.ploidy          ?  "$args.ploidy" :        "NULL"
+    def gc_files       = args.gc_files        ?  "$args.gc_files" :      "NULL"
+    // Optional ascat.prepareHTS arguments: each expands to ",name = value", or to nothing when the key is unset.
+    def minCounts_arg                    = args.minCounts                     ?  ",minCounts = $args.minCounts" : ""
+    def chrom_names_arg                  = args.chrom_names                   ?  ",chrom_names = $args.chrom_names" : ""
+    def min_base_qual_arg                = args.min_base_qual                 ?  ",min_base_qual = $args.min_base_qual" : ""
+    def min_map_qual_arg                 = args.min_map_qual                  ?  ",min_map_qual = $args.min_map_qual" : ""
+    def ref_fasta_arg                    = args.ref_fasta                     ?  ",ref.fasta = '$args.ref_fasta'" : ""
+    def skip_allele_counting_tumour_arg  = args.skip_allele_counting_tumour   ?  ",skip_allele_counting_tumour = $args.skip_allele_counting_tumour" : ""
+    def skip_allele_counting_normal_arg  = args.skip_allele_counting_normal   ?  ",skip_allele_counting_normal = $args.skip_allele_counting_normal" : ""
+
+    // The script below is R, not bash. Groovy variables are interpolated before R runs,
+    // so unset purity/ploidy/gc_files reach R as the bare literal NULL and are tested with is.null().
+    """
+    #!/usr/bin/env Rscript
+    library(RColorBrewer)
+    library(ASCAT)
+    options(bitmapType='cairo')
+
+
+    #prepare from BAM files
+    ascat.prepareHTS(
+        tumourseqfile = "$input_tumor",
+        normalseqfile = "$input_normal",
+        tumourname = "Tumour",
+        normalname = "Normal",
+        allelecounter_exe = "alleleCounter",
+        alleles.prefix = "$allele_files",
+        loci.prefix = "$loci_files",
+        gender = "$gender",
+        genomeVersion = "$genomeVersion",
+        nthreads = $task.cpus
+        $minCounts_arg
+        $chrom_names_arg
+        $min_base_qual_arg
+        $min_map_qual_arg
+        $ref_fasta_arg
+        $skip_allele_counting_tumour_arg
+        $skip_allele_counting_normal_arg
+    )
+
+    #prepareHTS names its outputs <tumourname>_tumourLogR/_tumourBAF/_normalLogR/_normalBAF .txt, so the tumour BAF is Tumour_tumourBAF.txt
+    #Load the data
+    ascat.bc = ascat.loadData(
+        Tumor_LogR_file = "Tumour_tumourLogR.txt",
+        Tumor_BAF_file = "Tumour_tumourBAF.txt",
+        Germline_LogR_file = "Tumour_normalLogR.txt",
+        Germline_BAF_file = "Tumour_normalBAF.txt",
+        genomeVersion = "$genomeVersion",
+        gender = "$gender"
+    )
+
+    #optional GC wave correction
+    if(!is.null($gc_files)){
+        ascat.bc = ascat.GCcorrect(ascat.bc, $gc_files)
+    }
+
+    #Plot the raw data
+    ascat.plotRawData(ascat.bc)
+
+    #Segment the data
+    ascat.bc = ascat.aspcf(ascat.bc)
+
+    #Plot the segmented data
+    ascat.plotSegmentedData(ascat.bc)
+
+    #Run ASCAT to fit every tumor to a model, inferring ploidy, normal cell contamination, and discrete copy numbers
+    #If psi and rho are manually set:
+    if (!is.null($purity) && !is.null($ploidy)){
+        ascat.output <- ascat.runAscat(ascat.bc, gamma=1, rho_manual=$purity, psi_manual=$ploidy)
+    } else if(!is.null($purity) && is.null($ploidy)){
+        ascat.output <- ascat.runAscat(ascat.bc, gamma=1, rho_manual=$purity)
+    } else if(!is.null($ploidy) && is.null($purity)){
+        ascat.output <- ascat.runAscat(ascat.bc, gamma=1, psi_manual=$ploidy)
+    } else {
+        ascat.output <- ascat.runAscat(ascat.bc, gamma=1)
+    }
+
+    #Write out segmented regions (including regions with one copy of each allele)
+    write.table(ascat.output[["segments"]], file=paste0("$prefix", ".segments.txt"), sep="\t", quote=F, row.names=F)
+
+    #Write out CNVs in bed format
+    cnvs=ascat.output[["segments"]][2:6]
+    write.table(cnvs, file=paste0("$prefix",".cnvs.txt"), sep="\t", quote=F, row.names=F, col.names=T)
+
+    #Write out purity and ploidy info
+    summary <- tryCatch({
+            matrix(c(ascat.output[["aberrantcellfraction"]], ascat.output[["ploidy"]]), ncol=2, byrow=TRUE)}, error = function(err) {
+                # error handler picks up where error was generated
+                print(paste("Could not find optimal solution:  ",err))
+                return(matrix(c(0,0),nrow=1,ncol=2,byrow = TRUE))
+        }
+    )
+    colnames(summary) <- c("AberrantCellFraction","Ploidy")
+    write.table(summary, file=paste0("$prefix",".purityploidy.txt"), sep="\t", quote=F, row.names=F, col.names=T)
+
+    #version export. Have to hardcode process name and software name because
+    #won't run inside an R-block
+    version_file_path="versions.yml"
+    f <- file(version_file_path,"w")
+    writeLines("ASCAT:", f)
+    writeLines(" ascat: 3.0.0",f)
+    close(f)
+    """
+    // Stub: only touches empty placeholders matching every declared output glob, plus a hardcoded versions.yml.
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.cnvs.txt
+    touch ${prefix}.purityploidy.txt
+    touch ${prefix}.segments.txt
+    touch Tumour.ASCATprofile.png
+    touch Tumour.ASPCF.png
+    touch Tumour.germline.png
+    touch Tumour.rawprofile.png
+    touch Tumour.sunrise.png
+    touch Tumour.tumour.png
+
+    echo 'ASCAT:' > versions.yml
+    echo ' ascat: 3.0.0' >> versions.yml
+    """
+
+
+}
diff --git a/modules/ascat/meta.yml b/modules/ascat/meta.yml
new file mode 100644
index 00000000..949afd6a
--- /dev/null
+++ b/modules/ascat/meta.yml
@@ -0,0 +1,98 @@
+name: ascat
+description: copy number profiles of tumour cells.
+keywords:
+  - copy number
+  - purity
+  - ploidy
+tools:
+  - ascat:
+      description: ASCAT is a method to derive copy number profiles of tumour cells, accounting for normal cell admixture and tumour aneuploidy. ASCAT infers tumour purity (the fraction of tumour cells) and ploidy (the amount of DNA per tumour cell), expressed as multiples of haploid genomes from SNP array or massively parallel sequencing data, and calculates whole-genome allele-specific copy number profiles (the number of copies of both parental alleles for all SNP loci across the genome).
+      homepage: None
+      documentation: None
+      tool_dev_url: https://github.com/Crick-CancerGenomics/ascat
+      doi: "10.1093/bioinformatics/btaa538"
+      licence: ['GPL v3']
+
+input:
+  - args:
+      type: map
+      description: |
+        Groovy Map containing tool parameters. MUST follow the structure/keywords below and be provided via modules.config. Parameters must be set between quotes. <optional> parameters can be removed from the map, if they are not set. For default values, please check the ASCAT documentation.
+
+        ```
+        {
+          [
+            "gender": "XX",
+            "genomeVersion": "hg19",
+            "purity": <optional>,
+            "ploidy": <optional>,
+            "gc_files": <optional>,
+            "minCounts": <optional>,
+            "chrom_names": <optional>,
+            "min_base_qual": <optional>,
+            "min_map_qual": <optional>,
+            "ref_fasta": <optional>,
+            "skip_allele_counting_tumour": <optional>,
+            "skip_allele_counting_normal": <optional>
+          ]
+        }
+        ```
+
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - input_normal:
+      type: file
+      description: BAM/CRAM/SAM file of the normal sample
+      pattern: "*.{bam,cram,sam}"
+  - index_normal:
+      type: file
+      description: index for input_normal
+      pattern: "*.{bai,crai}"
+  - input_tumor:
+      type: file
+      description: BAM/CRAM/SAM file of the tumor sample
+      pattern: "*.{bam,cram,sam}"
+  - index_tumor:
+      type: file
+      description: index for input_tumor
+      pattern: "*.{bai,crai}"
+  - allele_files:
+      type: file
+      description: allele files for ASCAT. Can be downloaded here https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS
+  - loci_files:
+      type: file
+      description: loci files for ASCAT. Can be downloaded here https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - png:
+      type: file
+      description: ASCAT plots
+      pattern: "*.{png}"
+  - cnvs:
+      type: file
+      description: CNV table (columns 2-6 of the segments table, tab-separated)
+      pattern: "*.cnvs.txt"
+  - purityploidy:
+      type: file
+      description: purity and ploidy data
+      pattern: "*.purityploidy.txt"
+  - segments:
+      type: file
+      description: segments data
+      pattern: "*.segments.txt"
+authors:
+  - "@aasNGC"
+  - "@lassefolkersen"
+  - "@FriederikeHanssen"
+  - "@maxulysse"
diff --git a/tests/config/pytest_modules.yml b/tests/config/pytest_modules.yml
index 6ecab096..48c3bb7d 100644
--- a/tests/config/pytest_modules.yml
+++ b/tests/config/pytest_modules.yml
@@ -46,6 +46,10 @@ artic/minion:
   - modules/artic/minion/**
   - tests/modules/artic/minion/**
 
+ascat:
+  - modules/ascat/**
+  - tests/modules/ascat/**
+
 assemblyscan:
   - modules/assemblyscan/**
   - tests/modules/assemblyscan/**
diff --git a/tests/modules/ascat/main.nf b/tests/modules/ascat/main.nf
new file mode 100644
index 00000000..e1f4f798
--- /dev/null
+++ b/tests/modules/ascat/main.nf
@@ -0,0 +1,64 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+include { ASCAT as ASCAT_SIMPLE} from '../../../modules/ascat/main.nf'
+include { ASCAT as ASCAT_PLOIDY_AND_PURITY} from '../../../modules/ascat/main.nf'
+include { ASCAT as ASCAT_CRAM} from '../../../modules/ascat/main.nf'
+
+
+// Smoke test: feeds the two paired-end sorted BAMs (plus indices) from the shared test data
+// to ASCAT_SIMPLE, passing empty lists in place of the allele and loci reference files.
+workflow test_ascat {
+    def illumina = params.test_data['homo_sapiens']['illumina']
+    input = [
+        [ id:'test', single_end:false ], // meta map
+        file(illumina['test_paired_end_sorted_bam'], checkIfExists: true),
+        file(illumina['test_paired_end_sorted_bam_bai'], checkIfExists: true),
+        file(illumina['test2_paired_end_sorted_bam'], checkIfExists: true),
+        file(illumina['test2_paired_end_sorted_bam_bai'], checkIfExists: true)
+    ]
+    ASCAT_SIMPLE ( input , [], [])
+}
+
+
+
+
+
+// Extended tests using 1000 Genomes data. They are commented out because they read inputs from local /home/ec2-user paths. The data can be downloaded as follows:
+// wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase1/data/HG00154/alignment/HG00154.mapped.ILLUMINA.bwa.GBR.low_coverage.20101123.bam
+// wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase1/data/HG00154/alignment/HG00154.mapped.ILLUMINA.bwa.GBR.low_coverage.20101123.bam.bai
+// wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase1/data/HG00155/alignment/HG00155.mapped.ILLUMINA.bwa.GBR.low_coverage.20101123.bam
+// wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase1/data/HG00155/alignment/HG00155.mapped.ILLUMINA.bwa.GBR.low_coverage.20101123.bam.bai
+//workflow test_ascat_with_ploidy_and_purity {  
+//   input = [
+//        [ id:'test', single_end:false ], // meta map
+//        file("/home/ec2-user/input_files/bams/HG00154.mapped.ILLUMINA.bwa.GBR.low_coverage.20101123.bam", checkIfExists: true),
+//        file("/home/ec2-user/input_files/bams/HG00154.mapped.ILLUMINA.bwa.GBR.low_coverage.20101123.bam.bai", checkIfExists: true),
+//        file("/home/ec2-user/input_files/bams/test2.bam", checkIfExists: true),
+//        file("/home/ec2-user/input_files/bams/test2.bam.bai", checkIfExists: true)
+//    ]
+//
+//    ASCAT_PLOIDY_AND_PURITY ( input , "/home/ec2-user/input_files/allele_files/G1000_alleles_hg19_chr", "/home/ec2-user/input_files/loci_files/G1000_alleles_hg19_chr")
+//}
+
+
+// Extended CRAM tests using 1000 Genomes data. They are commented out because they read inputs from local /home/ec2-user paths. The data can be downloaded as follows:
+// wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/HG00145/alignment/HG00145.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam.cram.crai
+// wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/HG00145/alignment/HG00145.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam.cram
+// wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/HG00146/alignment/HG00146.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam.cram.crai
+// wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/HG00146/alignment/HG00146.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam.cram
+//workflow test_ascat_with_crams {
+//    input = [
+//        [ id:'test', single_end:false ], // meta map
+//        file("/home/ec2-user/input_files/crams/HG00145.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam.cram", checkIfExists: true),
+//        file("/home/ec2-user/input_files/crams/HG00145.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam.cram.crai", checkIfExists: true),
+//        file("/home/ec2-user/input_files/crams/duplicate_test.cram", checkIfExists: true),
+//        file("/home/ec2-user/input_files/crams/duplicate_test.cram.crai", checkIfExists: true)
+//    ]
+//
+//    ASCAT_CRAM ( input , "/home/ec2-user/input_files/allele_files/G1000_alleles_hg19_chr", "/home/ec2-user/input_files/loci_files/G1000_alleles_hg19_chr")
+//}
+
+
+
diff --git a/tests/modules/ascat/nextflow.config b/tests/modules/ascat/nextflow.config
new file mode 100644
index 00000000..3c6cc53a
--- /dev/null
+++ b/tests/modules/ascat/nextflow.config
@@ -0,0 +1,39 @@
+process {
+
+    publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
+
+    // ext.args is read by the ASCAT module as a map of named options rather than a command-line string.
+    withName: ASCAT_SIMPLE {
+        ext.args = [
+            gender        : 'XY',
+            genomeVersion : 'hg19',
+            minCounts     : '1',
+            min_base_qual : '1',
+            min_map_qual  : '1',
+            chrom_names   : 'c("21","22")'
+        ]
+    }
+
+    // The two selectors below configure the aliases used by the extended workflows,
+    // which are currently commented out in main.nf.
+    withName: ASCAT_PLOIDY_AND_PURITY {
+        ext.args = [
+            gender        : 'XX',
+            genomeVersion : 'hg19',
+            ploidy        : '1.7',
+            purity        : '0.24',
+            chrom_names   : 'c("21","22")'
+        ]
+    }
+
+    withName: ASCAT_CRAM {
+        ext.args = [
+            gender        : 'XX',
+            genomeVersion : 'hg19',
+            ref_fasta     : '/home/ec2-user/input_files/fasta/human_g1k_v37.fasta',
+            chrom_names   : 'c("21","22")'
+        ]
+    }
+
+}
+
diff --git a/tests/modules/ascat/test.yml b/tests/modules/ascat/test.yml
new file mode 100644
index 00000000..e46c66b4
--- /dev/null
+++ b/tests/modules/ascat/test.yml
@@ -0,0 +1,25 @@
+- name: ascat test_ascat
+  command: nextflow run tests/modules/ascat -entry test_ascat -c tests/config/nextflow.config -stub-run
+  tags:
+    - ascat
+  files:
+    - path: output/ascat/Tumour.ASCATprofile.png
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: output/ascat/Tumour.ASPCF.png
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: output/ascat/Tumour.germline.png
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: output/ascat/Tumour.rawprofile.png
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: output/ascat/Tumour.sunrise.png
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: output/ascat/Tumour.tumour.png
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: output/ascat/test.cnvs.txt
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: output/ascat/test.purityploidy.txt
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: output/ascat/test.segments.txt
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: output/ascat/versions.yml
+      md5sum: 1af20694ec11004c4f8bc0c609b06386