Add Cell Ranger mkfastq, mkgtf, and count (#979)

* feat(cellranger): Add initial count module

Co-authored-by: Gisela Gabernet <gisela.gabernet@gmail.com>

* feat(cellranger): Add mkgtf module

* test(cellranger): Fix count test with mkgtf

* fix(cellranger): Generalize gtf attribute filters

* chore: Add .gitignore for cellranger tar

* build(cellranger): Update dockerfile

https://joshtronic.com/2021/09/12/fixed-repository-debian-security-buster-updates-changed-suite-from-stable-to-oldstable/

* Apply suggestions from code review

Co-authored-by: Gisela Gabernet <gisela.gabernet@gmail.com>

* Apply suggestions from code review

Co-authored-by: Harshil Patel <drpatelh@users.noreply.github.com>

* Update modules/cellranger/mkgtf/main.nf

Co-authored-by: Harshil Patel <drpatelh@users.noreply.github.com>

* style: Capitalize README

* test(cellranger): Update pytest_modules

* feat(cellranger): Add initial mkfastq module

* ci: Update pytest modules

* refactor(cellranger): Update modules to new syntax

* docs(cellranger): Update meta files

There is some terrible copy-pasting going on.

* fix(cellranger): Add args

Co-authored-by: Gisela Gabernet <gisela.gabernet@gmail.com>
Co-authored-by: Harshil Patel <drpatelh@users.noreply.github.com>
This commit is contained in:
Edmund Miller 2021-12-02 08:27:20 -06:00 committed by GitHub
parent 9d0cad583b
commit e2ba70ed9a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
20 changed files with 441 additions and 33 deletions

1
modules/cellranger/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
cellranger-*.tar.gz

View file

@ -4,7 +4,7 @@ LABEL authors="Gisela Gabernet <gisela.gabernet@gmail.com>" \
# Disclaimer: this container is not provided nor supported by 10x Genomics.
# Install procps and clean apt cache
RUN apt-get update \
RUN apt-get update --allow-releaseinfo-change \
&& apt-get install -y procps \
&& apt-get clean -y && rm -rf /var/lib/apt/lists/*

View file

@ -0,0 +1,49 @@
// Runs `cellranger count` on the FASTQ files staged for one GEM well and
// emits every entry of the resulting `sample-<gem>/outs/` directory.
//
// Inputs:
//   meta      - map; this process reads `meta.gem` (run id suffix) and
//               `meta.samples` (list of sample names passed to --sample)
//   reads     - FASTQ files, staged into the task directory (--fastqs=.)
//   reference - reference directory; its name is passed to --transcriptome
// Outputs:
//   outs      - everything matching sample-<gem>/outs/*
//   versions  - versions.yml recording the cellranger version
process CELLRANGER_COUNT {
    tag "$meta.gem"
    label 'process_high'

    // This module only supports containers; abort early when conda is enabled.
    if (params.enable_conda) {
        exit 1, "Conda environments cannot be used when using the Cell Ranger tool. Please use docker or singularity containers."
    }
    container "nfcore/cellranger:6.0.2"

    input:
    tuple val(meta), path(reads)
    path reference

    output:
    path("sample-${meta.gem}/outs/*"), emit: outs
    path "versions.yml"              , emit: versions

    script:
    def args              = task.ext.args ?: ''
    def run_id            = "sample-${meta.gem}"
    // De-duplicated, comma-separated sample names for --sample.
    def sample_names      = meta.samples.unique().join(",")
    def transcriptome_dir = reference.name
    """
    cellranger \\
        count \\
        --id='${run_id}' \\
        --fastqs=. \\
        --transcriptome=${transcriptome_dir} \\
        --sample=${sample_names} \\
        --localcores=${task.cpus} \\
        --localmem=${task.memory.toGiga()} \\
        $args

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        cellranger: \$(echo \$( cellranger --version 2>&1) | sed 's/^.*[^0-9]\\([0-9]*\\.[0-9]*\\.[0-9]*\\).*\$/\\1/' )
    END_VERSIONS
    """

    stub:
    // Creates a placeholder output so downstream wiring can be exercised
    // without running the actual count pipeline.
    def stub_outs = "sample-${meta.gem}/outs"
    """
    mkdir -p "${stub_outs}/"
    touch ${stub_outs}/fake_file.txt

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        cellranger: \$(echo \$( cellranger --version 2>&1) | sed 's/^.*[^0-9]\\([0-9]*\\.[0-9]*\\.[0-9]*\\).*\$/\\1/' )
    END_VERSIONS
    """
}

View file

@ -0,0 +1,40 @@
# Module metadata for CELLRANGER_COUNT (modules/cellranger/count/main.nf).
name: cellranger_count
description: Module to use Cell Ranger's pipelines to analyze sequencing data produced from Chromium Single Cell Gene Expression.
keywords:
  - align
  - count
  - reference
tools:
  - cellranger:
      description: Cell Ranger by 10x Genomics is a set of analysis pipelines that process Chromium single-cell data to align reads, generate feature-barcode matrices, perform clustering and other secondary analysis, and more.
      homepage: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger
      documentation: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/tutorial_ov
      tool_dev_url: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/tutorial_ov
      doi: ""
      licence: 10x Genomics EULA
input:
  # The process reads `meta.gem` (used in the run id and output path) and
  # `meta.samples` (joined with commas for --sample), so both must be set.
  - meta:
      type: map
      description: |
        Groovy Map containing sample information. Must provide `gem`
        (identifier used for the `sample-<gem>` run id) and `samples`
        (list of sample names passed to `--sample`)
        e.g. [ id:'test', gem:'123', samples:['test_10x'] ]
  - reads:
      type: file
      description: |
        List of input FastQ files. They are staged into the task directory,
        which is passed to Cell Ranger via `--fastqs=.`
  - reference:
      type: folder
      description: Folder containing all the reference indices needed by Cell Ranger
output:
  - outs:
      type: file
      description: Files containing the outputs of Cell Ranger
      pattern: "sample-${meta.gem}/outs/*"
  - versions:
      type: file
      description: File containing software version
      pattern: "versions.yml"
authors:
  - "@ggabernet"
  - "@Emiller88"

View file

@ -0,0 +1,31 @@
// Runs `cellranger mkfastq` on a BCL run directory with the given sample
// sheet and records the cellranger version.
//
// Inputs:
//   bcl - path handed to --run; its simple name is also used as --id
//   csv - sample sheet handed to --csv
// Outputs:
//   versions - versions.yml recording the cellranger version
//   fastq    - files matching *.fastq.gz in the task directory
process CELLRANGER_MKFASTQ {
    tag "mkfastq"
    label 'process_medium'

    // This module only supports containers; abort early when conda is enabled.
    if (params.enable_conda) {
        exit 1, "Conda environments cannot be used when using the Cell Ranger tool. Please use docker or singularity containers."
    }
    container "litd/docker-cellranger:v6.1.1" // FIXME Add bcl2fastq to nf-core docker image

    input:
    path bcl
    path csv

    output:
    path "versions.yml", emit: versions
    // NOTE(review): cellranger mkfastq is documented to write FASTQs below
    // <id>/outs/fastq_path/ — confirm this top-level glob actually matches.
    path "*.fastq.gz"  , emit: fastq

    script:
    def args = task.ext.args ?: ''
    // Every option line must end in a continuation so that `$args` is part of
    // the mkfastq invocation rather than being run as a separate command.
    """
    cellranger \\
        mkfastq \\
        --id=${bcl.getSimpleName()} \\
        --run=$bcl \\
        --csv=$csv \\
        $args

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        cellranger: \$(echo \$( cellranger --version 2>&1) | sed 's/^.*[^0-9]\\([0-9]*\\.[0-9]*\\.[0-9]*\\).*\$/\\1/' )
    END_VERSIONS
    """
}

View file

@ -0,0 +1,38 @@
# Module metadata for CELLRANGER_MKFASTQ (modules/cellranger/mkfastq/main.nf).
name: cellranger_mkfastq
description: Module to create fastqs needed by the 10x Genomics Cell Ranger tool. Uses the cellranger mkfastq command.
keywords:
  - reference
  - mkfastq
  - fastq
  - illumina
  - bcl2fastq
tools:
  - cellranger:
      description: Cell Ranger by 10x Genomics is a set of analysis pipelines that process Chromium single-cell data to align reads, generate feature-barcode matrices, perform clustering and other secondary analysis, and more.
      homepage: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger
      documentation: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/tutorial_ov
      tool_dev_url: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/tutorial_ov
      doi: ""
      licence: 10x Genomics EULA
input:
  # `bcl` is passed to `cellranger mkfastq --run`, which takes the run folder
  # rather than an individual *.bcl.bgzf file.
  - bcl:
      type: folder
      description: Illumina BCL run folder containing the base call files, passed to `--run`
  - csv:
      type: file
      description: Sample sheet
      pattern: "*.csv"
output:
  - fastq:
      type: file
      description: Unaligned FastQ files
      pattern: "*.fastq.gz"
  - versions:
      type: file
      description: File containing software version
      pattern: "versions.yml"
authors:
  - "@ggabernet"
  - "@Emiller88"
  - "@RHReynolds"

View file

@ -0,0 +1,31 @@
// Runs `cellranger mkgtf` to write a filtered copy of the input GTF named
// `<input base name>.filtered.gtf`. Filter options (e.g. --attribute=...)
// are supplied through `task.ext.args`.
//
// Inputs:
//   gtf - GTF annotation file to filter
// Outputs:
//   gtf      - the *.filtered.gtf file
//   versions - versions.yml recording the cellranger version
process CELLRANGER_MKGTF {
    tag "$gtf"
    label 'process_low'

    // This module only supports containers; abort early when conda is enabled.
    if (params.enable_conda) {
        exit 1, "Conda environments cannot be used when using the Cell Ranger tool. Please use docker or singularity containers."
    }
    container "nfcore/cellranger:6.0.2"

    input:
    path gtf

    output:
    path "*.filtered.gtf", emit: gtf
    path "versions.yml"  , emit: versions

    script:
    def args         = task.ext.args ?: ''
    def filtered_gtf = "${gtf.baseName}.filtered.gtf"
    """
    cellranger \\
        mkgtf \\
        ${gtf} \\
        ${filtered_gtf} \\
        $args

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        cellranger: \$(echo \$( cellranger --version 2>&1) | sed 's/^.*[^0-9]\\([0-9]*\\.[0-9]*\\.[0-9]*\\).*\$/\\1/' )
    END_VERSIONS
    """
}

View file

@ -0,0 +1,31 @@
# Module metadata for CELLRANGER_MKGTF (modules/cellranger/mkgtf/main.nf).
name: cellranger_mkgtf
description: Module to build a filtered gtf needed by the 10x Genomics Cell Ranger tool. Uses the cellranger mkgtf command.
keywords:
  - reference
  - mkgtf
  - gtf
tools:
  - cellranger:
      description: Cell Ranger by 10x Genomics is a set of analysis pipelines that process Chromium single-cell data to align reads, generate feature-barcode matrices, perform clustering and other secondary analysis, and more.
      homepage: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger
      documentation: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/tutorial_ov
      tool_dev_url: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/tutorial_ov
      doi: ""
      licence: 10x Genomics EULA
input:
  - gtf:
      type: file
      description: GTF annotation file to be filtered by cellranger mkgtf
      pattern: "*.gtf"
output:
  # The process emits a single `<input base name>.filtered.gtf` file.
  - gtf:
      type: file
      description: Filtered gtf transcriptome file
      pattern: "*.filtered.gtf"
  - versions:
      type: file
      description: File containing software version
      pattern: "versions.yml"
authors:
  - "@ggabernet"
  - "@Emiller88"

View file

@ -12,7 +12,6 @@ tools:
tool_dev_url: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/tutorial_ov
doi: ""
licence: 10x Genomics EULA
input:
- fasta:
type: file
@ -26,14 +25,13 @@ input:
type: val
description: name to give the reference folder
pattern: str
output:
- reference:
type: folder
description: Folder containing all the reference indices needed by Cell Ranger
- versions:
type: file
description: File containing software version
pattern: "versions.yml"
- reference:
type: folder
description: Folder containing all the reference indices needed by Cell Ranger
authors:
- "@ggabernet"

View file

@ -270,9 +270,25 @@ cat/fastq:
- modules/cat/fastq/**
- tests/modules/cat/fastq/**
# Each key is a pytest tag; its list holds the path globs whose changes
# should trigger that tag's tests. The module lives in cellranger/mkgtf
# (not cellranger/gtf), matching the `cellranger/mkgtf` tag in its test.yml.
cellranger/count:
- modules/cellranger/count/**
- tests/modules/cellranger/count/**
# The count test also runs mkref and mkgtf, so changes there retrigger it.
- modules/cellranger/mkref/**
- tests/modules/cellranger/mkref/**
- modules/cellranger/mkgtf/**
- tests/modules/cellranger/mkgtf/**

cellranger/mkfastq:
- modules/cellranger/mkfastq/**
- tests/modules/cellranger/mkfastq/**

cellranger/mkgtf:
- modules/cellranger/mkgtf/**
- tests/modules/cellranger/mkgtf/**

cellranger/mkref:
- modules/cellranger/mkref/**
- tests/modules/cellranger/mkref/**
# Also listed under count: mkgtf changes are set to retrigger mkref tests.
- modules/cellranger/mkgtf/**
- tests/modules/cellranger/mkgtf/**
checkm/lineagewf:
- modules/checkm/lineagewf/**

View file

@ -0,0 +1,33 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { CELLRANGER_MKGTF } from '../../../../modules/cellranger/mkgtf/main.nf'
include { CELLRANGER_MKREF } from '../../../../modules/cellranger/mkref/main.nf'
include { CELLRANGER_COUNT } from '../../../../modules/cellranger/count/main.nf'
// End-to-end test: filter the test GTF, build a Cell Ranger reference from
// it and the test genome FASTA, then run CELLRANGER_COUNT on the 10x reads.
workflow test_cellranger_count {

    // `gem` and `samples` are the meta fields CELLRANGER_COUNT reads.
    sample_meta  = [ id:'test', single_end:true, strandedness:'forward', gem: '123', samples: ["test_10x"] ]
    sample_reads = [
        file(params.test_data['homo_sapiens']['illumina']['test_10x_1_fastq_gz'], checkIfExists: true),
        file(params.test_data['homo_sapiens']['illumina']['test_10x_2_fastq_gz'], checkIfExists: true)
    ]
    count_input  = [ sample_meta, sample_reads ]

    genome_fasta = file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)
    genome_gtf   = file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)
    ref_name     = "homo_sapiens_chr22_reference"

    CELLRANGER_MKGTF ( genome_gtf )

    CELLRANGER_MKREF ( genome_fasta, CELLRANGER_MKGTF.out.gtf, ref_name )

    CELLRANGER_COUNT ( count_input, CELLRANGER_MKREF.out.reference )
}

View file

@ -0,0 +1,31 @@
// Test configuration for the cellranger/count test workflow.
process {
// Publish into "<outdir>/<tool>": take the last ':'-separated part of the
// process name, then its first '_'-separated token, lower-cased
// (e.g. CELLRANGER_COUNT -> "cellranger").
publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
// Extra command-line options for CELLRANGER_MKGTF: one --attribute filter
// per gene_biotype value listed below.
withName: CELLRANGER_MKGTF {
ext.args = '--attribute=gene_biotype:protein_coding \
--attribute=gene_biotype:lincRNA \
--attribute=gene_biotype:antisense \
--attribute=gene_biotype:IG_LV_gene \
--attribute=gene_biotype:IG_V_gene \
--attribute=gene_biotype:IG_V_pseudogene \
--attribute=gene_biotype:IG_D_gene \
--attribute=gene_biotype:IG_J_gene \
--attribute=gene_biotype:IG_J_pseudogene \
--attribute=gene_biotype:IG_C_gene \
--attribute=gene_biotype:IG_C_pseudogene \
--attribute=gene_biotype:TR_V_gene \
--attribute=gene_biotype:TR_V_pseudogene \
--attribute=gene_biotype:TR_D_gene \
--attribute=gene_biotype:TR_J_gene \
--attribute=gene_biotype:TR_J_pseudogene \
--attribute=gene_biotype:TR_C_gene'
}
// Extra command-line option for CELLRANGER_COUNT: sets the chemistry explicitly.
withName: CELLRANGER_COUNT {
ext.args = '--chemistry SC3Pv3'
}
}

View file

@ -0,0 +1,19 @@
# pytest-workflow case for the cellranger/count module: runs the
# test_cellranger_count entry workflow and checks the published outputs.
- name: cellranger count test_cellranger_count
command: nextflow run tests/modules/cellranger/count -entry test_cellranger_count -c tests/config/nextflow.config -c tests/modules/cellranger/count/nextflow.config
tags:
- cellranger
- cellranger/count
# "sample-123" comes from gem: '123' in the test's meta map.
# NOTE(review): entries without an md5sum are presumably not byte-reproducible
# between runs, so only their paths are listed — confirm.
files:
- path: output/cellranger/sample-123/outs/filtered_feature_bc_matrix.h5
- path: output/cellranger/sample-123/outs/metrics_summary.csv
md5sum: 707df0f101d479d93f412ca74f9c4131
- path: output/cellranger/sample-123/outs/molecule_info.h5
md5sum: cf03b2b3ca776a1c37aa3518e91268ba
- path: output/cellranger/sample-123/outs/possorted_genome_bam.bam
md5sum: 15441da9cfceea0bb48c8b66b1b860df
- path: output/cellranger/sample-123/outs/possorted_genome_bam.bam.bai
md5sum: 7c3d49c77016a09535aff61a027f750c
- path: output/cellranger/sample-123/outs/raw_feature_bc_matrix
- path: output/cellranger/sample-123/outs/raw_feature_bc_matrix.h5
md5sum: 40c8df814eb8723b7317b234dc8222e9
- path: output/cellranger/sample-123/outs/web_summary.html

View file

@ -0,0 +1,26 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { UNTAR } from '../../../../modules/untar/main.nf'
include { CELLRANGER_MKFASTQ } from '../../../../modules/cellranger/mkfastq/main.nf'
// Runs CELLRANGER_MKFASTQ on the 10x "tiny-bcl" archive using the
// "simple" CSV sample sheet. Both inputs are fetched from cf.10xgenomics.com.
workflow test_cellranger_mkfastq_simple {

    bcl_archive  = file("https://cf.10xgenomics.com/supp/cell-exp/cellranger-tiny-bcl-1.2.0.tar.gz", checkIfExists: true)
    sample_sheet = file("https://cf.10xgenomics.com/supp/cell-exp/cellranger-tiny-bcl-simple-1.2.0.csv", checkIfExists: true)

    // Unpack the archive first; the untarred output is what mkfastq receives.
    UNTAR ( bcl_archive )

    CELLRANGER_MKFASTQ ( UNTAR.out.untar, sample_sheet )
}
// Runs CELLRANGER_MKFASTQ on the 10x "tiny-bcl" archive using the
// "samplesheet" CSV. Both inputs are fetched from cf.10xgenomics.com.
workflow test_cellranger_mkfastq_illumina {

    bcl_archive  = file("https://cf.10xgenomics.com/supp/cell-exp/cellranger-tiny-bcl-1.2.0.tar.gz", checkIfExists: true)
    sample_sheet = file("https://cf.10xgenomics.com/supp/cell-exp/cellranger-tiny-bcl-samplesheet-1.2.0.csv", checkIfExists: true)

    // Unpack the archive first; the untarred output is what mkfastq receives.
    UNTAR ( bcl_archive )

    CELLRANGER_MKFASTQ ( UNTAR.out.untar, sample_sheet )
}

View file

@ -0,0 +1,5 @@
// Test configuration for the cellranger/mkfastq test workflows.
process {
// Publish into "<outdir>/<tool>": take the last ':'-separated part of the
// process name, then its first '_'-separated token, lower-cased
// (e.g. CELLRANGER_MKFASTQ -> "cellranger").
publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
}

View file

@ -0,0 +1,13 @@
# pytest-workflow cases for the cellranger/mkfastq module. Each case runs one
# entry workflow; neither case lists any expected output files.
- name: cellranger mkfastq test_cellranger_mkfastq_simple
command: nextflow run tests/modules/cellranger/mkfastq -entry test_cellranger_mkfastq_simple -c tests/config/nextflow.config -c ./tests/modules/cellranger/mkfastq/nextflow.config
tags:
- cellranger
- cellranger/mkfastq
# TODO: add a `files:` section asserting on the FASTQ files produced by mkfastq.
- name: cellranger mkfastq test_cellranger_mkfastq_illumina
command: nextflow run tests/modules/cellranger/mkfastq -entry test_cellranger_mkfastq_illumina -c tests/config/nextflow.config -c ./tests/modules/cellranger/mkfastq/nextflow.config
tags:
- cellranger
- cellranger/mkfastq

View file

@ -0,0 +1,11 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { CELLRANGER_MKGTF } from '../../../../modules/cellranger/mkgtf/main.nf'
// Runs CELLRANGER_MKGTF on the homo_sapiens genome GTF from the test data.
workflow test_cellranger_mkgtf {

    genome_gtf = file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)

    CELLRANGER_MKGTF ( genome_gtf )
}

View file

@ -0,0 +1,27 @@
// Test configuration for the cellranger/mkgtf test workflow.
process {
// Publish into "<outdir>/<tool>": take the last ':'-separated part of the
// process name, then its first '_'-separated token, lower-cased
// (e.g. CELLRANGER_MKGTF -> "cellranger").
publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
// Extra command-line options for CELLRANGER_MKGTF: one --attribute filter
// per gene_biotype value listed below.
withName: CELLRANGER_MKGTF {
ext.args = '--attribute=gene_biotype:protein_coding \
--attribute=gene_biotype:lincRNA \
--attribute=gene_biotype:antisense \
--attribute=gene_biotype:IG_LV_gene \
--attribute=gene_biotype:IG_V_gene \
--attribute=gene_biotype:IG_V_pseudogene \
--attribute=gene_biotype:IG_D_gene \
--attribute=gene_biotype:IG_J_gene \
--attribute=gene_biotype:IG_J_pseudogene \
--attribute=gene_biotype:IG_C_gene \
--attribute=gene_biotype:IG_C_pseudogene \
--attribute=gene_biotype:TR_V_gene \
--attribute=gene_biotype:TR_V_pseudogene \
--attribute=gene_biotype:TR_D_gene \
--attribute=gene_biotype:TR_J_gene \
--attribute=gene_biotype:TR_J_pseudogene \
--attribute=gene_biotype:TR_C_gene'
}
}

View file

@ -0,0 +1,8 @@
# pytest-workflow case for the cellranger/mkgtf module: runs the
# test_cellranger_mkgtf entry workflow and checks the filtered GTF by md5sum.
- name: cellranger mkgtf test_cellranger_mkgtf
command: nextflow run tests/modules/cellranger/mkgtf -entry test_cellranger_mkgtf -c tests/config/nextflow.config -c tests/modules/cellranger/mkgtf/nextflow.config
tags:
- cellranger
- cellranger/mkgtf
files:
# The input is genome.gtf, so the module writes genome.filtered.gtf.
- path: output/cellranger/genome.filtered.gtf
md5sum: a8b8a7b5039e05d3a9cf9151ea138b5b