Merge pull request #51 from czbiohub/olgabot/czb-update

[WIP] CZ Biohub update
2024-11-25 09:19:56 +00:00 · 2019-07-09 16:02:33 -07:00 · 2019-07-09 16:02:33 -07:00 · c59ca56525
commit c59ca56525
parent 85f9b2e661 474e98daaf
4 changed files with 65 additions and 44 deletions
--- a/conf/czbiohub_aws.config
+++ b/conf/czbiohub_aws.config
@ -20,10 +20,11 @@ docker {

 process {
  executor = 'awsbatch'
-  queue = 'nextflow'
+  queue = 'default-971039e0-830c-11e9-9e0b-02c5b84a8036'
+  errorStrategy = 'ignore'
 }

-workDir = "s3://czb-nextflow/work"
+workDir = "s3://czb-nextflow/intermediates/"

 aws.region = 'us-west-2'
 executor.awscli = '/home/ec2-user/miniconda/bin/aws'
@ -37,6 +38,10 @@ params {
  max_cpus = 96
  max_time = 240.h

+  // Compatible with multiple versions of rnaseq pipeline
+  seq_center = "czbiohub"
+  seqCenter = "czbiohub"
+
  // illumina iGenomes reference file paths on CZ Biohub reference s3 bucket
  // No final slash because it's added later
  igenomes_base = "s3://czbiohub-reference/igenomes"
@ -50,15 +55,28 @@ params {
  awsregion = "us-west-2"
  awsqueue = "nextflow"

+  igenomesIgnore = true
+
+  fc_extra_attributes = 'gene_name'
+  fc_group_features = 'gene_id'
+  fc_group_features_type = 'gene_type'
+
+  trim_pattern = '_+S\\d+'
+
  // GENCODE GTF and fasta files
  genomes {
    'GRCh38' {
-      fasta   = "${params.gencode_base}/human/v29/GRCh38.p12.genome.fa"
-      gtf     = "${params.gencode_base}/human/v29/gencode.vM19.annotation.gtf"
+      fasta             = "${params.gencode_base}/human/v30/GRCh38.p12.genome.ERCC92.fa"
+      gtf               = "${params.gencode_base}/human/v30/gencode.v30.annotation.ERCC92.gtf"
+      transcript_fasta  = "${params.gencode_base}/human/v30/gencode.v30.transcripts.ERCC92.fa"
+      star              = "${params.gencode_base}/human/v30/STARIndex/"
+      salmon_index      = "${params.gencode_base}/human/v30/salmon_index/"
    }
    'GRCm38' {
-      fasta   = "${params.gencode_base}/mouse/vM19/GRCm38.p6.genome.fa"
-      gtf     = "${params.gencode_base}/mouse/vM19/gencode.vM19.annotation.gtf"
+      fasta             = "${params.gencode_base}/mouse/vM21/GRCm38.p6.genome.ERCC92.fa"
+      gtf               = "${params.gencode_base}/mouse/vM21/gencode.vM21.annotation.ERCC92.gtf"
+      transcript_fasta  = "${params.gencode_base}/mouse/vM21/gencode.vM21.transcripts.ERCC92.fa"
+      star             = "${params.gencode_base}/mouse/vM21/STARIndex/"
    }
  }

--- a/conf/czbiohub_aws_highpriority.config
+++ b/conf/czbiohub_aws_highpriority.config
@ -0,0 +1,12 @@
+/*
+ * -------------------------------------------------
+ *  Nextflow config file for Chan Zuckerberg Biohub
+ * -------------------------------------------------
+ * Sets the high-priority AWS Batch queue for jobs.
+ * Included after czbiohub_aws.config by the
+ * 'czbiohub_aws_highpriority' profile in nfcore_custom.config
+ */
+
+process {
+  queue = 'highpriority-971039e0-830c-11e9-9e0b-02c5b84a8036'
+}
--- a/docs/czbiohub.md
+++ b/docs/czbiohub.md
@ -8,47 +8,20 @@ Ask Olga (olga.botvinnik@czbiohub.org) if you have any questions!

 ## Run the pipeline from a small AWS EC2 Instance

-The pipeline will monitor and submit jobs to AWS Batch on your behalf. To ensure that the pipeline is successful, it will need to be run from a computer that has constant internet connection. Unfortunately for us, Biohub has spotty WiFi and even for short pipelines, it is highly recommended to run them from AWS. Make sure you have [aegea](https://github.com/czbiohub/codonboarding/blob/master/guides/aegea.md) installed to make launching AWS instances from the command line much easier.
+The pipeline will monitor and submit jobs to AWS Batch on your behalf. To ensure that the pipeline is successful, it will need to be run from a computer that has constant internet connection. Unfortunately for us, Biohub has spotty WiFi and even for short pipelines, it is highly recommended to run them from AWS.

-### 1. Launch the instance
+### 1. Start tmux

-There is an Elastic Compute Cluster (EC2) Amazon machine image (AMI) set up with everything you need for Nextflow already installed. Its ID is `ami-091ec599f8e77734d` and can be launched from the command line with aegea like this:
+[tmux](https://hackernoon.com/a-gentle-introduction-to-tmux-8d784c404340) is a "Terminal Multiplexer" that allows for commands to continue running even when you have closed your laptop. Start a new tmux session with `tmux new` and we'll name this session `nextflow`.

 ```
-aegea launch --iam-role S3fromEC2 -t t2.small --ami ami-091ec599f8e77734d --subnet subnet-672e832e $USER-nextflow
-```
-
-For example:
-
-```
-aegea launch --iam-role S3fromEC2 -t t2.small --ami ami-0adcc973d6f458a1e --subnet subnet-672e832e olgabot-nextflow
-```
-
-### 2. Log into the instance
-
-Log into the instance with `aegea ssh`:
-
-```
-aegea ssh ec2-user@$USER-nextflow
-```
-
-For a concrete example:
-
-```
-aegea ssh ec2-user@olgabot-nextflow
-```
-
-### 3. Start tmux
-
-[tmux](https://hackernoon.com/a-gentle-introduction-to-tmux-8d784c404340) is a "Terminal Multiplexer" that allows for commands to continue running even when you have closed your laptop and lost your connection. Start a new tmux session with `tmux new`
-
-```
-tmux new
+tmux new -s nextflow
 ```

 Now you can run pipelines with abandon!

-### 4. Make a GitHub repo for your workflows (optional :)
+### 2. Make a GitHub repo for your workflows (optional :)
+

 To make sharing your pipelines and commands easy between your teammates, it's best to share code in a GitHub repository. One way is to store the commands in a Makefile ([example](https://github.com/czbiohub/kh-workflows/blob/master/nf-kmer-similarity/Makefile)) which can contain multiple `nextflow run` commands so that you don't need to remember the S3 bucket or output directory for every single one. [Makefiles](https://kbroman.org/minimal_make/) are broadly used in the software community for running many complex commands. Makefiles can have a lot of dependencies and be confusing, so we're only going to write *simple* Makefiles.

@ -108,7 +81,8 @@ git push origin master
 ```


-### 5. Run your workflow!!
+### 3. Run your workflow!!
+

 Remember to specify `-profile czbiohub_aws` to grab the CZ Biohub-specific AWS configurations, and an `--outdir` with an AWS S3 bucket so you don't run out of space on your small AMI

@ -119,18 +93,31 @@ nextflow run -profile czbiohub_aws nf-core/rnaseq \
    --outdir s3://olgabot-maca/nextflow-test/
 ```

-### 6. If you lose connection, how do you re-attach the tmux session?
+### 4. If you lose connection, how do you restart the jobs?

-If you close your laptop, get onto the train, or lose WiFi connection, you may lose connection to your AWS EC2 instance. To reattach, use the command `tmux attach` and you should see your Nextflow output!
+If you close your laptop, get onto the train, or lose WiFi connection, you may lose connection to AWS and may need to restart the jobs. To reattach, use the command `tmux attach` and you should see your Nextflow output! To get the named session, use:

 ```
-tmux attach
+tmux attach -t nextflow
 ```

+To restart the jobs from where you left off, add the `-resume` flag to your `nextflow` command:
+
+
+```
+nextflow run -profile czbiohub_aws nf-core/rnaseq \
+    --reads 's3://czb-maca/Plate_seq/24_month/180626_A00111_0166_BH5LNVDSXX/fastqs/*{R1,R2}*.fastq.gz' \
+    --genome GRCm38 \
+    --outdir s3://olgabot-maca/nextflow-test/ \
+    -resume
+```
+
+It's important that this command be re-run from the same directory as there is a "hidden" `.nextflow` folder that contains all the metadata and information about previous runs.

 ## iGenomes specific configuration

-A local copy of the iGenomes resource has been made available on `s3://czbiohub-reference` (in `us-west-2` region) so you should be able to run the pipeline against any reference available in the `igenomes.config` specific to the nf-core pipeline.
+A local copy of the iGenomes resource has been made available on `s3://czbiohub-reference/igenomes` (in `us-west-2` region) so you should be able to run the pipeline against any reference available in the `igenomes.config` specific to the nf-core pipeline.
+
 You can do this by simply using the `--genome <GENOME_ID>` parameter.

 For Human and Mouse, we use [GENCODE](https://www.gencodegenes.org/) gene annotations. This doesn't change how you would specify the genome name, only that the pipelines run with the `czbiohub_aws` profile would be with GENCODE rather than iGenomes.
--- a/nfcore_custom.config
+++ b/nfcore_custom.config
@ -17,6 +17,10 @@ profiles {
  cfc          { includeConfig "${params.custom_config_base}/conf/cfc.config" }
  crick        { includeConfig "${params.custom_config_base}/conf/crick.config" }
  czbiohub_aws { includeConfig "${params.custom_config_base}/conf/czbiohub_aws.config" }
+  czbiohub_aws_highpriority {
+    includeConfig "${params.custom_config_base}/conf/czbiohub_aws.config"
+    includeConfig "${params.custom_config_base}/conf/czbiohub_aws_highpriority.config"
+   }
  gis          { includeConfig "${params.custom_config_base}/conf/gis.config" }
  hebbe        { includeConfig "${params.custom_config_base}/conf/hebbe.config" }
  mendel       { includeConfig "${params.custom_config_base}/conf/mendel.config" }