Merge branch 'release/0.5'
Hoohm committed Oct 29, 2019
2 parents 9099c59 + d4af6d0 commit 4fc0de4
Showing 39 changed files with 1,521 additions and 436 deletions.
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
* text=auto !eol
3 changes: 2 additions & 1 deletion .gitignore
@@ -1 +1,2 @@

.snakemake
scripts/__pycache__*
3 changes: 2 additions & 1 deletion .test/config.yaml
@@ -50,4 +50,5 @@ EXTRACTION:
- UTR
strand-strategy: SENSE
UMI-edit-distance: 1
minimum-counts-per-UMI: 0
minimum-counts-per-UMI: 0
DEBUG: True
19 changes: 14 additions & 5 deletions README.md
@@ -7,21 +7,22 @@ This pipeline is based on [snakemake](https://snakemake.readthedocs.io/en/stable

This is the tool we use in our lab to improve our wetlab protocol as well as provide an easy framework to reproduce and compare different experiments with different parameters.

It uses STAR to map the reads. It is usable for any single cell protocol using two reads where the first one holds the Cell and UMI barcodes and the second read holds the RNA. Here is a non-exhaustive list of compatible protocols:
It uses STAR to map the reads. It is usable for any single cell protocol using two reads where the first one holds the Cell and UMI barcodes and the second read holds the RNA. Here is a non-exhaustive list of compatible protocols/brands:

* Drop-Seq
* SCRB-Seq
* 10x Genomics
* DroNc-seq
* Dolomite Bio ([Nadia Instrument](https://www.dolomite-bio.com/product/nadia-instrument/))
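To make the two-read layout concrete, here is a minimal sketch (not code from this repository; the barcode and UMI lengths are assumptions in the style of Drop-Seq, and the pipeline actually takes them from its configuration) of how read 1 decomposes:

```python
# Read 1 carries the cell barcode followed by the UMI; read 2 carries the cDNA.
CELL_BC_LEN = 12   # assumed cell barcode length
UMI_LEN = 8        # assumed UMI length

read1 = "ACGTACGTACGTTTGGCCAATTGG"            # toy read-1 sequence
cell_barcode = read1[:CELL_BC_LEN]             # "ACGTACGTACGT"
umi = read1[CELL_BC_LEN:CELL_BC_LEN + UMI_LEN] # "TTGGCCAA"
```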

This package is trying to be as user friendly as possible. One of the hopes is that non-bioinformaticians can make use of it without too much hassle. It will still require some command-line execution; this is not going to be a fully interactive package.


## Authors

* Patrick Roelli (@Hoohm)
* Sebastian Mueller (@seb-mueller)
* Charles Girardot (@cgirardot)
* Patrick Roelli ([@Hoohm](/~https://github.com/Hoohm))
* Sebastian Mueller ([@seb-mueller](/~https://github.com/seb-mueller))
* Charles Girardot ([@cgirardot](/~https://github.com/cgirardot))

## Usage

@@ -79,4 +80,12 @@ I'm actively seeking help to implement the points listed below. Don't hesitate

I hope it can help you out in your single cell experiments!

Feel free to comment and point out potential improvements via [issues](/~https://github.com/Hoohm/dropSeqPipe/issues)
Feel free to comment and point out potential improvements via [issues](/~https://github.com/Hoohm/dropSeqPipe/issues)


TODO
---------------------------------------------
* Add a mixed reference for testing purposes
* Finalize the parameters validation schema
* Make the debug feature a bit "cleaner". Deal with automatic naming of the debug variables
* Implement ddseq barcoding strategies
87 changes: 54 additions & 33 deletions Snakefile
@@ -2,6 +2,12 @@ import pandas as pd
import os
import re
import glob
from snakemake.utils import validate, min_version

singularity:
"shub://seb-mueller/singularity_dropSeqPipe:v04"

min_version("5.1.2")

#print(os.path.abspath(os.path.dirname(workflow.snakefile)))

@@ -10,21 +16,23 @@ import glob
try:
configfile_path = config['configfile_path']
except:
configfile_path = "config.yaml"
configfile_path = "config.yaml"
configfile: configfile_path


#Include the gtf biotypes yaml
configfile: config['META']['gtf_biotypes']

# Define a few variables to make them easier to reference
snakefile_root_path = os.path.abspath(os.path.dirname(workflow.snakefile))
ref_path = config['META']['reference-directory']
barcode_whitelist = config['FILTER']['barcode-whitelist']
results_dir = config['LOCAL']['results']
raw_data_dir = config['LOCAL']['raw_data']

# dropSeqPipe version
config['version'] = '0.4'
config['version'] = '0.5'
validate(config, schema=os.path.join(snakefile_root_path,"schemas","config.schema.yaml"))


# In order to deal with single species or mixed species experiment
@@ -65,7 +73,8 @@ else:
exit("Number of species in the config.yaml must be one or two. Exiting")

# Get sample names from samples.csv
samples = pd.read_table("samples.csv", header=0, sep=',', index_col=0)
samples = pd.read_table("samples.csv", sep=',').set_index("samples", drop=False)
validate(samples, schema=os.path.join(snakefile_root_path,"schemas","samples.schema.yaml"))
types=['read','umi']
# Get read_lengths from samples.csv
read_lengths = list(samples.loc[:,'read_length'])
Expand Down Expand Up @@ -101,15 +110,19 @@ if len(config['META']['species'].keys()) == 2:
#qc
'{results_dir}/reports/fastqc_reads.html',
'{results_dir}/reports/fastqc_barcodes.html',
#fastqc_adapter
'fastqc_adapter.tsv',
#filter
'{results_dir}/plots/adapter_content.pdf',
'{results_dir}/reports/barcode_filtering.html',
'{results_dir}/reports/RNA_filtering.html',
'{results_dir}/samples/{sample}/trimmmed_repaired_R1.fastq.gz',
'{results_dir}/samples/{sample}/trimmed_repaired_R1.fastq.gz',
'{results_dir}/samples/{sample}/top_barcodes.csv',
#mapping
'{results_dir}/plots/knee_plots/{sample}_knee_plot.pdf',
'{results_dir}/reports/star.html',
'{results_dir}/plots/yield.pdf',
'{results_dir}/samples/{sample}/Unmapped.out.mate1.gz',
#splitting
'{results_dir}/plots/barnyard/{sample}_genes.pdf',
'{results_dir}/plots/barnyard/{sample}_transcripts.pdf'],
@@ -122,8 +135,8 @@ if len(config['META']['species'].keys()) == 2:
release=release,
species=species),
expand(
['{results_dir}/samples/{sample}/{species}/umi/expression.mtx',
'{results_dir}/samples/{sample}/{species}/read/expression.mtx',
['{results_dir}/samples/{sample}/{species}/umi/matrix.mtx',
'{results_dir}/samples/{sample}/{species}/read/matrix.mtx',
'{results_dir}/plots/rna_metrics/{sample}_{species}_rna_metrics.pdf'],
results_dir=results_dir,
sample=samples.index,
@@ -146,15 +159,18 @@ elif len(config['META']['species'].keys()) == 1:
'{results_dir}/plots/knee_plots/{sample}_knee_plot.pdf',
'{results_dir}/reports/star.html',
'{results_dir}/plots/yield.pdf',
'{results_dir}/samples/{sample}/Unmapped.out.mate1.gz',
#extract
'{results_dir}/plots/rna_metrics/{sample}_rna_metrics.pdf',
'{results_dir}/summary/{type}/expression.mtx',
'{results_dir}/samples/{sample}/{type}/expression.mtx',
'{results_dir}/summary/{type}/matrix.mtx',
'{results_dir}/samples/{sample}/{type}/matrix.mtx',
#merge
'{results_dir}/plots/UMI_vs_counts.pdf',
'{results_dir}/plots/UMI_vs_gene.pdf',
'{results_dir}/plots/Count_vs_gene.pdf',
'{results_dir}/summary/R_Seurat_objects.rdata',
'{results_dir}/summary/barcode_stats_pre_filter.csv',
'{results_dir}/summary/barcode_stats_post_filter.csv',
'{results_dir}/plots/violinplots_comparison_UMI.pdf'],
read_length=read_lengths,
sample=samples.index,
@@ -164,22 +180,23 @@ elif len(config['META']['species'].keys()) == 1:
build=build,
release=release,
species=species)
rule download_meta:
input:
expand(
["{ref_path}/{species}_{build}_{release}/annotation.gtf",
"{ref_path}/{species}_{build}_{release}/genome.fa"],
ref_path=config['META']['reference-directory'],
species=species_list,
release=release,
build=build)
rule download_meta:
input:
expand(
["{ref_path}/{species}_{build}_{release}/annotation.gtf",
"{ref_path}/{species}_{build}_{release}/genome.fa"],
ref_path=config['META']['reference-directory'],
species=species_list,
release=release,
build=build)


rule qc:
input:
expand(
['{results_dir}/reports/fastqc_reads.html',
'{results_dir}/reports/fastqc_barcodes.html'],
'{results_dir}/reports/fastqc_barcodes.html',
'fastqc_adapter.tsv'],
results_dir=results_dir)

rule filter:
@@ -188,26 +205,28 @@ rule filter:
['{results_dir}/plots/adapter_content.pdf',
'{results_dir}/reports/barcode_filtering.html',
'{results_dir}/reports/RNA_filtering.html',
'{results_dir}/samples/{sample}/trimmmed_repaired_R1.fastq.gz'],
'{results_dir}/samples/{sample}/trimmed_repaired_R1.fastq.gz',
'{results_dir}/samples/{sample}/top_barcodes.csv'],
results_dir=results_dir,
sample=samples.index)

rule map:
input:
input:
expand(
['{results_dir}/plots/knee_plots/{sample}_knee_plot.pdf',
'{results_dir}/reports/star.html',
'{results_dir}/plots/yield.pdf',
'{results_dir}/samples/{sample}/final.bam'],
'{results_dir}/samples/{sample}/final.bam',
'{results_dir}/samples/{sample}/Unmapped.out.mate1.gz'],
sample=samples.index,
results_dir=results_dir)

rule extract:
input:
expand(
['{results_dir}/plots/rna_metrics/{sample}_rna_metrics.pdf',
'{results_dir}/summary/{type}/expression.mtx',
'{results_dir}/samples/{sample}/{type}/expression.mtx'],
'{results_dir}/summary/{type}/matrix.mtx',
'{results_dir}/samples/{sample}/{type}/matrix.mtx.gz'],
results_dir=results_dir,
sample=samples.index,
type=types)
@@ -227,13 +246,13 @@ rule split_species:
rule extract_species:
input:
expand(
['{results_dir}/samples/{sample}/{species}/umi_expression_matrix.txt',
'{results_dir}/samples/{sample}/{species}/counts_expression_matrix.txt',
['{results_dir}/samples/{sample}/{species}/{type}/matrix.mtx',
'{results_dir}/plots/rna_metrics/{sample}_{species}_rna_metrics.pdf'],
sample=samples.index,
species=config['META']['species'],
results_dir=results_dir)

results_dir=results_dir,
type=types)

rule merge:
input:
#merge
@@ -242,11 +261,13 @@ rule merge:
'{results_dir}/plots/UMI_vs_gene.pdf',
'{results_dir}/plots/Count_vs_gene.pdf',
'{results_dir}/summary/R_Seurat_objects.rdata',
'{results_dir}/summary/barcode_stats_pre_filter.csv',
'{results_dir}/summary/barcode_stats_post_filter.csv',
'{results_dir}/plots/violinplots_comparison_UMI.pdf',
'{results_dir}/summary/{type}/expression.mtx'],
'{results_dir}/summary/{type}/matrix.mtx'],
results_dir=results_dir,
type=types)

rule make_report:
input:
expand('{results_dir}/reports/publication_text.html', results_dir=results_dir)
@@ -265,4 +286,4 @@ include: "rules/extract_expression_single.smk"
include: "rules/split_species.smk"
include: "rules/extract_expression_species.smk"
include: "rules/merge.smk"
include: "rules/report.smk"
include: "rules/report.smk"
20 changes: 20 additions & 0 deletions docs/docs/CHANGELOG.md
@@ -4,6 +4,25 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/)
and this project adheres to [Semantic Versioning](http://semver.org/).


## [0.5]
### Added
- Singularity usage. Try out the `--use-singularity` option instead of `--use-conda`
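As a minimal sketch of how this works (the rule, its input/output paths, and the env file name below are hypothetical; only the container URI is taken from the Snakefile in this commit), a workflow-level container directive plus per-rule conda environments lets the same workflow run with either flag:

```python
# Sketch of a Snakefile supporting both deployment modes.
singularity: "shub://seb-mueller/singularity_dropSeqPipe:v04"

rule fastqc_example:                                  # hypothetical rule
    input: "data/{sample}_R2.fastq.gz"
    output: "results/fastqc/{sample}_R2_fastqc.html"
    conda: "envs/fastqc.yaml"                         # picked up by --use-conda
    shell: "fastqc {input} -o results/fastqc/"

# snakemake --use-conda --cores 4        -> per-rule conda environments
# snakemake --use-singularity --cores 4  -> rules run inside the container
```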

### Changed
- Lots of small bug fixes


## [0.4.1]
### Added
- samples.csv and config.yaml schema validation. This will help users fix missing values.
- DetectBeadSubstitutionErrors was added in the mapping steps.

### Changed
- Minimum read length after trimming is now the index of the end of the UMI
- dropSeqPipe can now run with a docker image if you use the `--use-singularity` option. This should help people with package issues and different Linux setups. You need to have singularity installed system-wide to use this option.


## [0.4] - 2018-12-19
### Added
- Top barcode detection using [umi-tools](/~https://github.com/CGATOxford/UMI-tools) based on number of expected cells.
@@ -29,6 +48,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
### Removed
- Merging of species expression across samples. Since the mixed experiments are mostly used to test out the doublet rate of a platform and not for downstream analysis, this last part has not been updated. Single expression matrices are still there.
- Cell barcodes dropped, umi barcodes dropped, starttrim and polyA trim plots are now gone. BC_drop is also removed. Replacements are adapter_content and yield plots.
- Quality trimming via dropseq_tools has been removed and is now done by cutadapt. Those modifications decrease the running time of the pipeline.
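The 0.4 entry above mentions top barcode detection with umi-tools based on the expected number of cells. As a deliberately naive illustration of the idea (umi_tools `whitelist` is more sophisticated, e.g. knee-point detection; the 12 bp barcode length is an assumption), counting cell barcodes and keeping the most frequent ones can be sketched as:

```python
import gzip
from collections import Counter

def top_barcodes(fastq_r1_path, n_cells, bc_len=12):
    """Return the n_cells most frequent cell barcodes seen at the start of read 1."""
    counts = Counter()
    with gzip.open(fastq_r1_path, "rt") as fh:
        for i, line in enumerate(fh):
            if i % 4 == 1:                     # FASTQ sequence lines
                counts[line[:bc_len]] += 1
    return [bc for bc, _ in counts.most_common(n_cells)]
```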


## [0.32]
4 changes: 4 additions & 0 deletions envs/dropseq_tools.yaml
@@ -1,4 +1,8 @@
channels:
- bioconda
- anaconda
- conda-forge
dependencies:
- dropseq_tools=2.0.0
- font-ttf-dejavu-sans-mono=2.37
- fontconfig=2.13.1
4 changes: 4 additions & 0 deletions envs/merge_long.yaml
@@ -0,0 +1,4 @@
channels:
- conda-forge
dependencies:
- pandas=0.25.1
7 changes: 6 additions & 1 deletion envs/picard.yaml
@@ -1,4 +1,9 @@
channels:
- bioconda
- anaconda
- conda-forge
dependencies:
- picard=2.14.1.0
- picard=2.14.1.0
- font-ttf-dejavu-sans-mono=2.37
- fontconfig=2.13.1

4 changes: 4 additions & 0 deletions envs/pigz.yaml
@@ -0,0 +1,4 @@
channels:
- anaconda
dependencies:
- pigz=2.4
11 changes: 0 additions & 11 deletions envs/plots.yaml

This file was deleted.

8 changes: 4 additions & 4 deletions envs/plots_ext.yaml → envs/r.yaml
@@ -1,18 +1,18 @@
channels:
- conda-forge
- bioconda
- r
dependencies:
- r=3.4.1
- r-ggplot2=2.2.1
- r-gridextra
- r-reshape2
- r-viridis
- r-stringdist
- r-dplyr=0.7.6
- r-mvtnorm
- r-seurat
- r-seurat=2
- r-hmisc
- r-tidyverse
- r-devtools
- r-rcolorbrewer
- r-rcolorbrewer
- font-ttf-dejavu-sans-mono=2.37
- fontconfig=2.13.1
3 changes: 1 addition & 2 deletions rules/cell_barcodes.smk
@@ -6,7 +6,6 @@ ruleorder: extend_barcode_whitelist > get_cell_whitelist

localrules:
get_cell_whitelist,
extend_barcode_whitelist,
extend_barcode_top

rule extend_barcode_whitelist:
@@ -22,7 +21,7 @@ rule extend_barcode_whitelist:

rule get_top_barcodes:
input:
'{results_dir}/samples/{sample}/trimmmed_repaired_R1.fastq.gz'
'{results_dir}/samples/{sample}/trimmed_repaired_R1.fastq.gz'
output:
'{results_dir}/samples/{sample}/top_barcodes.csv'
conda: '../envs/umi_tools.yaml'