fix: read quantification (#22)

* fix: start salmon in --ont mode * fix: samtools creates BAM files without sorting for salmon * feat: create transcriptome from genome.fa and annotation.gff * fix: resources and linting * fix: removed obsolete comment
snakemake-workflows · Jun 19, 2024 · 71422a1 · 71422a1
1 parent 3237e2c
commit 71422a1
Show file tree

Hide file tree

Showing 4 changed files with 47 additions and 25 deletions.
diff --git a/config/config.yml b/config/config.yml
@@ -13,8 +13,8 @@ repo: "/~https://github.com/snakemake-workflows/transriptome-differential-expressi
 
 ## Workflow-specific Parameters:
 
-# Transcriptome fasta (absolute path)
-transcriptome: "/lustre/miifs01/project/m2_zdvhpc/transcriptome_data/GCA_917627325.4_PGI_CHIRRI_v4_genomic.fa"
+# Genome fasta (absolute path)
+genome: "/lustre/miifs01/project/m2_zdvhpc/transcriptome_data/GCA_917627325.4_PGI_CHIRRI_v4_genomic.fa"
 # Annotation GFF/GTF (absolute path)
 annotation: "/lustre/miifs01/project/m2_zdvhpc/transcriptome_data/GCA_917627325.4_PGI_CHIRRI_v4_genomic.gff"
 # these samples ought to contain all samples comprising of the
@@ -23,7 +23,7 @@ annotation: "/lustre/miifs01/project/m2_zdvhpc/transcriptome_data/GCA_917627325.
 max_cpus: 40
 
 # Optional: NanoPlot QC using a summary file from the sequencer. If this file is not supplied, put the parameter to "None". It will then do an independent QC run per FASTQ input.
-summary: None #"/lustre/project/m2_zdvhpc/transcriptome_data/seqencing_summary/sequencing_summary.txt"
+summary: None  # "/lustre/project/m2_zdvhpc/transcriptome_data/seqencing_summary/sequencing_summary.txt"
 
 # Minimap2 indexing options
 minimap_index_opts: ""
@@ -56,27 +56,26 @@ min_feature_expr: 3
 # The (log2) log fold change under the null hypothesis. (default: 0).
 lfc_null: 0.1
 #
-# The alternative hypothesis for computing wald p-values. By default, 
-# the normal Wald test assesses deviation of the estimated log fold 
-# change from the null hypothesis, as given by lfc_null. 
-# One of ["greaterAbs", "lessAbs", "greater", "less"] or None. 
-# The alternative hypothesis corresponds to what the user wants to 
+# The alternative hypothesis for computing wald p-values. By default,
+# the normal Wald test assesses deviation of the estimated log fold
+# change from the null hypothesis, as given by lfc_null.
+# One of ["greaterAbs", "lessAbs", "greater", "less"] or None.
+# The alternative hypothesis corresponds to what the user wants to
 # find rather than the null hypothesis. (default: None).
 alt_hypothesis: "greaterAbs"
 #
-# The marker size in points**2 (typographic points are 1/72 in.). 
-# Default is rcParams['lines.markersize'] ** 2.# minimum count to 
+# The marker size in points**2 (typographic points are 1/72 in.).
+# Default is rcParams['lines.markersize'] ** 2.# minimum count to
 # be considered for subsequent analysis
 point_width: 20
 #
 #
 mincount: 10
 #
-# in addition to the full heatmap, plot the top number of different 
+# in addition to the full heatmap, plot the top number of different
 # values, ranked by the top ratio between the two traits
 threshold_plot: 10
 #
 # the heatmap color map
 # see https://seaborn.pydata.org/tutorial/color_palettes.htm for an overview
-colormap: "flare" 
-
+colormap: "flare"
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -9,6 +9,7 @@ min_version("8.10.7")
 
 
 localrules:
+    genome_to_transcriptome,
     merge_counts,
     write_coldata,
     write_de_params,
@@ -43,9 +44,25 @@ rule all:
         samstats=expand("QC/samstats/{sample}.txt", sample=samples["sample"]),
 
 
+rule genome_to_transcriptome:
+    input:
+        genome=config["genome"],
+        annotation=config["annotation"],
+    output:
+        transcriptome="transcriptome/transcriptome.fa",
+    log:
+        "logs/gffread.log",
+    conda:
+        "envs/env.yml"
+    shell:
+        """
+    gffread -t {resources.cpus_per_task} -w {output} -g {input.genome} {input.annotation} &> {log}    
+    """
+
+
 rule build_minimap_index:  ## build minimap2 index
     input:
-        genome=config["transcriptome"],
+        transcriptome=rules.genome_to_transcriptome.output,
     output:
         index="index/transcriptome_index.mmi",
     params:
@@ -56,7 +73,7 @@ rule build_minimap_index:  ## build minimap2 index
         "envs/env.yml"
     shell:
         """
-    minimap2 -t {resources.cpus_per_task} {params.opts} -d {output.index} {input.genome} 2> {log}
+    minimap2 -t {resources.cpus_per_task} {params.opts} -ax map-ont -d {output.index} {input.transcriptome} 2> {log}
     """
 
 
@@ -68,7 +85,7 @@ rule map_reads:
             samples["sample"][wildcards.sample]
         ),
     output:
-        "alignments/{sample}.bam",
+        "alignments/{sample}.sam",
     log:
         "logs/minimap2/mapping_{sample}.log",
     params:
@@ -93,7 +110,7 @@ rule sam_sort:
     conda:
         "envs/env.yml"
     shell:
-        "samtools sort -@ {resources.cpus_per_task} {input.sam} -o {output} -O bam &> {log}"
+        "samtools view -@ {resources.cpus_per_task} -bS {input.sam} > {output} 2> {log}"
 
 
 rule sam_index:
@@ -114,7 +131,7 @@ rule sam_index:
 
 rule sam_stats:
     input:
-        bam=rules.map_reads.output,
+        bam=rules.sam_sort.output,
     output:
         "QC/samstats/{sample}.txt",
     log:
@@ -129,8 +146,8 @@ rule sam_stats:
 
 rule count_reads:
     input:
-        bam=rules.sam_index.output.ibam,
-        trs=config["transcriptome"],
+        bam=rules.sam_sort.output,
+        trs=rules.genome_to_transcriptome.output,
     output:
         tsv="counts/{sample}_salmon/quant.sf",
     params:
@@ -142,8 +159,8 @@ rule count_reads:
         "envs/env.yml"
     shell:
         """
-        salmon --no-version-check quant  -p {resources.cpus_per_task} \
-        -t {input.trs} -l {params.libtype} -a {input.bam} -o {params.tsv_dir} &> {log}
+        salmon --no-version-check quant --ont -p {resources.cpus_per_task} \
+        -t {input.trs} -l {params.libtype} -a {input.bam} -o {params.tsv_dir} 2> {log}
     """
 
 

diff --git a/workflow/envs/env.yml b/workflow/envs/env.yml
@@ -27,4 +27,5 @@ dependencies:
     - pydeseq2=0.4.4
     - anndata>=0.8.0
     - ensureconda
+    - gffread>=0.12.7
 
diff --git a/workflow/profile/config.yaml b/workflow/profile/config.yaml
@@ -3,6 +3,11 @@ default-resources:
     slurm_partition: "smp"
 
 set-resources:
+    genome_to_transcriptome:
+        cpus_per_task: 1
+        mem_mb_per_cpu: 1800
+        runtime: "2h"
+
     build_minimap_index:
         cpus_per_task: 4
         mem_mb_per_cpu: 3600
@@ -15,17 +20,17 @@ set-resources:
         slurm_partition: "parallel"
 
     plot_samples:
-        cpus_per_task: 1
+        cpus_per_task: 4
         mem_mb_per_cpu: 1800
-        runtime: "5h"
+        runtime: "3h"
 
     plot_all_samples:
         cpus_per_task: 8
         mem_mb_per_cpu: 1800
         runtime: "2h"
 
     sam_sort:
-        cpus_per_task: 8
+        cpus_per_task: 1
         mem_mb_per_cpu: 1800
         runtime: "1h"