feat: batch correction (#87)

* feat: providing ngs-test-data * fix: deleted own old test data * feat: allowing for batch correction * feat: renaming 'batch_effect' to 'batch' * fix: linter did not account for latest changes * feat: actual consideration of the batch effect in the DE script * fix: attempting with a mini default profile * fix: test with workflow-profile flag * fix: added missing unzip package to curl.env * fix: typo * fix: added missing design_factors * fix: removed old test data * fix: only considering one confounding variable during CI tests * fix: renamed and added ncbi-datasets-cli as a package to env/reference.yml * feat: replaced curl downloads with ncbi-datasets-cli download by accession number to avoid unstable URLs
snakemake-workflows · Sep 16, 2024 · f174574 · f174574
1 parent cd25504
commit f174574
Show file tree

Hide file tree

Showing 15 changed files with 56 additions and 16,018 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -45,7 +45,7 @@ jobs:
       with:
         directory: .test
         snakefile: workflow/Snakefile
-        args: "--configfile .test/config-simple/config.yml --use-conda --show-failed-logs --cores 3 --conda-cleanup-pkgs cache --all-temp"
+        args: "--configfile .test/config-simple/config.yml --use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache --all-temp --workflow-profile .test/profile/"
 
 #    - name: Test report
 #      uses: snakemake/snakemake-github-action@v1.24.0

diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule ".test/ngs-test-data"]
+	path = .test/ngs-test-data
+	url = https://github.com/snakemake-workflows/ngs-test-data.git
diff --git a/.test/01.fq b/.test/01.fq
diff --git a/.test/02.fq b/.test/02.fq
diff --git a/.test/config-simple/config.yml b/.test/config-simple/config.yml
@@ -67,6 +67,11 @@ min_feature_expr: 3
 
 # This section defines the deseq2 plot and data handling parameters
 #
+# the "design factors" are the confounding variables to be adjusted for
+# during the normalization. They must be given in the configuration (samples.csv)
+design_factors:
+    - "condition"
+#
 # The (log2) log fold change under the null hypothesis. (default: 0).
 lfc_null: 0.1
 #

diff --git a/.test/config-simple/samples.csv b/.test/config-simple/samples.csv
@@ -1,3 +1,3 @@
-sample  condition   condition2	batch_effect    platform    purity
+sample  condition   condition2	batch    platform    purity
 01  	male        condition2  batch1          NANOPORE    1
 02      female      condition2  batch1          NANOPORE    1
diff --git a/.test/ngs-test-data b/.test/ngs-test-data
diff --git a/.test/profile/config.yaml b/.test/profile/config.yaml
@@ -0,0 +1,2 @@
+default-resources:
+  cpus_per_task: 2
diff --git a/config/Mainz-MogonNHR/config.yml b/config/Mainz-MogonNHR/config.yml
@@ -66,7 +66,13 @@ min_gene_expr: 10
 # Minimum transcript counts
 min_feature_expr: 3
 
-# This section defines the deseq2 plot and data handling parameters
+# This section defines the pyDESeq2 plot and data handling parameters
+#
+# the "design factors" are the confounding variables to be adjusted for
+# during normalization. They must be given in the configuration (samples.csv).
+design_factors: 
+  - "batch"
+  - "condition"
 #
 # The (log2) log fold change under the null hypothesis. (default: 0).
 lfc_null: 0.1

diff --git a/config/Mainz-MogonNHR/samples.csv b/config/Mainz-MogonNHR/samples.csv
@@ -1,4 +1,4 @@
-sample  condition   condition2	batch_effect    platform    purity
+sample  condition   condition2	batch    platform    purity
 m18_bc01    male    condition2	batch1          NANOPORE    1
 m18_bc02    male    condition2	batch1          NANOPORE    1
 m18_bc03    female  condition2	batch1          NANOPORE    1

diff --git a/workflow/envs/curl.yml → workflow/envs/reference.yml b/workflow/envs/curl.yml → workflow/envs/reference.yml
@@ -1,4 +1,5 @@
 channels:
     - conda-forge
 dependencies:
-    - curl>=8.8.0
+    - ncbi-datasets-cli
+    - unzip
diff --git a/workflow/rules/commons.smk b/workflow/rules/commons.smk
@@ -20,7 +20,7 @@ samples = (
             config["samples"],
         ),
         sep=r"\s+",
-        dtype={"sample": str, "condition": str, "condition2": str, "batch_effect": str},
+        dtype={"sample": str, "condition": str, "condition2": str, "batch": str},
         header=0,
         comment="#",
     )

diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
@@ -5,33 +5,53 @@ localrules:
 
 rule get_genome:
     output:
-        genome="references/genomic.fa",
+        # temporary NCBI data package (genome FASTA + GFF3); file name is passed via --filename
+        temp("ncbi_dataset.zip"),
     params:
         accession=config["accession"],
     log:
         "logs/refs/get_genome.log",
     conda:
-        "../envs/curl.yml"
+        "../envs/reference.yml"
     shell:
         """
-        curl -s -o data_genome.zip https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/{params.accession}/download?include_annotation_type=GENOME_FASTA &> {log};
-        unzip -p data_genome.zip ncbi_dataset/data/{params.accession}/*.fna > references/genomic.fa 2> {log};
-        rm data_genome.zip &> {log}
+        datasets download genome accession {params.accession} --include gff3,genome --filename {output} &> {log}
         """
 
 
-rule get_annotation:
+rule extract_genome:  # unpack the genome FASTA (*.fna) from the downloaded archive
+    input:
+        rules.get_genome.output,  # zip archive declared as output of get_genome
+    output:
+        "references/genomic.fna",
+    group:
+        "reference"  # same group name as extract_annotation
+    params:
+        accession=config["accession"],
+    log:
+        "logs/refs/extract_genome.log",
+    conda:
+        "../envs/reference.yml"
+    shell:
+        """
+        unzip -p {input} ncbi_dataset/data/{params.accession}/*.fna > {output} 2> {log}
+        """
+
+
+rule extract_annotation:
+    input:
+        rules.get_genome.output,  # the GFF3 is part of the archive downloaded by get_genome
     output:
         "references/genomic.gff",
+    group:
+        "reference"
     params:
         accession=config["accession"],
     log:
         "logs/refs/get_annotation.log",
     conda:
-        "../envs/curl.yml"
+        "../envs/reference.yml"
     shell:
         """
-        curl -s -o data_annotation.zip https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/{params.accession}/download?include_annotation_type=GENOME_GFF &> {log};
-        unzip -p data_annotation.zip ncbi_dataset/data/{params.accession}/*.gff > references/genomic.gff 2> {log};
-        rm data_annotation.zip &> {log}
+        unzip -p {input} ncbi_dataset/data/{params.accession}/*.gff > {output} 2> {log}
         """
diff --git a/workflow/schemas/samples.schema.yaml b/workflow/schemas/samples.schema.yaml
@@ -41,5 +41,5 @@ properties:
 required:
   - sample
   - condition
-  - batch_effect
+  - batch
 
diff --git a/workflow/scripts/de_analysis.py b/workflow/scripts/de_analysis.py
@@ -41,7 +41,7 @@
 dds = DeseqDataSet(
     counts=counts_df,
     metadata=metadata,
-    design_factors=["condition"],
+    design_factors=snakemake.config["design_factors"],
     refit_cooks=True,
     n_cpus=ncpus,
 )