feat(customization): mostly scriptizing for snakemake

snakemake-workflows · Mar 6, 2024 · 354a91c · 354a91c
1 parent f6b27a1
commit 354a91c
Showing 1 changed file with 44 additions and 31 deletions.
diff --git a/workflow/scripts/de_analysis.py b/workflow/scripts/de_analysis.py
@@ -42,42 +42,57 @@
     n_cpus=ncpus,
 )
 # fit dispersion(s) and LFCs
+#  this - fits the size factors
+#       - the genewise dispersion
+#       - the prior dispersion variance
+#       - MAP dispersion
+#       - log2fold change
+#       - cooks distances
 dds.deseq2()
 
 # compute normalization factors
-dds.fit_size_factors()
+#dds.fit_size_factors()
 # fitting genewise dispersions
-dds.fit_genewise_dispersions()
+#dds.fit_genewise_dispersions()
 
 dds.plot_dispersions(save_path=f"{snakemake.output.dispersion_graph}")
 
 # compute p-values for our dispersion
 stat_res = DeseqStats(dds, n_cpus=ncpus)
-
-# performing LFC shrinkage
-stat_res.lfc_shrink(coeff=f"condition_{b_condition}_vs_{a_condition}")
+# a_condition:
+a_condition = snakemake.config["condition_a_identifier"]
+# b_condition
+b_condition = snakemake.config["condition_b_identifier"]
 
 # run Wald test and plot, perform optional threshold tests, if wanted
-if snakemake.config["lfc_null"] or snakemake.config["alt_hypothesis"]:
-    summary = stat_res(lfc_null = snakemake.config.get("lfc_null", default=0),
-                       alt_hypothesis = snakemake.config.get("alt_hypothesis", default=None))
+if "lfc_null" in snakemake.config or "alt_hypothesis" in snakemake.config:
+    summary = stat_res.summary(lfc_null = snakemake.config.get("lfc_null", 0),
+                       alt_hypothesis = snakemake.config.get("alt_hypothesis", None))
 else:
     summary = stat_res.summary()
+# performing LFC shrinkage
+stat_res.lfc_shrink(coeff=f"condition_{b_condition}_vs_{a_condition}")
 
 stat_res.results_df.to_csv(snakemake.output.lfc_analysis)
 
-stat_res.plot_MA(s=snakemake.config["point_width"], save_path=f"{snakemake.output.ma_graph}")
+stat_res.plot_MA(s=snakemake.config["point_width"], 
+                 save_path=f"{snakemake.output.ma_graph}")
 
 # create a clustermap, based on normalized counts
 #dds_df = dds.to_df()
 #ds_df.to_csv('dds_df.csv'
 # getting and applying the scaling factors
 sf = dds.obsm["size_factors"]
+
 normalized = counts_df.values.T * sf
 
 # transpose back
 normalized = normalized.T
 
+# append log2fold and pvalue columns
+normalized.join(stats_res.results_df["log2FoldChange"])
+normalized.join(stats_res.results_df["pvalue"])
+
 # get indices where the matrix is 0 - may happen
 #zero_indices = np.argwhere(normalized == 0)
 # TODO: try implementing imputation
@@ -90,10 +105,6 @@
 
 normalized   = pd.DataFrame(normalized, index = row_names,
                             columns = column_names)
-# a_condition:
-a_condition = snakemake.config["condition_a_identifier"]
-# b_condition
-b_condition = snakemake.config["condition_b_identifier"]
 
 # Order columns according to traits - generally column
 # order can be arbitrary, but for the headmap, we want. 
@@ -111,35 +122,37 @@
 # final orientation and order
 normalized = normalized.T[samples]
 
+
 # get the means of our conditions
-a_mean = normalized[a_samples].mean(axis=1)
-b_mean = normalized[b_samples].mean(axis=1)
+#a_mean = normalized[a_samples].mean(axis=1)
+#b_mean = normalized[b_samples].mean(axis=1)
 
-a_over_b = a_mean/b_mean
-b_over_a = b_mean/a_mean
+#a_over_b = a_mean/b_mean
+#b_over_a = b_mean/a_mean
 # which is bigger?
-ratio = list(None for _ in a_over_b)
-for index, state in enumerate(a_over_b.ge(b_over_a)):
+#ratio = list(None for _ in a_over_b)
+#for index, state in enumerate(a_over_b.ge(b_over_a)):
     # enter ratio, but check for non-inf-ness, first:
-    print(a_over_b[index], b_over_a[index])
-    if state:
+#    print(a_over_b[index], b_over_a[index])
+#    if state:
 #        if np.isinf(a_over_b[index]):
 #            normalized.drop(index, inplace=True)
-        ratio[index] = a_over_b[index]
-    else:
+#        ratio[index] = a_over_b[index]
+#    else:
 #        if np.isinf(b_over_a[index]):
 #            normalized.drop(index, inplace=True)
-        ratio[index] = b_over_a[index]
+ #       ratio[index] = b_over_a[index]
 
-normalized["ratio"] = ratio
-normalized = normalized[~np.all(normalized == np.inf, axis=1)]
-print(normalized)
-# now sort according to the ratio
-normalized.sort_values("ratio", )
+#normalized["ratio"] = ratio
+#normalized = normalized[~np.all(normalized == np.inf, axis=1)]
+#print(normalized)
+# now sort according to the log2foldchange
+normalized.sort_values(by="log2FoldChange")
+# delete rows, which do not meet our p-value criterion
+normalized.drop(normalized["pvalue"] > 0.05, inplace=True)
 # through away this column
-normalized.drop(["ratio"], axis=1, inplace=True)
+#normalized.drop(["ratio"], axis=1, inplace=True)
 
-print(normalized)
 #normalized.loc[normalized.index.difference(normalized.dropna(how='all').index)]
 #print(normalized)