feat: correcting p-values for multiple testing (#66)

* fix: deleted old template script - not in use * feat: new logfile ouput to de_analysis rule, gathering stats logs for the report * feat: p value correction * feat: updated sample configs to include the Type I error cutoff value * featblanking the upper triangle of the correlation matrix
snakemake-workflows · Aug 21, 2024 · a83d514 · a83d514
1 parent a5d3aba
commit a83d514
Show file tree

Hide file tree

Showing 5 changed files with 44 additions and 20 deletions.
diff --git a/config/Mainz-MogonII/config.yml b/config/Mainz-MogonII/config.yml
@@ -82,9 +82,14 @@ alt_hypothesis: "greaterAbs"
 # be considered for subsequent analysis
 point_width: 20
 #
-#
+# we disregard loci with a count number lower 'mincount'
 mincount: 10
 #
+# Type I error cutoff value:
+alpha: 0.05
+# an adjustment for multiple testing will be performed using the Holm-Sidak
+# method.
+#
 # in addition to the full heatmap, plot the top number of different
 # values, ranked by the top ratio between the two traits
 threshold_plot: 10

diff --git a/config/Mainz-MogonNHR/config.yml b/config/Mainz-MogonNHR/config.yml
@@ -82,9 +82,14 @@ alt_hypothesis: "greaterAbs"
 # be considered for subsequent analysis
 point_width: 20
 #
-#
+# we disrecard loci with count number lower 'mincount'
 mincount: 10
 #
+# Type I error cutoff value:
+alpha: 0.05
+# an adjustment for multiple testing will be performed using the Holm-Sidak
+# method.
+#
 # in addition to the full heatmap, plot the top number of different
 # values, ranked by the top ratio between the two traits
 threshold_plot: 10

diff --git a/workflow/rules/diffexp.smk b/workflow/rules/diffexp.smk
@@ -12,6 +12,7 @@ rule de_analysis:
         correlation_matrix="de_analysis/correlation_matrix.svg",
         normalized_counts="de_analysis/normalized_counts.csv",
         de_top_heatmap="de_analysis/heatmap_top.svg",
+        statistics="de_analysis/statistics.log",
         lfc_analysis="de_analysis/lfc_analysis.csv",
     params:
         samples=samples,

diff --git a/workflow/scripts/_template_script.py b/workflow/scripts/_template_script.py
diff --git a/workflow/scripts/de_analysis.py b/workflow/scripts/de_analysis.py
@@ -7,9 +7,11 @@
 matplotlib.use("Agg")  # suppress creating of interactive plots
 import matplotlib.pyplot as plt
 from matplotlib.colors import LogNorm
+import numpy as np
 import pandas as pd
 import seaborn as sns
 import scipy.spatial as sp, scipy.cluster.hierarchy as hc
+import statsmodels.api as sm
 
 
 from snakemake.exceptions import WorkflowError
@@ -101,15 +103,30 @@
 
 # shorthand for log2fold and pvalue columns
 log2foldchange = stat_res.results_df["log2FoldChange"]
+# 'pvalue' is a pandas series, linear, of length(number of aligned loci)
 pvalue = stat_res.results_df["pvalue"]
 
+# p-value adjustment for independent p-values
+# the return value is a tuple:
+# (array of bools, np.floats), both of length(number of aligned loci)
+cutoff = sm.stats.multipletests(
+    pvalue, alpha=0.05, method="hs", maxiter=1, is_sorted=False, returnsorted=False
+)
+
+with open(snakemake.output.statistics, "w") as statslog:
+    statslog.write(
+        f"Found {cutoff[0].sum()} values about the alpha threshold." + os.linesep
+    )
+    statslog.write(f"The total number of tests was {cutoff[1].shape[0]}." + os.linesep)
+
 normalized = normalized.join(log2foldchange)
+# normalized = normalized.join(np.array(pvalue[1]))
 normalized = normalized.join(pvalue)
 
-
 normalized.sort_values(by="log2FoldChange")
 # delete rows, which do not meet our p-value criterion
-normalized.drop(normalized[normalized.pvalue > 0.05].index, inplace=True)
+# the comparison operator is >= because we drop all values >= our desired alpha
+normalized.drop(normalized[cutoff[1] >= snakemake.config["alpha"]].index, inplace=True)
 # through away these columns
 normalized.drop("log2FoldChange", axis=1, inplace=True)
 normalized.drop("pvalue", axis=1, inplace=True)
@@ -123,14 +140,22 @@
 col_dism = 1 - normalized.corr()
 col_linkage = hc.linkage(sp.distance.squareform(col_dism), method="complete")
 
-# TODO: only half of the matrix should be plotted
+# we calculate the calculation matrix once, with filling NaNs as 0
+correlation_matrix = normalized.corr().fillna(0)
+# next, a triangle mask is needed, to avoid redundant plotting of the matrix
+# in a square
+mask = np.triu(np.ones_like(correlation_matrix))
+
 # TODO: add contidion labels (e.g. male/female to the map)
-sns.clustermap(
-    normalized.corr().fillna(0),
+cluster = sns.clustermap(
+    correlation_matrix,
     cmap=snakemake.config["colormap"],
     linewidths=0,
-    # , norm=LogNorm()
 )  # , xticklables = metadata.index.to_list())#, yticklabels = sta)
+values = cluster.ax_heatmap.collections[0].get_array().reshape(correlation_matrix.shape)
+new_values = np.ma.array(values, mask=mask)
+cluster.ax_heatmap.collections[0].set_array(new_values)
+cluster.ax_col_dendrogram.set_visible(False)
 plt.savefig(snakemake.output.correlation_matrix)
 
 # TODO: add contidion labels (e.g. male/female to the map)