Skip to content

Commit

Permalink
feat: correcting p-values for multiple testing (#66)
Browse files Browse the repository at this point in the history
* fix: deleted old template script - not in use

* feat: new logfile ouput to de_analysis rule, gathering stats logs for the report

* feat: p value correction

* feat: updated sample configs to include the Type I error cutoff value

* featblanking the upper triangle of the correlation matrix
  • Loading branch information
cmeesters authored Aug 21, 2024
1 parent a5d3aba commit a83d514
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 20 deletions.
7 changes: 6 additions & 1 deletion config/Mainz-MogonII/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,14 @@ alt_hypothesis: "greaterAbs"
# be considered for subsequent analysis
point_width: 20
#
#
# we disregard loci with a count number lower 'mincount'
mincount: 10
#
# Type I error cutoff value:
alpha: 0.05
# an adjustment for multiple testing will be performed using the Holm-Sidak
# method.
#
# in addition to the full heatmap, plot the top number of different
# values, ranked by the top ratio between the two traits
threshold_plot: 10
Expand Down
7 changes: 6 additions & 1 deletion config/Mainz-MogonNHR/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,14 @@ alt_hypothesis: "greaterAbs"
# be considered for subsequent analysis
point_width: 20
#
#
# we disrecard loci with count number lower 'mincount'
mincount: 10
#
# Type I error cutoff value:
alpha: 0.05
# an adjustment for multiple testing will be performed using the Holm-Sidak
# method.
#
# in addition to the full heatmap, plot the top number of different
# values, ranked by the top ratio between the two traits
threshold_plot: 10
Expand Down
1 change: 1 addition & 0 deletions workflow/rules/diffexp.smk
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ rule de_analysis:
correlation_matrix="de_analysis/correlation_matrix.svg",
normalized_counts="de_analysis/normalized_counts.csv",
de_top_heatmap="de_analysis/heatmap_top.svg",
statistics="de_analysis/statistics.log",
lfc_analysis="de_analysis/lfc_analysis.csv",
params:
samples=samples,
Expand Down
12 changes: 0 additions & 12 deletions workflow/scripts/_template_script.py

This file was deleted.

37 changes: 31 additions & 6 deletions workflow/scripts/de_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
matplotlib.use("Agg") # suppress creating of interactive plots
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.spatial as sp, scipy.cluster.hierarchy as hc
import statsmodels.api as sm


from snakemake.exceptions import WorkflowError
Expand Down Expand Up @@ -101,15 +103,30 @@

# shorthand for log2fold and pvalue columns
log2foldchange = stat_res.results_df["log2FoldChange"]
# 'pvalue' is a pandas series, linear, of length(number of aligned loci)
pvalue = stat_res.results_df["pvalue"]

# p-value adjustment for independent p-values
# the return value is a tuple:
# (array of bools, np.floats), both of length(number of aligned loci)
cutoff = sm.stats.multipletests(
pvalue, alpha=0.05, method="hs", maxiter=1, is_sorted=False, returnsorted=False
)

with open(snakemake.output.statistics, "w") as statslog:
statslog.write(
f"Found {cutoff[0].sum()} values about the alpha threshold." + os.linesep
)
statslog.write(f"The total number of tests was {cutoff[1].shape[0]}." + os.linesep)

normalized = normalized.join(log2foldchange)
# normalized = normalized.join(np.array(pvalue[1]))
normalized = normalized.join(pvalue)


normalized.sort_values(by="log2FoldChange")
# delete rows, which do not meet our p-value criterion
normalized.drop(normalized[normalized.pvalue > 0.05].index, inplace=True)
# the comparison operator is >= because we drop all values >= our desired alpha
normalized.drop(normalized[cutoff[1] >= snakemake.config["alpha"]].index, inplace=True)
# through away these columns
normalized.drop("log2FoldChange", axis=1, inplace=True)
normalized.drop("pvalue", axis=1, inplace=True)
Expand All @@ -123,14 +140,22 @@
col_dism = 1 - normalized.corr()
col_linkage = hc.linkage(sp.distance.squareform(col_dism), method="complete")

# TODO: only half of the matrix should be plotted
# we calculate the calculation matrix once, with filling NaNs as 0
correlation_matrix = normalized.corr().fillna(0)
# next, a triangle mask is needed, to avoid redundant plotting of the matrix
# in a square
mask = np.triu(np.ones_like(correlation_matrix))

# TODO: add contidion labels (e.g. male/female to the map)
sns.clustermap(
normalized.corr().fillna(0),
cluster = sns.clustermap(
correlation_matrix,
cmap=snakemake.config["colormap"],
linewidths=0,
# , norm=LogNorm()
) # , xticklables = metadata.index.to_list())#, yticklabels = sta)
values = cluster.ax_heatmap.collections[0].get_array().reshape(correlation_matrix.shape)
new_values = np.ma.array(values, mask=mask)
cluster.ax_heatmap.collections[0].set_array(new_values)
cluster.ax_col_dendrogram.set_visible(False)
plt.savefig(snakemake.output.correlation_matrix)

# TODO: add contidion labels (e.g. male/female to the map)
Expand Down

0 comments on commit a83d514

Please sign in to comment.