diff --git a/workflow/envs/pydeseq2.yml b/workflow/envs/pydeseq2.yml index c5d8ce9..218839d 100644 --- a/workflow/envs/pydeseq2.yml +++ b/workflow/envs/pydeseq2.yml @@ -5,4 +5,3 @@ dependencies: - anndata=0.10.8 - pydeseq2=0.4.10 - seaborn>=0.13.2 - - statsmodels>=0.14.2 \ No newline at end of file diff --git a/workflow/rules/diffexp.smk b/workflow/rules/diffexp.smk index b45731f..1ec577c 100644 --- a/workflow/rules/diffexp.smk +++ b/workflow/rules/diffexp.smk @@ -8,7 +8,6 @@ rule de_analysis: correlation_matrix="de_analysis/correlation_matrix.svg", normalized_counts="de_analysis/normalized_counts.csv", de_top_heatmap="de_analysis/heatmap_top.svg", - statistics="de_analysis/statistics.log", lfc_analysis="de_analysis/lfc_analysis.csv", params: samples=samples, diff --git a/workflow/scripts/de_analysis.py b/workflow/scripts/de_analysis.py index 9b8944a..85c2bae 100644 --- a/workflow/scripts/de_analysis.py +++ b/workflow/scripts/de_analysis.py @@ -11,7 +11,6 @@ import pandas as pd import seaborn as sns import scipy.spatial as sp, scipy.cluster.hierarchy as hc -import statsmodels.api as sm from snakemake.exceptions import WorkflowError @@ -105,31 +104,19 @@ log2foldchange = stat_res.results_df["log2FoldChange"] # 'pvalue' is a pandas series, linear, of length(number of aligned loci) pvalue = stat_res.results_df["pvalue"] - -# p-value adjustment for independent p-values -# the return value is a tuple: -# (array of bools, np.floats), both of length(number of aligned loci) -cutoff = sm.stats.multipletests( - pvalue, alpha=0.05, method="hs", maxiter=1, is_sorted=False, returnsorted=False -) - -with open(snakemake.output.statistics, "w") as statslog: - statslog.write( - f"Found {cutoff[0].sum()} values about the alpha threshold." + os.linesep - ) - statslog.write(f"The total number of tests was {cutoff[1].shape[0]}." + os.linesep) +padj = stat_res.results_df["padj"] normalized = normalized.join(log2foldchange) # normalized = normalized.join(np.array(pvalue[1])) -normalized = normalized.join(pvalue) +normalized = normalized.join(padj) normalized.sort_values(by="log2FoldChange") # delete rows, which do not meet our p-value criterion # the comparison operator is >= because we drop all values >= our desired alpha -normalized.drop(normalized[cutoff[1] >= snakemake.config["alpha"]].index, inplace=True) +normalized.drop(normalized[padj >= snakemake.config["alpha"]].index, inplace=True) # through away these columns normalized.drop("log2FoldChange", axis=1, inplace=True) -normalized.drop("pvalue", axis=1, inplace=True) +normalized.drop("padj", axis=1, inplace=True) normalized.to_csv(snakemake.output.normalized_counts) normalized.dropna(inplace=True)