Merge pull request #6 from BrentLab/g3-release-cleanup
G3 release
yiming-kang authored Jun 18, 2022
2 parents eb9086e + 6386529 commit c7f60bc
Showing 7 changed files with 49 additions and 88 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -11,3 +11,5 @@ NOTEBOOKS/helper_data/*

.*DS_Store
.ipynb_checkpoints/
.idea/
.vscode/
13 changes: 3 additions & 10 deletions CODE/explain_human_resps.py
@@ -44,17 +44,11 @@ def parse_args(argv):
parser.add_argument(
'-N', '--number_of_permutations', type=int, nargs='?', const=5, choices=range(1,100),
help="Number of permutation runs (default 5).")
parser.add_argument(
'--enable_permutation_shap', action='store_true',
help="Enable SHAP for permutation runs. Permutations will default to not calculating SHAP values\n(using this flag will make permutations training significantly longer.")
parser.add_argument(
'--disable_shap', action='store_true',)
parsed = parser.parse_args(argv[1:])
return parsed



def run_tfpr(tf_feat_mtx_dict, nontf_feat_mtx, features, label_df_dict, output_dir, model_hyparams, model_tuning, permutations, number_of_permutations, disable_shap):
def run_tfpr(tf_feat_mtx_dict, nontf_feat_mtx, features, label_df_dict, output_dir, model_hyparams, model_tuning, permutations, number_of_permutations):

last_run_num = 0

@@ -85,8 +79,7 @@ def run_tfpr(tf_feat_mtx_dict, nontf_feat_mtx, features, label_df_dict, output_d
logger.info('==> Cross validating response prediction model <==')
tfpr_explainer.cross_validate(permute=permutations)

# TODO: to be removed in release
if not disable_shap:
if not permutations:
logger.info('==> Analyzing feature contributions <==')
tfpr_explainer.explain()

@@ -133,7 +126,7 @@ def main(argv):
tf_feat_mtx_dict[feat_info_dict['tfs'][0]].shape,
nontf_feat_mtx.shape))

run_tfpr(tf_feat_mtx_dict, nontf_feat_mtx, features, label_df_dict, filepath_dict['output_dir'], model_hyparams, args.model_tuning, args.permutations, args.number_of_permutations, args.disable_shap)
run_tfpr(tf_feat_mtx_dict, nontf_feat_mtx, features, label_df_dict, filepath_dict['output_dir'], model_hyparams, args.model_tuning, args.permutations, args.number_of_permutations)


if __name__ == "__main__":
20 changes: 3 additions & 17 deletions CODE/explain_yeast_resps.py
@@ -37,8 +37,6 @@ def parse_args(argv):
parser.add_argument(
'--model_config', default='MODEL_CONFIG/yeast_default_config.json',
help='Json file for pretrained model hyperparameters.')
parser.add_argument(
'--disable_shap', action='store_true',)
parsed = parser.parse_args(argv[1:])
return parsed

@@ -74,16 +72,6 @@ def main(argv):
tf_feat_mtx_dict[feat_info_dict['tfs'][0]].shape,
nontf_feat_mtx.shape))

# # TODO: delete data pickling
# import pickle

# with open(filepath_dict['output_dir'] + '/input_data.pkl', 'wb') as f:
# pickle.dump([tf_feat_mtx_dict, nontf_feat_mtx, features, label_df_dict], f)

# with open(filepath_dict['output_dir'] + '/input_data.pkl', 'rb') as f:
# tf_feat_mtx_dict, nontf_feat_mtx, features, label_df_dict = pickle.load(f)
# # END OF TODO

## Model prediction and explanation
tfpr_explainer = TFPRExplainer(
tf_feat_mtx_dict, nontf_feat_mtx, features,
@@ -97,11 +85,9 @@ def main(argv):
logger.info('==> Cross validating response prediction model <==')
tfpr_explainer.cross_validate()

# TODO: to be removed in release
if not args.disable_shap:
logger.info('==> Analyzing feature contributions <==')
tfpr_explainer.explain()

logger.info('==> Analyzing feature contributions <==')
tfpr_explainer.explain()

logger.info('==> Saving output data <==')
tfpr_explainer.save()

17 changes: 0 additions & 17 deletions CODE/response_explainer.py
@@ -211,7 +211,6 @@ def save(self):
'{}/feat_mtx_nontf.csv.gz'.format(self.output_dirpath), self.nontf_X,
fmt='%.8f', delimiter=',')

# TODO
if hasattr(self, 'shap_vals'):
for k, df in enumerate(self.shap_vals):
df['cv'] = k
@@ -285,22 +284,6 @@ def train_classifier(X, y, model_hyparams):
return model


# TODO: clean up in release
# def train_regressor(X, y):
# """Train a XGBoost regressor.
# """
# model = xgb.XGBRegressor(
# n_estimators=2500,
# learning_rate=.01,
# objective='reg:squarederror',
# booster='gbtree',
# n_jobs=-1,
# random_state=RAND_NUM
# )
# model.fit(X, y)
# return model


def calculate_tree_shap(model, X, genes, X_bg):
"""Calcualte SHAP values for tree-based model.
"""
1 change: 0 additions & 1 deletion CODE/visualization_utils.py
@@ -146,7 +146,6 @@ def get_feature_indices(df, organism):
def calculate_resp_and_unresp_signed_shap_sum(data_dir, tfs=None, organism='yeast', sum_over_type='tf'):
"""Calculate the sum of SHAP values within responsive and unresponsive genes respectively.
"""
# TODO: update shap csv header
print('Loading feature data ...', end=' ')
shap_subdf_list = []
for i, shap_subdf in enumerate(pd.read_csv('{}/feat_shap_wbg.csv.gz'.format(data_dir), chunksize=10 ** 7, low_memory=False)):
71 changes: 28 additions & 43 deletions README.md
@@ -1,63 +1,41 @@
# TF-Perturbation Response Explainer

The ability to predict which genes will respond to perturbation of a TF's activity serves as a benchmark for our systems-level understanding of transcriptional regulatory networks. This repo uses machine learning models to predict each gene's responsiveness to a TF perturbation using genomic data from the unperturbed cells, and explains which genomic factors determine the model predictions.
The ability to predict which genes will respond to perturbation of a transcription factor (TF)'s activity serves as a benchmark for our systems-level understanding of transcriptional regulatory networks. This repo uses machine learning models to predict each gene's responsiveness to a TF perturbation using genomic data from the unperturbed cells, and explains which genomic factors determine the model predictions.

## Installation

Use `miniconda` to manage packages. If on HTCF cluster (SLURM), use `module load miniconda3`; otherwise, install and configure your own conda manager.
Use [miniconda](https://docs.conda.io/en/latest/miniconda.html) to manage packages.

```
conda create -n tfpr_exp python=3.6.10
conda activate tfpr_exp
conda config --append channels conda-forge
conda config --append channels bioconda
conda install numpy==1.19.2 pandas==1.1.3 scikit-learn==0.22.1 jupyterlab==2.2.6 jedi==0.17.2 pybedtools==0.8.0 biopython==1.78 h5py==2.10.0 multiprocess==0.70.11.1 xgboost==0.90 shap==0.35.0 plotnine==0.7.1 hyperopt=0.2.5
conda config --append channels conda-forge --append channels bioconda
conda install --file requirements.txt
```
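
As a quick, optional check that the pinned packages resolved in the new environment, the following snippet (a minimal sketch; it only imports packages pinned above) can be run inside the activated env:

```
# Optional sanity check: import the pinned packages and print their versions.
import numpy, pandas, sklearn, xgboost, shap

for pkg in (numpy, pandas, sklearn, xgboost, shap):
    print(pkg.__name__, pkg.__version__)
```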

## Usage

### Explaining a gene's responsiveness to a TF perturbation

`TFPRExplainer` predicts which gene would response to a TF perturbation and explains the extend to which each feature contributes to the prediction of responsiveness. Use the following example code to load feature matrix, cross validate response predictions, and analyze contributions of relevant genomic features.
`TFPRExplainer` predicts which genes will respond to a TF perturbation and explains the extent to which each feature contributes to the prediction of responsiveness. Use the following example code to load the feature matrix, cross-validate response predictions across TFs, and analyze contributions of relevant genomic features.

It takes two forms of parameters:

- *Command line arguments* define the perturbed TF, the collection of genomic features, and directory paths for input and output data. Use `--help` for details.
- *Command line arguments* define the perturbed TF, the collection of genomic features, and directory paths for input and output data. Use `-h` for details.
- *Configuration parameters* define the boundary of each gene's regulatory DNA, and instructions on processing feature matrix and response label. Reference `config.ini` for default parameters.

For yeast genome, run

```
$ python3 CODE/explain_yeast_resps.py \
-i YLR451W \
-i <tf1 tf2 ... tfn> \
-f tf_binding histone_modifications chromatin_accessibility dna_sequence_nt_freq gene_expression gene_variation \
-x RESOURCES/h5_data/yeast_dna_cc_hm_atac_tss1000to500b_expr_var.h5 \
-y RESOURCES/Yeast_ZEV_IDEA/ZEV_15min_shrunkenData.csv \
-o OUTPUT/Yeast_CallingCards_ZEV/all_feats/
-x <feature matrix (.h5)> \
-y <tf-pert response data (.csv)> \
-o <output directory>
```

For human genome, run

```
$ python3 CODE/explain_human_resps.py \
-i ENSG00000001167 \
-f tf_binding histone_modifications chromatin_accessibility dna_sequence_nt_freq gene_expression gene_variation \
-x RESOURCES/h5_data/human_encode_enhan_alltss_2kbto2kb_promo.h5 \
-y RESOURCES/HumanK562_TFPert/K562_pertResp_DESeq2_long.csv \
-o OUTPUT/Human_ChIPseq_TFpert//all_feats/
```

### Explaining a gene's frequency of response across perturbations

```
$ python3 CODE/explain_yeast_resps.py \
--is_regressor \
-i freq \
-f histone_modifications chromatin_accessibility dna_sequence_nt_freq gene_expression gene_variation \
-x RESOURCES/h5_data/yeast_dna_cc_hm_atac_tss1000to500b_expr_var.h5 \
-y RESOURCES/Yeast_ZEV_IDEA/ZEV_15min_shrunkenData_deFreq.csv \
-o OUTPUT/Yeast_ZEV_DE_freq/tf_indep_feats/xgb/
```
For human genome, use `CODE/explain_human_resps.py` with the above set of arguments.

### Visualizing feature contributions in Jupyter notebooks

@@ -68,20 +46,24 @@ Use Jupyter notebooks in `Notebooks/` to make feature visualization.

### Features

All feature data are store in one hdf5 file. Use the pre-compiled hdf5 file or compile on your own using the following code as an example for yeast data.
The feature matrix data are stored in an hdf5 file. Use the pre-compiled hdf5 file or build your own using the following code for yeast data. Use the `-h` flag for argument details.

```
$ python3 CODE/preprocess_data.py \
-o OUTPUT/h5_data/yeast_s288c_data.h5 \
-a RESOURCES/Yeast_genome/S288C_R64-1-1_sacCer3_orf.bed \
-f RESOURCES/Yeast_genome/S288C_R64-1-1_sacCer3_genome.fa \
--tf_bind RESOURCES/Yeast_CallingCards/*.bed \
--hist_mod RESOURCES/Yeast_HistoneMarks_Weiner2014/*.bed \
--chrom_acc RESOURCES/Yeast_ChromAcc_Schep2015/BY4741_ypd_osm_0min.occ.bed \
--gene_expr RESOURCES/Yeast_ZEV_IDEA/[...].csv \
--gene_var RESOURCES/Yeast_ZEV_IDEA/[...].csv
$ python3 CODE/preprocess_yeast_data.py \
-o <feature matrix (.h5)> \
-a <tss data (.bed)> \
-f <reference genome (.fa)> \
--tf_bind <tf binding peaks (.bed)> \
--hist_mod <histone marks data (.bed)> \
--chrom_acc <chromatin accessibility data (.bed)> \
--gene_expr <pre-pert gex levels (.csv)> \
--gene_var <pre-pert gex variations (.csv)>
```

For human data, use `CODE/preprocess_human_data.py` with the above set of arguments, plus the additional `-r <regulatory elements (.bed)>` for distal enhancer and promoter data.
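
Once a feature matrix has been built, its contents can be spot-checked with `h5py` before training. This is a minimal sketch that assumes only a valid HDF5 file; the path is a placeholder and no internal layout is assumed:

```
# Minimal sketch: list every group/dataset in a built feature matrix file.
import h5py

h5_path = 'path/to/feature_matrix.h5'  # placeholder for the preprocess_*_data.py output
with h5py.File(h5_path, 'r') as f:
    f.visititems(lambda name, obj: print(name, getattr(obj, 'shape', '')))
```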

To run random permutations on human data, enable them with `--permutations` (defaults to 5 runs) and optionally set the number of permutations with `--number_of_permutations <n>`.

### Response label

Store the magnitude of genes' responses to perturbations in wide or long format.
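
As an illustration of the two layouts (column names here are hypothetical, not necessarily the headers the loaders expect), the same data can be reshaped between wide and long form with pandas:

```
# Illustrative only: the same response magnitudes in wide vs. long format.
import pandas as pd

wide = pd.DataFrame({'TF1': [1.2, -0.3], 'TF2': [0.0, 2.1]},
                    index=pd.Index(['geneA', 'geneB'], name='gene'))
long = wide.reset_index().melt(id_vars='gene', var_name='tf', value_name='response')
print(long)
```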
@@ -94,3 +76,6 @@ Store the magnitude of genes' responses to perturbations in wide or long format.
- `feats`: Feature names and their corresponding ranges of column indices in `feat_shap_wbg`.
- `genes`: Gene names corresponding to row indices in `feat_shap_wbg`.
- `feat_mtx`: Feature matrix (gene x feature) constructed from input hdf5.
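
For downstream analysis outside the provided notebooks, the SHAP output can be loaded the same way `CODE/visualization_utils.py` reads it (in chunks, since the file can be large); the output directory below is a placeholder:

```
# Sketch mirroring visualization_utils.py: load feat_shap_wbg.csv.gz in chunks.
import pandas as pd

data_dir = 'path/to/output_dir'  # placeholder for the -o directory used above
chunks = pd.read_csv('{}/feat_shap_wbg.csv.gz'.format(data_dir),
                     chunksize=10 ** 7, low_memory=False)
shap_df = pd.concat(chunks, ignore_index=True)
print(shap_df.shape)
```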

## References
- Kang Y, Jung WJ, Brent MR. 2022. Predicting which genes will respond to transcription factor perturbations. G3 Genes|Genomes|Genetics. [doi:10.1093/g3journal/jkac144](https://doi.org/10.1093/g3journal/jkac144)
13 changes: 13 additions & 0 deletions requirements.txt
@@ -0,0 +1,13 @@
numpy==1.19.2
pandas==1.1.3
scikit-learn==0.22.1
jupyterlab==2.2.6
jedi==0.17.2
pybedtools==0.8.0
biopython==1.78
h5py==2.10.0
multiprocess==0.70.11.1
xgboost==0.90
shap==0.35.0
plotnine==0.7.1
hyperopt=0.2.5
