Merge pull request #6 from BrentLab/g3-release-cleanup
G3 release
yiming-kang authored Jun 18, 2022
2 parents eb9086e + 6386529 commit c7f60bc
Showing 7 changed files with 49 additions and 88 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -11,3 +11,5 @@ NOTEBOOKS/helper_data/*

.*DS_Store
.ipynb_checkpoints/
.idea/
.vscode/
13 changes: 3 additions & 10 deletions CODE/explain_human_resps.py
@@ -44,17 +44,11 @@ def parse_args(argv):
parser.add_argument(
'-N', '--number_of_permutations', type=int, nargs='?', const=5, choices=range(1,100),
help="Number of permutation runs (default 5).")
parser.add_argument(
'--enable_permutation_shap', action='store_true',
help="Enable SHAP for permutation runs. Permutations will default to not calculating SHAP values\n(using this flag will make permutations training significantly longer.")
parser.add_argument(
'--disable_shap', action='store_true',)
parsed = parser.parse_args(argv[1:])
return parsed



def run_tfpr(tf_feat_mtx_dict, nontf_feat_mtx, features, label_df_dict, output_dir, model_hyparams, model_tuning, permutations, number_of_permutations, disable_shap):
def run_tfpr(tf_feat_mtx_dict, nontf_feat_mtx, features, label_df_dict, output_dir, model_hyparams, model_tuning, permutations, number_of_permutations):

last_run_num = 0

@@ -85,8 +79,7 @@ def run_tfpr(tf_feat_mtx_dict, nontf_feat_mtx, features, label_df_dict, output_d
logger.info('==> Cross validating response prediction model <==')
tfpr_explainer.cross_validate(permute=permutations)

# TODO: to be removed in release
if not disable_shap:
if not permutations:
logger.info('==> Analyzing feature contributions <==')
tfpr_explainer.explain()

@@ -133,7 +126,7 @@ def main(argv):
tf_feat_mtx_dict[feat_info_dict['tfs'][0]].shape,
nontf_feat_mtx.shape))

run_tfpr(tf_feat_mtx_dict, nontf_feat_mtx, features, label_df_dict, filepath_dict['output_dir'], model_hyparams, args.model_tuning, args.permutations, args.number_of_permutations, args.disable_shap)
run_tfpr(tf_feat_mtx_dict, nontf_feat_mtx, features, label_df_dict, filepath_dict['output_dir'], model_hyparams, args.model_tuning, args.permutations, args.number_of_permutations)


if __name__ == "__main__":
20 changes: 3 additions & 17 deletions CODE/explain_yeast_resps.py
@@ -37,8 +37,6 @@ def parse_args(argv):
parser.add_argument(
'--model_config', default='MODEL_CONFIG/yeast_default_config.json',
help='Json file for pretrained model hyperparameters.')
parser.add_argument(
'--disable_shap', action='store_true',)
parsed = parser.parse_args(argv[1:])
return parsed

@@ -74,16 +72,6 @@ def main(argv):
tf_feat_mtx_dict[feat_info_dict['tfs'][0]].shape,
nontf_feat_mtx.shape))

# # TODO: delete data pickling
# import pickle

# with open(filepath_dict['output_dir'] + '/input_data.pkl', 'wb') as f:
# pickle.dump([tf_feat_mtx_dict, nontf_feat_mtx, features, label_df_dict], f)

# with open(filepath_dict['output_dir'] + '/input_data.pkl', 'rb') as f:
# tf_feat_mtx_dict, nontf_feat_mtx, features, label_df_dict = pickle.load(f)
# # END OF TODO

## Model prediction and explanation
tfpr_explainer = TFPRExplainer(
tf_feat_mtx_dict, nontf_feat_mtx, features,
@@ -97,11 +85,9 @@ def main(argv):
logger.info('==> Cross validating response prediction model <==')
tfpr_explainer.cross_validate()

# TODO: to be removed in release
if not args.disable_shap:
logger.info('==> Analyzing feature contributions <==')
tfpr_explainer.explain()

logger.info('==> Analyzing feature contributions <==')
tfpr_explainer.explain()

logger.info('==> Saving output data <==')
tfpr_explainer.save()

17 changes: 0 additions & 17 deletions CODE/response_explainer.py
@@ -211,7 +211,6 @@ def save(self):
'{}/feat_mtx_nontf.csv.gz'.format(self.output_dirpath), self.nontf_X,
fmt='%.8f', delimiter=',')

# TODO
if hasattr(self, 'shap_vals'):
for k, df in enumerate(self.shap_vals):
df['cv'] = k
@@ -285,22 +284,6 @@ def train_classifier(X, y, model_hyparams):
return model


# TODO: clean up in release
# def train_regressor(X, y):
# """Train a XGBoost regressor.
# """
# model = xgb.XGBRegressor(
# n_estimators=2500,
# learning_rate=.01,
# objective='reg:squarederror',
# booster='gbtree',
# n_jobs=-1,
# random_state=RAND_NUM
# )
# model.fit(X, y)
# return model


def calculate_tree_shap(model, X, genes, X_bg):
"""Calcualte SHAP values for tree-based model.
"""
1 change: 0 additions & 1 deletion CODE/visualization_utils.py
@@ -146,7 +146,6 @@ def get_feature_indices(df, organism):
def calculate_resp_and_unresp_signed_shap_sum(data_dir, tfs=None, organism='yeast', sum_over_type='tf'):
"""Calculate the sum of SHAP values within responsive and unresponsive genes respectively.
"""
# TODO: update shap csv header
print('Loading feature data ...', end=' ')
shap_subdf_list = []
for i, shap_subdf in enumerate(pd.read_csv('{}/feat_shap_wbg.csv.gz'.format(data_dir), chunksize=10 ** 7, low_memory=False)):
71 changes: 28 additions & 43 deletions README.md
@@ -1,63 +1,41 @@
# TF-Perturbation Response Explainer

The ability to predict which genes will respond to perturbation of a TF's activity serves as a benchmark for our systems-level understanding of transcriptional regulatory networks. This repo uses machine learning models to predict each gene's responsiveness to a TF perturbation using genomic data from the unperturbed cells, and explains which genomic factors determine the model predictions.
The ability to predict which genes will respond to perturbation of a transcription factor (TF)'s activity serves as a benchmark for our systems-level understanding of transcriptional regulatory networks. This repo uses machine learning models to predict each gene's responsiveness to a TF perturbation using genomic data from the unperturbed cells, and explains which genomic factors determine the model predictions.

## Installation

Use `miniconda` to manage packages. If on HTCF cluster (SLURM), use `module load miniconda3`; otherwise, install and configure your own conda manager.
Use [miniconda](https://docs.conda.io/en/latest/miniconda.html) to manage packages.

```
conda create -n tfpr_exp python=3.6.10
conda activate tfpr_exp
conda config --append channels conda-forge
conda config --append channels bioconda
conda install numpy==1.19.2 pandas==1.1.3 scikit-learn==0.22.1 jupyterlab==2.2.6 jedi==0.17.2 pybedtools==0.8.0 biopython==1.78 h5py==2.10.0 multiprocess==0.70.11.1 xgboost==0.90 shap==0.35.0 plotnine==0.7.1 hyperopt=0.2.5
conda config --append channels conda-forge --append channels bioconda
conda install --file requirements.txt
```
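
As a quick, optional check that the pinned packages resolved in the new environment, the following snippet (a minimal sketch; it only imports packages pinned above) can be run inside the activated env:

```
# Optional sanity check: import the pinned packages and print their versions.
import numpy, pandas, sklearn, xgboost, shap

for pkg in (numpy, pandas, sklearn, xgboost, shap):
    print(pkg.__name__, pkg.__version__)
```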

## Usage

### Explaining a gene's responsiveness to a TF perturbation

`TFPRExplainer` predicts which gene would response to a TF perturbation and explains the extend to which each feature contributes to the prediction of responsiveness. Use the following example code to load feature matrix, cross validate response predictions, and analyze contributions of relevant genomic features.
`TFPRExplainer` predicts which genes will respond to a TF perturbation and explains the extent to which each feature contributes to the prediction of responsiveness. Use the following example code to load the feature matrix, cross-validate response predictions across TFs, and analyze contributions of relevant genomic features.

It takes two forms of parameters:

- *Command line arguments* define the perturbed TF, the collection of genomic features, and directory paths for input and output data. Use `--help` for details.
- *Command line arguments* define the perturbed TF, the collection of genomic features, and directory paths for input and output data. Use `-h` for details.
- *Configuration parameters* define the boundary of each gene's regulatory DNA, and instructions on processing feature matrix and response label. Reference `config.ini` for default parameters.

For yeast genome, run

```
$ python3 CODE/explain_yeast_resps.py \
-i YLR451W \
-i <tf1 tf2 ... tfn> \
-f tf_binding histone_modifications chromatin_accessibility dna_sequence_nt_freq gene_expression gene_variation \
-x RESOURCES/h5_data/yeast_dna_cc_hm_atac_tss1000to500b_expr_var.h5 \
-y RESOURCES/Yeast_ZEV_IDEA/ZEV_15min_shrunkenData.csv \
-o OUTPUT/Yeast_CallingCards_ZEV/all_feats/
-x <feature matrix (.h5)> \
-y <tf-pert response data (.csv)> \
-o <output directory>
```

For human genome, run

```
$ python3 CODE/explain_human_resps.py \
-i ENSG00000001167 \
-f tf_binding histone_modifications chromatin_accessibility dna_sequence_nt_freq gene_expression gene_variation \
-x RESOURCES/h5_data/human_encode_enhan_alltss_2kbto2kb_promo.h5 \
-y RESOURCES/HumanK562_TFPert/K562_pertResp_DESeq2_long.csv \
-o OUTPUT/Human_ChIPseq_TFpert//all_feats/
```

### Explaining a gene's frequency of response across perturbations

```
$ python3 CODE/explain_yeast_resps.py \
--is_regressor \
-i freq \
-f histone_modifications chromatin_accessibility dna_sequence_nt_freq gene_expression gene_variation \
-x RESOURCES/h5_data/yeast_dna_cc_hm_atac_tss1000to500b_expr_var.h5 \
-y RESOURCES/Yeast_ZEV_IDEA/ZEV_15min_shrunkenData_deFreq.csv \
-o OUTPUT/Yeast_ZEV_DE_freq/tf_indep_feats/xgb/
```
For human genome, use `CODE/explain_human_resps.py` with the above set of arguments.

### Visualizing feature contributions in Jupyter notebooks

@@ -68,20 +46,24 @@ Use Jupyter notebooks in `Notebooks/` to make feature visualization.

### Features

All feature data are store in one hdf5 file. Use the pre-compiled hdf5 file or compile on your own using the following code as an example for yeast data.
The feature matrix data are stored in an hdf5 file. Use the pre-compiled hdf5 file or build your own using the following code for yeast data. Use the `-h` flag for argument details.

```
$ python3 CODE/preprocess_data.py \
-o OUTPUT/h5_data/yeast_s288c_data.h5 \
-a RESOURCES/Yeast_genome/S288C_R64-1-1_sacCer3_orf.bed \
-f RESOURCES/Yeast_genome/S288C_R64-1-1_sacCer3_genome.fa \
--tf_bind RESOURCES/Yeast_CallingCards/*.bed \
--hist_mod RESOURCES/Yeast_HistoneMarks_Weiner2014/*.bed \
--chrom_acc RESOURCES/Yeast_ChromAcc_Schep2015/BY4741_ypd_osm_0min.occ.bed \
--gene_expr RESOURCES/Yeast_ZEV_IDEA/[...].csv \
--gene_var RESOURCES/Yeast_ZEV_IDEA/[...].csv
$ python3 CODE/preprocess_yeast_data.py \
-o <feature matrix (.h5)> \
-a <tss data (.bed)> \
-f <reference genome (.fa)> \
--tf_bind <tf binding peaks (.bed)> \
--hist_mod <histone marks data (.bed)> \
--chrom_acc <chromatin accessibility data (.bed)> \
--gene_expr <pre-pert gex levels (.csv)> \
--gene_var <pre-pert gex variations (.csv)>
```

For human data, use `CODE/preprocess_human_data.py` with the above set of arguments, plus the additional `-r <regulatory elements (.bed)>` for distal enhancer and promoter data.
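
Once a feature matrix has been built, its contents can be spot-checked with `h5py` before training. This is a minimal sketch that assumes only a valid HDF5 file; the path is a placeholder and no internal layout is assumed:

```
# Minimal sketch: list every group/dataset in a built feature matrix file.
import h5py

h5_path = 'path/to/feature_matrix.h5'  # placeholder for the preprocess_*_data.py output
with h5py.File(h5_path, 'r') as f:
    f.visititems(lambda name, obj: print(name, getattr(obj, 'shape', '')))
```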

To run random permutations on human data, enable them with `--permutations` (defaults to 5 runs) and optionally set the number of permutations with `--number_of_permutations <n>`.

### Response label

Store the magnitude of genes' responses to perturbations in wide or long format.
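
As an illustration of the two layouts (column names here are hypothetical, not necessarily the headers the loaders expect), the same data can be reshaped between wide and long form with pandas:

```
# Illustrative only: the same response magnitudes in wide vs. long format.
import pandas as pd

wide = pd.DataFrame({'TF1': [1.2, -0.3], 'TF2': [0.0, 2.1]},
                    index=pd.Index(['geneA', 'geneB'], name='gene'))
long = wide.reset_index().melt(id_vars='gene', var_name='tf', value_name='response')
print(long)
```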
@@ -94,3 +76,6 @@ Store the magnitude of genes' responses to perturbations in wide or long format.
- `feats`: Feature names and their corresponding ranges of column indices in `feat_shap_wbg`.
- `genes`: Gene names corresponding to row indices in `feat_shap_wbg`.
- `feat_mtx`: Feature matrix (gene x feature) constructed from input hdf5.
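
For downstream analysis outside the provided notebooks, the SHAP output can be loaded the same way `CODE/visualization_utils.py` reads it (in chunks, since the file can be large); the output directory below is a placeholder:

```
# Sketch mirroring visualization_utils.py: load feat_shap_wbg.csv.gz in chunks.
import pandas as pd

data_dir = 'path/to/output_dir'  # placeholder for the -o directory used above
chunks = pd.read_csv('{}/feat_shap_wbg.csv.gz'.format(data_dir),
                     chunksize=10 ** 7, low_memory=False)
shap_df = pd.concat(chunks, ignore_index=True)
print(shap_df.shape)
```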

## References
- Kang Y, Jung WJ, Brent MR. 2022. Predicting which genes will respond to transcription factor perturbations. G3 Genes|Genomes|Genetics. [doi:10.1093/g3journal/jkac144](https://doi.org/10.1093/g3journal/jkac144)
13 changes: 13 additions & 0 deletions requirements.txt
@@ -0,0 +1,13 @@
numpy==1.19.2
pandas==1.1.3
scikit-learn==0.22.1
jupyterlab==2.2.6
jedi==0.17.2
pybedtools==0.8.0
biopython==1.78
h5py==2.10.0
multiprocess==0.70.11.1
xgboost==0.90
shap==0.35.0
plotnine==0.7.1
hyperopt=0.2.5
