-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathshapFeature.py
49 lines (36 loc) · 1.31 KB
/
shapFeature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import numpy as np
import pandas as pd
import xgboost as xgb
import shap
# Booster parameters handed to the native ``xgb.train`` API.
#
# No ``objective`` or ``eval_metric`` is set, so XGBoost's defaults apply.
# The number of boosting rounds is intentionally not part of this dict: the
# native API reads it from the ``num_boost_round`` argument of ``xgb.train``.
# An ``n_estimators`` key (a scikit-learn wrapper argument) is not a booster
# parameter and would only produce an "unused parameter" warning there.
params = {
    'max_depth': 3,          # maximum depth of each tree
    'learning_rate': 0.3,    # shrinkage applied to every boosting step
    'nthread': 4,            # number of threads used by XGBoost
    'subsample': 1.0,        # fraction of rows sampled per tree (1.0 = all)
    'colsample_bytree': 1,   # fraction of columns sampled per tree (1 = all)
    'min_child_weight': 3,   # minimum sum of instance weight in a child
    'seed': 1301,            # fixed random seed for reproducibility
}
# Read the training sample and the validation split from their CSV files.
train = pd.read_csv('Merge_sampled.csv', encoding='utf-8')
val = pd.read_csv('./Raw_data/validate.csv', encoding='utf-8')

# Labels are taken from the 'y' column of each frame.
train_y = train['y']
val_y = val['y']

# Features are every column from position 2 onward.
# NOTE(review): presumably the first two columns hold non-feature data
# (e.g. an id and the label) -- confirm 'y' is not among the kept columns.
train_X = train.iloc[:, 2:]
val_X = val.iloc[:, 2:]

# Wrap both splits in XGBoost's DMatrix container for the native API.
xgtrain = xgb.DMatrix(data=train_X, label=train_y)
xgval = xgb.DMatrix(data=val_X, label=val_y)
# Train the booster with the native API for at most 30 rounds.
# Both data sets are scored every round (verbose_eval=True prints the
# metrics); training stops early once 10 consecutive rounds pass without
# improvement. XGBoost uses the last entry of ``evals`` -- here "valid" --
# as the early-stopping reference.
watchlist = [(xgtrain, "train"), (xgval, "valid")]
model = xgb.train(
    params,
    dtrain=xgtrain,
    num_boost_round=30,
    evals=watchlist,
    early_stopping_rounds=10,
    verbose_eval=True,
)
# Rank the training features by global SHAP importance and save the ranking.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(train_X)

# Global importance = mean absolute SHAP value, averaged over axis 0.
# NOTE(review): assumes shap_values is a 2-D (rows x features) array, i.e. a
# single-output model -- confirm if the objective is ever changed.
global_importances = np.abs(shap_values).mean(0)
print(global_importances.shape)

# Features are identified by their column position in train_X. The count is
# derived from the importance vector instead of being hard-coded, so the
# DataFrame construction no longer fails when the number of feature columns
# differs from the value that was baked into the script.
index = list(range(len(global_importances)))
df = pd.DataFrame({'Feature name': index, 'Shap value': global_importances})

# Most important features first.
df.sort_values(by="Shap value", inplace=True, ascending=False)
print(df)
df.to_csv('Shap_feature.csv', index=False)