xgboost_lr.py
# Author:马肖
# E-mail:maxiaoscut@aliyun.com
# Github:/~https://github.com/Albertsr
import numpy as np
from scipy.sparse import hstack
from xgboost import XGBClassifier
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import OneHotEncoder
# Generate a synthetic dataset for the experiment
X, y = make_classification(n_samples=10000, n_features=20, n_informative=18, n_redundant=2,
                           n_classes=2, n_clusters_per_class=3, random_state=2017)
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
# Baseline: train XGBoost directly on the original features, without generating new ones
clf = XGBClassifier(n_estimators=50)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
print("Original featrues")
print("XGB_ACC: {:.6f}".format(acc))
print("XGB_AUC: {:.6f}".format(auc))
# Generate new features: clf.apply returns, for each sample, the index of the leaf it falls into in every tree
X_train_leaves = clf.apply(X_train)
X_test_leaves = clf.apply(X_test)
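# Added note: for a binary XGBClassifier, clf.apply is expected to return an integer
# matrix of shape (n_samples, n_estimators), i.e. one leaf index per tree; the
# assertion below is only a sanity check under that assumption.
assert X_train_leaves.shape == (X_train.shape[0], clf.n_estimators)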
# Stack X_train_leaves and X_test_leaves along axis=0, then apply OneHotEncoder
All_leaves = np.r_[X_train_leaves, X_test_leaves]
# Each column of the leaf-index matrix is categorical rather than 0/1 binary, so OneHotEncoder is required
enc = OneHotEncoder(categories='auto')
new_features = enc.fit_transform(All_leaves)
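# Added note: new_features is a sparse matrix whose column count equals the total
# number of distinct leaves observed across all trees, so it is usually much wider
# than the original 20-column feature matrix.
print("One-hot leaf features shape:", new_features.shape)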
# Split the new features back into training and test parts using the original row counts
train_samples = X_train.shape[0]
X_train_new = new_features[:train_samples, :]
X_test_new = new_features[train_samples: , :]
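# Added note: this split relies on np.r_ keeping the training rows first in
# All_leaves; the quick shape check below guards that assumption.
assert X_train_new.shape[0] == X_train.shape[0] and X_test_new.shape[0] == X_test.shape[0]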
# Concatenate the original features with the tree-generated leaf features, then train LR
X_train_hstack = hstack([X_train_new, X_train])
X_test_hstack = hstack([X_test_new, X_test])
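# Added note: scipy.sparse.hstack accepts a mix of sparse and dense blocks and returns
# a sparse result, which LogisticRegression can consume directly, so the combined
# feature matrix never needs to be densified.
print("Stacked training matrix shape:", X_train_hstack.shape)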
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
lr.fit(X_train_hstack, y_train)
y_pred = lr.predict(X_test_hstack)
y_prob = lr.predict_proba(X_test_hstack)[:, 1]
# Evaluate the XGBoost + LR model
XGB_LR_ACC = accuracy_score(y_test, y_pred)
XGB_LR_AUC = roc_auc_score(y_test, y_prob)
print("\nNew featrues: ")
print('XGB_LR_ACC: {:.6f}'.format(XGB_LR_ACC))
print('XGB_LR_AUC: {:.6f}'.format(XGB_LR_AUC))