Update Python demos with tests. #5651

Merged
3 commits merged on May 12, 2020
6 changes: 4 additions & 2 deletions demo/aft_survival/aft_survival_demo.py
@@ -1,14 +1,16 @@
"""
Demo for survival analysis (regression) using Accelerated Failure Time (AFT) model
"""
import os
from sklearn.model_selection import ShuffleSplit
import pandas as pd
import numpy as np
import xgboost as xgb

# The Veterans' Administration Lung Cancer Trial
# The Statistical Analysis of Failure Time Data by Kalbfleisch J. and Prentice R (1980)
df = pd.read_csv('../data/veterans_lung_cancer.csv')
CURRENT_DIR = os.path.dirname(__file__)
df = pd.read_csv(os.path.join(CURRENT_DIR, '../data/veterans_lung_cancer.csv'))
print('Training data:')
print(df)

@@ -39,7 +41,7 @@
'lambda': 0.01,
'alpha': 0.02}
bst = xgb.train(params, dtrain, num_boost_round=10000,
evals=[(dtrain, 'train'), (dvalid, 'valid')],
early_stopping_rounds=50)

# Run prediction on the validation set
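The hunk above only shows the path fix; for readers unfamiliar with the AFT objective this demo exercises, here is a minimal sketch of the AFT-specific setup (synthetic data and illustrative hyper-parameter values, not the contents of the demo file):

```python
import numpy as np
import xgboost as xgb

# synthetic interval-censored labels: uncensored rows have lower == upper,
# right-censored rows get an infinite upper bound
rng = np.random.RandomState(0)
X = rng.rand(100, 3)
y_lower = rng.rand(100) * 10
y_upper = np.where(rng.rand(100) < 0.3, np.inf, y_lower)

dtrain = xgb.DMatrix(X)
dtrain.set_float_info('label_lower_bound', y_lower)
dtrain.set_float_info('label_upper_bound', y_upper)

params = {'objective': 'survival:aft',
          'eval_metric': 'aft-nloglik',
          'aft_loss_distribution': 'normal',
          'aft_loss_distribution_scale': 1.20,
          'tree_method': 'hist', 'learning_rate': 0.05, 'max_depth': 2}
bst = xgb.train(params, dtrain, num_boost_round=50,
                evals=[(dtrain, 'train')])
```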
6 changes: 3 additions & 3 deletions demo/c-api/c-api-demo.c
@@ -20,12 +20,12 @@ if (err != 0) { \
int main(int argc, char** argv) {
int silent = 0;
int use_gpu = 0; // set to 1 to use the GPU for training

// load the data
DMatrixHandle dtrain, dtest;
safe_xgboost(XGDMatrixCreateFromFile("../data/agaricus.txt.train", silent, &dtrain));
safe_xgboost(XGDMatrixCreateFromFile("../data/agaricus.txt.test", silent, &dtest));

// create the booster
BoosterHandle booster;
DMatrixHandle eval_dmats[2] = {dtrain, dtest};
@@ -49,7 +49,7 @@ int main(int argc, char** argv) {
safe_xgboost(XGBoosterSetParam(booster, "gamma", "0.1"));
safe_xgboost(XGBoosterSetParam(booster, "max_depth", "3"));
safe_xgboost(XGBoosterSetParam(booster, "verbosity", silent ? "0" : "1"));

// train and evaluate for 10 iterations
int n_trees = 10;
const char* eval_names[2] = {"train", "test"};
4 changes: 1 addition & 3 deletions demo/gpu_acceleration/README.md
@@ -1,5 +1,3 @@
# GPU Acceleration Demo

`cover_type.py` shows how to train a model on the [forest cover type](https://archive.ics.uci.edu/ml/datasets/covertype) dataset using GPU acceleration. The forest cover type dataset has 581,012 rows and 54 features, making it time consuming to process. We compare the run-time and accuracy of the GPU and CPU histogram algorithms.

`memory.py` shows how to repeatedly train xgboost models while freeing memory between iterations.
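For context, the comparison described above boils down to training the same model twice with different `tree_method` values and timing both runs. A rough sketch of that idea (`gpu_hist` was the GPU tree method at the time of this PR; the parameter values here are illustrative, not copied from `cover_type.py`):

```python
import time
import xgboost as xgb
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split

data = fetch_covtype()
X, y = data.data, data.target - 1   # shift labels to 0..6 for multi:softmax
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=7)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {'objective': 'multi:softmax', 'num_class': 7, 'max_depth': 8}
for tree_method in ('gpu_hist', 'hist'):   # GPU first, then CPU
    params['tree_method'] = tree_method
    start = time.time()
    xgb.train(params, dtrain, num_boost_round=50, evals=[(dtest, 'test')])
    print(tree_method, 'took', time.time() - start, 'seconds')
```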
1 change: 0 additions & 1 deletion demo/gpu_acceleration/cover_type.py
@@ -1,5 +1,4 @@
import xgboost as xgb
import numpy as np
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
import time
51 changes: 0 additions & 51 deletions demo/gpu_acceleration/memory.py

This file was deleted.

1 change: 0 additions & 1 deletion demo/guide-python/basic_walkthrough.py
100755 → 100644
@@ -1,4 +1,3 @@
#!/usr/bin/env python
import numpy as np
import scipy.sparse
import pickle
10 changes: 6 additions & 4 deletions demo/guide-python/boost_from_prediction.py
100755 → 100644
@@ -1,15 +1,17 @@
#!/usr/bin/python
import os
import xgboost as xgb

dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')

CURRENT_DIR = os.path.dirname(__file__)
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
###
# advanced: start from an initial base prediction
#
print('start running example to start from an initial prediction')
# specify parameters via map; definitions are the same as the C++ version
param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
# train xgboost for 1 round
bst = xgb.train(param, dtrain, 1, watchlist)
# Note: we need the margin value instead of transformed prediction in
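The part of this demo below the truncated hunk continues training from the first model's raw margins. A minimal sketch of that mechanism, assuming `bst`, `param`, `dtrain`, `dtest` and `watchlist` as defined above:

```python
# take the raw margin (not the sigmoid-transformed probability) of the 1-round
# model and register it as the starting score for further boosting
ptrain = bst.predict(dtrain, output_margin=True)
ptest = bst.predict(dtest, output_margin=True)
dtrain.set_base_margin(ptrain)
dtest.set_base_margin(ptest)

# this call now boosts on top of the initial prediction
bst = xgb.train(param, dtrain, 1, watchlist)
```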
11 changes: 6 additions & 5 deletions demo/guide-python/cross_validation.py
100755 → 100644
@@ -1,10 +1,11 @@
#!/usr/bin/python
import os
import numpy as np
import xgboost as xgb

### load data in do training
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
# load data and do training
CURRENT_DIR = os.path.dirname(__file__)
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic'}
num_round = 2

print('running cross validation')
@@ -56,7 +57,7 @@ def evalerror(preds, dtrain):
labels = dtrain.get_label()
return 'error', float(sum(labels != (preds > 0.0))) / len(labels)

param = {'max_depth':2, 'eta':1, 'silent':1}
param = {'max_depth':2, 'eta':1}
# train with customized objective
xgb.cv(param, dtrain, num_round, nfold=5, seed=0,
obj=logregobj, feval=evalerror)
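Not visible in the hunk: `xgb.cv` returns its history as a pandas DataFrame, which is what the new tests can assert on. A small sketch of reading that result, assuming the `binary:logistic` `param`, `dtrain` and `num_round` from the top of the file:

```python
res = xgb.cv(param, dtrain, num_boost_round=num_round, nfold=5,
             metrics=['error'], seed=0)
print(res)                              # one row per boosting round
print(res['test-error-mean'].iloc[-1])  # mean test error of the last round
```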
9 changes: 5 additions & 4 deletions demo/guide-python/custom_objective.py
100755 → 100644
@@ -1,18 +1,19 @@
#!/usr/bin/python
import os
import numpy as np
import xgboost as xgb
###
# advanced: customized loss function
#
print('start running example to use a customized objective function')

dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
CURRENT_DIR = os.path.dirname(__file__)
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))

# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
param = {'max_depth': 2, 'eta': 1, 'silent': 1}
param = {'max_depth': 2, 'eta': 1}
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 2

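The `logregobj`/`evalerror` pair this demo builds is cut off by the truncation above; a minimal sketch of what such a custom objective and metric look like (gradient and hessian of the logistic loss, error computed on raw margins):

```python
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))   # margins -> probabilities
    grad = preds - labels
    hess = preds * (1.0 - preds)
    return grad, hess

def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    # preds are raw margins here, so the decision threshold is 0.0 rather than 0.5
    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)

bst = xgb.train(param, dtrain, num_round, watchlist,
                obj=logregobj, feval=evalerror)
```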
10 changes: 6 additions & 4 deletions demo/guide-python/evals_result.py
@@ -1,13 +1,15 @@
##
# This script demonstrates how to access the eval metrics in xgboost
##

import os
import xgboost as xgb
dtrain = xgb.DMatrix('../data/agaricus.txt.train', silent=True)
dtest = xgb.DMatrix('../data/agaricus.txt.test', silent=True)

CURRENT_DIR = os.path.dirname(__file__)
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))

param = [('max_depth', 2), ('objective', 'binary:logistic'), ('eval_metric', 'logloss'), ('eval_metric', 'error')]

num_round = 2
watchlist = [(dtest,'eval'), (dtrain,'train')]

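The rest of the script (truncated here) collects the per-round metrics by passing a dict through the `evals_result` keyword; a short sketch of that pattern with the variables defined above:

```python
evals_result = {}
bst = xgb.train(param, dtrain, num_round, watchlist,
                evals_result=evals_result)

# evals_result is keyed by watchlist name, then by metric name
print(evals_result['eval']['logloss'])   # list of per-round logloss values
print(evals_result['train']['error'])
```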
13 changes: 5 additions & 8 deletions demo/guide-python/external_memory.py
100755 → 100644
@@ -1,18 +1,17 @@
#!/usr/bin/python
import numpy as np
import scipy.sparse
import os
import xgboost as xgb

### simple example for using external memory version

# this is the only difference, add a # followed by a cache prefix name
# several cache file with the prefix will be generated
# currently only support convert from libsvm file
dtrain = xgb.DMatrix('../data/agaricus.txt.train#dtrain.cache')
dtest = xgb.DMatrix('../data/agaricus.txt.test#dtest.cache')
CURRENT_DIR = os.path.dirname(__file__)
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))

# specify validations set to watch performance
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic'}

# performance notice: set nthread to the number of physical CPU cores
# some CPUs offer two threads per core; for example, on a 4-core CPU with 8 threads, set nthread=4
@@ -21,5 +20,3 @@
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist)
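For reference, the external-memory syntax the removed lines (and the remaining comments) refer to appended a cache prefix to a libsvm path. A hedged sketch of that older interface, in case someone wants to restore it locally:

```python
# appending '#<prefix>.cache' makes xgboost stream the libsvm file from disk,
# writing several <prefix>.cache* files as it goes
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR,
                                  '../data/agaricus.txt.train') + '#dtrain.cache')
```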


3 changes: 1 addition & 2 deletions demo/guide-python/gamma_regression.py
100755 → 100644
@@ -1,4 +1,3 @@
#!/usr/bin/python
import xgboost as xgb
import numpy as np

@@ -12,7 +11,7 @@

# for gamma regression, we need to set the objective to 'reg:gamma'; it is also
# suggested to set base_score to a value between 1 and 5 if the number of iterations is small
param = {'silent':1, 'objective':'reg:gamma', 'booster':'gbtree', 'base_score':3}
param = {'objective':'reg:gamma', 'booster':'gbtree', 'base_score':3}

# the rest of settings are the same
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
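As a quick follow-up to the comment about `base_score`, the training call itself looks like any other regression; a minimal sketch assuming `dtrain`/`dtest` carry a strictly positive target (the eval metric shown here is an assumption, not part of the demo file):

```python
param = {'objective': 'reg:gamma', 'booster': 'gbtree', 'base_score': 3,
         'eval_metric': 'gamma-nloglik'}      # or 'gamma-deviance'
bst = xgb.train(param, dtrain, num_boost_round=30,
                evals=[(dtest, 'eval'), (dtrain, 'train')])
preds = bst.predict(dtest)                    # always positive, on the original scale
```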
9 changes: 5 additions & 4 deletions demo/guide-python/generalized_linear_model.py
100755 → 100644
@@ -1,16 +1,17 @@
#!/usr/bin/python
import os
import xgboost as xgb
##
# this script demonstrates how to fit a generalized linear model in xgboost
# basically, we are using a linear model instead of a tree as our booster
##
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
CURRENT_DIR = os.path.dirname(__file__)
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
# change booster to gblinear, so that we are fitting a linear model
# alpha is the L1 regularizer
# lambda is the L2 regularizer
# you can also set lambda_bias which is L2 regularizer on the bias term
param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear',
param = {'objective':'binary:logistic', 'booster':'gblinear',
'alpha': 0.0001, 'lambda': 1}

# normally, you do not need to set eta (step_size)
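Since the hunk already shows the parameters, here is only a small follow-up sketch of training and inspecting the fitted linear model (for `gblinear` the dump text lists the bias and one weight per feature, so `get_dump()` is effectively the model itself):

```python
num_round = 4
bst = xgb.train(param, dtrain, num_round, [(dtest, 'eval'), (dtrain, 'train')])
preds = bst.predict(dtest)

# print the learned bias and per-feature weights
print(bst.get_dump()[0])
```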
13 changes: 7 additions & 6 deletions demo/guide-python/predict_first_ntree.py
100755 → 100644
@@ -1,17 +1,18 @@
#!/usr/bin/python
import os
import numpy as np
import xgboost as xgb

### load data in do training
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
# load data and do training
CURRENT_DIR = os.path.dirname(__file__)
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 3
bst = xgb.train(param, dtrain, num_round, watchlist)

print('start testing prediction from first n trees')
### predict using first 1 tree
# predict using only the first tree
label = dtest.get_label()
ypred1 = bst.predict(dtest, ntree_limit=1)
# by default, we predict using all the trees
17 changes: 9 additions & 8 deletions demo/guide-python/predict_leaf_indices.py
100755 → 100644
@@ -1,19 +1,20 @@
#!/usr/bin/python
import os
import xgboost as xgb

### load data in do training
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
# load data and do training
CURRENT_DIR = os.path.dirname(__file__)
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 3
bst = xgb.train(param, dtrain, num_round, watchlist)

print ('start testing predict the leaf indices')
### predict using first 2 tree
print('start testing predict the leaf indices')
# predict using the first 2 trees
leafindex = bst.predict(dtest, ntree_limit=2, pred_leaf=True)
print(leafindex.shape)
print(leafindex)
### predict all trees
# predict all trees
leafindex = bst.predict(dtest, pred_leaf=True)
print(leafindex.shape)
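A common downstream use of these leaf indices, not part of the demo itself, is one-hot encoding them as categorical features for a second-stage model; a short sketch:

```python
from sklearn.preprocessing import OneHotEncoder

# leafindex has shape (n_samples, n_trees); each entry is the id of the leaf a
# sample lands in, so the matrix can be one-hot encoded tree by tree
leaf_features = OneHotEncoder().fit_transform(leafindex)
print(leaf_features.shape)
```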
13 changes: 0 additions & 13 deletions demo/guide-python/runall.sh

This file was deleted.

4 changes: 2 additions & 2 deletions demo/guide-python/sklearn_evals_result.py
@@ -20,7 +20,7 @@
# Or you can use: clf = xgb.XGBClassifier(**param_dist)

clf.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
eval_metric='logloss',
verbose=True)

@@ -37,7 +37,7 @@
for e_mtr_name, e_mtr_vals in e_mtrs.items():
print(' - {}'.format(e_mtr_name))
print(' - {}'.format(e_mtr_vals))

print('')
print('Access complete dict:')
print(evals_result)
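For completeness, the dict printed above comes from `clf.evals_result()`; with the scikit-learn wrapper the keys are generated from the order of `eval_set`, so (assuming the usual naming) a single series can be read back like this:

```python
evals_result = clf.evals_result()
# 'validation_0' corresponds to (X_train, y_train), 'validation_1' to (X_test, y_test)
print(evals_result['validation_1']['logloss'][-1])   # final-round logloss on the test set
```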
8 changes: 3 additions & 5 deletions demo/guide-python/sklearn_examples.py
100755 → 100644
@@ -1,4 +1,3 @@
#!/usr/bin/python
'''
Created on 1 Apr 2015

@@ -52,9 +51,9 @@
X = boston['data']
xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(xgb_model,
{'max_depth': [2,4,6],
'n_estimators': [50,100,200]}, verbose=1)
clf.fit(X,y)
{'max_depth': [2, 4, 6],
'n_estimators': [50, 100, 200]}, verbose=1)
clf.fit(X, y)
print(clf.best_score_)
print(clf.best_params_)

@@ -73,4 +72,3 @@
clf = xgb.XGBClassifier()
clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
eval_set=[(X_test, y_test)])

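A small follow-up to the early-stopping call above, sketching how the stopping point can be read back from the fitted estimator (attribute names as in the 1.x scikit-learn wrapper; not part of the demo file):

```python
print(clf.best_score)        # best AUC observed on the eval_set
print(clf.best_iteration)    # boosting round at which it was observed
preds = clf.predict(X_test, ntree_limit=clf.best_ntree_limit)
```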