forked from dmlc/xgboost
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
107 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
This folder gives an example of how to use XGBoost for the Kaggle Higgs competition. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
#!/usr/bin/python
# Example script: train an xgboost model on the Kaggle Higgs training data
# and save it to 'higgs.model'.
import sys
import numpy as np
# add path of xgboost python module
sys.path.append('../../python/')
import xgboost as xgb

# training weights are rescaled so that they sum as if the training set
# had this many rows (the size of the test set)
test_size = 550000

# path to where the data lies
dpath = 'data'

# load in training data, directly use numpy
# column 32 is the label text: map 's' to 1 and everything else to 0
# (both str and bytes are accepted, since numpy may hand either form
# to the converter depending on its version)
dtrain = np.loadtxt(dpath + '/training.csv', delimiter=',', skiprows=1,
                    converters={32: lambda x: int(x in ('s', b's'))})
label = dtrain[:, 32]
# columns 1..30 are the features, column 31 is the per-row weight
data = dtrain[:, 1:31]
# rescale weight to make it same as test set
weight = dtrain[:, 31] * float(test_size) / len(label)

# total weight of the positive and of the negative examples
sum_wpos = weight[label == 1.0].sum()
sum_wneg = weight[label == 0.0].sum()

# print weight statistics
print('weight statistics: wpos=%g, wneg=%g, ratio=%g'
      % (sum_wpos, sum_wneg, sum_wneg / sum_wpos))

# construct xgboost.DMatrix from numpy array, treat -999.0 as missing value
# NOTE(review): 'weight' is computed above but never attached to the
# DMatrix -- confirm whether this xgboost version accepts instance weights
xtrain = xgb.DMatrix(data, label=label, missing=-999.0)

# setup parameters for xgboost
param = {}
# use logistic regression loss
param['loss_type'] = 3
# scale weight of positive examples so that both classes carry equal
# total weight (negative weight mass / positive weight mass)
param['scale_pos_weight'] = sum_wneg / sum_wpos
param['bst:eta'] = 0.1
param['bst:max_depth'] = 6
param['eval_metric'] = 'ams@0.15'
param['silent'] = 1
param['eval_train'] = 1
param['nthread'] = 16

# boost 120 trees
num_round = 120
print('loading data end, start to boost trees')
# NOTE(review): argument order kept as in the original call -- verify
# against the signature of xgb.train in the bundled python module
bst = xgb.train(xtrain, param, num_round)
# save out model
bst.save_model('higgs.model')

print('finish training')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
#!/usr/bin/python
# Example script: load the trained xgboost model, score the Kaggle Higgs
# test data and write the ranked predictions to a CSV file.
import sys
import numpy as np
# add path of xgboost python module
sys.path.append('../../python/')
import xgboost as xgb

# path to where the data lies
dpath = 'data'

modelfile = 'higgs.model'
outfile = 'higgs.pred.csv'
# make top 15% as positive
threshold_ratio = 0.15

# load in test data, directly use numpy
dtest = np.loadtxt(dpath + '/test.csv', delimiter=',', skiprows=1)
# columns 1..30 are the features; column 0 is the event id
data = dtest[:, 1:31]
idx = dtest[:, 0]

# treat -999.0 as missing value, same as in training
xtest = xgb.DMatrix(data, missing=-999.0)
bst = xgb.Booster()
bst.load_model(modelfile)

# predict on the DMatrix, not on the raw numpy array
ypred = bst.predict(xtest)
res = [(int(event_id), score) for event_id, score in zip(idx, ypred)]

# rank events by descending score: the highest score gets rank 1
# NOTE(review): confirm this ordering matches what the submission
# format expects for RankOrder
rorder = {}
for k, v in sorted(res, key=lambda x: -x[1]):
    rorder[k] = len(rorder) + 1

# write out predictions
ntop = int(threshold_ratio * len(rorder))
nhit = 0  # number of events labelled as signal
ntot = 0  # number of rows written
with open(outfile, 'w') as fo:
    fo.write('EventId,RankOrder,Class\n')
    for k, v in res:
        # events ranked within the top threshold_ratio are signal ('s')
        if rorder[k] <= ntop:
            lb = 's'
            nhit += 1
        else:
            lb = 'b'
        fo.write('%s,%d,%s\n' % (k, rorder[k], lb))
        ntot += 1

print('finished writing into prediction file')
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
#!/bin/bash
# Train the model, then write predictions with it.
# Stop at the first failure so prediction never runs against a missing
# or stale model when training did not succeed.
set -e

./higgs-numpy.py
./higgs-pred.py