From 2be3f6ece0253187f9cde5fc8f733b2f1e44fed1 Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 15 May 2014 20:28:34 -0700 Subject: [PATCH] fix numpy convert --- LICENSE | 2 +- python/example/demo.py | 2 ++ python/xgboost.py | 15 +++++++++++---- python/xgboost_python.cpp | 29 +++++++++++++++++++++++++++++ python/xgboost_python.h | 22 +++++++++++++++++++++- 5 files changed, 64 insertions(+), 6 deletions(-) diff --git a/LICENSE b/LICENSE index 2d9ea05e4d05..b9f38c38aaf9 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2014 Tianqi Chen +Copyright (c) 2014 by Tianqi Chen and Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/python/example/demo.py b/python/example/demo.py index 4f8c4dc7979a..f5e0aa2a77b6 100755 --- a/python/example/demo.py +++ b/python/example/demo.py @@ -68,7 +68,9 @@ i += 1 csr = scipy.sparse.csr_matrix( (dat, (row,col)) ) +print 'haha' dtrain = xgb.DMatrix( csr ) +print 'set label' dtrain.set_label(labels) evallist = [(dtest,'eval'), (dtrain,'train')] bst = xgb.train( param, dtrain, num_round, evallist ) diff --git a/python/xgboost.py b/python/xgboost.py index 8d9297489fb2..879d75b654c8 100644 --- a/python/xgboost.py +++ b/python/xgboost.py @@ -33,15 +33,16 @@ def ctypes2numpy( cptr, length ): # data matrix used in xgboost class DMatrix: # constructor - def __init__(self, data=None, label=None): + def __init__(self, data=None, label=None, missing=0.0): self.handle = xglib.XGDMatrixCreate() if data == None: return if isinstance(data,str): - xglib.XGDMatrixLoad(self.handle, ctypes.c_char_p(data), 1) - + xglib.XGDMatrixLoad(self.handle, ctypes.c_char_p(data), 1) elif isinstance(data,scp.csr_matrix): self.__init_from_csr(data) + elif isinstance(data, numpy.ndarray) and len(data.shape) == 2: + self.__init_from_npy2d(data, missing) else: try: csr = scp.csr_matrix(data) @@ -59,6 +60,12 @@ def __init_from_csr(self,csr): ( ctypes.c_uint * len(csr.indices) )(*csr.indices), ( ctypes.c_float * len(csr.data) )(*csr.data), len(csr.indptr), len(csr.data) ) + # convert data from numpy matrix + def __init_from_npy2d(self,mat,missing): + data = numpy.array( mat.reshape(mat.size), dtype='float32' ) + xglib.XGDMatrixParseMat( self.handle, + data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), + mat.shape[0], mat.shape[1], ctypes.c_float(missing) ) # destructor def __del__(self): xglib.XGDMatrixFree(self.handle) @@ -103,7 +110,7 @@ def __getitem__(self, ridx): class Booster: """learner class """ - def __init__(self, params, cache=[]): + def __init__(self, params={}, cache=[]): """ constructor, param: """ for d in cache: assert isinstance(d,DMatrix) diff --git a/python/xgboost_python.cpp b/python/xgboost_python.cpp index 9d3975c81402..8dd210c52a9d 100644 --- a/python/xgboost_python.cpp +++ b/python/xgboost_python.cpp @@ -52,6 +52,28 @@ namespace xgboost{ for( size_t i = 0; i < nelem; ++ i ){ mat.row_data_[i] = XGEntry(indices[i], data[i]); } + this->data.InitData(); + this->init_col_ = true; + } + + inline void ParseMat( const float *data, + size_t nrow, + size_t ncol, + float missing ){ + xgboost::booster::FMatrixS &mat = this->data; + mat.Clear(); + for( size_t i = 0; i < nrow; ++i, data += ncol ){ + size_t nelem = 0; + for( size_t j = 0; j < ncol; ++j ){ + if( data[j] != missing ){ + mat.row_data_.push_back( XGEntry(j, data[j]) ); + ++ nelem; + } + } + mat.row_ptr_.push_back( mat.row_ptr_.back() + nelem ); + } + this->data.InitData(); + this->init_col_ = true; } inline void SetLabel( const float *label, size_t len ){ this->info.labels.resize( len ); @@ -163,6 +185,13 @@ extern "C"{ size_t nelem ){ static_cast(handle)->ParseCSR(indptr, indices, data, nindptr, nelem); } + void XGDMatrixParseMat( void *handle, + const float *data, + size_t nrow, + size_t ncol, + float missing ){ + static_cast(handle)->ParseMat(data, nrow, ncol, missing); + } void XGDMatrixSetLabel( void *handle, const float *label, size_t len ){ static_cast(handle)->SetLabel(label,len); } diff --git a/python/xgboost_python.h b/python/xgboost_python.h index b3529a8f0eb1..ac3ca94ac1f4 100644 --- a/python/xgboost_python.h +++ b/python/xgboost_python.h @@ -49,6 +49,19 @@ extern "C"{ const float *data, size_t nindptr, size_t nelem ); + /*! + * \brief set matrix content from data content + * \param handle a instance of data matrix + * \param data pointer to the data space + * \param nrow number of rows + * \param ncol number columns + * \param missing which value to represent missing value + */ + void XGDMatrixParseMat( void *handle, + const float *data, + size_t nrow, + size_t ncol, + float missing ); /*! * \brief set label of the training matrix * \param handle a instance of data matrix @@ -74,9 +87,16 @@ extern "C"{ * \brief get label set from matrix * \param handle a instance of data matrix * \param len used to set result length - * \return pointer to the row + * \return pointer to the label */ const float* XGDMatrixGetLabel( const void *handle, size_t* len ); + /*! + * \brief get weight set from matrix + * \param handle a instance of data matrix + * \param len used to set result length + * \return pointer to the weight + */ + const float* XGDMatrixGetWeight( const void *handle, size_t* len ); /*! * \brief clear all the records, including feature matrix and label * \param handle a instance of data matrix