fix(RF): performance of feature sampling for node splits (#2292)
* disable bottlenecks caused by memorySavingMode=false

* feat: add drawSample function with O(k) runtime

* Use drawKFromBufferWithoutReplacement in df training

* sample directly in findBestSplit function to make it O(_maxFeatures)

* rename service_memset_sequential -> service_memset_incrementing

* feat: status checks for node splitting algorithms
ahuber21 authored Mar 24, 2023
1 parent 5e080cf commit af225ad
Showing 4 changed files with 169 additions and 63 deletions.
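The core of the change is replacing an O(n) feature draw with an O(k) one. Below is a minimal standalone sketch of the partial Fisher-Yates technique behind the new drawKFromBufferWithoutReplacement primitive; the function name, signature, and std::mt19937 generator are illustrative assumptions, not oneDAL's actual RNG service.

#include <cstddef>
#include <random>

// Sketch only: draw k distinct entries (k <= n) from pool[0..n-1] in O(k)
// time via partial Fisher-Yates. The pool is permuted in place, so no entry
// is lost and the next draw can reuse it without reinitialization.
template <typename IndexType>
void drawKWithoutReplacement(IndexType * dst, IndexType * pool, std::size_t k, std::size_t n, std::mt19937 & gen)
{
    for (std::size_t i = 0; i < k; ++i)
    {
        // pick a position among the n - i entries that are still active
        std::uniform_int_distribution<std::size_t> dist(0, n - i - 1);
        const std::size_t j = dist(gen);
        dst[i]          = pool[j];         // record the draw
        pool[j]         = pool[n - 1 - i]; // move the last active entry into the hole
        pool[n - 1 - i] = dst[i];          // park the drawn value at the shrinking tail
    }
}

Each iteration does constant work, so drawing the features for a node no longer scales with the total feature count.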
187 changes: 126 additions & 61 deletions cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i
@@ -115,6 +115,15 @@ private:
WorkItem * _data; // array of heap elements, max element is on the left
};

//////////////////////////////////////////////////////////////////////////////////////////
// Service structure, node split error & splitting status
//////////////////////////////////////////////////////////////////////////////////////////
struct NodeSplitResult
{
services::Status status;
bool bSplitSucceeded;
};
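// Intended calling pattern (illustrative, mirroring the DAAL_ASSERT checks
// added below): a bad status is a service error to propagate, whereas
// bSplitSucceeded == false only means no acceptable split exists and the
// node becomes a leaf:
//
//     NodeSplitResult res = findBestSplit(/* ... */);
//     if (!res.status.ok()) return res.status;
//     if (!res.bSplitSucceeded) { /* make a leaf */ }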

//////////////////////////////////////////////////////////////////////////////////////////
// Service structure, contains numeric tables to be calculated as result
//////////////////////////////////////////////////////////////////////////////////////////
@@ -595,14 +604,14 @@ protected:
algorithmFPType imp);
typename DataHelper::NodeType::Leaf * makeLeaf(const IndexType * idx, size_t n, typename DataHelper::ImpurityData & imp, size_t nClasses);

bool findBestSplit(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iBestFeature,
typename DataHelper::TSplitData & split, algorithmFPType totalWeights);
bool findBestSplitSerial(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iBestFeature,
typename DataHelper::TSplitData & split, algorithmFPType totalWeights);
bool findBestSplitThreaded(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iBestFeature,
typename DataHelper::TSplitData & split, algorithmFPType totalWeights);
bool simpleSplit(size_t iStart, const typename DataHelper::ImpurityData & curImpurity, IndexType & iFeatureBest,
typename DataHelper::TSplitData & split);
NodeSplitResult findBestSplit(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity,
IndexType & iBestFeature, typename DataHelper::TSplitData & split, algorithmFPType totalWeights);
NodeSplitResult findBestSplitSerial(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity,
IndexType & iBestFeature, typename DataHelper::TSplitData & split, algorithmFPType totalWeights);
NodeSplitResult findBestSplitThreaded(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity,
IndexType & iBestFeature, typename DataHelper::TSplitData & split, algorithmFPType totalWeights);
NodeSplitResult simpleSplit(size_t iStart, const typename DataHelper::ImpurityData & curImpurity, IndexType & iFeatureBest,
typename DataHelper::TSplitData & split);
void addImpurityDecrease(IndexType iFeature, size_t n, const typename DataHelper::ImpurityData & curImpurity,
const typename DataHelper::TSplitData & split);

@@ -619,7 +628,7 @@ protected:
const size_t nGen = (!_par.memorySavingMode && !_maxLeafNodes && !_useConstFeatures) ? n : _nFeaturesPerNode;
*_numElems += n;
RNGs<IndexType, cpu> rng;
rng.uniformWithoutReplacement(nGen, _aFeatureIdx.get(), _aFeatureIdx.get() + nGen, _engineImpl->getState(), 0, n);
rng.drawKFromBufferWithoutReplacement(nGen, _aFeatureIdx.get(), _aFeatureIdx.get() + nGen, _engineImpl->getState(), n);
}
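// Semantics assumed for the new primitive, inferred from this call site
// rather than from the RNG service header:
//   drawKFromBufferWithoutReplacement(k, dst, pool, state, n)
// draws k distinct values into dst from an n-element index pool kept in the
// second part of _aFeatureIdx, permuting the pool in place so later draws
// can reuse it: O(k) work instead of uniformWithoutReplacement's O(n).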

services::Status computeResults(const dtrees::internal::Tree & t);
@@ -683,16 +692,18 @@ services::Status TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, c
_aFeatureBuf.reset(_nFeatureBufs);
_aFeatureIndexBuf.reset(_nFeatureBufs);

if (!_par.memorySavingMode && !_maxLeafNodes && !_useConstFeatures)
{
_aFeatureIdx.reset(maxFeatures * 2); // maxFeatures elements are used by algorithm, others are used internally by generator
_aConstFeatureIdx.reset(maxFeatures * 2); // first maxFeatures elements are used for saving indices of constant features,
// the other part is used for saving levels of these features
DAAL_CHECK_MALLOC(_aConstFeatureIdx.get());
services::internal::service_memset_seq<IndexType, cpu>(_aConstFeatureIdx.get(), IndexType(0), maxFeatures * 2);
}
else
_aFeatureIdx.reset(_nFeaturesPerNode * 2); // _nFeaturesPerNode elements are used by algorithm, others are used internally by generator
/* the first maxFeatures entries serve as a buffer of drawn samples for node splitting */
/* the second maxFeatures entries contain [0, ..., maxFeatures - 1] and are used to randomly draw indices */
_aFeatureIdx.reset(maxFeatures * 2);
_aConstFeatureIdx.reset(maxFeatures * 2);

DAAL_CHECK_MALLOC(_aConstFeatureIdx.get());
services::internal::service_memset_seq<IndexType, cpu>(_aConstFeatureIdx.get(), IndexType(0), 2 * maxFeatures);
// in order to use drawKFromBufferWithoutReplacement we need to initialize
// the buffer to contain all indices from [0, 1, ..., maxFeatures - 1]
DAAL_CHECK_MALLOC(_aFeatureIdx.get());
services::internal::service_memset_seq<IndexType, cpu>(_aFeatureIdx.get(), IndexType(0), maxFeatures);
services::internal::service_memset_incrementing<IndexType, cpu>(_aFeatureIdx.get() + maxFeatures, IndexType(0), maxFeatures);

DAAL_CHECK_MALLOC(_aSample.get() && _helper.reset(_nSamples) && _helper.resetWeights(_nSamples) && _aFeatureBuf.get() && _aFeatureIndexBuf.get()
&& _aFeatureIdx.get());
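// Resulting layout (illustrative, maxFeatures = 4):
//   _aFeatureIdx      = [0, 0, 0, 0 | 0, 1, 2, 3]
//                        draw buffer | index pool for Fisher-Yates draws
//   _aConstFeatureIdx = [0, 0, 0, 0 | 0, 0, 0, 0]
//                        const-feature indices | their levels (all reset to 0)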
@@ -798,7 +809,10 @@ typename DataHelper::NodeType::Base * TrainBatchTaskBase<algorithmFPType, BinInd

typename DataHelper::TSplitData split;
IndexType iFeature;
if (findBestSplit(level, iStart, n, curImpurity, iFeature, split, totalWeights))

NodeSplitResult split_result = findBestSplit(level, iStart, n, curImpurity, iFeature, split, totalWeights);
DAAL_ASSERT(split_result.status.ok());
if (split_result.bSplitSucceeded)
{
const size_t nLeft = split.nLeft;
const double imp = curImpurity.var;
@@ -844,6 +858,7 @@ typename DataHelper::NodeType::Base * TrainBatchTaskBase<algorithmFPType, BinInd
DAAL_ASSERT(split.nLeft == right->count);
return res;
}

return makeLeaf(_aSample.get() + iStart, n, curImpurity, nClasses);
}

@@ -859,7 +874,10 @@ typename DataHelper::NodeType::Base * TrainBatchTaskBase<algorithmFPType, BinInd
{
return makeLeaf(_aSample.get() + item.start, item.n, impurity, nClasses);
}
else if (findBestSplit(level, item.start, item.n, impurity, iFeature, split, item.totalWeights))

NodeSplitResult split_result = findBestSplit(level, item.start, item.n, impurity, iFeature, split, item.totalWeights);
DAAL_ASSERT(split_result.status.ok());
if (split_result.bSplitSucceeded)
{
const double imp = impurity.var;
const double impLeft = split.left.var;
@@ -896,10 +914,8 @@ typename DataHelper::NodeType::Base * TrainBatchTaskBase<algorithmFPType, BinInd
return item.node;
}
}
else
{
return makeLeaf(_aSample.get() + item.start, item.n, impurity, nClasses);
}

return makeLeaf(_aSample.get() + item.start, item.n, impurity, nClasses);
}

template <typename algorithmFPType, typename BinIndexType, typename DataHelper, CpuType cpu>
@@ -1032,37 +1048,40 @@ typename DataHelper::NodeType::Base * TrainBatchTaskBase<algorithmFPType, BinInd
}

template <typename algorithmFPType, typename BinIndexType, typename DataHelper, CpuType cpu>
bool TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::simpleSplit(size_t iStart,
const typename DataHelper::ImpurityData & curImpurity,
IndexType & iFeatureBest,
typename DataHelper::TSplitData & split)
NodeSplitResult TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::simpleSplit(size_t iStart,
const typename DataHelper::ImpurityData & curImpurity,
IndexType & iFeatureBest,
typename DataHelper::TSplitData & split)
{
services::Status st;
RNGs<IndexType, cpu> rng;
algorithmFPType featBuf[2];
IndexType * aIdx = _aSample.get() + iStart;
for (size_t i = 0; i < _nFeaturesPerNode; ++i)
{
IndexType iFeature;
*_numElems += 1;
rng.uniform(1, &iFeature, _engineImpl->getState(), 0, _data->getNumberOfColumns());
int errorcode = rng.uniform(1, &iFeature, _engineImpl->getState(), 0, _data->getNumberOfColumns());
if (errorcode)
{
st = services::Status(services::ErrorNullResult);
}
featureValuesToBuf(iFeature, featBuf, aIdx, 2);
if (featBuf[1] - featBuf[0] <= _accuracy) //all values of the feature are the same
continue;
_helper.simpleSplit(featBuf, aIdx, split);
split.featureUnordered = _featHelper.isUnordered(iFeature);
split.impurityDecrease = curImpurity.var;
iFeatureBest = iFeature;
return true;
return { st, true };
}
return false;
return { st, false };
}

template <typename algorithmFPType, typename BinIndexType, typename DataHelper, CpuType cpu>
bool TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBestSplit(size_t level, size_t iStart, size_t n,
const typename DataHelper::ImpurityData & curImpurity,
IndexType & iFeatureBest,
typename DataHelper::TSplitData & split,
algorithmFPType totalWeights)
NodeSplitResult TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBestSplit(
size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iFeatureBest,
typename DataHelper::TSplitData & split, algorithmFPType totalWeights)
{
if (n == 2)
{
@@ -1078,26 +1097,67 @@ bool TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBes

//find best split and put it to featureIndexBuf
template <typename algorithmFPType, typename BinIndexType, typename DataHelper, CpuType cpu>
bool TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBestSplitSerial(size_t level, size_t iStart, size_t n,
const typename DataHelper::ImpurityData & curImpurity,
IndexType & iBestFeature,
typename DataHelper::TSplitData & bestSplit,
algorithmFPType totalWeights)
NodeSplitResult TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBestSplitSerial(
size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iBestFeature,
typename DataHelper::TSplitData & bestSplit, algorithmFPType totalWeights)
{
chooseFeatures();
size_t nVisitedFeature = 0;
const size_t maxFeatures = nFeatures();
const float qMax = 0.02; //min fraction of observations to be handled as indexed feature values
IndexType * bestSplitIdx = featureIndexBuf(0) + iStart;
IndexType * aIdx = _aSample.get() + iStart;
int iBestSplit = -1;
int idxFeatureValueBestSplit = -1; //when sorted feature is used
services::Status st;

/* counter of visited features; we visit up to _nFeaturesPerNode features,
* and depending on _useConstFeatures, constant features can be skipped
*/
size_t nVisitedFeature = 0;
/* total number of features */
const size_t maxFeatures = nFeatures();
/* minimum fraction of all samples per bin */
const algorithmFPType qMax = 0.02;
/* buffer for the sample index order associated with the best split */
IndexType * bestSplitIdx = featureIndexBuf(0) + iStart;
/* sample index */
IndexType * aIdx = _aSample.get() + iStart;
/* zero-based index of best split */
int64_t iBestSplit = -1;
int64_t idxFeatureValueBestSplit = -1;
typename DataHelper::TSplitData split;
const float fact = float(n);
/* RNG for sample drawing */
RNGs<IndexType, cpu> rng;
/* index for swapping samples in Fisher-Yates sampling */
IndexType swapIdx;

for (size_t i = 0; i < maxFeatures && nVisitedFeature < _nFeaturesPerNode; ++i)
{
const auto iFeature = _aFeatureIdx[i];
const bool bUseIndexedFeatures = (!_par.memorySavingMode) && (fact > qMax * float(_helper.indexedFeatures().numIndices(iFeature)));
/* draw a random feature without replacement, based on Fisher-Yates sampling */
// _aFeatureIdx has a length of 2 * maxFeatures:
// the first maxFeatures entries contain the currently selected features;
// at iteration i, we have already drawn i features and written them to
// _aFeatureIdx[0, 1, ..., i-1]
//
// the second half of the buffer contains all numbers from
// [0, 1, ..., maxFeatures-1], and we randomly select one without
// replacement; drawing uniformly from [0, maxFeatures-i) and swapping
// the indices ensures uniform probability over all drawn numbers

/* draw the i-th index of the sample */
int errorcode = rng.uniform(1, &swapIdx, _engineImpl->getState(), 0, maxFeatures - i);
if (errorcode)
{
st = services::Status(services::ErrorNullResult);
}

/* shift into the second half of the buffer, where the index pool lives */
swapIdx += maxFeatures;
/* _aFeatureIdx[swapIdx] was drawn */
_aFeatureIdx[i] = _aFeatureIdx[swapIdx];
/* swap in number at [2 * maxFeatures - 1 - i] for next draw */
_aFeatureIdx[swapIdx] = _aFeatureIdx[2 * maxFeatures - 1 - i];
/* store drawn number at end of number buffer so that no number is lost */
_aFeatureIdx[2 * maxFeatures - 1 - i] = _aFeatureIdx[i];
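// Illustrative trace (values assumed, not from the source): maxFeatures = 4,
// so the index pool occupies slots [4..7], starting as [0, 1, 2, 3].
//   i = 0: draw from [0, 4) returns 1 -> swapIdx = 5; feature 1 is selected;
//          slot 5 takes slot 7's value; slot 7 parks the drawn 1;
//          pool is now [0, 3, 2 | 1] with three active entries.
//   i = 1: draw from [0, 3) returns 0 -> swapIdx = 4; feature 0 is selected;
//          slot 4 takes slot 6's value; slot 6 parks the drawn 0;
//          pool is now [2, 3 | 0, 1], still a permutation, so nothing is lost.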

const auto iFeature = _aFeatureIdx[i];
const bool bUseIndexedFeatures =
(!_par.memorySavingMode) && (algorithmFPType(n) > qMax * algorithmFPType(_helper.indexedFeatures().numIndices(iFeature)));

if (!_maxLeafNodes && !_useConstFeatures && !_par.memorySavingMode)
{
@@ -1154,7 +1214,14 @@ bool TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBes
#endif
}
}
if (iBestSplit < 0) return false; //not found

if (!st.ok() || iBestSplit < 0)
{
// either an error occurred during splitting -> failure,
// or no split was found -> not a failure, but we still have to return
return { st, false };
}

iBestFeature = _aFeatureIdx[iBestSplit];
bool bCopyToIdx = true;
@@ -1193,20 +1260,18 @@ bool TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBes
bCopyToIdx = (iBestSplit + 1 < _nFeaturesPerNode); //if iBestSplit is the last considered feature
//then aIdx already contains the best split, no need to copy
if (bCopyToIdx) services::internal::tmemcpy<IndexType, cpu>(aIdx, bestSplitIdx, n);
return true;
return { st, true };
}

template <typename algorithmFPType, typename BinIndexType, typename DataHelper, CpuType cpu>
bool TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBestSplitThreaded(size_t level, size_t iStart, size_t n,
const typename DataHelper::ImpurityData & curImpurity,
IndexType & iFeatureBest,
typename DataHelper::TSplitData & split,
algorithmFPType totalWeights)
NodeSplitResult TrainBatchTaskBase<algorithmFPType, BinIndexType, DataHelper, cpu>::findBestSplitThreaded(
size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iFeatureBest,
typename DataHelper::TSplitData & split, algorithmFPType totalWeights)
{
chooseFeatures();
TArray<typename DataHelper::TSplitData, cpu> aFeatureSplit(_nFeaturesPerNode);
//TODO, if parallel for features
return false;
return { services::Status(services::ErrorMethodNotSupported), false };
}

template <typename algorithmFPType, typename BinIndexType, typename DataHelper, CpuType cpu>
@@ -56,8 +56,8 @@ Parameter::Parameter()
minWeightFractionInLeafNode(0.),
minImpurityDecreaseInSplitNode(0.),
maxLeafNodes(0),
minBinSize(5),
maxBins(256)
maxBins(256),
minBinSize(5)
{}
} // namespace interface2
Status checkImpl(const decision_forest::training::interface2::Parameter & prm)
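The initializer swap above is not cosmetic: C++ initializes members in declaration order, not in the order the initializer list names them, and the new order implies maxBins is declared before minBinSize. A minimal sketch of the pattern, using a hypothetical struct:

#include <cstddef>

struct P
{
    std::size_t maxBins;    // declared first, so always initialized first
    std::size_t minBinSize; // declared second, so always initialized second
    // Listing minBinSize(5) before maxBins(256) here would trigger GCC's
    // -Wreorder warning, since initialization follows declaration order anyway.
    P() : maxBins(256), minBinSize(5) {}
};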
13 changes: 13 additions & 0 deletions cpp/daal/src/externals/service_memory.h
@@ -127,6 +127,7 @@ T * service_memset(T * const ptr, const T value, const size_t num)
return ptr;
}

/* Initialize a block of memory of length num with the given value */
template <typename T, CpuType cpu>
void service_memset_seq(T * const ptr, const T value, const size_t num)
{
@@ -138,6 +139,18 @@ void service_memset_seq(T * const ptr, const T value, const size_t num)
}
}

/* Initialize a block of memory of length num with entries [startValue, ..., startValue + num - 1] */
template <typename T, CpuType cpu>
void service_memset_incrementing(T * const ptr, const T startValue, const size_t num)
{
PRAGMA_IVDEP
PRAGMA_VECTOR_ALWAYS
for (size_t i = 0; i < num; i++)
{
ptr[i] = startValue + i;
}
}

} // namespace internal
} // namespace services
} // namespace daal
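A usage sketch for the new helper, mirroring how the training code above builds its feature-index buffer; the wrapper function is hypothetical, and the include path is an assumption based on this file's location:

#include "src/externals/service_memory.h" // assumed include path

// Hypothetical helper: prepare a 2 * maxFeatures buffer the way the decision
// forest training code does, with zeros in the first half (the draw buffer)
// and the identity permutation [0, ..., maxFeatures - 1] in the second half
// (the Fisher-Yates index pool).
template <typename IndexType, daal::CpuType cpu>
void initFeaturePool(IndexType * aFeatureIdx, size_t maxFeatures)
{
    daal::services::internal::service_memset_seq<IndexType, cpu>(aFeatureIdx, IndexType(0), maxFeatures);
    daal::services::internal::service_memset_incrementing<IndexType, cpu>(aFeatureIdx + maxFeatures, IndexType(0), maxFeatures);
}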