diff --git a/shared/libebm/BoosterCore.cpp b/shared/libebm/BoosterCore.cpp index 605890431..3aef30e27 100644 --- a/shared/libebm/BoosterCore.cpp +++ b/shared/libebm/BoosterCore.cpp @@ -823,20 +823,26 @@ ErrorEbm BoosterCore::Create(void* const rng, } const size_t cBytesPerMainBin = GetBinSize(true, true, bHessian, cScores); - - if(IsAddError(cBytesPerMainBin, sizeof(void*))) { - LOG_0(Trace_Warning, "WARNING BoosterCore::Create IsAddError(cBytesPerMainBin, sizeof(void*))"); - return Error_OutOfMemory; - } - const size_t cBytesMainBinPlusPointer = cBytesPerMainBin + sizeof(void*); - if(IsMultiplyError(cBytesMainBinPlusPointer, cMainBinsMax)) { + if(IsMultiplyError(cBytesPerMainBin, cMainBinsMax)) { LOG_0(Trace_Warning, "WARNING BoosterCore::Create IsMultiplyError(cBytesPerMainBin, cMainBinsMax)"); return Error_OutOfMemory; } - // we also allocate enough space to create an additional array of pointers - pBoosterCore->m_cBytesMainBins = cBytesMainBinPlusPointer * cMainBinsMax; + size_t cBytesMainBins = cBytesPerMainBin * cMainBinsMax; if(0 != cSingleDimensionBinsMax) { + if(IsAddError(cBytesPerMainBin, sizeof(void*))) { + LOG_0(Trace_Warning, "WARNING BoosterCore::Create IsAddError(cBytesPerMainBin, sizeof(void*))"); + return Error_OutOfMemory; + } + const size_t cBytesMainBinPlusPointer = cBytesPerMainBin + sizeof(void*); + if(IsMultiplyError(cBytesMainBinPlusPointer, cSingleDimensionBinsMax)) { + LOG_0(Trace_Warning, + "WARNING BoosterCore::Create IsMultiplyError(cBytesMainBinPlusPointer, cSingleDimensionBinsMax)"); + return Error_OutOfMemory; + } + // we also allocate enough space to create an additional array of pointers + cBytesMainBins = EbmMax(cBytesMainBins, cBytesMainBinPlusPointer * cSingleDimensionBinsMax); + if(IsOverflowTreeNodeSize(bHessian, cScores) || IsOverflowSplitPositionSize(bHessian, cScores)) { LOG_0(Trace_Warning, "WARNING BoosterCore::Create bin tracking size overflow"); return Error_OutOfMemory; @@ -881,6 +887,7 @@ ErrorEbm BoosterCore::Create(void* const rng, EBM_ASSERT(0 == pBoosterCore->m_cBytesSplitPositions); EBM_ASSERT(0 == pBoosterCore->m_cBytesTreeNodes); } + pBoosterCore->m_cBytesMainBins = cBytesMainBins; } if(0 != cTerms) { error = InitializeTensors(cTerms, pBoosterCore->m_apTerms, cScores, &pBoosterCore->m_apCurrentTermTensors); diff --git a/shared/libebm/CalcInteractionStrength.cpp b/shared/libebm/CalcInteractionStrength.cpp index 0c3614880..ce7ff42fc 100644 --- a/shared/libebm/CalcInteractionStrength.cpp +++ b/shared/libebm/CalcInteractionStrength.cpp @@ -178,8 +178,8 @@ EBM_API_BODY ErrorEbm EBM_CALLING_CONVENTION CalcInteractionStrength(Interaction LOG_0(Trace_Warning, "WARNING CalcInteractionStrength maxCardinality can't be less than 0. Turning off."); } - size_t cSamplesLeafMin = size_t{0}; // this is the min value - if(IntEbm{0} <= minSamplesLeaf) { + size_t cSamplesLeafMin = size_t{1}; // this is the min value + if(IntEbm{1} <= minSamplesLeaf) { cSamplesLeafMin = static_cast(minSamplesLeaf); if(IsConvertError(minSamplesLeaf)) { // we can never exceed a size_t number of samples, so let's just set it to the maximum if we were going to @@ -187,7 +187,7 @@ EBM_API_BODY ErrorEbm EBM_CALLING_CONVENTION CalcInteractionStrength(Interaction cSamplesLeafMin = std::numeric_limits::max(); } } else { - LOG_0(Trace_Warning, "WARNING CalcInteractionStrength minSamplesLeaf can't be less than 0. Adjusting to 0."); + LOG_0(Trace_Warning, "WARNING CalcInteractionStrength minSamplesLeaf can't be less than 1. Adjusting to 1."); } FloatCalc hessianMin = static_cast(minHessian); diff --git a/shared/libebm/GenerateTermUpdate.cpp b/shared/libebm/GenerateTermUpdate.cpp index c04bdffad..36a5f5395 100644 --- a/shared/libebm/GenerateTermUpdate.cpp +++ b/shared/libebm/GenerateTermUpdate.cpp @@ -781,8 +781,8 @@ EBM_API_BODY ErrorEbm EBM_CALLING_CONVENTION GenerateTermUpdate(void* rng, LOG_0(Trace_Warning, "WARNING GenerateTermUpdate learningRate is negative"); } - size_t cSamplesLeafMin = size_t{0}; // this is the min value - if(IntEbm{0} <= minSamplesLeaf) { + size_t cSamplesLeafMin = size_t{1}; // this is the min value + if(IntEbm{1} <= minSamplesLeaf) { cSamplesLeafMin = static_cast(minSamplesLeaf); if(IsConvertError(minSamplesLeaf)) { // we can never exceed a size_t number of samples, so let's just set it to the maximum if we were going to @@ -790,7 +790,7 @@ EBM_API_BODY ErrorEbm EBM_CALLING_CONVENTION GenerateTermUpdate(void* rng, cSamplesLeafMin = std::numeric_limits::max(); } } else { - LOG_0(Trace_Warning, "WARNING GenerateTermUpdate minSamplesLeaf can't be less than 0. Adjusting to 0."); + LOG_0(Trace_Warning, "WARNING GenerateTermUpdate minSamplesLeaf can't be less than 1. Adjusting to 1."); } FloatCalc hessianMin = static_cast(minHessian); @@ -820,8 +820,8 @@ EBM_API_BODY ErrorEbm EBM_CALLING_CONVENTION GenerateTermUpdate(void* rng, deltaStepMax = std::numeric_limits::infinity(); } - size_t cCategorySamplesMin = size_t{0}; // this is the min value - if(IntEbm{0} <= minCategorySamples) { + size_t cCategorySamplesMin = size_t{1}; // this is the min value + if(IntEbm{1} <= minCategorySamples) { cCategorySamplesMin = static_cast(minCategorySamples); if(IsConvertError(minCategorySamples)) { // we can never exceed a size_t number of samples, so let's just set it to the maximum if we were going to @@ -829,7 +829,7 @@ EBM_API_BODY ErrorEbm EBM_CALLING_CONVENTION GenerateTermUpdate(void* rng, cCategorySamplesMin = std::numeric_limits::max(); } } else { - LOG_0(Trace_Warning, "WARNING GenerateTermUpdate minSamplesLeaf can't be less than 0. Adjusting to 0."); + LOG_0(Trace_Warning, "WARNING GenerateTermUpdate minSamplesLeaf can't be less than 1. Adjusting to 1."); } FloatCalc categoricalSmoothingCalc = static_cast(categoricalSmoothing); diff --git a/shared/libebm/PartitionOneDimensionalBoosting.cpp b/shared/libebm/PartitionOneDimensionalBoosting.cpp index cf27433dd..32b9fa184 100644 --- a/shared/libebm/PartitionOneDimensionalBoosting.cpp +++ b/shared/libebm/PartitionOneDimensionalBoosting.cpp @@ -112,8 +112,8 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell, const FloatCalc regLambda, const FloatCalc deltaStepMax, const size_t iDimension, - const Bin** const apBins, - const Bin** const apBinsEnd, + Bin** const apBins, + Bin** const apBinsEnd, const TreeNode* pMissingValueTreeNode, const TreeNode* pDregsTreeNode, const size_t cSlices, @@ -276,7 +276,7 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell, } EBM_ASSERT(!bNominal); - iEdge = ppBinLast - apBins + 1; + iEdge = reinterpret_cast(ppBinLast) - reinterpret_cast(apBins) + 1; while(true) { // not a real loop if(bMissing) { @@ -484,7 +484,6 @@ static int FindBestSplitGain(RandomDeterministic* const pRng, const FloatCalc regLambda, const FloatCalc deltaStepMax, const MonotoneDirection monotoneDirection, - const Bin* const pMissingBin, bool* pbMissingIsolated, const TreeNode** const ppMissingValueTreeNode, const TreeNode** const ppDregsTreeNode, @@ -503,7 +502,6 @@ static int FindBestSplitGain(RandomDeterministic* const pRng, "regLambda=%le, " "deltaStepMax=%le, " "monotoneDirection=%" MonotoneDirectionPrintf ", " - "pMissingBin=%p, " "pbMissingIsolated=%p, " "ppMissingValueTreeNode=%p, " "ppDregsTreeNode=%p, " @@ -519,12 +517,15 @@ static int FindBestSplitGain(RandomDeterministic* const pRng, static_cast(regLambda), static_cast(deltaStepMax), monotoneDirection, - static_cast(pMissingBin), static_cast(pbMissingIsolated), static_cast(ppMissingValueTreeNode), static_cast(ppDregsTreeNode), static_cast(pDregSumBin)); + EBM_ASSERT(nullptr != pBoosterShell); + EBM_ASSERT(nullptr != pTreeNode); + EBM_ASSERT(nullptr != pTreeNodeScratchSpace); + EBM_ASSERT(nullptr != pbMissingIsolated); EBM_ASSERT(nullptr != ppMissingValueTreeNode); EBM_ASSERT(nullptr != ppDregsTreeNode); EBM_ASSERT(nullptr == *ppDregsTreeNode && nullptr == pDregSumBin || @@ -533,19 +534,19 @@ static int FindBestSplitGain(RandomDeterministic* const pRng, const auto* const* ppBinCur = pTreeNode->BEFORE_GetBinFirst(); const auto* const* ppBinLast = pTreeNode->BEFORE_GetBinLast(); + const auto* const aBins = + pBoosterShell->GetBoostingMainBins() + ->Specialize(); + if(ppBinCur == ppBinLast) { // There is just one bin and therefore no splits pTreeNode->AFTER_RejectSplit(); - if(pMissingBin == *ppBinCur) { + if(aBins == *ppBinCur) { *pbMissingIsolated = true; } return 1; } - const auto* const aBins = - pBoosterShell->GetBoostingMainBins() - ->Specialize(); - BoosterCore* const pBoosterCore = pBoosterShell->GetBoosterCore(); const size_t cScores = GET_COUNT_SCORES(cCompilerScores, pBoosterCore->GetCountScores()); @@ -695,6 +696,7 @@ static int FindBestSplitGain(RandomDeterministic* const pRng, const FloatMain gradIncOrig = aIncGradHess[iScore].m_sumGradients + aBinGradHess[iScore].m_sumGradients; aIncGradHess[iScore].m_sumGradients = gradIncOrig; const FloatCalc gradDec = static_cast(aParentGradHess[iScore].m_sumGradients - gradIncOrig); + const FloatCalc gradInc = static_cast(gradIncOrig); if(bHessian) { const FloatMain newHessIncOrig = aIncGradHess[iScore].GetHess() + aBinGradHess[iScore].GetHess(); @@ -718,8 +720,6 @@ static int FindBestSplitGain(RandomDeterministic* const pRng, bLegal = false; } - const FloatCalc gradInc = static_cast(gradIncOrig); - if(MONOTONE_NONE != monotoneAdjusted) { const FloatCalc negUpdateDec = CalcNegUpdate(gradDec, hessDecUpdate, regAlpha, regLambda, deltaStepMax); @@ -913,14 +913,14 @@ template class CompareBin final { m_categoricalSmoothing = categoricalSmoothing; } - INLINE_ALWAYS bool operator()( - const Bin*& lhs, - const Bin*& rhs) const noexcept { + INLINE_ALWAYS bool operator()(Bin*& lhs, + Bin*& rhs) const noexcept { // NEVER check for exact equality (as a precondition is ok), since then we'd violate the weak ordering rule // https://medium.com/@shiansu/strict-weak-ordering-and-the-c-stl-f7dcfa4d4e07 EBM_ASSERT(!std::isnan(m_categoricalSmoothing)); + // this only works for bins with 1 score since with multiple what would the sort index be? FloatCalc val1 = static_cast(lhs->GetGradientPairs()[0].m_sumGradients); FloatCalc val2 = static_cast(rhs->GetGradientPairs()[0].m_sumGradients); if(!std::isinf(m_categoricalSmoothing)) { @@ -939,6 +939,7 @@ template class CompareBin final { } if(val1 == val2) { + // As the tie breaker to make the sort values unique, use the pointer itself. return lhs < rhs; } return val1 < val2; @@ -971,7 +972,15 @@ template class PartitionOneDimensionalBoo size_t cSamplesTotal, FloatMain weightTotal, double* const pTotalGain) { + EBM_ASSERT(nullptr != pBoosterShell); EBM_ASSERT(2 <= cBins); // filter these out at the start where we can handle this case easily + EBM_ASSERT(1 <= cSamplesLeafMin); + EBM_ASSERT(std::numeric_limits::min() <= hessianMin); + EBM_ASSERT(1 <= cCategorySamplesMin); + EBM_ASSERT(0 <= categoricalSmoothing); + EBM_ASSERT(2 <= categoricalThresholdMax); + EBM_ASSERT(0.0 <= categoricalInclusionPercent); + EBM_ASSERT(categoricalInclusionPercent <= 1.0); EBM_ASSERT(1 <= cSplitsMax); // filter these out at the start where we can handle this case easily EBM_ASSERT(nullptr != pTotalGain); EBM_ASSERT(!bNominal || MONOTONE_NONE == monotoneDirection); @@ -990,7 +999,6 @@ template class PartitionOneDimensionalBoo auto* const pRootTreeNode = pBoosterShell->GetTreeNodesTemp(); pRootTreeNode->Init(); - // we can only sort if there's a single sortable index, so 1 score value bNominal = bNominal && (0 == (TermBoostFlags_DisableCategorical & flags)); auto* const aBins = @@ -998,49 +1006,42 @@ template class PartitionOneDimensionalBoo ->Specialize(); auto* pBinsEnd = IndexBin(aBins, cBytesPerBin * cBins); - const auto** const apBins = - reinterpret_cast**>( + auto** const apBins = + reinterpret_cast**>( pBinsEnd); - const auto** ppBin = apBins; + auto** ppBin = apBins; auto* pBin = aBins; - const Bin* pMissingBin = nullptr; - bool bMissingIsolated = false; - const TreeNode* pMissingValueTreeNode = nullptr; - const TreeNode* pDregsTreeNode = nullptr; + bool bMissingIsolated = false; Bin* pDregSumBin = nullptr; + const TreeNode* pDregsTreeNode = nullptr; const auto* aSumBins = aBins; - if(bMissing) { - if(!bNominal) { - // For nominal, missing is just a category. - - if(TermBoostFlags_MissingLow & flags) { - pMissingBin = pBin; - } else if(TermBoostFlags_MissingHigh & flags) { - pMissingBin = pBin; - pBin = IndexBin(pBin, cBytesPerBin); - } else if(TermBoostFlags_MissingSeparate & flags) { - cSamplesTotal -= static_cast(aSumBins->GetCountSamples()); - weightTotal -= aSumBins->GetWeight(); - aSumBins = IndexBin(aSumBins, cBytesPerBin); - - pBin = IndexBin(pBin, cBytesPerBin); + if(bMissing && !bNominal) { + // For nominal, missing is just a category. + + if(TermBoostFlags_MissingLow & flags) { + } else if(TermBoostFlags_MissingHigh & flags) { + pBin = IndexBin(pBin, cBytesPerBin); + } else if(TermBoostFlags_MissingSeparate & flags) { + cSamplesTotal -= static_cast(aBins->GetCountSamples()); + weightTotal -= aBins->GetWeight(); + aSumBins = IndexBin(aBins, cBytesPerBin); + + pBin = IndexBin(pBin, cBytesPerBin); + } else { + if(2 == cBins) { + // as a special case, if there are only 2 bins, then treat it like TermBoostFlags_MissingLow + // because merging the missing bin into the only non-missing bin is not going to be useful. + flags |= TermBoostFlags_MissingLow; } else { - if(2 == cBins) { - // as a special case, if there are only 2 bins, then treat it like TermBoostFlags_MissingLow - // because merging the missing bin into the only category will generate no gain - flags |= TermBoostFlags_MissingLow; - pMissingBin = pBin; - } else { - pMissingValueTreeNode = pRootTreeNode; - // Skip the missing bin in the pointer to pointer mapping since it will not be part of the continuous - // region. - pBin = IndexBin(pBin, cBytesPerBin); - } + pMissingValueTreeNode = pRootTreeNode; + // Skip the missing bin in the pointer to pointer mapping since it will not be part of the continuous + // region. + pBin = IndexBin(pBin, cBytesPerBin); } } } @@ -1124,12 +1125,6 @@ template class PartitionOneDimensionalBoo return error; } - error = pInnerTermUpdate->EnsureTensorScoreCapacity(cScores); - if(UNLIKELY(Error_None != error)) { - // already logged - return error; - } - FloatScore* pUpdateScore = pInnerTermUpdate->GetTensorScoresPointer(); FloatScore hess = static_cast(pRootTreeNode->GetBin()->GetWeight()); const auto* pGradientPair = pRootTreeNode->GetBin()->GetGradientPairs(); @@ -1148,32 +1143,35 @@ template class PartitionOneDimensionalBoo } while(pGradientPairEnd != pGradientPair); *pTotalGain = 0; - return error; + return Error_None; } size_t cKeep = static_cast(std::round(categoricalInclusionPercent * cRemaining)); if(cRemaining <= cKeep && categoricalInclusionPercent < 1.0) { + // If categoricalInclusionPercent is anything other than 1.0 then drop one category at least. cKeep = cRemaining - 1; } if(categoricalThresholdMax < cKeep) { cKeep = categoricalThresholdMax; } if(cKeep <= 1) { + // 1 category means we will have no splits, so disallow that even if it means rounding up. cKeep = 2; } if(cRemaining < cKeep) { cKeep = cRemaining; } - EBM_ASSERT(2 <= cKeep); const bool bShuffle = 1 != cCompilerScores || std::isnan(categoricalSmoothing) || cKeep != cRemaining; + // there isn't a single key to sort on with multiple grad/hess pairs, so use random ordering otherwise. const bool bSort = 1 == cCompilerScores && !std::isnan(categoricalSmoothing); EBM_ASSERT(bShuffle || bSort); + EBM_ASSERT(2 <= cKeep); if(bShuffle) { - const auto** ppBinShuffle = apBins; - const auto* const* const ppBinShuffleEnd = apBins + cKeep; + auto** ppBinShuffle = apBins; + auto** const ppBinShuffleEnd = apBins + cKeep; do { EBM_ASSERT(1 <= cRemaining); const size_t iSwap = pRng->NextFast(cRemaining); @@ -1186,7 +1184,6 @@ template class PartitionOneDimensionalBoo } if(bSort) { - // there isn't a single key to sort on with multiple grad/hess pairs, so use random ordering otherwise. std::sort(apBins, ppBin, CompareBin( @@ -1197,6 +1194,7 @@ template class PartitionOneDimensionalBoo *ppBin = aBins; ++ppBin; } + EBM_ASSERT(2 <= ppBin - apBins); } pRootTreeNode->BEFORE_SetBinFirst(apBins); @@ -1219,7 +1217,6 @@ template class PartitionOneDimensionalBoo regLambda, deltaStepMax, monotoneDirection, - pMissingBin, &bMissingIsolated, &pMissingValueTreeNode, &pDregsTreeNode, @@ -1295,7 +1292,6 @@ template class PartitionOneDimensionalBoo regLambda, deltaStepMax, monotoneDirection, - pMissingBin, &bMissingIsolated, &pMissingValueTreeNode, &pDregsTreeNode, @@ -1323,7 +1319,6 @@ template class PartitionOneDimensionalBoo regLambda, deltaStepMax, monotoneDirection, - pMissingBin, &bMissingIsolated, &pMissingValueTreeNode, &pDregsTreeNode, @@ -1368,18 +1363,25 @@ template class PartitionOneDimensionalBoo *pTotalGain = static_cast(totalGain); - size_t cSlices = cSplitsMax - cSplitsRemaining + 1; - if(nullptr != pMissingValueTreeNode) { - EBM_ASSERT(nullptr == pMissingBin); - ++cSlices; - } else if(!bNominal && bMissing && (TermBoostFlags_MissingSeparate & flags)) { - ++cSlices; // missing is at index 0 but was unavailable to tree boosting, so add space for it - } else if(nullptr != pMissingBin && !bMissingIsolated) { - ++cSlices; - } + size_t cSlices; if(bNominal) { cSlices = cBins; + } else { + cSlices = cSplitsMax - cSplitsRemaining + 1; + if(bMissing) { + if(nullptr != pMissingValueTreeNode) { + ++cSlices; + } else if(TermBoostFlags_MissingSeparate & flags) { + ++cSlices; // missing is at index 0 but was unavailable to tree boosting, so add space for it + } else { + EBM_ASSERT((TermBoostFlags_MissingLow | TermBoostFlags_MissingHigh) & flags); + if(!bMissingIsolated) { + ++cSlices; + } + } + } } + error = Flatten(pBoosterShell, bMissing, bNominal, @@ -1388,8 +1390,8 @@ template class PartitionOneDimensionalBoo regLambda, deltaStepMax, iDimension, - reinterpret_cast**>(apBins), - reinterpret_cast**>(ppBin), + reinterpret_cast**>(apBins), + reinterpret_cast**>(ppBin), nullptr != pMissingValueTreeNode ? pMissingValueTreeNode->Downgrade() : nullptr, nullptr != pDregsTreeNode ? pDregsTreeNode->Downgrade() : nullptr, cSlices, diff --git a/shared/libebm/tests/libebm_test.hpp b/shared/libebm/tests/libebm_test.hpp index 8a76b2b67..97b81bcc2 100644 --- a/shared/libebm/tests/libebm_test.hpp +++ b/shared/libebm/tests/libebm_test.hpp @@ -297,7 +297,7 @@ static constexpr double k_maxDeltaStepDefault = 0.0; static constexpr IntEbm k_minCategorySamplesDefault = 0; static constexpr double k_categoricalSmoothingDefault = 10.0; static constexpr IntEbm k_maxCategoricalThresholdDefault = IntEbm{32}; -static constexpr double k_categoricalInclusionPercentDefault = 0.75; +static constexpr double k_categoricalInclusionPercentDefault = 1.0; #ifdef EXPAND_BINARY_LOGITS static constexpr CreateBoosterFlags k_testCreateBoosterFlags_Default = CreateBoosterFlags_BinaryAsMulticlass;