Replace stubbed PERCENTILE_N with naive implementation (#12)
Replaces the previously stubbed PERCENTILE_N methods with a naive
implementation. The naive version implements value interpolation
following NumPy and Microsoft Excel, backed by an STL vector.

This implementation works as long as there are no more than
~1M values in any VoltDB partition. Beyond that, the implementation
is likely to run into VoltDB errors due to the 8MB VARBINARY
limit. A subsequent PR will introduce a more efficient (though
likely less accurate) means of implementing percentiles
for those cases.
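
The interpolation described above can be sanity-checked outside VoltDB. Below is a minimal standalone sketch (not code from this commit; the function name naivePercentile is made up here) of the same "linear" interpolation variant used by NumPy's default percentile method and Excel's PERCENTILE.INC, using only the STL:

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

// Compute the given percentile (0.0 < percentile < 1.0) of an unsorted sample.
// The vector is taken by value so the caller's data is left unsorted.
double naivePercentile(std::vector<double> values, double percentile) {
    assert(!values.empty() && percentile > 0.0 && percentile < 1.0);
    std::sort(values.begin(), values.end());

    // rank = (percentile * (sample count - 1)) + 1, split into integral/fractional parts.
    double integralRank;
    double fractionalRank = std::modf(percentile * (values.size() - 1) + 1.0, &integralRank);

    // integralRank is 1-based; clamp the "end" index so we never run off the vector.
    double start = values.at(static_cast<size_t>(integralRank) - 1);
    double end = values.at(std::min<size_t>(static_cast<size_t>(integralRank), values.size() - 1));
    return start + fractionalRank * (end - start);
}

int main() {
    std::vector<double> sample{15.0, 20.0, 35.0, 40.0, 50.0};
    // NumPy agrees: np.percentile([15, 20, 35, 40, 50], 40) == 29.0
    std::printf("P40 = %f\n", naivePercentile(sample, 0.40));
    return 0;
}
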
rkennedy-mode authored Apr 13, 2020
1 parent e9ed833 commit 911aa36
Showing 90 changed files with 31,424 additions and 87 deletions.
171 changes: 134 additions & 37 deletions src/ee/executors/aggregateexecutor.cpp
@@ -53,6 +53,9 @@
#include "roaring/roaring.c"
#include "roaring/roaring.hh" // for COMPACT_COUNT_DISTINCT

#include <cereal/archives/binary.hpp> // for PERCENTILE
#include <cereal/types/vector.hpp> // for PERCENTILE

namespace voltdb {
/*
* Type of the hash set used to check for column aggregate distinctness
@@ -564,45 +567,116 @@ class CompactToCardinalityAgg : public CompactCountDistinctAgg {

/// Single partition aggregate
class PercentileAgg : public Agg {
protected:
double m_percentile;
std::vector<double> m_values;

public:
PercentileAgg(double percentile) : m_percentile(percentile)
{
}
// Utility function to get the given NValue as a double. There are a bunch of
// clean, handy-looking methods in NValue, but they're all private.
double doubleValue(const NValue& value) {
// Cast the value as a double first, because peekDouble() will only
// give you a value if the NValue is of type VALUE_TYPE_DOUBLE.
return ValuePeeker::peekDouble(value.castAs(VALUE_TYPE_DOUBLE));
}

virtual void advance(const NValue& val)
{
// TODO: do the thing
}
public:
PercentileAgg(double percentile) : m_percentile(percentile)
{
// Do some basic checking to ensure that the percentile given is in
// a valid range (0% to 100% exclusive).
assert (percentile > 0.0);
assert (percentile < 1.0);
}

virtual NValue finalize(ValueType type)
{
// TODO: do the thing
return ValueFactory::getDoubleValue(m_percentile);
virtual void advance(const NValue& val)
{
// We only handle numeric types.
assert(ValuePeeker::peekValueType(val) == VALUE_TYPE_TINYINT
|| ValuePeeker::peekValueType(val) == VALUE_TYPE_SMALLINT
|| ValuePeeker::peekValueType(val) == VALUE_TYPE_INTEGER
|| ValuePeeker::peekValueType(val) == VALUE_TYPE_BIGINT
|| ValuePeeker::peekValueType(val) == VALUE_TYPE_DOUBLE
|| ValuePeeker::peekValueType(val) == VALUE_TYPE_TIMESTAMP
|| ValuePeeker::peekValueType(val) == VALUE_TYPE_DECIMAL);

// We don't track null or nonsensical values.
if (!val.isNull() && !val.isNaN()) {
m_values.push_back(doubleValue(val));
}
}

virtual void resetAgg()
{
Agg::resetAgg();
virtual NValue finalize(ValueType type)
{
// If we didn't see any values, the percentile is null.
if (m_values.empty()) {
return ValueFactory::getNullValue();
}

// Sort the values before we index into the list.
std::sort(m_values.begin(), m_values.end());

// Figure out the rank of the value pertaining to the Nth percentile.
// This implements the "Second variant" of the linear interpolation models
// described in the following Wikipedia article:
// https://web.archive.org/web/20200223144405/https://en.wikipedia.org/wiki/Percentile
// rank = (percentile * (sample count - 1)) + 1
double rank = (m_percentile * (m_values.size() - 1)) + 1;

// The rank potentially has both an integral and a fractional component.
// Break it into those component pieces via modf.
double integralRank;
double fractionalRank = modf(rank, &integralRank);

// The integral rank is 1-based, but our vector of values is 0-based. Fetch the
// value at the integral rank position (the "start") and the value just past
// that position (the "end"). Mix in some std::min and std::max to ensure we
// don't fall off either end of the vector into undefined space.
double start = m_values.at(std::max(0, (int) integralRank - 1));
double end = m_values.at(std::min((int) integralRank, (int) m_values.size() - 1));

// The interpolated result uses the following formula:
// result = start + (fractional rank * (end - start))
return ValueFactory::getDoubleValue(start + (fractionalRank * (end - start)));
}

virtual void resetAgg()
{
Agg::resetAgg();
m_values.clear();
}
};
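
As a concrete check of the interpolation in finalize() above (the sample values here are illustrative, not from the commit): with sorted values {10, 20, 30, 40} and percentile 0.75, rank = 0.75 * (4 - 1) + 1 = 3.25, so integralRank = 3 and fractionalRank = 0.25; start = m_values.at(2) = 30 and end = m_values.at(3) = 40, giving 30 + 0.25 * (40 - 30) = 32.5, which matches NumPy's np.percentile([10, 20, 30, 40], 75).
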

/// Push-down (multi partition) aggregate
class ValuesToTDigestAgg : public Agg {
class ValuesToTDigestAgg : public PercentileAgg {
public:
virtual void advance(const NValue& val)
// The percentile that we pass down to PercentileAgg doesn't matter for the
// push down aggregate. It just needs to be in the bounds that PercentileAgg
// accepts.
ValuesToTDigestAgg() : PercentileAgg(0.5)
{
// TODO: do the thing
}

virtual NValue finalize(ValueType type)
{
// TODO: do the thing
// Make sure the caller is asking for a VARBINARY value. This aggregate
// doesn't know how to produce anything else.
assert (type == VALUE_TYPE_VARBINARY);
int byteSize = 0;
char *serializedBytes = new char[byteSize];
return ValueFactory::getTempBinaryValue(serializedBytes, byteSize);

// Create a buffer to hold the serialized values.
std::ostringstream oss;
{
// Use Cereal (https://uscilab.github.io/cereal/index.html) to serialize
// the vector of values to the buffer. This code is inside a block to ensure
// that everything the BinaryOutputArchive buffers is flushed to the
// ostringstream, which happens when the archive goes out of scope at the end
// of this block.
cereal::BinaryOutputArchive oarchive(oss);
oarchive(m_values);
}

// Convert the buffer to a VARBINARY value for VoltDB.
return ValueFactory::getTempBinaryValue(oss.str().c_str(),
static_cast<int32_t>(oss.str().length()));
}
};
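
For reference, the Cereal serialization pattern used in finalize() above can be isolated into a small standalone helper. This is only a sketch under the same assumptions the commit makes (Cereal flushes the BinaryOutputArchive when it goes out of scope); the helper name serializeValues is hypothetical:

#include <cereal/archives/binary.hpp>
#include <cereal/types/vector.hpp>
#include <sstream>
#include <string>
#include <vector>

// Serialize a vector of doubles into a binary string suitable for a VARBINARY payload.
std::string serializeValues(const std::vector<double>& values) {
    std::ostringstream oss;
    {
        // Scope the archive so it flushes everything to the stream before we read it.
        cereal::BinaryOutputArchive oarchive(oss);
        oarchive(values);
    }
    return oss.str();
}
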

@@ -615,7 +689,30 @@ class TDigestToPercentileAgg : public PercentileAgg {

virtual void advance(const NValue& val)
{
// TODO: do the thing
// Ensure that the incoming value is a VARBINARY (we don't know how to work
// with anything else) and that it's not null.
assert (ValuePeeker::peekValueType(val) == VALUE_TYPE_VARBINARY);
assert (!val.isNull());

// Fetch a pointer to the underlying data. It's being loaned to us by the
// NValue, so we don't need to deallocate it when we're done.
int32_t length;
const char* buf = ValuePeeker::peekObject_withoutNull(val, &length);
assert (length > 0);

// Wrap the data up in a buffer that Cereal can work with. This is going to
// copy the data, which is not strictly necessary. But I'm not sure we have
// an alternative. Cereal wants an istringstream and there don't appear to
// be any non-copying variants.
std::istringstream iss(std::string(buf, length));

// Use Cereal to deserialize the values into a std::vector.
cereal::BinaryInputArchive iarchive(iss);
std::vector<double> incomingValues;
iarchive(incomingValues);

// Add the deserialized values to any prior values (from other VoltDB dist nodes).
m_values.insert(m_values.end(), incomingValues.begin(), incomingValues.end());
}
};
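
The coordinator-side counterpart of the helper sketched earlier would reverse the process, mirroring advance() above: copy the VARBINARY bytes into a stream, deserialize with Cereal, and append to the values accumulated so far. Again only a sketch, with the hypothetical helper name deserializeAndMerge:

#include <cereal/archives/binary.hpp>
#include <cereal/types/vector.hpp>
#include <cstdint>
#include <sstream>
#include <string>
#include <vector>

// Rebuild the serialized vector from a binary buffer and append it to the accumulator.
void deserializeAndMerge(const char* buf, int32_t length, std::vector<double>& accumulated) {
    // Copying into a std::string/istringstream mirrors the copy noted in advance() above.
    std::istringstream iss(std::string(buf, static_cast<size_t>(length)));
    cereal::BinaryInputArchive iarchive(iss);
    std::vector<double> incoming;
    iarchive(incoming);
    accumulated.insert(accumulated.end(), incoming.begin(), incoming.end());
}
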

@@ -662,33 +759,33 @@ inline Agg* getAggInstance(Pool& memoryPool, ExpressionType agg_type, bool isDis
case EXPRESSION_TYPE_AGGREGATE_VALUES_TO_TDIGEST:
return new (memoryPool) ValuesToTDigestAgg();
case EXPRESSION_TYPE_AGGREGATE_MEDIAN:
return new (memoryPool) PercentileAgg(50);
return new (memoryPool) PercentileAgg(0.5);
case EXPRESSION_TYPE_AGGREGATE_TDIGEST_TO_MEDIAN:
return new (memoryPool) TDigestToPercentileAgg(50);
return new (memoryPool) TDigestToPercentileAgg(0.5);
case EXPRESSION_TYPE_AGGREGATE_PERCENTILE_1:
return new (memoryPool) PercentileAgg(1);
return new (memoryPool) PercentileAgg(0.01);
case EXPRESSION_TYPE_AGGREGATE_TDIGEST_TO_PERCENTILE_1:
return new (memoryPool) TDigestToPercentileAgg(1);
return new (memoryPool) TDigestToPercentileAgg(0.01);
case EXPRESSION_TYPE_AGGREGATE_PERCENTILE_5:
return new (memoryPool) PercentileAgg(5);
return new (memoryPool) PercentileAgg(0.05);
case EXPRESSION_TYPE_AGGREGATE_TDIGEST_TO_PERCENTILE_5:
return new (memoryPool) TDigestToPercentileAgg(5);
return new (memoryPool) TDigestToPercentileAgg(0.05);
case EXPRESSION_TYPE_AGGREGATE_PERCENTILE_25:
return new (memoryPool) PercentileAgg(25);
return new (memoryPool) PercentileAgg(0.25);
case EXPRESSION_TYPE_AGGREGATE_TDIGEST_TO_PERCENTILE_25:
return new (memoryPool) TDigestToPercentileAgg(25);
return new (memoryPool) TDigestToPercentileAgg(0.25);
case EXPRESSION_TYPE_AGGREGATE_PERCENTILE_75:
return new (memoryPool) PercentileAgg(75);
return new (memoryPool) PercentileAgg(0.75);
case EXPRESSION_TYPE_AGGREGATE_TDIGEST_TO_PERCENTILE_75:
return new (memoryPool) TDigestToPercentileAgg(75);
return new (memoryPool) TDigestToPercentileAgg(0.75);
case EXPRESSION_TYPE_AGGREGATE_PERCENTILE_95:
return new (memoryPool) PercentileAgg(95);
return new (memoryPool) PercentileAgg(0.95);
case EXPRESSION_TYPE_AGGREGATE_TDIGEST_TO_PERCENTILE_95:
return new (memoryPool) TDigestToPercentileAgg(95);
return new (memoryPool) TDigestToPercentileAgg(0.95);
case EXPRESSION_TYPE_AGGREGATE_PERCENTILE_99:
return new (memoryPool) PercentileAgg(99);
return new (memoryPool) PercentileAgg(0.99);
case EXPRESSION_TYPE_AGGREGATE_TDIGEST_TO_PERCENTILE_99:
return new (memoryPool) TDigestToPercentileAgg(99);
return new (memoryPool) TDigestToPercentileAgg(0.99);
default:
{
char message[128];
19 changes: 18 additions & 1 deletion src/hsqldb19b3/org/hsqldb_voltpatches/SetFunction.java
@@ -547,7 +547,24 @@ static Type getType(int setType, Type type) {
case OpTypes.PERCENTILE_75 :
case OpTypes.PERCENTILE_95 :
case OpTypes.PERCENTILE_99 :
return Type.SQL_DOUBLE;
switch (dataType) {
case Types.TINYINT :
case Types.SQL_SMALLINT :
case Types.SQL_INTEGER :
case Types.SQL_BIGINT :
case Types.SQL_FLOAT :
case Types.SQL_DOUBLE :
case Types.SQL_REAL :
case Types.SQL_DECIMAL :
case Types.SQL_TIMESTAMP :
return Type.SQL_DOUBLE;
default:
// We only support numeric types for this
// aggregate function.
//
// Incompatible data types in operation
throw Error.error(ErrorCode.X_42565);
}
default :
throw Error.runtimeError(ErrorCode.U_S0500, "SetFunction");
}
