Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Bitwise operations on signed integers (Part-2) #12238

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 37 additions & 29 deletions velox/common/hyperloglog/DenseHll.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
*/
#include "velox/common/hyperloglog/DenseHll.h"

#include <sys/stat.h>

#include <exception>
#include <sstream>
#include "velox/common/base/IOUtils.h"
Expand All @@ -24,8 +26,8 @@
namespace facebook::velox::common::hll {
namespace {
const int kBitsPerBucket = 4;
const int8_t kMaxDelta = (1 << kBitsPerBucket) - 1;
const int8_t kBucketMask = (1 << kBitsPerBucket) - 1;
const int8_t kMaxDelta = (1u << kBitsPerBucket) - 1;
const int8_t kBucketMask = (1u << kBitsPerBucket) - 1;
constexpr double kLinearCountingMinEmptyBuckets = 0.4;

/// Buckets are stored in a byte array. Each byte stored 2 buckets, 4 bits each.
Expand All @@ -34,7 +36,7 @@ constexpr double kLinearCountingMinEmptyBuckets = 0.4;
/// the byte for a given bucket, e.g. 0 for even and 4 for odd buckets.
/// Returns the bit shift that selects the 4-bit bucket within its byte:
/// even-numbered buckets occupy the high nibble (shift 4), odd-numbered
/// buckets the low nibble (shift 0).
int8_t shiftForBucket(int32_t index) {
  // Equivalent to ((1 - bucket) % 2) * kBitsPerBucket. The index is cast to
  // unsigned before applying '~' so the bitwise complement operates on an
  // unsigned value (avoids bitwise ops on signed integers).
  return ((~static_cast<uint32_t>(index)) & 1) << 2;
}

/// Returns the value of alpha constant. See "Practical considerations" section
Expand All @@ -48,7 +50,7 @@ double alpha(int32_t indexBitLength) {
case 6:
return 0.709;
default:
return (0.7213 / (1 + 1.079 / (1 << indexBitLength)));
return (0.7213 / (1 + 1.079 / (1u << indexBitLength)));
}
}

Expand Down Expand Up @@ -132,7 +134,7 @@ void DenseHll::initialize(int8_t indexBitLength) {

indexBitLength_ = indexBitLength;

auto numBuckets = 1 << indexBitLength;
auto numBuckets = 1u << indexBitLength;
baselineCount_ = numBuckets;
deltas_.resize(numBuckets * kBitsPerBucket / 8);
}
Expand Down Expand Up @@ -186,8 +188,8 @@ struct DenseHllView {
const int8_t* overflowValues;

/// Returns the 4-bit delta stored for bucket 'index'. Two buckets are packed
/// per byte: the slot is index / 2 and the nibble is chosen by
/// shiftForBucket(index). All shifts are performed on unsigned values so
/// that right-shifting a negative int8_t never occurs.
int8_t getDelta(int32_t index) const {
  const uint32_t slot = static_cast<uint32_t>(index) >> 1;
  return (static_cast<uint32_t>(deltas[slot]) >> shiftForBucket(index)) &
      kBucketMask;
}

int8_t getValue(int32_t index) const {
Expand All @@ -203,7 +205,7 @@ struct DenseHllView {
};

int64_t cardinalityImpl(const DenseHllView& hll) {
auto numBuckets = 1 << hll.indexBitLength;
auto numBuckets = 1u << hll.indexBitLength;

int32_t baselineCount = 0;
for (int i = 0; i < numBuckets; i++) {
Expand All @@ -222,7 +224,7 @@ int64_t cardinalityImpl(const DenseHllView& hll) {
double sum = 0;
for (int i = 0; i < numBuckets; i++) {
int value = hll.getValue(i);
sum += 1.0 / (1L << value);
sum += 1.0 / (1UL << value);
}

double estimate = (alpha(hll.indexBitLength) * numBuckets * numBuckets) / sum;
Expand All @@ -240,7 +242,7 @@ DenseHllView deserialize(const char* serialized) {
auto indexBitLength = stream.read<int8_t>();
auto baseline = stream.read<int8_t>();

auto numBuckets = 1 << indexBitLength;
auto numBuckets = 1u << indexBitLength;
// next numBuckets / 2 bytes are deltas
const int8_t* deltas = stream.read<int8_t>(numBuckets / 2);

Expand Down Expand Up @@ -279,20 +281,20 @@ int64_t DenseHll::cardinality(const char* serialized) {
}

/// Returns the 4-bit delta stored for bucket 'index'.
int8_t DenseHll::getDelta(int32_t index) const {
  // Two buckets per byte; compute the slot and extract the nibble using
  // unsigned arithmetic. The cast on deltas_[slot] mirrors the fix in
  // DenseHllView::getDelta: without it, a negative int8_t would be
  // right-shifted as a signed value.
  const uint32_t slot = static_cast<uint32_t>(index) >> 1;
  return (static_cast<uint32_t>(deltas_[slot]) >> shiftForBucket(index)) &
      kBucketMask;
}

/// Stores a 4-bit delta 'value' for bucket 'index'. Two buckets are packed
/// per byte; even indices use the high nibble, odd indices the low nibble.
void DenseHll::setDelta(int32_t index, int8_t value) {
  const uint32_t slot = static_cast<uint32_t>(index) >> 1;
  const uint8_t shift = static_cast<uint8_t>(shiftForBucket(index));

  // Clear the old value. All masking is done on uint8_t to keep the bitwise
  // operations on unsigned types.
  const uint8_t clearMask =
      static_cast<uint8_t>(static_cast<uint8_t>(kBucketMask) << shift);
  deltas_[slot] = static_cast<uint8_t>(deltas_[slot]) & ~clearMask;

  // Set the new value. Plain OR with the set mask; OR-ing the slot with
  // itself (deltas_[slot] |= deltas_[slot] | setMask) would be redundant.
  const uint8_t setMask =
      static_cast<uint8_t>(static_cast<uint8_t>(value) << shift);
  deltas_[slot] = static_cast<uint8_t>(deltas_[slot]) | setMask;
}

int8_t DenseHll::getOverflow(int32_t index) const {
Expand All @@ -310,7 +312,7 @@ int DenseHll::findOverflowEntry(int32_t index) const {
}

void DenseHll::adjustBaselineIfNeeded() {
auto numBuckets = 1 << indexBitLength_;
auto numBuckets = 1u << indexBitLength_;

while (baselineCount_ == 0) {
baseline_++;
Expand Down Expand Up @@ -389,7 +391,7 @@ int32_t DenseHll::serializedSize() const {
return 1 /* type + version */
+ 1 /* indexBitLength */
+ 1 /* baseline */
+ (1 << indexBitLength_) / 2 /* buckets */
+ (1u << indexBitLength_) / 2 /* buckets */
+ 2 /* overflow bucket count */
+ 2 * overflows_ /* overflow bucket indexes */
+ overflows_ /* overflow bucket values */;
Expand Down Expand Up @@ -426,7 +428,7 @@ bool DenseHll::canDeserialize(const char* input, int size) {
return false;
}

auto numBuckets = 1 << indexBitLength;
auto numBuckets = 1u << indexBitLength;
const int8_t* deltas = stream.read<int8_t>(numBuckets / 2);
auto overflows = stream.read<int16_t>();

Expand Down Expand Up @@ -471,7 +473,7 @@ int32_t DenseHll::estimateInMemorySize(int8_t indexBitLength) {
// Note: we don't take into account overflow entries since their number can
// vary.
return sizeof(indexBitLength_) + sizeof(baseline_) + sizeof(baselineCount_) +
(1 << indexBitLength) / 2;
(1u << indexBitLength) / 2;
}

void DenseHll::serialize(char* output) {
Expand Down Expand Up @@ -500,7 +502,7 @@ DenseHll::DenseHll(const char* serialized, HashStringAllocator* allocator)
initialize(hll.indexBitLength);
baseline_ = hll.baseline;

auto numBuckets = 1 << indexBitLength_;
auto numBuckets = 1u << indexBitLength_;
std::copy(hll.deltas, hll.deltas + numBuckets / 2, deltas_.data());

overflows_ = hll.overflows;
Expand Down Expand Up @@ -553,7 +555,7 @@ void DenseHll::mergeWith(const char* serialized) {

auto baseline = stream.read<int8_t>();

auto numBuckets = 1 << indexBitLength_;
auto numBuckets = 1u << indexBitLength_;
auto deltas = stream.read<int8_t>(numBuckets / 2);
auto overflows = stream.read<int16_t>();
auto overflowBuckets = overflows ? stream.read<uint16_t>(overflows) : nullptr;
Expand Down Expand Up @@ -720,7 +722,9 @@ int32_t DenseHll::mergeWithSimd(const HllView& other, int8_t newBaseline) {

// Store newDelta in deltas_[deltaIndex].
auto slot1 = deltas_[deltaIndex];
deltas_[deltaIndex] = (newDelta << 4) | (slot1 & kBucketMask);
uint8_t highNibble = static_cast<uint8_t>(newDelta) << 4;
uint8_t lowNibble = static_cast<uint8_t>(slot1) & static_cast<uint8_t>(kBucketMask);
deltas_[deltaIndex] = deltas_[deltaIndex] = highNibble | lowNibble;
});
}

Expand All @@ -742,8 +746,10 @@ int32_t DenseHll::mergeWithSimd(const HllView& other, int8_t newBaseline) {
}

// Store newDelta.
auto slot1 = deltas_[deltaIndex];
deltas_[deltaIndex] = (((slot1 >> 4) & kBucketMask) << 4) | newDelta;
auto slot1 = static_cast<uint8_t>(deltas_[deltaIndex]);
uint8_t upperBits = (slot1 >> 4) & static_cast<uint8_t>(kBucketMask);
uint8_t shiftedUpperBits = upperBits << 4;
deltas_[deltaIndex] = shiftedUpperBits | static_cast<uint8_t>(newDelta);
});
}
}
Expand All @@ -762,8 +768,10 @@ int32_t DenseHll::mergeWithScalar(const HllView& other, int8_t newBaseline) {
int8_t slot2 = other.deltas[i];

for (int shift = 4; shift >= 0; shift -= 4) {
int8_t delta1 = (slot1 >> shift) & kBucketMask;
int8_t delta2 = (slot2 >> shift) & kBucketMask;
uint8_t maskedSlot1 = static_cast<uint8_t>(slot1) >> shift;
uint8_t maskedSlot2 = static_cast<uint8_t>(slot2) >> shift;
int8_t delta1 = maskedSlot1 & static_cast<uint8_t>(kBucketMask);
int8_t delta2 = maskedSlot2 & static_cast<uint8_t>(kBucketMask);

auto [newValue, overflowEntry] =
computeNewValue(delta1, delta2, bucket, other);
Expand All @@ -776,8 +784,8 @@ int32_t DenseHll::mergeWithScalar(const HllView& other, int8_t newBaseline) {

newDelta = updateOverflow(bucket, overflowEntry, newDelta);

newSlot <<= 4;
newSlot |= newDelta;
newSlot = static_cast<uint32_t>(newSlot) << 4;
newSlot = static_cast<uint32_t>(newSlot) | static_cast<uint32_t>(newDelta);
bucket++;
}

Expand Down
6 changes: 3 additions & 3 deletions velox/common/hyperloglog/SparseHll.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ inline uint32_t decodeIndex(uint32_t entry) {
}

/// Extracts the value portion (the low kValueBitLength bits) of an encoded
/// sparse-HLL entry.
inline uint32_t decodeValue(uint32_t entry) {
  // Unsigned literal so the left shift never operates on a signed int.
  return entry & ((1u << kValueBitLength) - 1);
}

int searchIndex(
Expand All @@ -41,7 +41,7 @@ int searchIndex(
int high = entries.size() - 1;

while (low <= high) {
int middle = (low + high) >> 1;
int middle = static_cast<uint32_t>(low + high) >> 1;

auto middleIndex = decodeIndex(entries[middle]);

Expand Down Expand Up @@ -93,7 +93,7 @@ int64_t SparseHll::cardinality() const {
// 2^kIndexBitLength buckets available due to the fact that we're
// recording the raw leading kIndexBitLength of the hash. This produces
// much better precision while in the sparse regime.
static const int kTotalBuckets = 1 << kIndexBitLength;
static const int kTotalBuckets = 1u << kIndexBitLength;

int zeroBuckets = kTotalBuckets - entries_.size();
return std::round(linearCounting(zeroBuckets, kTotalBuckets));
Expand Down