diff --git a/README.md b/README.md
index bce91a49..7a1f06e9 100644
--- a/README.md
+++ b/README.md
@@ -25,12 +25,12 @@ To add a dependency on hash4j using Maven, use the following:
The average computation time depends logarithmically on the number of buckets. + * * @param pseudoRandomGeneratorProvider a {@link PseudoRandomGeneratorProvider} * @return a {@link ConsistentBucketHasher} */ @@ -36,4 +38,23 @@ public static ConsistentBucketHasher jumpHash( PseudoRandomGeneratorProvider pseudoRandomGeneratorProvider) { return new ConsistentJumpBucketHasher(pseudoRandomGeneratorProvider); } + + /** + * Returns a {@link ConsistentBucketHasher}. + * + *
This algorithm is based on the method described in Sergey Ioffe, "Improved Consistent + * Sampling, Weighted Minhash and L1 Sketching," 2010, doi: 10.1109/ICDM.2010.80, which is applied to a + * one-dimensional input vector whose value is equal to the number of buckets. + * + *
The computation time is constant independent of the number of buckets. This method is faster + * than {@link #jumpHash(PseudoRandomGeneratorProvider)} for large number of buckets. + * + * @param pseudoRandomGeneratorProvider a {@link PseudoRandomGeneratorProvider} + * @return a {@link ConsistentBucketHasher} + */ + public static ConsistentBucketHasher improvedConsistentWeightedSampling( + PseudoRandomGeneratorProvider pseudoRandomGeneratorProvider) { + return new ImprovedConsistentWeightedSampling(pseudoRandomGeneratorProvider); + } } diff --git a/src/main/java/com/dynatrace/hash4j/consistent/ConsistentJumpBucketHasher.java b/src/main/java/com/dynatrace/hash4j/consistent/ConsistentJumpBucketHasher.java index e63a3691..57140ab9 100644 --- a/src/main/java/com/dynatrace/hash4j/consistent/ConsistentJumpBucketHasher.java +++ b/src/main/java/com/dynatrace/hash4j/consistent/ConsistentJumpBucketHasher.java @@ -54,7 +54,7 @@ class ConsistentJumpBucketHasher implements ConsistentBucketHasher { // see // https://github.com/google/guava/blob/0a17f4a429323589396c38d8ce75ca058faa6c64/guava/src/com/google/common/hash/Hashing.java#L559 @Override - public int getBucket(long hash, int numBuckets) { + public strictfp int getBucket(long hash, int numBuckets) { checkArgument(numBuckets > 0, "buckets must be positive"); pseudoRandomGenerator.reset(hash); @@ -64,11 +64,10 @@ public int getBucket(long hash, int numBuckets) { // Jump from bucket to bucket until we go out of range while (true) { next = (int) ((candidate + 1) / pseudoRandomGenerator.nextDouble()); - if (next > candidate && next < numBuckets) { - candidate = next; - } else { - return candidate; - } + if (next >= numBuckets || next <= candidate) + return candidate; // second condition protects against infinite loops caused by bad random + // values such as NaN or values outside of [0,1) + candidate = next; } } } diff --git a/src/main/java/com/dynatrace/hash4j/consistent/ImprovedConsistentWeightedSampling.java 
b/src/main/java/com/dynatrace/hash4j/consistent/ImprovedConsistentWeightedSampling.java new file mode 100644 index 00000000..77bbea6d --- /dev/null +++ b/src/main/java/com/dynatrace/hash4j/consistent/ImprovedConsistentWeightedSampling.java @@ -0,0 +1,52 @@ +/* + * Copyright 2023 Dynatrace LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.dynatrace.hash4j.consistent; + +import static com.dynatrace.hash4j.util.Preconditions.checkArgument; +import static java.util.Objects.requireNonNull; + +import com.dynatrace.hash4j.random.PseudoRandomGenerator; +import com.dynatrace.hash4j.random.PseudoRandomGeneratorProvider; + +/** + * Consistent hashing algorithm based on a simplified version of the algorithm described in Sergey + * Ioffe, "Improved Consistent + * Sampling, Weighted Minhash and L1 Sketching," 2010 IEEE International Conference on Data + * Mining, Sydney, NSW, Australia, 2010, pp. 246-255, doi: 10.1109/ICDM.2010.80. 
+ */ +class ImprovedConsistentWeightedSampling implements ConsistentBucketHasher { + + private final PseudoRandomGenerator pseudoRandomGenerator; + + ImprovedConsistentWeightedSampling(PseudoRandomGeneratorProvider pseudoRandomGeneratorProvider) { + requireNonNull(pseudoRandomGeneratorProvider); + this.pseudoRandomGenerator = pseudoRandomGeneratorProvider.create(); + } + + @Override + public strictfp int getBucket(long hash, int numBuckets) { + checkArgument(numBuckets > 0, "buckets must be positive"); + pseudoRandomGenerator.reset(hash); + double r = pseudoRandomGenerator.nextExponential() + pseudoRandomGenerator.nextExponential(); + double b = pseudoRandomGenerator.nextDouble(); + double t = StrictMath.floor(StrictMath.log(numBuckets) / r + b); + double y = StrictMath.exp(r * (t - b)); + // y should always be in the range [0, numBuckets), + // but could be larger due to numerical inaccuracies, + // therefore limit result after rounding down to numBuckets - 1 + return Math.min((int) y, numBuckets - 1); + } +} diff --git a/src/test/java/com/dynatrace/hash4j/consistent/AbstractConsistentBucketHasherTest.java b/src/test/java/com/dynatrace/hash4j/consistent/AbstractConsistentBucketHasherTest.java new file mode 100644 index 00000000..8704d4c3 --- /dev/null +++ b/src/test/java/com/dynatrace/hash4j/consistent/AbstractConsistentBucketHasherTest.java @@ -0,0 +1,202 @@ +/* + * Copyright 2023 Dynatrace LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.dynatrace.hash4j.consistent; + +import static org.assertj.core.api.Assertions.*; +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; + +import com.dynatrace.hash4j.hashing.HashStream64; +import com.dynatrace.hash4j.hashing.Hashing; +import com.dynatrace.hash4j.random.PseudoRandomGeneratorProvider; +import com.dynatrace.hash4j.random.PseudoRandomGeneratorProviderForTesting; +import java.util.Arrays; +import java.util.SplittableRandom; +import java.util.stream.IntStream; +import org.hipparchus.stat.inference.AlternativeHypothesis; +import org.hipparchus.stat.inference.BinomialTest; +import org.hipparchus.stat.inference.ChiSquareTest; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; +import org.junit.jupiter.params.provider.ValueSource; + +abstract class AbstractConsistentBucketHasherTest { + + protected abstract ConsistentBucketHasher getConsistentBucketHasher( + PseudoRandomGeneratorProvider pseudoRandomGeneratorProvider); + + @Test + void testIllegalNumBuckets() { + ConsistentBucketHasher consistentBucketHasher = + getConsistentBucketHasher(PseudoRandomGeneratorProvider.splitMix64_V1()); + assertThatIllegalArgumentException().isThrownBy(() -> consistentBucketHasher.getBucket(0L, 0)); + assertThatIllegalArgumentException().isThrownBy(() -> consistentBucketHasher.getBucket(0L, -1)); + } + + @Test + void testNullPseudoRandomNumberGenerator() { + assertThatNullPointerException().isThrownBy(() -> ConsistentHashing.jumpHash(null)); + } + + @ParameterizedTest + @MethodSource("getNumBuckets") + void testUniformDistribution(int numBuckets) { + double alpha = 0.0001; + int numCycles = 500000; + long seed = + Hashing.komihash5_0() + .hashStream() + .putLong(0x1c1e29a7c6f82fa8L) + .putInt(numBuckets) + .getAsLong(); + long[] counts = new long[numBuckets]; + double[] expected = new double[numBuckets]; + Arrays.fill(expected, 1.0); + 
ConsistentBucketHasher consistentBucketHasher = + getConsistentBucketHasher(PseudoRandomGeneratorProvider.splitMix64_V1()); + + SplittableRandom random = new SplittableRandom(seed); + for (int i = 0; i < numCycles; ++i) { + int bucketIdx = consistentBucketHasher.getBucket(random.nextLong(), numBuckets); + counts[bucketIdx] += 1; + } + + if (numBuckets >= 2) { + double pValue = new ChiSquareTest().chiSquareTest(expected, counts); + assertThat(pValue).isGreaterThan(alpha); + } + } + + private void testRedistribution(int numBuckets, int numCycles, long seed) { + ConsistentBucketHasher consistentBucketHasher = + getConsistentBucketHasher(PseudoRandomGeneratorProvider.splitMix64_V1()); + + SplittableRandom random = new SplittableRandom(seed); + for (int i = 0; i < numCycles; ++i) { + long hash = random.nextLong(); + int oldBucketIdx = consistentBucketHasher.getBucket(hash, numBuckets); + int newBucketIdx = consistentBucketHasher.getBucket(hash, numBuckets + 1); + if (oldBucketIdx != newBucketIdx) { + assertThat(newBucketIdx).isEqualTo(numBuckets); + } + } + } + + private static IntStream getNumBuckets() { + int maxNumBuckets = 200; + return IntStream.range(1, maxNumBuckets + 1); + } + + @ParameterizedTest + @MethodSource("getNumBuckets") + void testRedistribution(int numBuckets) { + int numCycles = 10000; + long seed = + Hashing.komihash5_0() + .hashStream() + .putLong(0x3df6dcebff42e20dL) + .putInt(numBuckets) + .getAsLong(); + testRedistribution(numBuckets, numCycles, seed); + } + + @Test + void testMaxNumBuckets() { + double alpha = 0.001; + + SplittableRandom random = new SplittableRandom(0x5cfb4dcb296c1921L); + + int numBuckets = Integer.MAX_VALUE; + int numTrials = 1000000; + + int numZero = 0; + int numEven = 0; + int numLower = 0; + + ConsistentBucketHasher consistentBucketHasher = + getConsistentBucketHasher(PseudoRandomGeneratorProvider.splitMix64_V1()); + + for (int i = 0; i < numTrials; ++i) { + int bucket = 
consistentBucketHasher.getBucket(random.nextLong(), numBuckets); + if (bucket == 0) { + numZero += 1; + } else { + if ((bucket & 1) == 0) { + numEven += 1; + } + if (bucket < numBuckets / 2) { + numLower += 1; + } + } + } + assertThat( + new BinomialTest() + .binomialTest(numTrials - numZero, numEven, 0.5, AlternativeHypothesis.TWO_SIDED)) + .isGreaterThan(alpha); + assertThat( + new BinomialTest() + .binomialTest(numTrials - numZero, numLower, 0.5, AlternativeHypothesis.TWO_SIDED)) + .isGreaterThan(alpha); + assertThat( + new BinomialTest() + .binomialTest(numTrials, numZero, 1. / numBuckets, AlternativeHypothesis.TWO_SIDED)) + .isGreaterThan(alpha); + } + + protected abstract long getCheckSum(); + + @Test + void testCheckSum() { + int numIterations = 1_000_000; + SplittableRandom random = new SplittableRandom(0x2df5ae93946a7653L); + ConsistentBucketHasher hasher = + getConsistentBucketHasher(PseudoRandomGeneratorProvider.splitMix64_V1()); + HashStream64 checkSumHashStream = Hashing.komihash5_0().hashStream(); + for (int i = 0; i < numIterations; ++i) { + int numBuckets = random.nextInt(Integer.MAX_VALUE); + long hash = random.nextLong(); + int bucketIdx = hasher.getBucket(hash, numBuckets); + checkSumHashStream.putInt(bucketIdx); + } + assertThat(checkSumHashStream.getAsLong()).isEqualTo(getCheckSum()); + } + + @ParameterizedTest + @ValueSource( + doubles = { + Double.NEGATIVE_INFINITY, + -Double.MAX_VALUE, + -2, + -1, + 0., + 1., + 2, + Double.MAX_VALUE, + Double.POSITIVE_INFINITY, + Double.NaN + }) + void testInvalidPseudoRandomGeneratorNextDouble(double randomValue) { + PseudoRandomGeneratorProviderForTesting pseudoRandomGeneratorProvider = + new PseudoRandomGeneratorProviderForTesting(); + + ConsistentBucketHasher consistentBucketHasher = + getConsistentBucketHasher(pseudoRandomGeneratorProvider); + + pseudoRandomGeneratorProvider.setDoubleValue(randomValue); + assertThatNoException() + .isThrownBy(() -> consistentBucketHasher.getBucket(0x82739fa8da9a7728L, 
10)); + } +} diff --git a/src/test/java/com/dynatrace/hash4j/consistent/ConsistentJumpBucketHasherTest.java b/src/test/java/com/dynatrace/hash4j/consistent/ConsistentJumpBucketHasherTest.java index 0eb9d3c1..b9216565 100644 --- a/src/test/java/com/dynatrace/hash4j/consistent/ConsistentJumpBucketHasherTest.java +++ b/src/test/java/com/dynatrace/hash4j/consistent/ConsistentJumpBucketHasherTest.java @@ -18,102 +18,17 @@ import static org.assertj.core.api.Assertions.*; import com.dynatrace.hash4j.random.PseudoRandomGeneratorProvider; -import com.dynatrace.hash4j.random.PseudoRandomGeneratorProviderForTesting; -import java.util.Arrays; -import java.util.SplittableRandom; -import org.hipparchus.stat.inference.AlternativeHypothesis; -import org.hipparchus.stat.inference.BinomialTest; -import org.hipparchus.stat.inference.ChiSquareTest; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.ValueSource; -class ConsistentJumpBucketHasherTest { +class ConsistentJumpBucketHasherTest extends AbstractConsistentBucketHasherTest { - @Test - void testIllegalNumBuckets() { - ConsistentBucketHasher consistentBucketHasher = - ConsistentHashing.jumpHash(PseudoRandomGeneratorProvider.splitMix64_V1()); - assertThatIllegalArgumentException().isThrownBy(() -> consistentBucketHasher.getBucket(0L, 0)); - assertThatIllegalArgumentException().isThrownBy(() -> consistentBucketHasher.getBucket(0L, -1)); + @Override + protected ConsistentBucketHasher getConsistentBucketHasher( + PseudoRandomGeneratorProvider pseudoRandomGeneratorProvider) { + return ConsistentHashing.jumpHash(pseudoRandomGeneratorProvider); } - @Test - void testNullPseudoRandomNumberGenerator() { - assertThatNullPointerException().isThrownBy(() -> ConsistentHashing.jumpHash(null)); - } - - @Test - void testUniformDistribution() { - - int numBuckets = 10; - int numCycles = 100000; - long[] counts = new long[numBuckets]; - double[] expected = new 
double[numBuckets]; - Arrays.fill(expected, 1.0); - ConsistentBucketHasher consistentBucketHasher = - ConsistentHashing.jumpHash(PseudoRandomGeneratorProvider.splitMix64_V1()); - - SplittableRandom random = new SplittableRandom(0x392c64621adad448L); - for (int i = 0; i < numCycles; ++i) { - int bucketIdx = consistentBucketHasher.getBucket(random.nextLong(), numBuckets); - counts[bucketIdx] += 1; - } - - double pValue = new ChiSquareTest().chiSquareTest(expected, counts); - assertThat(pValue).isGreaterThan(0.01); - } - - @Test - void testOptimalRedistribution() { - - int numBuckets = 10; - int numCycles = 100000; - ConsistentBucketHasher consistentBucketHasher = - ConsistentHashing.jumpHash(PseudoRandomGeneratorProvider.splitMix64_V1()); - - SplittableRandom random = new SplittableRandom(0x08b6fbb0a6626254L); - int countNewBucket = 0; - for (int i = 0; i < numCycles; ++i) { - long hash = random.nextLong(); - int oldBucketIdx = consistentBucketHasher.getBucket(hash, numBuckets); - int newBucketIdx = consistentBucketHasher.getBucket(hash, numBuckets + 1); - if (oldBucketIdx != newBucketIdx) { - assertThat(newBucketIdx).isEqualTo(numBuckets); - countNewBucket += 1; - } - } - - double pValue = - new BinomialTest() - .binomialTest( - numCycles, countNewBucket, 1. 
/ (numBuckets + 1), AlternativeHypothesis.TWO_SIDED); - assertThat(pValue).isGreaterThan(0.01); - } - - @ParameterizedTest - @ValueSource( - doubles = { - Double.NEGATIVE_INFINITY, - -Double.MAX_VALUE, - -2, - -1, - 0., - 1., - 2, - Double.MAX_VALUE, - Double.POSITIVE_INFINITY, - Double.NaN - }) - void testInvalidPseudoRandomGenerator(double randomValue) { - PseudoRandomGeneratorProviderForTesting pseudoRandomGeneratorProvider = - new PseudoRandomGeneratorProviderForTesting(); - - ConsistentBucketHasher consistentBucketHasher = - ConsistentHashing.jumpHash(pseudoRandomGeneratorProvider); - - pseudoRandomGeneratorProvider.setDoubleValue(randomValue); - assertThatNoException() - .isThrownBy(() -> consistentBucketHasher.getBucket(0x82739fa8da9a7728L, 10)); + @Override + protected long getCheckSum() { + return 0x42cf069c52a4ee21L; } } diff --git a/src/test/java/com/dynatrace/hash4j/consistent/ImprovedConsistentWeightedSamplingTest.java b/src/test/java/com/dynatrace/hash4j/consistent/ImprovedConsistentWeightedSamplingTest.java new file mode 100644 index 00000000..b72e729a --- /dev/null +++ b/src/test/java/com/dynatrace/hash4j/consistent/ImprovedConsistentWeightedSamplingTest.java @@ -0,0 +1,63 @@ +/* + * Copyright 2023 Dynatrace LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.dynatrace.hash4j.consistent; + +import static org.assertj.core.api.Assertions.assertThatNoException; + +import com.dynatrace.hash4j.random.PseudoRandomGeneratorProvider; +import com.dynatrace.hash4j.random.PseudoRandomGeneratorProviderForTesting; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +class ImprovedConsistentWeightedSamplingTest extends AbstractConsistentBucketHasherTest { + + @Override + protected ConsistentBucketHasher getConsistentBucketHasher( + PseudoRandomGeneratorProvider pseudoRandomGeneratorProvider) { + return ConsistentHashing.improvedConsistentWeightedSampling(pseudoRandomGeneratorProvider); + } + + @Override + protected long getCheckSum() { + return 0x41b4e6aa922fae85L; + } + + @ParameterizedTest + @ValueSource( + doubles = { + Double.NEGATIVE_INFINITY, + -Double.MAX_VALUE, + -2, + -1, + 0., + 1., + 2, + Double.MAX_VALUE, + Double.POSITIVE_INFINITY, + Double.NaN + }) + void testInvalidPseudoRandomGeneratorNextExponential(double randomValue) { + PseudoRandomGeneratorProviderForTesting pseudoRandomGeneratorProvider = + new PseudoRandomGeneratorProviderForTesting(); + + ConsistentBucketHasher consistentBucketHasher = + getConsistentBucketHasher(pseudoRandomGeneratorProvider); + + pseudoRandomGeneratorProvider.setExponentialValue(randomValue); + assertThatNoException() + .isThrownBy(() -> consistentBucketHasher.getBucket(0x82739fa8da9a7728L, 10)); + } +}