Skip to content

Commit

Permalink
Optimize PFCOUNT, PFMERGE command by SIMD acceleration (#1293)
Browse files Browse the repository at this point in the history
This PR optimizes the performance of HyperLogLog commands (PFCOUNT,
PFMERGE) by adding AVX2 fast paths.

Two AVX2 functions are added for conversion between raw representation
and dense representation. They are 15 ~ 30 times faster than scalar
implementaion. Note that sparse representation is not accelerated.

AVX2 fast paths are enabled when the CPU supports AVX2 (checked at
runtime) and the hyperloglog configuration is default (HLL_REGISTERS ==
16384 && HLL_BITS == 6).

`PFDEBUG SIMD (ON|OFF)` subcommand is added for unit tests. A new TCL
unit test checks that the results produced by non-AVX2 and AVX2
implementations are exactly equal.

When merging 3 dense hll structures, the benchmark shows a 12x speedup
compared to the scalar version.

```
pfcount key1 key2 key3
pfmerge keyall key1 key2 key3
```

```
======================================================================================================
Type             Ops/sec    Avg. Latency     p50 Latency     p99 Latency   p99.9 Latency       KB/sec 
------------------------------------------------------------------------------------------------------
PFCOUNT-scalar    5665.56        35.29839        32.25500        63.99900        67.58300       608.60
PFCOUNT-avx2     72377.83         2.75834         2.67100         5.34300         6.81500      7774.96
------------------------------------------------------------------------------------------------------
PFMERGE-scalar    9851.29        20.28806        20.09500        36.86300        39.16700       615.71
PFMERGE-avx2    125621.89         1.59126         1.55100         3.11900         4.70300     15702.74
------------------------------------------------------------------------------------------------------

scalar: valkey:unstable  2df56d8
avx2:   Nugine:hll-simd  8f9adc3

CPU:    13th Gen Intel® Core™ i9-13900H × 20
Memory: 32.0 GiB
OS:     Ubuntu 22.04.5 LTS
```

Experiment repo: https://github.com/Nugine/redis-hyperloglog
Benchmark script:
https://github.com/Nugine/redis-hyperloglog/blob/main/scripts/memtier.sh
Algorithm:
https://github.com/Nugine/redis-hyperloglog/blob/main/cpp/bench.cpp

---------

Signed-off-by: Xuyang Wang <[email protected]>
  • Loading branch information
Nugine authored Dec 2, 2024
1 parent fbbfe5d commit 3df609e
Show file tree
Hide file tree
Showing 3 changed files with 345 additions and 11 deletions.
13 changes: 13 additions & 0 deletions src/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -364,4 +364,17 @@ void setcpuaffinity(const char *cpulist);
#define valkey_prefetch(addr) ((void)(addr))
#endif

/* Check if we can compile AVX2 code */
#if defined(__x86_64__) && ((defined(__GNUC__) && __GNUC__ >= 5) || (defined(__clang__) && __clang_major__ >= 4))
#if defined(__has_attribute) && __has_attribute(target)
#define HAVE_AVX2
#endif
#endif

#if defined(HAVE_AVX2)
#define ATTRIBUTE_TARGET_AVX2 __attribute__((target("avx2")))
#else
#define ATTRIBUTE_TARGET_AVX2
#endif

#endif
303 changes: 292 additions & 11 deletions src/hyperloglog.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@
#include <stdint.h>
#include <math.h>

#ifdef HAVE_AVX2
#include <immintrin.h>
#endif

/* The HyperLogLog implementation is based on the following ideas:
*
* * The use of a 64 bit hash function as proposed in [1], in order to estimate
Expand Down Expand Up @@ -208,6 +212,13 @@ struct hllhdr {

static char *invalid_hll_err = "-INVALIDOBJ Corrupted HLL object detected";

#ifdef HAVE_AVX2
static int simd_enabled = 1;
#define HLL_USE_AVX2 (simd_enabled && __builtin_cpu_supports("avx2"))
#else
#define HLL_USE_AVX2 0
#endif

/* =========================== Low level bit macros ========================= */

/* Macros to access the dense representation.
Expand Down Expand Up @@ -1064,6 +1075,136 @@ int hllAdd(robj *o, unsigned char *ele, size_t elesize) {
}
}

#ifdef HAVE_AVX2
/* A specialized version of hllMergeDense, optimized for default configurations.
*
* Requirements:
* 1) HLL_REGISTERS == 16384 && HLL_BITS == 6
* 2) The CPU supports AVX2 (checked at runtime in hllMergeDense)
*
* reg_raw: pointer to the raw representation array (16384 bytes, one byte per register)
* reg_dense: pointer to the dense representation array (12288 bytes, 6 bits per register)
*/
ATTRIBUTE_TARGET_AVX2
void hllMergeDenseAVX2(uint8_t *reg_raw, const uint8_t *reg_dense) {
/* Shuffle indices for unpacking bytes of dense registers
* From: {XXXX|AAAB|BBCC|CDDD|EEEF|FFGG|GHHH|XXXX}
* To: {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0}
*/
const __m256i shuffle = _mm256_setr_epi8( //
4, 5, 6, -1, //
7, 8, 9, -1, //
10, 11, 12, -1, //
13, 14, 15, -1, //
0, 1, 2, -1, //
3, 4, 5, -1, //
6, 7, 8, -1, //
9, 10, 11, -1 //
);

/* Merge the first 8 registers (6 bytes) normally
* as the AVX2 algorithm needs 4 padding bytes at the start */
uint8_t val;
for (int i = 0; i < 8; i++) {
HLL_DENSE_GET_REGISTER(val, reg_dense, i);
if (val > reg_raw[i]) {
reg_raw[i] = val;
}
}

/* Dense to Raw:
*
* 4 registers in 3 bytes:
* {bbaaaaaa|ccccbbbb|ddddddcc}
*
* LOAD 32 bytes (32 registers) per iteration:
* 4(padding) + 12(16 registers) + 12(16 registers) + 4(padding)
* {XXXX|AAAB|BBCC|CDDD|EEEF|FFGG|GHHH|XXXX}
*
* SHUFFLE to:
* {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0}
* {bbaaaaaa|ccccbbbb|ddddddcc|00000000} x8
*
* AVX2 is little endian, each of the 8 groups is a little-endian int32.
* A group (int32) contains 3 valid bytes (4 registers) and a zero byte.
*
* extract registers in each group with AND and SHIFT:
* {00aaaaaa|00000000|00000000|00000000} x8 (<<0)
* {00000000|00bbbbbb|00000000|00000000} x8 (<<2)
* {00000000|00000000|00cccccc|00000000} x8 (<<4)
* {00000000|00000000|00000000|00dddddd} x8 (<<6)
*
* merge the extracted registers with OR:
* {00aaaaaa|00bbbbbb|00cccccc|00dddddd} x8
*
* Finally, compute MAX(reg_raw, merged) and STORE it back to reg_raw
*/

/* Skip 8 registers (6 bytes) */
const uint8_t *r = reg_dense + 6 - 4;
uint8_t *t = reg_raw + 8;

for (int i = 0; i < HLL_REGISTERS / 32 - 1; ++i) {
__m256i x0, x;
x0 = _mm256_loadu_si256((__m256i *)r);
x = _mm256_shuffle_epi8(x0, shuffle);

__m256i a1, a2, a3, a4;
a1 = _mm256_and_si256(x, _mm256_set1_epi32(0x0000003f));
a2 = _mm256_and_si256(x, _mm256_set1_epi32(0x00000fc0));
a3 = _mm256_and_si256(x, _mm256_set1_epi32(0x0003f000));
a4 = _mm256_and_si256(x, _mm256_set1_epi32(0x00fc0000));

a2 = _mm256_slli_epi32(a2, 2);
a3 = _mm256_slli_epi32(a3, 4);
a4 = _mm256_slli_epi32(a4, 6);

__m256i y1, y2, y;
y1 = _mm256_or_si256(a1, a2);
y2 = _mm256_or_si256(a3, a4);
y = _mm256_or_si256(y1, y2);

__m256i z = _mm256_loadu_si256((__m256i *)t);

z = _mm256_max_epu8(z, y);

_mm256_storeu_si256((__m256i *)t, z);

r += 24;
t += 32;
}

/* Merge the last 24 registers normally
* as the AVX2 algorithm needs 4 padding bytes at the end */
for (int i = HLL_REGISTERS - 24; i < HLL_REGISTERS; i++) {
HLL_DENSE_GET_REGISTER(val, reg_dense, i);
if (val > reg_raw[i]) {
reg_raw[i] = val;
}
}
}
#endif

/* Merge dense-encoded registers to raw registers array. */
void hllMergeDense(uint8_t *reg_raw, const uint8_t *reg_dense) {
#ifdef HAVE_AVX2
if (HLL_REGISTERS == 16384 && HLL_BITS == 6) {
if (HLL_USE_AVX2) {
hllMergeDenseAVX2(reg_raw, reg_dense);
return;
}
}
#endif

uint8_t val;
for (int i = 0; i < HLL_REGISTERS; i++) {
HLL_DENSE_GET_REGISTER(val, reg_dense, i);
if (val > reg_raw[i]) {
reg_raw[i] = val;
}
}
}

/* Merge by computing MAX(registers[i],hll[i]) the HyperLogLog 'hll'
* with an array of uint8_t HLL_REGISTERS registers pointed by 'max'.
*
Expand All @@ -1077,12 +1218,7 @@ int hllMerge(uint8_t *max, robj *hll) {
int i;

if (hdr->encoding == HLL_DENSE) {
uint8_t val;

for (i = 0; i < HLL_REGISTERS; i++) {
HLL_DENSE_GET_REGISTER(val, hdr->registers, i);
if (val > max[i]) max[i] = val;
}
hllMergeDense(max, hdr->registers);
} else {
uint8_t *p = hll->ptr, *end = p + sdslen(hll->ptr);
long runlen, regval;
Expand Down Expand Up @@ -1114,6 +1250,121 @@ int hllMerge(uint8_t *max, robj *hll) {
return C_OK;
}

#ifdef HAVE_AVX2
/* A specialized version of hllDenseCompress, optimized for default configurations.
*
* Requirements:
* 1) HLL_REGISTERS == 16384 && HLL_BITS == 6
* 2) The CPU supports AVX2 (checked at runtime in hllDenseCompress)
*
* reg_dense: pointer to the dense representation array (12288 bytes, 6 bits per register)
* reg_raw: pointer to the raw representation array (16384 bytes, one byte per register)
*/
ATTRIBUTE_TARGET_AVX2
void hllDenseCompressAVX2(uint8_t *reg_dense, const uint8_t *reg_raw) {
/* Shuffle indices for packing bytes of dense registers
* From: {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0}
* To: {AAAB|BBCC|CDDD|0000|EEEF|FFGG|GHHH|0000}
*/
const __m256i shuffle = _mm256_setr_epi8( //
0, 1, 2, //
4, 5, 6, //
8, 9, 10, //
12, 13, 14, //
-1, -1, -1, -1, //
0, 1, 2, //
4, 5, 6, //
8, 9, 10, //
12, 13, 14, //
-1, -1, -1, -1 //
);

/* Raw to Dense:
*
* LOAD 32 bytes (32 registers) per iteration:
* {00aaaaaa|00bbbbbb|00cccccc|00dddddd} x8
*
* AVX2 is little endian, each of the 8 groups is a little-endian int32.
* A group (int32) contains 4 registers.
*
* move the registers to correct positions with AND and SHIFT:
* {00aaaaaa|00000000|00000000|00000000} x8 (>>0)
* {bb000000|0000bbbb|00000000|00000000} x8 (>>2)
* {00000000|cccc0000|000000cc|00000000} x8 (>>4)
* {00000000|00000000|dddddd00|00000000} x8 (>>6)
*
* merge the registers with OR:
* {bbaaaaaa|ccccbbbb|ddddddcc|00000000} x8
* {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0}
*
* SHUFFLE to:
* {AAAB|BBCC|CDDD|0000|EEEF|FFGG|GHHH|0000}
*
* STORE the lower half and higher half respectively:
* AAABBBCCCDDD0000
* EEEFFFGGGHHH0000
* AAABBBCCCDDDEEEFFFGGGHHH0000
*
* Note that the last 4 bytes are padding bytes.
*/

const uint8_t *r = reg_raw;
uint8_t *t = reg_dense;

for (int i = 0; i < HLL_REGISTERS / 32 - 1; ++i) {
__m256i x = _mm256_loadu_si256((__m256i *)r);

__m256i a1, a2, a3, a4;
a1 = _mm256_and_si256(x, _mm256_set1_epi32(0x0000003f));
a2 = _mm256_and_si256(x, _mm256_set1_epi32(0x00003f00));
a3 = _mm256_and_si256(x, _mm256_set1_epi32(0x003f0000));
a4 = _mm256_and_si256(x, _mm256_set1_epi32(0x3f000000));

a2 = _mm256_srli_epi32(a2, 2);
a3 = _mm256_srli_epi32(a3, 4);
a4 = _mm256_srli_epi32(a4, 6);

__m256i y1, y2, y;
y1 = _mm256_or_si256(a1, a2);
y2 = _mm256_or_si256(a3, a4);
y = _mm256_or_si256(y1, y2);
y = _mm256_shuffle_epi8(y, shuffle);

__m128i lower, higher;
lower = _mm256_castsi256_si128(y);
higher = _mm256_extracti128_si256(y, 1);

_mm_storeu_si128((__m128i *)t, lower);
_mm_storeu_si128((__m128i *)(t + 12), higher);

r += 32;
t += 24;
}

/* Merge the last 32 registers normally
* as the AVX2 algorithm needs 4 padding bytes at the end */
for (int i = HLL_REGISTERS - 32; i < HLL_REGISTERS; i++) {
HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]);
}
}
#endif

/* Compress raw registers to dense representation. */
void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) {
#ifdef HAVE_AVX2
if (HLL_REGISTERS == 16384 && HLL_BITS == 6) {
if (HLL_USE_AVX2) {
hllDenseCompressAVX2(reg_dense, reg_raw);
return;
}
}
#endif

for (int i = 0; i < HLL_REGISTERS; i++) {
HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]);
}
}

/* ========================== HyperLogLog commands ========================== */

/* Create an HLL object. We always create the HLL using sparse encoding.
Expand Down Expand Up @@ -1363,12 +1614,17 @@ void pfmergeCommand(client *c) {

/* Write the resulting HLL to the destination HLL registers and
* invalidate the cached value. */
for (j = 0; j < HLL_REGISTERS; j++) {
if (max[j] == 0) continue;
if (use_dense) {
hdr = o->ptr;
switch (hdr->encoding) {
case HLL_DENSE: hllDenseSet(hdr->registers, j, max[j]); break;
case HLL_SPARSE: hllSparseSet(o, j, max[j]); break;
hllDenseCompress(hdr->registers, max);
} else {
for (j = 0; j < HLL_REGISTERS; j++) {
if (max[j] == 0) continue;
hdr = o->ptr;
switch (hdr->encoding) {
case HLL_DENSE: hllDenseSet(hdr->registers, j, max[j]); break;
case HLL_SPARSE: hllSparseSet(o, j, max[j]); break;
}
}
}
hdr = o->ptr; /* o->ptr may be different now, as a side effect of
Expand Down Expand Up @@ -1494,13 +1750,38 @@ void pfselftestCommand(client *c) {
* PFDEBUG DECODE <key>
* PFDEBUG ENCODING <key>
* PFDEBUG TODENSE <key>
* PFDEBUG SIMD (ON|OFF)
*/
void pfdebugCommand(client *c) {
char *cmd = c->argv[1]->ptr;
struct hllhdr *hdr;
robj *o;
int j;

if (!strcasecmp(cmd, "simd")) {
if (c->argc != 3) goto arityerr;

if (!strcasecmp(c->argv[2]->ptr, "on")) {
#ifdef HAVE_AVX2
simd_enabled = 1;
#endif
} else if (!strcasecmp(c->argv[2]->ptr, "off")) {
#ifdef HAVE_AVX2
simd_enabled = 0;
#endif
} else {
addReplyError(c, "Argument must be ON or OFF");
}

if (HLL_USE_AVX2) {
addReplyStatus(c, "enabled");
} else {
addReplyStatus(c, "disabled");
}

return;
}

o = lookupKeyWrite(c->db, c->argv[2]);
if (o == NULL) {
addReplyError(c, "The specified key does not exist");
Expand Down
Loading

0 comments on commit 3df609e

Please sign in to comment.