Skip to content

Commit

Permalink
SSSE3->AVX2, NEON32 decoding optimization
Browse files Browse the repository at this point in the history
Use Wojciech Mula (@WojciechMula) and @aqrit implementation update for
AVX2 / SSSE3 / NEON32 decoding.

SSSE3 implementation is reused in SSE4.1, SSE4.2 and AVX dispatched
decoding loops.

SSE4.2 implementation is now useless but kept to ease integration of
future updates if needed.

Speed-up on i7-4870HQ @ 2.5 GHz (clang-800.0.42.1, x86_64)
SSSE3 decoding: +79%
SSE4.2 decoding: +37%
AVX decoding: +57%
AVX2 decoding: +64%

Speed-up on Apple iPhone SE (clang-800.0.42.1, armv7)
NEON32 decoding: +66%
  • Loading branch information
mayeut committed Aug 22, 2017
1 parent b6417f3 commit 0a69845
Show file tree
Hide file tree
Showing 9 changed files with 257 additions and 236 deletions.
38 changes: 19 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -423,32 +423,32 @@ The tables below contain some results on random machines. All numbers measured w

x86 processors

| Processor | Plain enc | Plain dec | SSSE3 enc | SSSE3 dec | SSE4.2 enc | SSE4.2 dec| AVX enc | AVX dec | AVX2 enc | AVX2 dec |
|-------------------------------------------|----------:|----------:|----------:|----------:|-----------:|----------:|--------:|--------:|---------:|---------:|
| i7-4771 @ 3.5 GHz | 833 | 1111\* | 3333\* | 4444\* | TBD | TBD | TBD | TBD | 4999\* | 6666\* |
| i7-4770 @ 3.4 GHz DDR1600 | 1790 | 3038 | 4899 | 4043 | 4938 | 4939 | 4796 | 5709 | 4681 | 6386 |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 1 thread | 1784 | 3041 | 4945 | 4035 | 4954 | 4941 | 4776 | 5719 | 4661 | 6294 |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 2 thread | 3401 | 5729 | 5489 | 7444 | 5030 | 8531 | 5003 | 8624 | 5105 | 8558 |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4884 | 7099 | 4917 | 7057 | 4915 | 7541 | 4799 | 7143 | 4902 | 7219 |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 5212 | 8849 | 5284 | 9099 | 5245 | 9160 | 5289 | 9220 | 4849 | 9200 |
| i7-4870HQ @ 2.5 GHz | 1471 | 3066 | 6721 | 3886 | 6701 | 5098 | 7015 | 5281 | 8328 | 7063 |
| i5-4590S @ 3.0 GHz | 1721 | 1643\* | 3255\* | 3404\* | TBD | TBD | TBD | TBD | 4124\* | 5403\* |
| Xeon X5570 @ 2.93 GHz | 1097 | 1048\* | 2077\* | 2215\* | TBD | TBD | - | - | - | - |
| Pentium4 @ 3.4 GHz | 528 | 448\* | - | - | - | - | - | - | - | - |
| Atom N270 | 112 | 125\* | 331\* | 368\* | - | - | - | - | - | - |
| AMD E-450 | 370 | 332\* | 405\* | 366\* | - | - | - | - | - | - |
| Intel Edison @ 500 MHz | 79 | 92\* | 152\* | 172\* | TBD | TBD | - | - | - | - |
| Intel Edison @ 500 MHz OPENMP 2 thread | 158 | 184\* | 300\* | 343\* | TBD | TBD | - | - | - | - |
| Intel Edison @ 500 MHz (x86-64) | 97 | 146 | 197 | 207 | 197 | 145 | - | - | - | - |
| Intel Edison @ 500 MHz (x86-64) 2 thread | 193 | 288 | 389 | 410 | 389 | 289 | - | - | - | - |
| Processor | Plain enc | Plain dec | SSSE3 enc | SSSE3 dec | AVX enc | AVX dec | AVX2 enc | AVX2 dec |
|-------------------------------------------|----------:|----------:|----------:|----------:|--------:|--------:|---------:|---------:|
| i7-4771 @ 3.5 GHz | 833 | 1111\* | 3333\* | 4444\* | TBD | TBD | 4999\* | 6666\* |
| i7-4770 @ 3.4 GHz DDR1600 | 1790 | 3038 | 4899 | 4043\* | 4796 | 5709\* | 4681 | 6386\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 1 thread | 1784 | 3041 | 4945 | 4035\* | 4776 | 5719\* | 4661 | 6294\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 2 thread | 3401 | 5729 | 5489 | 7444\* | 5003 | 8624\* | 5105 | 8558\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4884 | 7099 | 4917 | 7057\* | 4799 | 7143\* | 4902 | 7219\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 5212 | 8849 | 5284 | 9099\* | 5289 | 9220\* | 4849 | 9200\* |
| i7-4870HQ @ 2.5 GHz | 1471 | 3066 | 6721 | 6962 | 7015 | 8267 | 8328 | 11576 |
| i5-4590S @ 3.0 GHz | 1721 | 1643\* | 3255\* | 3404\* | TBD | TBD | 4124\* | 5403\* |
| Xeon X5570 @ 2.93 GHz | 1097 | 1048\* | 2077\* | 2215\* | - | - | - | - |
| Pentium4 @ 3.4 GHz | 528 | 448\* | - | - | - | - | - | - |
| Atom N270 | 112 | 125\* | 331\* | 368\* | - | - | - | - |
| AMD E-450 | 370 | 332\* | 405\* | 366\* | - | - | - | - |
| Intel Edison @ 500 MHz | 79 | 92\* | 152\* | 172\* | - | - | - | - |
| Intel Edison @ 500 MHz OPENMP 2 thread | 158 | 184\* | 300\* | 343\* | - | - | - | - |
| Intel Edison @ 500 MHz (x86-64) | 97 | 146 | 197 | 207\* | - | - | - | - |
| Intel Edison @ 500 MHz (x86-64) 2 thread | 193 | 288 | 389 | 410\* | - | - | - | - |

ARM processors

| Processor | Plain enc | Plain dec | NEON32 enc | NEON32 dec | NEON64 enc | NEON64 dec |
|-------------------------------------------|----------:|----------:|-----------:|-----------:|-----------:|-----------:|
| Raspberry PI B+ V1.2 | 46 | 40\* | - | - | - | - |
| Raspberry PI 2 B V1.1 | 104 | 88\* | 188 | 116\* | - | - |
| Apple iPhone SE armv7 | 1056 | 895 | 2943 | 1573 | - | - |
| Apple iPhone SE armv7 | 1056 | 895 | 2943 | 2618 | - | - |
| Apple iPhone SE arm64 | 1061 | 1239 | - | - | 4098 | 3983 |

PowerPC processors
Expand Down
2 changes: 1 addition & 1 deletion lib/arch/avx/codec.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ BASE64_DEC_FUNCTION(avx)
{
#ifdef __AVX__
#include "../generic/dec_head.c"
#include "../sse42/dec_loop.c"
#include "../ssse3/dec_loop.c"
#include "../generic/dec_tail.c"
#else
BASE64_DEC_STUB
Expand Down
50 changes: 25 additions & 25 deletions lib/arch/avx2/codec.c
Original file line number Diff line number Diff line change
Expand Up @@ -127,35 +127,35 @@ enc_translate (const __m256i in)
static inline __m256i
dec_reshuffle (__m256i in)
{
// Mask in a single byte per shift:
const __m256i maskB2 = _mm256_set1_epi32(0x003F0000);
const __m256i maskB1 = _mm256_set1_epi32(0x00003F00);

// Pack bytes together:
__m256i out = _mm256_srli_epi32(in, 16);

out = _mm256_or_si256(out, _mm256_srli_epi32(_mm256_and_si256(in, maskB2), 2));
// in, lower lane, bits, upper case are most significant bits, lower case are least significant bits:
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA

out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, maskB1), 12));
const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140));
// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
// 0000eeee FFffffff 0000DDDD DDddEEEE
// 0000bbbb CCcccccc 0000AAAA AAaaBBBB

out = _mm256_or_si256(out, _mm256_slli_epi32(in, 26));
__m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000));
// 00000000 JJJJJJjj KKKKkkkk LLllllll
// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
// 00000000 DDDDDDdd EEEEeeee FFffffff
// 00000000 AAAAAAaa BBBBbbbb CCcccccc

// Pack bytes together within 32-bit words, discarding words 3 and 7:
// Pack bytes together in each lane:
out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
3, 2, 1,
7, 6, 5,
11, 10, 9,
15, 14, 13,
-1, -1, -1, -1,
3, 2, 1,
7, 6, 5,
11, 10, 9,
15, 14, 13,
-1, -1, -1, -1));

// Pack 32-bit words together, squashing empty words 3 and 7:
return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(
0, 1, 2, 4, 5, 6, -1, -1));
2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));
// 00000000 00000000 00000000 00000000
// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa

// Pack lanes
return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1));
}

#endif // __AVX2__
Expand Down
60 changes: 32 additions & 28 deletions lib/arch/avx2/dec_loop.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,38 +8,42 @@ while (srclen >= 45)
// Load string:
__m256i str = _mm256_loadu_si256((__m256i *)c);

// The input consists of six character sets in the Base64 alphabet,
// which we need to map back to the 6-bit values they represent.
// There are three ranges, two singles, and then there's the rest.
//
// # From To Add Characters
// 1 [43] [62] +19 +
// 2 [47] [63] +16 /
// 3 [48..57] [52..61] +4 0..9
// 4 [65..90] [0..25] -65 A..Z
// 5 [97..122] [26..51] -71 a..z
// (6) Everything else => invalid input

const __m256i set1 = CMPEQ(str, '+');
const __m256i set2 = CMPEQ(str, '/');
const __m256i set3 = RANGE(str, '0', '9');
const __m256i set4 = RANGE(str, 'A', 'Z');
const __m256i set5 = RANGE(str, 'a', 'z');

__m256i delta = REPLACE(set1, 19);
delta = _mm256_or_si256(delta, REPLACE(set2, 16));
delta = _mm256_or_si256(delta, REPLACE(set3, 4));
delta = _mm256_or_si256(delta, REPLACE(set4, -65));
delta = _mm256_or_si256(delta, REPLACE(set5, -71));

// Check for invalid input: if any of the delta values are zero,
// fall back on bytewise code to do error checking and reporting:
if (_mm256_movemask_epi8(CMPEQ(delta, 0))) {
// see ssse3/dec_loop.c for an explanation of how the code works.

const __m256i lut_lo = _mm256_setr_epi8(
0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);

const __m256i lut_hi = _mm256_setr_epi8(
0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);

const __m256i lut_roll = _mm256_setr_epi8(
0, 16, 19, 4, -65, -65, -71, -71,
0, 0, 0, 0, 0, 0, 0, 0,
0, 16, 19, 4, -65, -65, -71, -71,
0, 0, 0, 0, 0, 0, 0, 0);

const __m256i mask_2F = _mm256_set1_epi8(0x2f);

// lookup
const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));

if (!_mm256_testz_si256(lo, hi)) {
break;
}

// Now simply add the delta values to the input:
str = _mm256_add_epi8(str, delta);
str = _mm256_add_epi8(str, roll);

// Reshuffle the input to packed 12-byte output format:
str = dec_reshuffle(str);
Expand Down
116 changes: 75 additions & 41 deletions lib/arch/neon32/dec_loop.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,47 +3,87 @@
// don't need to check if we have enough remaining input to cover them:
while (srclen >= 64)
{
uint8x16x4_t set1, set2, set3, set4, set5, delta;
uint8x16x3_t dec;

// Load 64 bytes and deinterleave:
uint8x16x4_t str = vld4q_u8((uint8_t *)c);

// The input consists of six character sets in the Base64 alphabet,
// which we need to map back to the 6-bit values they represent.
// There are three ranges, two singles, and then there's the rest.
//
// # From To Add Characters
// 1 [43] [62] +19 +
// 2 [47] [63] +16 /
// 3 [48..57] [52..61] +4 0..9
// 4 [65..90] [0..25] -65 A..Z
// 5 [97..122] [26..51] -71 a..z
// (6) Everything else => invalid input

// Benchmarking on the Raspberry Pi 2B and Clang shows that looping
// generates slightly faster code than explicit unrolling:
for (int i = 0; i < 4; i++) {
set1.val[i] = CMPEQ(str.val[i], '+');
set2.val[i] = CMPEQ(str.val[i], '/');
set3.val[i] = RANGE(str.val[i], '0', '9');
set4.val[i] = RANGE(str.val[i], 'A', 'Z');
set5.val[i] = RANGE(str.val[i], 'a', 'z');

delta.val[i] = REPLACE(set1.val[i], 19);
delta.val[i] = vbslq_u8(set2.val[i], vdupq_n_u8( 16), delta.val[i]);
delta.val[i] = vbslq_u8(set3.val[i], vdupq_n_u8( 4), delta.val[i]);
delta.val[i] = vbslq_u8(set4.val[i], vdupq_n_u8(-65), delta.val[i]);
delta.val[i] = vbslq_u8(set5.val[i], vdupq_n_u8(-71), delta.val[i]);
// see ssse3/dec_loop.c for an explanation of how the code works.

const uint8x16_t lut_lo = {
0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A
};
const uint8x16_t lut_hi = {
0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
};

const uint8x16_t lut_roll = {
0, 16, 19, 4, (uint8_t)-65, (uint8_t)-65, (uint8_t)-71, (uint8_t)-71,
0, 0, 0, 0, 0, 0, 0, 0
};

const uint8x16_t mask_F = vdupq_n_u8(0xf);
const uint8x16_t mask_2F = vdupq_n_u8(0x2f);

uint8x16_t classified;

{
const uint8x16_t hi_nibbles = vshrq_n_u8(str.val[0], 4);
const uint8x16_t lo_nibbles = vandq_u8(str.val[0], mask_F);
const uint8x16_t eq_2F = vceqq_u8(str.val[0], mask_2F);

const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);

const uint8x16_t delta = vqtbl1q_u8(lut_roll, vaddq_u8(eq_2F, hi_nibbles));
classified = vandq_u8(lo, hi);
// Now simply add the delta values to the input:
str.val[0] = vaddq_u8(str.val[0], delta);
}
{
const uint8x16_t hi_nibbles = vshrq_n_u8(str.val[1], 4);
const uint8x16_t lo_nibbles = vandq_u8(str.val[1], mask_F);
const uint8x16_t eq_2F = vceqq_u8(str.val[1], mask_2F);

const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);

const uint8x16_t delta = vqtbl1q_u8(lut_roll, vaddq_u8(eq_2F, hi_nibbles));
classified = vorrq_u8(classified, vandq_u8(lo, hi));
// Now simply add the delta values to the input:
str.val[1] = vaddq_u8(str.val[1], delta);
}
{
const uint8x16_t hi_nibbles = vshrq_n_u8(str.val[2], 4);
const uint8x16_t lo_nibbles = vandq_u8(str.val[2], mask_F);
const uint8x16_t eq_2F = vceqq_u8(str.val[2], mask_2F);

const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);

const uint8x16_t delta = vqtbl1q_u8(lut_roll, vaddq_u8(eq_2F, hi_nibbles));
classified = vorrq_u8(classified, vandq_u8(lo, hi));
// Now simply add the delta values to the input:
str.val[2] = vaddq_u8(str.val[2], delta);
}
{
const uint8x16_t hi_nibbles = vshrq_n_u8(str.val[3], 4);
const uint8x16_t lo_nibbles = vandq_u8(str.val[3], mask_F);
const uint8x16_t eq_2F = vceqq_u8(str.val[3], mask_2F);

const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);

const uint8x16_t delta = vqtbl1q_u8(lut_roll, vaddq_u8(eq_2F, hi_nibbles));
classified = vorrq_u8(classified, vandq_u8(lo, hi));
// Now simply add the delta values to the input:
str.val[3] = vaddq_u8(str.val[3], delta);
}

// Check for invalid input: if any of the delta values are zero,
// fall back on bytewise code to do error checking and reporting:
uint8x16_t classified = CMPEQ(delta.val[0], 0);
classified = vorrq_u8(classified, CMPEQ(delta.val[1], 0));
classified = vorrq_u8(classified, CMPEQ(delta.val[2], 0));
classified = vorrq_u8(classified, CMPEQ(delta.val[3], 0));

// Extract both 32-bit halves; check that all bits are zero:
if (vgetq_lane_u32((uint32x4_t)classified, 0) != 0
|| vgetq_lane_u32((uint32x4_t)classified, 1) != 0
Expand All @@ -52,16 +92,10 @@ while (srclen >= 64)
break;
}

// Now simply add the delta values to the input:
str.val[0] = vaddq_u8(str.val[0], delta.val[0]);
str.val[1] = vaddq_u8(str.val[1], delta.val[1]);
str.val[2] = vaddq_u8(str.val[2], delta.val[2]);
str.val[3] = vaddq_u8(str.val[3], delta.val[3]);

// Compress four bytes into three:
dec.val[0] = vshlq_n_u8(str.val[0], 2) | vshrq_n_u8(str.val[1], 4);
dec.val[1] = vshlq_n_u8(str.val[1], 4) | vshrq_n_u8(str.val[2], 2);
dec.val[2] = vshlq_n_u8(str.val[2], 6) | str.val[3];
dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);

// Interleave and store decoded result:
vst3q_u8((uint8_t *)o, dec);
Expand Down
2 changes: 1 addition & 1 deletion lib/arch/sse42/codec.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ BASE64_DEC_FUNCTION(sse42)
{
#ifdef __SSE4_2__
#include "../generic/dec_head.c"
#include "dec_loop.c"
#include "../ssse3/dec_loop.c"
#include "../generic/dec_tail.c"
#else
BASE64_DEC_STUB
Expand Down
Loading

0 comments on commit 0a69845

Please sign in to comment.