From 0a698455df8cd90c066382a375c7886d6158069d Mon Sep 17 00:00:00 2001 From: mayeut Date: Tue, 28 Feb 2017 22:39:28 +0100 Subject: [PATCH] SSSE3->AVX2, NEON32 decoding optimization Use Wojciech Mula (@WojciechMula) and @aqrit implementation update for AVX2 / SSSE3 / NEON32 decoding. SSSE3 implementation is reused in SSE4.1, SSE4.2 and AVX dispatched decoding loops. SSE4.2 implementation is now useless but kept to ease integration of future updates if needed. Speed-up on i7-4870HQ @ 2.5 GHz (clang-800.0.42.1, x86_64) SSSE3 decoding: +79% SSE4.2 decoding: +37% AVX decoding: +57% AVX2 decoding: +64% Speed-up on Apple iPhone SE (clang-800.0.42.1, armv7) NEON32 decoding: +66% --- README.md | 38 +++++------ lib/arch/avx/codec.c | 2 +- lib/arch/avx2/codec.c | 50 +++++++------- lib/arch/avx2/dec_loop.c | 60 +++++++++-------- lib/arch/neon32/dec_loop.c | 116 +++++++++++++++++++++------------ lib/arch/sse42/codec.c | 2 +- lib/arch/sse42/dec_loop.c | 90 ------------------------- lib/arch/ssse3/dec_loop.c | 94 +++++++++++++++++++++----- lib/arch/ssse3/dec_reshuffle.c | 41 +++++++----- 9 files changed, 257 insertions(+), 236 deletions(-) delete mode 100644 lib/arch/sse42/dec_loop.c diff --git a/README.md b/README.md index a185864f..e28e72c5 100644 --- a/README.md +++ b/README.md @@ -423,24 +423,24 @@ The tables below contain some results on random machines. All numbers measured w x86 processors -| Processor | Plain enc | Plain dec | SSSE3 enc | SSSE3 dec | SSE4.2 enc | SSE4.2 dec| AVX enc | AVX dec | AVX2 enc | AVX2 dec | -|-------------------------------------------|----------:|----------:|----------:|----------:|-----------:|----------:|--------:|--------:|---------:|---------:| -| i7-4771 @ 3.5 GHz | 833 | 1111\* | 3333\* | 4444\* | TBD | TBD | TBD | TBD | 4999\* | 6666\* | -| i7-4770 @ 3.4 GHz DDR1600 | 1790 | 3038 | 4899 | 4043 | 4938 | 4939 | 4796 | 5709 | 4681 | 6386 | -| i7-4770 @ 3.4 GHz DDR1600 OPENMP 1 thread | 1784 | 3041 | 4945 | 4035 | 4954 | 4941 | 4776 | 5719 | 4661 | 6294 | -| i7-4770 @ 3.4 GHz DDR1600 OPENMP 2 thread | 3401 | 5729 | 5489 | 7444 | 5030 | 8531 | 5003 | 8624 | 5105 | 8558 | -| i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4884 | 7099 | 4917 | 7057 | 4915 | 7541 | 4799 | 7143 | 4902 | 7219 | -| i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 5212 | 8849 | 5284 | 9099 | 5245 | 9160 | 5289 | 9220 | 4849 | 9200 | -| i7-4870HQ @ 2.5 GHz | 1471 | 3066 | 6721 | 3886 | 6701 | 5098 | 7015 | 5281 | 8328 | 7063 | -| i5-4590S @ 3.0 GHz | 1721 | 1643\* | 3255\* | 3404\* | TBD | TBD | TBD | TBD | 4124\* | 5403\* | -| Xeon X5570 @ 2.93 GHz | 1097 | 1048\* | 2077\* | 2215\* | TBD | TBD | - | - | - | - | -| Pentium4 @ 3.4 GHz | 528 | 448\* | - | - | - | - | - | - | - | - | -| Atom N270 | 112 | 125\* | 331\* | 368\* | - | - | - | - | - | - | -| AMD E-450 | 370 | 332\* | 405\* | 366\* | - | - | - | - | - | - | -| Intel Edison @ 500 MHz | 79 | 92\* | 152\* | 172\* | TBD | TBD | - | - | - | - | -| Intel Edison @ 500 MHz OPENMP 2 thread | 158 | 184\* | 300\* | 343\* | TBD | TBD | - | - | - | - | -| Intel Edison @ 500 MHz (x86-64) | 97 | 146 | 197 | 207 | 197 | 145 | - | - | - | - | -| Intel Edison @ 500 MHz (x86-64) 2 thread | 193 | 288 | 389 | 410 | 389 | 289 | - | - | - | - | +| Processor | Plain enc | Plain dec | SSSE3 enc | SSSE3 dec | AVX enc | AVX dec | AVX2 enc | AVX2 dec | +|-------------------------------------------|----------:|----------:|----------:|----------:|--------:|--------:|---------:|---------:| +| i7-4771 @ 3.5 GHz | 833 | 1111\* | 3333\* | 4444\* | TBD | TBD | 4999\* | 6666\* | +| i7-4770 @ 3.4 GHz DDR1600 | 1790 | 3038 | 4899 | 4043\* | 4796 | 5709\* | 4681 | 6386\* | +| i7-4770 @ 3.4 GHz DDR1600 OPENMP 1 thread | 1784 | 3041 | 4945 | 4035\* | 4776 | 5719\* | 4661 | 6294\* | +| i7-4770 @ 3.4 GHz DDR1600 OPENMP 2 thread | 3401 | 5729 | 5489 | 7444\* | 5003 | 8624\* | 5105 | 8558\* | +| i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4884 | 7099 | 4917 | 7057\* | 4799 | 7143\* | 4902 | 7219\* | +| i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 5212 | 8849 | 5284 | 9099\* | 5289 | 9220\* | 4849 | 9200\* | +| i7-4870HQ @ 2.5 GHz | 1471 | 3066 | 6721 | 6962 | 7015 | 8267 | 8328 | 11576 | +| i5-4590S @ 3.0 GHz | 1721 | 1643\* | 3255\* | 3404\* | TBD | TBD | 4124\* | 5403\* | +| Xeon X5570 @ 2.93 GHz | 1097 | 1048\* | 2077\* | 2215\* | - | - | - | - | +| Pentium4 @ 3.4 GHz | 528 | 448\* | - | - | - | - | - | - | +| Atom N270 | 112 | 125\* | 331\* | 368\* | - | - | - | - | +| AMD E-450 | 370 | 332\* | 405\* | 366\* | - | - | - | - | +| Intel Edison @ 500 MHz | 79 | 92\* | 152\* | 172\* | - | - | - | - | +| Intel Edison @ 500 MHz OPENMP 2 thread | 158 | 184\* | 300\* | 343\* | - | - | - | - | +| Intel Edison @ 500 MHz (x86-64) | 97 | 146 | 197 | 207\* | - | - | - | - | +| Intel Edison @ 500 MHz (x86-64) 2 thread | 193 | 288 | 389 | 410\* | - | - | - | - | ARM processors @@ -448,7 +448,7 @@ ARM processors |-------------------------------------------|----------:|----------:|-----------:|-----------:|-----------:|-----------:| | Raspberry PI B+ V1.2 | 46 | 40\* | - | - | - | - | | Raspberry PI 2 B V1.1 | 104 | 88\* | 188 | 116\* | - | - | -| Apple iPhone SE armv7 | 1056 | 895 | 2943 | 1573 | - | - | +| Apple iPhone SE armv7 | 1056 | 895 | 2943 | 2618 | - | - | | Apple iPhone SE arm64 | 1061 | 1239 | - | - | 4098 | 3983 | PowerPC processors diff --git a/lib/arch/avx/codec.c b/lib/arch/avx/codec.c index 9e19f7c4..d7a523c2 100644 --- a/lib/arch/avx/codec.c +++ b/lib/arch/avx/codec.c @@ -31,7 +31,7 @@ BASE64_DEC_FUNCTION(avx) { #ifdef __AVX__ #include "../generic/dec_head.c" - #include "../sse42/dec_loop.c" + #include "../ssse3/dec_loop.c" #include "../generic/dec_tail.c" #else BASE64_DEC_STUB diff --git a/lib/arch/avx2/codec.c b/lib/arch/avx2/codec.c index e75108a9..21a4bd3c 100644 --- a/lib/arch/avx2/codec.c +++ b/lib/arch/avx2/codec.c @@ -127,35 +127,35 @@ enc_translate (const __m256i in) static inline __m256i dec_reshuffle (__m256i in) { - // Mask in a single byte per shift: - const __m256i maskB2 = _mm256_set1_epi32(0x003F0000); - const __m256i maskB1 = _mm256_set1_epi32(0x00003F00); - - // Pack bytes together: - __m256i out = _mm256_srli_epi32(in, 16); - - out = _mm256_or_si256(out, _mm256_srli_epi32(_mm256_and_si256(in, maskB2), 2)); + // in, lower lane, bits, upper case are most significant bits, lower case are least significant bits: + // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ + // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG + // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD + // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA - out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, maskB1), 12)); + const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140)); + // 0000kkkk LLllllll 0000JJJJ JJjjKKKK + // 0000hhhh IIiiiiii 0000GGGG GGggHHHH + // 0000eeee FFffffff 0000DDDD DDddEEEE + // 0000bbbb CCcccccc 0000AAAA AAaaBBBB - out = _mm256_or_si256(out, _mm256_slli_epi32(in, 26)); + __m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000)); + // 00000000 JJJJJJjj KKKKkkkk LLllllll + // 00000000 GGGGGGgg HHHHhhhh IIiiiiii + // 00000000 DDDDDDdd EEEEeeee FFffffff + // 00000000 AAAAAAaa BBBBbbbb CCcccccc - // Pack bytes together within 32-bit words, discarding words 3 and 7: + // Pack bytes together in each lane: out = _mm256_shuffle_epi8(out, _mm256_setr_epi8( - 3, 2, 1, - 7, 6, 5, - 11, 10, 9, - 15, 14, 13, - -1, -1, -1, -1, - 3, 2, 1, - 7, 6, 5, - 11, 10, 9, - 15, 14, 13, - -1, -1, -1, -1)); - - // Pack 32-bit words together, squashing empty words 3 and 7: - return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32( - 0, 1, 2, 4, 5, 6, -1, -1)); + 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1, + 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1)); + // 00000000 00000000 00000000 00000000 + // LLllllll KKKKkkkk JJJJJJjj IIiiiiii + // HHHHhhhh GGGGGGgg FFffffff EEEEeeee + // DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa + + // Pack lanes + return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1)); } #endif // __AVX2__ diff --git a/lib/arch/avx2/dec_loop.c b/lib/arch/avx2/dec_loop.c index 8344004c..0f588905 100644 --- a/lib/arch/avx2/dec_loop.c +++ b/lib/arch/avx2/dec_loop.c @@ -8,38 +8,42 @@ while (srclen >= 45) // Load string: __m256i str = _mm256_loadu_si256((__m256i *)c); - // The input consists of six character sets in the Base64 alphabet, - // which we need to map back to the 6-bit values they represent. - // There are three ranges, two singles, and then there's the rest. - // - // # From To Add Characters - // 1 [43] [62] +19 + - // 2 [47] [63] +16 / - // 3 [48..57] [52..61] +4 0..9 - // 4 [65..90] [0..25] -65 A..Z - // 5 [97..122] [26..51] -71 a..z - // (6) Everything else => invalid input - - const __m256i set1 = CMPEQ(str, '+'); - const __m256i set2 = CMPEQ(str, '/'); - const __m256i set3 = RANGE(str, '0', '9'); - const __m256i set4 = RANGE(str, 'A', 'Z'); - const __m256i set5 = RANGE(str, 'a', 'z'); - - __m256i delta = REPLACE(set1, 19); - delta = _mm256_or_si256(delta, REPLACE(set2, 16)); - delta = _mm256_or_si256(delta, REPLACE(set3, 4)); - delta = _mm256_or_si256(delta, REPLACE(set4, -65)); - delta = _mm256_or_si256(delta, REPLACE(set5, -71)); - - // Check for invalid input: if any of the delta values are zero, - // fall back on bytewise code to do error checking and reporting: - if (_mm256_movemask_epi8(CMPEQ(delta, 0))) { + // see ssse3/dec_loop.c for an explanation of how the code works. + + const __m256i lut_lo = _mm256_setr_epi8( + 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A, + 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A); + + const __m256i lut_hi = _mm256_setr_epi8( + 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10); + + const __m256i lut_roll = _mm256_setr_epi8( + 0, 16, 19, 4, -65, -65, -71, -71, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 16, 19, 4, -65, -65, -71, -71, + 0, 0, 0, 0, 0, 0, 0, 0); + + const __m256i mask_2F = _mm256_set1_epi8(0x2f); + + // lookup + const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F); + const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F); + const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles); + const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles); + const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F); + const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles)); + + if (!_mm256_testz_si256(lo, hi)) { break; } // Now simply add the delta values to the input: - str = _mm256_add_epi8(str, delta); + str = _mm256_add_epi8(str, roll); // Reshuffle the input to packed 12-byte output format: str = dec_reshuffle(str); diff --git a/lib/arch/neon32/dec_loop.c b/lib/arch/neon32/dec_loop.c index 5d5c3d77..05250770 100644 --- a/lib/arch/neon32/dec_loop.c +++ b/lib/arch/neon32/dec_loop.c @@ -3,47 +3,87 @@ // don't need to check if we have enough remaining input to cover them: while (srclen >= 64) { - uint8x16x4_t set1, set2, set3, set4, set5, delta; uint8x16x3_t dec; // Load 64 bytes and deinterleave: uint8x16x4_t str = vld4q_u8((uint8_t *)c); - // The input consists of six character sets in the Base64 alphabet, - // which we need to map back to the 6-bit values they represent. - // There are three ranges, two singles, and then there's the rest. - // - // # From To Add Characters - // 1 [43] [62] +19 + - // 2 [47] [63] +16 / - // 3 [48..57] [52..61] +4 0..9 - // 4 [65..90] [0..25] -65 A..Z - // 5 [97..122] [26..51] -71 a..z - // (6) Everything else => invalid input - - // Benchmarking on the Raspberry Pi 2B and Clang shows that looping - // generates slightly faster code than explicit unrolling: - for (int i = 0; i < 4; i++) { - set1.val[i] = CMPEQ(str.val[i], '+'); - set2.val[i] = CMPEQ(str.val[i], '/'); - set3.val[i] = RANGE(str.val[i], '0', '9'); - set4.val[i] = RANGE(str.val[i], 'A', 'Z'); - set5.val[i] = RANGE(str.val[i], 'a', 'z'); - - delta.val[i] = REPLACE(set1.val[i], 19); - delta.val[i] = vbslq_u8(set2.val[i], vdupq_n_u8( 16), delta.val[i]); - delta.val[i] = vbslq_u8(set3.val[i], vdupq_n_u8( 4), delta.val[i]); - delta.val[i] = vbslq_u8(set4.val[i], vdupq_n_u8(-65), delta.val[i]); - delta.val[i] = vbslq_u8(set5.val[i], vdupq_n_u8(-71), delta.val[i]); + // see ssse3/dec_loop.c for an explanation of how the code works. + + const uint8x16_t lut_lo = { + 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A + }; + const uint8x16_t lut_hi = { + 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10 + }; + + const uint8x16_t lut_roll = { + 0, 16, 19, 4, (uint8_t)-65, (uint8_t)-65, (uint8_t)-71, (uint8_t)-71, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + + const uint8x16_t mask_F = vdupq_n_u8(0xf); + const uint8x16_t mask_2F = vdupq_n_u8(0x2f); + + uint8x16_t classified; + + { + const uint8x16_t hi_nibbles = vshrq_n_u8(str.val[0], 4); + const uint8x16_t lo_nibbles = vandq_u8(str.val[0], mask_F); + const uint8x16_t eq_2F = vceqq_u8(str.val[0], mask_2F); + + const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles); + const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles); + + const uint8x16_t delta = vqtbl1q_u8(lut_roll, vaddq_u8(eq_2F, hi_nibbles)); + classified = vandq_u8(lo, hi); + // Now simply add the delta values to the input: + str.val[0] = vaddq_u8(str.val[0], delta); + } + { + const uint8x16_t hi_nibbles = vshrq_n_u8(str.val[1], 4); + const uint8x16_t lo_nibbles = vandq_u8(str.val[1], mask_F); + const uint8x16_t eq_2F = vceqq_u8(str.val[1], mask_2F); + + const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles); + const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles); + + const uint8x16_t delta = vqtbl1q_u8(lut_roll, vaddq_u8(eq_2F, hi_nibbles)); + classified = vorrq_u8(classified, vandq_u8(lo, hi)); + // Now simply add the delta values to the input: + str.val[1] = vaddq_u8(str.val[1], delta); + } + { + const uint8x16_t hi_nibbles = vshrq_n_u8(str.val[2], 4); + const uint8x16_t lo_nibbles = vandq_u8(str.val[2], mask_F); + const uint8x16_t eq_2F = vceqq_u8(str.val[2], mask_2F); + + const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles); + const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles); + + const uint8x16_t delta = vqtbl1q_u8(lut_roll, vaddq_u8(eq_2F, hi_nibbles)); + classified = vorrq_u8(classified, vandq_u8(lo, hi)); + // Now simply add the delta values to the input: + str.val[2] = vaddq_u8(str.val[2], delta); + } + { + const uint8x16_t hi_nibbles = vshrq_n_u8(str.val[3], 4); + const uint8x16_t lo_nibbles = vandq_u8(str.val[3], mask_F); + const uint8x16_t eq_2F = vceqq_u8(str.val[3], mask_2F); + + const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles); + const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles); + + const uint8x16_t delta = vqtbl1q_u8(lut_roll, vaddq_u8(eq_2F, hi_nibbles)); + classified = vorrq_u8(classified, vandq_u8(lo, hi)); + // Now simply add the delta values to the input: + str.val[3] = vaddq_u8(str.val[3], delta); } // Check for invalid input: if any of the delta values are zero, // fall back on bytewise code to do error checking and reporting: - uint8x16_t classified = CMPEQ(delta.val[0], 0); - classified = vorrq_u8(classified, CMPEQ(delta.val[1], 0)); - classified = vorrq_u8(classified, CMPEQ(delta.val[2], 0)); - classified = vorrq_u8(classified, CMPEQ(delta.val[3], 0)); - // Extract both 32-bit halves; check that all bits are zero: if (vgetq_lane_u32((uint32x4_t)classified, 0) != 0 || vgetq_lane_u32((uint32x4_t)classified, 1) != 0 @@ -52,16 +92,10 @@ while (srclen >= 64) break; } - // Now simply add the delta values to the input: - str.val[0] = vaddq_u8(str.val[0], delta.val[0]); - str.val[1] = vaddq_u8(str.val[1], delta.val[1]); - str.val[2] = vaddq_u8(str.val[2], delta.val[2]); - str.val[3] = vaddq_u8(str.val[3], delta.val[3]); - // Compress four bytes into three: - dec.val[0] = vshlq_n_u8(str.val[0], 2) | vshrq_n_u8(str.val[1], 4); - dec.val[1] = vshlq_n_u8(str.val[1], 4) | vshrq_n_u8(str.val[2], 2); - dec.val[2] = vshlq_n_u8(str.val[2], 6) | str.val[3]; + dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4)); + dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2)); + dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]); // Interleave and store decoded result: vst3q_u8((uint8_t *)o, dec); diff --git a/lib/arch/sse42/codec.c b/lib/arch/sse42/codec.c index 02838d4e..ab70c124 100644 --- a/lib/arch/sse42/codec.c +++ b/lib/arch/sse42/codec.c @@ -31,7 +31,7 @@ BASE64_DEC_FUNCTION(sse42) { #ifdef __SSE4_2__ #include "../generic/dec_head.c" - #include "dec_loop.c" + #include "../ssse3/dec_loop.c" #include "../generic/dec_tail.c" #else BASE64_DEC_STUB diff --git a/lib/arch/sse42/dec_loop.c b/lib/arch/sse42/dec_loop.c deleted file mode 100644 index f7e55dcf..00000000 --- a/lib/arch/sse42/dec_loop.c +++ /dev/null @@ -1,90 +0,0 @@ -// If we have SSE4.2 support, pick off 16 bytes at a time for as long as we can, -// but make sure that we quit before seeing any == markers at the end of the -// string. Also, because we write four zeroes at the end of the output, ensure -// that there are at least 6 valid bytes of input data remaining to close the -// gap. 16 + 2 + 6 = 24 bytes: -while (srclen >= 24) -{ - // Load string: - __m128i str = _mm_loadu_si128((__m128i *)c); - - // The input consists of six character sets in the Base64 alphabet, - // which we need to map back to the 6-bit values they represent. - // There are three ranges, two singles, and then there's the rest. - // - // # From To Add Index Characters - // 1 [43] [62] +19 0 + - // 2 [47] [63] +16 1 / - // 3 [48..57] [52..61] +4 [2..11] 0..9 - // 4 [65..90] [0..25] -65 15 A..Z - // 5 [97..122] [26..51] -71 14 a..z - // (6) Everything else => invalid input - - // LUT: - const __m128i lut = _mm_setr_epi8( - 19, 16, 4, 4, - 4, 4, 4, 4, - 4, 4, 4, 4, - 0, 0, -71, -65 - ); - - // Ranges to be checked (all should be valid, repeat the first one): - const __m128i range = _mm_setr_epi8( - '+','+', - '+','+', - '+','+', - '+','+', - '/','/', - '0','9', - 'A','Z', - 'a','z'); - - // Check for invalid input: - // pseudo-code for the _mm_cmpistrc call: - // out_of_range = 0 - // for each byte of str - // out_of_range |= !(byte in one of the ranges) - // return out_of_range - if (_mm_cmpistrc(range, str, _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_NEGATIVE_POLARITY)) { - break; - } - - // Compute indices for table look up: - // First indices for ranges #[1..3]. Others are invalid. - __m128i indices = _mm_subs_epu8(str, _mm_set1_epi8(46)); - - // Compute mask for ranges #4 and #5: - __m128i mask45 = CMPGT(str, 64); - - // Compute mask for range #5: - __m128i mask5 = CMPGT(str, 96); - - // Clear invalid values in indices: - indices = _mm_andnot_si128(mask45, indices); - - // Compute index for range #4: (abs(-1) << 4) + -1 = 15. Index for range #5 is off by one: - mask45 = _mm_add_epi8(_mm_slli_epi16(_mm_abs_epi8(mask45), 4), mask45); - - // Set all indices. Index for range #5 is still off by one: - indices = _mm_add_epi8(indices, mask45); - - // add -1, so substract 1 to indices for range #5, All indices are now correct: - indices = _mm_add_epi8(indices, mask5); - - // Lookup deltas: - __m128i delta = _mm_shuffle_epi8(lut, indices); - - // Now simply add the delta values to the input: - str = _mm_add_epi8(str, delta); - - // Reshuffle the input to packed 12-byte output format: - str = dec_reshuffle(str); - - // Store back: - _mm_storeu_si128((__m128i *)o, str); - - c += 16; - o += 12; - outl += 12; - srclen -= 16; -} diff --git a/lib/arch/ssse3/dec_loop.c b/lib/arch/ssse3/dec_loop.c index e660d4c6..dd3ca0f8 100644 --- a/lib/arch/ssse3/dec_loop.c +++ b/lib/arch/ssse3/dec_loop.c @@ -20,26 +20,90 @@ while (srclen >= 24) // 5 [97..122] [26..51] -71 a..z // (6) Everything else => invalid input - const __m128i set1 = CMPEQ(str, '+'); - const __m128i set2 = CMPEQ(str, '/'); - const __m128i set3 = RANGE(str, '0', '9'); - const __m128i set4 = RANGE(str, 'A', 'Z'); - const __m128i set5 = RANGE(str, 'a', 'z'); - - __m128i delta = REPLACE(set1, 19); - delta = _mm_or_si128(delta, REPLACE(set2, 16)); - delta = _mm_or_si128(delta, REPLACE(set3, 4)); - delta = _mm_or_si128(delta, REPLACE(set4, -65)); - delta = _mm_or_si128(delta, REPLACE(set5, -71)); - - // Check for invalid input: if any of the delta values are zero, + // We will use LUTS for character validation & offset computation + // Remember that 0x2X and 0x0X are the same index for _mm_shuffle_epi8, + // this allows to mask with 0x2F instead of 0x0F and thus save one constant declaration (register and/or memory access) + + // For offsets: + // Perfect hash for lut = ((src>>4)&0x2F)+((src==0x2F)?0xFF:0x00) + // 0000 = garbage + // 0001 = / + // 0010 = + + // 0011 = 0-9 + // 0100 = A-Z + // 0101 = A-Z + // 0110 = a-z + // 0111 = a-z + // 1000 >= garbage + + // For validation, here's the table. + // A character is valid if and only if the AND of the 2 lookups equals 0: + + // hi \ lo 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111 + // LUT 0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A + + // 0000 0X10 char NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI + // andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + + // 0001 0x10 char DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US + // andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + + // 0010 0x01 char ! " # $ % & ' ( ) * + , - . / + // andlut 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00 + + // 0011 0x02 char 0 1 2 3 4 5 6 7 8 9 : ; < = > ? + // andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02 + + // 0100 0x04 char @ A B C D E F G H I J K L M N 0 + // andlut 0x04 0x00 0x00 0x00 0X00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 + + // 0101 0x08 char P Q R S T U V W X Y Z [ \ ] ^ _ + // andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08 + + // 0110 0x04 char ` a b c d e f g h i j k l m n o + // andlut 0x04 0x00 0x00 0x00 0X00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 + // 0111 0X08 char p q r s t u v w x y z { | } ~ + // andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08 + + // 1000 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + // 1001 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + // 1010 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + // 1011 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + // 1100 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + // 1101 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + // 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + // 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + + const __m128i lut_lo = _mm_setr_epi8( + 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A); + + const __m128i lut_hi = _mm_setr_epi8( + 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10); + + const __m128i lut_roll = _mm_setr_epi8( + 0, 16, 19, 4, -65, -65, -71, -71, + 0, 0, 0, 0, 0, 0, 0, 0); + + const __m128i mask_2F = _mm_set1_epi8(0x2f); + + // lookup + const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(str, 4), mask_2F); + const __m128i lo_nibbles = _mm_and_si128(str, mask_2F); + const __m128i hi = _mm_shuffle_epi8(lut_hi, hi_nibbles); + const __m128i lo = _mm_shuffle_epi8(lut_lo, lo_nibbles); + const __m128i eq_2F = _mm_cmpeq_epi8(str, mask_2F); + const __m128i roll = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(eq_2F, hi_nibbles)); + + // Check for invalid input: if any "and" values from lo and hi are not zero, // fall back on bytewise code to do error checking and reporting: - if (_mm_movemask_epi8(CMPEQ(delta, 0))) { + if (_mm_movemask_epi8(CMPGT(_mm_and_si128(lo, hi), 0)) != 0) { break; } // Now simply add the delta values to the input: - str = _mm_add_epi8(str, delta); + str = _mm_add_epi8(str, roll); // Reshuffle the input to packed 12-byte output format: str = dec_reshuffle(str); diff --git a/lib/arch/ssse3/dec_reshuffle.c b/lib/arch/ssse3/dec_reshuffle.c index b8cd0c13..e1a2f8cd 100644 --- a/lib/arch/ssse3/dec_reshuffle.c +++ b/lib/arch/ssse3/dec_reshuffle.c @@ -1,24 +1,33 @@ static inline __m128i dec_reshuffle (__m128i in) { - // Mask in a single byte per shift: - const __m128i maskB2 = _mm_set1_epi32(0x003F0000); - const __m128i maskB1 = _mm_set1_epi32(0x00003F00); + // in, bits, upper case are most significant bits, lower case are least significant bits + // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ + // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG + // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD + // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA - // Pack bytes together: - __m128i out = _mm_srli_epi32(in, 16); - - out = _mm_or_si128(out, _mm_srli_epi32(_mm_and_si128(in, maskB2), 2)); + const __m128i merge_ab_and_bc = _mm_maddubs_epi16(in, _mm_set1_epi32(0x01400140)); + // 0000kkkk LLllllll 0000JJJJ JJjjKKKK + // 0000hhhh IIiiiiii 0000GGGG GGggHHHH + // 0000eeee FFffffff 0000DDDD DDddEEEE + // 0000bbbb CCcccccc 0000AAAA AAaaBBBB - out = _mm_or_si128(out, _mm_slli_epi32(_mm_and_si128(in, maskB1), 12)); + const __m128i out = _mm_madd_epi16(merge_ab_and_bc, _mm_set1_epi32(0x00011000)); + // 00000000 JJJJJJjj KKKKkkkk LLllllll + // 00000000 GGGGGGgg HHHHhhhh IIiiiiii + // 00000000 DDDDDDdd EEEEeeee FFffffff + // 00000000 AAAAAAaa BBBBbbbb CCcccccc - out = _mm_or_si128(out, _mm_slli_epi32(in, 26)); - - // Reshuffle and repack into 12-byte output format: - return _mm_shuffle_epi8(out, _mm_setr_epi8( - 3, 2, 1, - 7, 6, 5, - 11, 10, 9, - 15, 14, 13, + // Pack bytes together: + return _mm_shuffle_epi8(out, _mm_setr_epi8( + 2, 1, 0, + 6, 5, 4, + 10, 9, 8, + 14, 13, 12, -1, -1, -1, -1)); + // 00000000 00000000 00000000 00000000 + // LLllllll KKKKkkkk JJJJJJjj IIiiiiii + // HHHHhhhh GGGGGGgg FFffffff EEEEeeee + // DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa }