-
Notifications
You must be signed in to change notification settings - Fork 165
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
#98: NEON64: enc: convert full encoding loop to inline assembly
Convert the full encoding loop to an inline assembly implementation for compilers that support inline assembly. The motivation for this change is issue #96: when optimization is turned off on recent versions of clang, the encoding table is sometimes not loaded into sequential registers. This happens despite taking pains to ensure that the compiler uses an explicit set of registers for the load (v8..v11). This leaves us with not much options beside rewriting the full encoding loop in inline assembly. Only that way can we be absolutely certain that the correct registers are used. Thankfully, AArch64 assembly is not very difficult to write by hand. Fixes #96. Closes #97. Resolves #98.
- Loading branch information
Showing
3 changed files
with
174 additions
and
69 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,167 @@ | ||
// Apologies in advance for combining the preprocessor with inline assembly, | ||
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of | ||
// code repetition. The preprocessor is used to template large sections of | ||
// inline assembly that differ only in the registers used. If the code was | ||
// written out by hand, it would become very large and hard to audit. | ||
|
||
// Generate a block of inline assembly that loads three user-defined registers | ||
// P, Q, R from memory and deinterleaves them, post-incrementing the src | ||
// pointer. The register set should be sequential. | ||
#define LOAD(P, Q, R) \ | ||
"ld3 {"P".16b, "Q".16b, "R".16b}, [%[src]], #48 \n\t" | ||
|
||
// Generate a block of inline assembly that takes three deinterleaved registers | ||
// and shuffles the bytes. The output is in temporary registers t0..t3. | ||
#define SHUF(P, Q, R) \ | ||
"ushr %[t0].16b, "P".16b, #2 \n\t" \ | ||
"ushr %[t1].16b, "Q".16b, #4 \n\t" \ | ||
"ushr %[t2].16b, "R".16b, #6 \n\t" \ | ||
"sli %[t1].16b, "P".16b, #4 \n\t" \ | ||
"sli %[t2].16b, "Q".16b, #2 \n\t" \ | ||
"and %[t1].16b, %[t1].16b, %[n63].16b \n\t" \ | ||
"and %[t2].16b, %[t2].16b, %[n63].16b \n\t" \ | ||
"and %[t3].16b, "R".16b, %[n63].16b \n\t" | ||
|
||
// Generate a block of inline assembly that takes temporary registers t0..t3 | ||
// and translates them to the base64 alphabet, using a table loaded into | ||
// v8..v11. The output is in user-defined registers P..S. | ||
#define TRAN(P, Q, R, S) \ | ||
"tbl "P".16b, {v8.16b-v11.16b}, %[t0].16b \n\t" \ | ||
"tbl "Q".16b, {v8.16b-v11.16b}, %[t1].16b \n\t" \ | ||
"tbl "R".16b, {v8.16b-v11.16b}, %[t2].16b \n\t" \ | ||
"tbl "S".16b, {v8.16b-v11.16b}, %[t3].16b \n\t" | ||
|
||
// Generate a block of inline assembly that interleaves four registers and | ||
// stores them, post-incrementing the destination pointer. | ||
#define STOR(P, Q, R, S) \ | ||
"st4 {"P".16b, "Q".16b, "R".16b, "S".16b}, [%[dst]], #64 \n\t" | ||
|
||
// Generate a block of inline assembly that generates a single self-contained | ||
// encoder round: fetch the data, process it, and store the result. | ||
#define ROUND() \ | ||
LOAD("v12", "v13", "v14") \ | ||
SHUF("v12", "v13", "v14") \ | ||
TRAN("v12", "v13", "v14", "v15") \ | ||
STOR("v12", "v13", "v14", "v15") | ||
|
||
// Generate a block of assembly that generates a type A interleaved encoder | ||
// round. It uses registers that were loaded by the previous type B round, and | ||
// in turn loads registers for the next type B round. | ||
#define ROUND_A() \ | ||
SHUF("v2", "v3", "v4") \ | ||
LOAD("v12", "v13", "v14") \ | ||
TRAN("v2", "v3", "v4", "v5") \ | ||
STOR("v2", "v3", "v4", "v5") | ||
|
||
// Type B interleaved encoder round. Same as type A, but register sets swapped. | ||
#define ROUND_B() \ | ||
SHUF("v12", "v13", "v14") \ | ||
LOAD("v2", "v3", "v4") \ | ||
TRAN("v12", "v13", "v14", "v15") \ | ||
STOR("v12", "v13", "v14", "v15") | ||
|
||
// The first type A round needs to load its own registers. | ||
#define ROUND_A_FIRST() \ | ||
LOAD("v2", "v3", "v4") \ | ||
ROUND_A() | ||
|
||
// The last type B round omits the load for the next step. | ||
#define ROUND_B_LAST() \ | ||
SHUF("v12", "v13", "v14") \ | ||
TRAN("v12", "v13", "v14", "v15") \ | ||
STOR("v12", "v13", "v14", "v15") | ||
|
||
// Suppress clang's warning that the literal string in the asm statement is | ||
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99 | ||
// compilers). It may be true, but the goal here is not C99 portability. | ||
#pragma GCC diagnostic push | ||
#pragma GCC diagnostic ignored "-Woverlength-strings" | ||
|
||
static inline void | ||
enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) | ||
{ | ||
size_t rounds = *slen / 48; | ||
|
||
if (rounds == 0) { | ||
return; | ||
} | ||
|
||
*slen -= rounds * 48; // 48 bytes consumed per round. | ||
*olen += rounds * 64; // 64 bytes produced per round. | ||
|
||
// Number of times to go through the 8x loop. | ||
size_t loops = rounds / 8; | ||
|
||
// Number of rounds remaining after the 8x loop. | ||
rounds %= 8; | ||
|
||
// Temporary registers, used as scratch space. | ||
uint8x16_t tmp0, tmp1, tmp2, tmp3; | ||
|
||
__asm__ volatile ( | ||
|
||
// Load the encoding table into v8..v11. | ||
" ld1 {v8.16b-v11.16b}, [%[tbl]] \n\t" | ||
|
||
// If there are eight rounds or more, enter an 8x unrolled loop | ||
// of interleaved encoding rounds. The rounds interleave memory | ||
// operations (load/store) with data operations to maximize | ||
// pipeline throughput. | ||
" cbz %[loops], 4f \n\t" | ||
|
||
// The SIMD instructions do not touch the flags. | ||
"88: subs %[loops], %[loops], #1 \n\t" | ||
" " ROUND_A_FIRST() | ||
" " ROUND_B() | ||
" " ROUND_A() | ||
" " ROUND_B() | ||
" " ROUND_A() | ||
" " ROUND_B() | ||
" " ROUND_A() | ||
" " ROUND_B_LAST() | ||
" b.ne 88b \n\t" | ||
// Enter a 4x unrolled loop for rounds of 4 or more. | ||
"4: cmp %[rounds], #4 \n\t" | ||
" b.lt 30f \n\t" | ||
" " ROUND_A_FIRST() | ||
" " ROUND_B() | ||
" " ROUND_A() | ||
" " ROUND_B_LAST() | ||
" sub %[rounds], %[rounds], #4 \n\t" | ||
|
||
// Dispatch the remaining rounds 0..3. | ||
"30: cbz %[rounds], 0f \n\t" | ||
" cmp %[rounds], #2 \n\t" | ||
" b.eq 2f \n\t" | ||
" b.lt 1f \n\t" | ||
|
||
// Block of non-interlaced encoding rounds, which can each | ||
// individually be jumped to. Rounds fall through to the next. | ||
"3: " ROUND() | ||
"2: " ROUND() | ||
"1: " ROUND() | ||
"0: \n\t" | ||
|
||
// Outputs (modified). | ||
: [loops] "+r" (loops), | ||
[src] "+r" (*s), | ||
[dst] "+r" (*o), | ||
[t0] "=&w" (tmp0), | ||
[t1] "=&w" (tmp1), | ||
[t2] "=&w" (tmp2), | ||
[t3] "=&w" (tmp3) | ||
// Inputs (not modified). | ||
: [rounds] "r" (rounds), | ||
[tbl] "r" (base64_table_enc_6bit), | ||
[n63] "w" (vdupq_n_u8(63)) | ||
|
||
// Clobbers. | ||
: "v2", "v3", "v4", "v5", | ||
"v8", "v9", "v10", "v11", | ||
"v12", "v13", "v14", "v15" | ||
); | ||
} | ||
|
||
#pragma GCC diagnostic pop |