diff --git a/Makefile.am b/Makefile.am index 0fcc675b1..d8fce4b30 100644 --- a/Makefile.am +++ b/Makefile.am @@ -13,21 +13,10 @@ INCLUDES = $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) bin_PROGRAMS = minerd -dist_man_MANS = minerd.1 - minerd_SOURCES = elist.h miner.h compat.h \ cpu-miner.c util.c \ - sha2.c scrypt.c -if ARCH_x86 -minerd_SOURCES += sha2-x86.S scrypt-x86.S -endif -if ARCH_x86_64 -minerd_SOURCES += sha2-x64.S scrypt-x64.S -endif -if ARCH_ARM -minerd_SOURCES += sha2-arm.S scrypt-arm.S -endif -minerd_SOURCES += yacoin.c scrypt-jane/scrypt-jane.c + sha2.c sha2-arm.S sha2-x86.S sha2-x64.S \ + scrypt.c scrypt-arm.S scrypt-x86.S scrypt-x64.S scrypt-jane.c minerd_LDFLAGS = $(PTHREAD_FLAGS) minerd_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ minerd_CPPFLAGS = @LIBCURL_CPPFLAGS@ -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME diff --git a/NEWS b/NEWS index f9abc5aba..69d6c81c1 100644 --- a/NEWS +++ b/NEWS @@ -1,9 +1,3 @@ -Version 2.3.2 - Jul 10, 2013 - -- Add optimizations for AVX2-capable x86-64 processors -- Ensure that the output stream is flushed after every log message -- Fix an undefined-behavior bug in the Stratum code - Version 2.3.1 - Jun 18, 2013 - Add a --cert option for specifying an SSL certificate (martinwguy) diff --git a/scrypt-jane/code/scrypt-conf.h b/code/scrypt-conf.h similarity index 100% rename from scrypt-jane/code/scrypt-conf.h rename to code/scrypt-conf.h diff --git a/scrypt-jane/code/scrypt-jane-chacha.h b/code/scrypt-jane-chacha.h similarity index 86% rename from scrypt-jane/code/scrypt-jane-chacha.h rename to code/scrypt-jane-chacha.h index 8e403c2ae..c4d44c24b 100644 --- a/scrypt-jane/code/scrypt-jane-chacha.h +++ b/code/scrypt-jane-chacha.h @@ -18,6 +18,10 @@ typedef uint32_t scrypt_mix_word_t; #if defined(SCRYPT_CHACHA_AVX) #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx + #if defined(X86_INTRINSIC_AVX) + #define SCRYPT_CHUNKMIX_1_FN scrypt_ChunkMix_avx_1 + #define SCRYPT_CHUNKMIX_1_XOR_FN scrypt_ChunkMix_avx_1_xor + #endif #define SCRYPT_ROMIX_FN scrypt_ROMix_avx #define SCRYPT_MIX_FN chacha_core_avx #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop @@ -27,6 +31,10 @@ typedef uint32_t scrypt_mix_word_t; #if defined(SCRYPT_CHACHA_SSSE3) #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3 + #if defined(X86_INTRINSIC_SSSE3) + #define SCRYPT_CHUNKMIX_1_FN scrypt_ChunkMix_ssse3_1 + #define SCRYPT_CHUNKMIX_1_XOR_FN scrypt_ChunkMix_ssse3_1_xor + #endif #define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3 #define SCRYPT_MIX_FN chacha_core_ssse3 #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop @@ -36,6 +44,10 @@ typedef uint32_t scrypt_mix_word_t; #if defined(SCRYPT_CHACHA_SSE2) #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2 + #if defined(X86_INTRINSIC_SSE2) + #define SCRYPT_CHUNKMIX_1_FN scrypt_ChunkMix_sse2_1 + #define SCRYPT_CHUNKMIX_1_XOR_FN scrypt_ChunkMix_sse2_1_xor + #endif #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2 #define SCRYPT_MIX_FN chacha_core_sse2 #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop @@ -52,7 +64,7 @@ typedef uint32_t scrypt_mix_word_t; #if !defined(SCRYPT_CHOOSE_COMPILETIME) static scrypt_ROMixfn -scrypt_getROMix(void) { +scrypt_getROMix() { size_t cpuflags = detect_cpu(); #if defined(SCRYPT_CHACHA_AVX) @@ -80,7 +92,7 @@ scrypt_getROMix(void) { #if defined(SCRYPT_TEST_SPEED) static size_t -available_implementations(void) { +available_implementations() { size_t cpuflags = detect_cpu(); size_t flags = 0; @@ -104,7 +116,7 @@ available_implementations(void) { #endif static int -scrypt_test_mix(void) { 
+scrypt_test_mix() { static const uint8_t expected[16] = { 0x48,0x2b,0x2d,0xb8,0xa1,0x33,0x22,0x73,0xcd,0x16,0xc4,0xb4,0xb0,0x7f,0xb1,0x8a, }; diff --git a/scrypt-jane/code/scrypt-jane-hash.h b/code/scrypt-jane-hash.h similarity index 98% rename from scrypt-jane/code/scrypt-jane-hash.h rename to code/scrypt-jane-hash.h index e72781485..db5c1db3a 100644 --- a/scrypt-jane/code/scrypt-jane-hash.h +++ b/code/scrypt-jane-hash.h @@ -28,7 +28,7 @@ #define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */ static int -scrypt_test_hash(void) { +scrypt_test_hash() { scrypt_hash_state st; scrypt_hash_digest hash, final; uint8_t msg[SCRYPT_TEST_HASH_LEN]; diff --git a/scrypt-jane/code/scrypt-jane-hash_blake256.h b/code/scrypt-jane-hash_blake256.h similarity index 100% rename from scrypt-jane/code/scrypt-jane-hash_blake256.h rename to code/scrypt-jane-hash_blake256.h diff --git a/scrypt-jane/code/scrypt-jane-hash_blake512.h b/code/scrypt-jane-hash_blake512.h similarity index 100% rename from scrypt-jane/code/scrypt-jane-hash_blake512.h rename to code/scrypt-jane-hash_blake512.h diff --git a/scrypt-jane/code/scrypt-jane-hash_keccak.h b/code/scrypt-jane-hash_keccak.h similarity index 100% rename from scrypt-jane/code/scrypt-jane-hash_keccak.h rename to code/scrypt-jane-hash_keccak.h diff --git a/scrypt-jane/code/scrypt-jane-hash_sha256.h b/code/scrypt-jane-hash_sha256.h similarity index 100% rename from scrypt-jane/code/scrypt-jane-hash_sha256.h rename to code/scrypt-jane-hash_sha256.h diff --git a/scrypt-jane/code/scrypt-jane-hash_sha512.h b/code/scrypt-jane-hash_sha512.h similarity index 100% rename from scrypt-jane/code/scrypt-jane-hash_sha512.h rename to code/scrypt-jane-hash_sha512.h diff --git a/scrypt-jane/code/scrypt-jane-hash_skein512.h b/code/scrypt-jane-hash_skein512.h similarity index 100% rename from scrypt-jane/code/scrypt-jane-hash_skein512.h rename to code/scrypt-jane-hash_skein512.h diff --git a/scrypt-jane/code/scrypt-jane-mix_chacha-avx.h b/code/scrypt-jane-mix_chacha-avx.h similarity index 60% rename from scrypt-jane/code/scrypt-jane-mix_chacha-avx.h rename to code/scrypt-jane-mix_chacha-avx.h index 7409c91c2..17559d88a 100644 --- a/scrypt-jane/code/scrypt-jane-mix_chacha-avx.h +++ b/code/scrypt-jane-mix_chacha-avx.h @@ -1,5 +1,5 @@ /* x86 */ -#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) +#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) #define SCRYPT_CHACHA_AVX @@ -46,7 +46,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(vmovdqa xmm1,[ecx+esi+16]) a2(vmovdqa xmm2,[ecx+esi+32]) a2(vmovdqa xmm3,[ecx+esi+48]) - aj(jz scrypt_ChunkMix_avx_no_xor1) + a1(jz scrypt_ChunkMix_avx_no_xor1) a3(vpxor xmm0,xmm0,[ecx+eax+0]) a3(vpxor xmm1,xmm1,[ecx+eax+16]) a3(vpxor xmm2,xmm2,[ecx+eax+32]) @@ -60,7 +60,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpxor xmm1,xmm1,[esi+ecx+16]) a3(vpxor xmm2,xmm2,[esi+ecx+32]) a3(vpxor xmm3,xmm3,[esi+ecx+48]) - aj(jz scrypt_ChunkMix_avx_no_xor2) + a1(jz scrypt_ChunkMix_avx_no_xor2) a3(vpxor xmm0,xmm0,[eax+ecx+0]) a3(vpxor xmm1,xmm1,[eax+ecx+16]) a3(vpxor xmm2,xmm2,[eax+ecx+32]) @@ -111,7 +111,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpsrld xmm6,xmm1,25) a3(vpslld xmm1,xmm1,7) a3(vpxor xmm1,xmm1,xmm6) - aj(ja scrypt_chacha_avx_loop) + a1(ja scrypt_chacha_avx_loop) a3(vpaddd xmm0,xmm0,[esp+0]) a3(vpaddd xmm1,xmm1,[esp+16]) a3(vpaddd xmm2,xmm2,[esp+32]) @@ -128,7 +128,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(vmovdqa 
[eax+32],xmm2) a2(vmovdqa [eax+48],xmm3) a2(mov eax,[ebp+28]) - aj(jne scrypt_ChunkMix_avx_loop) + a1(jne scrypt_ChunkMix_avx_loop) a2(mov esp,ebp) a1(pop ebp) a1(pop esi) @@ -142,7 +142,7 @@ asm_naked_fn_end(scrypt_ChunkMix_avx) /* x64 */ -#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) +#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) #define SCRYPT_CHACHA_AVX @@ -160,15 +160,15 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(vmovdqa xmm3,[rax+48]) a2(mov r8, 0x0504070601000302) a2(mov rax, 0x0d0c0f0e09080b0a) - a2(movd xmm4, r8) - a2(movd xmm6, rax) + a2(movq xmm4, r8) + a2(movq xmm6, rax) a2(mov r8, 0x0605040702010003) a2(mov rax, 0x0e0d0c0f0a09080b) - a2(movd xmm5, r8) - a2(movd xmm7, rax) + a2(movq xmm5, r8) + a2(movq xmm7, rax) a3(vpunpcklqdq xmm4, xmm4, xmm6) a3(vpunpcklqdq xmm5, xmm5, xmm7) - aj(jz scrypt_ChunkMix_avx_no_xor1) + a1(jz scrypt_ChunkMix_avx_no_xor1) a3(vpxor xmm0,xmm0,[r9+0]) a3(vpxor xmm1,xmm1,[r9+16]) a3(vpxor xmm2,xmm2,[r9+32]) @@ -182,7 +182,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpxor xmm1,xmm1,[rsi+r9+16]) a3(vpxor xmm2,xmm2,[rsi+r9+32]) a3(vpxor xmm3,xmm3,[rsi+r9+48]) - aj(jz scrypt_ChunkMix_avx_no_xor2) + a1(jz scrypt_ChunkMix_avx_no_xor2) a3(vpxor xmm0,xmm0,[rdx+r9+0]) a3(vpxor xmm1,xmm1,[rdx+r9+16]) a3(vpxor xmm2,xmm2,[rdx+r9+32]) @@ -233,7 +233,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpsrld xmm12,xmm1,25) a3(vpslld xmm1,xmm1,7) a3(vpxor xmm1,xmm1,xmm12) - aj(ja scrypt_chacha_avx_loop) + a1(ja scrypt_chacha_avx_loop) a3(vpaddd xmm0,xmm0,xmm8) a3(vpaddd xmm1,xmm1,xmm9) a3(vpaddd xmm2,xmm2,xmm10) @@ -249,7 +249,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(vmovdqa [rax+16],xmm1) a2(vmovdqa [rax+32],xmm2) a2(vmovdqa [rax+48],xmm3) - aj(jne scrypt_ChunkMix_avx_loop) + a1(jne scrypt_ChunkMix_avx_loop) a1(ret) asm_naked_fn_end(scrypt_ChunkMix_avx) @@ -261,7 +261,7 @@ asm_naked_fn_end(scrypt_ChunkMix_avx) #define SCRYPT_CHACHA_AVX -static void asm_calling_convention NOINLINE +static void NOINLINE scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { uint32_t i, blocksPerChunk = r * 2, half = 0; xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; @@ -311,8 +311,9 @@ scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]* x3 = _mm_shuffle_epi8(x3, x4); x2 = _mm_add_epi32(x2, x3); x1 = _mm_xor_si128(x1, x2); - x6 = x1; - x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20)); + x6 = _mm_srli_epi32(x1, 20); + x1 = _mm_slli_epi32(x1, 12); + x1 = _mm_or_si128(x1, x6); x0 = _mm_add_epi32(x0, x1); x3 = _mm_xor_si128(x3, x0); x3 = _mm_shuffle_epi8(x3, x5); @@ -321,15 +322,17 @@ scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]* x3 = _mm_shuffle_epi32(x3, 0x4e); x1 = _mm_xor_si128(x1, x2); x2 = _mm_shuffle_epi32(x2, 0x39); - x6 = x1; - x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25)); + x6 = _mm_srli_epi32(x1, 25); + x1 = _mm_slli_epi32(x1, 7); + x1 = _mm_or_si128(x1, x6); x0 = _mm_add_epi32(x0, x1); x3 = _mm_xor_si128(x3, x0); x3 = _mm_shuffle_epi8(x3, x4); x2 = _mm_add_epi32(x2, x3); x1 = _mm_xor_si128(x1, x2); - x6 = x1; - x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20)); + x6 = _mm_srli_epi32(x1, 20); + x1 = _mm_slli_epi32(x1, 12); + x1 = _mm_or_si128(x1, x6); x0 = _mm_add_epi32(x0, x1); x3 = _mm_xor_si128(x3, x0); x3 = _mm_shuffle_epi8(x3, x5); @@ -338,8 +341,201 @@ 
scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]* x3 = _mm_shuffle_epi32(x3, 0x4e); x1 = _mm_xor_si128(x1, x2); x2 = _mm_shuffle_epi32(x2, 0x93); - x6 = x1; - x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25)); + x6 = _mm_srli_epi32(x1, 25); + x1 = _mm_slli_epi32(x1, 7); + x1 = _mm_or_si128(x1, x6); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +/* + * Special version with r = 1 and no XORing + * - mikaelh + */ +static void NOINLINE +scrypt_ChunkMix_avx_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) { + const uint32_t r = 1; + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; + const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x4); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x6 = _mm_srli_epi32(x1, 20); + x1 = _mm_slli_epi32(x1, 12); + x1 = _mm_or_si128(x1, x6); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x5); + x0 = _mm_shuffle_epi32(x0, 0x93); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x39); + x6 = _mm_srli_epi32(x1, 25); + x1 = _mm_slli_epi32(x1, 7); + x1 = _mm_or_si128(x1, x6); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x4); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x6 = _mm_srli_epi32(x1, 20); + x1 = _mm_slli_epi32(x1, 12); + x1 = _mm_or_si128(x1, x6); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x5); + x0 = _mm_shuffle_epi32(x0, 0x39); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x93); + x6 = _mm_srli_epi32(x1, 25); + x1 = _mm_slli_epi32(x1, 7); + x1 = _mm_or_si128(x1, x6); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +/* + * Special version with r = 1 and unconditional XORing + * - mikaelh + */ +static void NOINLINE +scrypt_ChunkMix_avx_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) { + const uint32_t r = 1; + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; + const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; + 
size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x4); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x6 = _mm_srli_epi32(x1, 20); + x1 = _mm_slli_epi32(x1, 12); + x1 = _mm_or_si128(x1, x6); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x5); + x0 = _mm_shuffle_epi32(x0, 0x93); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x39); + x6 = _mm_srli_epi32(x1, 25); + x1 = _mm_slli_epi32(x1, 7); + x1 = _mm_or_si128(x1, x6); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x4); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x6 = _mm_srli_epi32(x1, 20); + x1 = _mm_slli_epi32(x1, 12); + x1 = _mm_or_si128(x1, x6); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x5); + x0 = _mm_shuffle_epi32(x0, 0x39); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x93); + x6 = _mm_srli_epi32(x1, 25); + x1 = _mm_slli_epi32(x1, 7); + x1 = _mm_or_si128(x1, x6); } x0 = _mm_add_epi32(x0, t0); diff --git a/scrypt-jane/code/scrypt-jane-mix_chacha-sse2.h b/code/scrypt-jane-mix_chacha-sse2.h similarity index 55% rename from scrypt-jane/code/scrypt-jane-mix_chacha-sse2.h rename to code/scrypt-jane-mix_chacha-sse2.h index 4a0125621..8f79decde 100644 --- a/scrypt-jane/code/scrypt-jane-mix_chacha-sse2.h +++ b/code/scrypt-jane-mix_chacha-sse2.h @@ -1,5 +1,5 @@ /* x86 */ -#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) +#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) #define SCRYPT_CHACHA_SSE2 @@ -24,7 +24,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa xmm1,[ecx+esi+16]) a2(movdqa xmm2,[ecx+esi+32]) a2(movdqa xmm3,[ecx+esi+48]) - aj(jz scrypt_ChunkMix_sse2_no_xor1) + a1(jz scrypt_ChunkMix_sse2_no_xor1) a2(pxor xmm0,[ecx+eax+0]) a2(pxor xmm1,[ecx+eax+16]) a2(pxor xmm2,[ecx+eax+32]) @@ -38,7 +38,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pxor xmm1,[esi+ecx+16]) a2(pxor xmm2,[esi+ecx+32]) a2(pxor xmm3,[esi+ecx+48]) - aj(jz scrypt_ChunkMix_sse2_no_xor2) + a1(jz scrypt_ChunkMix_sse2_no_xor2) a2(pxor xmm0,[eax+ecx+0]) a2(pxor xmm1,[eax+ecx+16]) a2(pxor xmm2,[eax+ecx+32]) @@ -105,7 +105,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pslld xmm1,7) a2(psrld xmm6,25) a2(pxor xmm1,xmm6) - aj(ja scrypt_chacha_sse2_loop) + a1(ja scrypt_chacha_sse2_loop) 
a2(paddd xmm0,[esp+0]) a2(paddd xmm1,xmm4) a2(paddd xmm2,xmm5) @@ -122,7 +122,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa [eax+32],xmm2) a2(movdqa [eax+48],xmm3) a2(mov eax,[ebp+28]) - aj(jne scrypt_ChunkMix_sse2_loop) + a1(jne scrypt_ChunkMix_sse2_loop) a2(mov esp,ebp) a1(pop ebp) a1(pop esi) @@ -136,7 +136,7 @@ asm_naked_fn_end(scrypt_ChunkMix_sse2) /* x64 */ -#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) +#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) #define SCRYPT_CHACHA_SSE2 @@ -152,7 +152,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa xmm1,[rax+16]) a2(movdqa xmm2,[rax+32]) a2(movdqa xmm3,[rax+48]) - aj(jz scrypt_ChunkMix_sse2_no_xor1) + a1(jz scrypt_ChunkMix_sse2_no_xor1) a2(pxor xmm0,[r9+0]) a2(pxor xmm1,[r9+16]) a2(pxor xmm2,[r9+32]) @@ -166,7 +166,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pxor xmm1,[rsi+r9+16]) a2(pxor xmm2,[rsi+r9+32]) a2(pxor xmm3,[rsi+r9+48]) - aj(jz scrypt_ChunkMix_sse2_no_xor2) + a1(jz scrypt_ChunkMix_sse2_no_xor2) a2(pxor xmm0,[rdx+r9+0]) a2(pxor xmm1,[rdx+r9+16]) a2(pxor xmm2,[rdx+r9+32]) @@ -233,7 +233,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pslld xmm1,7) a2(psrld xmm6,25) a2(pxor xmm1,xmm6) - aj(ja scrypt_chacha_sse2_loop) + a1(ja scrypt_chacha_sse2_loop) a2(paddd xmm0,xmm8) a2(paddd xmm1,xmm9) a2(paddd xmm2,xmm10) @@ -249,7 +249,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa [rax+16],xmm1) a2(movdqa [rax+32],xmm2) a2(movdqa [rax+48],xmm3) - aj(jne scrypt_ChunkMix_sse2_loop) + a1(jne scrypt_ChunkMix_sse2_loop) a1(ret) asm_naked_fn_end(scrypt_ChunkMix_sse2) @@ -261,7 +261,7 @@ asm_naked_fn_end(scrypt_ChunkMix_sse2) #define SCRYPT_CHACHA_SSE2 -static void NOINLINE asm_calling_convention +static void NOINLINE scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { uint32_t i, blocksPerChunk = r * 2, half = 0; xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3; @@ -308,41 +308,255 @@ scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes] x0 = _mm_add_epi32(x0, x1); x3 = _mm_xor_si128(x3, x0); x4 = x3; - x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16)); + x3 = _mm_slli_epi32(x3, 16); + x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16)); x2 = _mm_add_epi32(x2, x3); x1 = _mm_xor_si128(x1, x2); x4 = x1; - x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x4, 20)); + x1 = _mm_slli_epi32(x1, 12); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20)); x0 = _mm_add_epi32(x0, x1); x3 = _mm_xor_si128(x3, x0); x4 = x3; - x3 = _mm_or_si128(_mm_slli_epi32(x3, 8), _mm_srli_epi32(x4, 24)); + x3 = _mm_slli_epi32(x3, 8); + x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24)); x0 = _mm_shuffle_epi32(x0, 0x93); x2 = _mm_add_epi32(x2, x3); x3 = _mm_shuffle_epi32(x3, 0x4e); x1 = _mm_xor_si128(x1, x2); x2 = _mm_shuffle_epi32(x2, 0x39); x4 = x1; - x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x4, 25)); + x1 = _mm_slli_epi32(x1, 7); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25)); x0 = _mm_add_epi32(x0, x1); x3 = _mm_xor_si128(x3, x0); x4 = x3; - x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16)); + x3 = _mm_slli_epi32(x3, 16); + x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16)); x2 = _mm_add_epi32(x2, x3); x1 = _mm_xor_si128(x1, x2); x4 = x1; - x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x4, 20)); + x1 = _mm_slli_epi32(x1, 12); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20)); x0 = 
_mm_add_epi32(x0, x1); x3 = _mm_xor_si128(x3, x0); x4 = x3; - x3 = _mm_or_si128(_mm_slli_epi32(x3, 8), _mm_srli_epi32(x4, 24)); + x3 = _mm_slli_epi32(x3, 8); + x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24)); x0 = _mm_shuffle_epi32(x0, 0x39); x2 = _mm_add_epi32(x2, x3); x3 = _mm_shuffle_epi32(x3, 0x4e); x1 = _mm_xor_si128(x1, x2); x2 = _mm_shuffle_epi32(x2, 0x93); x4 = x1; - x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x4, 25)); + x1 = _mm_slli_epi32(x1, 7); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25)); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +/* + * Special version with r = 1 and no XORing + * - mikaelh + */ +static void NOINLINE +scrypt_ChunkMix_sse2_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) { + const uint32_t r = 1; + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x4 = x3; + x3 = _mm_slli_epi32(x3, 16); + x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16)); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x4 = x1; + x1 = _mm_slli_epi32(x1, 12); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x4 = x3; + x3 = _mm_slli_epi32(x3, 8); + x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24)); + x0 = _mm_shuffle_epi32(x0, 0x93); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x39); + x4 = x1; + x1 = _mm_slli_epi32(x1, 7); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x4 = x3; + x3 = _mm_slli_epi32(x3, 16); + x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16)); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x4 = x1; + x1 = _mm_slli_epi32(x1, 12); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x4 = x3; + x3 = _mm_slli_epi32(x3, 8); + x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24)); + x0 = _mm_shuffle_epi32(x0, 0x39); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x93); + x4 = x1; + x1 = _mm_slli_epi32(x1, 7); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25)); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +/* + * Special version with r = 1 and unconditional XORing + * - mikaelh + */ +static void NOINLINE 
+scrypt_ChunkMix_sse2_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) { + const uint32_t r = 1; + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x4 = x3; + x3 = _mm_slli_epi32(x3, 16); + x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16)); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x4 = x1; + x1 = _mm_slli_epi32(x1, 12); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x4 = x3; + x3 = _mm_slli_epi32(x3, 8); + x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24)); + x0 = _mm_shuffle_epi32(x0, 0x93); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x39); + x4 = x1; + x1 = _mm_slli_epi32(x1, 7); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x4 = x3; + x3 = _mm_slli_epi32(x3, 16); + x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16)); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x4 = x1; + x1 = _mm_slli_epi32(x1, 12); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x4 = x3; + x3 = _mm_slli_epi32(x3, 8); + x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24)); + x0 = _mm_shuffle_epi32(x0, 0x39); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x93); + x4 = x1; + x1 = _mm_slli_epi32(x1, 7); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25)); } x0 = _mm_add_epi32(x0, t0); diff --git a/scrypt-jane/code/scrypt-jane-mix_chacha-ssse3.h b/code/scrypt-jane-mix_chacha-ssse3.h similarity index 59% rename from scrypt-jane/code/scrypt-jane-mix_chacha-ssse3.h rename to code/scrypt-jane-mix_chacha-ssse3.h index e0d4184d2..6a80cac5b 100644 --- a/scrypt-jane/code/scrypt-jane-mix_chacha-ssse3.h +++ b/code/scrypt-jane-mix_chacha-ssse3.h @@ -1,5 +1,5 @@ /* x86 */ -#if defined(X86ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) +#if defined(X86ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) #define SCRYPT_CHACHA_SSSE3 @@ -46,7 +46,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(movdqa xmm1,[ecx+esi+16]) a2(movdqa xmm2,[ecx+esi+32]) a2(movdqa xmm3,[ecx+esi+48]) - aj(jz scrypt_ChunkMix_ssse3_no_xor1) + a1(jz scrypt_ChunkMix_ssse3_no_xor1) a2(pxor xmm0,[ecx+eax+0]) a2(pxor xmm1,[ecx+eax+16]) a2(pxor 
xmm2,[ecx+eax+32]) @@ -60,7 +60,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(pxor xmm1,[esi+ecx+16]) a2(pxor xmm2,[esi+ecx+32]) a2(pxor xmm3,[esi+ecx+48]) - aj(jz scrypt_ChunkMix_ssse3_no_xor2) + a1(jz scrypt_ChunkMix_ssse3_no_xor2) a2(pxor xmm0,[eax+ecx+0]) a2(pxor xmm1,[eax+ecx+16]) a2(pxor xmm2,[eax+ecx+32]) @@ -115,7 +115,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(pslld xmm1,7) a2(psrld xmm6,25) a2(pxor xmm1,xmm6) - aj(ja scrypt_chacha_ssse3_loop) + a1(ja scrypt_chacha_ssse3_loop) a2(paddd xmm0,[esp+0]) a2(paddd xmm1,[esp+16]) a2(paddd xmm2,[esp+32]) @@ -132,7 +132,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(movdqa [eax+32],xmm2) a2(movdqa [eax+48],xmm3) a2(mov eax,[ebp+28]) - aj(jne scrypt_ChunkMix_ssse3_loop) + a1(jne scrypt_ChunkMix_ssse3_loop) a2(mov esp,ebp) a1(pop ebp) a1(pop esi) @@ -146,7 +146,7 @@ asm_naked_fn_end(scrypt_ChunkMix_ssse3) /* x64 */ -#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) +#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) #define SCRYPT_CHACHA_SSSE3 @@ -164,15 +164,15 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(movdqa xmm3,[rax+48]) a2(mov r8, 0x0504070601000302) a2(mov rax, 0x0d0c0f0e09080b0a) - a2(movd xmm4, r8) - a2(movd xmm6, rax) + a2(movq xmm4, r8) + a2(movq xmm6, rax) a2(mov r8, 0x0605040702010003) a2(mov rax, 0x0e0d0c0f0a09080b) - a2(movd xmm5, r8) - a2(movd xmm7, rax) + a2(movq xmm5, r8) + a2(movq xmm7, rax) a2(punpcklqdq xmm4, xmm6) a2(punpcklqdq xmm5, xmm7) - aj(jz scrypt_ChunkMix_ssse3_no_xor1) + a1(jz scrypt_ChunkMix_ssse3_no_xor1) a2(pxor xmm0,[r9+0]) a2(pxor xmm1,[r9+16]) a2(pxor xmm2,[r9+32]) @@ -186,7 +186,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(pxor xmm1,[rsi+r9+16]) a2(pxor xmm2,[rsi+r9+32]) a2(pxor xmm3,[rsi+r9+48]) - aj(jz scrypt_ChunkMix_ssse3_no_xor2) + a1(jz scrypt_ChunkMix_ssse3_no_xor2) a2(pxor xmm0,[rdx+r9+0]) a2(pxor xmm1,[rdx+r9+16]) a2(pxor xmm2,[rdx+r9+32]) @@ -241,7 +241,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(pslld xmm1,7) a2(psrld xmm12,25) a2(pxor xmm1,xmm12) - aj(ja scrypt_chacha_ssse3_loop) + a1(ja scrypt_chacha_ssse3_loop) a2(paddd xmm0,xmm8) a2(paddd xmm1,xmm9) a2(paddd xmm2,xmm10) @@ -257,7 +257,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(movdqa [rax+16],xmm1) a2(movdqa [rax+32],xmm2) a2(movdqa [rax+48],xmm3) - aj(jne scrypt_ChunkMix_ssse3_loop) + a1(jne scrypt_ChunkMix_ssse3_loop) a1(ret) asm_naked_fn_end(scrypt_ChunkMix_ssse3) @@ -269,7 +269,7 @@ asm_naked_fn_end(scrypt_ChunkMix_ssse3) #define SCRYPT_CHACHA_SSSE3 -static void NOINLINE asm_calling_convention +static void NOINLINE scrypt_ChunkMix_ssse3(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { uint32_t i, blocksPerChunk = r * 2, half = 0; xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; @@ -320,7 +320,8 @@ scrypt_ChunkMix_ssse3(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes x2 = _mm_add_epi32(x2, x3); x1 = _mm_xor_si128(x1, x2); x6 = x1; - x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20)); + x1 = _mm_slli_epi32(x1, 12); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20)); x0 = _mm_add_epi32(x0, x1); x3 = _mm_xor_si128(x3, x0); x3 = _mm_shuffle_epi8(x3, x5); @@ -330,14 +331,16 @@ scrypt_ChunkMix_ssse3(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes x1 = _mm_xor_si128(x1, x2); x2 = _mm_shuffle_epi32(x2, 0x39); x6 = x1; - x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25)); + x1 = _mm_slli_epi32(x1, 7); + x1 = 
_mm_or_si128(x1, _mm_srli_epi32(x6, 25)); x0 = _mm_add_epi32(x0, x1); x3 = _mm_xor_si128(x3, x0); x3 = _mm_shuffle_epi8(x3, x4); x2 = _mm_add_epi32(x2, x3); x1 = _mm_xor_si128(x1, x2); x6 = x1; - x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20)); + x1 = _mm_slli_epi32(x1, 12); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20)); x0 = _mm_add_epi32(x0, x1); x3 = _mm_xor_si128(x3, x0); x3 = _mm_shuffle_epi8(x3, x5); @@ -347,7 +350,200 @@ scrypt_ChunkMix_ssse3(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes x1 = _mm_xor_si128(x1, x2); x2 = _mm_shuffle_epi32(x2, 0x93); x6 = x1; - x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25)); + x1 = _mm_slli_epi32(x1, 7); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25)); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +/* + * Special version with r = 1 and no XORing + * - mikaelh + */ +static void NOINLINE +scrypt_ChunkMix_ssse3_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) { + const uint32_t r = 1; + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; + const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x4); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x6 = x1; + x1 = _mm_slli_epi32(x1, 12); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x5); + x0 = _mm_shuffle_epi32(x0, 0x93); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x39); + x6 = x1; + x1 = _mm_slli_epi32(x1, 7); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x4); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x6 = x1; + x1 = _mm_slli_epi32(x1, 12); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x5); + x0 = _mm_shuffle_epi32(x0, 0x39); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x93); + x6 = x1; + x1 = _mm_slli_epi32(x1, 7); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25)); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +/* + * Special 
version with r = 1 and unconditional XORing + * - mikaelh + */ +static void NOINLINE +scrypt_ChunkMix_ssse3_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) { + const uint32_t r = 1; + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; + const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x4); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x6 = x1; + x1 = _mm_slli_epi32(x1, 12); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x5); + x0 = _mm_shuffle_epi32(x0, 0x93); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x39); + x6 = x1; + x1 = _mm_slli_epi32(x1, 7); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x4); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x6 = x1; + x1 = _mm_slli_epi32(x1, 12); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x5); + x0 = _mm_shuffle_epi32(x0, 0x39); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x93); + x6 = x1; + x1 = _mm_slli_epi32(x1, 7); + x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25)); } x0 = _mm_add_epi32(x0, t0); diff --git a/scrypt-jane/code/scrypt-jane-mix_chacha.h b/code/scrypt-jane-mix_chacha.h similarity index 100% rename from scrypt-jane/code/scrypt-jane-mix_chacha.h rename to code/scrypt-jane-mix_chacha.h diff --git a/scrypt-jane/code/scrypt-jane-mix_salsa-avx.h b/code/scrypt-jane-mix_salsa-avx.h similarity index 95% rename from scrypt-jane/code/scrypt-jane-mix_salsa-avx.h rename to code/scrypt-jane-mix_salsa-avx.h index 1b1b6085e..1ca90b5fa 100644 --- a/scrypt-jane/code/scrypt-jane-mix_salsa-avx.h +++ b/code/scrypt-jane-mix_salsa-avx.h @@ -1,5 +1,5 @@ /* x86 */ -#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) +#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) #define SCRYPT_SALSA_AVX @@ -24,7 +24,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(movdqa xmm1,[ecx+esi+16]) a2(movdqa xmm2,[ecx+esi+32]) a2(movdqa xmm3,[ecx+esi+48]) - aj(jz 
scrypt_ChunkMix_avx_no_xor1) + a1(jz scrypt_ChunkMix_avx_no_xor1) a3(vpxor xmm0,xmm0,[ecx+eax+0]) a3(vpxor xmm1,xmm1,[ecx+eax+16]) a3(vpxor xmm2,xmm2,[ecx+eax+32]) @@ -38,7 +38,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpxor xmm1,xmm1,[esi+ecx+16]) a3(vpxor xmm2,xmm2,[esi+ecx+32]) a3(vpxor xmm3,xmm3,[esi+ecx+48]) - aj(jz scrypt_ChunkMix_avx_no_xor2) + a1(jz scrypt_ChunkMix_avx_no_xor2) a3(vpxor xmm0,xmm0,[eax+ecx+0]) a3(vpxor xmm1,xmm1,[eax+ecx+16]) a3(vpxor xmm2,xmm2,[eax+ecx+32]) @@ -97,7 +97,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(pshufd xmm2, xmm2, 0x4e) a3(vpxor xmm0, xmm0, xmm4) a3(pshufd xmm3, xmm3, 0x39) - aj(ja scrypt_salsa_avx_loop) + a1(ja scrypt_salsa_avx_loop) a3(vpaddd xmm0,xmm0,[esp+0]) a3(vpaddd xmm1,xmm1,[esp+16]) a3(vpaddd xmm2,xmm2,xmm6) @@ -114,7 +114,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(vmovdqa [eax+32],xmm2) a2(vmovdqa [eax+48],xmm3) a2(mov eax,[ebp+28]) - aj(jne scrypt_ChunkMix_avx_loop) + a1(jne scrypt_ChunkMix_avx_loop) a2(mov esp,ebp) a1(pop ebp) a1(pop esi) @@ -128,7 +128,7 @@ asm_naked_fn_end(scrypt_ChunkMix_avx) /* x64 */ -#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) +#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) #define SCRYPT_SALSA_AVX @@ -144,7 +144,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(vmovdqa xmm1,[rax+16]) a2(vmovdqa xmm2,[rax+32]) a2(vmovdqa xmm3,[rax+48]) - aj(jz scrypt_ChunkMix_avx_no_xor1) + a1(jz scrypt_ChunkMix_avx_no_xor1) a3(vpxor xmm0,xmm0,[r9+0]) a3(vpxor xmm1,xmm1,[r9+16]) a3(vpxor xmm2,xmm2,[r9+32]) @@ -158,7 +158,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpxor xmm1,xmm1,[rsi+r9+16]) a3(vpxor xmm2,xmm2,[rsi+r9+32]) a3(vpxor xmm3,xmm3,[rsi+r9+48]) - aj(jz scrypt_ChunkMix_avx_no_xor2) + a1(jz scrypt_ChunkMix_avx_no_xor2) a3(vpxor xmm0,xmm0,[rdx+r9+0]) a3(vpxor xmm1,xmm1,[rdx+r9+16]) a3(vpxor xmm2,xmm2,[rdx+r9+32]) @@ -217,7 +217,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(pshufd xmm2, xmm2, 0x4e) a3(vpxor xmm0, xmm0, xmm4) a3(pshufd xmm3, xmm3, 0x39) - aj(ja scrypt_salsa_avx_loop) + a1(ja scrypt_salsa_avx_loop) a3(vpaddd xmm0,xmm0,xmm8) a3(vpaddd xmm1,xmm1,xmm9) a3(vpaddd xmm2,xmm2,xmm10) @@ -233,7 +233,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(vmovdqa [rax+16],xmm1) a2(vmovdqa [rax+32],xmm2) a2(vmovdqa [rax+48],xmm3) - aj(jne scrypt_ChunkMix_avx_loop) + a1(jne scrypt_ChunkMix_avx_loop) a1(ret) asm_naked_fn_end(scrypt_ChunkMix_avx) @@ -245,7 +245,7 @@ asm_naked_fn_end(scrypt_ChunkMix_avx) #define SCRYPT_SALSA_AVX -static void asm_calling_convention NOINLINE +static void NOINLINE scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { uint32_t i, blocksPerChunk = r * 2, half = 0; xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; diff --git a/scrypt-jane/code/scrypt-jane-mix_salsa-sse2.h b/code/scrypt-jane-mix_salsa-sse2.h similarity index 95% rename from scrypt-jane/code/scrypt-jane-mix_salsa-sse2.h rename to code/scrypt-jane-mix_salsa-sse2.h index a1274c34a..ecc5f0f8d 100644 --- a/scrypt-jane/code/scrypt-jane-mix_salsa-sse2.h +++ b/code/scrypt-jane-mix_salsa-sse2.h @@ -1,5 +1,5 @@ /* x86 */ -#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) +#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) #define SCRYPT_SALSA_SSE2 @@ -24,7 +24,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa 
xmm1,[ecx+esi+16]) a2(movdqa xmm2,[ecx+esi+32]) a2(movdqa xmm3,[ecx+esi+48]) - aj(jz scrypt_ChunkMix_sse2_no_xor1) + a1(jz scrypt_ChunkMix_sse2_no_xor1) a2(pxor xmm0,[ecx+eax+0]) a2(pxor xmm1,[ecx+eax+16]) a2(pxor xmm2,[ecx+eax+32]) @@ -38,7 +38,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pxor xmm1,[esi+ecx+16]) a2(pxor xmm2,[esi+ecx+32]) a2(pxor xmm3,[esi+ecx+48]) - aj(jz scrypt_ChunkMix_sse2_no_xor2) + a1(jz scrypt_ChunkMix_sse2_no_xor2) a2(pxor xmm0,[eax+ecx+0]) a2(pxor xmm1,[eax+ecx+16]) a2(pxor xmm2,[eax+ecx+32]) @@ -113,7 +113,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pxor xmm0, xmm4) a3(pshufd xmm3, xmm3, 0x39) a2(pxor xmm0, xmm5) - aj(ja scrypt_salsa_sse2_loop) + a1(ja scrypt_salsa_sse2_loop) a2(paddd xmm0,[esp+0]) a2(paddd xmm1,[esp+16]) a2(paddd xmm2,xmm6) @@ -130,7 +130,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa [eax+32],xmm2) a2(movdqa [eax+48],xmm3) a2(mov eax,[ebp+28]) - aj(jne scrypt_ChunkMix_sse2_loop) + a1(jne scrypt_ChunkMix_sse2_loop) a2(mov esp,ebp) a1(pop ebp) a1(pop esi) @@ -144,7 +144,7 @@ asm_naked_fn_end(scrypt_ChunkMix_sse2) /* x64 */ -#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) +#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) #define SCRYPT_SALSA_SSE2 @@ -160,7 +160,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa xmm1,[rax+16]) a2(movdqa xmm2,[rax+32]) a2(movdqa xmm3,[rax+48]) - aj(jz scrypt_ChunkMix_sse2_no_xor1) + a1(jz scrypt_ChunkMix_sse2_no_xor1) a2(pxor xmm0,[r9+0]) a2(pxor xmm1,[r9+16]) a2(pxor xmm2,[r9+32]) @@ -174,7 +174,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pxor xmm1,[rsi+r9+16]) a2(pxor xmm2,[rsi+r9+32]) a2(pxor xmm3,[rsi+r9+48]) - aj(jz scrypt_ChunkMix_sse2_no_xor2) + a1(jz scrypt_ChunkMix_sse2_no_xor2) a2(pxor xmm0,[rdx+r9+0]) a2(pxor xmm1,[rdx+r9+16]) a2(pxor xmm2,[rdx+r9+32]) @@ -249,7 +249,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pxor xmm0, xmm4) a3(pshufd xmm3, xmm3, 0x39) a2(pxor xmm0, xmm5) - aj(ja scrypt_salsa_sse2_loop) + a1(ja scrypt_salsa_sse2_loop) a2(paddd xmm0,xmm8) a2(paddd xmm1,xmm9) a2(paddd xmm2,xmm10) @@ -265,7 +265,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa [rax+16],xmm1) a2(movdqa [rax+32],xmm2) a2(movdqa [rax+48],xmm3) - aj(jne scrypt_ChunkMix_sse2_loop) + a1(jne scrypt_ChunkMix_sse2_loop) a1(ret) asm_naked_fn_end(scrypt_ChunkMix_sse2) @@ -277,7 +277,7 @@ asm_naked_fn_end(scrypt_ChunkMix_sse2) #define SCRYPT_SALSA_SSE2 -static void NOINLINE asm_calling_convention +static void NOINLINE scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { uint32_t i, blocksPerChunk = r * 2, half = 0; xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; diff --git a/scrypt-jane/code/scrypt-jane-mix_salsa.h b/code/scrypt-jane-mix_salsa.h similarity index 100% rename from scrypt-jane/code/scrypt-jane-mix_salsa.h rename to code/scrypt-jane-mix_salsa.h diff --git a/scrypt-jane/code/scrypt-jane-mix_salsa64-avx.h b/code/scrypt-jane-mix_salsa64-avx.h similarity index 97% rename from scrypt-jane/code/scrypt-jane-mix_salsa64-avx.h rename to code/scrypt-jane-mix_salsa64-avx.h index c7c7f5205..50c9902d5 100644 --- a/scrypt-jane/code/scrypt-jane-mix_salsa64-avx.h +++ b/code/scrypt-jane-mix_salsa64-avx.h @@ -1,5 +1,5 @@ /* x64 */ -#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) +#if defined(X86_64ASM_AVX) && 
(!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) #define SCRYPT_SALSA64_AVX @@ -23,7 +23,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(vmovdqa xmm5,[rax+80]) a2(vmovdqa xmm6,[rax+96]) a2(vmovdqa xmm7,[rax+112]) - aj(jz scrypt_ChunkMix_avx_no_xor1) + a1(jz scrypt_ChunkMix_avx_no_xor1) a3(vpxor xmm0,xmm0,[r9+0]) a3(vpxor xmm1,xmm1,[r9+16]) a3(vpxor xmm2,xmm2,[r9+32]) @@ -45,7 +45,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpxor xmm5,xmm5,[rsi+r9+80]) a3(vpxor xmm6,xmm6,[rsi+r9+96]) a3(vpxor xmm7,xmm7,[rsi+r9+112]) - aj(jz scrypt_ChunkMix_avx_no_xor2) + a1(jz scrypt_ChunkMix_avx_no_xor2) a3(vpxor xmm0,xmm0,[rdx+r9+0]) a3(vpxor xmm1,xmm1,[rdx+r9+16]) a3(vpxor xmm2,xmm2,[rdx+r9+32]) @@ -142,7 +142,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a4(vpalignr xmm3, xmm7, xmm6, 8) a4(vpalignr xmm6, xmm11, xmm10, 8) a4(vpalignr xmm7, xmm10, xmm11, 8) - aj(ja scrypt_salsa64_avx_loop) + a1(ja scrypt_salsa64_avx_loop) a3(vpaddq xmm0,xmm0,[rsp+0]) a3(vpaddq xmm1,xmm1,[rsp+16]) a3(vpaddq xmm2,xmm2,[rsp+32]) @@ -166,7 +166,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(vmovdqa [rax+80],xmm5) a2(vmovdqa [rax+96],xmm6) a2(vmovdqa [rax+112],xmm7) - aj(jne scrypt_ChunkMix_avx_loop) + a1(jne scrypt_ChunkMix_avx_loop) a2(mov rsp, rbp) a1(pop rbp) a1(ret) @@ -176,7 +176,7 @@ asm_naked_fn_end(scrypt_ChunkMix_avx) /* intrinsic */ -#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) +#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_AVX) #define SCRYPT_SALSA64_AVX diff --git a/scrypt-jane/code/scrypt-jane-mix_salsa64-sse2.h b/code/scrypt-jane-mix_salsa64-sse2.h similarity index 97% rename from scrypt-jane/code/scrypt-jane-mix_salsa64-sse2.h rename to code/scrypt-jane-mix_salsa64-sse2.h index e6f809a45..f8d957432 100644 --- a/scrypt-jane/code/scrypt-jane-mix_salsa64-sse2.h +++ b/code/scrypt-jane-mix_salsa64-sse2.h @@ -1,5 +1,5 @@ /* x64 */ -#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) +#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) #define SCRYPT_SALSA64_SSE2 @@ -23,7 +23,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa xmm5,[rax+80]) a2(movdqa xmm6,[rax+96]) a2(movdqa xmm7,[rax+112]) - aj(jz scrypt_ChunkMix_sse2_no_xor1) + a1(jz scrypt_ChunkMix_sse2_no_xor1) a2(pxor xmm0,[r9+0]) a2(pxor xmm1,[r9+16]) a2(pxor xmm2,[r9+32]) @@ -45,7 +45,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pxor xmm5,[rsi+r9+80]) a2(pxor xmm6,[rsi+r9+96]) a2(pxor xmm7,[rsi+r9+112]) - aj(jz scrypt_ChunkMix_sse2_no_xor2) + a1(jz scrypt_ChunkMix_sse2_no_xor2) a2(pxor xmm0,[rdx+r9+0]) a2(pxor xmm1,[rdx+r9+16]) a2(pxor xmm2,[rdx+r9+32]) @@ -186,7 +186,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(punpckhqdq xmm3, xmm11) a2(punpckhqdq xmm6, xmm9) a2(punpckhqdq xmm7, xmm8) - aj(ja scrypt_salsa64_sse2_loop) + a1(ja scrypt_salsa64_sse2_loop) a2(paddq xmm0,[rsp+0]) a2(paddq xmm1,[rsp+16]) a2(paddq xmm2,[rsp+32]) @@ -210,7 +210,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa [rax+80],xmm5) a2(movdqa [rax+96],xmm6) a2(movdqa [rax+112],xmm7) - aj(jne scrypt_ChunkMix_sse2_loop) + a1(jne scrypt_ChunkMix_sse2_loop) a2(mov rsp, rbp) a1(pop rbp) a1(ret) @@ -220,7 +220,7 @@ asm_naked_fn_end(scrypt_ChunkMix_sse2) /* intrinsic */ -#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) +#if 
defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSE2) #define SCRYPT_SALSA64_SSE2 diff --git a/scrypt-jane/code/scrypt-jane-mix_salsa64-ssse3.h b/code/scrypt-jane-mix_salsa64-ssse3.h similarity index 97% rename from scrypt-jane/code/scrypt-jane-mix_salsa64-ssse3.h rename to code/scrypt-jane-mix_salsa64-ssse3.h index d54ca450b..105efa83f 100644 --- a/scrypt-jane/code/scrypt-jane-mix_salsa64-ssse3.h +++ b/code/scrypt-jane-mix_salsa64-ssse3.h @@ -1,5 +1,5 @@ /* x64 */ -#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) +#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) #define SCRYPT_SALSA64_SSSE3 @@ -23,7 +23,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(movdqa xmm5,[rax+80]) a2(movdqa xmm6,[rax+96]) a2(movdqa xmm7,[rax+112]) - aj(jz scrypt_ChunkMix_ssse3_no_xor1) + a1(jz scrypt_ChunkMix_ssse3_no_xor1) a2(pxor xmm0,[r9+0]) a2(pxor xmm1,[r9+16]) a2(pxor xmm2,[r9+32]) @@ -45,7 +45,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(pxor xmm5,[rsi+r9+80]) a2(pxor xmm6,[rsi+r9+96]) a2(pxor xmm7,[rsi+r9+112]) - aj(jz scrypt_ChunkMix_ssse3_no_xor2) + a1(jz scrypt_ChunkMix_ssse3_no_xor2) a2(pxor xmm0,[rdx+r9+0]) a2(pxor xmm1,[rdx+r9+16]) a2(pxor xmm2,[rdx+r9+32]) @@ -174,7 +174,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(movdqa xmm7, xmm10) a3(palignr xmm6, xmm10, 8) a3(palignr xmm7, xmm11, 8) - aj(ja scrypt_salsa64_ssse3_loop) + a1(ja scrypt_salsa64_ssse3_loop) a2(paddq xmm0,[rsp+0]) a2(paddq xmm1,[rsp+16]) a2(paddq xmm2,[rsp+32]) @@ -198,7 +198,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(movdqa [rax+80],xmm5) a2(movdqa [rax+96],xmm6) a2(movdqa [rax+112],xmm7) - aj(jne scrypt_ChunkMix_ssse3_loop) + a1(jne scrypt_ChunkMix_ssse3_loop) a2(mov rsp, rbp) a1(pop rbp) a1(ret) @@ -208,7 +208,7 @@ asm_naked_fn_end(scrypt_ChunkMix_ssse3) /* intrinsic */ -#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) +#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSSE3) #define SCRYPT_SALSA64_SSSE3 diff --git a/scrypt-jane/code/scrypt-jane-mix_salsa64.h b/code/scrypt-jane-mix_salsa64.h similarity index 100% rename from scrypt-jane/code/scrypt-jane-mix_salsa64.h rename to code/scrypt-jane-mix_salsa64.h diff --git a/scrypt-jane/code/scrypt-jane-pbkdf2.h b/code/scrypt-jane-pbkdf2.h similarity index 68% rename from scrypt-jane/code/scrypt-jane-pbkdf2.h rename to code/scrypt-jane-pbkdf2.h index 711e3d633..53b2d947e 100644 --- a/scrypt-jane/code/scrypt-jane-pbkdf2.h +++ b/code/scrypt-jane-pbkdf2.h @@ -40,7 +40,9 @@ scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) { pad[i] ^= (0x5c ^ 0x36); scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE); +#ifdef SCRYPT_PREVENT_STATE_LEAK scrypt_ensure_zero(pad, sizeof(pad)); +#endif } static void @@ -59,7 +61,9 @@ scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) { scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash)); scrypt_hash_finish(&st->outer, mac); +#ifdef SCRYPT_PREVENT_STATE_LEAK scrypt_ensure_zero(st, sizeof(*st)); +#endif } static void @@ -105,8 +109,53 @@ scrypt_pbkdf2(const uint8_t *password, size_t password_len, const uint8_t *salt, bytes -= SCRYPT_HASH_DIGEST_SIZE; } +#ifdef SCRYPT_PREVENT_STATE_LEAK scrypt_ensure_zero(ti, sizeof(ti)); 
scrypt_ensure_zero(u, sizeof(u)); scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw)); scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt)); +#endif +} + +/* + * Special version where N = 1 + * - mikaelh + */ +static void +scrypt_pbkdf2_1(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out, size_t bytes) { + scrypt_hmac_state hmac_pw, hmac_pw_salt, work; + scrypt_hash_digest ti, u; + uint8_t be[4]; + uint32_t i, j, blocks; + uint64_t c; + + /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ + + /* hmac(password, ...) */ + scrypt_hmac_init(&hmac_pw, password, password_len); + + /* hmac(password, salt...) */ + hmac_pw_salt = hmac_pw; + scrypt_hmac_update(&hmac_pw_salt, salt, salt_len); + + blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; + for (i = 1; i <= blocks; i++) { + /* U1 = hmac(password, salt || be(i)) */ + U32TO8_BE(be, i); + work = hmac_pw_salt; + scrypt_hmac_update(&work, be, 4); + scrypt_hmac_finish(&work, ti); + memcpy(u, ti, sizeof(u)); + + memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes); + out += SCRYPT_HASH_DIGEST_SIZE; + bytes -= SCRYPT_HASH_DIGEST_SIZE; + } + +#ifdef SCRYPT_PREVENT_STATE_LEAK + scrypt_ensure_zero(ti, sizeof(ti)); + scrypt_ensure_zero(u, sizeof(u)); + scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw)); + scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt)); +#endif } diff --git a/scrypt-jane/code/scrypt-jane-portable-x86.h b/code/scrypt-jane-portable-x86.h similarity index 88% rename from scrypt-jane/code/scrypt-jane-portable-x86.h rename to code/scrypt-jane-portable-x86.h index 26fdc3f6a..4eb07f704 100644 --- a/scrypt-jane/code/scrypt-jane-portable-x86.h +++ b/code/scrypt-jane-portable-x86.h @@ -24,7 +24,7 @@ #endif #endif -#if defined(COMPILER_MSVC) && (defined(CPU_X86_FORCE_INTRINSICS) || defined(CPU_X86_64)) +#if defined(COMPILER_MSVC) #define X86_INTRINSIC #if defined(CPU_X86_64) || defined(X86ASM_SSE) #define X86_INTRINSIC_SSE @@ -37,6 +37,14 @@ #endif #endif +#if defined(COMPILER_MSVC) && defined(CPU_X86_64) + #define X86_64USE_INTRINSIC +#endif + +#if defined(COMPILER_MSVC) && defined(CPU_X86_64) + #define X86_64USE_INTRINSIC +#endif + #if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS) #define X86_INTRINSIC #if defined(__SSE__) @@ -51,6 +59,15 @@ #if defined(__AVX__) #define X86_INTRINSIC_AVX #endif + + /* HACK - I want to use CPU_X86_FORCE_INTRINSICS with mingw64 so these need to be undefined - mikaelh */ + #undef X86_64ASM_SSSE3 + #undef X86_64ASM_AVX + #undef X86_64ASM_SSE2 + #undef X86ASM_AVX + #undef X86ASM_SSSE3 + #undef X86ASM_SSE2 + #undef X86ASM_SSE #endif /* only use simd on windows (or SSE2 on gcc)! 
*/ @@ -72,6 +89,10 @@ #define X86_INTRINSIC_SSSE3 #include <tmmintrin.h> #endif + #if defined (X86_INTRINSIC_AVX) + #define X86_INTRINSIC_AVX + #include <immintrin.h> + #endif #endif @@ -133,12 +154,12 @@ #define a2(x, y) __asm {x, y} #define a3(x, y, z) __asm {x, y, z} #define a4(x, y, z, w) __asm {x, y, z, w} - #define aj(x) __asm {x} + #define al(x) __asm {label##x:} + #define aj(x, y, z) __asm {x label##y} #define asm_align8 a1(ALIGN 8) #define asm_align16 a1(ALIGN 16) #define asm_calling_convention STDCALL - #define aret(n) a1(ret n) #define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn #define asm_naked_fn(fn) { #define asm_naked_fn_end(fn) } @@ -147,25 +168,28 @@ #define GNU_AS2(x, y) #x ", " #y ";\n" #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n" #define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n" + #define GNU_ASL(x) "\n" #x ":\n" #define GNU_ASFN(x) "\n_" #x ":\n" #x ":\n" - #define GNU_ASJ(x) ".att_syntax prefix\n" #x "\n.intel_syntax noprefix\n" + #define GNU_ASJ(x, y, z) #x " " #y #z ";" #define a1(x) GNU_AS1(x) #define a2(x, y) GNU_AS2(x, y) #define a3(x, y, z) GNU_AS3(x, y, z) #define a4(x, y, z, w) GNU_AS4(x, y, z, w) - #define aj(x) GNU_ASJ(x) - #define asm_align8 ".p2align 3,,7" - #define asm_align16 ".p2align 4,,15" + #define al(x) GNU_ASL(x) + #define aj(x, y, z) GNU_ASJ(x, y, z) + #define asm_align8 a1(.align 8) + #define asm_align16 a1(.align 16) #if defined(OS_WINDOWS) #define asm_calling_convention CDECL #define aret(n) a1(ret) + #define asm_naked_fn_end(fn) ".att_syntax prefix;\n" ); #else #define asm_calling_convention STDCALL #define aret(n) a1(ret n) + #define asm_naked_fn_end(fn) ".att_syntax prefix;\n.type " #fn ",@function\n.size " #fn ",.-" #fn "\n" ); #endif - #define asm_naked_fn_end(fn) ".att_syntax prefix;\n" ); #define asm_naked_fn_proto(type, fn) extern type asm_calling_convention fn #define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn) @@ -360,4 +384,4 @@ get_top_cpuflag_desc(size_t flag) { #endif #endif -#endif /* defined(CPU_X86) || defined(CPU_X86_64) */ \ No newline at end of file +#endif /* defined(CPU_X86) || defined(CPU_X86_64) */ diff --git a/scrypt-jane/code/scrypt-jane-portable.h b/code/scrypt-jane-portable.h similarity index 97% rename from scrypt-jane/code/scrypt-jane-portable.h rename to code/scrypt-jane-portable.h index cb1c7b308..33c8c2cad 100644 --- a/scrypt-jane/code/scrypt-jane-portable.h +++ b/code/scrypt-jane-portable.h @@ -65,8 +65,6 @@ #define ROTR64(a,b) _rotr64(a,b) #undef NOINLINE #define NOINLINE __declspec(noinline) - #undef NORETURN - #define NORETURN #undef INLINE #define INLINE __forceinline #undef FASTCALL @@ -99,12 +97,6 @@ #else #define NOINLINE #endif - #undef NORETURN - #if (COMPILER_GCC >= 30000) - #define NORETURN __attribute__((noreturn)) - #else - #define NORETURN - #endif #undef INLINE #if (COMPILER_GCC >= 30000) #define INLINE __attribute__((always_inline)) @@ -255,7 +247,7 @@ scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) { return (1 & ((differentbits - 1) >> 8)); } -static void +void scrypt_ensure_zero(void *p, size_t len) { #if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC)) __stosb((unsigned char *)p, 0, len); @@ -287,6 +279,3 @@ scrypt_ensure_zero(void *p, size_t len) { #include "scrypt-jane-portable-x86.h" -#if !defined(asm_calling_convention) -#define asm_calling_convention -#endif diff --git a/scrypt-jane/code/scrypt-jane-romix-basic.h b/code/scrypt-jane-romix-basic.h similarity index 97% rename from
scrypt-jane/code/scrypt-jane-romix-basic.h rename to code/scrypt-jane-romix-basic.h index 4f1e07cdb..1cdb3fb06 100644 --- a/scrypt-jane/code/scrypt-jane-romix-basic.h +++ b/code/scrypt-jane-romix-basic.h @@ -6,7 +6,6 @@ typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, sc /* romix pre/post nop function */ static void asm_calling_convention scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) { - (void)blocks; (void)nblocks; } /* romix pre/post endian conversion function */ @@ -21,8 +20,6 @@ scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) { SCRYPT_WORD_ENDIAN_SWAP(blocks[i]); } } -#else - (void)blocks; (void)nblocks; #endif } diff --git a/scrypt-jane/code/scrypt-jane-romix-template.h b/code/scrypt-jane-romix-template.h similarity index 64% rename from scrypt-jane/code/scrypt-jane-romix-template.h rename to code/scrypt-jane-romix-template.h index a5f8da1c1..7879c58f8 100644 --- a/scrypt-jane/code/scrypt-jane-romix-template.h +++ b/code/scrypt-jane-romix-template.h @@ -69,7 +69,7 @@ SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *B static void NOINLINE FASTCALL SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N, uint32_t r) { - uint32_t i, j, chunkWords = (uint32_t)(SCRYPT_BLOCK_WORDS * r * 2); + uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2; scrypt_mix_word_t *block = V; SCRYPT_ROMIX_TANGLE_FN(X, r * 2); @@ -107,6 +107,67 @@ SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chu SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2); } +/* + * Special version with hard-coded r = 1 + * - mikaelh + */ +static void NOINLINE FASTCALL +scrypt_ROMix_1(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N) { + const uint32_t r = 1; + uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2; + scrypt_mix_word_t *block = V; + + SCRYPT_ROMIX_TANGLE_FN(X, r * 2); + + /* 1: X = B */ + /* implicit */ + + /* 2: for i = 0 to N - 1 do */ + memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t)); + for (i = 0; i < N - 1; i++, block += chunkWords) { + /* 3: V_i = X */ + /* 4: X = H(X) */ +#ifdef SCRYPT_CHUNKMIX_1_FN + SCRYPT_CHUNKMIX_1_FN(block + chunkWords, block); +#else + SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r); +#endif + } +#ifdef SCRYPT_CHUNKMIX_1_FN + SCRYPT_CHUNKMIX_1_FN(X, block); +#else + SCRYPT_CHUNKMIX_FN(X, block, NULL, r); +#endif + + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < N; i += 2) { + /* 7: j = Integerify(X) % N */ + j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1); + + /* 8: X = H(Y ^ V_j) */ +#ifdef SCRYPT_CHUNKMIX_1_XOR_FN + SCRYPT_CHUNKMIX_1_XOR_FN(Y, X, scrypt_item(V, j, chunkWords)); +#else + SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r); +#endif + + /* 7: j = Integerify(Y) % N */ + j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1); + + /* 8: X = H(Y ^ V_j) */ +#ifdef SCRYPT_CHUNKMIX_1_XOR_FN + SCRYPT_CHUNKMIX_1_XOR_FN(X, Y, scrypt_item(V, j, chunkWords)); +#else + SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r); +#endif + } + + /* 10: B' = X */ + /* implicit */ + + SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2); +} + #endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */ diff --git a/scrypt-jane/code/scrypt-jane-romix.h b/code/scrypt-jane-romix.h similarity index 88% rename from scrypt-jane/code/scrypt-jane-romix.h rename to code/scrypt-jane-romix.h index 
84cf61201..faa655a0f 100644 --- a/scrypt-jane/code/scrypt-jane-romix.h +++ b/code/scrypt-jane-romix.h @@ -13,11 +13,11 @@ #define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) #if !defined(SCRYPT_CHOOSE_COMPILETIME) static void FASTCALL scrypt_ROMix_error(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r) {} - static scrypt_ROMixfn scrypt_getROMix(void) { return scrypt_ROMix_error; } + static scrypt_ROMixfn scrypt_getROMix() { return scrypt_ROMix_error; } #else static void FASTCALL scrypt_ROMix(scrypt_mix_word_t *X, scrypt_mix_word_t *Y, scrypt_mix_word_t *V, uint32_t N, uint32_t r) {} #endif - static int scrypt_test_mix(void) { return 0; } + static int scrypt_test_mix() { return 0; } #error must define a mix function! #endif diff --git a/scrypt-jane/code/scrypt-jane-salsa.h b/code/scrypt-jane-salsa.h similarity index 97% rename from scrypt-jane/code/scrypt-jane-salsa.h rename to code/scrypt-jane-salsa.h index 23eca3d13..76f3da630 100644 --- a/scrypt-jane/code/scrypt-jane-salsa.h +++ b/code/scrypt-jane-salsa.h @@ -41,7 +41,7 @@ typedef uint32_t scrypt_mix_word_t; #if !defined(SCRYPT_CHOOSE_COMPILETIME) static scrypt_ROMixfn -scrypt_getROMix(void) { +scrypt_getROMix() { size_t cpuflags = detect_cpu(); #if defined(SCRYPT_SALSA_AVX) @@ -63,7 +63,7 @@ scrypt_getROMix(void) { #if defined(SCRYPT_TEST_SPEED) static size_t -available_implementations(void) { +available_implementations() { size_t cpuflags = detect_cpu(); size_t flags = 0; @@ -83,7 +83,7 @@ available_implementations(void) { static int -scrypt_test_mix(void) { +scrypt_test_mix() { static const uint8_t expected[16] = { 0x41,0x1f,0x2e,0xa3,0xab,0xa3,0x1a,0x34,0x87,0x1d,0x8a,0x1c,0x76,0xa0,0x27,0x66, }; diff --git a/scrypt-jane/code/scrypt-jane-salsa64.h b/code/scrypt-jane-salsa64.h similarity index 97% rename from scrypt-jane/code/scrypt-jane-salsa64.h rename to code/scrypt-jane-salsa64.h index 6f67e42d3..ecc87f596 100644 --- a/scrypt-jane/code/scrypt-jane-salsa64.h +++ b/code/scrypt-jane-salsa64.h @@ -49,7 +49,7 @@ typedef uint64_t scrypt_mix_word_t; #if !defined(SCRYPT_CHOOSE_COMPILETIME) static scrypt_ROMixfn -scrypt_getROMix(void) { +scrypt_getROMix() { size_t cpuflags = detect_cpu(); #if defined(SCRYPT_SALSA64_AVX) @@ -77,7 +77,7 @@ scrypt_getROMix(void) { #if defined(SCRYPT_TEST_SPEED) static size_t -available_implementations(void) { +available_implementations() { size_t cpuflags = detect_cpu(); size_t flags = 0; @@ -101,7 +101,7 @@ available_implementations(void) { #endif static int -scrypt_test_mix(void) { +scrypt_test_mix() { static const uint8_t expected[16] = { 0xf8,0x92,0x9b,0xf8,0xcc,0x1d,0xce,0x2e,0x13,0x82,0xac,0x96,0xb2,0x6c,0xee,0x2c, }; diff --git a/scrypt-jane/code/scrypt-jane-test-vectors.h b/code/scrypt-jane-test-vectors.h similarity index 98% rename from scrypt-jane/code/scrypt-jane-test-vectors.h rename to code/scrypt-jane-test-vectors.h index 72a727634..8093cf03e 100644 --- a/scrypt-jane/code/scrypt-jane-test-vectors.h +++ b/code/scrypt-jane-test-vectors.h @@ -3,10 +3,15 @@ typedef struct scrypt_test_setting_t { uint8_t Nfactor, rfactor, pfactor; } scrypt_test_setting; +/* + * I'm hardcoding the values of p and r, which means they can't be tested + * anymore. A new test case with a different value for N should maybe be added. 
+ * - mikaelh + */ static const scrypt_test_setting post_settings[] = { {"", "", 3, 0, 0}, - {"password", "NaCl", 9, 3, 4}, - {0, 0, 0, 0, 0} +// {"password", "NaCl", 9, 3, 4}, + {0} }; #if defined(SCRYPT_SHA256) diff --git a/compat.h b/compat.h index 283fc9b61..cb7630992 100644 --- a/compat.h +++ b/compat.h @@ -5,7 +5,10 @@ #include -#define sleep(secs) Sleep((secs) * 1000) +static inline void sleep(int secs) +{ + Sleep(secs * 1000); +} enum { PRIO_PROCESS = 0, diff --git a/configure.ac b/configure.ac index 971fc3a66..6663ae06c 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer], [2.3.2]) +AC_INIT([cpuminer], [2.3.1]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM @@ -42,11 +42,13 @@ case $target in i*86-*-*) have_x86=true ;; - x86_64-*-*|amd64-*-*) + x86_64-*-*) + have_x86=true have_x86_64=true ;; - arm*-*-*) - have_arm=true + amd64-*-*) + have_x86=true + have_x86_64=true ;; esac @@ -61,7 +63,7 @@ case $target in ;; esac -if test x$have_x86 = xtrue -o x$have_x86_64 = xtrue +if test x$have_x86 = xtrue then AC_MSG_CHECKING(whether we can compile AVX code) AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vmovdqa %ymm0, %ymm1");])], @@ -100,7 +102,6 @@ AM_CONDITIONAL([WANT_JANSSON], [test x$request_jansson = xtrue]) AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue]) AM_CONDITIONAL([ARCH_x86], [test x$have_x86 = xtrue]) AM_CONDITIONAL([ARCH_x86_64], [test x$have_x86_64 = xtrue]) -AM_CONDITIONAL([ARCH_ARM], [test x$have_arm = xtrue]) if test x$request_jansson = xtrue then diff --git a/cpu-miner.c b/cpu-miner.c index 537316e0f..488c2c27f 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include @@ -37,7 +37,6 @@ #include #include "compat.h" #include "miner.h" -#include "yacoin.h" #define PROGRAM_NAME "minerd" #define DEF_RPC_URL "http://127.0.0.1:9332/" @@ -101,16 +100,15 @@ struct workio_cmd { } u; }; - enum sha256_algos { ALGO_SCRYPT, /* scrypt(1024,1,1) */ - ALGO_YACOIN, /* scrypt(N,1,1) */ + ALGO_SCRYPT_JANE, /* scrypt-jane with n-factor */ ALGO_SHA256D, /* SHA-256d */ }; static const char *algo_names[] = { [ALGO_SCRYPT] = "scrypt", - [ALGO_YACOIN] = "yacoin", + [ALGO_SCRYPT_JANE] = "scrypt-jane", [ALGO_SHA256D] = "sha256d", }; @@ -169,9 +167,9 @@ static char const usage[] = "\ Usage: " PROGRAM_NAME " [OPTIONS]\n\ Options:\n\ -a, --algo=ALGO specify the algorithm to use\n\ - scrypt scrypt(1024, 1, 1) (default)\n\ - yacoin scrypt N keccak512/chacha20/8\n\ - sha256d SHA-256d\n\ + scrypt scrypt(1024, 1, 1) (default)\n\ + scrypt-jane scrypt-jane\n\ + sha256d SHA-256d\n\ -o, --url=URL URL of mining server (default: " DEF_RPC_URL ")\n\ -O, --userpass=U:P username:password pair for mining server\n\ -u, --user=USERNAME username for mining server\n\ @@ -661,7 +659,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) free(xnonce2str); } - if (opt_algo == ALGO_SCRYPT) + if (opt_algo == ALGO_SCRYPT||opt_algo == ALGO_SCRYPT_JANE) diff_to_target(work->target, sctx->job.diff / 65536.0); else diff_to_target(work->target, sctx->job.diff); @@ -695,7 +693,7 @@ static void *miner_thread(void *userdata) affine_to_cpu(thr_id, thr_id % num_processors); } - if (opt_algo == ALGO_SCRYPT) + if (opt_algo == ALGO_SCRYPT||opt_algo == ALGO_SCRYPT_JANE) { scratchbuf = scrypt_buffer_alloc(); } @@ -747,7 +745,7 @@ static void *miner_thread(void *userdata) - time(NULL); max64 *= thr_hashrates[thr_id]; if (max64 <= 0) - max64 = opt_algo == ALGO_SCRYPT ?
0xfffLL : 0x1fffffLL; + max64 = (opt_algo == ALGO_SCRYPT||opt_algo == ALGO_SCRYPT_JANE) ? 0xfffLL : 0x1fffffLL; if (work.data[19] + max64 > end_nonce) max_nonce = end_nonce; else @@ -763,8 +761,8 @@ static void *miner_thread(void *userdata) max_nonce, &hashes_done); break; - case ALGO_YACOIN: - rc = scanhash_yacoin(thr_id, work.data, work.target, + case ALGO_SCRYPT_JANE: + rc = scanhash_scrypt_jane(thr_id, work.data, work.target, max_nonce, &hashes_done); break; diff --git a/miner.h b/miner.h index f47bb3202..b031dd577 100644 --- a/miner.h +++ b/miner.h @@ -4,7 +4,7 @@ #include "cpuminer-config.h" #include -#include +#include #include #include #include @@ -155,10 +155,10 @@ extern unsigned char *scrypt_buffer_alloc(); extern int scanhash_scrypt(int thr_id, uint32_t *pdata, unsigned char *scratchbuf, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); -extern int scanhash_yacoin(int thr_id, uint32_t *pdata, + +extern int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); - struct thr_info { int id; pthread_t pth; diff --git a/scrypt-jane.c b/scrypt-jane.c new file mode 100644 index 000000000..18b705991 --- /dev/null +++ b/scrypt-jane.c @@ -0,0 +1,309 @@ +/* + scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane + + Public Domain or MIT License, whichever is easier +*/ + +#include "cpuminer-config.h" +#include "miner.h" + +#include + +/* Hard-coded scrypt parameters r and p - mikaelh */ +#define SCRYPT_R 1 +#define SCRYPT_P 1 + +/* Only the intrinsics versions are optimized for hard-coded values - mikaelh */ +#define CPU_X86_FORCE_INTRINSICS + +#include "scrypt-jane.h" + #include "code/scrypt-jane-portable.h" +#include "code/scrypt-jane-hash.h" +#include "code/scrypt-jane-romix.h" +#include "code/scrypt-jane-test-vectors.h" + + +#define scrypt_maxN 30 /* (1 << (30 + 1)) = ~2 billion */ +#if (SCRYPT_BLOCK_BYTES == 64) +#define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */ +#elif (SCRYPT_BLOCK_BYTES == 128) +#define scrypt_r_32kb 7 /* (1 << 7) = 128 * 2 blocks in a chunk * 128 bytes = Max of 32kb in a chunk */ +#elif (SCRYPT_BLOCK_BYTES == 256) +#define scrypt_r_32kb 6 /* (1 << 6) = 64 * 2 blocks in a chunk * 256 bytes = Max of 32kb in a chunk */ +#elif (SCRYPT_BLOCK_BYTES == 512) +#define scrypt_r_32kb 5 /* (1 << 5) = 32 * 2 blocks in a chunk * 512 bytes = Max of 32kb in a chunk */ +#endif +#define scrypt_maxr scrypt_r_32kb /* 32kb */ +#define scrypt_maxp 25 /* (1 << 25) = ~33 million */ + +#include +#include + +static void +scrypt_fatal_error_default(const char *msg) { + fprintf(stderr, "%s\n", msg); + exit(1); +} + +static scrypt_fatal_errorfn scrypt_fatal_error = scrypt_fatal_error_default; + +void +scrypt_set_fatal_error_default(scrypt_fatal_errorfn fn) { + scrypt_fatal_error = fn; +} + +typedef struct scrypt_aligned_alloc_t { + uint8_t *mem, *ptr; +} scrypt_aligned_alloc; + +#if defined(SCRYPT_TEST_SPEED) +static uint8_t *mem_base = (uint8_t *)0; +static size_t mem_bump = 0; + +/* allocations are assumed to be multiples of 64 bytes and total allocations not to exceed ~1.01gb */ +static scrypt_aligned_alloc +scrypt_alloc(uint64_t size) { + scrypt_aligned_alloc aa; + if (!mem_base) { + mem_base = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1)); + if (!mem_base) + scrypt_fatal_error("scrypt: out of memory"); + mem_base = (uint8_t *)(((size_t)mem_base + (SCRYPT_BLOCK_BYTES - 1)) &
~(SCRYPT_BLOCK_BYTES - 1)); + } + aa.mem = mem_base + mem_bump; + aa.ptr = aa.mem; + mem_bump += (size_t)size; + return aa; +} + +static void +scrypt_free(scrypt_aligned_alloc *aa) { + mem_bump = 0; +} +#else +static scrypt_aligned_alloc +scrypt_alloc(uint64_t size) { + static const size_t max_alloc = (size_t)-1; + scrypt_aligned_alloc aa; + size += (SCRYPT_BLOCK_BYTES - 1); + if (size > max_alloc) + scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory"); + aa.mem = (uint8_t *)malloc((size_t)size); + aa.ptr = (uint8_t *)(((size_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); + if (!aa.mem) + scrypt_fatal_error("scrypt: out of memory"); + return aa; +} + +static void +scrypt_free(scrypt_aligned_alloc *aa) { + free(aa->mem); +} +#endif + + +static int +scrypt_power_on_self_test() { + const scrypt_test_setting *t; + uint8_t test_digest[64]; + uint32_t i; + int res = 7, scrypt_valid; + scrypt_aligned_alloc YX, V; + uint8_t *X, *Y; + uint32_t N, chunk_bytes; + const uint32_t r = SCRYPT_R; + const uint32_t p = SCRYPT_P; + + if (!scrypt_test_mix()) { +#if !defined(SCRYPT_TEST) + scrypt_fatal_error("scrypt: mix function power-on-self-test failed"); +#endif + res &= ~1; + } + + if (!scrypt_test_hash()) { +#if !defined(SCRYPT_TEST) + scrypt_fatal_error("scrypt: hash function power-on-self-test failed"); +#endif + res &= ~2; + } + + for (i = 0, scrypt_valid = 1; post_settings[i].pw; i++) { + t = post_settings + i; + + N = (1 << (t->Nfactor + 1)); + + chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2; + V = scrypt_alloc((uint64_t)N * chunk_bytes); + YX = scrypt_alloc((p + 1) * chunk_bytes); + + Y = YX.ptr; + X = Y + chunk_bytes; + + scrypt_N_1_1((uint8_t *)t->pw, strlen(t->pw), (uint8_t *)t->salt, strlen(t->salt), N, test_digest, sizeof(test_digest), X, Y, V.ptr); + scrypt_valid &= scrypt_verify(post_vectors[i], test_digest, sizeof(test_digest)); + + scrypt_free(&V); + scrypt_free(&YX); + } + + if (!scrypt_valid) { +#if !defined(SCRYPT_TEST) + scrypt_fatal_error("scrypt: scrypt power-on-self-test failed"); +#endif + res &= ~4; + } + + return res; +} + + +void +scrypt_N_1_1(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint32_t N, uint8_t *out, size_t bytes, uint8_t *X, uint8_t *Y, uint8_t *V) { + uint32_t chunk_bytes, i; + const uint32_t r = SCRYPT_R; + const uint32_t p = SCRYPT_P; + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) + scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix(); +#endif + + chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2; + + /* 1: X = PBKDF2(password, salt) */ + scrypt_pbkdf2_1(password, password_len, salt, salt_len, X, chunk_bytes * p); + + /* 2: X = ROMix(X) */ + for (i = 0; i < p; i++) + scrypt_ROMix_1((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V, N); + + /* 3: Out = PBKDF2(password, X) */ + scrypt_pbkdf2_1(password, password_len, X, chunk_bytes * p, out, bytes); + +#ifdef SCRYPT_PREVENT_STATE_LEAK + /* This is an unnecessary security feature - mikaelh */ + scrypt_ensure_zero(Y, (p + 1) * chunk_bytes); +#endif +} + + +// yacoin: increasing Nfactor gradually +const unsigned char minNfactor = 4; +const unsigned char maxNfactor = 30; + +unsigned char GetNfactor(unsigned int nTimestamp) { + int l = 0; + + if (nTimestamp <= 1367991200) + return 4; + + unsigned long int s = nTimestamp - 1367991200; + while ((s >> 1) > 3) { + l += 1; + s >>= 1; + } + + s &= 3; + + int n = (l * 170 + s * 25 - 2320) / 100; + + if (n < 0) n = 0; + + if (n > 255) + 
printf("GetNfactor(%d) - something wrong(n == %d)\n", nTimestamp, n); + + unsigned char N = (unsigned char)n; + //printf("GetNfactor: %d -> %d %d : %d / %d\n", nTimestamp - nChainStartTime, l, s, n, min(max(N, minNfactor), maxNfactor)); + +// return min(max(N, minNfactor), maxNfactor); + + if(N < minNfactor) return minNfactor; + if(N > maxNfactor) return maxNfactor; + return N; +} + +int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[20], hash[8], target_swap[8]; + volatile unsigned char *hashc = (unsigned char *) hash; + volatile unsigned char *datac = (unsigned char *) data; + volatile unsigned char *pdatac = (unsigned char *) pdata; + uint32_t n = pdata[19] - 1; + scrypt_aligned_alloc YX, V; + uint8_t *X, *Y; + uint32_t N, chunk_bytes; + const uint32_t r = SCRYPT_R; + const uint32_t p = SCRYPT_P; + int i; + +#if !defined(SCRYPT_TEST) + static int power_on_self_test = 0; + if (!power_on_self_test) { + power_on_self_test = 1; + if (!scrypt_power_on_self_test()) + scrypt_fatal_error("scrypt: power on self test failed"); + } +#endif + + /* byte swap it */ + for(int z=0;z<20;z++) { + datac[(z*4) ] = pdatac[(z*4)+3]; + datac[(z*4)+1] = pdatac[(z*4)+2]; + datac[(z*4)+2] = pdatac[(z*4)+1]; + datac[(z*4)+3] = pdatac[(z*4) ]; + } + + int Nfactor = GetNfactor(data[17]); + if (Nfactor > scrypt_maxN) { + scrypt_fatal_error("scrypt: N out of range"); + } + + N = (1 << (Nfactor + 1)); + + chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2; + V = scrypt_alloc((uint64_t)N * chunk_bytes); + YX = scrypt_alloc((p + 1) * chunk_bytes); + + Y = YX.ptr; + X = Y + chunk_bytes; + + do { + data[19] = ++n; + + scrypt_N_1_1((unsigned char *)data, 80, + (unsigned char *)data, 80, + N, (unsigned char *)hash, 32, X, Y, V.ptr); + + if (hashc[31] == 0 && hashc[30] == 0) { +/* + for(int z=7;z>=0;z--) + fprintf(stderr, "%08x ", hash[z]); + fprintf(stderr, "\n"); + + for(int z=7;z>=0;z--) + fprintf(stderr, "%08x ", ptarget[z]); + fprintf(stderr, "\n"); +*/ + if(fulltest(hash, ptarget)) { + *hashes_done = n - pdata[19] + 1; + pdatac[76] = datac[79]; + pdatac[77] = datac[78]; + pdatac[78] = datac[77]; + pdatac[79] = datac[76]; + + scrypt_free(&V); + scrypt_free(&YX); + return 1; + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + scrypt_free(&V); + scrypt_free(&YX); + + *hashes_done = n - pdata[19] + 1; + pdata[19] = n; + return 0; +} diff --git a/scrypt-jane/scrypt-jane.h b/scrypt-jane.h similarity index 75% rename from scrypt-jane/scrypt-jane.h rename to scrypt-jane.h index 1c0df6242..c554935cb 100644 --- a/scrypt-jane/scrypt-jane.h +++ b/scrypt-jane.h @@ -18,10 +18,11 @@ */ #include +#include typedef void (*scrypt_fatal_errorfn)(const char *msg); void scrypt_set_fatal_error(scrypt_fatal_errorfn fn); -void scrypt(const unsigned char *password, size_t password_len, const unsigned char *salt, size_t salt_len, unsigned char Nfactor, unsigned char rfactor, unsigned char pfactor, unsigned char *out, size_t bytes); +void scrypt_N_1_1(const unsigned char *password, size_t password_len, const unsigned char *salt, size_t salt_len, uint32_t N, unsigned char *out, size_t bytes, uint8_t *X, uint8_t *Y, uint8_t *V); #endif /* SCRYPT_JANE_H */ diff --git a/scrypt-jane/README.md b/scrypt-jane/README.md deleted file mode 100644 index 2b6976684..000000000 --- a/scrypt-jane/README.md +++ /dev/null @@ -1,161 +0,0 @@ -This project provides a performant, flexible implementation of Colin Percival's [scrypt](http://www.tarsnap.com/scrypt.html).

# Features

## Modular Design

The code uses a modular (compile, not runtime) layout to allow new mixing & hash functions to be added easily. The base components (HMAC, PBKDF2, and scrypt) are static and will immediately work with any conforming mix or hash function.

## Supported Mix Functions

* [Salsa20/8](http://cr.yp.to/salsa20.html)
* [ChaCha20/8](http://cr.yp.to/chacha.html)
* [Salsa6420/8]()

I am not actually aware of any other candidates for a decent mix function. Salsa20/8 was nearly perfect, but its successor, ChaCha20/8, has better diffusion and is thus stronger, is potentially faster given advanced SIMD support (byte level shuffles, or a 32bit rotate), and is slightly cleaner to implement given that it requires no pre/post processing of data for SIMD implementations.

64-byte blocks are no longer assumed! Salsa6420/8 is a 'proof of concept' 64-bit version of Salsa20/8 with a 128 byte block, and rotation constants chosen to allow 32-bit word shuffles instead of rotations for two of the rotations which put it on par with ChaCha in terms of SSE implementation shortcuts.

## Supported Hash Functions

* SHA256/512
* [BLAKE256/512](https://www.131002.net/blake/)
* [Skein512](http://www.skein-hash.info/)
* [Keccak256/512](http://keccak.noekeon.org/) (SHA-3)

Hash function implementations, unlike mix functions, are not optimized. The PBKDF2 computations are relatively minor in the scrypt algorithm, so including CPU specific versions, or vastly unrolling loops, would serve little purpose while bloating the code, both source and binary, and making it more confusing to implement correctly.

Most (now only two!) of the SHA-3 candidates fall into the "annoying to read/implement" category and have not been included yet. This will of course be moot once ~~BLAKE is chosen as SHA-3~~ Keccak is chosen as SHA-3. Well shit.

## CPU Adaptation

The mixing function specialization is selected at runtime based on what the CPU supports (well, x86/x86-64 for now, but theoretically any). On platforms where this is not needed, e.g. where packages are usually compiled from source, it can also select the most suitable implementation at compile time, cutting down on binary size.

For those who are familiar with the scrypt spec, the code specializes at the ROMix level, allowing all copy, and xor calls to be inlined efficiently. ***Update***: This is actually not as important as I switched from specializing at the mix() level and letting the compiler somewhat inefficiently inline block_copy and block_xor to specializing at ChunkMix(), where they can be inlined properly. I thought about specializing at ROMix(), but it would increase the complexity per mix function even more and would not present many more opportunities than what is generated by the compiler presently.

MSVC uses SSE intrinsics as opposed to inline assembly for the mix functions to allow the compiler to fully inline properly. Also, Visual Studio is not smart enough to allow inline assembly in 64-bit code.

## Self Testing

On first use, scrypt() runs a small series of tests to make sure the hash function, mix functions, and scrypt() itself, are generating correct results. It will exit() (or call a user defined fatal error function) should any of these tests fail.

Test vectors for individual mix and hash functions are generated from reference implementations.
The only "official" test vectors for the full scrypt() are for SHA256 + Salsa20/8 of course; other combinations are generated from this code (once it works with all reference test vectors) and subject to change if any implementation errors are discovered.

# Performance (on an E5200 2.5GHZ)

Benchmarks are run _without_ allocating memory, i.e. allocating enough memory before the trials are run. Different allocators can have different costs and non-deterministic effects, which is not the point of comparing implementations. The only hash function compared will be SHA-256 to be comparable to Colin's reference implementation, and the hash function will generally be a fraction of a percent of noise in the overall result.

Three different scrypt settings are tested (the last two are from the scrypt paper):

* 'High Volume': N=4096, r=8, p=1, 4mb memory
* 'Interactive': N=16384, r=8, p=1, 16mb memory
* 'Non-Interactive': N=1048576, r=8, p=1, 1gb memory

Cycle counts are in millions of cycles. All versions compiled with gcc 4.6.3, -O3. Sorted from fastest to slowest.

Scaling refers to how much more expensive 'Non-Interactive' is to compute than 'High Volume', normalized to "ideal" scaling (256x difficulty). Under 100% means it becomes easier to process as N grows, over 100% means it becomes more difficult to process as N grows.
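To make the Scaling column concrete, take the first data row of the table below: N grows from 4,096 ('High Volume') to 1,048,576 ('Non-Interactive'), a 256x ideal difficulty increase, so

    5120.0m / 18.2m ≈ 281.3x measured cost increase
    281.3 / 256     ≈ 1.10, i.e. the 110.0% shown in the Scaling column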
| Implementation          | Algo        | High Volume | Interactive | Non-Interactive | Scaling |
|-------------------------|-------------|-------------|-------------|-----------------|---------|
| scrypt-jane SSSE3 64bit | Salsa6420/8 | 18.2m       | 75.6m       | 5120.0m         | 110.0%  |
| scrypt-jane SSSE3 64bit | ChaCha20/8  | 19.6m       | 79.6m       | 5296.7m         | 105.6%  |
| scrypt-jane SSSE3 32bit | ChaCha20/8  | 19.8m       | 80.3m       | 5346.1m         | 105.5%  |
| scrypt-jane SSE2 64bit  | Salsa6420/8 | 19.8m       | 82.1m       | 5529.2m         | 109.1%  |
| scrypt-jane SSE2 64bit  | Salsa20/8   | 22.1m       | 89.7m       | 5938.8m         | 105.0%  |
| scrypt-jane SSE2 32bit  | Salsa20/8   | 22.3m       | 90.6m       | 6011.0m         | 105.3%  |
| scrypt-jane SSE2 64bit  | ChaCha20/8  | 23.9m       | 96.8m       | 6399.7m         | 104.6%  |
| scrypt-jane SSE2 32bit  | ChaCha20/8  | 24.2m       | 98.3m       | 6500.7m         | 104.9%  |
| *Reference SSE2 64bit*  | Salsa20/8   | 32.9m       | 135.2m      | 8881.6m         | 105.5%  |
| *Reference SSE2 32bit*  | Salsa20/8   | 33.0m       | 134.4m      | 8885.2m         | 105.2%  |

* scrypt-jane Salsa6420/8-SSSE3 is ~1.80x faster than reference Salsa20/8-SSE2 for High Volume, but drops to 1.73x faster for 'Non-Interactive' instead of remaining constant
* scrypt-jane ChaCha20/8-SSSE3 is ~1.67x faster than reference Salsa20/8-SSE2
* scrypt-jane Salsa20/8-SSE2 is ~1.48x faster than reference Salsa20/8-SSE2

# Performance (on a slightly noisy E3-1270 3.4GHZ)

All versions compiled with gcc 4.4.7, -O3. Sorted from fastest to slowest.
| Implementation             | Algo        | High Volume | Interactive | Non-Interactive | Scaling |
|----------------------------|-------------|-------------|-------------|-----------------|---------|
| scrypt-jane AVX 64bit      | Salsa6420/8 | 11.8m       | 52.5m       | 3848.6m         | 127.4%  |
| scrypt-jane SSSE3 64bit    | Salsa6420/8 | 13.3m       | 57.9m       | 4176.6m         | 122.7%  |
| scrypt-jane SSE2 64bit     | Salsa6420/8 | 14.2m       | 61.1m       | 4382.4m         | 120.6%  |
| scrypt-jane AVX 64bit      | ChaCha20/8  | 18.0m       | 77.4m       | 5396.8m         | 117.1%  |
| scrypt-jane AVX 32bit      | ChaCha20/8  | 18.3m       | 82.1m       | 5421.8m         | 115.7%  |
| scrypt-jane SSSE3 64bit    | ChaCha20/8  | 19.0m       | 81.3m       | 5600.7m         | 115.1%  |
| scrypt-jane AVX 64bit      | Salsa20/8   | 19.0m       | 81.2m       | 5610.6m         | 115.3%  |
| scrypt-jane AVX 32bit      | Salsa20/8   | 19.0m       | 81.3m       | 5621.6m         | 115.6%  |
| scrypt-jane SSSE3 32bit    | ChaCha20/8  | 19.1m       | 81.8m       | 5621.6m         | 115.0%  |
| scrypt-jane SSE2 64bit     | Salsa20/8   | 19.5m       | 83.8m       | 5772.9m         | 115.6%  |
| scrypt-jane SSE2 32bit     | Salsa20/8   | 19.6m       | 84.0m       | 5793.9m         | 115.5%  |
| *Reference SSE2/AVX 64bit* | Salsa20/8   | 21.5m       | 90.4m       | 6147.1m         | 111.7%  |
| *Reference SSE2/AVX 32bit* | Salsa20/8   | 22.3m       | 94.0m       | 6267.7m         | 110.0%  |
| scrypt-jane SSE2 64bit     | ChaCha20/8  | 23.1m       | 97.7m       | 6670.0m         | 112.8%  |
| scrypt-jane SSE2 32bit     | ChaCha20/8  | 23.3m       | 98.4m       | 6728.7m         | 112.8%  |
| *Reference SSE2 64bit*     | Salsa20/8   | 30.4m       | 125.6m      | 8139.4m         | 104.6%  |
| *Reference SSE2 32bit*     | Salsa20/8   | 30.0m       | 124.5m      | 8469.3m         | 110.3%  |
- -* scrypt-jane Salsa6420/8-AVX is 1.60x - 1.82x faster than reference Salsa20/8-SSE2/AVX -* scrypt-jane ChaCha20/8-AVX is 1.13x - 1.19x faster than reference Salsa20/8-SSE2/AVX -* scrypt-jane Salsa20/8-AVX is 1.09x - 1.13x faster than reference Salsa20/8-SSE2/AVX - - -# Building - - [gcc,icc,clang] scrypt-jane.c -O3 -[m32,m64] -DSCRYPT_MIX -DSCRYPT_HASH -c - -where SCRYPT_MIX is one of - -* SCRYPT_SALSA -* SCRYPT_SALSA64 (no optimized 32-bit implementation) -* SCRYPT_CHACHA - -and SCRYPT_HASH is one of - -* SCRYPT_SHA256 -* SCRYPT_SHA512 -* SCRYPT_BLAKE256 -* SCRYPT_BLAKE512 -* SCRYPT_SKEIN512 -* SCRYPT_KECCAK256 -* SCRYPT_KECCAK512 - -e.g. - - gcc scrypt-jane.c -O3 -DSCRYPT_CHACHA -DSCRYPT_BLAKE512 -c - gcc example.c scrypt-jane.o -o example - -clang *may* need "-no-integrated-as" as some? versions don't support ".intel_syntax" - -# Using - - #include "scrypt-jane.h" - - scrypt(password, password_len, salt, salt_len, Nfactor, pfactor, rfactor, out, want_bytes); - -## scrypt parameters - -* Nfactor: Increases CPU & Memory Hardness -* rfactor: Increases Memory Hardness -* pfactor: Increases CPU Hardness - -In scrypt terms - -* N = (1 << (Nfactor + 1)), which controls how many times to mix each chunk, and how many temporary chunks are used. Increasing N increases both CPU time and memory used. -* r = (1 << rfactor), which controls how many blocks are in a chunk (i.e., 2 * r blocks are in a chunk). Increasing r increases how much memory is used. -* p = (1 << pfactor), which controls how many passes to perform over the set of N chunks. Increasing p increases CPU time used. - -I chose to use the log2 of each parameter as it is the common way to communicate settings (e.g. 2^20, not 1048576). - -# License - -Public Domain, or MIT \ No newline at end of file diff --git a/scrypt-jane/example.c b/scrypt-jane/example.c deleted file mode 100644 index 6f290a13c..000000000 --- a/scrypt-jane/example.c +++ /dev/null @@ -1,13 +0,0 @@ -#include -#include "scrypt-jane.h" - - -int main(void) { - unsigned char digest[16]; - int i; - scrypt("pw", 2, "salt", 4, 0, 0, 0, digest, 16); - for (i = 0; i < sizeof(digest); i++) - printf("%02x, ", digest[i]); - printf("\n"); - return 0; -} \ No newline at end of file diff --git a/scrypt-jane/scrypt-jane-speed.c b/scrypt-jane/scrypt-jane-speed.c deleted file mode 100644 index e8d61b3c0..000000000 --- a/scrypt-jane/scrypt-jane-speed.c +++ /dev/null @@ -1,121 +0,0 @@ -#define SCRYPT_TEST_SPEED -#include "scrypt-jane.c" - -/* ticks - not tested on anything other than x86 */ -static uint64_t -get_ticks(void) { -#if defined(CPU_X86) || defined(CPU_X86_64) - #if defined(COMPILER_INTEL) - return _rdtsc(); - #elif defined(COMPILER_MSVC) - return __rdtsc(); - #elif defined(COMPILER_GCC) - uint32_t lo, hi; - __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi)); - return ((uint64_t)lo | ((uint64_t)hi << 32)); - #else - need rdtsc for this compiler - #endif -#elif defined(OS_SOLARIS) - return (uint64_t)gethrtime(); -#elif defined(CPU_SPARC) && !defined(OS_OPENBSD) - uint64_t t; - __asm__ __volatile__("rd %%tick, %0" : "=r" (t)); - return t; -#elif defined(CPU_PPC) - uint32_t lo = 0, hi = 0; - __asm__ __volatile__("mftbu %0; mftb %1" : "=r" (hi), "=r" (lo)); - return ((uint64_t)lo | ((uint64_t)hi << 32)); -#elif defined(CPU_IA64) - uint64_t t; - __asm__ __volatile__("mov %0=ar.itc" : "=r" (t)); - return t; -#elif defined(OS_NIX) - timeval t2; - gettimeofday(&t2, NULL); - t = ((uint64_t)t2.tv_usec << 32) | (uint64_t)t2.tv_sec; - return t; -#else - need ticks for this platform 
-#endif -} - -#define timeit(x,minvar) { \ - ticks = get_ticks(); \ - x; \ - ticks = get_ticks() - ticks; \ - if (ticks < minvar) \ - minvar = ticks; \ - } - -#define maxticks 0xffffffffffffffffull - -typedef struct scrypt_speed_settings_t { - const char *desc; - uint8_t Nfactor, rfactor, pfactor; -} scrypt_speed_settings; - -/* scrypt_r_32kb is set to a 32kb chunk, so (1 << (scrypt_r_32kb - 5)) = 1kb chunk */ -static const scrypt_speed_settings settings[] = { - {"scrypt high volume ( ~4mb)", 11, scrypt_r_32kb - 5, 0}, - {"scrypt interactive (~16mb)", 13, scrypt_r_32kb - 5, 0}, - {"scrypt non-interactive (~ 1gb)", 19, scrypt_r_32kb - 5, 0}, - {0} -}; - -int main(void) { - const scrypt_speed_settings *s; - uint8_t password[64], salt[24], digest[64]; - uint64_t minticks, ticks; - size_t i, passes; - size_t cpuflags, topbit; - - for (i = 0; i < sizeof(password); i++) - password[i] = (uint8_t)i; - for (i = 0; i < sizeof(salt); i++) - salt[i] = 255 - (uint8_t)i; - - /* warm up a little */ - scrypt(password, sizeof(password), salt, sizeof(salt), 15, 3, 4, digest, sizeof(digest)); - - cpuflags = available_implementations(); - topbit = 0; - for (i = cpuflags; i != 0; i >>= 1) - topbit++; - topbit = ((size_t)1 << topbit); - - while (1) { - #if defined(SCRYPT_CHOOSE_COMPILETIME) - printf("speed test for scrypt[%s,%s]\n", SCRYPT_HASH, SCRYPT_MIX); - #else - printf("speed test for scrypt[%s,%s,%s]\n", SCRYPT_HASH, SCRYPT_MIX, get_top_cpuflag_desc(cpuflags)); - #endif - - cpu_detect_mask = cpuflags; - for (i = 0; settings[i].desc; i++) { - s = &settings[i]; - minticks = maxticks; - for (passes = 0; passes < 16; passes++) - timeit(scrypt(password, sizeof(password), salt, sizeof(salt), s->Nfactor, s->rfactor, s->pfactor, digest, sizeof(digest)), minticks) - - printf("%s, %.0f ticks\n", s->desc, (double)minticks); - } - - #if defined(SCRYPT_CHOOSE_COMPILETIME) - break; - #else - while (topbit && ((cpuflags & topbit) == 0)) - topbit >>= 1; - cpuflags &= ~topbit; - - /* (cpuflags == 0) is the basic/portable version, don't bother timing it */ - if (!cpuflags) - break; - #endif - } - - printf("\n\n"); - - return 0; -} - diff --git a/scrypt-jane/scrypt-jane-test.c b/scrypt-jane/scrypt-jane-test.c deleted file mode 100644 index 808f84311..000000000 --- a/scrypt-jane/scrypt-jane-test.c +++ /dev/null @@ -1,12 +0,0 @@ -#define SCRYPT_TEST -#include "scrypt-jane.c" - -int main(void) { - int res = scrypt_power_on_self_test(); - - printf("%s: test %s\n", SCRYPT_MIX, (res & 1) ? "ok" : "FAILED"); - printf("%s: test %s\n", SCRYPT_HASH, (res & 2) ? "ok" : "FAILED"); - printf("scrypt: test vectors %s\n", (res & 4) ? "ok" : "FAILED"); - - return ((res & 7) == 7) ? 
0 : 1; -} diff --git a/scrypt-jane/scrypt-jane.c b/scrypt-jane/scrypt-jane.c deleted file mode 100644 index 9774617da..000000000 --- a/scrypt-jane/scrypt-jane.c +++ /dev/null @@ -1,182 +0,0 @@ -/* - scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane - - Public Domain or MIT License, whichever is easier -*/ - -#include - -#include "scrypt-jane.h" -#include "code/scrypt-jane-portable.h" -#include "code/scrypt-jane-hash.h" -#include "code/scrypt-jane-romix.h" -#include "code/scrypt-jane-test-vectors.h" - - -#define scrypt_maxN 30 /* (1 << (30 + 1)) = ~2 billion */ -#if (SCRYPT_BLOCK_BYTES == 64) -#define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */ -#elif (SCRYPT_BLOCK_BYTES == 128) -#define scrypt_r_32kb 7 /* (1 << 7) = 128 * 2 blocks in a chunk * 128 bytes = Max of 32kb in a chunk */ -#elif (SCRYPT_BLOCK_BYTES == 256) -#define scrypt_r_32kb 6 /* (1 << 6) = 64 * 2 blocks in a chunk * 256 bytes = Max of 32kb in a chunk */ -#elif (SCRYPT_BLOCK_BYTES == 512) -#define scrypt_r_32kb 5 /* (1 << 5) = 32 * 2 blocks in a chunk * 512 bytes = Max of 32kb in a chunk */ -#endif -#define scrypt_maxr scrypt_r_32kb /* 32kb */ -#define scrypt_maxp 25 /* (1 << 25) = ~33 million */ - -#include -//#include - -static void NORETURN -scrypt_fatal_error_default(const char *msg) { - fprintf(stderr, "%s\n", msg); - exit(1); -} - -static scrypt_fatal_errorfn scrypt_fatal_error = scrypt_fatal_error_default; - -void -scrypt_set_fatal_error(scrypt_fatal_errorfn fn) { - scrypt_fatal_error = fn; -} - -static int -scrypt_power_on_self_test(void) { - const scrypt_test_setting *t; - uint8_t test_digest[64]; - uint32_t i; - int res = 7, scrypt_valid; - - if (!scrypt_test_mix()) { -#if !defined(SCRYPT_TEST) - scrypt_fatal_error("scrypt: mix function power-on-self-test failed"); -#endif - res &= ~1; - } - - if (!scrypt_test_hash()) { -#if !defined(SCRYPT_TEST) - scrypt_fatal_error("scrypt: hash function power-on-self-test failed"); -#endif - res &= ~2; - } - - for (i = 0, scrypt_valid = 1; post_settings[i].pw; i++) { - t = post_settings + i; - scrypt((uint8_t *)t->pw, strlen(t->pw), (uint8_t *)t->salt, strlen(t->salt), t->Nfactor, t->rfactor, t->pfactor, test_digest, sizeof(test_digest)); - scrypt_valid &= scrypt_verify(post_vectors[i], test_digest, sizeof(test_digest)); - } - - if (!scrypt_valid) { -#if !defined(SCRYPT_TEST) - scrypt_fatal_error("scrypt: scrypt power-on-self-test failed"); -#endif - res &= ~4; - } - - return res; -} - -typedef struct scrypt_aligned_alloc_t { - uint8_t *mem, *ptr; -} scrypt_aligned_alloc; - -#if defined(SCRYPT_TEST_SPEED) -static uint8_t *mem_base = (uint8_t *)0; -static size_t mem_bump = 0; - -/* allocations are assumed to be multiples of 64 bytes and total allocations not to exceed ~1.01gb */ -static scrypt_aligned_alloc -scrypt_alloc(uint64_t size) { - scrypt_aligned_alloc aa; - if (!mem_base) { - mem_base = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1)); - if (!mem_base) - scrypt_fatal_error("scrypt: out of memory"); - mem_base = (uint8_t *)(((size_t)mem_base + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); - } - aa.mem = mem_base + mem_bump; - aa.ptr = aa.mem; - mem_bump += (size_t)size; - return aa; -} - -static void -scrypt_free(scrypt_aligned_alloc *aa) { - mem_bump = 0; -} -#else -static scrypt_aligned_alloc -scrypt_alloc(uint64_t size) { - static const size_t max_alloc = (size_t)-1; - scrypt_aligned_alloc aa; - size += (SCRYPT_BLOCK_BYTES - 1); - if (size > max_alloc) - 
scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory"); - aa.mem = (uint8_t *)malloc((size_t)size); - aa.ptr = (uint8_t *)(((size_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); - if (!aa.mem) - scrypt_fatal_error("scrypt: out of memory"); - return aa; -} - -static void -scrypt_free(scrypt_aligned_alloc *aa) { - free(aa->mem); -} -#endif - - -void -scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t Nfactor, uint8_t rfactor, uint8_t pfactor, uint8_t *out, size_t bytes) { - scrypt_aligned_alloc YX, V; - uint8_t *X, *Y; - uint32_t N, r, p, chunk_bytes, i; - -#if !defined(SCRYPT_CHOOSE_COMPILETIME) - scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix(); -#endif - -#if !defined(SCRYPT_TEST) - static int power_on_self_test = 0; - if (!power_on_self_test) { - power_on_self_test = 1; - if (!scrypt_power_on_self_test()) - scrypt_fatal_error("scrypt: power on self test failed"); - } -#endif - - if (Nfactor > scrypt_maxN) - scrypt_fatal_error("scrypt: N out of range"); - if (rfactor > scrypt_maxr) - scrypt_fatal_error("scrypt: r out of range"); - if (pfactor > scrypt_maxp) - scrypt_fatal_error("scrypt: p out of range"); - - N = (1 << (Nfactor + 1)); - r = (1 << rfactor); - p = (1 << pfactor); - - chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2; - V = scrypt_alloc((uint64_t)N * chunk_bytes); - YX = scrypt_alloc((p + 1) * chunk_bytes); - - /* 1: X = PBKDF2(password, salt) */ - Y = YX.ptr; - X = Y + chunk_bytes; - scrypt_pbkdf2(password, password_len, salt, salt_len, 1, X, chunk_bytes * p); - - /* 2: X = ROMix(X) */ - for (i = 0; i < p; i++) - scrypt_ROMix((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V.ptr, N, r); - - /* 3: Out = PBKDF2(password, X) */ - scrypt_pbkdf2(password, password_len, X, chunk_bytes * p, 1, out, bytes); - - scrypt_ensure_zero(YX.ptr, (p + 1) * chunk_bytes); - - scrypt_free(&V); - scrypt_free(&YX); -} diff --git a/scrypt-jane/test-speed.sh b/scrypt-jane/test-speed.sh deleted file mode 100644 index f223dae49..000000000 --- a/scrypt-jane/test-speed.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/sh - -test() { - sleep 0.25 # mingw is stupid and will occasionally not have permission to overwrite scrypt_speed - gcc scrypt-jane-speed.c -O3 -DSCRYPT_$1 -DSCRYPT_$2 $3 -o scrypt_speed 2>/dev/null - local RC=$? - if [ $RC -ne 0 ]; then - echo "$1/$2: failed to compile " - return - fi - ./scrypt_speed -} - -testhash() { - test $1 SALSA $2 - test $1 CHACHA $2 - test $1 SALSA64 $2 -} - -testhashes() { - testhash SHA256 $1 - testhash SHA512 $1 - testhash BLAKE256 $1 - testhash BLAKE512 $1 - testhash SKEIN512 $1 - testhash KECCAK256 $1 - testhash KECCAK512 $1 -} - -if [ -z $1 ]; then - testhashes -elif [ $1 -eq 32 ]; then - testhashes -m32 -elif [ $1 -eq 64 ]; then - testhashes -m64 -fi - -rm -f scrypt_speed \ No newline at end of file diff --git a/scrypt-jane/test.sh b/scrypt-jane/test.sh deleted file mode 100644 index dc3d03251..000000000 --- a/scrypt-jane/test.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/sh - -test() { - sleep 0.25 # mingw is stupid and will occasionally not have permission to overwrite scrypt_test - gcc scrypt-jane-test.c -O3 -DSCRYPT_$1 -DSCRYPT_$2 $3 -o scrypt_test 2>/dev/null - local RC=$? - if [ $RC -ne 0 ]; then - echo "$1/$2: failed to compile " - return - fi - ./scrypt_test >/dev/null - local RC=$? 
- if [ $RC -ne 0 ]; then - echo "$1/$2: validation failed" - return - fi - echo "$1/$2: OK" -} - -testhash() { - test $1 SALSA $2 - test $1 CHACHA $2 - test $1 SALSA64 $2 -} - -testhashes() { - testhash SHA256 $1 - testhash SHA512 $1 - testhash BLAKE256 $1 - testhash BLAKE512 $1 - testhash SKEIN512 $1 - testhash KECCAK256 $1 - testhash KECCAK512 $1 -} - -if [ -z $1 ]; then - testhashes -elif [ $1 -eq 32 ]; then - testhashes -m32 -elif [ $1 -eq 64 ]; then - testhashes -m64 -fi - -rm -f scrypt_test diff --git a/scrypt.c b/scrypt.c index f113c79f7..06fd76a19 100644 --- a/scrypt.c +++ b/scrypt.c @@ -31,8 +31,8 @@ #include "miner.h" #include +#include #include -#include static const uint32_t keypad[12] = { 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 diff --git a/sha2.c b/sha2.c index d13a49514..817473586 100644 --- a/sha2.c +++ b/sha2.c @@ -12,7 +12,7 @@ #include "miner.h" #include -#include +#include #if defined(__arm__) && defined(__APCS_32__) #define EXTERN_SHA256 diff --git a/util.c b/util.c index 4837721fe..794491ee8 100644 --- a/util.c +++ b/util.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include
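
A note on the new calling convention, since it differs from the deleted upstream scrypt(): scrypt_N_1_1() performs no allocation, so the caller supplies the N-chunk V array plus a (p + 1)-chunk Y/X scratch area, exactly as scanhash_scrypt_jane does above. The sketch below is illustrative only; the names BLOCK_BYTES, aligned_base, and scrypt_jane_hash_example are not from the patch, the 64-byte block size assumes the ChaCha mix function (SCRYPT_BLOCK_BYTES == 64), and since scrypt_alloc() is file-static in scrypt-jane.c, a plain malloc() is rounded up to the block boundary the same way scrypt_alloc() does internally.

#include <stdint.h>
#include <stdlib.h>
#include "scrypt-jane.h"

#define BLOCK_BYTES 64u  /* SCRYPT_BLOCK_BYTES for the ChaCha mix function */

/* round up to the block boundary, the same trick scrypt_alloc() uses */
static uint8_t *
aligned_base(uint8_t *mem) {
	return (uint8_t *)(((size_t)mem + (BLOCK_BYTES - 1)) & ~(size_t)(BLOCK_BYTES - 1));
}

/* hash one 80-byte block header, the way scanhash_scrypt_jane drives it */
int
scrypt_jane_hash_example(const uint8_t *hdr80, uint8_t out[32], uint32_t Nfactor) {
	const uint32_t r = 1, p = 1;  /* hard-coded in scrypt-jane.c */
	uint32_t N = 1 << (Nfactor + 1);
	uint32_t chunk_bytes = BLOCK_BYTES * r * 2;
	uint8_t *Vmem = malloc((size_t)N * chunk_bytes + BLOCK_BYTES);
	uint8_t *YXmem = malloc((size_t)(p + 1) * chunk_bytes + BLOCK_BYTES);
	uint8_t *V, *Y, *X;

	if (!Vmem || !YXmem) {
		free(Vmem);
		free(YXmem);
		return -1;
	}
	V = aligned_base(Vmem);
	Y = aligned_base(YXmem);  /* Y is chunk 0, X is the p chunks after it */
	X = Y + chunk_bytes;

	/* the miner passes the 80-byte block header as both password and salt */
	scrypt_N_1_1(hdr80, 80, hdr80, 80, N, out, 32, X, Y, V);

	free(Vmem);
	free(YXmem);
	return 0;
}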