From cfe33ad88dd1eb27015cfa0d88edac625091d43a Mon Sep 17 00:00:00 2001 From: Tony Butler Date: Wed, 19 Sep 2018 11:04:12 -0600 Subject: [PATCH] testing Monero POW v2 #1831 (as of 20180919) --- CMakeLists.txt | 32 ++- README.md | 1 + xmrstak/backend/amd/amd_gpu/gpu.cpp | 28 ++- xmrstak/backend/amd/amd_gpu/gpu.hpp | 1 + .../backend/amd/amd_gpu/opencl/cryptonight.cl | 177 ++++++++++++--- .../amd/amd_gpu/opencl/fast_int_math_v2.cl | 136 ++++++++++++ xmrstak/backend/amd/autoAdjust.hpp | 22 +- xmrstak/backend/amd/config.tpl | 5 +- xmrstak/backend/amd/jconf.cpp | 12 +- xmrstak/backend/amd/jconf.hpp | 1 + xmrstak/backend/amd/minethd.cpp | 1 + xmrstak/backend/cpu/autoAdjust.hpp | 2 +- xmrstak/backend/cpu/autoAdjustHwloc.hpp | 2 +- xmrstak/backend/cpu/config.tpl | 13 +- .../cpu/crypto/asm/cryptonight_v8_main_loop.S | 27 +++ .../crypto/asm/cryptonight_v8_main_loop.asm | 18 ++ ...yptonight_v8_main_loop_ivybridge_linux.inc | 177 +++++++++++++++ ...yptonight_v8_main_loop_ivybridge_win64.inc | 176 +++++++++++++++ .../cryptonight_v8_main_loop_ryzen_linux.inc | 174 +++++++++++++++ .../cryptonight_v8_main_loop_ryzen_win64.inc | 174 +++++++++++++++ .../backend/cpu/crypto/cryptonight_aesni.h | 204 +++++++++++++++--- xmrstak/backend/cpu/jconf.cpp | 9 +- xmrstak/backend/cpu/jconf.hpp | 1 + xmrstak/backend/cpu/minethd.cpp | 55 ++++- xmrstak/backend/cpu/minethd.hpp | 5 +- xmrstak/backend/cryptonight.hpp | 13 ++ xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 191 +++++++++++++++- .../backend/nvidia/nvcc_code/cuda_extra.cu | 25 ++- xmrstak/jconf.cpp | 2 + xmrstak/misc/executor.cpp | 2 +- xmrstak/net/jpsock.cpp | 3 + xmrstak/pools.tpl | 4 +- 32 files changed, 1601 insertions(+), 92 deletions(-) create mode 100644 xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc create mode 100644 xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc diff --git a/CMakeLists.txt b/CMakeLists.txt index a642b385d..eec03df9b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -445,6 +445,26 @@ if(CMAKE_LINK_STATIC) endif() endif() +if(CMAKE_C_COMPILER_ID MATCHES "MSVC") + # asm optimized monero v8 code + enable_language(ASM_MASM) + set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm" PROPERTY ASM_MASM) + add_library(xmr-stak-asm + STATIC + "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm" + ) +else() + # asm optimized monero v8 code + enable_language(ASM) + set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTY C) + add_library(xmr-stak-asm + STATIC + "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" + ) +endif() + +set_property(TARGET xmr-stak-asm PROPERTY LINKER_LANGUAGE C) + # compile C files file(GLOB SRCFILES_C "xmrstak/backend/cpu/crypto/*.c") @@ -456,7 +476,7 @@ set_property(TARGET xmr-stak-c PROPERTY C_STANDARD 99) if(MICROHTTPD_ENABLE) target_link_libraries(xmr-stak-c ${MHTD}) endif() -target_link_libraries(xmr-stak-c ${LIBS}) +target_link_libraries(xmr-stak-c ${LIBS} xmr-stak-asm) # compile generic backend files file(GLOB BACKEND_CPP @@ -472,7 +492,7 @@ add_library(xmr-stak-backend STATIC ${BACKEND_CPP} ) -target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS}) +target_link_libraries(xmr-stak-backend xmr-stak-c ${CMAKE_DL_LIBS} xmr-stak-asm) # compile CUDA backend if(CUDA_FOUND) @@ -499,7 +519,7 @@ if(CUDA_FOUND) ) endif() target_link_libraries(xmrstak_cuda_backend ${CUDA_LIBRARIES}) - target_link_libraries(xmrstak_cuda_backend xmr-stak-backend) + target_link_libraries(xmrstak_cuda_backend xmr-stak-backend xmr-stak-asm) endif() # compile AMD backend @@ -512,7 +532,7 @@ if(OpenCL_FOUND) ${OPENCLSRCFILES} ) target_link_libraries(xmrstak_opencl_backend ${OpenCL_LIBRARY} ) - target_link_libraries(xmrstak_opencl_backend xmr-stak-backend) + target_link_libraries(xmrstak_opencl_backend xmr-stak-backend xmr-stak-asm) endif() # compile final binary @@ -528,7 +548,7 @@ endif() set(EXECUTABLE_OUTPUT_PATH "bin" CACHE STRING "Path to place executables relative to ${CMAKE_INSTALL_PREFIX}") set(LIBRARY_OUTPUT_PATH "bin" CACHE STRING "Path to place libraries relative to ${CMAKE_INSTALL_PREFIX}") -target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend) +target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend xmr-stak-asm) ################################################################################ # Install @@ -559,4 +579,4 @@ if( NOT CMAKE_INSTALL_PREFIX STREQUAL PROJECT_BINARY_DIR ) else() # this rule is used if the install prefix is the build directory install(CODE "MESSAGE(\"xmr-stak installed to folder 'bin'\")") -endif() +endif() \ No newline at end of file diff --git a/README.md b/README.md index e3b01328a..2fe1bc511 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,7 @@ If your prefered coin is not listed, you can choose one of the following algorit - cryptonight_masari - cryptonight_v7 - cryptonight_v7_stellite + - cryptonight_v8 - 4MiB scratchpad memory - cryptonight_haven - cryptonight_heavy diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 8d9b66853..767e53855 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -405,6 +405,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ options += " -DCOMP_MODE=" + std::to_string(ctx->compMode ? 1u : 0u); options += " -DMEMORY=" + std::to_string(hashMemSize); options += " -DALGO=" + std::to_string(miner_algo[ii]); + options += " -DCN_UNROLL=" + std::to_string(ctx->unroll); /* create a hash for the compile time cache * used data: @@ -901,6 +902,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) //char* source_code = LoadTextFile(sSourcePath); + const char *fastIntMathV2CL = + #include "./opencl/fast_int_math_v2.cl" + ; const char *cryptonightCL = #include "./opencl/cryptonight.cl" ; @@ -921,6 +925,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) ; std::string source_code(cryptonightCL); + source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL); @@ -930,16 +935,37 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) // create a directory for the OpenCL compile cache create_directory(get_home() + "/.openclcache"); + // check if cryptonight_monero_v8 is selected for the user or dev pool + bool useCryptonight_v8 = + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8; + for(int i = 0; i < num_gpus; ++i) { + const std::string backendName = xmrstak::params::inst().openCLVendor; if(ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0) { size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize; ctx[i].rawIntensity = reduced_intensity; - const std::string backendName = xmrstak::params::inst().openCLVendor; printer::inst()->print_msg(L0, "WARNING %s: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", backendName.c_str(), ctx[i].deviceIdx, int(reduced_intensity)); } + if(useCryptonight_v8) + { + if(ctx[i].stridedIndex == 1) + { + printer::inst()->print_msg(L0, "ERROR %s: gpu %d stridedIndex is not allowed to be `true` or `1` for the selected currency", backendName.c_str(), ctx[i].deviceIdx); + return ERR_STUPID_PARAMS; + } + if(ctx[i].stridedIndex == 2 && ctx[i].memChunk < 2) + { + printer::inst()->print_msg(L0, "ERROR %s: gpu %d memChunk bust be >= 2 for the selected currency", backendName.c_str(), ctx[i].deviceIdx); + return ERR_STUPID_PARAMS; + } + } + if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS) { return ret; diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index 5ab80b82a..63c5029d7 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -27,6 +27,7 @@ struct GpuContext size_t workSize; int stridedIndex; int memChunk; + int unroll = 0; bool isNVIDIA = false; int compMode; diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 78cd30c3a..286bc39b6 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -78,6 +78,8 @@ inline int amd_bfe(const uint src0, const uint offset, const uint width) } #endif +//#include "opencl/fast_int_math_v2.cl" +XMRSTAK_INCLUDE_FAST_INT_MATH_V2 //#include "opencl/wolf-aes.cl" XMRSTAK_INCLUDE_WOLF_AES //#include "opencl/wolf-skein.cl" @@ -416,6 +418,9 @@ void AESExpandKey256(uint *keybuf) } } +)===" +R"===( + #define MEM_CHUNK (1<> 4))))) +#else +# define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx0) >> 4) ^ N)]) +#endif + __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, ulong Threads // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 @@ -565,9 +578,29 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif ) { - ulong a[2], b[2]; + ulong a[2]; + +// cryptonight_monero_v8 +#if(ALGO==11) + ulong b[4]; + uint4 b_x[2]; +// NVIDIA +# ifdef __NV_CL_C_VERSION + __local uint16 scratchpad_line_buf[WORKSIZE]; + __local uint16* scratchpad_line = scratchpad_line_buf + get_local_id(0); +# endif +#else + ulong b[2]; + uint4 b_x[1]; +#endif __local uint AES0[256], AES1[256], AES2[256], AES3[256]; +// cryptonight_monero_v8 +#if(ALGO==11) + __local uint RCP[256]; + uint2 division_result; + uint sqrt_result; +#endif const ulong gIdx = getIdx(); for(int i = get_local_id(0); i < 256; i += WORKSIZE) @@ -577,6 +610,10 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states AES1[i] = rotate(tmp, 8U); AES2[i] = rotate(tmp, 16U); AES3[i] = rotate(tmp, 24U); +// cryptonight_monero_v8 +#if(ALGO==11) + RCP[i] = RCP_C[i]; +#endif } barrier(CLK_LOCAL_MEM_FENCE); @@ -584,7 +621,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) uint2 tweak1_2; #endif - uint4 b_x; + #if(COMP_MODE==1) // do not use early return here if(gIdx < Threads) @@ -604,7 +641,17 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states a[1] = states[1] ^ states[5]; b[1] = states[3] ^ states[7]; - b_x = ((uint4 *)b)[0]; + b_x[0] = ((uint4 *)b)[0]; + +// cryptonight_monero_v8 +#if(ALGO==11) + a[1] = states[1] ^ states[5]; + b[2] = states[8] ^ states[10]; + b[3] = states[9] ^ states[11]; + b_x[1] = ((uint4 *)b)[1]; + division_result = as_uint2(states[12]); + sqrt_result = as_uint2(states[13]).s0; +#endif // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) tweak1_2 = as_uint2(input[4]); @@ -622,37 +669,96 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states if(gIdx < Threads) #endif { - ulong idx0 = a[0]; + ulong idx0 = a[0] & MASK; - #pragma unroll 8 + #pragma unroll CN_UNROLL for(int i = 0; i < ITERATIONS; ++i) { ulong c[2]; +// cryptonight_monero_v8 && NVIDIA +#if(ALGO==11 && defined(__NV_CL_C_VERSION)) + ulong idxS = idx0 & 0x30; + *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; +#endif - ((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)]; + ((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0); // cryptonight_bittube2 #if(ALGO == 10) ((uint4 *)c)[0] = AES_Round_bittube2(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); #else ((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); #endif - b_x ^= ((uint4 *)c)[0]; + +// cryptonight_monero_v8 +#if(ALGO==11) + { + ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); + ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); + ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); + SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); + SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); + SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); + } +#endif + // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) uint table = 0x75310U; + b_x[0] ^= ((uint4 *)c)[0]; // cryptonight_stellite # if(ALGO == 7) - uint index = ((b_x.s2 >> 27) & 12) | ((b_x.s2 >> 23) & 2); + uint index = ((b_x[0].s2 >> 27) & 12) | ((b_x[0].s2 >> 23) & 2); +# else + uint index = ((b_x[0].s2 >> 26) & 12) | ((b_x[0].s2 >> 23) & 2); +# endif + b_x[0].s2 ^= ((table >> index) & 0x30U) << 24; + SCRATCHPAD_CHUNK(0) = b_x[0]; + idx0 = c[0] & MASK; +// cryptonight_monero_v8 +#elif(ALGO==11) + SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0]; +# ifdef __NV_CL_C_VERSION + // flush shuffled data + SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line; + idx0 = c[0] & MASK; + idxS = idx0 & 0x30; + *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; # else - uint index = ((b_x.s2 >> 26) & 12) | ((b_x.s2 >> 23) & 2); + idx0 = c[0] & MASK; # endif - b_x.s2 ^= ((table >> index) & 0x30U) << 24; +#else + b_x[0] ^= ((uint4 *)c)[0]; + SCRATCHPAD_CHUNK(0) = b_x[0]; + idx0 = c[0] & MASK; #endif - Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x; - uint4 tmp; - tmp = Scratchpad[IDX((c[0] & MASK) >> 4)]; - + tmp = SCRATCHPAD_CHUNK(0); +// cryptonight_monero_v8 +#if(ALGO==11) + // Use division and square root results from the _previous_ iteration to hide the latency + tmp.s0 ^= division_result.s0; + tmp.s1 ^= division_result.s1 ^ sqrt_result; + // Most and least significant bits in the divisor are set to 1 + // to make sure we don't divide by a small or even number, + // so there are no shortcuts for such cases + const uint d = (((uint *)c)[0] + (sqrt_result << 1)) | 0x80000001UL; + // Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 + // We drop the highest bit to fit both quotient and remainder in 32 bits + division_result = fast_div_v2(RCP, c[1], d); + // Use division_result as an input for the square root to prevent parallel implementation in hardware + sqrt_result = fast_sqrt_v2(c[0] + as_ulong(division_result)); +#endif +// cryptonight_monero_v8 +#if(ALGO==11) + { + ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); + ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); + ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); + SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); + SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); + SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); + } +#endif a[1] += c[0] * as_ulong2(tmp).s0; a[0] += mul_hi(c[0], as_ulong2(tmp).s0); @@ -663,44 +769,55 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states # if(ALGO == 6 || ALGO == 10) uint2 ipbc_tmp = tweak1_2 ^ ((uint2 *)&(a[0]))[0]; ((uint2 *)&(a[1]))[0] ^= ipbc_tmp; - Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; + SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; ((uint2 *)&(a[1]))[0] ^= ipbc_tmp; # else ((uint2 *)&(a[1]))[0] ^= tweak1_2; - Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; + SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; ((uint2 *)&(a[1]))[0] ^= tweak1_2; # endif #else - Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0]; + SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; #endif ((uint4 *)a)[0] ^= tmp; - idx0 = a[0]; - b_x = ((uint4 *)c)[0]; +// cryptonight_monero_v8 +#if (ALGO == 11) +# if defined(__NV_CL_C_VERSION) + // flush shuffled data + SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line; +# endif + b_x[1] = b_x[0]; +#endif + b_x[0] = ((uint4 *)c)[0]; + idx0 = a[0] & MASK; // cryptonight_heavy || cryptonight_bittube2 #if (ALGO == 4 || ALGO == 10) - long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))); - int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2]; + long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); + int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; long q = n / (d | 0x5); - *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q; - idx0 = d ^ q; -#endif + *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; + idx0 = (d ^ q) & MASK; // cryptonight_haven -#if (ALGO == 9) - long n = *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))); - int d = ((__global int*)(Scratchpad + (IDX((idx0 & MASK) >> 4))))[2]; +#elif (ALGO == 9) + long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); + int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; long q = n / (d | 0x5); - *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n ^ q; - idx0 = (~d) ^ q; + *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; + idx0 = ((~d) ^ q) & MASK; #endif + } } mem_fence(CLK_GLOBAL_MEM_FENCE); } +)===" +R"===( + __attribute__((reqd_work_group_size(WORKSIZE, 8, 1))) __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads) { diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl new file mode 100644 index 000000000..fe7cea1ee --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl @@ -0,0 +1,136 @@ +R"===( +/* + * @author SChernykh + */ +static const __constant uint RCP_C[256] = +{ + 0xfe01be73u,0xfd07ff01u,0xfa118c5au,0xf924fb13u,0xf630cddbu,0xf558f73cu,0xf25f2934u,0xf1a3f37bu, + 0xee9c4562u,0xee02efd0u,0xeae7ced5u,0xea76ec3au,0xe7417330u,0xe6ffe8b8u,0xe3a8e217u,0xe39be54au, + 0xe01dcd03u,0xe04ae1f0u,0xdc9fea3bu,0xdd0bdea8u,0xd92eef38u,0xd9dedb73u,0xd5ca9626u,0xd6c3d84fu, + 0xd27299dcu,0xd3b9d53cu,0xcf26b659u,0xd0bfd23au,0xcbe6ab09u,0xcdd5cf48u,0xc8b23886u,0xcafacc65u, + 0xc58920e5u,0xc82ec992u,0xc26b283eu,0xc572c6ceu,0xbf5813d7u,0xc2c3c419u,0xbc4facdbu,0xc023c171u, + 0xb951b9f6u,0xbd8fbed7u,0xb65e05c8u,0xbb09bc4bu,0xb3745d97u,0xb890b9cbu,0xb0948d04u,0xb624b758u, + 0xadbe61e8u,0xb3c3b4f2u,0xaaf1ae2au,0xb16eb297u,0xa82e412eu,0xaf25b048u,0xa573ec98u,0xace7ae05u, + 0xa2c28519u,0xaab4abcdu,0xa019df1cu,0xa88ca99fu,0x9d79cf91u,0xa66ea77cu,0x9ae22df8u,0xa45ba563u, + 0x9852d0ceu,0xa251a354u,0x95cb912eu,0xa050a14fu,0x934c48d6u,0x9e5a9f54u,0x90d4d228u,0x9c6c9d62u, + 0x8e650939u,0x9a879b79u,0x8bfccaf5u,0x98ac9998u,0x899bf212u,0x96d897c1u,0x87425eedu,0x950d95f2u, + 0x84efefd3u,0x934a942bu,0x82a48450u,0x918f926cu,0x805ffcb4u,0x8fdc90b5u,0x7e223ab7u,0x8e308f05u, + 0x7beb1f71u,0x8c8c8d5du,0x79ba8ce2u,0x8aef8bbdu,0x7790683eu,0x89598a23u,0x756c9343u,0x87ca8891u, + 0x734ef468u,0x86428705u,0x71376efbu,0x84c18581u,0x6f25e9ebu,0x83458402u,0x6d1a4b34u,0x81d0828au, + 0x6b147a52u,0x80628118u,0x69145cfbu,0x7ef97fadu,0x6719dd39u,0x7d967e47u,0x6524e2abu,0x7c397ce7u, + 0x6335561bu,0x7ae27b8du,0x614b21eau,0x79907a38u,0x5f662f10u,0x784478e9u,0x5d8667dfu,0x76fd77a0u, + 0x5babb887u,0x75bb765bu,0x59d60b2eu,0x747e751cu,0x58054d25u,0x734673e1u,0x5639688fu,0x721372acu, + 0x54724c2du,0x70e5717bu,0x52afe29cu,0x6fbb7050u,0x50f21c05u,0x6e966f28u,0x4f38e412u,0x6d766e06u, + 0x4d842a91u,0x6c5a6ce7u,0x4bd3dcd0u,0x6b426bcdu,0x4a27e96au,0x6a2e6ab8u,0x4880415eu,0x691f69a6u, + 0x46dcd25du,0x68136899u,0x453d8df4u,0x670c678fu,0x43a262a5u,0x6608668au,0x420b42d6u,0x65096588u, + 0x40781dd3u,0x640d648au,0x3ee8e49au,0x63146390u,0x3d5d8a11u,0x621f6299u,0x3bd5fee0u,0x612e61a6u, + 0x3a523496u,0x604060b7u,0x38d21e75u,0x5f565fcbu,0x3755aec4u,0x5e6f5ee2u,0x35dcd78fu,0x5d8b5dfdu, + 0x34678d72u,0x5cab5d1au,0x32f5c17cu,0x5bcd5c3bu,0x318767f1u,0x5af35b60u,0x301c7511u,0x5a1b5a87u, + 0x2eb4dccau,0x594759b1u,0x2d50935cu,0x587658deu,0x2bef8bfau,0x57a7580eu,0x2a91bc5cu,0x56db5741u, + 0x2937198fu,0x56125676u,0x27df970eu,0x554c55afu,0x268b2b78u,0x548854eau,0x2539cba1u,0x53c75428u, + 0x23eb6d84u,0x53095368u,0x22a00644u,0x524d52abu,0x21578cd3u,0x519451f0u,0x2011f5f9u,0x50dd5138u, + 0x1ecf388eu,0x50285082u,0x1d8f4b53u,0x4f764fcfu,0x1c5224abu,0x4ec64f1eu,0x1b17bb87u,0x4e184e6fu, + 0x19e0073fu,0x4d6d4dc2u,0x18aafe0au,0x4cc44d18u,0x177896f3u,0x4c1c4c70u,0x1648cb16u,0x4b784bcau, + 0x151b9051u,0x4ad54b26u,0x13f0deeau,0x4a344a84u,0x12c8aef3u,0x499549e4u,0x11a2f829u,0x48f84946u, + 0x107fb1ffu,0x485d48abu,0xf5ed5f0u,0x47c44811u,0xe405bc1u,0x472d4779u,0xd243bdau,0x469846e3u, + 0xc0a6fa1u,0x4605464eu,0xaf2edf2u,0x457345bcu,0x9ddb163u,0x44e3452bu,0x8cab264u,0x4455449cu, + 0x7b9e9d5u,0x43c9440fu,0x6ab5173u,0x433e4383u,0x59ee141u,0x42b542fau,0x49494c7u,0x422e4271u, + 0x38c62ffu,0x41a841ebu,0x286478bu,0x41244166u,0x1823b84u,0x40a140e2u,0x803883u,0x401C4060u, +}; + +inline uint get_reciprocal(const __local uchar *RCP, uint a) +{ + const uint index1 = (a & 0x7F000000U) >> 21; + const int index2 = (int)((a >> 8) & 0xFFFFU) - 32768; + + const uint r1 = *(const __local uint*)(RCP + index1); + + uint r2_0 = *(const __local uint*)(RCP + index1 + 4); + if (index2 > 0) r2_0 >>= 16; + const int r2 = r2_0 & 0xFFFFU; + + const uint r = r1 - (uint)(mul24(r2, index2) >> 6); + + const ulong lo0 = (ulong)(r) * a; + ulong lo = lo0 + ((ulong)(a) << 32); + + a >>= 1; + const bool b = (a >= lo) || (lo >= lo0); + lo = a - lo; + + const ulong k = mul_hi(as_uint2(lo).s0, r) + ((ulong)(r) * as_uint2(lo).s1) + lo; + return as_uint2(k).s1 + (b ? r : 0); +} + +inline uint2 fast_div_v2(const __local uint *RCP, ulong a, uint b) +{ + const uint r = get_reciprocal((const __local uchar *)RCP, b); + const ulong k = mul_hi(as_uint2(a).s0, r) + ((ulong)(r) * as_uint2(a).s1) + a; + + ulong q; + ((uint*)&q)[0] = as_uint2(k).s1;; + ((uint*)&q)[1] = (k < a) ? 1 : 0; + + const long tmp = a - q * b; + const bool overshoot = (tmp < 0); + const bool undershoot = (tmp >= b); + + return (uint2)( + as_uint2(q).s0 + (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U), + as_uint2(tmp).s0 + (overshoot ? b : 0U) - (undershoot ? b : 0U) + ); +} + +inline void fast_div_full_q(const __local uint *RCP, ulong a, uint b, ulong *q, uint *r) +{ + const uint rcp = get_reciprocal((const __local uchar *)RCP, b); + const ulong k = mul_hi(as_uint2(a).s0, rcp) + ((ulong)(as_uint2(a).s1) * rcp) + a; + + ((uint*)q)[0] = as_uint2(k).s1; + ((uint*)q)[1] = (k < a) ? 1 : 0; + + long tmp = a - (*q) * b; + + const bool overshoot = (tmp < 0); + const bool undershoot = (tmp >= b); + + if (overshoot) + { + --(*q); + tmp += b; + } + + if (undershoot) + { + ++(*q); + tmp -= b; + } + + *r = tmp; +} + +inline uint fast_sqrt_v2(const ulong n1) +{ + float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23)); + + float x1 = native_rsqrt(x); + x = native_sqrt(x); + + // The following line does x1 *= 4294967296.0f; + x1 = as_float(as_uint(x1) + (32U << 23)); + + const uint x0 = as_uint(x) - (158U << 23); + const long delta0 = n1 - (((long)(x0) * x0) << 18); + const float delta = convert_float_rte(as_int2(delta0).s1) * x1; + + uint result = (x0 << 10) + convert_int_rte(delta); + const uint s = result >> 1; + const uint b = result & 1; + + const ulong x2 = (ulong)(s) * (s + b) + ((ulong)(result) << 32) - n1; + if ((long)(x2 + b) > 0) --result; + if ((long)(x2 + 0x100000000UL + s) < 0) ++result; + + return result; +} +)===" diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index d6acec971..c5b331c87 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -127,6 +127,24 @@ class autoAdjust minFreeMem = 512u * byteToMiB; } + // check if cryptonight_monero_v8 is selected for the user or dev pool + bool useCryptonight_v8 = + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8; + + // set strided index to default + ctx.stridedIndex = 1; + + // nvidia performance is very bad if the scratchpad is not contiguous + if(ctx.isNVIDIA) + ctx.stridedIndex = 0; + + // use chunked (4x16byte) scratchpad for all backends. Default `mem_chunk` is `2` + if(useCryptonight_v8) + ctx.stridedIndex = 2; + // increase all intensity limits by two for aeon if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite) maxThreads *= 2u; @@ -153,8 +171,8 @@ class autoAdjust // set 8 threads per block (this is a good value for the most gpus) conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + - " \"affine_to_cpu\" : false, \"strided_index\" : " + (ctx.isNVIDIA ? "0" : "1") + ", \"mem_chunk\" : 2,\n" - " \"comp_mode\" : true\n" + + " \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n" + " \"unroll\" : 8, \"comp_mode\" : true\n" + " },\n"; } else diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl index 28855f070..043b05355 100644 --- a/xmrstak/backend/amd/config.tpl +++ b/xmrstak/backend/amd/config.tpl @@ -9,17 +9,20 @@ R"===( * 2 = chunked memory, chunk size is controlled by 'mem_chunk' * required: intensity must be a multiple of worksize * 1 or true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks + * (not allowed for cryptonight_v8 and monero8) * 0 or false = use a contiguous block of memory per thread * mem_chunk - range 0 to 18: set the number of elements (16byte) per chunk * this value is only used if 'strided_index' == 2 * element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte) + * unroll - allow to control how often the POW main loop is unrolled; valid range [0;128) - for most OpenCL implementations it must be a power of two. * comp_mode - Compatibility enable/disable the automatic guard around compute kernel which allows * to use a intensity which is not the multiple of the worksize. * If you set false and the intensity is not multiple of the worksize the miner can crash: * in this case set the intensity to a multiple of the worksize or activate comp_mode. * "gpu_threads_conf" : * [ - * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true, "mem_chunk" : 2, "comp_mode" : true }, + * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, + * "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true }, * ], * If you do not wish to mine with your AMD GPU(s) then use: * "gpu_threads_conf" : diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp index 9e15c930c..777dbdbb5 100644 --- a/xmrstak/backend/amd/jconf.cpp +++ b/xmrstak/backend/amd/jconf.cpp @@ -106,17 +106,18 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *compMode; + const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode; idx = GetObjectMember(oThdConf, "index"); intensity = GetObjectMember(oThdConf, "intensity"); w_size = GetObjectMember(oThdConf, "worksize"); aff = GetObjectMember(oThdConf, "affine_to_cpu"); stridedIndex = GetObjectMember(oThdConf, "strided_index"); memChunk = GetObjectMember(oThdConf, "mem_chunk"); + unroll = GetObjectMember(oThdConf, "unroll"); compMode = GetObjectMember(oThdConf, "comp_mode"); if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || memChunk == nullptr || - stridedIndex == nullptr || compMode == nullptr) + stridedIndex == nullptr || unroll == nullptr || compMode == nullptr) return false; if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64()) @@ -149,6 +150,13 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) } cfg.memChunk = (int)memChunk->GetInt64(); + + if(!unroll->IsUint64() || (int)unroll->GetInt64() >= 128 || ) + { + printer::inst()->print_msg(L0, "ERROR: unroll must be smaller than 128 and a power of two"); + return false; + } + cfg.unroll = (int)unroll->GetInt64(); if(!compMode->IsBool()) return false; diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp index 580b69fe7..b852c5940 100644 --- a/xmrstak/backend/amd/jconf.hpp +++ b/xmrstak/backend/amd/jconf.hpp @@ -28,6 +28,7 @@ class jconf long long cpu_aff; int stridedIndex; int memChunk; + int unroll; bool compMode; }; diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index d6051ffcd..5ac246335 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -99,6 +99,7 @@ bool minethd::init_gpus() vGpuData[i].stridedIndex = cfg.stridedIndex; vGpuData[i].memChunk = cfg.memChunk; vGpuData[i].compMode = cfg.compMode; + vGpuData[i].unroll = cfg.unroll; } return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS; diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp index 57dbef053..28ff515d4 100644 --- a/xmrstak/backend/cpu/autoAdjust.hpp +++ b/xmrstak/backend/cpu/autoAdjust.hpp @@ -82,7 +82,7 @@ class autoAdjust conf += std::string(" { \"low_power_mode\" : "); conf += std::string(double_mode ? "true" : "false"); - conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : "); + conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : "); conf += std::to_string(aff_id); conf += std::string(" },\n"); diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp index 01d2280d8..2bebf82d0 100644 --- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp +++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp @@ -70,7 +70,7 @@ class autoAdjust { conf += std::string(" { \"low_power_mode\" : "); conf += std::string((id & 0x8000000) != 0 ? "true" : "false"); - conf += std::string(", \"no_prefetch\" : true, \"affine_to_cpu\" : "); + conf += std::string(", \"no_prefetch\" : true, \"asm\" : \"off\", \"affine_to_cpu\" : "); conf += std::to_string(id & 0x7FFFFFF); conf += std::string(" },\n"); } diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl index 2fc9a47ec..e4da15fad 100644 --- a/xmrstak/backend/cpu/config.tpl +++ b/xmrstak/backend/cpu/config.tpl @@ -7,10 +7,15 @@ R"===( * the maximum performance. When set to a number N greater than 1, this mode will increase the * cache usage and single thread performance by N times. * - * no_prefetch - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make + * no_prefetch - Some systems can gain up to extra 5% here, but sometimes it will have no difference or make * things slower. * - * affine_to_cpu - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading + * asm - Allow to switch to a assembler version of cryptonight_v8; allowed value [off, intel, ryzen] + * - off: used the default implementation (no assembler version) + * - intel: supports Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx) + * - ryzen: AMD Ryzen (1xxx and 2xxx series) + * + * affine_to_cpu - This can be either false (no affinity), or the CPU core number. Note that on hyperthreading * systems it is better to assign threads to physical cores. On Windows this usually means selecting * even or odd numbered cpu numbers. For Linux it will be usually the lower CPU numbers, so for a 4 * physical core CPU you should select cpu numbers 0-3. @@ -21,8 +26,8 @@ R"===( * A filled out configuration should look like this: * "cpu_threads_conf" : * [ - * { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 0 }, - * { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 1 }, + * { "low_power_mode" : false, "no_prefetch" : true, "asm" : "off", "affine_to_cpu" : 0 }, + * { "low_power_mode" : false, "no_prefetch" : true, "asm" : "off", "affine_to_cpu" : 1 }, * ], * If you do not wish to mine with your CPU(s) then use: * "cpu_threads_conf" : diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S new file mode 100644 index 000000000..b6be9438f --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S @@ -0,0 +1,27 @@ +#define ALIGN .align +.intel_syntax noprefix +#ifdef __APPLE__ +# define FN_PREFIX(fn) _ ## fn +.text +#else +# define FN_PREFIX(fn) fn +.section .text +#endif +.global FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm) +.global FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm) + +ALIGN 8 +FN_PREFIX(cryptonight_v8_mainloop_ivybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cryptonight_v8_main_loop_ivybridge_linux.inc" + add rsp, 48 + ret 0 + +ALIGN 8 +FN_PREFIX(cryptonight_v8_mainloop_ryzen_asm): + sub rsp, 48 + mov rcx, rdi + #include "cryptonight_v8_main_loop_ryzen_linux.inc" + add rsp, 48 + ret 0 diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm new file mode 100644 index 000000000..a1615e9bd --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm @@ -0,0 +1,18 @@ +_TEXT_CNV8_MAINLOOP SEGMENT PAGE READ EXECUTE +PUBLIC cryptonight_v8_mainloop_ivybridge_asm +PUBLIC cryptonight_v8_mainloop_ryzen_asm + +ALIGN 8 +cryptonight_v8_mainloop_ivybridge_asm PROC + INCLUDE cryptonight_v8_main_loop_ivybridge_win64.inc + ret 0 +cryptonight_v8_mainloop_ivybridge_asm ENDP + +ALIGN 8 +cryptonight_v8_mainloop_ryzen_asm PROC + INCLUDE cryptonight_v8_main_loop_ryzen_win64.inc + ret 0 +cryptonight_v8_mainloop_ryzen_asm ENDP + +_TEXT_CNV8_MAINLOOP ENDS +END diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc new file mode 100644 index 000000000..21f1f48c3 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_linux.inc @@ -0,0 +1,177 @@ + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 80 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov esi, 524288 + mov r8, QWORD PTR [rcx+32] + mov r13d, -2147483647 + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm4, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + movq xmm3, QWORD PTR [r9+104] + movaps XMMWORD PTR [rsp+64], xmm6 + movaps XMMWORD PTR [rsp+48], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + and r10d, 2097136 + movq xmm5, rax + + xor eax, eax + mov QWORD PTR [rsp+16], rax + + mov ax, 1023 + shl rax, 52 + movq xmm8, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, rcx + punpcklqdq xmm5, xmm0 + + ALIGN 8 +main_loop_ivybridge: + movdqu xmm6, XMMWORD PTR [r10+rbx] + lea rdx, QWORD PTR [r10+rbx] + mov ecx, r10d + mov eax, r10d + mov rdi, r15 + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + movq xmm0, r11 + movq xmm7, r8 + punpcklqdq xmm7, xmm0 + aesenc xmm6, xmm7 + movdqu xmm1, XMMWORD PTR [rax+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm1, xmm7 + movdqu xmm2, XMMWORD PTR [rcx+rbx] + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movq rcx, xmm3 + movdqu XMMWORD PTR [rax+rbx], xmm2 + mov rax, rcx + movdqu XMMWORD PTR [r10+rbx], xmm1 + shl rax, 32 + xor rdi, rax + movq rbp, xmm6 + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + mov r10, rbp + and r10d, 2097136 + movdqu XMMWORD PTR [rdx], xmm0 + xor rdi, QWORD PTR [r10+rbx] + lea r14, QWORD PTR [r10+rbx] + mov r12, QWORD PTR [r10+rbx+8] + xor edx, edx + lea r9d, DWORD PTR [ecx+ecx] + add r9d, ebp + movdqa xmm0, xmm6 + psrldq xmm0, 8 + or r9d, r13d + movq rax, xmm0 + div r9 + mov eax, eax + shl rdx, 32 + add rdx, rax + lea r9, QWORD PTR [rdx+rbp] + mov r15, rdx + mov rax, r9 + shr rax, 12 + movq xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm3, xmm0 + movq rdx, xmm3 + test rdx, 524287 + je sqrt_fixup_ivybridge + psrlq xmm3, 19 + psubq xmm3, XMMWORD PTR [rsp+16] +sqrt_fixup_ivybridge_ret: + + mov r9, r10 + mov rax, rdi + mul rbp + + xor r9, 16 + mov rcx, r10 + xor rcx, 32 + xor r10, 48 + add r8, rdx + add r11, rax + movdqu xmm0, XMMWORD PTR [r10+rbx] + movdqu xmm2, XMMWORD PTR [r9+rbx] + paddq xmm0, xmm5 + movdqu xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm2, xmm4 + paddq xmm1, xmm7 + movdqa xmm5, xmm4 + movdqu XMMWORD PTR [r9+rbx], xmm0 + movdqa xmm4, xmm6 + movdqu XMMWORD PTR [rcx+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + mov QWORD PTR [r14], r8 + xor r8, rdi + mov r10, r8 + mov QWORD PTR [r14+8], r11 + and r10d, 2097136 + xor r11, r12 + dec rsi + jne main_loop_ivybridge + + ldmxcsr DWORD PTR [rsp] + mov rbx, QWORD PTR [rsp+160] + movaps xmm6, XMMWORD PTR [rsp+64] + movaps xmm7, XMMWORD PTR [rsp+48] + movaps xmm8, XMMWORD PTR [rsp+32] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + jmp cnv2_main_loop_ivybridge_endp + +sqrt_fixup_ivybridge: + dec rdx + mov r13d, -1022 + shl r13, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + not r13 + sub rcx, r13 + mov r13d, -2147483647 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movq xmm3, rdx + jmp sqrt_fixup_ivybridge_ret + +cnv2_main_loop_ivybridge_endp: diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc new file mode 100644 index 000000000..ee7f31716 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ivybridge_win64.inc @@ -0,0 +1,176 @@ + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 80 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov esi, 524288 + mov r8, QWORD PTR [rcx+32] + mov r13d, -2147483647 + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movd xmm4, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movd xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + movq xmm3, QWORD PTR [r9+104] + movaps XMMWORD PTR [rsp+64], xmm6 + movaps XMMWORD PTR [rsp+48], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + and r10d, 2097136 + movd xmm5, rax + + xor eax, eax + mov QWORD PTR [rsp+16], rax + + mov ax, 1023 + shl rax, 52 + movd xmm8, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movd xmm0, rcx + punpcklqdq xmm5, xmm0 + + ALIGN 8 +main_loop_ivybridge: + movdqu xmm6, XMMWORD PTR [r10+rbx] + lea rdx, QWORD PTR [r10+rbx] + mov ecx, r10d + mov eax, r10d + mov rdi, r15 + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + movd xmm0, r11 + movd xmm7, r8 + punpcklqdq xmm7, xmm0 + aesenc xmm6, xmm7 + movdqu xmm1, XMMWORD PTR [rax+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm1, xmm7 + movdqu xmm2, XMMWORD PTR [rcx+rbx] + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movd rcx, xmm3 + movdqu XMMWORD PTR [rax+rbx], xmm2 + mov rax, rcx + movdqu XMMWORD PTR [r10+rbx], xmm1 + shl rax, 32 + xor rdi, rax + movd rbp, xmm6 + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + mov r10, rbp + and r10d, 2097136 + movdqu XMMWORD PTR [rdx], xmm0 + xor rdi, QWORD PTR [r10+rbx] + lea r14, QWORD PTR [r10+rbx] + mov r12, QWORD PTR [r10+rbx+8] + xor edx, edx + lea r9d, DWORD PTR [ecx+ecx] + add r9d, ebp + movdqa xmm0, xmm6 + psrldq xmm0, 8 + or r9d, r13d + movd rax, xmm0 + div r9 + mov eax, eax + shl rdx, 32 + add rdx, rax + lea r9, QWORD PTR [rdx+rbp] + mov r15, rdx + mov rax, r9 + shr rax, 12 + movd xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm3, xmm0 + movd rdx, xmm3 + test rdx, 524287 + je sqrt_fixup_ivybridge + psrlq xmm3, 19 + psubq xmm3, XMMWORD PTR [rsp+16] +sqrt_fixup_ivybridge_ret: + + mov r9, r10 + mov rax, rdi + mul rbp + + xor r9, 16 + mov rcx, r10 + xor rcx, 32 + xor r10, 48 + add r8, rdx + add r11, rax + movdqu xmm0, XMMWORD PTR [r10+rbx] + movdqu xmm2, XMMWORD PTR [r9+rbx] + paddq xmm0, xmm5 + movdqu xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm2, xmm4 + paddq xmm1, xmm7 + movdqa xmm5, xmm4 + movdqu XMMWORD PTR [r9+rbx], xmm0 + movdqa xmm4, xmm6 + movdqu XMMWORD PTR [rcx+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + mov QWORD PTR [r14], r8 + xor r8, rdi + mov r10, r8 + mov QWORD PTR [r14+8], r11 + and r10d, 2097136 + xor r11, r12 + dec rsi + jne main_loop_ivybridge + + ldmxcsr DWORD PTR [rsp] + mov rbx, QWORD PTR [rsp+160] + movaps xmm6, XMMWORD PTR [rsp+64] + movaps xmm7, XMMWORD PTR [rsp+48] + movaps xmm8, XMMWORD PTR [rsp+32] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + jmp cnv2_main_loop_ivybridge_endp + +sqrt_fixup_ivybridge: + dec rdx + mov r13, -4389456576512 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + mov r13, 4389456576511 + sub rcx, r13 + mov r13d, -2147483647 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movd xmm3, rdx + jmp sqrt_fixup_ivybridge_ret + +cnv2_main_loop_ivybridge_endp: diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc new file mode 100644 index 000000000..9c177b85a --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_linux.inc @@ -0,0 +1,174 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 524288 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movq xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN 8 +main_loop_ryzen: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movq xmm0, r11 + movq xmm6, r8 + punpcklqdq xmm6, xmm0 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + movq r14, xmm5 + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movq rax, xmm0 + + div r9 + movq xmm0, rax + movq xmm1, rdx + punpckldq xmm0, xmm1 + movq r15, xmm0 + paddq xmm0, xmm5 + movdqa xmm2, xmm0 + psrlq xmm0, 12 + paddq xmm0, xmm7 + sqrtsd xmm1, xmm0 + movq rdi, xmm1 + test rdi, 524287 + je sqrt_fixup_ryzen + shr rdi, 19 + +sqrt_fixup_ryzen_ret: + mov rax, rsi + mul r14 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm0, XMMWORD PTR [r10+rbx] + movdqa xmm2, XMMWORD PTR [r9+rbx] + movdqa xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm0, xmm4 + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm0 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 2097136 + movdqa xmm3, xmm5 + dec ebp + jne main_loop_ryzen + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp cnv2_main_loop_ryzen_endp + +sqrt_fixup_ryzen: + movq r9, xmm2 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp sqrt_fixup_ryzen_ret + +cnv2_main_loop_ryzen_endp: diff --git a/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc new file mode 100644 index 000000000..f70dccef8 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop_ryzen_win64.inc @@ -0,0 +1,174 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 524288 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movd xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movd xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + movd xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movd xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movd xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN 8 +main_loop_ryzen: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movd xmm0, r11 + movd xmm6, r8 + punpcklqdq xmm6, xmm0 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + movd r14, xmm5 + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movd rax, xmm0 + + div r9 + movd xmm0, rax + movd xmm1, rdx + punpckldq xmm0, xmm1 + movd r15, xmm0 + paddq xmm0, xmm5 + movdqa xmm2, xmm0 + psrlq xmm0, 12 + paddq xmm0, xmm7 + sqrtsd xmm1, xmm0 + movd rdi, xmm1 + test rdi, 524287 + je sqrt_fixup_ryzen + shr rdi, 19 + +sqrt_fixup_ryzen_ret: + mov rax, rsi + mul r14 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm0, XMMWORD PTR [r10+rbx] + movdqa xmm2, XMMWORD PTR [r9+rbx] + movdqa xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm0, xmm4 + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm0 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 2097136 + movdqa xmm3, xmm5 + dec ebp + jne main_loop_ryzen + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp cnv2_main_loop_ryzen_endp + +sqrt_fixup_ryzen: + movd r9, xmm2 + dec rdi + mov rdx, 4389456576511 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + sub rcx, rdx + mov rdx, -4389456576512 + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp sqrt_fixup_ryzen_ret + +cnv2_main_loop_ryzen_endp: diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 89c508990..6edae905e 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -19,6 +19,8 @@ #include "xmrstak/backend/cryptonight.hpp" #include #include +#include +#include #ifdef __GNUC__ #include @@ -422,6 +424,29 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) _mm_store_si128(output + 11, xout7); } +inline uint64_t int_sqrt33_1_double_precision(const uint64_t n0) +{ + __m128d x = _mm_castsi128_pd(_mm_add_epi64(_mm_cvtsi64_si128(n0 >> 12), _mm_set_epi64x(0, 1023ULL << 52))); + x = _mm_sqrt_sd(_mm_setzero_pd(), x); + uint64_t r = static_cast(_mm_cvtsi128_si64(_mm_castpd_si128(x))); + + const uint64_t s = r >> 20; + r >>= 19; + + uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1); + +#ifdef __INTEL_COMPILER + _addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned __int64*)&x2), r, 0, (unsigned __int64*)&r); +#elif defined(_MSC_VER) || (__GNUC__ >= 7) + _addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r); +#else + // GCC versions prior to 7 don't generate correct assembly for _subborrow_u64 -> _addcarry_u64 sequence + // Fallback to simpler code + if (x2 < n0) ++r; +#endif + return r; +} + inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key) { alignas(16) uint32_t k[4]; @@ -467,6 +492,94 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) } +/** optimal type for sqrt + * + * Depending on the number of hashes calculated the optimal type for the sqrt value will be selected. + * + * @tparam N number of hashes per thread + */ +template +struct GetOptimalSqrtType +{ + using type = __m128i; +}; + +template<> +struct GetOptimalSqrtType<1u> +{ + using type = uint64_t; +}; +template +using GetOptimalSqrtType_t = typename GetOptimalSqrtType::type; + +/** assign a value and convert if necessary + * + * @param output output type + * @param input value which is assigned to output + * @{ + */ +inline void assign(__m128i& output, const uint64_t input) +{ + output = _mm_cvtsi64_si128(input); +} + +inline void assign(uint64_t& output, const uint64_t input) +{ + output = input; +} + +inline void assign(uint64_t& output, const __m128i& input) +{ + output = _mm_cvtsi128_si64(input); +} +/** @} */ + +inline void set_float_rounding_mode() +{ +#ifdef _MSC_VER + _control87(RC_DOWN, MCW_RC); +#else + std::fesetround(FE_DOWNWARD); +#endif +} + +#define CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1) \ + /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ + if(ALGO == cryptonight_monero_v8) \ + { \ + const uint64_t idx1 = idx0 & MASK; \ + const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \ + const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \ + const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ + } + +#define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \ + if(ALGO == cryptonight_monero_v8) \ + { \ + uint64_t sqrt_result_tmp; \ + assign(sqrt_result_tmp, sqrt_result); \ + /* Use division and square root results from the _previous_ iteration to hide the latency */ \ + const uint64_t cx_64 = _mm_cvtsi128_si64(cx); \ + cl ^= static_cast(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result_tmp << 32); \ + const uint32_t d = (cx_64 + (sqrt_result_tmp << 1)) | 0x80000001UL; \ + /* Most and least significant bits in the divisor are set to 1 \ + * to make sure we don't divide by a small or even number, \ + * so there are no shortcuts for such cases \ + * \ + * Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 \ + * We drop the highest bit to fit both quotient and remainder in 32 bits \ + */ \ + /* Compiler will optimize it to a single div instruction */ \ + const uint64_t cx_s = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \ + const uint64_t division_result = static_cast(cx_s / d) + ((cx_s % d) << 32); \ + division_result_xmm = _mm_cvtsi64_si128(static_cast(division_result)); \ + /* Use division_result as an input for the square root to prevent parallel implementation in hardware */ \ + assign(sqrt_result, int_sqrt33_1_double_precision(cx_64 + division_result)); \ + } + #define CN_INIT_SINGLE \ if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) \ { \ @@ -474,7 +587,7 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) return; \ } -#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0) \ +#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm) \ keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \ uint64_t monero_const; \ if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ @@ -489,16 +602,27 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) uint64_t idx0; \ __m128i bx0; \ uint8_t* l0 = ctx[n]->long_state; \ + /* BEGIN cryptonight_monero_v8 variables */ \ + __m128i bx1; \ + __m128i division_result_xmm; \ + GetOptimalSqrtType_t sqrt_result; \ + /* END cryptonight_monero_v8 variables */ \ { \ uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \ idx0 = h0[0] ^ h0[4]; \ ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \ bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \ + if(ALGO == cryptonight_monero_v8) \ + { \ + bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \ + division_result_xmm = _mm_cvtsi64_si128(h0[12]); \ + assign(sqrt_result, h0[13]); \ + set_float_rounding_mode(); \ + } \ } \ __m128i *ptr0 - -#define CN_STEP1(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \ +#define CN_STEP1(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1) \ __m128i cx; \ ptr0 = (__m128i *)&l0[idx0 & MASK]; \ cx = _mm_load_si128(ptr0); \ @@ -512,7 +636,8 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) cx = soft_aesenc(cx, ax0); \ else \ cx = _mm_aesenc_si128(cx, ax0); \ - } + } \ + CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1) #define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \ if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ @@ -524,15 +649,22 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) ptr0 = (__m128i *)&l0[idx0 & MASK]; \ if(PREFETCH) \ _mm_prefetch((const char*)ptr0, _MM_HINT_T0); \ - bx0 = cx; \ + if(ALGO != cryptonight_monero_v8) \ + bx0 = cx -#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \ +#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm) \ uint64_t lo, cl, ch; \ uint64_t al0 = _mm_cvtsi128_si64(ax0); \ uint64_t ah0 = ((uint64_t*)&ax0)[1]; \ cl = ((uint64_t*)ptr0)[0]; \ ch = ((uint64_t*)ptr0)[1]; \ - \ + CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl); \ + CN_MONERO_V8_SHUFFLE(n, l0, idx0, ax0, bx0, bx1); \ + if(ALGO == cryptonight_monero_v8) \ + { \ + bx1 = bx0; \ + bx0 = cx; \ + } \ { \ uint64_t hi; \ lo = _umul128(idx0, cl, &hi); \ @@ -542,7 +674,6 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) ((uint64_t*)ptr0)[0] = al0; \ if(PREFETCH) \ _mm_prefetch((const char*)ptr0, _MM_HINT_T0) - #define CN_STEP4(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \ if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ @@ -602,7 +733,7 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) /** add append n to all arguments and keeps n as first argument * * @param n number which is appended to the arguments (expect the first argument n) - * + * * @code{.cpp} * CN_ENUM_2(1, foo, bar) * // is transformed to @@ -622,6 +753,9 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) #define CN_ENUM_10(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n #define CN_ENUM_11(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n #define CN_ENUM_12(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n +#define CN_ENUM_13(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n +#define CN_ENUM_14(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n +#define CN_ENUM_15(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n /** repeat a macro call multiple times * @@ -657,15 +791,14 @@ struct Cryptonight_hash<1> constexpr size_t MEM = cn_select_memory(); CN_INIT_SINGLE; - REPEAT_1(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); + REPEAT_1(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - - REPEAT_1(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_1(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_1(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_1(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_1(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); REPEAT_1(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_1(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -687,14 +820,14 @@ struct Cryptonight_hash<2> constexpr size_t MEM = cn_select_memory(); CN_INIT_SINGLE; - REPEAT_2(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); + REPEAT_2(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_2(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_2(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_2(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_2(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_2(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); REPEAT_2(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_2(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -716,14 +849,14 @@ struct Cryptonight_hash<3> constexpr size_t MEM = cn_select_memory(); CN_INIT_SINGLE; - REPEAT_3(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); + REPEAT_3(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_3(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_3(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_3(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_3(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_3(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); REPEAT_3(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_3(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -745,14 +878,14 @@ struct Cryptonight_hash<4> constexpr size_t MEM = cn_select_memory(); CN_INIT_SINGLE; - REPEAT_4(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); + REPEAT_4(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_4(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_4(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_4(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_4(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_4(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); REPEAT_4(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_4(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -774,14 +907,14 @@ struct Cryptonight_hash<5> constexpr size_t MEM = cn_select_memory(); CN_INIT_SINGLE; - REPEAT_5(6, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0); + REPEAT_5(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_5(7, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx); + REPEAT_5(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_5(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_5(11, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); + REPEAT_5(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); REPEAT_5(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_5(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -789,3 +922,24 @@ struct Cryptonight_hash<5> REPEAT_5(0, CN_FINALIZE); } }; + +extern "C" void cryptonight_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0); +extern "C" void cryptonight_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0); + +template +void cryptonight_hash_v2_asm(const void* input, size_t len, void* output, cryptonight_ctx** ctx) +{ + constexpr size_t MEM = cn_select_memory(); + + keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); + cn_explode_scratchpad((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state); + + if (asm_version == 1) + cryptonight_v8_mainloop_ivybridge_asm(ctx[0]); + else + cryptonight_v8_mainloop_ryzen_asm(ctx[0]); + + cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); + keccakf((uint64_t*)ctx[0]->hash_state, 24); + extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output); +} diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp index 49da7ae2d..a14be1732 100644 --- a/xmrstak/backend/cpu/jconf.cpp +++ b/xmrstak/backend/cpu/jconf.cpp @@ -108,12 +108,13 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) if(!oThdConf.IsObject()) return false; - const Value *mode, *no_prefetch, *aff; + const Value *mode, *no_prefetch, *aff, *asm_version; mode = GetObjectMember(oThdConf, "low_power_mode"); no_prefetch = GetObjectMember(oThdConf, "no_prefetch"); aff = GetObjectMember(oThdConf, "affine_to_cpu"); + asm_version = GetObjectMember(oThdConf, "asm"); - if(mode == nullptr || no_prefetch == nullptr || aff == nullptr) + if(mode == nullptr || no_prefetch == nullptr || aff == nullptr || asm_version == nullptr) return false; if(!mode->IsBool() && !mode->IsNumber()) @@ -140,6 +141,10 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) else cfg.iCpuAff = -1; + if(!asm_version->IsString()) + return false; + cfg.asm_version_str = asm_version->GetString(); + return true; } diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp index be855036e..4ec9165d5 100644 --- a/xmrstak/backend/cpu/jconf.hpp +++ b/xmrstak/backend/cpu/jconf.hpp @@ -24,6 +24,7 @@ class jconf struct thd_cfg { int iMultiway; bool bNoPrefetch; + std::string asm_version_str; long long iCpuAff; }; diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 93ce218a3..05743ae92 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -104,7 +104,7 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id #endif } -minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity) +minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version) { this->backendType = iBackend::CPU; oWork = pWork; @@ -113,6 +113,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, iJobNo = 0; bNoPrefetch = no_prefetch; this->affinity = affinity; + asm_version_str = asm_version; std::unique_lock lck(thd_aff_set); std::future order_guard = order_fix.get_future(); @@ -305,6 +306,16 @@ bool minethd::self_test() hashf("This is a test This is a test This is a test", 44, out, ctx); bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; } + else if(algo == cryptonight_monero_v8) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero_v8); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = memcmp(out, "\x4c\xf1\xff\x9c\xa4\x6e\xb4\x33\xb3\x6c\xd9\xf7\x0e\x02\xb1\x4c\xc0\x6b\xfd\x18\xca\x77\xfa\x9c\xca\xaf\xd1\xfd\x96\xc6\x74\xb0", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero_v8); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult &= memcmp(out, "\x4c\xf1\xff\x9c\xa4\x6e\xb4\x33\xb3\x6c\xd9\xf7\x0e\x02\xb1\x4c\xc0\x6b\xfd\x18\xca\x77\xfa\x9c\xca\xaf\xd1\xfd\x96\xc6\x74\xb0", 32) == 0; + } else if(algo == cryptonight_aeon) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_aeon); @@ -431,7 +442,7 @@ std::vector minethd::thread_starter(uint32_t threadOffset, miner_work else printer::inst()->print_msg(L1, "Starting %dx thread, no affinity.", cfg.iMultiway); - minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff); + minethd* thd = new minethd(pWork, i + threadOffset, cfg.iMultiway, cfg.bNoPrefetch, cfg.iCpuAff, cfg.asm_version_str); pvThreads.push_back(thd); } @@ -439,9 +450,31 @@ std::vector minethd::thread_starter(uint32_t threadOffset, miner_work } template -minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo) +minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str) { static_assert(N >= 1, "number of threads must be >= 1" ); + + // check for asm optimized version for cryptonight_v8 + if(N == 1 && algo == cryptonight_monero_v8 && bHaveAes) + { + if(asm_version_str != "off") + { + if(asm_version_str == "intel") + { + // Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx) + return cryptonight_hash_v2_asm; + } + if(asm_version_str == "ryzen") + { + // AMD Ryzen (1xxx and 2xxx series) + return cryptonight_hash_v2_asm; + } + else + { + printer::inst()->print_msg(L1, "Assembler %s unknown, fallback to non asm version of cryptonight_v8", asm_version_str.c_str()); + } + } + } // We have two independent flag bits in the functions // therefore we will build a binary digit and select the // function as a two digit binary @@ -479,6 +512,9 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc case cryptonight_bittube2: algv = 9; break; + case cryptonight_monero_v8: + algv = 10; + break; default: algv = 2; break; @@ -533,7 +569,12 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc Cryptonight_hash::template hash, Cryptonight_hash::template hash, Cryptonight_hash::template hash, - Cryptonight_hash::template hash + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash }; std::bitset<2> digit; @@ -618,7 +659,7 @@ void minethd::multiway_work_main() // start with root algorithm and switch later if fork version is reached auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); - cn_hash_fun hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); + cn_hash_fun hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); uint8_t version = 0; size_t lastPoolId = 0; @@ -653,12 +694,12 @@ void minethd::multiway_work_main() if(new_version >= coinDesc.GetMiningForkVersion()) { miner_algo = coinDesc.GetMiningAlgo(); - hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); + hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); } else { miner_algo = coinDesc.GetMiningAlgoRoot(); - hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo); + hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); } lastPoolId = oWork.iPoolId; version = new_version; diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp index 26478542c..eb77749f6 100644 --- a/xmrstak/backend/cpu/minethd.hpp +++ b/xmrstak/backend/cpu/minethd.hpp @@ -32,9 +32,9 @@ class minethd : public iBackend private: template - static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo); + static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str = "off"); - minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity); + minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version); template void multiway_work_main(); @@ -60,6 +60,7 @@ class minethd : public iBackend bool bQuit; bool bNoPrefetch; + std::string asm_version_str = "off"; }; } // namespace cpu diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp index b6f656138..6b1afa928 100644 --- a/xmrstak/backend/cryptonight.hpp +++ b/xmrstak/backend/cryptonight.hpp @@ -16,6 +16,7 @@ enum xmrstak_algo cryptonight_masari = 8, //equal to cryptonight_monero but with less iterations, used by masari cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks + cryptonight_monero_v8 = 11 }; // define aeon settings @@ -45,6 +46,9 @@ inline constexpr size_t cn_select_memory() { return CRYPTONIGH template<> inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } +template<> +inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } + template<> inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_HEAVY_MEMORY; } @@ -72,6 +76,7 @@ inline size_t cn_select_memory(xmrstak_algo algo) { case cryptonight_stellite: case cryptonight_monero: + case cryptonight_monero_v8: case cryptonight_masari: case cryptonight: return CRYPTONIGHT_MEMORY; @@ -100,6 +105,9 @@ inline constexpr uint32_t cn_select_mask() { return CRYPTONIGH template<> inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } +template<> +inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } + template<> inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_HEAVY_MASK; } @@ -127,6 +135,7 @@ inline size_t cn_select_mask(xmrstak_algo algo) { case cryptonight_stellite: case cryptonight_monero: + case cryptonight_monero_v8: case cryptonight_masari: case cryptonight: return CRYPTONIGHT_MASK; @@ -155,6 +164,9 @@ inline constexpr uint32_t cn_select_iter() { return CRYPTONIGH template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } +template<> +inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } + template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_HEAVY_ITER; } @@ -182,6 +194,7 @@ inline size_t cn_select_iter(xmrstak_algo algo) { case cryptonight_stellite: case cryptonight_monero: + case cryptonight_monero_v8: case cryptonight: return CRYPTONIGHT_ITER; case cryptonight_ipbc: diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 6c6475150..1273f89e9 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -194,6 +194,31 @@ __forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_ #endif } +template +__forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src, const uint32_t src2) +{ + uint64_t tmp; + ((uint32_t*)&tmp)[0] = shuffle(ptr, sub, val, src); + ((uint32_t*)&tmp)[1] = shuffle(ptr, sub, val, src2); + return tmp; +} + +__forceinline__ __device__ uint64_t int_sqrt33_1_double_precision(int i,const uint64_t n0) +{ + uint64_t x = (n0 >> 12) + (1023ULL << 52); + const double xx = sqrt( *reinterpret_cast(&x) ); + uint64_t r = *reinterpret_cast(&xx); + + const uint64_t s = r >> 20; + r >>= 19; + + uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1); + + if (x2 < n0) ++r; + + return r; +} + template #ifdef XMR_STAK_THREADS __launch_bounds__( XMR_STAK_THREADS * 4 ) @@ -250,7 +275,19 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti idx0 = *(d_ctx_b + threads * 4 + thread); } } - d[1] = (d_ctx_b + thread * 4)[sub]; + + uint32_t bx1, division_result, sqrt_result; + if(ALGO == cryptonight_monero_v8) + { + d[1] = (d_ctx_b + thread * 12)[sub]; + bx1 = (d_ctx_b + thread * 12 + 4)[sub]; + + // must be valid only for `sub < 2` + division_result = (d_ctx_b + thread * 12 + 4 * 2)[sub % 2]; + sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[sub % 2]; + } + else + d[1] = (d_ctx_b + thread * 4)[sub]; #pragma unroll 2 for ( i = start; i < end; ++i ) @@ -259,7 +296,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti for ( int x = 0; x < 2; ++x ) { j = ( ( idx0 & MASK ) >> 2 ) + sub; - + if(ALGO == cryptonight_bittube2) { uint32_t k[4]; @@ -290,6 +327,57 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti } } } + else if(ALGO == cryptonight_monero_v8) + { + + const uint4 chunk = *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (sub<<4)) ); + uint4 chunk0{}; + chunk0.x = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[0], 0); + chunk0.y = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[1], 0); + chunk0.z = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[2], 0); + chunk0.w = shuffle<4>(sPtr,sub, ((uint32_t*)&chunk)[3], 0); + + const uint32_t x_0 = ((uint32_t*)&chunk0)[sub]; + const uint32_t x_1 = ((uint32_t*)&chunk0)[(sub + 1) % 4]; + const uint32_t x_2 = ((uint32_t*)&chunk0)[(sub + 2) % 4]; + const uint32_t x_3 = ((uint32_t*)&chunk0)[(sub + 3) % 4]; + d[x] = a ^ + t_fn0( x_0 & 0xff ) ^ + t_fn1( (x_1 >> 8) & 0xff ) ^ + t_fn2( (x_2 >> 16) & 0xff ) ^ + t_fn3( ( x_3 >> 24 ) ); + + uint4 value; + const uint64_t tmp10 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 0 , 1); + if(sub == 1) + ((uint64_t*)&value)[0] = tmp10; + const uint64_t tmp20 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 2 , 3); + if(sub == 1) + ((uint64_t*)&value)[1] = tmp20; + const uint64_t tmp11 = shuffle64<4>(sPtr,sub, a, 0 , 1); + if(sub == 2) + ((uint64_t*)&value)[0] = tmp11; + const uint64_t tmp21 = shuffle64<4>(sPtr,sub, a, 2 , 3); + if(sub == 2) + ((uint64_t*)&value)[1] = tmp21; + const uint64_t tmp12 = shuffle64<4>(sPtr,sub, bx1, 0 , 1); + if(sub == 3) + ((uint64_t*)&value)[0] = tmp12; + const uint64_t tmp22 = shuffle64<4>(sPtr,sub, bx1, 2 , 3); + if(sub == 3) + ((uint64_t*)&value)[1] = tmp22; + + if(sub > 0) + { + uint4 store{}; + ((uint64_t*)&store)[0] = ((uint64_t*)&chunk)[0] + ((uint64_t*)&value)[0]; + ((uint64_t*)&store)[1] = ((uint64_t*)&chunk)[1] + ((uint64_t*)&value)[1]; + + const int dest = sub + 1; + const int dest2 = dest == 4 ? 1 : dest; + *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (dest2<<4)) ) = store; + } + } else { const uint32_t x_0 = loadGlobal32( long_state + j ); @@ -302,7 +390,6 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti t_fn2( (x_2 >> 16) & 0xff ) ^ t_fn3( ( x_3 >> 24 ) ); } - //XOR_BLOCKS_DST(c, b, &long_state[j]); t1[0] = shuffle<4>(sPtr,sub, d[x], 0); @@ -331,10 +418,82 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti uint32_t yy[2]; *( (uint64_t*) yy ) = loadGlobal64( ( (uint64_t *) long_state )+( j >> 1 ) ); + + if(ALGO == cryptonight_monero_v8 ) + { + const uint64_t sqrt_result_64 = shuffle64<4>(sPtr, sub, sqrt_result, 0, 1); + + // Use division and square root results from the _previous_ iteration to hide the latency + const uint64_t cx0 = shuffle64<4>(sPtr, sub, d[x], 0, 1); + + + const uint64_t division_result_64 = shuffle64<4>(sPtr,sub, division_result, 0, 1); + const uint64_t cl_rhs = division_result_64 ^ (sqrt_result_64 << 32); + + if(sub < 2) + *((uint64_t*)yy) ^= cl_rhs; + + + const uint32_t dd = (cx0 + (sqrt_result_64 << 1)) | 0x80000001UL; + + // Most and least significant bits in the divisor are set to 1 + // to make sure we don't divide by a small or even number, + // so there are no shortcuts for such cases + // + // Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 + // We drop the highest bit to fit both quotient and remainder in 32 bits + + // Compiler will optimize it to a single div instruction + const uint64_t cx1 = shuffle64<4>(sPtr, sub, d[x], 2, 3); + + + const uint64_t division_result_tmp = static_cast(cx1 / dd) + ((cx1 % dd) << 32); + + division_result = ((uint32_t*)&division_result_tmp)[sub % 2]; + + // Use division_result as an input for the square root to prevent parallel implementation in hardware + const uint64_t sqrt_result_tmp = int_sqrt33_1_double_precision(i, cx0 + division_result_tmp); + sqrt_result = ((uint32_t*)&sqrt_result_tmp)[sub % 2]; + } + uint32_t zz[2]; zz[0] = shuffle<4>(sPtr,sub, yy[0], 0); zz[1] = shuffle<4>(sPtr,sub, yy[1], 0); - + // Shuffle the other 3x16 byte chunks in the current 64-byte cache line + if(ALGO == cryptonight_monero_v8) + { + uint4 value; + const uint64_t tmp10 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 0 , 1); + if(sub == 1) + ((uint64_t*)&value)[0] = tmp10; + const uint64_t tmp20 = shuffle64<4>(sPtr,sub, d[(x + 1) % 2], 2 , 3); + if(sub == 1) + ((uint64_t*)&value)[1] = tmp20; + const uint64_t tmp11 = shuffle64<4>(sPtr,sub, a, 0 , 1); + if(sub == 2) + ((uint64_t*)&value)[0] = tmp11; + const uint64_t tmp21 = shuffle64<4>(sPtr,sub, a, 2 , 3); + if(sub == 2) + ((uint64_t*)&value)[1] = tmp21; + const uint64_t tmp12 = shuffle64<4>(sPtr,sub, bx1, 0 , 1); + if(sub == 3) + ((uint64_t*)&value)[0] = tmp12; + const uint64_t tmp22 = shuffle64<4>(sPtr,sub, bx1, 2 , 3); + if(sub == 3) + ((uint64_t*)&value)[1] = tmp22; + if(sub > 0) + { + const uint4 chunk = *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (sub<<4)) ); + uint4 store{}; + ((uint64_t*)&store)[0] = ((uint64_t*)&chunk)[0] + ((uint64_t*)&value)[0]; + ((uint64_t*)&store)[1] = ((uint64_t*)&chunk)[1] + ((uint64_t*)&value)[1]; + + const int dest = sub + 1; + const int dest2 = dest == 4 ? 1 : dest; + *( (uint4*)((uint64_t)(long_state + (j & 0xFFFFFFFC)) ^ (dest2<<4)) ) = store; + } + } + t1[1] = shuffle<4>(sPtr,sub, d[x], 1); #pragma unroll for ( k = 0; k < 2; k++ ) @@ -384,13 +543,31 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti idx0 = (~d) ^ q; } + if(ALGO == cryptonight_monero_v8) + { + bx1 = d[(x + 1) % 2]; + } } } if ( bfactor > 0 ) { (d_ctx_a + thread * 4)[sub] = a; - (d_ctx_b + thread * 4)[sub] = d[1]; + if(ALGO == cryptonight_monero_v8) + { + (d_ctx_b + thread * 12)[sub] = d[1]; + (d_ctx_b + thread * 12 + 4)[sub] = bx1; + + if(sub < 2) + { + // must be valid only for `sub < 2` + (d_ctx_b + thread * 12 + 4 * 2)[sub % 2] = division_result; + (d_ctx_b + thread * 12 + 4 * 2 + 2)[sub % 2] = sqrt_result; + } + } + else + (d_ctx_b + thread * 4)[sub] = d[1]; + if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2) if(sub&1) *(d_ctx_b + threads * 4 + thread) = idx0; @@ -534,6 +711,10 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t { cryptonight_core_gpu_hash(ctx, startNonce); } + else if(miner_algo == cryptonight_monero_v8) + { + cryptonight_core_gpu_hash(ctx, startNonce); + } else if(miner_algo == cryptonight_heavy) { cryptonight_core_gpu_hash(ctx, startNonce); diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index b455f55ca..1ea54ddba 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -142,7 +142,19 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a ); XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b ); memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 ); - memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 ); + if(ALGO == cryptonight_monero_v8) + { + memcpy( d_ctx_b + thread * 12, ctx_b, 4 * 4 ); + // bx1 + XOR_BLOCKS_DST( ctx_state + 16, ctx_state + 20, ctx_b ); + memcpy( d_ctx_b + thread * 12 + 4, ctx_b, 4 * 4 ); + // division_result + memcpy( d_ctx_b + thread * 12 + 2 * 4, ctx_state + 24, 4 * 2 ); + // sqrt_result + memcpy( d_ctx_b + thread * 12 + 2 * 4 + 2, ctx_state + 26, 4 * 2 ); + } + else + memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 ); memcpy( d_ctx_key1 + thread * 40, ctx_key1, 40 * 4 ); memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 ); @@ -298,6 +310,12 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) // create a double buffer for the state to exchange the mixed state to phase1 CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state2, 50 * sizeof(uint32_t) * wsize)); } + else if(cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || + cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) + { + // bx1 (16byte), division_result (8byte) and sqrt_result (8byte) + ctx_b_size = 3 * 4 * sizeof(uint32_t) * wsize; + } else ctx->d_ctx_state2 = ctx->d_ctx_state; @@ -340,6 +358,11 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); } + if(miner_algo == cryptonight_monero_v8) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + } else { /* pass two times d_ctx_state because the second state is used later in phase1, diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index b6580ea9a..c69d47ab8 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -99,12 +99,14 @@ xmrstak::coin_selection coins[] = { { "cryptonight_lite_v7", {cryptonight_lite, cryptonight_aeon, 255u}, {cryptonight_aeon, cryptonight_lite, 7u}, nullptr }, { "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u}, {cryptonight_aeon, cryptonight_aeon, 255u}, nullptr }, { "cryptonight_v7", {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, + { "cryptonight_v8", {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, { "cryptonight_v7_stellite", {cryptonight_monero, cryptonight_stellite, 255u}, {cryptonight_monero, cryptonight_monero, 255u}, nullptr }, { "graft", {cryptonight_monero, cryptonight, 8u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, { "haven", {cryptonight_haven, cryptonight_heavy, 3u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, { "intense", {cryptonight_monero, cryptonight, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, { "masari", {cryptonight_masari, cryptonight_monero, 7u}, {cryptonight_monero, cryptonight_monero, 0u},nullptr }, { "monero7", {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, "pool.usxmrpool.com:3333" }, + { "monero8", {cryptonight_monero_v8, cryptonight_monero, 8u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, "pool.usxmrpool.com:3333" }, { "qrl", {cryptonight_monero, cryptonight_monero, 0u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, { "ryo", {cryptonight_heavy, cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, { "stellite", {cryptonight_stellite, cryptonight_monero, 4u}, {cryptonight_monero, cryptonight_monero, 0u}, nullptr }, diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp index 11d0f6df0..02ac8b7f5 100644 --- a/xmrstak/misc/executor.cpp +++ b/xmrstak/misc/executor.cpp @@ -560,7 +560,7 @@ void executor::ex_main() else pools.emplace_front(0, "donate.xmr-stak.net:5555", "", "", "", 0.0, true, false, "", true); break; - + case cryptonight_monero_v8: case cryptonight_monero: if(dev_tls) pools.emplace_front(0, "donate.xmr-stak.net:8800", "", "", "", 0.0, true, true, "", false); diff --git a/xmrstak/net/jpsock.cpp b/xmrstak/net/jpsock.cpp index 9fce9b7e5..d20ba082f 100644 --- a/xmrstak/net/jpsock.cpp +++ b/xmrstak/net/jpsock.cpp @@ -685,6 +685,9 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes case cryptonight_monero: algo_name = "cryptonight_v7"; break; + case cryptonight_monero_v8: + algo_name = "cryptonight_v8"; + break; case cryptonight_aeon: algo_name = "cryptonight_lite_v7"; break; diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl index 78f2315ac..9c3dd5a59 100644 --- a/xmrstak/pools.tpl +++ b/xmrstak/pools.tpl @@ -27,7 +27,8 @@ POOLCONF], * haven (automatic switch with block version 3 to cryptonight_haven) * intense * masari - * monero7 (use this for Monero's new PoW) + * monero7 + * monero8 (use this to support Monero's Oct 2018 fork) * qrl - Quantum Resistant Ledger * ryo * turtlecoin @@ -41,6 +42,7 @@ POOLCONF], * # 2MiB scratchpad memory * cryptonight * cryptonight_v7 + * cryptonight_v8 * # 4MiB scratchpad memory * cyrptonight_bittube2 * cryptonight_haven