From 8b83e5a7f06f51b7211b9dd53e017bd19908ebd0 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Tue, 16 Jun 2020 19:26:41 +0200 Subject: [PATCH 1/5] Optimized KawPow - Hashrate improved ~2% - DAG initialization 10% faster --- src/KawPow/raven/CudaKawPow_gen.cpp | 90 +++++++++++++++++++++++------ src/KawPow/raven/CudaKawPow_gen.h | 3 +- src/KawPow/raven/KawPow.cu | 42 +------------- src/KawPow/raven/KawPow.h | 4 +- src/KawPow/raven/KawPow_dag.h | 2 +- 5 files changed, 80 insertions(+), 61 deletions(-) diff --git a/src/KawPow/raven/CudaKawPow_gen.cpp b/src/KawPow/raven/CudaKawPow_gen.cpp index f77404a..9dfc312 100644 --- a/src/KawPow/raven/CudaKawPow_gen.cpp +++ b/src/KawPow/raven/CudaKawPow_gen.cpp @@ -82,14 +82,51 @@ static void background_exec(T&& func) } +static inline uint32_t clz(uint32_t a) +{ +#ifdef _MSC_VER + unsigned long index; + _BitScanReverse(&index, a); + return 31 - index; +#else + return __builtin_clz(a); +#endif +} + + +void calculate_fast_mod_data(uint32_t divisor, uint32_t& reciprocal, uint32_t& increment, uint32_t& shift) +{ + if ((divisor & (divisor - 1)) == 0) { + reciprocal = 1; + increment = 0; + shift = 31U - clz(divisor); + } + else { + shift = 63U - clz(divisor); + const uint64_t N = 1ULL << shift; + const uint64_t q = N / divisor; + const uint64_t r = N - q * divisor; + if (r * 2 < divisor) + { + reciprocal = static_cast(q); + increment = 1; + } + else + { + reciprocal = static_cast(q + 1); + increment = 0; + } + } +} + + static void KawPow_build_program( std::vector& ptx, std::string& lowered_name, uint64_t period, int arch_major, int arch_minor, - std::string source, - const uint64_t* dag_sizes) + std::string source) { { std::lock_guard g(KawPow_cache_mutex); @@ -142,18 +179,8 @@ static void KawPow_build_program( char opt0[64]; sprintf(opt0, "--gpu-architecture=compute_%d%d", arch_major, arch_minor); - std::string options = " -DPROGPOW_DAG_ELEMENTS="; - - constexpr int PERIOD_LENGTH = 3; - constexpr int EPOCH_LENGTH = 7500; - - const uint64_t epoch = (period * PERIOD_LENGTH) / EPOCH_LENGTH; - const uint64_t dag_elements = dag_sizes[epoch] / 256; - - options += std::to_string(dag_elements); - - const char* opts[2] = { opt0, options.c_str() }; - result = nvrtcCompileProgram(prog, 2, opts); + const char* opts[1] = { opt0 }; + result = nvrtcCompileProgram(prog, 1, opts); if (result != NVRTC_SUCCESS) { size_t logSize; if (nvrtcGetProgramLogSize(prog, &logSize) == NVRTC_SUCCESS) { @@ -375,10 +402,10 @@ static void get_code(uint64_t prog_seed, std::string& random_math, std::string& dag_loads = ret.str(); } -void KawPow_get_program(std::vector& ptx, std::string& lowered_name, uint64_t period, int arch_major, int arch_minor, const uint64_t* dag_sizes, bool background) +void KawPow_get_program(std::vector& ptx, std::string& lowered_name, uint64_t period, uint32_t threads, int arch_major, int arch_minor, const uint64_t* dag_sizes, bool background) { if (background) { - background_exec([=]() { std::vector tmp; std::string s; KawPow_get_program(tmp, s, period, arch_major, arch_minor, dag_sizes, false); }); + background_exec([=]() { std::vector tmp; std::string s; KawPow_get_program(tmp, s, period, threads, arch_major, arch_minor, dag_sizes, false); }); return; } @@ -398,6 +425,35 @@ void KawPow_get_program(std::vector& ptx, std::string& lowered_name, uint6 const char dag_loads_include[] = "XMRIG_INCLUDE_PROGPOW_DATA_LOADS"; source_code.replace(source_code.find(dag_loads_include), sizeof(dag_loads_include) - 1, dag_loads); + constexpr int PERIOD_LENGTH = 3; + constexpr int EPOCH_LENGTH = 7500; + + const uint64_t epoch = (period * PERIOD_LENGTH) / EPOCH_LENGTH; + const uint64_t dag_elements = dag_sizes[epoch] / 256; + + uint32_t r, i, s; + calculate_fast_mod_data(dag_elements, r, i, s); + + std::stringstream ss; + if (i) { + ss << "const uint32_t offset1 = offset + " << i << ";\n"; + ss << "const uint32_t rcp = " << r << ";\n"; + ss << "offset -= ((offset1 ? __umulhi(offset1, rcp) : rcp) >> " << (s - 32) << ") * " << dag_elements << ";\n"; + } + else { + ss << "offset -= (__umulhi(offset, " << r << ") >> " << (s - 32) << ") * " << dag_elements << ";\n"; + } + + const char offset_mod_include[] = "XMRIG_INCLUDE_OFFSET_MOD_DAG_ELEMENTS"; + source_code.replace(source_code.find(offset_mod_include), sizeof(offset_mod_include) - 1, ss.str()); + + ss.str(std::string()); + + ss << "__launch_bounds__(" << threads << ", 3)"; + + const char launch_bounds_include[] = "XMRIG_INCLUDE_LAUNCH_BOUNDS"; + source_code.replace(source_code.find(launch_bounds_include), sizeof(launch_bounds_include) - 1, ss.str()); + { std::lock_guard g(KawPow_cache_mutex); @@ -412,5 +468,5 @@ void KawPow_get_program(std::vector& ptx, std::string& lowered_name, uint6 } } - KawPow_build_program(ptx, lowered_name, period, arch_major, arch_minor, source_code, dag_sizes); + KawPow_build_program(ptx, lowered_name, period, arch_major, arch_minor, source_code); } diff --git a/src/KawPow/raven/CudaKawPow_gen.h b/src/KawPow/raven/CudaKawPow_gen.h index d7e09b5..49dc4d4 100644 --- a/src/KawPow/raven/CudaKawPow_gen.h +++ b/src/KawPow/raven/CudaKawPow_gen.h @@ -5,6 +5,7 @@ #include #include -void KawPow_get_program(std::vector& ptx, std::string& lowered_name, uint64_t period, int arch_major, int arch_minor, const uint64_t* dag_sizes, bool background = false); +void KawPow_get_program(std::vector& ptx, std::string& lowered_name, uint64_t period, uint32_t threads, int arch_major, int arch_minor, const uint64_t* dag_sizes, bool background = false); +void calculate_fast_mod_data(uint32_t divisor, uint32_t& reciprocal, uint32_t& increment, uint32_t& shift); #endif // XMRIG_CUDAKAWPOW_GEN_H diff --git a/src/KawPow/raven/KawPow.cu b/src/KawPow/raven/KawPow.cu index b7d46b8..842ac75 100644 --- a/src/KawPow/raven/KawPow.cu +++ b/src/KawPow/raven/KawPow.cu @@ -33,44 +33,6 @@ #include "CudaKawPow_gen.h" -static inline uint32_t clz(uint32_t a) -{ -#ifdef _MSC_VER - unsigned long index; - _BitScanReverse(&index, a); - return 31 - index; -#else - return __builtin_clz(a); -#endif -} - - -static void calculate_fast_mod_data(uint32_t divisor, uint32_t& reciprocal, uint32_t& increment, uint32_t& shift) -{ - if ((divisor & (divisor - 1)) == 0) { - reciprocal = 1; - increment = 0; - shift = 31U - clz(divisor); - } - else { - shift = 63U - clz(divisor); - const uint64_t N = 1ULL << shift; - const uint64_t q = N / divisor; - const uint64_t r = N - q * divisor; - if (r * 2 < divisor) - { - reciprocal = static_cast(q); - increment = 1; - } - else - { - reciprocal = static_cast(q + 1); - increment = 0; - } - } -} - - void kawpow_prepare(nvid_ctx *ctx, const void* cache, size_t cache_size, const void* dag_precalc, size_t dag_size, uint32_t height, const uint64_t* dag_sizes) { constexpr size_t MEM_ALIGN = 1024 * 1024; @@ -140,14 +102,14 @@ void kawpow_prepare(nvid_ctx *ctx, const void* cache, size_t cache_size, const v std::vector ptx; std::string lowered_name; - KawPow_get_program(ptx, lowered_name, period, ctx->device_arch[0], ctx->device_arch[1], dag_sizes); + KawPow_get_program(ptx, lowered_name, period, ctx->device_threads, ctx->device_arch[0], ctx->device_arch[1], dag_sizes); CU_CHECK(ctx->device_id, cuModuleLoadDataEx(&ctx->kawpow_module, ptx.data(), 0, 0, 0)); CU_CHECK(ctx->device_id, cuModuleGetFunction(&ctx->kawpow_kernel, ctx->kawpow_module, lowered_name.c_str())); ctx->kawpow_period = period; - KawPow_get_program(ptx, lowered_name, period + 1, ctx->device_arch[0], ctx->device_arch[1], dag_sizes, true); + KawPow_get_program(ptx, lowered_name, period + 1, ctx->device_threads, ctx->device_arch[0], ctx->device_arch[1], dag_sizes, true); } if (!ctx->kawpow_stop) { diff --git a/src/KawPow/raven/KawPow.h b/src/KawPow/raven/KawPow.h index cc19fff..a3484bd 100644 --- a/src/KawPow/raven/KawPow.h +++ b/src/KawPow/raven/KawPow.h @@ -40,7 +40,7 @@ DEV_INLINE void progPowLoop(const uint32_t loop, uint32_t mix[PROGPOW_REGS], con // global load offset = SHFL(mix[0], loop % PROGPOW_LANES, PROGPOW_LANES); - offset %= PROGPOW_DAG_ELEMENTS; + XMRIG_INCLUDE_OFFSET_MOD_DAG_ELEMENTS offset = offset * PROGPOW_LANES + (lane_id ^ loop) % PROGPOW_LANES; data_dag = g_dag[offset]; @@ -194,7 +194,7 @@ __device__ __forceinline__ void fill_mix(uint32_t* hash_seed, uint32_t lane_id, mix[i] = kiss99(st); } -__global__ void progpow_search(const dag_t *g_dag, const uint32_t* job_blob, const uint64_t target, bool hack_false, uint32_t* results, uint32_t* stop) +__global__ void XMRIG_INCLUDE_LAUNCH_BOUNDS progpow_search(const dag_t *g_dag, const uint32_t* job_blob, const uint64_t target, bool hack_false, uint32_t* results, uint32_t* stop) { if (*stop) { if (threadIdx.x == 0) { diff --git a/src/KawPow/raven/KawPow_dag.h b/src/KawPow/raven/KawPow_dag.h index 1581b03..a85d6f4 100644 --- a/src/KawPow/raven/KawPow_dag.h +++ b/src/KawPow/raven/KawPow_dag.h @@ -146,7 +146,7 @@ __global__ void ethash_calculate_dag_item(uint32_t start, hash64_t *g_dag, uint6 const int thread_id = threadIdx.x & 3; - #pragma unroll(8) + #pragma unroll(4) for (uint32_t i = 0; i < ETHASH_DATASET_PARENTS; ++i) { uint32_t parent_index = fast_mod(fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]), light_words); From f76570b58320754f428f7c60e91731db3f9d14c6 Mon Sep 17 00:00:00 2001 From: XMRig Date: Wed, 17 Jun 2020 07:13:37 +0700 Subject: [PATCH 2/5] v6.2.1 --- CHANGELOG.md | 3 +++ src/version.h | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f7dcf3c..abd0b12 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +# v6.2.1 +- [#54](https://github.com/xmrig/xmrig-cuda/pull/54) Optimized KawPow, about 2% hashrate improvement, 10% faster DAG initialization. + # v6.2.0 - [#52](https://github.com/xmrig/xmrig-cuda/pull/52) Added new algorithm `cn/ccx` for Conceal. - [#53](https://github.com/xmrig/xmrig-cuda/pull/53) Fixed build with CUDA 11. diff --git a/src/version.h b/src/version.h index 3e10366..b3bb410 100644 --- a/src/version.h +++ b/src/version.h @@ -28,14 +28,14 @@ #define APP_ID "xmrig-cuda" #define APP_NAME "XMRig" #define APP_DESC "XMRig CUDA plugin" -#define APP_VERSION "6.2.0" +#define APP_VERSION "6.2.1" #define APP_DOMAIN "xmrig.com" #define APP_SITE "www.xmrig.com" #define APP_COPYRIGHT "Copyright (C) 2016-2020 xmrig.com" #define APP_VER_MAJOR 6 #define APP_VER_MINOR 2 -#define APP_VER_PATCH 0 +#define APP_VER_PATCH 1 #define API_VERSION 3 From 6529ebb1efee4b3dd57e3bbc71cb905031ad7938 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Wed, 17 Jun 2020 15:55:49 +0200 Subject: [PATCH 3/5] KawPow: fast job switching Restarts GPU batch as soon as new job is received. Almost zero stale shares. --- src/KawPow/raven/KawPow.cu | 18 +++++++++--------- src/cryptonight.h | 3 ++- src/cuda_extra.cu | 2 +- src/xmrig-cuda.cpp | 2 +- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/KawPow/raven/KawPow.cu b/src/KawPow/raven/KawPow.cu index 842ac75..7578e2a 100644 --- a/src/KawPow/raven/KawPow.cu +++ b/src/KawPow/raven/KawPow.cu @@ -112,15 +112,18 @@ void kawpow_prepare(nvid_ctx *ctx, const void* cache, size_t cache_size, const v KawPow_get_program(ptx, lowered_name, period + 1, ctx->device_threads, ctx->device_arch[0], ctx->device_arch[1], dag_sizes, true); } - if (!ctx->kawpow_stop) { - CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->kawpow_stop, sizeof(uint32_t) * 2)); + if (!ctx->kawpow_stop_host) { + CUDA_CHECK(ctx->device_id, cudaMallocHost(&ctx->kawpow_stop_host, sizeof(uint32_t) * 2)); + CUDA_CHECK(ctx->device_id, cudaHostGetDevicePointer(&ctx->kawpow_stop_device, ctx->kawpow_stop_host, 0)); } } void kawpow_stop_hash(nvid_ctx *ctx) { - // TODO: this is called from the main thread which doesn't have a valid CUDA context, so nothing works here + if (ctx->kawpow_stop_host) { + *ctx->kawpow_stop_host = 1; + } } @@ -132,11 +135,11 @@ void hash(nvid_ctx *ctx, uint8_t* job_blob, uint64_t target, uint32_t *rescount, dim3 block(ctx->device_threads); uint32_t hack_false = 0; - void* args[] = { &ctx->kawpow_dag, &ctx->d_input, &target, &hack_false, &ctx->d_result_nonce, &ctx->kawpow_stop }; + void* args[] = { &ctx->kawpow_dag, &ctx->d_input, &target, &hack_false, &ctx->d_result_nonce, &ctx->kawpow_stop_device }; CUDA_CHECK(ctx->device_id, cudaMemcpy(ctx->d_input, job_blob, 40, cudaMemcpyHostToDevice)); CUDA_CHECK(ctx->device_id, cudaMemset(ctx->d_result_nonce, 0, sizeof(uint32_t))); - CUDA_CHECK(ctx->device_id, cudaMemset(ctx->kawpow_stop, 0, sizeof(uint32_t) * 2)); + memset(ctx->kawpow_stop_host, 0, sizeof(uint32_t) * 2); CU_CHECK(ctx->device_id, cuLaunchKernel( ctx->kawpow_kernel, @@ -146,10 +149,7 @@ void hash(nvid_ctx *ctx, uint8_t* job_blob, uint64_t target, uint32_t *rescount, )); CU_CHECK(ctx->device_id, cuCtxSynchronize()); - uint32_t stop[2]; - CUDA_CHECK(ctx->device_id, cudaMemcpy(stop, ctx->kawpow_stop, sizeof(stop), cudaMemcpyDeviceToHost)); - - *skipped_hashes = stop[1]; + *skipped_hashes = ctx->kawpow_stop_host[1]; uint32_t results[16]; CUDA_CHECK(ctx->device_id, cudaMemcpy(results, ctx->d_result_nonce, sizeof(results), cudaMemcpyDeviceToHost)); diff --git a/src/cryptonight.h b/src/cryptonight.h index d799067..2f0523b 100644 --- a/src/cryptonight.h +++ b/src/cryptonight.h @@ -102,7 +102,8 @@ struct nvid_ctx { size_t kawpow_dag_size = 0; size_t kawpow_dag_capacity = 0; - uint32_t* kawpow_stop = nullptr; + uint32_t* kawpow_stop_host = nullptr; + uint32_t* kawpow_stop_device = nullptr; uint32_t kawpow_period = 0; CUmodule kawpow_module = nullptr; diff --git a/src/cuda_extra.cu b/src/cuda_extra.cu index 574e3e7..4591920 100644 --- a/src/cuda_extra.cu +++ b/src/cuda_extra.cu @@ -553,7 +553,7 @@ int cuda_get_deviceinfo(nvid_ctx *ctx) } if ((ctx->algorithm.family() == Algorithm::KAWPOW) && ((ctx->device_blocks < 0) || (ctx->device_threads < 0))) { - ctx->device_threads = 128; + ctx->device_threads = 256; ctx->device_blocks = props.multiProcessorCount * 2048; } diff --git a/src/xmrig-cuda.cpp b/src/xmrig-cuda.cpp index f91bb46..bd6720a 100644 --- a/src/xmrig-cuda.cpp +++ b/src/xmrig-cuda.cpp @@ -596,7 +596,7 @@ void release(nvid_ctx *ctx) cudaFree(ctx->kawpow_cache); cudaFree(ctx->kawpow_dag); - cudaFree(ctx->kawpow_stop); + cudaFreeHost(ctx->kawpow_stop_host); cuModuleUnload(ctx->module); cuModuleUnload(ctx->kawpow_module); From ab530445bf03446a6d877c0edc2e666489046d17 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Fri, 19 Jun 2020 10:26:07 +0200 Subject: [PATCH 4/5] KawPow: reduced overhead when fast switching jobs 16 times fewer atomicAdd() --- src/KawPow/raven/KawPow.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/KawPow/raven/KawPow.h b/src/KawPow/raven/KawPow.h index a3484bd..362f695 100644 --- a/src/KawPow/raven/KawPow.h +++ b/src/KawPow/raven/KawPow.h @@ -197,9 +197,9 @@ __device__ __forceinline__ void fill_mix(uint32_t* hash_seed, uint32_t lane_id, __global__ void XMRIG_INCLUDE_LAUNCH_BOUNDS progpow_search(const dag_t *g_dag, const uint32_t* job_blob, const uint64_t target, bool hack_false, uint32_t* results, uint32_t* stop) { if (*stop) { - if (threadIdx.x == 0) { + if ((threadIdx.x == 0) && ((blockIdx.x & 15) == 0)) { // Count groups of skipped hashes (if we don't count them we'll break hashrate display) - atomicAdd(stop + 1, blockDim.x); + atomicAdd(stop + 1, blockDim.x * 16); } return; } From a143e7d7be1de4a6625e39123b582c2211b3f36e Mon Sep 17 00:00:00 2001 From: xmrig Date: Tue, 23 Jun 2020 09:18:49 +0700 Subject: [PATCH 5/5] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index abd0b12..5e46e3c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ # v6.2.1 - [#54](https://github.com/xmrig/xmrig-cuda/pull/54) Optimized KawPow, about 2% hashrate improvement, 10% faster DAG initialization. +- [#55](https://github.com/xmrig/xmrig-cuda/pull/55) Added fast job switching for KawPow, almost zero stale shares. # v6.2.0 - [#52](https://github.com/xmrig/xmrig-cuda/pull/52) Added new algorithm `cn/ccx` for Conceal.