From 8b83e5a7f06f51b7211b9dd53e017bd19908ebd0 Mon Sep 17 00:00:00 2001
From: SChernykh <sergey.v.chernykh@gmail.com>
Date: Tue, 16 Jun 2020 19:26:41 +0200
Subject: [PATCH 1/5] Optimized KawPow

- Hashrate improved ~2%
- DAG initialization 10% faster
---
 src/KawPow/raven/CudaKawPow_gen.cpp | 90 +++++++++++++++++++++++------
 src/KawPow/raven/CudaKawPow_gen.h   |  3 +-
 src/KawPow/raven/KawPow.cu          | 42 +-------------
 src/KawPow/raven/KawPow.h           |  4 +-
 src/KawPow/raven/KawPow_dag.h       |  2 +-
 5 files changed, 80 insertions(+), 61 deletions(-)
diff --git a/src/KawPow/raven/CudaKawPow_gen.cpp b/src/KawPow/raven/CudaKawPow_gen.cpp
index f77404a..9dfc312 100644
--- a/src/KawPow/raven/CudaKawPow_gen.cpp
+++ b/src/KawPow/raven/CudaKawPow_gen.cpp
@@ -82,14 +82,51 @@ static void background_exec(T&& func)
 }
 
 
+static inline uint32_t clz(uint32_t a)
+{
+#ifdef _MSC_VER
+    unsigned long index;
+    _BitScanReverse(&index, a);
+    return 31 - index;
+#else
+    return __builtin_clz(a);
+#endif
+}
+
+
+void calculate_fast_mod_data(uint32_t divisor, uint32_t& reciprocal, uint32_t& increment, uint32_t& shift)
+{
+    if ((divisor & (divisor - 1)) == 0) {
+        reciprocal = 1;
+        increment = 0;
+        shift = 31U - clz(divisor);
+    }
+    else {
+        shift = 63U - clz(divisor);
+        const uint64_t N = 1ULL << shift;
+        const uint64_t q = N / divisor;
+        const uint64_t r = N - q * divisor;
+        if (r * 2 < divisor)
+        {
+            reciprocal = static_cast<uint32_t>(q);
+            increment = 1;
+        }
+        else
+        {
+            reciprocal = static_cast<uint32_t>(q + 1);
+            increment = 0;
+        }
+    }
+}
+
+
 static void KawPow_build_program(
     std::vector<char>& ptx,
     std::string& lowered_name,
     uint64_t period,
     int arch_major,
     int arch_minor,
-    std::string source,
-    const uint64_t* dag_sizes)
+    std::string source)
 {
     {
         std::lock_guard<std::mutex> g(KawPow_cache_mutex);
@@ -142,18 +179,8 @@ static void KawPow_build_program(
     char opt0[64];
     sprintf(opt0, "--gpu-architecture=compute_%d%d", arch_major, arch_minor);
 
-    std::string options = " -DPROGPOW_DAG_ELEMENTS=";
-
-    constexpr int PERIOD_LENGTH = 3;
-    constexpr int EPOCH_LENGTH = 7500;
-
-    const uint64_t epoch = (period * PERIOD_LENGTH) / EPOCH_LENGTH;
-    const uint64_t dag_elements = dag_sizes[epoch] / 256;
-
-    options += std::to_string(dag_elements);
-
-    const char* opts[2] = { opt0, options.c_str() };
-    result = nvrtcCompileProgram(prog, 2, opts);
+    const char* opts[1] = { opt0 };
+    result = nvrtcCompileProgram(prog, 1, opts);
     if (result != NVRTC_SUCCESS) {
         size_t logSize;
         if (nvrtcGetProgramLogSize(prog, &logSize) == NVRTC_SUCCESS) {
@@ -375,10 +402,10 @@ static void get_code(uint64_t prog_seed, std::string& random_math, std::string&
     dag_loads = ret.str();
 }
 
-void KawPow_get_program(std::vector<char>& ptx, std::string& lowered_name, uint64_t period, int arch_major, int arch_minor, const uint64_t* dag_sizes, bool background)
+void KawPow_get_program(std::vector<char>& ptx, std::string& lowered_name, uint64_t period, uint32_t threads, int arch_major, int arch_minor, const uint64_t* dag_sizes, bool background)
 {
     if (background) {
-        background_exec([=]() { std::vector<char> tmp; std::string s; KawPow_get_program(tmp, s, period, arch_major, arch_minor, dag_sizes, false); });
+        background_exec([=]() { std::vector<char> tmp; std::string s; KawPow_get_program(tmp, s, period, threads, arch_major, arch_minor, dag_sizes, false); });
         return;
     }
 
@@ -398,6 +425,35 @@ void KawPow_get_program(std::vector<char>& ptx, std::string& lowered_name, uint6
     const char dag_loads_include[] = "XMRIG_INCLUDE_PROGPOW_DATA_LOADS";
     source_code.replace(source_code.find(dag_loads_include), sizeof(dag_loads_include) - 1, dag_loads);
 
+    constexpr int PERIOD_LENGTH = 3;
+    constexpr int EPOCH_LENGTH = 7500;
+
+    const uint64_t epoch = (period * PERIOD_LENGTH) / EPOCH_LENGTH;
+    const uint64_t dag_elements = dag_sizes[epoch] / 256;
+
+    uint32_t r, i, s;
+    calculate_fast_mod_data(dag_elements, r, i, s);
+
+    std::stringstream ss;
+    if (i) {
+        ss << "const uint32_t offset1 = offset + " << i << ";\n";
+        ss << "const uint32_t rcp = " << r << ";\n";
+        ss << "offset -= ((offset1 ? __umulhi(offset1, rcp) : rcp) >> " << (s - 32) << ") * " << dag_elements << ";\n";
+    }
+    else {
+        ss << "offset -= (__umulhi(offset, " << r << ") >> " << (s - 32) << ") * " << dag_elements << ";\n";
+    }
+
+    const char offset_mod_include[] = "XMRIG_INCLUDE_OFFSET_MOD_DAG_ELEMENTS";
+    source_code.replace(source_code.find(offset_mod_include), sizeof(offset_mod_include) - 1, ss.str());
+
+    ss.str(std::string());
+
+    ss << "__launch_bounds__(" << threads << ", 3)";
+
+    const char launch_bounds_include[] = "XMRIG_INCLUDE_LAUNCH_BOUNDS";
+    source_code.replace(source_code.find(launch_bounds_include), sizeof(launch_bounds_include) - 1, ss.str());
+
     {
         std::lock_guard<std::mutex> g(KawPow_cache_mutex);
 
@@ -412,5 +468,5 @@ void KawPow_get_program(std::vector<char>& ptx, std::string& lowered_name, uint6
         }
     }
 
-    KawPow_build_program(ptx, lowered_name, period, arch_major, arch_minor, source_code, dag_sizes);
+    KawPow_build_program(ptx, lowered_name, period, arch_major, arch_minor, source_code);
 }
diff --git a/src/KawPow/raven/CudaKawPow_gen.h b/src/KawPow/raven/CudaKawPow_gen.h
index d7e09b5..49dc4d4 100644
--- a/src/KawPow/raven/CudaKawPow_gen.h
+++ b/src/KawPow/raven/CudaKawPow_gen.h
@@ -5,6 +5,7 @@
 #include <vector>
 #include <string>
 
-void KawPow_get_program(std::vector<char>& ptx, std::string& lowered_name, uint64_t period, int arch_major, int arch_minor, const uint64_t* dag_sizes, bool background = false);
+void KawPow_get_program(std::vector<char>& ptx, std::string& lowered_name, uint64_t period, uint32_t threads, int arch_major, int arch_minor, const uint64_t* dag_sizes, bool background = false);
+void calculate_fast_mod_data(uint32_t divisor, uint32_t& reciprocal, uint32_t& increment, uint32_t& shift);
 
 #endif // XMRIG_CUDAKAWPOW_GEN_H
diff --git a/src/KawPow/raven/KawPow.cu b/src/KawPow/raven/KawPow.cu
index b7d46b8..842ac75 100644
--- a/src/KawPow/raven/KawPow.cu
+++ b/src/KawPow/raven/KawPow.cu
@@ -33,44 +33,6 @@
 #include "CudaKawPow_gen.h"
 
 
-static inline uint32_t clz(uint32_t a)
-{
-#ifdef _MSC_VER
-    unsigned long index;
-    _BitScanReverse(&index, a);
-    return 31 - index;
-#else
-    return __builtin_clz(a);
-#endif
-}
-
-
-static void calculate_fast_mod_data(uint32_t divisor, uint32_t& reciprocal, uint32_t& increment, uint32_t& shift)
-{
-    if ((divisor & (divisor - 1)) == 0) {
-        reciprocal = 1;
-        increment = 0;
-        shift = 31U - clz(divisor);
-    }
-    else {
-        shift = 63U - clz(divisor);
-        const uint64_t N = 1ULL << shift;
-        const uint64_t q = N / divisor;
-        const uint64_t r = N - q * divisor;
-        if (r * 2 < divisor)
-        {
-            reciprocal = static_cast<uint32_t>(q);
-            increment = 1;
-        }
-        else
-        {
-            reciprocal = static_cast<uint32_t>(q + 1);
-            increment = 0;
-        }
-    }
-}
-
-
 void kawpow_prepare(nvid_ctx *ctx, const void* cache, size_t cache_size, const void* dag_precalc, size_t dag_size, uint32_t height, const uint64_t* dag_sizes)
 {
     constexpr size_t MEM_ALIGN = 1024 * 1024;
@@ -140,14 +102,14 @@ void kawpow_prepare(nvid_ctx *ctx, const void* cache, size_t cache_size, const v
 
         std::vector<char> ptx;
         std::string lowered_name;
-        KawPow_get_program(ptx, lowered_name, period, ctx->device_arch[0], ctx->device_arch[1], dag_sizes);
+        KawPow_get_program(ptx, lowered_name, period, ctx->device_threads, ctx->device_arch[0], ctx->device_arch[1], dag_sizes);
 
         CU_CHECK(ctx->device_id, cuModuleLoadDataEx(&ctx->kawpow_module, ptx.data(), 0, 0, 0));
         CU_CHECK(ctx->device_id, cuModuleGetFunction(&ctx->kawpow_kernel, ctx->kawpow_module, lowered_name.c_str()));
 
         ctx->kawpow_period = period;
 
-        KawPow_get_program(ptx, lowered_name, period + 1, ctx->device_arch[0], ctx->device_arch[1], dag_sizes, true);
+        KawPow_get_program(ptx, lowered_name, period + 1, ctx->device_threads, ctx->device_arch[0], ctx->device_arch[1], dag_sizes, true);
     }
 
     if (!ctx->kawpow_stop) {
diff --git a/src/KawPow/raven/KawPow.h b/src/KawPow/raven/KawPow.h
index cc19fff..a3484bd 100644
--- a/src/KawPow/raven/KawPow.h
+++ b/src/KawPow/raven/KawPow.h
@@ -40,7 +40,7 @@ DEV_INLINE void progPowLoop(const uint32_t loop, uint32_t mix[PROGPOW_REGS], con
 
     // global load
     offset = SHFL(mix[0], loop % PROGPOW_LANES, PROGPOW_LANES);
-    offset %= PROGPOW_DAG_ELEMENTS;
+    XMRIG_INCLUDE_OFFSET_MOD_DAG_ELEMENTS
     offset = offset * PROGPOW_LANES + (lane_id ^ loop) % PROGPOW_LANES;
     data_dag = g_dag[offset];
 
@@ -194,7 +194,7 @@ __device__ __forceinline__ void fill_mix(uint32_t* hash_seed, uint32_t lane_id,
         mix[i] = kiss99(st);
 }
 
-__global__ void progpow_search(const dag_t *g_dag, const uint32_t* job_blob, const uint64_t target, bool hack_false, uint32_t* results, uint32_t* stop)
+__global__ void XMRIG_INCLUDE_LAUNCH_BOUNDS progpow_search(const dag_t *g_dag, const uint32_t* job_blob, const uint64_t target, bool hack_false, uint32_t* results, uint32_t* stop)
 {
     if (*stop) {
         if (threadIdx.x == 0) {
diff --git a/src/KawPow/raven/KawPow_dag.h b/src/KawPow/raven/KawPow_dag.h
index 1581b03..a85d6f4 100644
--- a/src/KawPow/raven/KawPow_dag.h
+++ b/src/KawPow/raven/KawPow_dag.h
@@ -146,7 +146,7 @@ __global__ void ethash_calculate_dag_item(uint32_t start, hash64_t *g_dag, uint6
 
 	const int thread_id = threadIdx.x & 3;
 
-	#pragma unroll(8)
+	#pragma unroll(4)
 	for (uint32_t i = 0; i < ETHASH_DATASET_PARENTS; ++i) {
 		uint32_t parent_index = fast_mod(fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]), light_words);
 

From f76570b58320754f428f7c60e91731db3f9d14c6 Mon Sep 17 00:00:00 2001
From: XMRig <support@xmrig.com>
Date: Wed, 17 Jun 2020 07:13:37 +0700
Subject: [PATCH 2/5] v6.2.1

---
 CHANGELOG.md  | 3 +++
 src/version.h | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f7dcf3c..abd0b12 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,6 @@
+# v6.2.1
+- [#54](https://github.com/xmrig/xmrig-cuda/pull/54) Optimized KawPow, about 2% hashrate improvement, 10% faster DAG initialization.
+
 # v6.2.0
 - [#52](https://github.com/xmrig/xmrig-cuda/pull/52) Added new algorithm `cn/ccx` for Conceal.
 - [#53](https://github.com/xmrig/xmrig-cuda/pull/53) Fixed build with CUDA 11.
diff --git a/src/version.h b/src/version.h
index 3e10366..b3bb410 100644
--- a/src/version.h
+++ b/src/version.h
@@ -28,14 +28,14 @@
 #define APP_ID        "xmrig-cuda"
 #define APP_NAME      "XMRig"
 #define APP_DESC      "XMRig CUDA plugin"
-#define APP_VERSION   "6.2.0"
+#define APP_VERSION   "6.2.1"
 #define APP_DOMAIN    "xmrig.com"
 #define APP_SITE      "www.xmrig.com"
 #define APP_COPYRIGHT "Copyright (C) 2016-2020 xmrig.com"
 
 #define APP_VER_MAJOR  6
 #define APP_VER_MINOR  2
-#define APP_VER_PATCH  0
+#define APP_VER_PATCH  1
 
 #define API_VERSION    3
 

From 6529ebb1efee4b3dd57e3bbc71cb905031ad7938 Mon Sep 17 00:00:00 2001
From: SChernykh <sergey.v.chernykh@gmail.com>
Date: Wed, 17 Jun 2020 15:55:49 +0200
Subject: [PATCH 3/5] KawPow: fast job switching

Restarts GPU batch as soon as new job is received. Almost zero stale shares.
---
 src/KawPow/raven/KawPow.cu | 18 +++++++++---------
 src/cryptonight.h          |  3 ++-
 src/cuda_extra.cu          |  2 +-
 src/xmrig-cuda.cpp         |  2 +-
 4 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/KawPow/raven/KawPow.cu b/src/KawPow/raven/KawPow.cu
index 842ac75..7578e2a 100644
--- a/src/KawPow/raven/KawPow.cu
+++ b/src/KawPow/raven/KawPow.cu
@@ -112,15 +112,18 @@ void kawpow_prepare(nvid_ctx *ctx, const void* cache, size_t cache_size, const v
         KawPow_get_program(ptx, lowered_name, period + 1, ctx->device_threads, ctx->device_arch[0], ctx->device_arch[1], dag_sizes, true);
     }
 
-    if (!ctx->kawpow_stop) {
-        CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->kawpow_stop, sizeof(uint32_t) * 2));
+    if (!ctx->kawpow_stop_host) {
+        CUDA_CHECK(ctx->device_id, cudaMallocHost(&ctx->kawpow_stop_host, sizeof(uint32_t) * 2));
+        CUDA_CHECK(ctx->device_id, cudaHostGetDevicePointer(&ctx->kawpow_stop_device, ctx->kawpow_stop_host, 0));
     }
 }
 
 
 void kawpow_stop_hash(nvid_ctx *ctx)
 {
-    // TODO: this is called from the main thread which doesn't have a valid CUDA context, so nothing works here
+    if (ctx->kawpow_stop_host) {
+        *ctx->kawpow_stop_host = 1;
+    }
 }
 
 
@@ -132,11 +135,11 @@ void hash(nvid_ctx *ctx, uint8_t* job_blob, uint64_t target, uint32_t *rescount,
     dim3 block(ctx->device_threads);
 
     uint32_t hack_false = 0;
-    void* args[] = { &ctx->kawpow_dag, &ctx->d_input, &target, &hack_false, &ctx->d_result_nonce, &ctx->kawpow_stop };
+    void* args[] = { &ctx->kawpow_dag, &ctx->d_input, &target, &hack_false, &ctx->d_result_nonce, &ctx->kawpow_stop_device };
 
     CUDA_CHECK(ctx->device_id, cudaMemcpy(ctx->d_input, job_blob, 40, cudaMemcpyHostToDevice));
     CUDA_CHECK(ctx->device_id, cudaMemset(ctx->d_result_nonce, 0, sizeof(uint32_t)));
-    CUDA_CHECK(ctx->device_id, cudaMemset(ctx->kawpow_stop, 0, sizeof(uint32_t) * 2));
+    memset(ctx->kawpow_stop_host, 0, sizeof(uint32_t) * 2);
 
     CU_CHECK(ctx->device_id, cuLaunchKernel(
         ctx->kawpow_kernel,
@@ -146,10 +149,7 @@ void hash(nvid_ctx *ctx, uint8_t* job_blob, uint64_t target, uint32_t *rescount,
     ));
     CU_CHECK(ctx->device_id, cuCtxSynchronize());
 
-    uint32_t stop[2];
-    CUDA_CHECK(ctx->device_id, cudaMemcpy(stop, ctx->kawpow_stop, sizeof(stop), cudaMemcpyDeviceToHost));
-
-    *skipped_hashes = stop[1];
+    *skipped_hashes = ctx->kawpow_stop_host[1];
 
     uint32_t results[16];
     CUDA_CHECK(ctx->device_id, cudaMemcpy(results, ctx->d_result_nonce, sizeof(results), cudaMemcpyDeviceToHost));
diff --git a/src/cryptonight.h b/src/cryptonight.h
index d799067..2f0523b 100644
--- a/src/cryptonight.h
+++ b/src/cryptonight.h
@@ -102,7 +102,8 @@ struct nvid_ctx {
     size_t kawpow_dag_size              = 0;
     size_t kawpow_dag_capacity          = 0;
 
-    uint32_t* kawpow_stop               = nullptr;
+    uint32_t* kawpow_stop_host          = nullptr;
+    uint32_t* kawpow_stop_device        = nullptr;
 
     uint32_t kawpow_period              = 0;
     CUmodule kawpow_module              = nullptr;
diff --git a/src/cuda_extra.cu b/src/cuda_extra.cu
index 574e3e7..4591920 100644
--- a/src/cuda_extra.cu
+++ b/src/cuda_extra.cu
@@ -553,7 +553,7 @@ int cuda_get_deviceinfo(nvid_ctx *ctx)
     }
 
     if ((ctx->algorithm.family() == Algorithm::KAWPOW) && ((ctx->device_blocks < 0) || (ctx->device_threads < 0))) {
-        ctx->device_threads = 128;
+        ctx->device_threads = 256;
         ctx->device_blocks = props.multiProcessorCount * 2048;
     }
 
diff --git a/src/xmrig-cuda.cpp b/src/xmrig-cuda.cpp
index f91bb46..bd6720a 100644
--- a/src/xmrig-cuda.cpp
+++ b/src/xmrig-cuda.cpp
@@ -596,7 +596,7 @@ void release(nvid_ctx *ctx)
 
     cudaFree(ctx->kawpow_cache);
     cudaFree(ctx->kawpow_dag);
-    cudaFree(ctx->kawpow_stop);
+    cudaFreeHost(ctx->kawpow_stop_host);
 
     cuModuleUnload(ctx->module);
     cuModuleUnload(ctx->kawpow_module);

From ab530445bf03446a6d877c0edc2e666489046d17 Mon Sep 17 00:00:00 2001
From: SChernykh <sergey.v.chernykh@gmail.com>
Date: Fri, 19 Jun 2020 10:26:07 +0200
Subject: [PATCH 4/5] KawPow: reduced overhead when fast switching jobs

16 times fewer atomicAdd()
---
 src/KawPow/raven/KawPow.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/KawPow/raven/KawPow.h b/src/KawPow/raven/KawPow.h
index a3484bd..362f695 100644
--- a/src/KawPow/raven/KawPow.h
+++ b/src/KawPow/raven/KawPow.h
@@ -197,9 +197,9 @@ __device__ __forceinline__ void fill_mix(uint32_t* hash_seed, uint32_t lane_id,
 __global__ void XMRIG_INCLUDE_LAUNCH_BOUNDS progpow_search(const dag_t *g_dag, const uint32_t* job_blob, const uint64_t target, bool hack_false, uint32_t* results, uint32_t* stop)
 {
     if (*stop) {
-        if (threadIdx.x == 0) {
+        if ((threadIdx.x == 0) && ((blockIdx.x & 15) == 0)) {
             // Count groups of skipped hashes (if we don't count them we'll break hashrate display)
-            atomicAdd(stop + 1, blockDim.x);
+            atomicAdd(stop + 1, blockDim.x * 16);
         }
         return;
     }

From a143e7d7be1de4a6625e39123b582c2211b3f36e Mon Sep 17 00:00:00 2001
From: xmrig <support@xmrig.com>
Date: Tue, 23 Jun 2020 09:18:49 +0700
Subject: [PATCH 5/5] Update CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index abd0b12..5e46e3c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,6 @@
 # v6.2.1
 - [#54](https://github.com/xmrig/xmrig-cuda/pull/54) Optimized KawPow, about 2% hashrate improvement, 10% faster DAG initialization.
+- [#55](https://github.com/xmrig/xmrig-cuda/pull/55) Added fast job switching for KawPow, almost zero stale shares.
 
 # v6.2.0
 - [#52](https://github.com/xmrig/xmrig-cuda/pull/52) Added new algorithm `cn/ccx` for Conceal.