Skip to content

Commit

Permalink
Merge branch 'dev'
Browse files Browse the repository at this point in the history
  • Loading branch information
xmrig committed Jun 23, 2020
2 parents ec0c066 + a143e7d commit ebe4b1d
Show file tree
Hide file tree
Showing 10 changed files with 101 additions and 77 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# v6.2.1
- [#54](https://github.com/xmrig/xmrig-cuda/pull/54) Optimized KawPow, about 2% hashrate improvement, 10% faster DAG initialization.
- [#55](https://github.com/xmrig/xmrig-cuda/pull/55) Added fast job switching for KawPow, almost zero stale shares.

# v6.2.0
- [#52](https://github.com/xmrig/xmrig-cuda/pull/52) Added new algorithm `cn/ccx` for Conceal.
- [#53](https://github.com/xmrig/xmrig-cuda/pull/53) Fixed build with CUDA 11.
Expand Down
90 changes: 73 additions & 17 deletions src/KawPow/raven/CudaKawPow_gen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,14 +82,51 @@ static void background_exec(T&& func)
}


static inline uint32_t clz(uint32_t a)
{
#ifdef _MSC_VER
unsigned long index;
_BitScanReverse(&index, a);
return 31 - index;
#else
return __builtin_clz(a);
#endif
}


void calculate_fast_mod_data(uint32_t divisor, uint32_t& reciprocal, uint32_t& increment, uint32_t& shift)
{
if ((divisor & (divisor - 1)) == 0) {
reciprocal = 1;
increment = 0;
shift = 31U - clz(divisor);
}
else {
shift = 63U - clz(divisor);
const uint64_t N = 1ULL << shift;
const uint64_t q = N / divisor;
const uint64_t r = N - q * divisor;
if (r * 2 < divisor)
{
reciprocal = static_cast<uint32_t>(q);
increment = 1;
}
else
{
reciprocal = static_cast<uint32_t>(q + 1);
increment = 0;
}
}
}


static void KawPow_build_program(
std::vector<char>& ptx,
std::string& lowered_name,
uint64_t period,
int arch_major,
int arch_minor,
std::string source,
const uint64_t* dag_sizes)
std::string source)
{
{
std::lock_guard<std::mutex> g(KawPow_cache_mutex);
Expand Down Expand Up @@ -142,18 +179,8 @@ static void KawPow_build_program(
char opt0[64];
sprintf(opt0, "--gpu-architecture=compute_%d%d", arch_major, arch_minor);

std::string options = " -DPROGPOW_DAG_ELEMENTS=";

constexpr int PERIOD_LENGTH = 3;
constexpr int EPOCH_LENGTH = 7500;

const uint64_t epoch = (period * PERIOD_LENGTH) / EPOCH_LENGTH;
const uint64_t dag_elements = dag_sizes[epoch] / 256;

options += std::to_string(dag_elements);

const char* opts[2] = { opt0, options.c_str() };
result = nvrtcCompileProgram(prog, 2, opts);
const char* opts[1] = { opt0 };
result = nvrtcCompileProgram(prog, 1, opts);
if (result != NVRTC_SUCCESS) {
size_t logSize;
if (nvrtcGetProgramLogSize(prog, &logSize) == NVRTC_SUCCESS) {
Expand Down Expand Up @@ -375,10 +402,10 @@ static void get_code(uint64_t prog_seed, std::string& random_math, std::string&
dag_loads = ret.str();
}

void KawPow_get_program(std::vector<char>& ptx, std::string& lowered_name, uint64_t period, int arch_major, int arch_minor, const uint64_t* dag_sizes, bool background)
void KawPow_get_program(std::vector<char>& ptx, std::string& lowered_name, uint64_t period, uint32_t threads, int arch_major, int arch_minor, const uint64_t* dag_sizes, bool background)
{
if (background) {
background_exec([=]() { std::vector<char> tmp; std::string s; KawPow_get_program(tmp, s, period, arch_major, arch_minor, dag_sizes, false); });
background_exec([=]() { std::vector<char> tmp; std::string s; KawPow_get_program(tmp, s, period, threads, arch_major, arch_minor, dag_sizes, false); });
return;
}

Expand All @@ -398,6 +425,35 @@ void KawPow_get_program(std::vector<char>& ptx, std::string& lowered_name, uint6
const char dag_loads_include[] = "XMRIG_INCLUDE_PROGPOW_DATA_LOADS";
source_code.replace(source_code.find(dag_loads_include), sizeof(dag_loads_include) - 1, dag_loads);

constexpr int PERIOD_LENGTH = 3;
constexpr int EPOCH_LENGTH = 7500;

const uint64_t epoch = (period * PERIOD_LENGTH) / EPOCH_LENGTH;
const uint64_t dag_elements = dag_sizes[epoch] / 256;

uint32_t r, i, s;
calculate_fast_mod_data(dag_elements, r, i, s);

std::stringstream ss;
if (i) {
ss << "const uint32_t offset1 = offset + " << i << ";\n";
ss << "const uint32_t rcp = " << r << ";\n";
ss << "offset -= ((offset1 ? __umulhi(offset1, rcp) : rcp) >> " << (s - 32) << ") * " << dag_elements << ";\n";
}
else {
ss << "offset -= (__umulhi(offset, " << r << ") >> " << (s - 32) << ") * " << dag_elements << ";\n";
}

const char offset_mod_include[] = "XMRIG_INCLUDE_OFFSET_MOD_DAG_ELEMENTS";
source_code.replace(source_code.find(offset_mod_include), sizeof(offset_mod_include) - 1, ss.str());

ss.str(std::string());

ss << "__launch_bounds__(" << threads << ", 3)";

const char launch_bounds_include[] = "XMRIG_INCLUDE_LAUNCH_BOUNDS";
source_code.replace(source_code.find(launch_bounds_include), sizeof(launch_bounds_include) - 1, ss.str());

{
std::lock_guard<std::mutex> g(KawPow_cache_mutex);

Expand All @@ -412,5 +468,5 @@ void KawPow_get_program(std::vector<char>& ptx, std::string& lowered_name, uint6
}
}

KawPow_build_program(ptx, lowered_name, period, arch_major, arch_minor, source_code, dag_sizes);
KawPow_build_program(ptx, lowered_name, period, arch_major, arch_minor, source_code);
}
3 changes: 2 additions & 1 deletion src/KawPow/raven/CudaKawPow_gen.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <vector>
#include <string>

void KawPow_get_program(std::vector<char>& ptx, std::string& lowered_name, uint64_t period, int arch_major, int arch_minor, const uint64_t* dag_sizes, bool background = false);
void KawPow_get_program(std::vector<char>& ptx, std::string& lowered_name, uint64_t period, uint32_t threads, int arch_major, int arch_minor, const uint64_t* dag_sizes, bool background = false);
void calculate_fast_mod_data(uint32_t divisor, uint32_t& reciprocal, uint32_t& increment, uint32_t& shift);

#endif // XMRIG_CUDAKAWPOW_GEN_H
60 changes: 11 additions & 49 deletions src/KawPow/raven/KawPow.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,44 +33,6 @@
#include "CudaKawPow_gen.h"


static inline uint32_t clz(uint32_t a)
{
#ifdef _MSC_VER
unsigned long index;
_BitScanReverse(&index, a);
return 31 - index;
#else
return __builtin_clz(a);
#endif
}


static void calculate_fast_mod_data(uint32_t divisor, uint32_t& reciprocal, uint32_t& increment, uint32_t& shift)
{
if ((divisor & (divisor - 1)) == 0) {
reciprocal = 1;
increment = 0;
shift = 31U - clz(divisor);
}
else {
shift = 63U - clz(divisor);
const uint64_t N = 1ULL << shift;
const uint64_t q = N / divisor;
const uint64_t r = N - q * divisor;
if (r * 2 < divisor)
{
reciprocal = static_cast<uint32_t>(q);
increment = 1;
}
else
{
reciprocal = static_cast<uint32_t>(q + 1);
increment = 0;
}
}
}


void kawpow_prepare(nvid_ctx *ctx, const void* cache, size_t cache_size, const void* dag_precalc, size_t dag_size, uint32_t height, const uint64_t* dag_sizes)
{
constexpr size_t MEM_ALIGN = 1024 * 1024;
Expand Down Expand Up @@ -140,25 +102,28 @@ void kawpow_prepare(nvid_ctx *ctx, const void* cache, size_t cache_size, const v

std::vector<char> ptx;
std::string lowered_name;
KawPow_get_program(ptx, lowered_name, period, ctx->device_arch[0], ctx->device_arch[1], dag_sizes);
KawPow_get_program(ptx, lowered_name, period, ctx->device_threads, ctx->device_arch[0], ctx->device_arch[1], dag_sizes);

CU_CHECK(ctx->device_id, cuModuleLoadDataEx(&ctx->kawpow_module, ptx.data(), 0, 0, 0));
CU_CHECK(ctx->device_id, cuModuleGetFunction(&ctx->kawpow_kernel, ctx->kawpow_module, lowered_name.c_str()));

ctx->kawpow_period = period;

KawPow_get_program(ptx, lowered_name, period + 1, ctx->device_arch[0], ctx->device_arch[1], dag_sizes, true);
KawPow_get_program(ptx, lowered_name, period + 1, ctx->device_threads, ctx->device_arch[0], ctx->device_arch[1], dag_sizes, true);
}

if (!ctx->kawpow_stop) {
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->kawpow_stop, sizeof(uint32_t) * 2));
if (!ctx->kawpow_stop_host) {
CUDA_CHECK(ctx->device_id, cudaMallocHost(&ctx->kawpow_stop_host, sizeof(uint32_t) * 2));
CUDA_CHECK(ctx->device_id, cudaHostGetDevicePointer(&ctx->kawpow_stop_device, ctx->kawpow_stop_host, 0));
}
}


void kawpow_stop_hash(nvid_ctx *ctx)
{
// TODO: this is called from the main thread which doesn't have a valid CUDA context, so nothing works here
if (ctx->kawpow_stop_host) {
*ctx->kawpow_stop_host = 1;
}
}


Expand All @@ -170,11 +135,11 @@ void hash(nvid_ctx *ctx, uint8_t* job_blob, uint64_t target, uint32_t *rescount,
dim3 block(ctx->device_threads);

uint32_t hack_false = 0;
void* args[] = { &ctx->kawpow_dag, &ctx->d_input, &target, &hack_false, &ctx->d_result_nonce, &ctx->kawpow_stop };
void* args[] = { &ctx->kawpow_dag, &ctx->d_input, &target, &hack_false, &ctx->d_result_nonce, &ctx->kawpow_stop_device };

CUDA_CHECK(ctx->device_id, cudaMemcpy(ctx->d_input, job_blob, 40, cudaMemcpyHostToDevice));
CUDA_CHECK(ctx->device_id, cudaMemset(ctx->d_result_nonce, 0, sizeof(uint32_t)));
CUDA_CHECK(ctx->device_id, cudaMemset(ctx->kawpow_stop, 0, sizeof(uint32_t) * 2));
memset(ctx->kawpow_stop_host, 0, sizeof(uint32_t) * 2);

CU_CHECK(ctx->device_id, cuLaunchKernel(
ctx->kawpow_kernel,
Expand All @@ -184,10 +149,7 @@ void hash(nvid_ctx *ctx, uint8_t* job_blob, uint64_t target, uint32_t *rescount,
));
CU_CHECK(ctx->device_id, cuCtxSynchronize());

uint32_t stop[2];
CUDA_CHECK(ctx->device_id, cudaMemcpy(stop, ctx->kawpow_stop, sizeof(stop), cudaMemcpyDeviceToHost));

*skipped_hashes = stop[1];
*skipped_hashes = ctx->kawpow_stop_host[1];

uint32_t results[16];
CUDA_CHECK(ctx->device_id, cudaMemcpy(results, ctx->d_result_nonce, sizeof(results), cudaMemcpyDeviceToHost));
Expand Down
8 changes: 4 additions & 4 deletions src/KawPow/raven/KawPow.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ DEV_INLINE void progPowLoop(const uint32_t loop, uint32_t mix[PROGPOW_REGS], con

// global load
offset = SHFL(mix[0], loop % PROGPOW_LANES, PROGPOW_LANES);
offset %= PROGPOW_DAG_ELEMENTS;
XMRIG_INCLUDE_OFFSET_MOD_DAG_ELEMENTS
offset = offset * PROGPOW_LANES + (lane_id ^ loop) % PROGPOW_LANES;
data_dag = g_dag[offset];

Expand Down Expand Up @@ -194,12 +194,12 @@ __device__ __forceinline__ void fill_mix(uint32_t* hash_seed, uint32_t lane_id,
mix[i] = kiss99(st);
}

__global__ void progpow_search(const dag_t *g_dag, const uint32_t* job_blob, const uint64_t target, bool hack_false, uint32_t* results, uint32_t* stop)
__global__ void XMRIG_INCLUDE_LAUNCH_BOUNDS progpow_search(const dag_t *g_dag, const uint32_t* job_blob, const uint64_t target, bool hack_false, uint32_t* results, uint32_t* stop)
{
if (*stop) {
if (threadIdx.x == 0) {
if ((threadIdx.x == 0) && ((blockIdx.x & 15) == 0)) {
// Count groups of skipped hashes (if we don't count them we'll break hashrate display)
atomicAdd(stop + 1, blockDim.x);
atomicAdd(stop + 1, blockDim.x * 16);
}
return;
}
Expand Down
2 changes: 1 addition & 1 deletion src/KawPow/raven/KawPow_dag.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ __global__ void ethash_calculate_dag_item(uint32_t start, hash64_t *g_dag, uint6

const int thread_id = threadIdx.x & 3;

#pragma unroll(8)
#pragma unroll(4)
for (uint32_t i = 0; i < ETHASH_DATASET_PARENTS; ++i) {
uint32_t parent_index = fast_mod(fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]), light_words);

Expand Down
3 changes: 2 additions & 1 deletion src/cryptonight.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,8 @@ struct nvid_ctx {
size_t kawpow_dag_size = 0;
size_t kawpow_dag_capacity = 0;

uint32_t* kawpow_stop = nullptr;
uint32_t* kawpow_stop_host = nullptr;
uint32_t* kawpow_stop_device = nullptr;

uint32_t kawpow_period = 0;
CUmodule kawpow_module = nullptr;
Expand Down
2 changes: 1 addition & 1 deletion src/cuda_extra.cu
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,7 @@ int cuda_get_deviceinfo(nvid_ctx *ctx)
}

if ((ctx->algorithm.family() == Algorithm::KAWPOW) && ((ctx->device_blocks < 0) || (ctx->device_threads < 0))) {
ctx->device_threads = 128;
ctx->device_threads = 256;
ctx->device_blocks = props.multiProcessorCount * 2048;
}

Expand Down
4 changes: 2 additions & 2 deletions src/version.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@
#define APP_ID "xmrig-cuda"
#define APP_NAME "XMRig"
#define APP_DESC "XMRig CUDA plugin"
#define APP_VERSION "6.2.0"
#define APP_VERSION "6.2.1"
#define APP_DOMAIN "xmrig.com"
#define APP_SITE "www.xmrig.com"
#define APP_COPYRIGHT "Copyright (C) 2016-2020 xmrig.com"

#define APP_VER_MAJOR 6
#define APP_VER_MINOR 2
#define APP_VER_PATCH 0
#define APP_VER_PATCH 1

#define API_VERSION 3

Expand Down
2 changes: 1 addition & 1 deletion src/xmrig-cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -596,7 +596,7 @@ void release(nvid_ctx *ctx)

cudaFree(ctx->kawpow_cache);
cudaFree(ctx->kawpow_dag);
cudaFree(ctx->kawpow_stop);
cudaFreeHost(ctx->kawpow_stop_host);

cuModuleUnload(ctx->module);
cuModuleUnload(ctx->kawpow_module);
Expand Down

0 comments on commit ebe4b1d

Please sign in to comment.