From 5452725f1ae50b00a3143c2a717a5e836fffe259 Mon Sep 17 00:00:00 2001 From: Tony Butler Date: Tue, 18 Dec 2018 19:04:16 -0700 Subject: [PATCH 01/19] Add Rig ID to connection status web+cli --- xmrstak/http/webdesign.cpp | 1 + xmrstak/misc/executor.cpp | 2 ++ xmrstak/net/jpsock.hpp | 1 + 3 files changed, 4 insertions(+) diff --git a/xmrstak/http/webdesign.cpp b/xmrstak/http/webdesign.cpp index 93e217519..30a844349 100644 --- a/xmrstak/http/webdesign.cpp +++ b/xmrstak/http/webdesign.cpp @@ -168,6 +168,7 @@ extern const char sHtmlHashrateBodyLow [] = extern const char sHtmlConnectionBodyHigh [] = "
" "" + "" "" "" "" diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp index a303b34cd..c77645ff6 100644 --- a/xmrstak/misc/executor.cpp +++ b/xmrstak/misc/executor.cpp @@ -945,6 +945,7 @@ void executor::connection_report(std::string& out) pool = pick_pool_by_id(last_usr_pool_id); out.append("CONNECTION REPORT\n"); + out.append("Rig ID : ").append(pool != nullptr ? pool->get_rigid() : "").append(1, '\n'); out.append("Pool address : ").append(pool != nullptr ? pool->get_pool_addr() : "").append(1, '\n'); if(pool != nullptr && pool->is_running() && pool->is_logged_in()) out.append("Connected since : ").append(time_format(date, sizeof(date), tPoolConnTime)).append(1, '\n'); @@ -1145,6 +1146,7 @@ void executor::http_connection_report(std::string& out) } snprintf(buffer, sizeof(buffer), sHtmlConnectionBodyHigh, + pool != nullptr ? pool->get_rigid() : "", pool != nullptr ? pool->get_pool_addr() : "not connected", cdate, ping_time); out.append(buffer); diff --git a/xmrstak/net/jpsock.hpp b/xmrstak/net/jpsock.hpp index ad34f6c86..96fec6b98 100644 --- a/xmrstak/net/jpsock.hpp +++ b/xmrstak/net/jpsock.hpp @@ -58,6 +58,7 @@ class jpsock inline bool get_disconnects(size_t& att, size_t& time) { att = connect_attempts; time = disconnect_time != 0 ? get_timestamp() - disconnect_time + 1 : 0; return pool && usr_login[0]; } inline const char* get_pool_addr() { return net_addr.c_str(); } inline const char* get_tls_fp() { return tls_fp.c_str(); } + inline const char* get_rigid() { return usr_rigid.c_str(); } inline bool is_nicehash() { return nicehash; } bool get_pool_motd(std::string& strin); From 0e034f4ab4be2f563e030fde14a659065ddefda9 Mon Sep 17 00:00:00 2001 From: Tony Butler Date: Tue, 18 Dec 2018 19:15:24 -0700 Subject: [PATCH 02/19] Add backend thread tags to Thread IDs on Hashrate Report (web) --- xmrstak/http/webdesign.cpp | 2 +- xmrstak/misc/executor.cpp | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/xmrstak/http/webdesign.cpp b/xmrstak/http/webdesign.cpp index 93e217519..f58698e7e 100644 --- a/xmrstak/http/webdesign.cpp +++ b/xmrstak/http/webdesign.cpp @@ -157,7 +157,7 @@ extern const char sHtmlHashrateBodyHigh [] = ""; extern const char sHtmlHashrateTableRow [] = - ""; + ""; extern const char sHtmlHashrateBodyLow [] = "" diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp index a303b34cd..5d5b8948b 100644 --- a/xmrstak/misc/executor.cpp +++ b/xmrstak/misc/executor.cpp @@ -1040,9 +1040,27 @@ void executor::http_hashrate_report(std::string& out) out.append(buffer); double fTotal[3] = { 0.0, 0.0, 0.0}; + auto bTypePrev = static_cast(0); + std::string name; + size_t j = 0; for(size_t i=0; i < nthd; i++) { double fHps[3]; + char csThreadTag[25]; + auto bType = static_cast(pvThreads->at(i)->backendType); + if(bTypePrev == bType) + j++; + else + { + j = 0; + bTypePrev = bType; + name = xmrstak::iBackend::getName(bType); + std::transform(name.begin(), name.end(), name.begin(), ::toupper); + } + snprintf(csThreadTag, sizeof(csThreadTag), + (99 < nthd) ? "[%s.%03u]:%03u" : ((9 < nthd) ? "[%s.%02u]:%02u" : "[%s.%u]:%u"), + name.c_str(), (unsigned int)(j), (unsigned int)i + ); fHps[0] = telem->calc_telemetry_data(10000, i); fHps[1] = telem->calc_telemetry_data(60000, i); @@ -1057,7 +1075,7 @@ void executor::http_hashrate_report(std::string& out) fTotal[1] += fHps[1]; fTotal[2] += fHps[2]; - snprintf(buffer, sizeof(buffer), sHtmlHashrateTableRow, (unsigned int)i, num_a, num_b, num_c); + snprintf(buffer, sizeof(buffer), sHtmlHashrateTableRow, csThreadTag, num_a, num_b, num_c); out.append(buffer); } From 72c8dacfaa1c8754402fe1d3ba64015cc24ddc77 Mon Sep 17 00:00:00 2001 From: Tony Butler Date: Tue, 18 Dec 2018 19:37:34 -0700 Subject: [PATCH 03/19] Add currency to results report web+cli --- xmrstak/http/webdesign.cpp | 1 + xmrstak/misc/executor.cpp | 3 +++ 2 files changed, 4 insertions(+) diff --git a/xmrstak/http/webdesign.cpp b/xmrstak/http/webdesign.cpp index 93e217519..27653bba4 100644 --- a/xmrstak/http/webdesign.cpp +++ b/xmrstak/http/webdesign.cpp @@ -185,6 +185,7 @@ extern const char sHtmlConnectionBodyLow [] = extern const char sHtmlResultBodyHigh [] = "
" "
Rig ID%s
Pool address%s
Connected since%s
Pool ping time%u ms
Thread ID10s60s15mH/s
%u%s%s%s
%s%s%s%s
Totals:%s%s%s
" + "" "" "" "" diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp index a303b34cd..c0054ce43 100644 --- a/xmrstak/misc/executor.cpp +++ b/xmrstak/misc/executor.cpp @@ -884,6 +884,8 @@ void executor::result_report(std::string& out) iTotalRes += vMineResults[i].count; out.append("RESULT REPORT\n"); + out.append("Currency : "). + append(jconf::inst()->GetMiningCoin()).append("\n"); if(iTotalRes == 0) { out.append("You haven't found any results yet.\n"); @@ -1100,6 +1102,7 @@ void executor::http_result_report(std::string& out) } snprintf(buffer, sizeof(buffer), sHtmlResultBodyHigh, + jconf::inst()->GetMiningCoin().c_str(), iPoolDiff, iGoodRes, iTotalRes, fGoodResPrc, fAvgResTime, iPoolHashes, int_port(iTopDiff[0]), int_port(iTopDiff[1]), int_port(iTopDiff[2]), int_port(iTopDiff[3]), int_port(iTopDiff[4]), int_port(iTopDiff[5]), int_port(iTopDiff[6]), int_port(iTopDiff[7]), From a39ee0886cf613b70490164c4f33b066230709bc Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sat, 29 Dec 2018 20:53:43 +0100 Subject: [PATCH 04/19] OpenCL: allow more than two algorithms In the current implementation the POW algorithm in dev pool section of a currency will not be taken into account during the binary creation. This PR changes the behavior and allow to create binaries for more than two POW algorihms. --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 132 ++++++++++++---------------- xmrstak/backend/amd/amd_gpu/gpu.hpp | 6 +- 2 files changed, 61 insertions(+), 77 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 408cad97a..e4fb765e7 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -390,18 +390,26 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } - xmrstak_algo miner_algo[2] = { + std::array selectedAlgos = { + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo(), + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot(), ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo(), ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() }; - int num_algos = miner_algo[0] == miner_algo[1] ? 1 : 2; - for(int ii = 0; ii < num_algos; ++ii) + for(int ii = 0; ii < selectedAlgos.size(); ++ii) { + xmrstak_algo miner_algo = selectedAlgos[ii]; + bool alreadyCompiled = ctx->Program.find(miner_algo) != ctx->Program.end(); + if(alreadyCompiled) + { + printer::inst()->print_msg(L1,"OpenCL device %u - Skip %u",ctx->deviceIdx, (uint32_t)miner_algo); + continue; + } // scratchpad size for the selected mining algorithm - size_t hashMemSize = cn_select_memory(miner_algo[ii]); - int threadMemMask = cn_select_mask(miner_algo[ii]); - int hashIterations = cn_select_iter(miner_algo[ii]); + size_t hashMemSize = cn_select_memory(miner_algo); + int threadMemMask = cn_select_mask(miner_algo); + int hashIterations = cn_select_iter(miner_algo); size_t mem_chunk_exp = 1u << ctx->memChunk; size_t strided_index = ctx->stridedIndex; @@ -409,7 +417,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ * this is required if the dev pool is mining monero * but the user tuned there settings for another currency */ - if(miner_algo[ii] == cryptonight_monero_v8) + if(miner_algo == cryptonight_monero_v8) { if(ctx->memChunk < 2) mem_chunk_exp = 1u << 2; @@ -428,7 +436,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp) + "U"; options += " -DCOMP_MODE=" + std::to_string(needCompMode); options += " -DMEMORY=" + std::to_string(hashMemSize) + "LU"; - options += " -DALGO=" + std::to_string(miner_algo[ii]); + options += " -DALGO=" + std::to_string(miner_algo); options += " -DCN_UNROLL=" + std::to_string(ctx->unroll); /* AMD driver output is something like: `1445.5 (VM)` * and is mapped to `14` only. The value is only used for a compiler @@ -457,20 +465,20 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ { if(xmrstak::params::inst().AMDCache) printer::inst()->print_msg(L1,"OpenCL device %u - Precompiled code %s not found. Compiling ...",ctx->deviceIdx, cache_file.c_str()); - ctx->Program[ii] = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret); + ctx->Program[miner_algo] = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret); if(ret != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret)); return ERR_OCL_API; } - ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, options.c_str(), NULL, NULL); + ret = clBuildProgram(ctx->Program[miner_algo], 1, &ctx->DeviceID, options.c_str(), NULL, NULL); if(ret != CL_SUCCESS) { size_t len; printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret)); - if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS) + if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret)); return ERR_OCL_API; @@ -479,7 +487,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ char* BuildLog = (char*)malloc(len + 1); BuildLog[0] = '\0'; - if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS) + if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS) { free(BuildLog); printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret)); @@ -494,11 +502,11 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ } cl_uint num_devices; - clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices,NULL); + clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices,NULL); std::vector devices_ids(num_devices); - clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL); + clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL); int dev_id = 0; /* Search for the gpu within the program context. * The id can be different to ctx->DeviceID. @@ -513,7 +521,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ cl_build_status status; do { - if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS) + if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret)); return ERR_OCL_API; @@ -525,7 +533,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ if(xmrstak::params::inst().AMDCache) { std::vector binary_sizes(num_devices); - clGetProgramInfo (ctx->Program[ii], CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL); + clGetProgramInfo (ctx->Program[miner_algo], CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL); std::vector all_programs(num_devices); std::vector> program_storage; @@ -541,7 +549,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ p_id++; } - if((ret = clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS) + if((ret = clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clGetProgramInfo.", err_to_str(ret)); return ERR_OCL_API; @@ -565,7 +573,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ auto data_ptr = s.data(); cl_int clStatus; - ctx->Program[ii] = clCreateProgramWithBinary( + ctx->Program[miner_algo] = clCreateProgramWithBinary( opencl_ctx, 1, &ctx->DeviceID, &bin_size, (const unsigned char **)&data_ptr, &clStatus, &ret ); @@ -574,7 +582,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str()); return ERR_OCL_API; } - ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, NULL, NULL, NULL); + ret = clBuildProgram(ctx->Program[miner_algo], 1, &ctx->DeviceID, NULL, NULL, NULL); if(ret != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str()); @@ -585,37 +593,16 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ std::vector KernelNames = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein" }; // append algorithm number to kernel name for(int k = 0; k < 3; k++) - KernelNames[k] += std::to_string(miner_algo[ii]); + KernelNames[k] += std::to_string(miner_algo); - if(ii == 0) - { - for(int i = 0; i < 7; ++i) - { - ctx->Kernels[ii][i] = clCreateKernel(ctx->Program[ii], KernelNames[i].c_str(), &ret); - if(ret != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_0 %s.", err_to_str(ret), KernelNames[i].c_str()); - return ERR_OCL_API; - } - } - } - else + for(int i = 0; i < 7; ++i) { - for(int i = 0; i < 3; ++i) - { - ctx->Kernels[ii][i] = clCreateKernel(ctx->Program[ii], KernelNames[i].c_str(), &ret); - if(ret != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_1 %s.", err_to_str(ret), KernelNames[i].c_str()); - return ERR_OCL_API; - } - } - // move kernel from the main algorithm into the root algorithm kernel space - for(int i = 3; i < 7; ++i) + ctx->Kernels[miner_algo][i] = clCreateKernel(ctx->Program[miner_algo], KernelNames[i].c_str(), &ret); + if(ret != CL_SUCCESS) { - ctx->Kernels[ii][i] = ctx->Kernels[0][i]; + printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_0 %s.", err_to_str(ret), KernelNames[i].c_str()); + return ERR_OCL_API; } - } } ctx->Nonce = 0; @@ -996,8 +983,6 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo) { - // switch to the kernel storage - int kernel_storage = miner_algo == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ? 0 : 1; cl_int ret; @@ -1015,28 +1000,28 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar return ERR_OCL_API; } - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[miner_algo][0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // Scratchpads - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[miner_algo][0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // States - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[miner_algo][0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret)); return ERR_OCL_API; } // Threads - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 3, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[miner_algo][0], 3, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret)); return(ERR_OCL_API); @@ -1045,21 +1030,21 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar // CN1 Kernel // Scratchpads - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[miner_algo][1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // States - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[miner_algo][1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // Threads - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 2, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[miner_algo][1], 2, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret)); return(ERR_OCL_API); @@ -1068,7 +1053,7 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon || miner_algo == cryptonight_ipbc || miner_algo == cryptonight_stellite || miner_algo == cryptonight_masari || miner_algo == cryptonight_bittube2) { // Input - if ((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) + if ((ret = clSetKernelArg(ctx->Kernels[miner_algo][1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) { printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 4(input buffer).", err_to_str(ret)); return ERR_OCL_API; @@ -1077,49 +1062,49 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar // CN3 Kernel // Scratchpads - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // States - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // Branch 0 - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret)); return ERR_OCL_API; } // Branch 1 - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret)); return ERR_OCL_API; } // Branch 2 - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); return ERR_OCL_API; } // Branch 3 - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret)); return ERR_OCL_API; } // Threads - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret)); return(ERR_OCL_API); @@ -1128,34 +1113,34 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar for(int i = 0; i < 4; ++i) { // States - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[miner_algo][i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0); return ERR_OCL_API; } // Nonce buffer - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[miner_algo][i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1); return ERR_OCL_API; } // Output - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[miner_algo][i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2); return ERR_OCL_API; } // Target - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) + if((ret = clSetKernelArg(ctx->Kernels[miner_algo][i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3); return ERR_OCL_API; } - if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + if((clSetKernelArg(ctx->Kernels[miner_algo][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); return(ERR_OCL_API); @@ -1258,9 +1243,6 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment) size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) { - // switch to the kernel storage - int kernel_storage = miner_algo == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ? 0 : 1; - cl_int ret; cl_uint zero = 0; size_t BranchNonces[4]; @@ -1294,7 +1276,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) } size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { 8, 8 }; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[miner_algo][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0); return ERR_OCL_API; @@ -1302,13 +1284,13 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) size_t tmpNonce = ctx->Nonce; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[miner_algo][1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); return ERR_OCL_API; } - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[miner_algo][2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2); return ERR_OCL_API; @@ -1317,7 +1299,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) for(int i = 0; i < 4; ++i) { size_t tmpNonce = ctx->Nonce; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[miner_algo][i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); return ERR_OCL_API; diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index 80fcbefde..5e9f618ed 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -14,6 +14,8 @@ #include #include #include +#include +#include #define ERR_SUCCESS (0) #define ERR_OCL_API (2) @@ -50,8 +52,8 @@ struct GpuContext cl_mem InputBuffer; cl_mem OutputBuffer; cl_mem ExtraBuffers[6]; - cl_program Program[2]; - cl_kernel Kernels[2][8]; + std::map Program; + std::map> Kernels; size_t freeMem; size_t maxMemPerAlloc; int computeUnits; From 758dbfb14b4ca54e44ba9dca1ce5b81a34d00603 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sat, 29 Dec 2018 22:16:46 +0100 Subject: [PATCH 05/19] improve POW algorithm selection - add helper method `GetAllAlgorithms()` to get all active POW algorithms - select max scratchpad memory size based on the dev pool and user algorithms --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 27 ++++------- xmrstak/backend/amd/autoAdjust.hpp | 27 +++++------ xmrstak/backend/cpu/autoAdjust.hpp | 12 +++-- xmrstak/backend/cpu/autoAdjustHwloc.hpp | 14 +++--- .../backend/cpu/crypto/cryptonight_common.cpp | 22 +++++---- xmrstak/backend/cpu/minethd.cpp | 20 ++++---- .../backend/nvidia/nvcc_code/cuda_extra.cu | 47 ++++++++++--------- xmrstak/misc/coinDescription.hpp | 25 +++++++++- 8 files changed, 109 insertions(+), 85 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index e4fb765e7..e9a1447ca 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -316,10 +316,13 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } - size_t scratchPadSize = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ); + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + + size_t scratchPadSize = 0; + for(const auto algo : neededAlgorithms) + { + scratchPadSize = std::max(scratchPadSize, cn_select_memory(algo)); + } size_t g_thd = ctx->rawIntensity; ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, scratchPadSize * g_thd, NULL, &ret); @@ -390,22 +393,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } - std::array selectedAlgos = { - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo(), - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot(), - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo(), - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() - }; - - for(int ii = 0; ii < selectedAlgos.size(); ++ii) + for(const auto miner_algo : neededAlgorithms) { - xmrstak_algo miner_algo = selectedAlgos[ii]; - bool alreadyCompiled = ctx->Program.find(miner_algo) != ctx->Program.end(); - if(alreadyCompiled) - { - printer::inst()->print_msg(L1,"OpenCL device %u - Skip %u",ctx->deviceIdx, (uint32_t)miner_algo); - continue; - } // scratchpad size for the selected mining algorithm size_t hashMemSize = cn_select_memory(miner_algo); int threadMemMask = cn_select_mask(miner_algo); diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index ba4cebb7b..0442fde91 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -83,10 +83,13 @@ class autoAdjust constexpr size_t byteToMiB = 1024u * 1024u; - size_t hashMemSize = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ); + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + + size_t hashMemSize = 0; + for(const auto algo : neededAlgorithms) + { + hashMemSize = std::max(hashMemSize, cn_select_memory(algo)); + } std::string conf; for(auto& ctx : devVec) @@ -128,18 +131,10 @@ class autoAdjust } // check if cryptonight_monero_v8 is selected for the user or dev pool - bool useCryptonight_v8 = - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 || - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 || - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 || - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8; + bool useCryptonight_v8 = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end(); // true for all cryptonight_heavy derivates since we check the user and dev pool - bool useCryptonight_heavy = - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_heavy || - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_heavy || - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_heavy || - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_heavy; + bool useCryptonight_heavy = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end(); // set strided index to default ctx.stridedIndex = 1; @@ -154,8 +149,8 @@ class autoAdjust else if(useCryptonight_heavy) ctx.stridedIndex = 3; - // increase all intensity limits by two for aeon - if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite) + // increase all intensity limits by two if scratchpad is only 1 MiB + if(hashMemSize <= CRYPTONIGHT_LITE_MEMORY) maxThreads *= 2u; // keep 128MiB memory free (value is randomly chosen) from the max available memory diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp index e7f3e9148..91da7a6ee 100644 --- a/xmrstak/backend/cpu/autoAdjust.hpp +++ b/xmrstak/backend/cpu/autoAdjust.hpp @@ -28,11 +28,15 @@ class autoAdjust bool printConfig() { + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + + size_t hashMemSize = 0; + for(const auto algo : neededAlgorithms) + { + hashMemSize = std::max(hashMemSize, cn_select_memory(algo)); + } + const size_t hashMemSizeKB = hashMemSize / 1024u; - const size_t hashMemSizeKB = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ) / 1024u; const size_t halfHashMemSizeKB = hashMemSizeKB / 2u; configEditor configTpl{}; diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp index b61582588..39e80a376 100644 --- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp +++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp @@ -28,10 +28,12 @@ class autoAdjust autoAdjust() { - hashMemSize = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ); + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + + for(const auto algo : neededAlgorithms) + { + hashMemSize = std::max(hashMemSize, cn_select_memory(algo)); + } halfHashMemSize = hashMemSize / 2u; } @@ -93,8 +95,8 @@ class autoAdjust } private: - size_t hashMemSize; - size_t halfHashMemSize; + size_t hashMemSize = 0; + size_t halfHashMemSize = 0; std::vector results; diff --git a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp index a7e4696a8..ee1ff2386 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp +++ b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp @@ -203,10 +203,13 @@ size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg) cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg) { - size_t hashMemSize = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ); + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + + size_t hashMemSize = 0; + for(const auto algo : neededAlgorithms) + { + hashMemSize = std::max(hashMemSize, cn_select_memory(algo)); + } cryptonight_ctx* ptr = (cryptonight_ctx*)_mm_malloc(sizeof(cryptonight_ctx), 4096); @@ -284,10 +287,13 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al void cryptonight_free_ctx(cryptonight_ctx* ctx) { - size_t hashMemSize = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ); + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + + size_t hashMemSize = 0; + for(const auto algo : neededAlgorithms) + { + hashMemSize = std::max(hashMemSize, cn_select_memory(algo)); + } if(ctx->ctx_info[0] != 0) { diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 20203a3c5..975f2d4a2 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -239,15 +239,17 @@ bool minethd::self_test() cn_hash_fun hashf; cn_hash_fun hashf_multi; - xmrstak_algo algo = xmrstak_algo::invalid_algo; - - for(int algo_idx = 0; algo_idx < 2; ++algo_idx) + if(xmrstak_algo::invalid_algo == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() || + xmrstak_algo::invalid_algo == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) { - if(algo_idx == 0) - algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo(); - else - algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); + printer::inst()->print_msg(L0, "Root algorithm is not allowed to be invalid"); + return false; + } + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + + for(const auto algo : neededAlgorithms) + { if(algo == cryptonight) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); @@ -590,7 +592,7 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc Cryptonight_hash::template hash, Cryptonight_hash::template hash, Cryptonight_hash::template hash, - + Cryptonight_hash::template hash, Cryptonight_hash::template hash, Cryptonight_hash::template hash, @@ -633,7 +635,7 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc printer::inst()->print_msg(L1, "Assembler '%s' unknown, fallback to non asm version of cryptonight_v8", selected_asm.c_str()); } } - + return selected_function; } diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index 45afec9ac..f98de2c14 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -287,19 +287,22 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) // prefer shared memory over L1 cache CUDA_CHECK(ctx->device_id, cudaDeviceSetCacheConfig(cudaFuncCachePreferShared)); - size_t hashMemSize = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ); + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + + size_t hashMemSize = 0; + for(const auto algo : neededAlgorithms) + { + hashMemSize = std::max(hashMemSize, cn_select_memory(algo)); + } size_t wsize = ctx->device_blocks * ctx->device_threads; CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state, 50 * sizeof(uint32_t) * wsize)); size_t ctx_b_size = 4 * sizeof(uint32_t) * wsize; if( - cryptonight_heavy == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || - cryptonight_haven == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || - cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || - cryptonight_superfast == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_haven) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_bittube2) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end() ) { // extent ctx_b to hold the state of idx0 @@ -307,8 +310,7 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) // create a double buffer for the state to exchange the mixed state to phase1 CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state2, 50 * sizeof(uint32_t) * wsize)); } - else if(cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || - cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) + else if(std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()) { // bx1 (16byte), division_result (8byte) and sqrt_result (8byte) ctx_b_size = 3 * 4 * sizeof(uint32_t) * wsize; @@ -409,7 +411,7 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**", cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) ); - } + } else if(miner_algo == cryptonight_bittube2) { CUDA_CHECK_MSG_KERNEL( @@ -656,10 +658,13 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) ctx->total_device_memory = totalMemory; ctx->free_device_memory = freeMemory; - size_t hashMemSize = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ); + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + + size_t hashMemSize = 0; + for(const auto algo : neededAlgorithms) + { + hashMemSize = std::max(hashMemSize, cn_select_memory(algo)); + } #ifdef WIN32 /* We use in windows bfactor (split slow kernel into smaller parts) to avoid @@ -688,10 +693,10 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) // 680bytes are extra meta data memory per hash size_t perThread = hashMemSize + 16192u + 680u; if( - cryptonight_heavy == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || - cryptonight_haven == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || - cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || - cryptonight_superfast == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_haven) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_bittube2) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end() ) perThread += 50 * 4; // state double buffer @@ -707,9 +712,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) } // check if cryptonight_monero_v8 is selected for the user pool - bool useCryptonight_v8 = - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 || - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8; + bool useCryptonight_v8 = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end(); // overwrite default config if cryptonight_monero_v8 is mined and GPU has at least compute capability 5.0 if(useCryptonight_v8 && gpuArch >= 50) diff --git a/xmrstak/misc/coinDescription.hpp b/xmrstak/misc/coinDescription.hpp index 55e86f4e2..26688aeea 100644 --- a/xmrstak/misc/coinDescription.hpp +++ b/xmrstak/misc/coinDescription.hpp @@ -4,7 +4,8 @@ #include #include - +#include +#include namespace xmrstak { @@ -56,5 +57,27 @@ namespace xmrstak coinDescription tmp = (poolId == 0 ? pool_coin[1] : pool_coin[0]); return tmp; } + + /** return all POW algorithm for the current selected currency + * + * @return required POW algorithms without duplicated entries + */ + inline std::vector GetAllAlgorithms() + { + std::vector allAlgos = { + GetDescription(0).GetMiningAlgo(), + GetDescription(0).GetMiningAlgoRoot(), + GetDescription(1).GetMiningAlgo(), + GetDescription(1).GetMiningAlgoRoot() + }; + + std::sort(allAlgos.begin(), allAlgos.end()); + std::remove(allAlgos.begin(), allAlgos.end(), invalid_algo); + auto last = std::unique(allAlgos.begin(), allAlgos.end()); + // remove duplicated algorithms + allAlgos.erase(last, allAlgos.end()); + + return allAlgos; + } }; } // namespace xmrstak From 0643f60177aa473d4772e887582ae9a4b6613c0c Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sat, 29 Dec 2018 22:26:43 +0100 Subject: [PATCH 06/19] OpenCl: avoid multiple map lookups Avoid that we do multiple lookups to `std::map` to find the OpenCL kernel binaries. --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 52 ++++++++++++++++------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index e9a1447ca..e0f3bcf1f 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -973,6 +973,8 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo) { + const auto & Kernels = ctx->Kernels[miner_algo]; + cl_int ret; if(input_len > 84) @@ -989,28 +991,28 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar return ERR_OCL_API; } - if((ret = clSetKernelArg(ctx->Kernels[miner_algo][0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // Scratchpads - if((ret = clSetKernelArg(ctx->Kernels[miner_algo][0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // States - if((ret = clSetKernelArg(ctx->Kernels[miner_algo][0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret)); return ERR_OCL_API; } // Threads - if((ret = clSetKernelArg(ctx->Kernels[miner_algo][0], 3, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[0], 3, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret)); return(ERR_OCL_API); @@ -1019,21 +1021,21 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar // CN1 Kernel // Scratchpads - if((ret = clSetKernelArg(ctx->Kernels[miner_algo][1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // States - if((ret = clSetKernelArg(ctx->Kernels[miner_algo][1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // Threads - if((ret = clSetKernelArg(ctx->Kernels[miner_algo][1], 2, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[1], 2, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret)); return(ERR_OCL_API); @@ -1042,7 +1044,7 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon || miner_algo == cryptonight_ipbc || miner_algo == cryptonight_stellite || miner_algo == cryptonight_masari || miner_algo == cryptonight_bittube2) { // Input - if ((ret = clSetKernelArg(ctx->Kernels[miner_algo][1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) + if ((ret = clSetKernelArg(Kernels[1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) { printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 4(input buffer).", err_to_str(ret)); return ERR_OCL_API; @@ -1051,49 +1053,49 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar // CN3 Kernel // Scratchpads - if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // States - if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // Branch 0 - if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret)); return ERR_OCL_API; } // Branch 1 - if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret)); return ERR_OCL_API; } // Branch 2 - if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); return ERR_OCL_API; } // Branch 3 - if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret)); return ERR_OCL_API; } // Threads - if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret)); return(ERR_OCL_API); @@ -1102,34 +1104,34 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar for(int i = 0; i < 4; ++i) { // States - if((ret = clSetKernelArg(ctx->Kernels[miner_algo][i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0); return ERR_OCL_API; } // Nonce buffer - if((ret = clSetKernelArg(ctx->Kernels[miner_algo][i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1); return ERR_OCL_API; } // Output - if((ret = clSetKernelArg(ctx->Kernels[miner_algo][i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2); return ERR_OCL_API; } // Target - if((ret = clSetKernelArg(ctx->Kernels[miner_algo][i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3); return ERR_OCL_API; } - if((clSetKernelArg(ctx->Kernels[miner_algo][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + if((clSetKernelArg(Kernels[i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); return(ERR_OCL_API); @@ -1232,6 +1234,8 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment) size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) { + const auto & Kernels = ctx->Kernels[miner_algo]; + cl_int ret; cl_uint zero = 0; size_t BranchNonces[4]; @@ -1265,7 +1269,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) } size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { 8, 8 }; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[miner_algo][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0); return ERR_OCL_API; @@ -1273,13 +1277,13 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) size_t tmpNonce = ctx->Nonce; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[miner_algo][1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); return ERR_OCL_API; } - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[miner_algo][2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2); return ERR_OCL_API; @@ -1288,7 +1292,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) for(int i = 0; i < 4; ++i) { size_t tmpNonce = ctx->Nonce; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[miner_algo][i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); return ERR_OCL_API; From 749751e3c01f5f1da977b3f49ff0d7061c1965fa Mon Sep 17 00:00:00 2001 From: Brandon Lehmann Date: Sun, 6 Jan 2019 14:59:31 -0500 Subject: [PATCH 07/19] Add CryptoNight Turtle Support. Special thanks to @DaveLong for his hard work in getting this done. --- README.md | 4 +- xmrstak/backend/amd/amd_gpu/gpu.cpp | 2 +- .../backend/amd/amd_gpu/opencl/cryptonight.cl | 20 ++++---- .../amd/amd_gpu/opencl/fast_int_math_v2.cl | 2 +- xmrstak/backend/amd/autoAdjust.hpp | 7 ++- .../backend/cpu/crypto/cryptonight_aesni.h | 12 ++--- xmrstak/backend/cpu/minethd.cpp | 48 ++++++++++++++++++- xmrstak/backend/cryptonight.hpp | 22 ++++++++- xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 21 ++++---- .../backend/nvidia/nvcc_code/cuda_extra.cu | 15 ++++-- xmrstak/jconf.cpp | 4 +- xmrstak/misc/executor.cpp | 1 + xmrstak/net/jpsock.cpp | 3 ++ xmrstak/pools.tpl | 3 ++ 14 files changed, 128 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 046a930e1..e0b62503c 100644 --- a/README.md +++ b/README.md @@ -48,12 +48,14 @@ Besides [Monero](https://getmonero.org), following coins can be mined using this - [QRL](https://theqrl.org) - **[Ryo](https://ryo-currency.com) - Upcoming xmr-stak-gui is sponsored by Ryo** - [TurtleCoin](https://turtlecoin.lol) +- [Plenteum](https://www.plenteum.com/) Ryo currency is a way for us to implement the ideas that we were unable to in Monero. See [here](https://github.com/fireice-uk/cryptonote-speedup-demo/) for details. If your prefered coin is not listed, you can choose one of the following algorithms: - +- 256Kib scratchpad memory + - cryptonight_turtle - 1MiB scratchpad memory - cryptonight_lite - cryptonight_lite_v7 diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index e0f3bcf1f..4ad4d59e9 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -406,7 +406,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ * this is required if the dev pool is mining monero * but the user tuned there settings for another currency */ - if(miner_algo == cryptonight_monero_v8) + if(miner_algo == cryptonight_monero_v8 || miner_algo == cryptonight_turtle) { if(ctx->memChunk < 2) mem_chunk_exp = 1u << 2; diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 6a3def72c..f647bcafc 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -538,7 +538,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, R"===( // cryptonight_monero_v8 && NVIDIA -#if(ALGO==11 && defined(__NV_CL_C_VERSION)) +#if((ALGO==11 || ALGO==13) && defined(__NV_CL_C_VERSION)) # define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4)))) # define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4))))) #else @@ -556,7 +556,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states ulong a[2]; // cryptonight_monero_v8 -#if(ALGO==11) +#if(ALGO==11 || ALGO==13) ulong b[4]; uint4 b_x[2]; // NVIDIA @@ -571,7 +571,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states __local uint AES0[256], AES1[256]; // cryptonight_monero_v8 -#if(ALGO==11) +#if(ALGO==11 || ALGO==13) # if defined(__clang__) && !defined(__NV_CL_C_VERSION) __local uint RCP[256]; # endif @@ -587,7 +587,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states AES0[i] = tmp; AES1[i] = rotate(tmp, 8U); // cryptonight_monero_v8 -#if(ALGO==11 && (defined(__clang__) && !defined(__NV_CL_C_VERSION))) +#if((ALGO==11 || ALGO==13) && (defined(__clang__) && !defined(__NV_CL_C_VERSION))) RCP[i] = RCP_C[i]; #endif } @@ -622,7 +622,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states b_x[0] = ((uint4 *)b)[0]; // cryptonight_monero_v8 -#if(ALGO==11) +#if(ALGO==11 || ALGO==13) a[1] = states[1] ^ states[5]; b[2] = states[8] ^ states[10]; b[3] = states[9] ^ states[11]; @@ -654,7 +654,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states { ulong c[2]; // cryptonight_monero_v8 && NVIDIA -#if(ALGO==11 && defined(__NV_CL_C_VERSION)) +#if((ALGO==11 || ALGO==13) && defined(__NV_CL_C_VERSION)) uint idxS = idx0 & 0x30U; *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; #endif @@ -668,7 +668,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif // cryptonight_monero_v8 -#if(ALGO==11) +#if(ALGO==11 || ALGO==13) { ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); @@ -693,7 +693,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states SCRATCHPAD_CHUNK(0) = b_x[0]; idx0 = as_uint2(c[0]).s0 & MASK; // cryptonight_monero_v8 -#elif(ALGO==11) +#elif(ALGO==11 || ALGO==13) SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0]; # ifdef __NV_CL_C_VERSION // flush shuffled data @@ -712,7 +712,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states uint4 tmp; tmp = SCRATCHPAD_CHUNK(0); // cryptonight_monero_v8 -#if(ALGO==11) +#if(ALGO==11 || ALGO==13) // Use division and square root results from the _previous_ iteration to hide the latency tmp.s0 ^= division_result.s0; tmp.s1 ^= division_result.s1 ^ sqrt_result; @@ -770,7 +770,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states ((uint4 *)a)[0] ^= tmp; // cryptonight_monero_v8 -#if (ALGO == 11) +#if (ALGO == 11 || ALGO==13) # if defined(__NV_CL_C_VERSION) // flush shuffled data SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line; diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl index c170387b4..4205a67c3 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl @@ -4,7 +4,7 @@ R"===( */ // cryptonight_monero_v8 -#if(ALGO==11) +#if(ALGO==11 || ALGO==13) static const __constant uint RCP_C[256] = { diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index 0442fde91..f2dce7f90 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -131,7 +131,8 @@ class autoAdjust } // check if cryptonight_monero_v8 is selected for the user or dev pool - bool useCryptonight_v8 = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end(); + bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_turtle) != neededAlgorithms.end()); // true for all cryptonight_heavy derivates since we check the user and dev pool bool useCryptonight_heavy = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end(); @@ -153,6 +154,10 @@ class autoAdjust if(hashMemSize <= CRYPTONIGHT_LITE_MEMORY) maxThreads *= 2u; + // increase all intensity limits by eight for turtle (*2u shadowed from lite) + if (hashMemSize <= CRYPTONIGHT_TURTLE_MEMORY) + maxThreads *= 4u; + // keep 128MiB memory free (value is randomly chosen) from the max available memory const size_t maxAvailableFreeMem = ctx.freeMem - minFreeMem; diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 06cbe8740..2218bf88a 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -545,7 +545,7 @@ inline void set_float_rounding_mode() #define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1) \ /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ - if(ALGO == cryptonight_monero_v8) \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) \ { \ const uint64_t idx1 = idx0 & MASK; \ const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \ @@ -558,7 +558,7 @@ inline void set_float_rounding_mode() #define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi) \ /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ - if(ALGO == cryptonight_monero_v8) \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) \ { \ const uint64_t idx1 = idx0 & MASK; \ const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \ @@ -572,7 +572,7 @@ inline void set_float_rounding_mode() } #define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \ - if(ALGO == cryptonight_monero_v8) \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) \ { \ uint64_t sqrt_result_tmp; \ assign(sqrt_result_tmp, sqrt_result); \ @@ -627,7 +627,7 @@ inline void set_float_rounding_mode() idx0 = h0[0] ^ h0[4]; \ ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \ bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \ - if(ALGO == cryptonight_monero_v8) \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) \ { \ bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \ division_result_xmm = _mm_cvtsi64_si128(h0[12]); \ @@ -664,7 +664,7 @@ inline void set_float_rounding_mode() ptr0 = (__m128i *)&l0[idx0 & MASK]; \ if(PREFETCH) \ _mm_prefetch((const char*)ptr0, _MM_HINT_T0); \ - if(ALGO != cryptonight_monero_v8) \ + if(ALGO != cryptonight_monero_v8 && ALGO != cryptonight_turtle) \ bx0 = cx #define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm) \ @@ -681,7 +681,7 @@ inline void set_float_rounding_mode() ah0 += lo; \ al0 += hi; \ } \ - if(ALGO == cryptonight_monero_v8) \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) \ { \ bx1 = bx0; \ bx0 = cx; \ diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 975f2d4a2..2327bed1d 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -403,6 +403,15 @@ bool minethd::self_test() hashf("\x03\x05\xa0\xdb\xd6\xbf\x05\xcf\x16\xe5\x03\xf3\xa6\x6f\x78\x00\x7c\xbf\x34\x14\x43\x32\xec\xbf\xc2\x2e\xd9\x5c\x87\x00\x38\x3b\x30\x9a\xce\x19\x23\xa0\x96\x4b\x00\x00\x00\x08\xba\x93\x9a\x62\x72\x4c\x0d\x75\x81\xfc\xe5\x76\x1e\x9d\x8a\x0e\x6a\x1c\x3f\x92\x4f\xdd\x84\x93\xd1\x11\x56\x49\xc0\x5e\xb6\x01", 76, out, ctx); bResult = bResult && memcmp(out, "\x40\x86\x5a\xa8\x87\x41\xec\x1d\xcc\xbd\x2b\xc6\xff\x36\xb9\x4d\x54\x71\x58\xdb\x94\x69\x8e\x3c\xa0\x3d\xe4\x81\x9a\x65\x9f\xef", 32) == 0; } + else if (algo == cryptonight_turtle) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_turtle); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0; + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_turtle); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0; + } if(!bResult) @@ -532,6 +541,9 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc case cryptonight_superfast: algv = 11; break; + case cryptonight_turtle: + algv = 12; + break; default: algv = 2; break; @@ -596,7 +608,12 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc Cryptonight_hash::template hash, Cryptonight_hash::template hash, Cryptonight_hash::template hash, - Cryptonight_hash::template hash + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash }; std::bitset<2> digit; @@ -636,6 +653,35 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc } } + if (N <= 2 && (algo == cryptonight_turtle) && bHaveAes) + { + std::string selected_asm = asm_version_str; + if (selected_asm == "auto") + selected_asm = cpu::getAsmName(N); + + if (selected_asm != "off") + { + if (selected_asm == "intel_avx" && asm_version_str != "auto") + { + // Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx) + if (N == 1) + selected_function = Cryptonight_hash_asm<1u, 0u>::template hash; + else if (N == 2) + selected_function = Cryptonight_hash_asm<2u, 0u>::template hash; + } + // supports only 1 thread per hash + if (N == 1 && selected_asm == "amd_avx") + { + // AMD Ryzen (1xxx and 2xxx series) + selected_function = Cryptonight_hash_asm<1u, 1u>::template hash; + } + if (asm_version_str == "auto" && (selected_asm != "intel_avx" || selected_asm != "amd_avx")) + printer::inst()->print_msg(L3, "Switch to assembler version for '%s' cpu's", selected_asm.c_str()); + else if (selected_asm != "intel_avx" && selected_asm != "amd_avx") // unknown asm type + printer::inst()->print_msg(L1, "Assembler '%s' unknown, fallback to non asm version of cryptonight_v8", selected_asm.c_str()); + } + } + return selected_function; } diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp index e905caa9f..2dd922f91 100644 --- a/xmrstak/backend/cryptonight.hpp +++ b/xmrstak/backend/cryptonight.hpp @@ -17,7 +17,8 @@ enum xmrstak_algo cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks cryptonight_monero_v8 = 11, - cryptonight_superfast = 12 + cryptonight_superfast = 12, + cryptonight_turtle = 13 }; // define aeon settings @@ -37,6 +38,10 @@ constexpr uint32_t CRYPTONIGHT_MASARI_ITER = 0x40000; constexpr uint32_t CRYPTONIGHT_SUPERFAST_ITER = 0x20000; +constexpr size_t CRYPTONIGHT_TURTLE_MEMORY = 256 * 1024; +constexpr uint32_t CRYPTONIGHT_TURTLE_MASK = 0x1FFF0; +constexpr uint32_t CRYPTONIGHT_TURTLE_ITER = 0x10000; + template inline constexpr size_t cn_select_memory() { return 0; } @@ -76,6 +81,9 @@ inline constexpr size_t cn_select_memory() { return CRYPTO template<> inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } +template<> +inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_TURTLE_MEMORY; } + inline size_t cn_select_memory(xmrstak_algo algo) { switch(algo) @@ -95,6 +103,8 @@ inline size_t cn_select_memory(xmrstak_algo algo) case cryptonight_haven: case cryptonight_heavy: return CRYPTONIGHT_HEAVY_MEMORY; + case cryptonight_turtle: + return CRYPTONIGHT_TURTLE_MEMORY; default: return 0; } @@ -139,6 +149,9 @@ inline constexpr uint32_t cn_select_mask() { return CRYPTO template<> inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } +template<> +inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_TURTLE_MASK; } + inline size_t cn_select_mask(xmrstak_algo algo) { switch(algo) @@ -158,6 +171,8 @@ inline size_t cn_select_mask(xmrstak_algo algo) case cryptonight_haven: case cryptonight_heavy: return CRYPTONIGHT_HEAVY_MASK; + case cryptonight_turtle: + return CRYPTONIGHT_TURTLE_MASK; default: return 0; } @@ -202,6 +217,9 @@ inline constexpr uint32_t cn_select_iter() { return CRYPTO template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_SUPERFAST_ITER; } +template<> +inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_TURTLE_ITER; } + inline size_t cn_select_iter(xmrstak_algo algo) { switch(algo) @@ -223,6 +241,8 @@ inline size_t cn_select_iter(xmrstak_algo algo) return CRYPTONIGHT_MASARI_ITER; case cryptonight_superfast: return CRYPTONIGHT_SUPERFAST_ITER; + case cryptonight_turtle: + return CRYPTONIGHT_TURTLE_ITER; default: return 0; } diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 87c1befa8..c3a97808c 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -310,7 +310,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in uint64_t bx1; uint32_t sqrt_result; uint64_t division_result; - if(ALGO == cryptonight_monero_v8) + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) { bx0 = ((uint64_t*)(d_ctx_b + thread * 12))[sub]; bx1 = ((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub]; @@ -350,7 +350,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.y >> 16) & 0xff ) ^ t_fn1( (cx.x >> 24 ) )) ); - if(ALGO == cryptonight_monero_v8) + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) { const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ]; @@ -393,14 +393,14 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in else ((ulonglong4*)myChunks)[sub] = ((ulonglong4*)ptr0)[sub]; - if(ALGO != cryptonight_monero_v8) + if(ALGO != cryptonight_monero_v8 && ALGO != cryptonight_turtle) bx0 = cx_aes; uint64_t cx_mul; ((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x , 0); ((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y , 0); - if(ALGO == cryptonight_monero_v8 && sub == 1) + if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) && sub == 1) { // Use division and square root results from the _previous_ iteration to hide the latency ((uint32_t*)&division_result)[1] ^= sqrt_result; @@ -424,7 +424,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in uint64_t cl = ((uint64_t*)myChunks)[ idx1 ]; // sub 0 -> hi, sub 1 -> lo uint64_t res = sub == 0 ? __umul64hi( cx_mul, cl ) : cx_mul * cl; - if(ALGO == cryptonight_monero_v8) + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) { const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ] ^ res; uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ]; @@ -441,7 +441,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in } ax0 += res; } - if(ALGO == cryptonight_monero_v8) + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) { bx1 = bx0; bx0 = cx_aes; @@ -464,7 +464,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in if ( bfactor > 0 ) { ((uint64_t*)(d_ctx_a + thread * 4))[sub] = ax0; - if(ALGO == cryptonight_monero_v8) + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) { ((uint64_t*)(d_ctx_b + thread * 12))[sub] = bx0; ((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub] = bx1; @@ -771,7 +771,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) for ( int i = 0; i < partcount; i++ ) { - if(ALGO == cryptonight_monero_v8) + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) { // two threads per block CUDA_CHECK_MSG_KERNEL( @@ -884,7 +884,10 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t cryptonight_core_gpu_hash, cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash }; std::bitset<1> digit; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index f98de2c14..7149f37a6 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -142,7 +142,7 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a ); XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b ); memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 ); - if(ALGO == cryptonight_monero_v8) + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) { memcpy( d_ctx_b + thread * 12, ctx_b, 4 * 4 ); // bx1 @@ -310,7 +310,8 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) // create a double buffer for the state to exchange the mixed state to phase1 CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state2, 50 * sizeof(uint32_t) * wsize)); } - else if(std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()) + else if(std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_turtle) != neededAlgorithms.end() ) { // bx1 (16byte), division_result (8byte) and sqrt_result (8byte) ctx_b_size = 3 * 4 * sizeof(uint32_t) * wsize; @@ -362,11 +363,16 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); } - if(miner_algo == cryptonight_monero_v8) + else if(miner_algo == cryptonight_monero_v8) { CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); } + else if (miner_algo == cryptonight_turtle) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare << > > (wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); + } else { /* pass two times d_ctx_state because the second state is used later in phase1, @@ -712,7 +718,8 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) } // check if cryptonight_monero_v8 is selected for the user pool - bool useCryptonight_v8 = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end(); + bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_turtle) != neededAlgorithms.end()); // overwrite default config if cryptonight_monero_v8 is mined and GPU has at least compute capability 5.0 if(useCryptonight_v8 && gpuArch >= 50) diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index 2a2dc8dbc..165595c5f 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -99,6 +99,7 @@ xmrstak::coin_selection coins[] = { { "cryptonight_lite_v7", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }, { "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }, { "cryptonight_superfast", {cryptonight_heavy, cryptonight_superfast, 255u},{cryptonight_heavy, cryptonight_superfast, 0u}, nullptr }, + { "cryptonight_turtle", {cryptonight_turtle, cryptonight_turtle, 0u}, {cryptonight_turtle, cryptonight_turtle, 0u}, nullptr }, { "cryptonight_v7", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, { "cryptonight_v8", {cryptonight_monero_v8, cryptonight_monero_v8, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, { "cryptonight_v7_stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, @@ -111,7 +112,8 @@ xmrstak::coin_selection coins[] = { { "qrl", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, { "ryo", {cryptonight_heavy, cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, { "stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, - { "turtlecoin", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr } + { "turtlecoin", {cryptonight_turtle, cryptonight_aeon, 5u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }, + { "plenteum", {cryptonight_turtle, cryptonight_aeon, 5u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr } }; constexpr size_t coin_algo_size = (sizeof(coins)/sizeof(coins[0])); diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp index a303b34cd..58d5b0e83 100644 --- a/xmrstak/misc/executor.cpp +++ b/xmrstak/misc/executor.cpp @@ -562,6 +562,7 @@ void executor::ex_main() break; case cryptonight_monero_v8: case cryptonight_monero: + case cryptonight_turtle: if(dev_tls) pools.emplace_front(0, "donate.xmr-stak.net:8800", "", "", "", 0.0, true, true, "", false); else diff --git a/xmrstak/net/jpsock.cpp b/xmrstak/net/jpsock.cpp index 406c535d2..cbdf1d0c1 100644 --- a/xmrstak/net/jpsock.cpp +++ b/xmrstak/net/jpsock.cpp @@ -709,6 +709,9 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes case cryptonight_superfast: algo_name = "cryptonight_superfast"; break; + case cryptonight_turtle: + algo_name = "cryptonight_turtle"; + break; default: algo_name = "unknown"; break; diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl index 58762de56..2019f2b86 100644 --- a/xmrstak/pools.tpl +++ b/xmrstak/pools.tpl @@ -33,9 +33,12 @@ POOLCONF], * qrl - Quantum Resistant Ledger * ryo * turtlecoin + * plenteum * * Native algorithms which not depends on any block versions: * + * # 256KiB scratchpad memory + * cryptonight_turtle * # 1MiB scratchpad memory * cryptonight_lite * cryptonight_lite_v7 From 346933d1e0977434944a26e313d3150880507b5c Mon Sep 17 00:00:00 2001 From: fireice-uk Date: Wed, 30 Jan 2019 08:47:57 +0000 Subject: [PATCH 08/19] Implement CN-GPU Proof-of-Work Algo Co-authored-by: psychocrypt Co-authored-by: fireice-uk --- CMakeLists.txt | 11 + xmrstak/backend/amd/amd_gpu/gpu.cpp | 179 ++++-- xmrstak/backend/amd/amd_gpu/gpu.hpp | 2 +- .../backend/amd/amd_gpu/opencl/cryptonight.cl | 193 +++--- .../amd/amd_gpu/opencl/cryptonight_gpu.cl | 383 ++++++++++++ xmrstak/backend/amd/autoAdjust.hpp | 13 +- xmrstak/backend/cpu/crypto/cn_gpu.hpp | 43 ++ xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp | 176 ++++++ xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp | 182 ++++++ .../backend/cpu/crypto/cryptonight_aesni.h | 72 ++- xmrstak/backend/cpu/minethd.cpp | 38 +- xmrstak/backend/cryptonight.hpp | 20 +- xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 77 ++- .../nvidia/nvcc_code/cuda_cryptonight_gpu.hpp | 552 ++++++++++++++++++ .../backend/nvidia/nvcc_code/cuda_extra.cu | 92 +-- .../backend/nvidia/nvcc_code/cuda_keccak.hpp | 20 + xmrstak/jconf.cpp | 1 + xmrstak/misc/executor.cpp | 6 + 18 files changed, 1866 insertions(+), 194 deletions(-) create mode 100644 xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl create mode 100644 xmrstak/backend/cpu/crypto/cn_gpu.hpp create mode 100644 xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp create mode 100644 xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp create mode 100644 xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b714ee0ce..7d21fa928 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -152,6 +152,9 @@ if(CUDA_ENABLE) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11") endif() + # required for cryptonight_gpu (fast floating point operations are not allowed) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --fmad=false --prec-div=true --ftz=false") + # avoid that nvcc in CUDA 8 complains about sm_20 pending removal if(CUDA_VERSION VERSION_EQUAL 8.0) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Wno-deprecated-gpu-targets") @@ -279,6 +282,14 @@ else() list(APPEND BACKEND_TYPES "cpu") endif() +################################################################################ +# Explicit march setting for Clang +################################################################################ + +if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + set_source_files_properties(xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp PROPERTIES COMPILE_FLAGS "-mavx2") +endif() + ################################################################################ # Find PThreads ################################################################################ diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 4ad4d59e9..857abc138 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -413,6 +413,10 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ if(strided_index == 1) strided_index = 0; } + if(miner_algo == cryptonight_gpu) + { + strided_index = 0; + } // if intensity is a multiple of worksize than comp mode is not needed int needCompMode = ctx->compMode && ctx->rawIntensity % ctx->workSize != 0 ? 1 : 0; @@ -433,6 +437,9 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ */ options += " -DOPENCL_DRIVER_MAJOR=" + std::to_string(std::stoi(openCLDriverVer.data()) / 100); + if(miner_algo == cryptonight_gpu) + options += " -cl-fp32-correctly-rounded-divide-sqrt"; + /* create a hash for the compile time cache * used data: * - source code @@ -579,12 +586,23 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ } } - std::vector KernelNames = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein" }; + std::vector KernelNames = { "cn2", "Blake", "Groestl", "JH", "Skein" }; + if(miner_algo == cryptonight_gpu) + { + KernelNames.insert(KernelNames.begin(), "cn1_cn_gpu"); + KernelNames.insert(KernelNames.begin(), "cn0_cn_gpu"); + } + else + { + KernelNames.insert(KernelNames.begin(), "cn1"); + KernelNames.insert(KernelNames.begin(), "cn0"); + } + // append algorithm number to kernel name for(int k = 0; k < 3; k++) KernelNames[k] += std::to_string(miner_algo); - for(int i = 0; i < 7; ++i) + for(int i = 0; i < KernelNames.size(); ++i) { ctx->Kernels[miner_algo][i] = clCreateKernel(ctx->Program[miner_algo], KernelNames[i].c_str(), &ret); if(ret != CL_SUCCESS) @@ -919,6 +937,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) const char *wolfSkeinCL = #include "./opencl/wolf-skein.cl" ; + const char *cryptonight_gpu = + #include "./opencl/cryptonight_gpu.cl" + ; std::string source_code(cryptonightCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL); @@ -928,6 +949,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_BLAKE256"), blake256CL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_GROESTL256"), groestl256CL); + source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_CN_GPU"), cryptonight_gpu); // create a directory for the OpenCL compile cache create_directory(get_home() + "/.openclcache"); @@ -1066,76 +1088,102 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar return ERR_OCL_API; } - // Branch 0 - if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS) + if(miner_algo == cryptonight_gpu) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret)); - return ERR_OCL_API; - } - - // Branch 1 - if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret)); - return ERR_OCL_API; - } - - // Branch 2 - if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); - return ERR_OCL_API; - } + // Output + if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 2); + return ERR_OCL_API; + } - // Branch 3 - if((ret = clSetKernelArg(Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret)); - return ERR_OCL_API; - } + // Target + if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 3); + return ERR_OCL_API; + } - // Threads - if((ret = clSetKernelArg(Kernels[2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret)); - return(ERR_OCL_API); + // Threads + if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); + return(ERR_OCL_API); + } } - - for(int i = 0; i < 4; ++i) - { - // States - if((ret = clSetKernelArg(Kernels[i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + else { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0); + // Branch 0 + if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret)); return ERR_OCL_API; } - // Nonce buffer - if((ret = clSetKernelArg(Kernels[i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS) + // Branch 1 + if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1); + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret)); return ERR_OCL_API; } - // Output - if((ret = clSetKernelArg(Kernels[i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) + // Branch 2 + if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2); + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); return ERR_OCL_API; } - // Target - if((ret = clSetKernelArg(Kernels[i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) + // Branch 3 + if((ret = clSetKernelArg(Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3); + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret)); return ERR_OCL_API; } - if((clSetKernelArg(Kernels[i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + // Threads + if((ret = clSetKernelArg(Kernels[2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret)); return(ERR_OCL_API); } + + for(int i = 0; i < 4; ++i) + { + // States + if((ret = clSetKernelArg(Kernels[i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0); + return ERR_OCL_API; + } + + // Nonce buffer + if((ret = clSetKernelArg(Kernels[i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1); + return ERR_OCL_API; + } + + // Output + if((ret = clSetKernelArg(Kernels[i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2); + return ERR_OCL_API; + } + + // Target + if((ret = clSetKernelArg(Kernels[i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3); + return ERR_OCL_API; + } + + if((clSetKernelArg(Kernels[i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); + return(ERR_OCL_API); + } + } } return ERR_SUCCESS; @@ -1277,10 +1325,24 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) size_t tmpNonce = ctx->Nonce; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + if(miner_algo == cryptonight_gpu) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); - return ERR_OCL_API; + size_t w_size_cn_gpu = w_size * 16; + size_t g_thd_cn_gpu = g_thd * 16; + + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, 0, &g_thd_cn_gpu, &w_size_cn_gpu, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); + return ERR_OCL_API; + } + } + else + { + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); + return ERR_OCL_API; + } } if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) @@ -1289,13 +1351,16 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) return ERR_OCL_API; } - for(int i = 0; i < 4; ++i) + if(miner_algo != cryptonight_gpu) { - size_t tmpNonce = ctx->Nonce; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + for(int i = 0; i < 4; ++i) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); - return ERR_OCL_API; + size_t tmpNonce = ctx->Nonce; + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); + return ERR_OCL_API; + } } } diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index 5e9f618ed..5b95e9865 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -53,7 +53,7 @@ struct GpuContext cl_mem OutputBuffer; cl_mem ExtraBuffers[6]; std::map Program; - std::map> Kernels; + std::map> Kernels; size_t freeMem; size_t maxMemPerAlloc; int computeUnits; diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index f647bcafc..ea07d6a8b 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -78,21 +78,6 @@ inline int amd_bfe(const uint src0, const uint offset, const uint width) } #endif -//#include "opencl/fast_int_math_v2.cl" -XMRSTAK_INCLUDE_FAST_INT_MATH_V2 -//#include "fast_div_heavy.cl" -XMRSTAK_INCLUDE_FAST_DIV_HEAVY -//#include "opencl/wolf-aes.cl" -XMRSTAK_INCLUDE_WOLF_AES -//#include "opencl/wolf-skein.cl" -XMRSTAK_INCLUDE_WOLF_SKEIN -//#include "opencl/jh.cl" -XMRSTAK_INCLUDE_JH -//#include "opencl/blake256.cl" -XMRSTAK_INCLUDE_BLAKE256 -//#include "opencl/groestl256.cl" -XMRSTAK_INCLUDE_GROESTL256 - static const __constant ulong keccakf_rndc[24] = { 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, @@ -186,31 +171,49 @@ static const __constant uint keccakf_piln[24] = 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 }; -void keccakf1600_1(ulong *st) +inline void keccakf1600_1(ulong st[25]) { int i, round; ulong t, bc[5]; #pragma unroll 1 - for(round = 0; round < 24; ++round) + for (round = 0; round < 24; ++round) { + bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20] ^ rotate(st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22], 1UL); + bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21] ^ rotate(st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23], 1UL); + bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22] ^ rotate(st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24], 1UL); + bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23] ^ rotate(st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20], 1UL); + bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24] ^ rotate(st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21], 1UL); - // Theta - bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20]; - bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21]; - bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22]; - bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23]; - bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24]; - - #pragma unroll 1 - for (i = 0; i < 5; ++i) { - t = bc[(i + 4) % 5] ^ rotate(bc[(i + 1) % 5], 1UL); - st[i ] ^= t; - st[i + 5] ^= t; - st[i + 10] ^= t; - st[i + 15] ^= t; - st[i + 20] ^= t; - } + st[0] ^= bc[4]; + st[5] ^= bc[4]; + st[10] ^= bc[4]; + st[15] ^= bc[4]; + st[20] ^= bc[4]; + + st[1] ^= bc[0]; + st[6] ^= bc[0]; + st[11] ^= bc[0]; + st[16] ^= bc[0]; + st[21] ^= bc[0]; + + st[2] ^= bc[1]; + st[7] ^= bc[1]; + st[12] ^= bc[1]; + st[17] ^= bc[1]; + st[22] ^= bc[1]; + + st[3] ^= bc[2]; + st[8] ^= bc[2]; + st[13] ^= bc[2]; + st[18] ^= bc[2]; + st[23] ^= bc[2]; + + st[4] ^= bc[3]; + st[9] ^= bc[3]; + st[14] ^= bc[3]; + st[19] ^= bc[3]; + st[24] ^= bc[3]; // Rho Pi t = st[1]; @@ -221,17 +224,16 @@ void keccakf1600_1(ulong *st) t = bc[0]; } - #pragma unroll 1 + #pragma unroll for(int i = 0; i < 25; i += 5) { - ulong tmp[5]; - - #pragma unroll 1 - for(int x = 0; x < 5; ++x) - tmp[x] = bitselect(st[i + x] ^ st[i + ((x + 2) % 5)], st[i + x], st[i + ((x + 1) % 5)]); + ulong tmp1 = st[i], tmp2 = st[i + 1]; - #pragma unroll 1 - for(int x = 0; x < 5; ++x) st[i + x] = tmp[x]; + st[i] = bitselect(st[i] ^ st[i + 2], st[i], st[i + 1]); + st[i + 1] = bitselect(st[i + 1] ^ st[i + 3], st[i + 1], st[i + 2]); + st[i + 2] = bitselect(st[i + 2] ^ st[i + 4], st[i + 2], st[i + 3]); + st[i + 3] = bitselect(st[i + 3] ^ tmp1, st[i + 3], st[i + 4]); + st[i + 4] = bitselect(st[i + 4] ^ tmp2, st[i + 4], tmp1); } // Iota @@ -311,6 +313,43 @@ void keccakf1600_2(__local ulong *st) } } +#define MEM_CHUNK (1<> 4))))) #else @@ -556,7 +575,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states ulong a[2]; // cryptonight_monero_v8 -#if(ALGO==11 || ALGO==13) +#if(ALGO==11 || ALGO==14) ulong b[4]; uint4 b_x[2]; // NVIDIA @@ -571,7 +590,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states __local uint AES0[256], AES1[256]; // cryptonight_monero_v8 -#if(ALGO==11 || ALGO==13) +#if(ALGO==11 || ALGO==14) # if defined(__clang__) && !defined(__NV_CL_C_VERSION) __local uint RCP[256]; # endif @@ -587,7 +606,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states AES0[i] = tmp; AES1[i] = rotate(tmp, 8U); // cryptonight_monero_v8 -#if((ALGO==11 || ALGO==13) && (defined(__clang__) && !defined(__NV_CL_C_VERSION))) +#if((ALGO==11 || ALGO==14) && (defined(__clang__) && !defined(__NV_CL_C_VERSION))) RCP[i] = RCP_C[i]; #endif } @@ -622,7 +641,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states b_x[0] = ((uint4 *)b)[0]; // cryptonight_monero_v8 -#if(ALGO==11 || ALGO==13) +#if(ALGO==11 || ALGO==14) a[1] = states[1] ^ states[5]; b[2] = states[8] ^ states[10]; b[3] = states[9] ^ states[11]; @@ -654,7 +673,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states { ulong c[2]; // cryptonight_monero_v8 && NVIDIA -#if((ALGO==11 || ALGO==13) && defined(__NV_CL_C_VERSION)) +#if((ALGO==11 || ALGO==14) && defined(__NV_CL_C_VERSION)) uint idxS = idx0 & 0x30U; *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; #endif @@ -668,7 +687,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif // cryptonight_monero_v8 -#if(ALGO==11 || ALGO==13) +#if(ALGO==11 || ALGO==14) { ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); @@ -693,7 +712,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states SCRATCHPAD_CHUNK(0) = b_x[0]; idx0 = as_uint2(c[0]).s0 & MASK; // cryptonight_monero_v8 -#elif(ALGO==11 || ALGO==13) +#elif(ALGO==11 || ALGO==14) SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0]; # ifdef __NV_CL_C_VERSION // flush shuffled data @@ -712,7 +731,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states uint4 tmp; tmp = SCRATCHPAD_CHUNK(0); // cryptonight_monero_v8 -#if(ALGO==11 || ALGO==13) +#if(ALGO==11 || ALGO==14) // Use division and square root results from the _previous_ iteration to hide the latency tmp.s0 ^= division_result.s0; tmp.s1 ^= division_result.s1 ^ sqrt_result; @@ -805,7 +824,13 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states R"===( __attribute__((reqd_work_group_size(8, 8, 1))) -__kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, uint Threads) +__kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, +// cryptonight_gpu +#if (ALGO == 13) + __global uint *output, ulong Target, uint Threads) +#else + __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, uint Threads) +#endif { __local uint AES0[256], AES1[256], AES2[256], AES3[256]; uint ExpandedKey2[40]; @@ -823,8 +848,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states barrier(CLK_LOCAL_MEM_FENCE); -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast -#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) +// cryptonight_gpu || cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast +#if (ALGO == 13 || ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) __local uint4 xin1[8][8]; __local uint4 xin2[8][8]; #endif @@ -862,8 +887,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states barrier(CLK_LOCAL_MEM_FENCE); -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast -#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) +// cryptonight_gpu || cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast +#if (ALGO == 13 || ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) __local uint4* xin1_store = &xin1[get_local_id(1)][get_local_id(0)]; __local uint4* xin1_load = &xin1[(get_local_id(1) + 1) % 8][get_local_id(0)]; __local uint4* xin2_store = &xin2[get_local_id(1)][get_local_id(0)]; @@ -876,7 +901,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states if (gIdx < Threads) #endif { -#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) +// cryptonight_gpu || cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast +#if (ALGO == 13 || ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) #pragma unroll 2 for(int i = 0, i1 = get_local_id(1); i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4)) { @@ -916,8 +942,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif } -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast -#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) +// cryptonight_gpu || cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast +#if (ALGO == 13 || ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) /* Also left over threads performe this loop. * The left over thread results will be ignored */ @@ -959,7 +985,15 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states for(int i = 0; i < 25; ++i) State[i] = states[i]; keccakf1600_2(State); - +#if (ALGO == 13) + if(State[3] <= Target) + { + //printf("gt %lu\n", State[3]); + ulong outIdx = atomic_inc(output + 0xFF); + if(outIdx < 0xFF) + output[outIdx] = get_global_id(0); + } +#else for(int i = 0; i < 25; ++i) states[i] = State[i]; uint StateSwitch = State[0] & 3; @@ -967,6 +1001,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states __global uint *destinationBranch2 = StateSwitch == 2 ? Branch2 : Branch3; __global uint *destinationBranch = StateSwitch < 2 ? destinationBranch1 : destinationBranch2; destinationBranch[atomic_inc(destinationBranch + Threads)] = gIdx; +#endif } } mem_fence(CLK_GLOBAL_MEM_FENCE); diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl new file mode 100644 index 000000000..a99243e44 --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl @@ -0,0 +1,383 @@ +R"===( + + +inline float4 _mm_add_ps(float4 a, float4 b) +{ + return a + b; +} + +inline float4 _mm_sub_ps(float4 a, float4 b) +{ + return a - b; +} + +inline float4 _mm_mul_ps(float4 a, float4 b) +{ + + //#pragma OPENCL SELECT_ROUNDING_MODE rte + return a * b; +} + +inline float4 _mm_div_ps(float4 a, float4 b) +{ + return a / b; +} + +inline float4 _mm_and_ps(float4 a, int b) +{ + return as_float4(as_int4(a) & (int4)(b)); +} + +inline float4 _mm_or_ps(float4 a, int b) +{ + return as_float4(as_int4(a) | (int4)(b)); +} + +inline float4 _mm_fmod_ps(float4 v, float dc) +{ + float4 d = (float4)(dc); + float4 c = _mm_div_ps(v, d); + c = trunc(c); + c = _mm_mul_ps(c, d); + return _mm_sub_ps(v, c); +} + +inline int4 _mm_xor_si128(int4 a, int4 b) +{ + return a ^ b; +} + +inline float4 _mm_xor_ps(float4 a, int b) +{ + return as_float4(as_int4(a) ^ (int4)(b)); +} + +inline int4 _mm_alignr_epi8(int4 a, const uint rot) +{ + const uint right = 8 * rot; + const uint left = (32 - 8 * rot); + return (int4)( + ((uint)a.x >> right) | ( a.y << left ), + ((uint)a.y >> right) | ( a.z << left ), + ((uint)a.z >> right) | ( a.w << left ), + ((uint)a.w >> right) | ( a.x << left ) + ); +} + + +inline global int4* scratchpad_ptr(uint idx, uint n, __global int *lpad) { return (__global int4*)((__global char*)lpad + (idx & MASK) + n * 16); } + +inline float4 fma_break(float4 x) +{ + // Break the dependency chain by setitng the exp to ?????01 + x = _mm_and_ps(x, 0xFEFFFFFF); + return _mm_or_ps(x, 0x00800000); +} + +inline void sub_round(float4 n0, float4 n1, float4 n2, float4 n3, float4 rnd_c, float4* n, float4* d, float4* c) +{ + n1 = _mm_add_ps(n1, *c); + float4 nn = _mm_mul_ps(n0, *c); + nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn)); + nn = fma_break(nn); + *n = _mm_add_ps(*n, nn); + + n3 = _mm_sub_ps(n3, *c); + float4 dd = _mm_mul_ps(n2, *c); + dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd)); + dd = fma_break(dd); + *d = _mm_add_ps(*d, dd); + + //Constant feedback + *c = _mm_add_ps(*c, rnd_c); + *c = _mm_add_ps(*c, (float4)(0.734375f)); + float4 r = _mm_add_ps(nn, dd); + r = _mm_and_ps(r, 0x807FFFFF); + r = _mm_or_ps(r, 0x40000000); + *c = _mm_add_ps(*c, r); + +} + +// 9*8 + 2 = 74 +inline void round_compute(float4 n0, float4 n1, float4 n2, float4 n3, float4 rnd_c, float4* c, float4* r) +{ + float4 n = (float4)(0.0f); + float4 d = (float4)(0.0f); + + sub_round(n0, n1, n2, n3, rnd_c, &n, &d, c); + sub_round(n1, n2, n3, n0, rnd_c, &n, &d, c); + sub_round(n2, n3, n0, n1, rnd_c, &n, &d, c); + sub_round(n3, n0, n1, n2, rnd_c, &n, &d, c); + sub_round(n3, n2, n1, n0, rnd_c, &n, &d, c); + sub_round(n2, n1, n0, n3, rnd_c, &n, &d, c); + sub_round(n1, n0, n3, n2, rnd_c, &n, &d, c); + sub_round(n0, n3, n2, n1, rnd_c, &n, &d, c); + + // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0 + d = _mm_and_ps(d, 0xFF7FFFFF); + d = _mm_or_ps(d, 0x40000000); + *r =_mm_add_ps(*r, _mm_div_ps(n,d)); +} + +inline int4 single_comupte(float4 n0, float4 n1, float4 n2, float4 n3, float cnt, float4 rnd_c, __local float4* sum) +{ + float4 c= (float4)(cnt); + // 35 maths calls follow (140 FLOPS) + float4 r = (float4)(0.0f); + + for(int i = 0; i < 4; ++i) + round_compute(n0, n1, n2, n3, rnd_c, &c, &r); + + // do a quick fmod by setting exp to 2 + r = _mm_and_ps(r, 0x807FFFFF); + r = _mm_or_ps(r, 0x40000000); + *sum = r; // 34 + float4 x = (float4)(536870880.0f); + r = _mm_mul_ps(r, x); // 35 + return convert_int4_rte(r); +} + +inline void single_comupte_wrap(const uint rot, int4 v0, int4 v1, int4 v2, int4 v3, float cnt, float4 rnd_c, __local float4* sum, __local int4* out) +{ + float4 n0 = convert_float4_rte(v0); + float4 n1 = convert_float4_rte(v1); + float4 n2 = convert_float4_rte(v2); + float4 n3 = convert_float4_rte(v3); + + int4 r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum); + *out = rot == 0 ? r : _mm_alignr_epi8(r, rot); +} + +)===" +R"===( + +static const __constant uint look[16][4] = { + {0, 1, 2, 3}, + {0, 2, 3, 1}, + {0, 3, 1, 2}, + {0, 3, 2, 1}, + + {1, 0, 2, 3}, + {1, 2, 3, 0}, + {1, 3, 0, 2}, + {1, 3, 2, 0}, + + {2, 1, 0, 3}, + {2, 0, 3, 1}, + {2, 3, 1, 0}, + {2, 3, 0, 1}, + + {3, 1, 2, 0}, + {3, 2, 0, 1}, + {3, 0, 1, 2}, + {3, 0, 2, 1} +}; + +static const __constant float ccnt[16] = { + 1.34375f, + 1.28125f, + 1.359375f, + 1.3671875f, + + 1.4296875f, + 1.3984375f, + 1.3828125f, + 1.3046875f, + + 1.4140625f, + 1.2734375f, + 1.2578125f, + 1.2890625f, + + 1.3203125f, + 1.3515625f, + 1.3359375f, + 1.4609375f +}; + +__attribute__((reqd_work_group_size(WORKSIZE * 16, 1, 1))) +__kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, uint numThreads) +{ + const uint gIdx = getIdx(); + +#if(COMP_MODE==1) + if(gIdx < Threads) + return; +#endif + + uint chunk = get_local_id(0) / 16; + +#if(STRIDED_INDEX==0) + __global int* lpad = (__global int*)((__global char*)lpad_in + MEMORY * (gIdx/16)); +#endif + + __local int4 smem2[1 * 4 * WORKSIZE]; + __local int4 smemOut2[1 * 16 * WORKSIZE]; + __local float4 smemVa2[1 * 16 * WORKSIZE]; + + __local int4* smem = smem2 + 4 * chunk; + __local int4* smemOut = smemOut2 + 16 * chunk; + __local float4* smemVa = smemVa2 + 16 * chunk; + + uint tid = get_local_id(0) % 16; + + uint idxHash = gIdx/16; + uint s = ((__global uint*)spad)[idxHash * 50] >> 8; + float4 vs = (float4)(0); + + for(size_t i = 0; i < ITERATIONS; i++) + { + mem_fence(CLK_LOCAL_MEM_FENCE); + ((__local int*)smem)[tid] = ((__global int*)scratchpad_ptr(s, tid/4, lpad))[tid%4]; + mem_fence(CLK_LOCAL_MEM_FENCE); + + float4 rc = vs; + + { + single_comupte_wrap( + tid%4, + *(smem + look[tid][0]), + *(smem + look[tid][1]), + *(smem + look[tid][2]), + *(smem + look[tid][3]), + ccnt[tid], rc, smemVa + tid, + smemOut + tid + ); + } + mem_fence(CLK_LOCAL_MEM_FENCE); + + int4 tmp2; + if(tid % 4 == 0) + { + int4 out = _mm_xor_si128(smemOut[tid], smemOut[tid + 1]); + int4 out2 = _mm_xor_si128(smemOut[tid + 2], smemOut[tid + 3]); + out = _mm_xor_si128(out, out2); + tmp2 = out; + *scratchpad_ptr(s , tid/4, lpad) = _mm_xor_si128(smem[tid/4], out); + } + mem_fence(CLK_LOCAL_MEM_FENCE); + if(tid % 4 == 0) + smemOut[tid] = tmp2; + mem_fence(CLK_LOCAL_MEM_FENCE); + int4 out2 = smemOut[0] ^ smemOut[4] ^ smemOut[8] ^ smemOut[12]; + + if(tid%2 == 0) + smemVa[tid] = smemVa[tid] + smemVa[tid + 1]; + if(tid%4 == 0) + smemVa[tid] = smemVa[tid] + smemVa[tid + 2]; + if(tid%8 == 0) + smemVa[tid] = smemVa[tid] + smemVa[tid + 4]; + if(tid%16 == 0) + smemVa[tid] = smemVa[tid] + smemVa[tid + 8]; + vs = smemVa[0]; + + vs = fabs(vs); // take abs(va) by masking the float sign bit + float4 xx = _mm_mul_ps(vs, (float4)(16777216.0f)); + // vs range 0 - 64 + int4 tmp = convert_int4_rte(xx); + tmp = _mm_xor_si128(tmp, out2); + // vs is now between 0 and 1 + vs = _mm_div_ps(vs, (float4)(64.0f)); + s = tmp.x ^ tmp.y ^ tmp.z ^ tmp.w; + } +} + +)===" +R"===( + +inline void generate_512(ulong idx, __local ulong* in, __global ulong* out) +{ + ulong hash[25]; + + hash[0] = in[0] ^ idx; + for(int i = 1; i < 25; ++i) + hash[i] = in[i]; + + keccakf1600_1(hash); + for(int i = 0; i < 20; ++i) + out[i] = hash[i]; + out+=160/8; + + keccakf1600_1(hash); + for(int i = 0; i < 22; ++i) + out[i] = hash[i]; + out+=176/8; + + keccakf1600_1(hash); + for(int i = 0; i < 22; ++i) + out[i] = hash[i]; +} + +__attribute__((reqd_work_group_size(8, 8, 1))) +__kernel void JOIN(cn0_cn_gpu,ALGO)(__global ulong *input, __global int *Scratchpad, __global ulong *states, uint Threads) +{ + const uint gIdx = getIdx(); + __local ulong State_buf[8 * 25]; + __local ulong* State = State_buf + get_local_id(0) * 25; + +#if(COMP_MODE==1) + // do not use early return here + if(gIdx < Threads) +#endif + { + states += 25 * gIdx; + +#if(STRIDED_INDEX==0) + Scratchpad = (__global int*)((__global char*)Scratchpad + MEMORY * gIdx); +#endif + + if (get_local_id(1) == 0) + { + +// NVIDIA +#ifdef __NV_CL_C_VERSION + for(uint i = 0; i < 8; ++i) + State[i] = input[i]; +#else + ((__local ulong8 *)State)[0] = vload8(0, input); +#endif + State[8] = input[8]; + State[9] = input[9]; + State[10] = input[10]; + + ((__local uint *)State)[9] &= 0x00FFFFFFU; + ((__local uint *)State)[9] |= (((uint)get_global_id(0)) & 0xFF) << 24; + ((__local uint *)State)[10] &= 0xFF000000U; + /* explicit cast to `uint` is required because some OpenCL implementations (e.g. NVIDIA) + * handle get_global_id and get_global_offset as signed long long int and add + * 0xFFFFFFFF... to `get_global_id` if we set on host side a 32bit offset where the first bit is `1` + * (even if it is correct casted to unsigned on the host) + */ + ((__local uint *)State)[10] |= (((uint)get_global_id(0) >> 8)); + + for (int i = 11; i < 25; ++i) { + State[i] = 0x00UL; + } + + // Last bit of padding + State[16] = 0x8000000000000000UL; + + keccakf1600_2(State); + + #pragma unroll + for (int i = 0; i < 25; ++i) { + states[i] = State[i]; + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + +#if(COMP_MODE==1) + // do not use early return here + if(gIdx < Threads) +#endif + { + for(ulong i = get_local_id(1); i < MEMORY / 512; i += get_local_size(1)) + { + generate_512(i, State, (__global ulong*)((__global uchar*)Scratchpad + i*512)); + } + } +} + +)===" diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index f2dce7f90..7ca072c95 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -137,6 +137,9 @@ class autoAdjust // true for all cryptonight_heavy derivates since we check the user and dev pool bool useCryptonight_heavy = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end(); + // true for all cryptonight_gpu derivates since we check the user and dev pool + bool useCryptonight_gpu = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_gpu) != neededAlgorithms.end(); + // set strided index to default ctx.stridedIndex = 1; @@ -158,13 +161,21 @@ class autoAdjust if (hashMemSize <= CRYPTONIGHT_TURTLE_MEMORY) maxThreads *= 4u; + if(useCryptonight_gpu) + { + // 6 waves per compute unit are a good value (based on profiling) + // @todo check again after all optimizations + maxThreads = ctx.computeUnits * 6 * 8; + ctx.stridedIndex = 0; + } + // keep 128MiB memory free (value is randomly chosen) from the max available memory const size_t maxAvailableFreeMem = ctx.freeMem - minFreeMem; size_t memPerThread = std::min(ctx.maxMemPerAlloc, maxAvailableFreeMem); uint32_t numThreads = 1u; - if(ctx.isAMD) + if(ctx.isAMD && !useCryptonight_gpu) { numThreads = 2; size_t memDoubleThread = maxAvailableFreeMem / numThreads; diff --git a/xmrstak/backend/cpu/crypto/cn_gpu.hpp b/xmrstak/backend/cpu/crypto/cn_gpu.hpp new file mode 100644 index 000000000..4a7697b02 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/cn_gpu.hpp @@ -0,0 +1,43 @@ +#pragma once + +#include + +#if defined(_WIN32) || defined(_WIN64) +#include +#include +#define HAS_WIN_INTRIN_API +#endif + +#ifdef __GNUC__ +#include +#if !defined(HAS_WIN_INTRIN_API) +#include +#endif // !defined(HAS_WIN_INTRIN_API) +#endif // __GNUC__ + +inline void cngpu_cpuid(uint32_t eax, int32_t ecx, int32_t val[4]) +{ + val[0] = 0; + val[1] = 0; + val[2] = 0; + val[3] = 0; + +#if defined(HAS_WIN_INTRIN_API) + __cpuidex(val, eax, ecx); +#else + __cpuid_count(eax, ecx, val[0], val[1], val[2], val[3]); +#endif +} + +inline bool cngpu_check_avx2() +{ + int32_t cpu_info[4]; + cngpu_cpuid(7, 0, cpu_info); + return (cpu_info[1] & (1 << 5)) != 0; +} + +template +void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad); + +template +void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad); diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp new file mode 100644 index 000000000..e46705fd0 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp @@ -0,0 +1,176 @@ +#include "cn_gpu.hpp" +#include "../../cryptonight.hpp" + +#pragma GCC target ("avx2") + +inline void prep_dv_avx(__m256i* idx, __m256i& v, __m256& n01) +{ + v = _mm256_load_si256(idx); + n01 = _mm256_cvtepi32_ps(v); +} + +inline __m256 fma_break(const __m256& x) +{ + // Break the dependency chain by setitng the exp to ?????01 + __m256 xx = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0xFEFFFFFF)), x); + return _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x00800000)), xx); +} + +// 14 +inline void sub_round(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, const __m256& rnd_c, __m256& n, __m256& d, __m256& c) +{ + __m256 nn = _mm256_mul_ps(n0, c); + nn = _mm256_mul_ps(_mm256_add_ps(n1, c), _mm256_mul_ps(nn, nn)); + nn = fma_break(nn); + n = _mm256_add_ps(n, nn); + + __m256 dd = _mm256_mul_ps(n2, c); + dd = _mm256_mul_ps(_mm256_sub_ps(n3, c), _mm256_mul_ps(dd, dd)); + dd = fma_break(dd); + d = _mm256_add_ps(d, dd); + + //Constant feedback + c = _mm256_add_ps(c, rnd_c); + c = _mm256_add_ps(c, _mm256_set1_ps(0.734375f)); + __m256 r = _mm256_add_ps(nn, dd); + r = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x807FFFFF)), r); + r = _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x40000000)), r); + c = _mm256_add_ps(c, r); +} + +// 14*8 + 2 = 112 +inline void round_compute(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, const __m256& rnd_c, __m256& c, __m256& r) +{ + __m256 n = _mm256_setzero_ps(), d = _mm256_setzero_ps(); + + sub_round(n0, n1, n2, n3, rnd_c, n, d, c); + sub_round(n1, n2, n3, n0, rnd_c, n, d, c); + sub_round(n2, n3, n0, n1, rnd_c, n, d, c); + sub_round(n3, n0, n1, n2, rnd_c, n, d, c); + sub_round(n3, n2, n1, n0, rnd_c, n, d, c); + sub_round(n2, n1, n0, n3, rnd_c, n, d, c); + sub_round(n1, n0, n3, n2, rnd_c, n, d, c); + sub_round(n0, n3, n2, n1, rnd_c, n, d, c); + + // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0 + d = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0xFF7FFFFF)), d); + d = _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x40000000)), d); + r = _mm256_add_ps(r, _mm256_div_ps(n, d)); +} + +// 112×4 = 448 +template +inline __m256i double_comupte(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, + float lcnt, float hcnt, const __m256& rnd_c, __m256& sum) +{ + __m256 c = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_set1_ps(lcnt)), _mm_set1_ps(hcnt), 1); + __m256 r = _mm256_setzero_ps(); + + round_compute(n0, n1, n2, n3, rnd_c, c, r); + round_compute(n0, n1, n2, n3, rnd_c, c, r); + round_compute(n0, n1, n2, n3, rnd_c, c, r); + round_compute(n0, n1, n2, n3, rnd_c, c, r); + + // do a quick fmod by setting exp to 2 + r = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x807FFFFF)), r); + r = _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x40000000)), r); + + if(add) + sum = _mm256_add_ps(sum, r); + else + sum = r; + + r = _mm256_mul_ps(r, _mm256_set1_ps(536870880.0f)); // 35 + return _mm256_cvttps_epi32(r); +} + +template +inline void double_comupte_wrap(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, + float lcnt, float hcnt, const __m256& rnd_c, __m256& sum, __m256i& out) +{ + __m256i r = double_comupte(n0, n1, n2, n3, lcnt, hcnt, rnd_c, sum); + if(rot != 0) + r = _mm256_or_si256(_mm256_bslli_epi128(r, 16 - rot), _mm256_bsrli_epi128(r, rot)); + + out = _mm256_xor_si256(out, r); +} + +template +inline __m256i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n) { return reinterpret_cast<__m256i*>(lpad + (idx & MASK) + n*16); } + +template +void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad) +{ + uint32_t s = reinterpret_cast(spad)[0] >> 8; + __m256i* idx0 = scratchpad_ptr(lpad, s, 0); + __m256i* idx2 = scratchpad_ptr(lpad, s, 2); + __m256 sum0 = _mm256_setzero_ps(); + + for(size_t i = 0; i < ITER; i++) + { + __m256i v01, v23; + __m256 suma, sumb, sum1; + __m256 rc = sum0; + + __m256 n01, n23; + __m256 d01, d23; + prep_dv_avx(idx0, v01, n01); + prep_dv_avx(idx2, v23, n23); + + __m256i out, out2; + __m256 n10, n22, n33; + n10 = _mm256_permute2f128_ps(n01, n01, 0x01); + n22 = _mm256_permute2f128_ps(n23, n23, 0x00); + n33 = _mm256_permute2f128_ps(n23, n23, 0x11); + + out = _mm256_setzero_si256(); + double_comupte_wrap<0>(n01, n10, n22, n33, 1.3437500f, 1.4296875f, rc, suma, out); + double_comupte_wrap<1>(n01, n22, n33, n10, 1.2812500f, 1.3984375f, rc, suma, out); + double_comupte_wrap<2>(n01, n33, n10, n22, 1.3593750f, 1.3828125f, rc, sumb, out); + double_comupte_wrap<3>(n01, n33, n22, n10, 1.3671875f, 1.3046875f, rc, sumb, out); + _mm256_store_si256(idx0, _mm256_xor_si256(v01, out)); + sum0 = _mm256_add_ps(suma, sumb); + out2 = out; + + __m256 n11, n02, n30; + n11 = _mm256_permute2f128_ps(n01, n01, 0x11); + n02 = _mm256_permute2f128_ps(n01, n23, 0x20); + n30 = _mm256_permute2f128_ps(n01, n23, 0x03); + + out = _mm256_setzero_si256(); + double_comupte_wrap<0>(n23, n11, n02, n30, 1.4140625f, 1.3203125f, rc, suma, out); + double_comupte_wrap<1>(n23, n02, n30, n11, 1.2734375f, 1.3515625f, rc, suma, out); + double_comupte_wrap<2>(n23, n30, n11, n02, 1.2578125f, 1.3359375f, rc, sumb, out); + double_comupte_wrap<3>(n23, n30, n02, n11, 1.2890625f, 1.4609375f, rc, sumb, out); + _mm256_store_si256(idx2, _mm256_xor_si256(v23, out)); + sum1 = _mm256_add_ps(suma, sumb); + + out2 = _mm256_xor_si256(out2, out); + out2 = _mm256_xor_si256(_mm256_permute2x128_si256(out2,out2,0x41), out2); + suma = _mm256_permute2f128_ps(sum0, sum1, 0x30); + sumb = _mm256_permute2f128_ps(sum0, sum1, 0x21); + sum0 = _mm256_add_ps(suma, sumb); + sum0 = _mm256_add_ps(sum0, _mm256_permute2f128_ps(sum0, sum0, 0x41)); + + // Clear the high 128 bits + __m128 sum = _mm256_castps256_ps128(sum0); + + sum = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)), sum); // take abs(va) by masking the float sign bit + // vs range 0 - 64 + __m128i v0 = _mm_cvttps_epi32(_mm_mul_ps(sum, _mm_set1_ps(16777216.0f))); + v0 = _mm_xor_si128(v0, _mm256_castsi256_si128(out2)); + __m128i v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 2, 3)); + v0 = _mm_xor_si128(v0, v1); + v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 0, 1)); + v0 = _mm_xor_si128(v0, v1); + + // vs is now between 0 and 1 + sum = _mm_div_ps(sum, _mm_set1_ps(64.0f)); + sum0 = _mm256_insertf128_ps(_mm256_castps128_ps256(sum), sum, 1); + uint32_t n = _mm_cvtsi128_si32(v0); + idx0 = scratchpad_ptr(lpad, n, 0); + idx2 = scratchpad_ptr(lpad, n, 2); + } +} + +template void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad); diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp new file mode 100644 index 000000000..bde34162a --- /dev/null +++ b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp @@ -0,0 +1,182 @@ +#include "cn_gpu.hpp" +#include "../../cryptonight.hpp" + +#pragma GCC target ("sse2") + +inline void prep_dv(__m128i* idx, __m128i& v, __m128& n) +{ + v = _mm_load_si128(idx); + n = _mm_cvtepi32_ps(v); +} + +inline __m128 fma_break(__m128 x) +{ + // Break the dependency chain by setitng the exp to ?????01 + x = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0xFEFFFFFF)), x); + return _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x00800000)), x); +} + +// 14 +inline void sub_round(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& n, __m128& d, __m128& c) +{ + n1 = _mm_add_ps(n1, c); + __m128 nn = _mm_mul_ps(n0, c); + nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn)); + nn = fma_break(nn); + n = _mm_add_ps(n, nn); + + n3 = _mm_sub_ps(n3, c); + __m128 dd = _mm_mul_ps(n2, c); + dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd)); + dd = fma_break(dd); + d = _mm_add_ps(d, dd); + + //Constant feedback + c = _mm_add_ps(c, rnd_c); + c = _mm_add_ps(c, _mm_set1_ps(0.734375f)); + __m128 r = _mm_add_ps(nn, dd); + r = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x807FFFFF)), r); + r = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), r); + c = _mm_add_ps(c, r); +} + +// 14*8 + 2 = 112 +inline void round_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& c, __m128& r) +{ + __m128 n = _mm_setzero_ps(), d = _mm_setzero_ps(); + + sub_round(n0, n1, n2, n3, rnd_c, n, d, c); + sub_round(n1, n2, n3, n0, rnd_c, n, d, c); + sub_round(n2, n3, n0, n1, rnd_c, n, d, c); + sub_round(n3, n0, n1, n2, rnd_c, n, d, c); + sub_round(n3, n2, n1, n0, rnd_c, n, d, c); + sub_round(n2, n1, n0, n3, rnd_c, n, d, c); + sub_round(n1, n0, n3, n2, rnd_c, n, d, c); + sub_round(n0, n3, n2, n1, rnd_c, n, d, c); + + // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0 + d = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0xFF7FFFFF)), d); + d = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), d); + r =_mm_add_ps(r, _mm_div_ps(n,d)); +} + +// 112×4 = 448 +template +inline __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum) +{ + __m128 c = _mm_set1_ps(cnt); + __m128 r = _mm_setzero_ps(); + + round_compute(n0, n1, n2, n3, rnd_c, c, r); + round_compute(n0, n1, n2, n3, rnd_c, c, r); + round_compute(n0, n1, n2, n3, rnd_c, c, r); + round_compute(n0, n1, n2, n3, rnd_c, c, r); + + // do a quick fmod by setting exp to 2 + r = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x807FFFFF)), r); + r = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), r); + + if(add) + sum = _mm_add_ps(sum, r); + else + sum = r; + + r = _mm_mul_ps(r, _mm_set1_ps(536870880.0f)); // 35 + return _mm_cvttps_epi32(r); +} + +template +inline void single_comupte_wrap(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out) +{ + __m128i r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum); + if(rot != 0) + r = _mm_or_si128(_mm_slli_si128(r, 16 - rot), _mm_srli_si128(r, rot)); + out = _mm_xor_si128(out, r); +} + +template +inline __m128i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n) { return reinterpret_cast<__m128i*>(lpad + (idx & MASK) + n*16); } + +template +void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad) +{ + uint32_t s = reinterpret_cast(spad)[0] >> 8; + __m128i* idx0 = scratchpad_ptr(lpad, s, 0); + __m128i* idx1 = scratchpad_ptr(lpad, s, 1); + __m128i* idx2 = scratchpad_ptr(lpad, s, 2); + __m128i* idx3 = scratchpad_ptr(lpad, s, 3); + __m128 sum0 = _mm_setzero_ps(); + + for(size_t i = 0; i < ITER; i++) + { + __m128 n0, n1, n2, n3; + __m128i v0, v1, v2, v3; + __m128 suma, sumb, sum1, sum2, sum3; + + prep_dv(idx0, v0, n0); + prep_dv(idx1, v1, n1); + prep_dv(idx2, v2, n2); + prep_dv(idx3, v3, n3); + __m128 rc = sum0; + + __m128i out, out2; + out = _mm_setzero_si128(); + single_comupte_wrap<0>(n0, n1, n2, n3, 1.3437500f, rc, suma, out); + single_comupte_wrap<1>(n0, n2, n3, n1, 1.2812500f, rc, suma, out); + single_comupte_wrap<2>(n0, n3, n1, n2, 1.3593750f, rc, sumb, out); + single_comupte_wrap<3>(n0, n3, n2, n1, 1.3671875f, rc, sumb, out); + sum0 = _mm_add_ps(suma, sumb); + _mm_store_si128(idx0, _mm_xor_si128(v0, out)); + out2 = out; + + out = _mm_setzero_si128(); + single_comupte_wrap<0>(n1, n0, n2, n3, 1.4296875f, rc, suma, out); + single_comupte_wrap<1>(n1, n2, n3, n0, 1.3984375f, rc, suma, out); + single_comupte_wrap<2>(n1, n3, n0, n2, 1.3828125f, rc, sumb, out); + single_comupte_wrap<3>(n1, n3, n2, n0, 1.3046875f, rc, sumb, out); + sum1 = _mm_add_ps(suma, sumb); + _mm_store_si128(idx1, _mm_xor_si128(v1, out)); + out2 = _mm_xor_si128(out2, out); + + out = _mm_setzero_si128(); + single_comupte_wrap<0>(n2, n1, n0, n3, 1.4140625f, rc, suma, out); + single_comupte_wrap<1>(n2, n0, n3, n1, 1.2734375f, rc, suma, out); + single_comupte_wrap<2>(n2, n3, n1, n0, 1.2578125f, rc, sumb, out); + single_comupte_wrap<3>(n2, n3, n0, n1, 1.2890625f, rc, sumb, out); + sum2 = _mm_add_ps(suma, sumb); + _mm_store_si128(idx2, _mm_xor_si128(v2, out)); + out2 = _mm_xor_si128(out2, out); + + out = _mm_setzero_si128(); + single_comupte_wrap<0>(n3, n1, n2, n0, 1.3203125f, rc, suma, out); + single_comupte_wrap<1>(n3, n2, n0, n1, 1.3515625f, rc, suma, out); + single_comupte_wrap<2>(n3, n0, n1, n2, 1.3359375f, rc, sumb, out); + single_comupte_wrap<3>(n3, n0, n2, n1, 1.4609375f, rc, sumb, out); + sum3 = _mm_add_ps(suma, sumb); + _mm_store_si128(idx3, _mm_xor_si128(v3, out)); + out2 = _mm_xor_si128(out2, out); + sum0 = _mm_add_ps(sum0, sum1); + sum2 = _mm_add_ps(sum2, sum3); + sum0 = _mm_add_ps(sum0, sum2); + + sum0 = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)), sum0); // take abs(va) by masking the float sign bit + // vs range 0 - 64 + n0 = _mm_mul_ps(sum0, _mm_set1_ps(16777216.0f)); + v0 = _mm_cvttps_epi32(n0); + v0 = _mm_xor_si128(v0, out2); + v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 2, 3)); + v0 = _mm_xor_si128(v0, v1); + v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 0, 1)); + v0 = _mm_xor_si128(v0, v1); + + // vs is now between 0 and 1 + sum0 = _mm_div_ps(sum0, _mm_set1_ps(64.0f)); + uint32_t n = _mm_cvtsi128_si32(v0); + idx0 = scratchpad_ptr(lpad, n, 0); + idx1 = scratchpad_ptr(lpad, n, 1); + idx2 = scratchpad_ptr(lpad, n, 2); + idx3 = scratchpad_ptr(lpad, n, 3); + } +} + +template void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad); diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 2218bf88a..c75eff8ff 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -17,6 +17,7 @@ #include "cryptonight.h" #include "xmrstak/backend/cryptonight.hpp" +#include "cn_gpu.hpp" #include #include #include @@ -167,6 +168,8 @@ inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3 template void cn_explode_scratchpad(const __m128i* input, __m128i* output) { + constexpr bool HEAVY_MIX = ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast; + // This is more than we have registers, compiler will assign 2 keys on the stack __m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7; __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; @@ -182,7 +185,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output) xin6 = _mm_load_si128(input + 10); xin7 = _mm_load_si128(input + 11); - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(HEAVY_MIX) { for(size_t i=0; i < 16; i++) { @@ -263,9 +266,45 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output) } } +template +void cn_explode_scratchpad_gpu(const uint8_t* input, uint8_t* output) +{ + constexpr size_t hash_size = 200; // 25x8 bytes + alignas(128) uint64_t hash[25]; + + for (uint64_t i = 0; i < MEM / 512; i++) + { + memcpy(hash, input, hash_size); + hash[0] ^= i; + + keccakf(hash, 24); + memcpy(output, hash, 160); + output+=160; + + keccakf(hash, 24); + memcpy(output, hash, 176); + output+=176; + + keccakf(hash, 24); + memcpy(output, hash, 176); + output+=176; + + if(PREFETCH) + { + _mm_prefetch((const char*)output - 512, _MM_HINT_T2); + _mm_prefetch((const char*)output - 384, _MM_HINT_T2); + _mm_prefetch((const char*)output - 256, _MM_HINT_T2); + _mm_prefetch((const char*)output - 128, _MM_HINT_T2); + } + } +} + template void cn_implode_scratchpad(const __m128i* input, __m128i* output) { + constexpr bool HEAVY_MIX = ALGO == cryptonight_heavy || ALGO == cryptonight_haven || + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast || ALGO == cryptonight_gpu; + // This is more than we have registers, compiler will assign 2 keys on the stack __m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7; __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; @@ -326,11 +365,11 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); } - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(HEAVY_MIX) mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); } - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(HEAVY_MIX) { for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { @@ -377,7 +416,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); } - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(HEAVY_MIX) mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); } @@ -1000,3 +1039,28 @@ struct Cryptonight_hash_asm<2, 0> } } }; + +struct Cryptonight_hash_gpu +{ + static constexpr size_t N = 1; + + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + { + constexpr size_t MASK = cn_select_mask(); + constexpr size_t ITERATIONS = cn_select_iter(); + constexpr size_t MEM = cn_select_memory(); + + keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); + cn_explode_scratchpad_gpu(ctx[0]->hash_state, ctx[0]->long_state); + + if(cngpu_check_avx2()) + cn_gpu_inner_avx(ctx[0]->hash_state, ctx[0]->long_state); + else + cn_gpu_inner_ssse3(ctx[0]->hash_state, ctx[0]->long_state); + + cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); + keccakf((uint64_t*)ctx[0]->hash_state, 24); + memcpy(output, ctx[0]->hash_state, 32); + } +}; diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 2327bed1d..e1af701e8 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -397,22 +397,32 @@ bool minethd::self_test() hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx); bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0; } - else if(algo == cryptonight_superfast) + else if(algo == cryptonight_superfast) { hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_superfast); hashf("\x03\x05\xa0\xdb\xd6\xbf\x05\xcf\x16\xe5\x03\xf3\xa6\x6f\x78\x00\x7c\xbf\x34\x14\x43\x32\xec\xbf\xc2\x2e\xd9\x5c\x87\x00\x38\x3b\x30\x9a\xce\x19\x23\xa0\x96\x4b\x00\x00\x00\x08\xba\x93\x9a\x62\x72\x4c\x0d\x75\x81\xfc\xe5\x76\x1e\x9d\x8a\x0e\x6a\x1c\x3f\x92\x4f\xdd\x84\x93\xd1\x11\x56\x49\xc0\x5e\xb6\x01", 76, out, ctx); bResult = bResult && memcmp(out, "\x40\x86\x5a\xa8\x87\x41\xec\x1d\xcc\xbd\x2b\xc6\xff\x36\xb9\x4d\x54\x71\x58\xdb\x94\x69\x8e\x3c\xa0\x3d\xe4\x81\x9a\x65\x9f\xef", 32) == 0; } - else if (algo == cryptonight_turtle) + else if(algo == cryptonight_gpu) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_turtle); - hashf("This is a test This is a test This is a test", 44, out, ctx); - bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_turtle); - hashf("This is a test This is a test This is a test", 44, out, ctx); - bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0; - } + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_gpu); + hashf("", 0, out, ctx); + bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0; + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_gpu); + hashf("", 0, out, ctx); + bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0; + } + else if (algo == cryptonight_turtle) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_turtle); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_turtle); + hashf("This is a test This is a test This is a test", 44, out, ctx); + bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0; + } if(!bResult) printer::inst()->print_msg(L0, @@ -541,9 +551,12 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc case cryptonight_superfast: algv = 11; break; - case cryptonight_turtle: + case cryptonight_gpu: algv = 12; break; + case cryptonight_turtle: + algv = 13; + break; default: algv = 2; break; @@ -609,6 +622,11 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc Cryptonight_hash::template hash, Cryptonight_hash::template hash, Cryptonight_hash::template hash, + + Cryptonight_hash_gpu::template hash, + Cryptonight_hash_gpu::template hash, + Cryptonight_hash_gpu::template hash, + Cryptonight_hash_gpu::template hash, Cryptonight_hash::template hash, Cryptonight_hash::template hash, diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp index 2dd922f91..ae862abae 100644 --- a/xmrstak/backend/cryptonight.hpp +++ b/xmrstak/backend/cryptonight.hpp @@ -18,7 +18,8 @@ enum xmrstak_algo cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks cryptonight_monero_v8 = 11, cryptonight_superfast = 12, - cryptonight_turtle = 13 + cryptonight_gpu = 13, + cryptonight_turtle = 14 }; // define aeon settings @@ -34,6 +35,9 @@ constexpr size_t CRYPTONIGHT_HEAVY_MEMORY = 4 * 1024 * 1024; constexpr uint32_t CRYPTONIGHT_HEAVY_MASK = 0x3FFFF0; constexpr uint32_t CRYPTONIGHT_HEAVY_ITER = 0x40000; +constexpr uint32_t CRYPTONIGHT_GPU_MASK = 0x1FFFC0; +constexpr uint32_t CRYPTONIGHT_GPU_ITER = 0xC000; + constexpr uint32_t CRYPTONIGHT_MASARI_ITER = 0x40000; constexpr uint32_t CRYPTONIGHT_SUPERFAST_ITER = 0x20000; @@ -81,6 +85,9 @@ inline constexpr size_t cn_select_memory() { return CRYPTO template<> inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } +template<> +inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } + template<> inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_TURTLE_MEMORY; } @@ -94,6 +101,7 @@ inline size_t cn_select_memory(xmrstak_algo algo) case cryptonight_masari: case cryptonight: case cryptonight_superfast: + case cryptonight_gpu: return CRYPTONIGHT_MEMORY; case cryptonight_ipbc: case cryptonight_aeon: @@ -149,6 +157,9 @@ inline constexpr uint32_t cn_select_mask() { return CRYPTO template<> inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } +template<> +inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_GPU_MASK; } + template<> inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_TURTLE_MASK; } @@ -171,6 +182,8 @@ inline size_t cn_select_mask(xmrstak_algo algo) case cryptonight_haven: case cryptonight_heavy: return CRYPTONIGHT_HEAVY_MASK; + case cryptonight_gpu: + return CRYPTONIGHT_GPU_MASK; case cryptonight_turtle: return CRYPTONIGHT_TURTLE_MASK; default: @@ -217,6 +230,9 @@ inline constexpr uint32_t cn_select_iter() { return CRYPTO template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_SUPERFAST_ITER; } +template<> +inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_GPU_ITER; } + template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_TURTLE_ITER; } @@ -241,6 +257,8 @@ inline size_t cn_select_iter(xmrstak_algo algo) return CRYPTONIGHT_MASARI_ITER; case cryptonight_superfast: return CRYPTONIGHT_SUPERFAST_ITER; + case cryptonight_gpu: + return CRYPTONIGHT_GPU_ITER; case cryptonight_turtle: return CRYPTONIGHT_TURTLE_ITER; default: diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index c3a97808c..2acf1a387 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -10,6 +10,7 @@ #include "xmrstak/jconf.hpp" #include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp" #include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp" +#include "xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp" #ifdef _WIN32 @@ -724,7 +725,8 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti cn_aes_pseudo_round_mut( sharedMemory, text, key ); - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { #pragma unroll for ( int j = 0; j < 4; ++j ) @@ -843,6 +845,73 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) } } +template +void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce) +{ + dim3 grid( ctx->device_blocks ); + dim3 block( ctx->device_threads ); + dim3 block2( ctx->device_threads << 1 ); + dim3 block4( ctx->device_threads << 2 ); + dim3 block8( ctx->device_threads << 3 ); + + size_t intensity = ctx->device_blocks * ctx->device_threads; + + CUDA_CHECK_KERNEL( + ctx->device_id, + xmrstak::nvidia::cn_explode_gpu<<>>((int*)ctx->d_ctx_state, (int*)ctx->d_long_state) + ); + + int partcount = 1 << ctx->device_bfactor; + for(int i = 0; i < partcount; i++) + { + CUDA_CHECK_KERNEL( + ctx->device_id, + // 36 x 16byte x numThreads + xmrstak::nvidia::cryptonight_core_gpu_phase2_gpu + <<device_blocks, ctx->device_threads * 16, 36 * 16 * ctx->device_threads>>> + ( + (int*)ctx->d_ctx_state, + (int*)ctx->d_long_state, + ctx->device_bfactor, + i, + ctx->d_ctx_a, + ctx->d_ctx_b + ) + ); + } + + /* bfactor for phase 3 + * + * 3 consume less time than phase 2, therefore we begin with the + * kernel splitting if the user defined a `bfactor >= 5` + */ + int bfactorOneThree = ctx->device_bfactor - 4; + if( bfactorOneThree < 0 ) + bfactorOneThree = 0; + + int partcountOneThree = 1 << bfactorOneThree; + int roundsPhase3 = partcountOneThree; + + if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ) + { + // cryptonight_heavy used two full rounds over the scratchpad memory + roundsPhase3 *= 2; + } + + for ( int i = 0; i < roundsPhase3; i++ ) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<<< + grid, + block8, + block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) + >>>( ctx->device_blocks*ctx->device_threads, + bfactorOneThree, i, + ctx->d_long_state, + ctx->d_ctx_state, ctx->d_ctx_key2 )); + } +} + void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce) { typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce); @@ -882,10 +951,13 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t cryptonight_core_gpu_hash, cryptonight_core_gpu_hash, - + cryptonight_core_gpu_hash, cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash_gpu, + cryptonight_core_gpu_hash_gpu, + cryptonight_core_gpu_hash, cryptonight_core_gpu_hash }; @@ -895,4 +967,5 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t cuda_hash_fn selected_function = func_table[ ((miner_algo - 1u) << 1) | digit.to_ulong() ]; selected_function(ctx, startNonce); + } diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp new file mode 100644 index 000000000..a0fe53418 --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp @@ -0,0 +1,552 @@ +#pragma once + +#include +#include +#include + +#include "cuda_keccak.hpp" +#include "cuda_extra.hpp" + +namespace xmrstak +{ +namespace nvidia +{ + +struct __m128i : public int4 +{ + + __forceinline__ __device__ __m128i(){} + + __forceinline__ __device__ __m128i( + const uint32_t x0, const uint32_t x1, + const uint32_t x2, const uint32_t x3) + { + x = x0; + y = x1; + z = x2; + w = x3; + } + + __forceinline__ __device__ __m128i( const int x0) + { + x = x0; + y = x0; + z = x0; + w = x0; + } + + __forceinline__ __device__ __m128i operator|(const __m128i& other) + { + return __m128i( + x | other.x, + y | other.y, + z | other.z, + w | other.w + ); + } + + __forceinline__ __device__ __m128i operator^(const __m128i& other) + { + return __m128i( + x ^ other.x, + y ^ other.y, + z ^ other.z, + w ^ other.w + ); + } +}; + +struct __m128 : public float4 +{ + + __forceinline__ __device__ __m128(){} + + __forceinline__ __device__ __m128( + const float x0, const float x1, + const float x2, const float x3) + { + float4::x = x0; + float4::y = x1; + float4::z = x2; + float4::w = x3; + } + + __forceinline__ __device__ __m128( const float x0) + { + float4::x = x0; + float4::y = x0; + float4::z = x0; + float4::w = x0; + } + + __forceinline__ __device__ __m128( const __m128i& x0) + { + float4::x = int2float(x0.x); + float4::y = int2float(x0.y); + float4::z = int2float(x0.z); + float4::w = int2float(x0.w); + } + + __forceinline__ __device__ __m128i get_int( ) + { + return __m128i( + (int)x, + (int)y, + (int)z, + (int)w + ); + } + + __forceinline__ __device__ __m128 operator+(const __m128& other) + { + return __m128( + x + other.x, + y + other.y, + z + other.z, + w + other.w + ); + } + + __forceinline__ __device__ __m128 operator-(const __m128& other) + { + return __m128( + x - other.x, + y - other.y, + z - other.z, + w - other.w + ); + } + + __forceinline__ __device__ __m128 operator*(const __m128& other) + { + return __m128( + x * other.x, + y * other.y, + z * other.z, + w * other.w + ); + } + + __forceinline__ __device__ __m128 operator/(const __m128& other) + { + return __m128( + x / other.x, + y / other.y, + z / other.z, + w / other.w + ); + } + + __forceinline__ __device__ __m128& trunc() + { + x=::truncf(x); + y=::truncf(y); + z=::truncf(z); + w=::truncf(w); + + return *this; + } + + __forceinline__ __device__ __m128& abs() + { + x=::fabsf(x); + y=::fabsf(y); + z=::fabsf(z); + w=::fabsf(w); + + return *this; + } + + __forceinline__ __device__ __m128& floor() + { + x=::floorf(x); + y=::floorf(y); + z=::floorf(z); + w=::floorf(w); + + return *this; + } +}; + + +template +__device__ void print(const char* name, T value) +{ + printf("g %s: ", name); + for(int i = 0; i < 4; ++i) + { + printf("%08X ",((uint32_t*)&value)[i]); + } + printf("\n"); +} + +template<> +__device__ void print<__m128>(const char* name, __m128 value) +{ + printf("g %s: ", name); + for(int i = 0; i < 4; ++i) + { + printf("%f ",((float*)&value)[i]); + } + printf("\n"); +} + +#define SHOW(name) print(#name, name) + + +__forceinline__ __device__ __m128 _mm_add_ps(__m128 a, __m128 b) +{ + return a + b; +} + +__forceinline__ __device__ __m128 _mm_sub_ps(__m128 a, __m128 b) +{ + return a - b; +} + +__forceinline__ __device__ __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + return a * b; +} + +__forceinline__ __device__ __m128 _mm_div_ps(__m128 a, __m128 b) +{ + return a / b; +} + +__forceinline__ __device__ __m128 _mm_and_ps(__m128 a, int b) +{ + return __m128( + int_as_float(float_as_int(a.x) & b), + int_as_float(float_as_int(a.y) & b), + int_as_float(float_as_int(a.z) & b), + int_as_float(float_as_int(a.w) & b) + ); +} + +__forceinline__ __device__ __m128 _mm_or_ps(__m128 a, int b) +{ + return __m128( + int_as_float(float_as_int(a.x) | b), + int_as_float(float_as_int(a.y) | b), + int_as_float(float_as_int(a.z) | b), + int_as_float(float_as_int(a.w) | b) + ); +} + +__forceinline__ __device__ __m128 _mm_xor_ps(__m128 a, int b) +{ + return __m128( + int_as_float(float_as_int(a.x) ^ b), + int_as_float(float_as_int(a.y) ^ b), + int_as_float(float_as_int(a.z) ^ b), + int_as_float(float_as_int(a.w) ^ b) + ); +} + +__forceinline__ __device__ __m128 _mm_fmod_ps(__m128 v, float dc) +{ + __m128 d(dc); + __m128 c = _mm_div_ps(v, d); + c.trunc();//_mm_round_ps(c, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); + // c = _mm_cvtepi32_ps(_mm_cvttps_epi32(c)); - sse2 + c = _mm_mul_ps(c, d); + return _mm_sub_ps(v, c); + + + //return a.fmodf(b); +} + +__forceinline__ __device__ __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + return a ^ b; +} + + +__forceinline__ __device__ __m128i _mm_alignr_epi8(__m128i a, const uint32_t rot) +{ + const uint32_t right = 8 * rot; + const uint32_t left = (32 - 8 * rot); + return __m128i( + ((uint32_t)a.x >> right) | ( a.y << left ), + ((uint32_t)a.y >> right) | ( a.z << left ), + ((uint32_t)a.z >> right) | ( a.w << left ), + ((uint32_t)a.w >> right) | ( a.x << left ) + ); +} + +template +__device__ __m128i* scratchpad_ptr(uint32_t idx, uint32_t n, int *lpad) { return (__m128i*)((uint8_t*)lpad + (idx & MASK) + n * 16); } + + +__forceinline__ __device__ __m128 fma_break(__m128 x) +{ + // Break the dependency chain by setitng the exp to ?????01 + x = _mm_and_ps(x, 0xFEFFFFFF); + return _mm_or_ps(x, 0x00800000); +} + +// 9 +__forceinline__ __device__ void sub_round(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& n, __m128& d, __m128& c) +{ + n1 = _mm_add_ps(n1, c); + __m128 nn = _mm_mul_ps(n0, c); + nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn)); + nn = fma_break(nn); + n = _mm_add_ps(n, nn); + + n3 = _mm_sub_ps(n3, c); + __m128 dd = _mm_mul_ps(n2, c); + dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd)); + dd = fma_break(dd); + d = _mm_add_ps(d, dd); + + //Constant feedback + c = _mm_add_ps(c, rnd_c); + c = _mm_add_ps(c, 0.734375f); + __m128 r = _mm_add_ps(nn, dd); + r = _mm_and_ps(r, 0x807FFFFF); + r = _mm_or_ps(r, 0x40000000); + c = _mm_add_ps(c, r); +} + +// 9*8 + 2 = 74 +__forceinline__ __device__ void round_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& c, __m128& r) +{ + __m128 n(0.0f), d(0.0f); + + sub_round(n0, n1, n2, n3, rnd_c, n, d, c); + sub_round(n1, n2, n3, n0, rnd_c, n, d, c); + sub_round(n2, n3, n0, n1, rnd_c, n, d, c); + sub_round(n3, n0, n1, n2, rnd_c, n, d, c); + sub_round(n3, n2, n1, n0, rnd_c, n, d, c); + sub_round(n2, n1, n0, n3, rnd_c, n, d, c); + sub_round(n1, n0, n3, n2, rnd_c, n, d, c); + sub_round(n0, n3, n2, n1, rnd_c, n, d, c); + + // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0 + d = _mm_and_ps(d, 0xFF7FFFFF); + d = _mm_or_ps(d, 0x40000000); + r =_mm_add_ps(r, _mm_div_ps(n,d)); +} + +// 74*8 = 595 +__forceinline__ __device__ __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum) +{ + __m128 c(cnt); + // 35 maths calls follow (140 FLOPS) + __m128 r = __m128(0.0f); + for(int i=0; i< 4; ++i) + round_compute(n0, n1, n2, n3, rnd_c, c, r); + // do a quick fmod by setting exp to 2 + r = _mm_and_ps(r, 0x807FFFFF); + r = _mm_or_ps(r, 0x40000000); + sum = r; // 34 + r = _mm_mul_ps(r, __m128(536870880.0f)); // 35 + return r.get_int(); + +} + +__forceinline__ __device__ void single_comupte_wrap(const uint32_t rot, __m128i v0, __m128i v1, __m128i v2, __m128i v3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out) +{ + __m128 n0(v0); + __m128 n1(v1); + __m128 n2(v2); + __m128 n3(v3); + + __m128i r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum); + out = rot == 0 ? r : _mm_alignr_epi8(r, rot); +} + +__constant__ uint32_t look[16][4] = { + {0, 1, 2, 3}, + {0, 2, 3, 1}, + {0, 3, 1, 2}, + {0, 3, 2, 1}, + + {1, 0, 2, 3}, + {1, 2, 3, 0}, + {1, 3, 0, 2}, + {1, 3, 2, 0}, + + {2, 1, 0, 3}, + {2, 0, 3, 1}, + {2, 3, 1, 0}, + {2, 3, 0, 1}, + + {3, 1, 2, 0}, + {3, 2, 0, 1}, + {3, 0, 1, 2}, + {3, 0, 2, 1} +}; + +__constant__ float ccnt[16] = { + 1.34375f, + 1.28125f, + 1.359375f, + 1.3671875f, + + 1.4296875f, + 1.3984375f, + 1.3828125f, + 1.3046875f, + + 1.4140625f, + 1.2734375f, + 1.2578125f, + 1.2890625f, + + 1.3203125f, + 1.3515625f, + 1.3359375f, + 1.4609375f +}; + +template +__global__ void cryptonight_core_gpu_phase2_gpu(int32_t *spad, int *lpad_in, int bfactor, int partidx, uint32_t * roundVs, uint32_t * roundS) +{ + static constexpr uint32_t MASK = ((MEMORY-1) >> 6) << 6; + + const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor ); + + extern __shared__ __m128i smemExtern_in[]; + + const uint32_t chunk = threadIdx.x / 16; + const uint32_t numHashPerBlock = blockDim.x / 16; + + int* lpad = (int*)((uint8_t*)lpad_in + size_t(MEMORY) * (blockIdx.x * numHashPerBlock + chunk)); + + __m128i* smem = smemExtern_in + 4 * chunk; + + __m128i* smemExtern = smemExtern_in + numHashPerBlock * 4; + __m128i* smemOut = smemExtern + 16 * chunk; + + smemExtern = smemExtern + numHashPerBlock * 16; + __m128* smemVa = (__m128*)smemExtern + 16 * chunk; + + uint32_t tid = threadIdx.x % 16; + + const uint32_t idxHash = blockIdx.x * numHashPerBlock + threadIdx.x/16; + uint32_t s = 0; + + __m128 vs(0); + if(partidx != 0) + { + vs = ((__m128*)roundVs)[idxHash]; + s = roundS[idxHash]; + } + else + { + s = ((uint32_t*)spad)[idxHash * 50] >> 8; + } + + for(size_t i = 0; i < batchsize; i++) + { + __syncthreads(); + + ((int*)smem)[tid] = ((int*)scratchpad_ptr(s, tid/4, lpad))[tid%4]; + __syncthreads(); + + __m128 rc = vs; + + + single_comupte_wrap( + tid%4, + *(smem + look[tid][0]), + *(smem + look[tid][1]), + *(smem + look[tid][2]), + *(smem + look[tid][3]), + ccnt[tid], rc, smemVa[tid], + smemOut[tid] + ); + + __syncthreads(); + + if(tid % 4 == 0) + { + __m128i out = _mm_xor_si128(smemOut[tid], smemOut[tid + 1]); + __m128i out2 = _mm_xor_si128(smemOut[tid + 2], smemOut[tid + 3]); + out = _mm_xor_si128(out, out2); + smemOut[tid] = out; + *scratchpad_ptr(s , tid/4, lpad) = _mm_xor_si128(smem[tid/4], out); + } + __syncthreads(); + + + __m128i out2 = smemOut[0] ^ smemOut[4] ^ smemOut[8] ^ smemOut[12]; + + if(tid%2 == 0) + smemVa[tid] = smemVa[tid] + smemVa[tid + 1]; + + if(tid%4 == 0) + smemVa[tid] = smemVa[tid] + smemVa[tid + 2]; + if(tid%8 == 0) + smemVa[tid] = smemVa[tid] + smemVa[tid + 4]; + if(tid%16 == 0) + smemVa[tid] = smemVa[tid] + smemVa[tid + 8]; + vs = smemVa[0]; + + vs.abs(); // take abs(va) by masking the float sign bit + auto xx = _mm_mul_ps(vs, __m128(16777216.0f)); + // vs range 0 - 64 + *smem = xx.get_int(); + *smem = _mm_xor_si128(*smem, out2); + // vs is now between 0 and 1 + vs = _mm_div_ps(vs, __m128(64.0f)); + s = smem->x ^ smem->y ^ smem->z ^ smem->w; + } + if(partidx != ((1< +__global__ void cn_explode_gpu(int32_t *spad_in, int *lpad_in) +{ + __shared__ uint64_t state[25]; + + uint8_t* lpad = (uint8_t*)lpad_in + blockIdx.x * MEMORY; + uint64_t* spad = (uint64_t*)((uint8_t*)spad_in + blockIdx.x * 200); + + constexpr size_t hash_size = 200; // 25x8 bytes + memcpy(state, spad, hash_size); + + for(uint64_t i = threadIdx.x; i < MEMORY / 512; i+=blockDim.x) + { + generate_512(i, state, (uint8_t*)lpad + i*512); + } +} + +} // namespace xmrstak +} // namespace nvidia diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index 7149f37a6..e4574e20a 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -9,21 +9,6 @@ #include #include "xmrstak/jconf.hpp" -#ifdef __CUDACC__ -__constant__ -#else -const -#endif -uint64_t keccakf_rndc[24] ={ - 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, - 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, - 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, - 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, - 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, - 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, - 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, - 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 -}; typedef unsigned char BitSequence; typedef unsigned long long DataLength; @@ -184,7 +169,8 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 __shared__ uint32_t sharedMemory[1024]; - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { cn_aes_gpu_init( sharedMemory ); __syncthreads( ); @@ -201,7 +187,8 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 for ( i = 0; i < 50; i++ ) state[i] = ctx_state[i]; - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { uint32_t key[40]; @@ -220,33 +207,46 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 } cn_keccakf2( (uint64_t *) state ); - switch ( ( (uint8_t *) state )[0] & 0x03 ) + if(ALGO == cryptonight_gpu) { - case 0: - cn_blake( (const uint8_t *) state, 200, (uint8_t *) hash ); - break; - case 1: - cn_groestl( (const BitSequence *) state, 200, (BitSequence *) hash ); - break; - case 2: - cn_jh( (const BitSequence *) state, 200, (BitSequence *) hash ); - break; - case 3: - cn_skein( (const BitSequence *) state, 200, (BitSequence *) hash ); - break; - default: - break; + if ( ((uint64_t*)state)[3] < target ) + { + uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); + + if(idx < 10) + d_res_nonce[idx] = thread; + } } + else + { + switch ( ( (uint8_t *) state )[0] & 0x03 ) + { + case 0: + cn_blake( (const uint8_t *) state, 200, (uint8_t *) hash ); + break; + case 1: + cn_groestl( (const BitSequence *) state, 200, (BitSequence *) hash ); + break; + case 2: + cn_jh( (const BitSequence *) state, 200, (BitSequence *) hash ); + break; + case 3: + cn_skein( (const BitSequence *) state, 200, (BitSequence *) hash ); + break; + default: + break; + } - // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values - // and expect an accurate result for target > 32-bit without implementing carries + // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values + // and expect an accurate result for target > 32-bit without implementing carries - if ( hash[3] < target ) - { - uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); + if ( hash[3] < target ) + { + uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); - if(idx < 10) - d_res_nonce[idx] = thread; + if(idx < 10) + d_res_nonce[idx] = thread; + } } } @@ -373,6 +373,11 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare << > > (wsize, ctx->d_input, ctx->inputlen, startNonce, ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } + else if(miner_algo == cryptonight_gpu) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + } else { /* pass two times d_ctx_state because the second state is used later in phase1, @@ -426,6 +431,15 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) ); } + else if(miner_algo == cryptonight_gpu) + { + // fallback for all other algorithms + CUDA_CHECK_MSG_KERNEL( + ctx->device_id, + "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**", + cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) + ); + } else { // fallback for all other algorithms diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp index 99c651645..c75c74964 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp @@ -1,3 +1,23 @@ +#pragma once + +#include "cuda_extra.hpp" + +#ifdef __CUDACC__ +__constant__ +#else +const +#endif +uint64_t keccakf_rndc[24] ={ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + #if __CUDA_ARCH__ >= 350 __forceinline__ __device__ uint64_t cuda_rotl64(const uint64_t value, const int offset) { diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index 165595c5f..80e6002d7 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -103,6 +103,7 @@ xmrstak::coin_selection coins[] = { { "cryptonight_v7", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, { "cryptonight_v8", {cryptonight_monero_v8, cryptonight_monero_v8, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, { "cryptonight_v7_stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, + { "cryptonight_gpu", {cryptonight_gpu, cryptonight_gpu, 255u}, {cryptonight_gpu, cryptonight_gpu, 0u}, nullptr }, { "freehaven", {cryptonight_heavy, cryptonight_superfast, 255u}, {cryptonight_heavy, cryptonight_superfast, 0u}, nullptr }, { "graft", {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, { "haven", {cryptonight_heavy, cryptonight_haven, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp index 2f4e2a11f..c475c4129 100644 --- a/xmrstak/misc/executor.cpp +++ b/xmrstak/misc/executor.cpp @@ -560,6 +560,12 @@ void executor::ex_main() else pools.emplace_front(0, "donate.xmr-stak.net:5555", "", "", "", 0.0, true, false, "", true); break; + case cryptonight_gpu: + if(dev_tls) + pools.emplace_front(0, "donate.xmr-stak.net:8811", "", "", "", 0.0, true, true, "", false); + else + pools.emplace_front(0, "donate.xmr-stak.net:5511", "", "", "", 0.0, true, false, "", false); + break; case cryptonight_monero_v8: case cryptonight_monero: case cryptonight_turtle: From 17f3aef08bd21e820d0c58b288dbc7f46b207f79 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Wed, 30 Jan 2019 21:23:25 +0100 Subject: [PATCH 09/19] fix compile - fix broken trutle coin - fix non cn_gpu algorithms --- xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 9 ++++++--- xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index ea07d6a8b..53394037b 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -347,8 +347,11 @@ XMRSTAK_INCLUDE_JH XMRSTAK_INCLUDE_BLAKE256 //#include "opencl/groestl256.cl" XMRSTAK_INCLUDE_GROESTL256 -//#include "opencl/cryptonight_gpu.cl" -XMRSTAK_INCLUDE_CN_GPU + +#if (ALGO == 13) + //#include "opencl/cryptonight_gpu.cl" + XMRSTAK_INCLUDE_CN_GPU +#endif )===" R"===( @@ -789,7 +792,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states ((uint4 *)a)[0] ^= tmp; // cryptonight_monero_v8 -#if (ALGO == 11 || ALGO==13) +#if (ALGO == 11 || ALGO==14) # if defined(__NV_CL_C_VERSION) // flush shuffled data SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line; diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl index 4205a67c3..93e304aee 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl @@ -4,7 +4,7 @@ R"===( */ // cryptonight_monero_v8 -#if(ALGO==11 || ALGO==13) +#if(ALGO==11 || ALGO==14) static const __constant uint RCP_C[256] = { From adeeab6fe7dc4f5fbb89da0790dc2dee8fae3aff Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Wed, 30 Jan 2019 21:51:55 +0100 Subject: [PATCH 10/19] fix cuda 10 - fix race condition during shared memory access - optimize memory access --- .../nvidia/nvcc_code/cuda_cryptonight_gpu.hpp | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp index a0fe53418..d3df0fed2 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp @@ -462,30 +462,30 @@ __global__ void cryptonight_core_gpu_phase2_gpu(int32_t *spad, int *lpad_in, int __syncthreads(); - if(tid % 4 == 0) - { - __m128i out = _mm_xor_si128(smemOut[tid], smemOut[tid + 1]); - __m128i out2 = _mm_xor_si128(smemOut[tid + 2], smemOut[tid + 3]); - out = _mm_xor_si128(out, out2); - smemOut[tid] = out; - *scratchpad_ptr(s , tid/4, lpad) = _mm_xor_si128(smem[tid/4], out); - } - __syncthreads(); + const uint32_t b = tid / 4; + const uint32_t bb = tid % 4; + int outXor = ((int*)smemOut)[b * 16 + bb]; + for(uint32_t dd = b * 16 + 4 + bb; dd < (b + 1) * 16; dd += 4) + outXor ^= ((int*)smemOut)[dd]; - __m128i out2 = smemOut[0] ^ smemOut[4] ^ smemOut[8] ^ smemOut[12]; + ((int*)scratchpad_ptr(s, tid/4, lpad))[tid%4] = outXor ^ ((int*)smem)[tid]; + ((int*)smemOut)[tid] = outXor; - if(tid%2 == 0) - smemVa[tid] = smemVa[tid] + smemVa[tid + 1]; + float va_tmp1 = ((float*)smemVa)[b * 16 + bb] + ((float*)smemVa)[b * 16 + bb + 4]; + float va_tmp2 = ((float*)smemVa)[b * 16 + bb + 8] + ((float*)smemVa)[b * 16 + bb + 12]; + ((float*)smemVa)[tid] = va_tmp1 + va_tmp2; - if(tid%4 == 0) - smemVa[tid] = smemVa[tid] + smemVa[tid + 2]; - if(tid%8 == 0) - smemVa[tid] = smemVa[tid] + smemVa[tid + 4]; - if(tid%16 == 0) - smemVa[tid] = smemVa[tid] + smemVa[tid + 8]; - vs = smemVa[0]; + __syncthreads(); + + __m128i out2 = smemOut[0] ^ smemOut[1] ^ smemOut[2] ^ smemOut[3]; + va_tmp1 = ((float*)smemVa)[b * 16 + bb] + ((float*)smemVa)[b * 16 + bb + 4]; + va_tmp2 = ((float*)smemVa)[b * 16 + bb + 8] + ((float*)smemVa)[b * 16 + bb + 12]; + ((float*)smemVa)[tid] = va_tmp1 + va_tmp2; + __syncthreads(); + + vs = smemVa[0]; vs.abs(); // take abs(va) by masking the float sign bit auto xx = _mm_mul_ps(vs, __m128(16777216.0f)); // vs range 0 - 64 From 88ea7f3693a04c8b66df10dca2d4be2d60b26e01 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Fri, 1 Feb 2019 10:23:07 +0100 Subject: [PATCH 11/19] OpenCL: use algorithm names instead of number Use the algorithm names from `cryptonight.hpp` instead if number within the OpenCL kernel. --- .../backend/amd/amd_gpu/opencl/cryptonight.cl | 114 +++++++++--------- .../amd/amd_gpu/opencl/fast_div_heavy.cl | 4 +- .../amd/amd_gpu/opencl/fast_int_math_v2.cl | 3 +- 3 files changed, 61 insertions(+), 60 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 53394037b..faea409ed 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -14,6 +14,23 @@ R"===( * along with this program. If not, see . */ +// defines to translate algorithm names int a same number used within cryptonight.h +#define invalid_algo 0 +#define cryptonight 1 +#define cryptonight_lite 2 +#define cryptonight_monero 3 +#define cryptonight_heavy 4 +#define cryptonight_aeon 5 +#define cryptonight_ipbc 6 +#define cryptonight_stellite 7 +#define cryptonight_masari 8 +#define cryptonight_haven 9 +#define cryptonight_bittube2 10 +#define cryptonight_monero_v8 11 +#define cryptonight_superfast 12 +#define cryptonight_gpu 13 +#define cryptonight_turtle 14 + /* For Mesa clover support */ #ifdef cl_clang_storage_class_specifiers # pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable @@ -348,7 +365,7 @@ XMRSTAK_INCLUDE_BLAKE256 //#include "opencl/groestl256.cl" XMRSTAK_INCLUDE_GROESTL256 -#if (ALGO == 13) +#if (ALGO == cryptonight_gpu) //#include "opencl/cryptonight_gpu.cl" XMRSTAK_INCLUDE_CN_GPU #endif @@ -504,8 +521,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, mem_fence(CLK_LOCAL_MEM_FENCE); -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast -#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) +#if (ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) __local uint4 xin[8][8]; { @@ -559,8 +575,8 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, )===" R"===( -// cryptonight_monero_v8 && NVIDIA -#if((ALGO==11 || ALGO==14) && defined(__NV_CL_C_VERSION)) +// __NV_CL_C_VERSION checks if NVIDIA opencl is used +#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) && defined(__NV_CL_C_VERSION)) # define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4)))) # define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4))))) #else @@ -569,16 +585,15 @@ R"===( __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, uint Threads -// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 -#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) + +#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) , __global ulong *input #endif ) { ulong a[2]; -// cryptonight_monero_v8 -#if(ALGO==11 || ALGO==14) +#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) ulong b[4]; uint4 b_x[2]; // NVIDIA @@ -592,8 +607,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif __local uint AES0[256], AES1[256]; -// cryptonight_monero_v8 -#if(ALGO==11 || ALGO==14) +#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) # if defined(__clang__) && !defined(__NV_CL_C_VERSION) __local uint RCP[256]; # endif @@ -608,15 +622,15 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states const uint tmp = AES0_C[i]; AES0[i] = tmp; AES1[i] = rotate(tmp, 8U); -// cryptonight_monero_v8 -#if((ALGO==11 || ALGO==14) && (defined(__clang__) && !defined(__NV_CL_C_VERSION))) + +#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) && (defined(__clang__) && !defined(__NV_CL_C_VERSION))) RCP[i] = RCP_C[i]; #endif } barrier(CLK_LOCAL_MEM_FENCE); -// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 -#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) + +#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) uint2 tweak1_2; #endif @@ -643,8 +657,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states b_x[0] = ((uint4 *)b)[0]; -// cryptonight_monero_v8 -#if(ALGO==11 || ALGO==14) +#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) a[1] = states[1] ^ states[5]; b[2] = states[8] ^ states[10]; b[3] = states[9] ^ states[11]; @@ -652,8 +665,8 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states division_result = as_uint2(states[12]); sqrt_result = as_uint2(states[13]).s0; #endif -// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 -#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) + +#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) tweak1_2 = as_uint2(input[4]); tweak1_2.s0 >>= 24; tweak1_2.s0 |= tweak1_2.s1 << 8; @@ -675,22 +688,21 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states for(int i = 0; i < ITERATIONS; ++i) { ulong c[2]; -// cryptonight_monero_v8 && NVIDIA -#if((ALGO==11 || ALGO==14) && defined(__NV_CL_C_VERSION)) + +#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) && defined(__NV_CL_C_VERSION)) uint idxS = idx0 & 0x30U; *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; #endif ((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0); -// cryptonight_bittube2 -#if(ALGO == 10) + +#if(ALGO == cryptonight_bittube2) ((uint4 *)c)[0] = AES_Round2_bittube2(AES0, AES1, ~((uint4 *)c)[0], ((uint4 *)a)[0]); #else ((uint4 *)c)[0] = AES_Round2(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]); #endif -// cryptonight_monero_v8 -#if(ALGO==11 || ALGO==14) +#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) { ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); @@ -701,12 +713,11 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states } #endif -// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 -#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) +#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) uint table = 0x75310U; b_x[0] ^= ((uint4 *)c)[0]; -// cryptonight_stellite -# if(ALGO == 7) + +# if(ALGO == cryptonight_stellite) uint index = ((b_x[0].s2 >> 27) & 12) | ((b_x[0].s2 >> 23) & 2); # else uint index = ((b_x[0].s2 >> 26) & 12) | ((b_x[0].s2 >> 23) & 2); @@ -714,8 +725,8 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states b_x[0].s2 ^= ((table >> index) & 0x30U) << 24; SCRATCHPAD_CHUNK(0) = b_x[0]; idx0 = as_uint2(c[0]).s0 & MASK; -// cryptonight_monero_v8 -#elif(ALGO==11 || ALGO==14) + +#elif(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0]; # ifdef __NV_CL_C_VERSION // flush shuffled data @@ -733,8 +744,8 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif uint4 tmp; tmp = SCRATCHPAD_CHUNK(0); -// cryptonight_monero_v8 -#if(ALGO==11 || ALGO==14) + +#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) // Use division and square root results from the _previous_ iteration to hide the latency tmp.s0 ^= division_result.s0; tmp.s1 ^= division_result.s1 ^ sqrt_result; @@ -770,11 +781,10 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states a[1] += c[0] * as_ulong2(tmp).s0; a[0] += mul_hi(c[0], as_ulong2(tmp).s0); #endif -// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 -#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) -// cryptonight_ipbc || cryptonight_bittube2 -# if(ALGO == 6 || ALGO == 10) +#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) + +# if(ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) uint2 ipbc_tmp = tweak1_2 ^ ((uint2 *)&(a[0]))[0]; ((uint2 *)&(a[1]))[0] ^= ipbc_tmp; SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; @@ -791,8 +801,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states ((uint4 *)a)[0] ^= tmp; -// cryptonight_monero_v8 -#if (ALGO == 11 || ALGO==14) +#if (ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) # if defined(__NV_CL_C_VERSION) // flush shuffled data SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line; @@ -802,15 +811,13 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states b_x[0] = ((uint4 *)c)[0]; idx0 = as_uint2(a[0]).s0 & MASK; -// cryptonight_heavy || cryptonight_bittube2 -#if (ALGO == 4 || ALGO == 10) +#if (ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; long q = fast_div_heavy(n, d | 0x5); *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; idx0 = (d ^ as_int2(q).s0) & MASK; -// cryptonight_haven || cryptonight_superfast -#elif (ALGO == 9 || ALGO == 12) +#elif (ALGO == cryptonight_haven || ALGO == cryptonight_superfast) long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; long q = fast_div_heavy(n, d | 0x5); @@ -828,8 +835,8 @@ R"===( __attribute__((reqd_work_group_size(8, 8, 1))) __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, -// cryptonight_gpu -#if (ALGO == 13) + +#if (ALGO == cryptonight_gpu) __global uint *output, ulong Target, uint Threads) #else __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, uint Threads) @@ -851,8 +858,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states barrier(CLK_LOCAL_MEM_FENCE); -// cryptonight_gpu || cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast -#if (ALGO == 13 || ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) +#if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) __local uint4 xin1[8][8]; __local uint4 xin2[8][8]; #endif @@ -890,8 +896,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states barrier(CLK_LOCAL_MEM_FENCE); -// cryptonight_gpu || cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast -#if (ALGO == 13 || ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) +#if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) __local uint4* xin1_store = &xin1[get_local_id(1)][get_local_id(0)]; __local uint4* xin1_load = &xin1[(get_local_id(1) + 1) % 8][get_local_id(0)]; __local uint4* xin2_store = &xin2[get_local_id(1)][get_local_id(0)]; @@ -904,8 +909,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states if (gIdx < Threads) #endif { -// cryptonight_gpu || cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast -#if (ALGO == 13 || ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) + +#if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) #pragma unroll 2 for(int i = 0, i1 = get_local_id(1); i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4)) { @@ -945,8 +950,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif } -// cryptonight_gpu || cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast -#if (ALGO == 13 || ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) +#if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) /* Also left over threads performe this loop. * The left over thread results will be ignored */ @@ -988,10 +992,10 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states for(int i = 0; i < 25; ++i) State[i] = states[i]; keccakf1600_2(State); -#if (ALGO == 13) + +#if (ALGO == cryptonight_gpu) if(State[3] <= Target) { - //printf("gt %lu\n", State[3]); ulong outIdx = atomic_inc(output + 0xFF); if(outIdx < 0xFF) output[outIdx] = get_global_id(0); diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl index 161f2f55d..4469b0670 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl @@ -1,7 +1,6 @@ R"===( -#ifndef FAST_DIV_HEAVY_CL -#define FAST_DIV_HEAVY_CL +#if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) inline long fast_div_heavy(long _a, int _b) { long a = abs(_a); @@ -19,6 +18,5 @@ inline long fast_div_heavy(long _a, int _b) const long q = q1 + q2 + q3; return ((as_int2(_a).s1 ^ _b) < 0) ? -q : q; } - #endif )===" diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl index 93e304aee..b34e68294 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl @@ -3,8 +3,7 @@ R"===( * @author SChernykh */ -// cryptonight_monero_v8 -#if(ALGO==11 || ALGO==14) +#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) static const __constant uint RCP_C[256] = { From e8ec992155db76194662230552ca986d27222968 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Thu, 31 Jan 2019 22:19:06 +0100 Subject: [PATCH 12/19] cuda: optimize cn-gpu psychocrypt committed 9 minutes ago - use precomuted indicies within the loop - `cn_explode_gpu` use all threads to load the state --- .../nvidia/nvcc_code/cuda_cryptonight_gpu.hpp | 62 +++++++++++-------- 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp index d3df0fed2..94750560c 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp @@ -347,7 +347,7 @@ __forceinline__ __device__ __m128i single_comupte(__m128 n0, __m128 n1, __m128 n } -__forceinline__ __device__ void single_comupte_wrap(const uint32_t rot, __m128i v0, __m128i v1, __m128i v2, __m128i v3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out) +__forceinline__ __device__ void single_comupte_wrap(const uint32_t rot, const __m128i& v0, const __m128i& v1, const __m128i& v2, const __m128i& v3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out) { __m128 n0(v0); __m128 n1(v1); @@ -402,6 +402,16 @@ __constant__ float ccnt[16] = { 1.4609375f }; + +__forceinline__ __device__ void sync() +{ +#if (__CUDACC_VER_MAJOR__ >= 9) + __syncwarp(); +#else + __syncthreads( ); +#endif +} + template __global__ void cryptonight_core_gpu_phase2_gpu(int32_t *spad, int *lpad_in, int bfactor, int partidx, uint32_t * roundVs, uint32_t * roundS) { @@ -440,18 +450,19 @@ __global__ void cryptonight_core_gpu_phase2_gpu(int32_t *spad, int *lpad_in, int s = ((uint32_t*)spad)[idxHash * 50] >> 8; } + const uint32_t b = tid / 4; + const uint32_t bb = tid % 4; + const uint32_t block = b * 16 + bb; + for(size_t i = 0; i < batchsize; i++) { - __syncthreads(); - - ((int*)smem)[tid] = ((int*)scratchpad_ptr(s, tid/4, lpad))[tid%4]; - __syncthreads(); + sync(); + ((int*)smem)[tid] = ((int*)scratchpad_ptr(s, b, lpad))[bb]; + sync(); __m128 rc = vs; - - single_comupte_wrap( - tid%4, + bb, *(smem + look[tid][0]), *(smem + look[tid][1]), *(smem + look[tid][2]), @@ -460,40 +471,37 @@ __global__ void cryptonight_core_gpu_phase2_gpu(int32_t *spad, int *lpad_in, int smemOut[tid] ); - __syncthreads(); + sync(); - const uint32_t b = tid / 4; - const uint32_t bb = tid % 4; - - int outXor = ((int*)smemOut)[b * 16 + bb]; - for(uint32_t dd = b * 16 + 4 + bb; dd < (b + 1) * 16; dd += 4) + int outXor = ((int*)smemOut)[block]; + for(uint32_t dd = block + 4; dd < (b + 1) * 16; dd += 4) outXor ^= ((int*)smemOut)[dd]; - ((int*)scratchpad_ptr(s, tid/4, lpad))[tid%4] = outXor ^ ((int*)smem)[tid]; + ((int*)scratchpad_ptr(s, b, lpad))[bb] = outXor ^ ((int*)smem)[tid]; ((int*)smemOut)[tid] = outXor; - float va_tmp1 = ((float*)smemVa)[b * 16 + bb] + ((float*)smemVa)[b * 16 + bb + 4]; - float va_tmp2 = ((float*)smemVa)[b * 16 + bb + 8] + ((float*)smemVa)[b * 16 + bb + 12]; + float va_tmp1 = ((float*)smemVa)[block] + ((float*)smemVa)[block + 4]; + float va_tmp2 = ((float*)smemVa)[block+ 8] + ((float*)smemVa)[block + 12]; ((float*)smemVa)[tid] = va_tmp1 + va_tmp2; - __syncthreads(); + sync(); __m128i out2 = smemOut[0] ^ smemOut[1] ^ smemOut[2] ^ smemOut[3]; - va_tmp1 = ((float*)smemVa)[b * 16 + bb] + ((float*)smemVa)[b * 16 + bb + 4]; - va_tmp2 = ((float*)smemVa)[b * 16 + bb + 8] + ((float*)smemVa)[b * 16 + bb + 12]; + va_tmp1 = ((float*)smemVa)[block] + ((float*)smemVa)[block + 4]; + va_tmp2 = ((float*)smemVa)[block + 8] + ((float*)smemVa)[block + 12]; ((float*)smemVa)[tid] = va_tmp1 + va_tmp2; - __syncthreads(); + sync(); vs = smemVa[0]; vs.abs(); // take abs(va) by masking the float sign bit auto xx = _mm_mul_ps(vs, __m128(16777216.0f)); // vs range 0 - 64 - *smem = xx.get_int(); - *smem = _mm_xor_si128(*smem, out2); + auto xx_int = xx.get_int(); + out2 = _mm_xor_si128(xx_int, out2); // vs is now between 0 and 1 vs = _mm_div_ps(vs, __m128(64.0f)); - s = smem->x ^ smem->y ^ smem->z ^ smem->w; + s = out2.x ^ out2.y ^ out2.z ^ out2.w; } if(partidx != ((1< Date: Fri, 1 Feb 2019 20:55:06 +0100 Subject: [PATCH 13/19] add ryo algorithm fork switch with block 6 to cryptonigh_gpu (https://github.com/ryo-currency/ryo-currency/blob/7bc5dc40864e518753807d04c17d617f728da156/src/cryptonote_config.h#L185) --- xmrstak/jconf.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index 80e6002d7..6c2de4b3b 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -111,7 +111,7 @@ xmrstak::coin_selection coins[] = { { "masari", {cryptonight_monero_v8, cryptonight_masari, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u},nullptr }, { "monero", {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, "pool.usxmrpool.com:3333" }, { "qrl", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, - { "ryo", {cryptonight_heavy, cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, + { "ryo", {cryptonight_gpu, cryptonight_heavy, 6u}, {cryptonight_gpu, cryptonight_heavy, 6u}, nullptr }, { "stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, { "turtlecoin", {cryptonight_turtle, cryptonight_aeon, 5u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }, { "plenteum", {cryptonight_turtle, cryptonight_aeon, 5u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr } From 3f6bd5a25e5b6c0e22a99f5d0b296be61b766a63 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Fri, 1 Feb 2019 22:30:33 +0100 Subject: [PATCH 14/19] CUDA: optimze cn_gpu auto suggestion optimize the algorithm for cryptonight_gpu autosuggestion --- .../backend/nvidia/nvcc_code/cuda_extra.cu | 45 +++++++++++++++---- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index e4574e20a..a37ecc8a0 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -593,6 +593,10 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) } } + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + bool useCryptonight_gpu = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_gpu) != neededAlgorithms.end(); + + // set all device option those marked as auto (-1) to a valid value if(ctx->device_blocks == -1) { @@ -600,8 +604,11 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) * - 3 * SMX count for >=sm_30 * - 2 * SMX count for device_blocks = props.multiProcessorCount * - ( props.major < 3 ? 2 : 3 ); + ctx->device_blocks = props.multiProcessorCount * (props.major < 3 ? 2 : 3); + + // use 6 blocks per SM for sm_2X else 8 blocks + if(useCryptonight_gpu) + ctx->device_blocks = props.multiProcessorCount * (props.major < 3 ? 6 : 8); // increase bfactor for low end devices to avoid that the miner is killed by the OS if(props.multiProcessorCount <= 6) @@ -613,7 +620,16 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) * `cryptonight_core_gpu_phase1` and `cryptonight_core_gpu_phase3` starts * `8 * ctx->device_threads` threads per block */ - ctx->device_threads = 64; + const uint32_t maxThreadsPerBlock = props.major < 3 ? 512 : 1024; + + // for the most algorithms we are using 8 threads per hash + uint32_t threadsPerHash = 8; + + // phase2_gpu uses 16 threads per hash + if(useCryptonight_gpu) + threadsPerHash = 16; + + ctx->device_threads = maxThreadsPerBlock / threadsPerHash; constexpr size_t byteToMiB = 1024u * 1024u; // no limit by default 1TiB @@ -678,8 +694,6 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) ctx->total_device_memory = totalMemory; ctx->free_device_memory = freeMemory; - auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); - size_t hashMemSize = 0; for(const auto algo : neededAlgorithms) { @@ -725,10 +739,9 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) // use only odd number of threads ctx->device_threads = ctx->device_threads & 0xFFFFFFFE; - if(props.major == 2 && ctx->device_threads > 64) + if(ctx->device_threads > maxThreadsPerBlock / threadsPerHash) { - // Fermi gpus only support 512 threads per block (we need start 4 * configured threads) - ctx->device_threads = 64; + ctx->device_threads = maxThreadsPerBlock / threadsPerHash; } // check if cryptonight_monero_v8 is selected for the user pool @@ -749,6 +762,22 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) ctx->device_blocks = blockOptimal; } } + else if(useCryptonight_gpu) + { + // 8 based on my profiling sessions maybe it must be adjusted later + size_t threads = 8; + // 8 is chosen by checking the occupancy calculator + size_t blockOptimal = 8 * ctx->device_mpcount; + if(gpuArch >= 70) + blockOptimal = 5 * ctx->device_mpcount; + + if(blockOptimal * threads * hashMemSize < limitedMemory) + { + ctx->device_threads = threads; + ctx->device_blocks = blockOptimal; + } + + } } printf("device init succeeded\n"); From 4524b875e9397a5856444090ff7f9a930f8a690a Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Fri, 1 Feb 2019 22:32:55 +0100 Subject: [PATCH 15/19] OpenCL: fix work size message Fix message with the maximal allowed worksize if cryptonight_gpu is used. --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 857abc138..f80c37a8b 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -284,10 +284,21 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } - /* Some kernel spawn 8 times more threads than the user is configuring. - * To give the user the correct maximum work size we divide the hardware specific max by 8. - */ - MaximumWorkSize /= 8; + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + bool useCryptonight_gpu = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_gpu) != neededAlgorithms.end(); + + if(useCryptonight_gpu) + { + // work cn_1 we use 16x more threads than configured by the user + MaximumWorkSize /= 16; + } + else + { + /* Some kernel spawn 8 times more threads than the user is configuring. + * To give the user the correct maximum work size we divide the hardware specific max by 8. + */ + MaximumWorkSize /= 8; + } printer::inst()->print_msg(L1,"Device %lu work size %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize); #if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2) const cl_queue_properties CommandQueueProperties[] = { 0, 0, 0 }; @@ -316,8 +327,6 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } - auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); - size_t scratchPadSize = 0; for(const auto algo : neededAlgorithms) { From e274dbcc8ef19665cb913777837fa0ac7e44ff26 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sat, 2 Feb 2019 22:37:19 +0100 Subject: [PATCH 16/19] OpenCL: fix Blake hashing Windows driver creates wrong code if unroll is used. --- xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index faea409ed..2c775e77e 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -1173,7 +1173,6 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u ((uint8 *)h)[0] = vload8(0U, c_IV256); - #pragma unroll 4 for(uint i = 0, bitlen = 0; i < 4; ++i) { if(i < 3) @@ -1295,4 +1294,4 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global } } -)===" +)===" \ No newline at end of file From 1ca0b951a2a7ef2c489b40aa94b00538592adcf5 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sat, 2 Feb 2019 22:55:45 +0100 Subject: [PATCH 17/19] coin cleanup - rename `Intense` to `Lethean` - remove `masari` and `stellite`, both hase forked to a currently unsupported algorithm --- README.md | 3 +-- xmrstak/jconf.cpp | 4 +--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index e0b62503c..1040e56ea 100644 --- a/README.md +++ b/README.md @@ -43,8 +43,7 @@ Besides [Monero](https://getmonero.org), following coins can be mined using this - [BitTube](https://coin.bit.tube/) - [Graft](https://www.graft.network) - [Haven](https://havenprotocol.com) -- [Intense](https://intensecoin.com) -- [Masari](https://getmasari.org) +- [Lethean](https://lethean.io) - [QRL](https://theqrl.org) - **[Ryo](https://ryo-currency.com) - Upcoming xmr-stak-gui is sponsored by Ryo** - [TurtleCoin](https://turtlecoin.lol) diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index 6c2de4b3b..a16910552 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -107,12 +107,10 @@ xmrstak::coin_selection coins[] = { { "freehaven", {cryptonight_heavy, cryptonight_superfast, 255u}, {cryptonight_heavy, cryptonight_superfast, 0u}, nullptr }, { "graft", {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, { "haven", {cryptonight_heavy, cryptonight_haven, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, - { "intense", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, - { "masari", {cryptonight_monero_v8, cryptonight_masari, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u},nullptr }, + { "lethean", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, { "monero", {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, "pool.usxmrpool.com:3333" }, { "qrl", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, { "ryo", {cryptonight_gpu, cryptonight_heavy, 6u}, {cryptonight_gpu, cryptonight_heavy, 6u}, nullptr }, - { "stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, { "turtlecoin", {cryptonight_turtle, cryptonight_aeon, 5u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }, { "plenteum", {cryptonight_turtle, cryptonight_aeon, 5u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr } }; From d834e9e0f6f2eaf15dfc8b426daaa4a40b18c1c5 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sat, 2 Feb 2019 23:24:14 +0100 Subject: [PATCH 18/19] update documentation - remove driver warning - add link to supported card y the ROCm driver --- doc/compile_Linux.md | 4 +--- doc/compile_Windows.md | 3 --- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/doc/compile_Linux.md b/doc/compile_Linux.md index ebf115430..6c80bc56a 100644 --- a/doc/compile_Linux.md +++ b/doc/compile_Linux.md @@ -9,10 +9,8 @@ - run `./amdgpu-pro-install --opencl=legacy,pal` from the unzipped folder - set the environment variable to opencl `export AMDAPPSDKROOT=/opt/amdgpu-pro/` -**ATTENTION** The linux driver 18.3 creating invalid shares. -If you have an issue with `invalid shares` please downgrade your driver or switch to ROCm. - For linux also the OpenSource driver ROCm 1.9.X+ is a well working alternative, see https://rocm.github.io/ROCmInstall.html +ROCm is not supporting old GPUs please check if your GPU is supported https://rocm.github.io/hardware.html. ### Cuda 8.0+ (only needed to use NVIDIA GPUs) diff --git a/doc/compile_Windows.md b/doc/compile_Windows.md index 8fe4dcf53..64d68bab1 100644 --- a/doc/compile_Windows.md +++ b/doc/compile_Windows.md @@ -34,9 +34,6 @@ - Download & install the AMD driver: https://www.amd.com/en/support -**ATTENTION** Many windows driver 18.5+ creating invalid shares. -If you have an issue with `invalid shares` please downgrade your driver. - - Download and install the latest version of the OCL-SDK from https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/releases Do not follow old information that you need the AMD APP SDK. AMD has removed the APP SDK and is now shipping the OCL-SDK_light. From 62b305a73cf8be50c863d8cdb356141b8d2925bb Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sat, 2 Feb 2019 23:28:09 +0100 Subject: [PATCH 19/19] update version to 2.8.0 --- xmrstak/version.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp index 88e3a5cad..84fd048bd 100644 --- a/xmrstak/version.cpp +++ b/xmrstak/version.cpp @@ -18,7 +18,7 @@ #endif #define XMR_STAK_NAME "xmr-stak" -#define XMR_STAK_VERSION "2.7.1" +#define XMR_STAK_VERSION "2.8.0" #if defined(_WIN32) #define OS_TYPE "win"
Currency%s
Difficulty%u
Good results%u / %u (%.1f %%)
Avg result time%.1f sec