Merge branch 'dev'

xmrig · Jul 16, 2020 · 4234fa1 · 4234fa1
2 parents 9f8fc5a + 40e5f82
commit 4234fa1
Show file tree

Hide file tree

Showing 6 changed files with 55 additions and 27 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -7,7 +7,6 @@ option(WITH_RANDOMX "Enable RandomX algorithms family" ON)
 
 include_directories(src)
 add_definitions(/DXMRIG_ALGO_CN_LITE /DXMRIG_ALGO_CN_HEAVY /DXMRIG_ALGO_CN_PICO)
-add_definitions(/DCUB_IGNORE_DEPRECATED_CPP_DIALECT)
 
 
 include(cmake/cpu.cmake)

diff --git a/cmake/CUDA.cmake b/cmake/CUDA.cmake
@@ -1,3 +1,5 @@
+add_definitions(-DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT)
+
 option(XMRIG_LARGEGRID "Support large CUDA block count > 128" ON)
 if (XMRIG_LARGEGRID)
     add_definitions("-DXMRIG_LARGEGRID=${XMRIG_LARGEGRID}")
@@ -23,13 +25,19 @@ find_library(CUDA_NVRTC_LIB libnvrtc nvrtc HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64
 
 set(LIBS ${LIBS} ${CUDA_LIBRARIES} ${CUDA_LIB} ${CUDA_NVRTC_LIB})
 
-set(DEFAULT_CUDA_ARCH "30;50")
+set(DEFAULT_CUDA_ARCH "50")
 
 # Fermi GPUs are only supported with CUDA < 9.0
 if (CUDA_VERSION VERSION_LESS 9.0)
     list(APPEND DEFAULT_CUDA_ARCH "20;21")
 endif()
 
+if (CUDA_VERSION VERSION_LESS 11.0)
+    list(APPEND DEFAULT_CUDA_ARCH "30")
+else()
+    list(APPEND DEFAULT_CUDA_ARCH "35")
+endif()
+
 # add Pascal support for CUDA >= 8.0
 if (NOT CUDA_VERSION VERSION_LESS 8.0)
     list(APPEND DEFAULT_CUDA_ARCH "60")
@@ -85,9 +93,10 @@ elseif("${CUDA_COMPILER}" STREQUAL "nvcc")
     if (CUDA_VERSION VERSION_LESS 8.0)
         add_definitions(-D_FORCE_INLINES)
         add_definitions(-D_MWAITXINTRIN_H_INCLUDED)
-    elseif(CUDA_VERSION VERSION_LESS 9.0)
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Wno-deprecated-gpu-targets")
     endif()
+
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Wno-deprecated-gpu-targets")
+
     foreach(CUDA_ARCH_ELEM ${CUDA_ARCH})
         # set flags to create device code for the given architecture
         if("${CUDA_ARCH_ELEM}" STREQUAL "21")

diff --git a/src/cryptonight.h b/src/cryptonight.h
@@ -60,7 +60,7 @@ struct nvid_ctx {
     bool ready                          = false;
 
     uint32_t *d_input                   = nullptr;
-    uint32_t inputlen                   = 0;
+    int inputlen                        = 0;
     uint32_t *d_result_count            = nullptr;
     uint32_t *d_result_nonce            = nullptr;
     uint32_t *d_long_state              = nullptr;

diff --git a/src/cuda_extra.cu b/src/cuda_extra.cu
@@ -45,6 +45,11 @@ typedef unsigned long long DataLength;
 #include "cuda_aes.hpp"
 #include "crypto/cn/CnAlgo.h"
 
+// POW block format http://monero.wikia.com/wiki/PoW_Block_Header_Format
+// Buffer increased to 384 bytes to accomodate the Haven offshore pricing_record
+// Round it up to 408 (136*3) for a convenient keccak calculation
+static constexpr size_t kMaxBlobSize = 408;
+
 __constant__ uint8_t d_sub_byte[16][16] ={
     {0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 },
     {0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 },
@@ -119,7 +124,7 @@ template<xmrig::Algorithm::Id ALGO>
 __global__ void cryptonight_extra_gpu_prepare(
         int threads,
         uint32_t *__restrict__ d_input,
-        uint32_t len,
+        int len,
         uint32_t startNonce,
         uint32_t *__restrict__ d_ctx_state,
         uint32_t *__restrict__ d_ctx_state2,
@@ -148,15 +153,19 @@ __global__ void cryptonight_extra_gpu_prepare(
     uint32_t ctx_b[4];
     uint32_t ctx_key1[40];
     uint32_t ctx_key2[40];
-    uint32_t input[32];
 
-    memcpy(input, d_input, len);
-    uint32_t nonce = startNonce + thread;
-    for (int i = 0; i < sizeof (uint32_t ); ++i) {
-        (((char *)input) + 39)[i] = ((char*) (&nonce))[i]; //take care of pointer alignment
+    {
+        uint64_t input[kMaxBlobSize / sizeof(uint64_t)];
+
+        memcpy(input, d_input, len);
+        uint32_t nonce = startNonce + thread;
+        for (int i = 0; i < sizeof(uint32_t); ++i) {
+            (((char *)input) + 39)[i] = ((char*)(&nonce))[i]; //take care of pointer alignment
+        }
+
+        cn_keccak(input, len, (uint8_t *)ctx_state);
     }
 
-    cn_keccak((uint8_t *) input, len, (uint8_t *) ctx_state);
     cryptonight_aes_set_key(ctx_key1, ctx_state);
     cryptonight_aes_set_key(ctx_key2, ctx_state + 8);
 
@@ -279,13 +288,18 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 
 void cryptonight_extra_cpu_set_data(nvid_ctx *ctx, const void *data, size_t len)
 {
-    ctx->inputlen = static_cast<unsigned int>(len);
+    uint8_t buf[kMaxBlobSize];
+
+    const int inlen = static_cast<int>(len + 136 - (len % 136));
 
-    // Use temporary 200 byte buffer with zeros in the end (required for AstroBWT)
-    uint8_t buf[200] = {};
     memcpy(buf, data, len);
+    buf[len] = 1;
+    memset(buf + len + 1, 0, inlen - len - 1);
+    buf[inlen - 1] |= 0x80;
+
+    ctx->inputlen = static_cast<unsigned int>(inlen);
 
-    CUDA_CHECK(ctx->device_id, cudaMemcpy(ctx->d_input, buf, sizeof(buf), cudaMemcpyHostToDevice));
+    CUDA_CHECK(ctx->device_id, cudaMemcpy(ctx->d_input, buf, ctx->inputlen, cudaMemcpyHostToDevice));
 }
 
 
@@ -342,8 +356,7 @@ int cryptonight_extra_cpu_init(nvid_ctx *ctx, const xmrig::Algorithm &algorithm,
         ctx->d_ctx_state2 = ctx->d_ctx_state;
     }
 
-    // POW block format http://monero.wikia.com/wiki/PoW_Block_Header_Format
-    CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input,        200));
+    CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, kMaxBlobSize));
     if (algorithm.family() != Algorithm::KAWPOW) {
         CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof(uint32_t)));
     }

diff --git a/src/cuda_keccak.hpp b/src/cuda_keccak.hpp
@@ -150,16 +150,23 @@ __device__ __forceinline__ void cn_keccakf(uint64_t *s)
 	}
 }
 
-__device__ __forceinline__ void cn_keccak(const uint8_t * __restrict__ in, uint32_t len, uint8_t * __restrict__ md)
+__device__ __forceinline__ void cn_keccak(const uint64_t * __restrict__ input, int inlen, uint8_t * __restrict__ md)
 {
 	uint64_t st[25];
 
-	MEMSET8(st + 8, 0x00, 25 - 8);
-	memcpy(st, in, len);
-	((uint8_t*)st)[len] = 0x01;
-	st[16] = 0x8000000000000000ULL;
+	#pragma unroll
+	for (int i = 0; i < 25; ++i) {
+		st[i] = 0;
+	}
 
-	cn_keccakf(st);
+	// Input length must be a multiple of 136 and padded on the host side
+	for (int i = 0; inlen > 0; i += 17, inlen -= 136) {
+		#pragma unroll
+		for (int j = 0; j < 17; ++j) {
+			st[j] ^= input[i + j];
+		}
+		cn_keccakf(st);
+	}
 
 	MEMCPY8(md, st, 25);
 	return;

diff --git a/src/version.h b/src/version.h
@@ -28,14 +28,14 @@
 #define APP_ID        "xmrig-cuda"
 #define APP_NAME      "XMRig"
 #define APP_DESC      "XMRig CUDA plugin"
-#define APP_VERSION   "6.2.1"
+#define APP_VERSION   "6.3.0"
 #define APP_DOMAIN    "xmrig.com"
 #define APP_SITE      "www.xmrig.com"
 #define APP_COPYRIGHT "Copyright (C) 2016-2020 xmrig.com"
 
 #define APP_VER_MAJOR  6
-#define APP_VER_MINOR  2
-#define APP_VER_PATCH  1
+#define APP_VER_MINOR  3
+#define APP_VER_PATCH  0
 
 #define API_VERSION    3