Skip to content

Commit

Permalink
Merge branch 'dev'
Browse files Browse the repository at this point in the history
  • Loading branch information
xmrig committed Jul 16, 2020
2 parents 9f8fc5a + 40e5f82 commit 4234fa1
Show file tree
Hide file tree
Showing 6 changed files with 55 additions and 27 deletions.
1 change: 0 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ option(WITH_RANDOMX "Enable RandomX algorithms family" ON)

include_directories(src)
add_definitions(/DXMRIG_ALGO_CN_LITE /DXMRIG_ALGO_CN_HEAVY /DXMRIG_ALGO_CN_PICO)
add_definitions(/DCUB_IGNORE_DEPRECATED_CPP_DIALECT)


include(cmake/cpu.cmake)
Expand Down
15 changes: 12 additions & 3 deletions cmake/CUDA.cmake
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
add_definitions(-DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT)

option(XMRIG_LARGEGRID "Support large CUDA block count > 128" ON)
if (XMRIG_LARGEGRID)
add_definitions("-DXMRIG_LARGEGRID=${XMRIG_LARGEGRID}")
Expand All @@ -23,13 +25,19 @@ find_library(CUDA_NVRTC_LIB libnvrtc nvrtc HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64

set(LIBS ${LIBS} ${CUDA_LIBRARIES} ${CUDA_LIB} ${CUDA_NVRTC_LIB})

set(DEFAULT_CUDA_ARCH "30;50")
set(DEFAULT_CUDA_ARCH "50")

# Fermi GPUs are only supported with CUDA < 9.0
if (CUDA_VERSION VERSION_LESS 9.0)
list(APPEND DEFAULT_CUDA_ARCH "20;21")
endif()

if (CUDA_VERSION VERSION_LESS 11.0)
list(APPEND DEFAULT_CUDA_ARCH "30")
else()
list(APPEND DEFAULT_CUDA_ARCH "35")
endif()

# add Pascal support for CUDA >= 8.0
if (NOT CUDA_VERSION VERSION_LESS 8.0)
list(APPEND DEFAULT_CUDA_ARCH "60")
Expand Down Expand Up @@ -85,9 +93,10 @@ elseif("${CUDA_COMPILER}" STREQUAL "nvcc")
if (CUDA_VERSION VERSION_LESS 8.0)
add_definitions(-D_FORCE_INLINES)
add_definitions(-D_MWAITXINTRIN_H_INCLUDED)
elseif(CUDA_VERSION VERSION_LESS 9.0)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Wno-deprecated-gpu-targets")
endif()

set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Wno-deprecated-gpu-targets")

foreach(CUDA_ARCH_ELEM ${CUDA_ARCH})
# set flags to create device code for the given architecture
if("${CUDA_ARCH_ELEM}" STREQUAL "21")
Expand Down
2 changes: 1 addition & 1 deletion src/cryptonight.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ struct nvid_ctx {
bool ready = false;

uint32_t *d_input = nullptr;
uint32_t inputlen = 0;
int inputlen = 0;
uint32_t *d_result_count = nullptr;
uint32_t *d_result_nonce = nullptr;
uint32_t *d_long_state = nullptr;
Expand Down
39 changes: 26 additions & 13 deletions src/cuda_extra.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ typedef unsigned long long DataLength;
#include "cuda_aes.hpp"
#include "crypto/cn/CnAlgo.h"

// POW block format http://monero.wikia.com/wiki/PoW_Block_Header_Format
// Buffer increased to 384 bytes to accomodate the Haven offshore pricing_record
// Round it up to 408 (136*3) for a convenient keccak calculation
static constexpr size_t kMaxBlobSize = 408;

__constant__ uint8_t d_sub_byte[16][16] ={
{0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 },
{0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 },
Expand Down Expand Up @@ -119,7 +124,7 @@ template<xmrig::Algorithm::Id ALGO>
__global__ void cryptonight_extra_gpu_prepare(
int threads,
uint32_t *__restrict__ d_input,
uint32_t len,
int len,
uint32_t startNonce,
uint32_t *__restrict__ d_ctx_state,
uint32_t *__restrict__ d_ctx_state2,
Expand Down Expand Up @@ -148,15 +153,19 @@ __global__ void cryptonight_extra_gpu_prepare(
uint32_t ctx_b[4];
uint32_t ctx_key1[40];
uint32_t ctx_key2[40];
uint32_t input[32];

memcpy(input, d_input, len);
uint32_t nonce = startNonce + thread;
for (int i = 0; i < sizeof (uint32_t ); ++i) {
(((char *)input) + 39)[i] = ((char*) (&nonce))[i]; //take care of pointer alignment
{
uint64_t input[kMaxBlobSize / sizeof(uint64_t)];

memcpy(input, d_input, len);
uint32_t nonce = startNonce + thread;
for (int i = 0; i < sizeof(uint32_t); ++i) {
(((char *)input) + 39)[i] = ((char*)(&nonce))[i]; //take care of pointer alignment
}

cn_keccak(input, len, (uint8_t *)ctx_state);
}

cn_keccak((uint8_t *) input, len, (uint8_t *) ctx_state);
cryptonight_aes_set_key(ctx_key1, ctx_state);
cryptonight_aes_set_key(ctx_key2, ctx_state + 8);

Expand Down Expand Up @@ -279,13 +288,18 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3

void cryptonight_extra_cpu_set_data(nvid_ctx *ctx, const void *data, size_t len)
{
ctx->inputlen = static_cast<unsigned int>(len);
uint8_t buf[kMaxBlobSize];

const int inlen = static_cast<int>(len + 136 - (len % 136));

// Use temporary 200 byte buffer with zeros in the end (required for AstroBWT)
uint8_t buf[200] = {};
memcpy(buf, data, len);
buf[len] = 1;
memset(buf + len + 1, 0, inlen - len - 1);
buf[inlen - 1] |= 0x80;

ctx->inputlen = static_cast<unsigned int>(inlen);

CUDA_CHECK(ctx->device_id, cudaMemcpy(ctx->d_input, buf, sizeof(buf), cudaMemcpyHostToDevice));
CUDA_CHECK(ctx->device_id, cudaMemcpy(ctx->d_input, buf, ctx->inputlen, cudaMemcpyHostToDevice));
}


Expand Down Expand Up @@ -342,8 +356,7 @@ int cryptonight_extra_cpu_init(nvid_ctx *ctx, const xmrig::Algorithm &algorithm,
ctx->d_ctx_state2 = ctx->d_ctx_state;
}

// POW block format http://monero.wikia.com/wiki/PoW_Block_Header_Format
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 200));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, kMaxBlobSize));
if (algorithm.family() != Algorithm::KAWPOW) {
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof(uint32_t)));
}
Expand Down
19 changes: 13 additions & 6 deletions src/cuda_keccak.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,16 +150,23 @@ __device__ __forceinline__ void cn_keccakf(uint64_t *s)
}
}

__device__ __forceinline__ void cn_keccak(const uint8_t * __restrict__ in, uint32_t len, uint8_t * __restrict__ md)
__device__ __forceinline__ void cn_keccak(const uint64_t * __restrict__ input, int inlen, uint8_t * __restrict__ md)
{
uint64_t st[25];

MEMSET8(st + 8, 0x00, 25 - 8);
memcpy(st, in, len);
((uint8_t*)st)[len] = 0x01;
st[16] = 0x8000000000000000ULL;
#pragma unroll
for (int i = 0; i < 25; ++i) {
st[i] = 0;
}

cn_keccakf(st);
// Input length must be a multiple of 136 and padded on the host side
for (int i = 0; inlen > 0; i += 17, inlen -= 136) {
#pragma unroll
for (int j = 0; j < 17; ++j) {
st[j] ^= input[i + j];
}
cn_keccakf(st);
}

MEMCPY8(md, st, 25);
return;
Expand Down
6 changes: 3 additions & 3 deletions src/version.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@
#define APP_ID "xmrig-cuda"
#define APP_NAME "XMRig"
#define APP_DESC "XMRig CUDA plugin"
#define APP_VERSION "6.2.1"
#define APP_VERSION "6.3.0"
#define APP_DOMAIN "xmrig.com"
#define APP_SITE "www.xmrig.com"
#define APP_COPYRIGHT "Copyright (C) 2016-2020 xmrig.com"

#define APP_VER_MAJOR 6
#define APP_VER_MINOR 2
#define APP_VER_PATCH 1
#define APP_VER_MINOR 3
#define APP_VER_PATCH 0

#define API_VERSION 3

Expand Down

0 comments on commit 4234fa1

Please sign in to comment.