From 5452725f1ae50b00a3143c2a717a5e836fffe259 Mon Sep 17 00:00:00 2001
From: Tony Butler <spudz76@gmail.com>
Date: Tue, 18 Dec 2018 19:04:16 -0700
Subject: [PATCH 01/19] Add Rig ID to connection status web+cli

---
 xmrstak/http/webdesign.cpp | 1 +
 xmrstak/misc/executor.cpp  | 2 ++
 xmrstak/net/jpsock.hpp     | 1 +
 3 files changed, 4 insertions(+)
diff --git a/xmrstak/http/webdesign.cpp b/xmrstak/http/webdesign.cpp
index 93e217519..30a844349 100644
--- a/xmrstak/http/webdesign.cpp
+++ b/xmrstak/http/webdesign.cpp
@@ -168,6 +168,7 @@ extern const char sHtmlHashrateBodyLow [] =
 extern const char sHtmlConnectionBodyHigh [] =
 	"<div class='data'>"
 	"<table>"
+		"<tr><th>Rig ID</th><td>%s</td></tr>"
 		"<tr><th>Pool address</th><td>%s</td></tr>"
 		"<tr><th>Connected since</th><td>%s</td></tr>"
 		"<tr><th>Pool ping time</th><td>%u ms</td></tr>"
diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp
index a303b34cd..c77645ff6 100644
--- a/xmrstak/misc/executor.cpp
+++ b/xmrstak/misc/executor.cpp
@@ -945,6 +945,7 @@ void executor::connection_report(std::string& out)
 		pool = pick_pool_by_id(last_usr_pool_id);
 
 	out.append("CONNECTION REPORT\n");
+	out.append("Rig ID          : ").append(pool != nullptr ? pool->get_rigid() : "").append(1, '\n');
 	out.append("Pool address    : ").append(pool != nullptr ? pool->get_pool_addr() : "<not connected>").append(1, '\n');
 	if(pool != nullptr && pool->is_running() && pool->is_logged_in())
 		out.append("Connected since : ").append(time_format(date, sizeof(date), tPoolConnTime)).append(1, '\n');
@@ -1145,6 +1146,7 @@ void executor::http_connection_report(std::string& out)
 	}
 
 	snprintf(buffer, sizeof(buffer), sHtmlConnectionBodyHigh,
+		pool != nullptr ? pool->get_rigid() : "",
 		pool != nullptr ? pool->get_pool_addr() : "not connected",
 		cdate, ping_time);
 	out.append(buffer);
diff --git a/xmrstak/net/jpsock.hpp b/xmrstak/net/jpsock.hpp
index ad34f6c86..96fec6b98 100644
--- a/xmrstak/net/jpsock.hpp
+++ b/xmrstak/net/jpsock.hpp
@@ -58,6 +58,7 @@ class jpsock
 	inline bool get_disconnects(size_t& att, size_t& time) { att = connect_attempts; time = disconnect_time != 0 ? get_timestamp() - disconnect_time + 1 : 0; return pool && usr_login[0]; }
 	inline const char* get_pool_addr() { return net_addr.c_str(); }
 	inline const char* get_tls_fp() { return tls_fp.c_str(); }
+	inline const char* get_rigid() { return usr_rigid.c_str(); }
 	inline bool is_nicehash() { return nicehash; }
 
 	bool get_pool_motd(std::string& strin);

From 0e034f4ab4be2f563e030fde14a659065ddefda9 Mon Sep 17 00:00:00 2001
From: Tony Butler <spudz76@gmail.com>
Date: Tue, 18 Dec 2018 19:15:24 -0700
Subject: [PATCH 02/19] Add backend thread tags to Thread IDs on Hashrate
 Report (web)

---
 xmrstak/http/webdesign.cpp |  2 +-
 xmrstak/misc/executor.cpp  | 20 +++++++++++++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/xmrstak/http/webdesign.cpp b/xmrstak/http/webdesign.cpp
index 93e217519..f58698e7e 100644
--- a/xmrstak/http/webdesign.cpp
+++ b/xmrstak/http/webdesign.cpp
@@ -157,7 +157,7 @@ extern const char sHtmlHashrateBodyHigh [] =
 		"<tr><th>Thread ID</th><th>10s</th><th>60s</th><th>15m</th><th rowspan='%u'>H/s</td></tr>";
 
 extern const char sHtmlHashrateTableRow [] =
-	"<tr><th>%u</th><td>%s</td><td>%s</td><td>%s</td></tr>";
+	"<tr><th>%s</th><td>%s</td><td>%s</td><td>%s</td></tr>";
 
 extern const char sHtmlHashrateBodyLow [] =
 		"<tr><th>Totals:</th><td>%s</td><td>%s</td><td>%s</td></tr>"
diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp
index a303b34cd..5d5b8948b 100644
--- a/xmrstak/misc/executor.cpp
+++ b/xmrstak/misc/executor.cpp
@@ -1040,9 +1040,27 @@ void executor::http_hashrate_report(std::string& out)
 	out.append(buffer);
 
 	double fTotal[3] = { 0.0, 0.0, 0.0};
+	auto bTypePrev = static_cast<xmrstak::iBackend::BackendType>(0);
+	std::string name;
+	size_t j = 0;
 	for(size_t i=0; i < nthd; i++)
 	{
 		double fHps[3];
+		char csThreadTag[25];
+		auto bType = static_cast<xmrstak::iBackend::BackendType>(pvThreads->at(i)->backendType);
+		if(bTypePrev == bType)
+			j++;
+		else
+		{
+			j = 0;
+			bTypePrev = bType;
+			name = xmrstak::iBackend::getName(bType);
+			std::transform(name.begin(), name.end(), name.begin(), ::toupper);
+		}
+		snprintf(csThreadTag, sizeof(csThreadTag),
+			(99 < nthd) ? "[%s.%03u]:%03u" : ((9 < nthd) ? "[%s.%02u]:%02u" : "[%s.%u]:%u"),
+			name.c_str(), (unsigned int)(j), (unsigned int)i
+		);
 
 		fHps[0] = telem->calc_telemetry_data(10000, i);
 		fHps[1] = telem->calc_telemetry_data(60000, i);
@@ -1057,7 +1075,7 @@ void executor::http_hashrate_report(std::string& out)
 		fTotal[1] += fHps[1];
 		fTotal[2] += fHps[2];
 
-		snprintf(buffer, sizeof(buffer), sHtmlHashrateTableRow, (unsigned int)i, num_a, num_b, num_c);
+		snprintf(buffer, sizeof(buffer), sHtmlHashrateTableRow, csThreadTag, num_a, num_b, num_c);
 		out.append(buffer);
 	}
 

From 72c8dacfaa1c8754402fe1d3ba64015cc24ddc77 Mon Sep 17 00:00:00 2001
From: Tony Butler <spudz76@gmail.com>
Date: Tue, 18 Dec 2018 19:37:34 -0700
Subject: [PATCH 03/19] Add currency to results report web+cli

---
 xmrstak/http/webdesign.cpp | 1 +
 xmrstak/misc/executor.cpp  | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/xmrstak/http/webdesign.cpp b/xmrstak/http/webdesign.cpp
index 93e217519..27653bba4 100644
--- a/xmrstak/http/webdesign.cpp
+++ b/xmrstak/http/webdesign.cpp
@@ -185,6 +185,7 @@ extern const char sHtmlConnectionBodyLow [] =
 extern const char sHtmlResultBodyHigh [] =
 	"<div class='data'>"
 	"<table>"
+		"<tr><th>Currency</th><td>%s</td></tr>"
 		"<tr><th>Difficulty</th><td>%u</td></tr>"
 		"<tr><th>Good results</th><td>%u / %u (%.1f %%)</td></tr>"
 		"<tr><th>Avg result time</th><td>%.1f sec</td></tr>"
diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp
index a303b34cd..c0054ce43 100644
--- a/xmrstak/misc/executor.cpp
+++ b/xmrstak/misc/executor.cpp
@@ -884,6 +884,8 @@ void executor::result_report(std::string& out)
 		iTotalRes += vMineResults[i].count;
 
 	out.append("RESULT REPORT\n");
+	out.append("Currency         : ").
+		append(jconf::inst()->GetMiningCoin()).append("\n");
 	if(iTotalRes == 0)
 	{
 		out.append("You haven't found any results yet.\n");
@@ -1100,6 +1102,7 @@ void executor::http_result_report(std::string& out)
 	}
 
 	snprintf(buffer, sizeof(buffer), sHtmlResultBodyHigh,
+		jconf::inst()->GetMiningCoin().c_str(),
 		iPoolDiff, iGoodRes, iTotalRes, fGoodResPrc, fAvgResTime, iPoolHashes,
 		int_port(iTopDiff[0]), int_port(iTopDiff[1]), int_port(iTopDiff[2]), int_port(iTopDiff[3]),
 		int_port(iTopDiff[4]), int_port(iTopDiff[5]), int_port(iTopDiff[6]), int_port(iTopDiff[7]),

From a39ee0886cf613b70490164c4f33b066230709bc Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sat, 29 Dec 2018 20:53:43 +0100
Subject: [PATCH 04/19] OpenCL: allow more than two algorithms

In the current implementation the POW algorithm in dev pool section of a
currency will not be taken into account during the binary creation.
This PR changes the behavior and allow to create binaries for more than two POW algorihms.
---
 xmrstak/backend/amd/amd_gpu/gpu.cpp | 132 ++++++++++++----------------
 xmrstak/backend/amd/amd_gpu/gpu.hpp |   6 +-
 2 files changed, 61 insertions(+), 77 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 408cad97a..e4fb765e7 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -390,18 +390,26 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		return ERR_OCL_API;
 	}
 
-	xmrstak_algo miner_algo[2] = {
+	std::array<xmrstak_algo, 4> selectedAlgos = {
+	    ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo(),
+		::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot(),
 		::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo(),
 		::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()
 	};
-	int num_algos = miner_algo[0] == miner_algo[1] ? 1 : 2;
 
-	for(int ii = 0; ii < num_algos; ++ii)
+	for(int ii = 0; ii < selectedAlgos.size(); ++ii)
 	{
+		xmrstak_algo miner_algo = selectedAlgos[ii];
+		bool alreadyCompiled = ctx->Program.find(miner_algo) != ctx->Program.end();
+		if(alreadyCompiled)
+		{
+			printer::inst()->print_msg(L1,"OpenCL device %u - Skip %u",ctx->deviceIdx, (uint32_t)miner_algo);
+			continue;
+		}
 		// scratchpad size for the selected mining algorithm
-		size_t hashMemSize = cn_select_memory(miner_algo[ii]);
-		int threadMemMask = cn_select_mask(miner_algo[ii]);
-		int hashIterations = cn_select_iter(miner_algo[ii]);
+		size_t hashMemSize = cn_select_memory(miner_algo);
+		int threadMemMask = cn_select_mask(miner_algo);
+		int hashIterations = cn_select_iter(miner_algo);
 
 		size_t mem_chunk_exp = 1u << ctx->memChunk;
 		size_t strided_index = ctx->stridedIndex;
@@ -409,7 +417,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		 * this is required if the dev pool is mining monero
 		 * but the user tuned there settings for another currency
 		 */
-		if(miner_algo[ii] == cryptonight_monero_v8)
+		if(miner_algo == cryptonight_monero_v8)
 		{
 			if(ctx->memChunk < 2)
 				mem_chunk_exp = 1u << 2;
@@ -428,7 +436,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp) + "U";
 		options += " -DCOMP_MODE=" + std::to_string(needCompMode);
 		options += " -DMEMORY=" + std::to_string(hashMemSize) + "LU";
-		options += " -DALGO=" + std::to_string(miner_algo[ii]);
+		options += " -DALGO=" + std::to_string(miner_algo);
 		options += " -DCN_UNROLL=" + std::to_string(ctx->unroll);
 		/* AMD driver output is something like: `1445.5 (VM)`
 		 * and is mapped to `14` only. The value is only used for a compiler
@@ -457,20 +465,20 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		{
 			if(xmrstak::params::inst().AMDCache)
 				printer::inst()->print_msg(L1,"OpenCL device %u - Precompiled code %s not found. Compiling ...",ctx->deviceIdx, cache_file.c_str());
-			ctx->Program[ii] = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret);
+			ctx->Program[miner_algo] = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret);
 			if(ret != CL_SUCCESS)
 			{
 				printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret));
 				return ERR_OCL_API;
 			}
 
-			ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, options.c_str(), NULL, NULL);
+			ret = clBuildProgram(ctx->Program[miner_algo], 1, &ctx->DeviceID, options.c_str(), NULL, NULL);
 			if(ret != CL_SUCCESS)
 			{
 				size_t len;
 				printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret));
 
-				if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS)
+				if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS)
 				{
 					printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret));
 					return ERR_OCL_API;
@@ -479,7 +487,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 				char* BuildLog = (char*)malloc(len + 1);
 				BuildLog[0] = '\0';
 
-				if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS)
+				if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS)
 				{
 					free(BuildLog);
 					printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret));
@@ -494,11 +502,11 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 			}
 
 			cl_uint num_devices;
-			clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices,NULL);
+			clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices,NULL);
 
 
 			std::vector<cl_device_id> devices_ids(num_devices);
-			clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL);
+			clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL);
 			int dev_id = 0;
 			/* Search for the gpu within the program context.
 			 * The id can be different to  ctx->DeviceID.
@@ -513,7 +521,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 			cl_build_status status;
 			do
 			{
-				if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS)
+				if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS)
 				{
 					printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret));
 					return ERR_OCL_API;
@@ -525,7 +533,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 			if(xmrstak::params::inst().AMDCache)
 			{
 				std::vector<size_t> binary_sizes(num_devices);
-				clGetProgramInfo (ctx->Program[ii], CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL);
+				clGetProgramInfo (ctx->Program[miner_algo], CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL);
 
 				std::vector<char*> all_programs(num_devices);
 				std::vector<std::vector<char>> program_storage;
@@ -541,7 +549,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 					p_id++;
 				}
 
-				if((ret = clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS)
+				if((ret = clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS)
 				{
 					printer::inst()->print_msg(L1,"Error %s when calling clGetProgramInfo.", err_to_str(ret));
 					return ERR_OCL_API;
@@ -565,7 +573,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 			auto data_ptr = s.data();
 
 			cl_int clStatus;
-			ctx->Program[ii] = clCreateProgramWithBinary(
+			ctx->Program[miner_algo] = clCreateProgramWithBinary(
 				opencl_ctx, 1, &ctx->DeviceID, &bin_size,
 				(const unsigned char **)&data_ptr, &clStatus, &ret
 			);
@@ -574,7 +582,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 				printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str());
 				return ERR_OCL_API;
 			}
-			ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, NULL, NULL, NULL);
+			ret = clBuildProgram(ctx->Program[miner_algo], 1, &ctx->DeviceID, NULL, NULL, NULL);
 			if(ret != CL_SUCCESS)
 			{
 				printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str());
@@ -585,37 +593,16 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		std::vector<std::string> KernelNames = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein" };
 		// append algorithm number to kernel name
 		for(int k = 0; k < 3; k++)
-			KernelNames[k] += std::to_string(miner_algo[ii]);
+			KernelNames[k] += std::to_string(miner_algo);
 
-		if(ii == 0)
-		{
-			for(int i = 0; i < 7; ++i)
-			{
-				ctx->Kernels[ii][i] = clCreateKernel(ctx->Program[ii], KernelNames[i].c_str(), &ret);
-				if(ret != CL_SUCCESS)
-				{
-					printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_0 %s.", err_to_str(ret), KernelNames[i].c_str());
-					return ERR_OCL_API;
-				}
-			}
-		}
-		else
+		for(int i = 0; i < 7; ++i)
 		{
-			for(int i = 0; i < 3; ++i)
-			{
-				ctx->Kernels[ii][i] = clCreateKernel(ctx->Program[ii], KernelNames[i].c_str(), &ret);
-				if(ret != CL_SUCCESS)
-				{
-					printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_1 %s.", err_to_str(ret), KernelNames[i].c_str());
-					return ERR_OCL_API;
-				}
-			}
-			// move kernel from the main algorithm into the root algorithm kernel space
-			for(int i = 3; i < 7; ++i)
+			ctx->Kernels[miner_algo][i] = clCreateKernel(ctx->Program[miner_algo], KernelNames[i].c_str(), &ret);
+			if(ret != CL_SUCCESS)
 			{
-				ctx->Kernels[ii][i] = ctx->Kernels[0][i];
+				printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_0 %s.", err_to_str(ret), KernelNames[i].c_str());
+				return ERR_OCL_API;
 			}
-
 		}
 	}
 	ctx->Nonce = 0;
@@ -996,8 +983,6 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 
 size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo)
 {
-	// switch to the kernel storage
-	int kernel_storage = miner_algo == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ? 0 : 1;
 
 	cl_int ret;
 
@@ -1015,28 +1000,28 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 		return ERR_OCL_API;
 	}
 
-	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Scratchpads
-	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
-	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
-	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 3, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][0], 3, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret));
 		return(ERR_OCL_API);
@@ -1045,21 +1030,21 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 	// CN1 Kernel
 
 	// Scratchpads
-	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
-	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
-	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 2, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][1], 2, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret));
 		return(ERR_OCL_API);
@@ -1068,7 +1053,7 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 	if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon || miner_algo == cryptonight_ipbc || miner_algo == cryptonight_stellite || miner_algo == cryptonight_masari || miner_algo == cryptonight_bittube2)
 	{
 		// Input
-		if ((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
+		if ((ret = clSetKernelArg(ctx->Kernels[miner_algo][1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 4(input buffer).", err_to_str(ret));
 			return ERR_OCL_API;
@@ -1077,49 +1062,49 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 
 	// CN3 Kernel
 	// Scratchpads
-	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
-	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 0
-	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 1
-	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 2
-	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 3
-	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
-	if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret));
 		return(ERR_OCL_API);
@@ -1128,34 +1113,34 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 	for(int i = 0; i < 4; ++i)
 	{
 		// States
-		if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+		if((ret = clSetKernelArg(ctx->Kernels[miner_algo][i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0);
 			return ERR_OCL_API;
 		}
 
 		// Nonce buffer
-		if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS)
+		if((ret = clSetKernelArg(ctx->Kernels[miner_algo][i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1);
 			return ERR_OCL_API;
 		}
 
 		// Output
-		if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS)
+		if((ret = clSetKernelArg(ctx->Kernels[miner_algo][i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2);
 			return ERR_OCL_API;
 		}
 
 		// Target
-		if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS)
+		if((ret = clSetKernelArg(ctx->Kernels[miner_algo][i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3);
 			return ERR_OCL_API;
 		}
 
-		if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
+		if((clSetKernelArg(ctx->Kernels[miner_algo][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4);
 			return(ERR_OCL_API);
@@ -1258,9 +1243,6 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment)
 
 size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 {
-	// switch to the kernel storage
-	int kernel_storage = miner_algo == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ? 0 : 1;
-
 	cl_int ret;
 	cl_uint zero = 0;
 	size_t BranchNonces[4];
@@ -1294,7 +1276,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 	}
 
 	size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { 8, 8 };
-	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
+	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[miner_algo][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0);
 		return ERR_OCL_API;
@@ -1302,13 +1284,13 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 
 	size_t tmpNonce = ctx->Nonce;
 
-	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
+	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[miner_algo][1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1);
 		return ERR_OCL_API;
 	}
 
-	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
+	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[miner_algo][2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2);
 		return ERR_OCL_API;
@@ -1317,7 +1299,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 	for(int i = 0; i < 4; ++i)
 	{
 		size_t tmpNonce = ctx->Nonce;
-		if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
+		if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[miner_algo][i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
 			return ERR_OCL_API;
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index 80fcbefde..5e9f618ed 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -14,6 +14,8 @@
 #include <vector>
 #include <mutex>
 #include <memory>
+#include <map>
+#include <array>
 
 #define ERR_SUCCESS (0)
 #define ERR_OCL_API (2)
@@ -50,8 +52,8 @@ struct GpuContext
 	cl_mem InputBuffer;
 	cl_mem OutputBuffer;
 	cl_mem ExtraBuffers[6];
-	cl_program Program[2];
-	cl_kernel Kernels[2][8];
+	std::map<xmrstak_algo, cl_program> Program;
+	std::map<xmrstak_algo, std::array<cl_kernel,8>> Kernels;
 	size_t freeMem;
 	size_t maxMemPerAlloc;
 	int computeUnits;

From 758dbfb14b4ca54e44ba9dca1ce5b81a34d00603 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sat, 29 Dec 2018 22:16:46 +0100
Subject: [PATCH 05/19] improve POW algorithm selection

- add helper method `GetAllAlgorithms()` to get all active POW
algorithms
- select max scratchpad memory size based on the dev pool and user
algorithms
---
 xmrstak/backend/amd/amd_gpu/gpu.cpp           | 27 ++++-------
 xmrstak/backend/amd/autoAdjust.hpp            | 27 +++++------
 xmrstak/backend/cpu/autoAdjust.hpp            | 12 +++--
 xmrstak/backend/cpu/autoAdjustHwloc.hpp       | 14 +++---
 .../backend/cpu/crypto/cryptonight_common.cpp | 22 +++++----
 xmrstak/backend/cpu/minethd.cpp               | 20 ++++----
 .../backend/nvidia/nvcc_code/cuda_extra.cu    | 47 ++++++++++---------
 xmrstak/misc/coinDescription.hpp              | 25 +++++++++-
 8 files changed, 109 insertions(+), 85 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index e4fb765e7..e9a1447ca 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -316,10 +316,13 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		return ERR_OCL_API;
 	}
 
-	size_t scratchPadSize = std::max(
-		cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()),
-		cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
-	);
+	auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
+
+	size_t scratchPadSize = 0;
+	for(const auto algo : neededAlgorithms)
+	{
+		scratchPadSize = std::max(scratchPadSize, cn_select_memory(algo));
+	}
 
 	size_t g_thd = ctx->rawIntensity;
 	ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, scratchPadSize * g_thd, NULL, &ret);
@@ -390,22 +393,8 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		return ERR_OCL_API;
 	}
 
-	std::array<xmrstak_algo, 4> selectedAlgos = {
-	    ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo(),
-		::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot(),
-		::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo(),
-		::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()
-	};
-
-	for(int ii = 0; ii < selectedAlgos.size(); ++ii)
+	for(const auto miner_algo : neededAlgorithms)
 	{
-		xmrstak_algo miner_algo = selectedAlgos[ii];
-		bool alreadyCompiled = ctx->Program.find(miner_algo) != ctx->Program.end();
-		if(alreadyCompiled)
-		{
-			printer::inst()->print_msg(L1,"OpenCL device %u - Skip %u",ctx->deviceIdx, (uint32_t)miner_algo);
-			continue;
-		}
 		// scratchpad size for the selected mining algorithm
 		size_t hashMemSize = cn_select_memory(miner_algo);
 		int threadMemMask = cn_select_mask(miner_algo);
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index ba4cebb7b..0442fde91 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -83,10 +83,13 @@ class autoAdjust
 
 		constexpr size_t byteToMiB = 1024u * 1024u;
 
-		size_t hashMemSize = std::max(
-			cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()),
-			cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
-		);
+		auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
+
+		size_t hashMemSize = 0;
+		for(const auto algo : neededAlgorithms)
+		{
+			hashMemSize = std::max(hashMemSize, cn_select_memory(algo));
+		}
 
 		std::string conf;
 		for(auto& ctx : devVec)
@@ -128,18 +131,10 @@ class autoAdjust
 			}
 
 			// check if cryptonight_monero_v8 is selected for the user or dev pool
-			bool useCryptonight_v8 =
-				::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 ||
-				::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 ||
-				::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 ||
-				::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8;
+			bool useCryptonight_v8 = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end();
 
 			// true for all cryptonight_heavy derivates since we check the user and dev pool
-			bool useCryptonight_heavy =
-				::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_heavy ||
-				::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_heavy ||
-				::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_heavy ||
-				::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_heavy;
+			bool useCryptonight_heavy = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end();
 
 			// set strided index to default
 			ctx.stridedIndex = 1;
@@ -154,8 +149,8 @@ class autoAdjust
 			else if(useCryptonight_heavy)
 				ctx.stridedIndex = 3;
 
-			// increase all intensity limits by two for aeon
-			if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite)
+			// increase all intensity limits by two if scratchpad is only 1 MiB
+			if(hashMemSize <= CRYPTONIGHT_LITE_MEMORY)
 				maxThreads *= 2u;
 
 			// keep 128MiB memory free (value is randomly chosen) from the max available memory
diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp
index e7f3e9148..91da7a6ee 100644
--- a/xmrstak/backend/cpu/autoAdjust.hpp
+++ b/xmrstak/backend/cpu/autoAdjust.hpp
@@ -28,11 +28,15 @@ class autoAdjust
 
 	bool printConfig()
 	{
+		auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
+
+		size_t hashMemSize = 0;
+		for(const auto algo : neededAlgorithms)
+		{
+			hashMemSize = std::max(hashMemSize, cn_select_memory(algo));
+		}
+		const size_t hashMemSizeKB = hashMemSize / 1024u;
 
-		const size_t hashMemSizeKB = std::max(
-			cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()),
-			cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
-		) / 1024u;
 		const size_t halfHashMemSizeKB = hashMemSizeKB / 2u;
 
 		configEditor configTpl{};
diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
index b61582588..39e80a376 100644
--- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp
+++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
@@ -28,10 +28,12 @@ class autoAdjust
 
 	autoAdjust()
 	{
-		hashMemSize = std::max(
-			cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()),
-			cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
-		);
+		auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
+
+		for(const auto algo : neededAlgorithms)
+		{
+			hashMemSize = std::max(hashMemSize, cn_select_memory(algo));
+		}
 		halfHashMemSize = hashMemSize / 2u;
 	}
 
@@ -93,8 +95,8 @@ class autoAdjust
 	}
 
 private:
-	size_t hashMemSize;
-	size_t halfHashMemSize;
+	size_t hashMemSize = 0;
+	size_t halfHashMemSize = 0;
 
 	std::vector<uint32_t> results;
 
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
index a7e4696a8..ee1ff2386 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
+++ b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
@@ -203,10 +203,13 @@ size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg)
 
 cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg)
 {
-	size_t hashMemSize = std::max(
-		cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()),
-		cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
-	);
+	auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
+
+	size_t hashMemSize = 0;
+	for(const auto algo : neededAlgorithms)
+	{
+		hashMemSize = std::max(hashMemSize, cn_select_memory(algo));
+	}
 
 	cryptonight_ctx* ptr = (cryptonight_ctx*)_mm_malloc(sizeof(cryptonight_ctx), 4096);
 
@@ -284,10 +287,13 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
 
 void cryptonight_free_ctx(cryptonight_ctx* ctx)
 {
-	size_t hashMemSize = std::max(
-		cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()),
-		cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
-	);
+	auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
+
+	size_t hashMemSize = 0;
+	for(const auto algo : neededAlgorithms)
+	{
+		hashMemSize = std::max(hashMemSize, cn_select_memory(algo));
+	}
 
 	if(ctx->ctx_info[0] != 0)
 	{
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 20203a3c5..975f2d4a2 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -239,15 +239,17 @@ bool minethd::self_test()
 	cn_hash_fun hashf;
 	cn_hash_fun hashf_multi;
 
-	xmrstak_algo algo = xmrstak_algo::invalid_algo;
-
-	for(int algo_idx = 0; algo_idx < 2; ++algo_idx)
+	if(xmrstak_algo::invalid_algo == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() ||
+		xmrstak_algo::invalid_algo == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
 	{
-		if(algo_idx == 0)
-			algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo();
-		else
-			algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot();
+		printer::inst()->print_msg(L0, "Root algorithm is not allowed to be invalid");
+		return false;
+	}
 
+	auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
+
+	for(const auto algo : neededAlgorithms)
+	{
 		if(algo == cryptonight)
 		{
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
@@ -590,7 +592,7 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 		Cryptonight_hash<N>::template hash<cryptonight_monero_v8, true, false>,
 		Cryptonight_hash<N>::template hash<cryptonight_monero_v8, false, true>,
 		Cryptonight_hash<N>::template hash<cryptonight_monero_v8, true, true>,
-    
+
 		Cryptonight_hash<N>::template hash<cryptonight_superfast, false, false>,
 		Cryptonight_hash<N>::template hash<cryptonight_superfast, true, false>,
 		Cryptonight_hash<N>::template hash<cryptonight_superfast, false, true>,
@@ -633,7 +635,7 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 				printer::inst()->print_msg(L1, "Assembler '%s' unknown, fallback to non asm version of cryptonight_v8", selected_asm.c_str());
 		}
 	}
-	
+
 	return selected_function;
 }
 
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index 45afec9ac..f98de2c14 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -287,19 +287,22 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 	// prefer shared memory over L1 cache
 	CUDA_CHECK(ctx->device_id, cudaDeviceSetCacheConfig(cudaFuncCachePreferShared));
 
-	size_t hashMemSize = std::max(
-		cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()),
-		cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
-	);
+	auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
+
+	size_t hashMemSize = 0;
+	for(const auto algo : neededAlgorithms)
+	{
+		hashMemSize = std::max(hashMemSize, cn_select_memory(algo));
+	}
 
 	size_t wsize = ctx->device_blocks * ctx->device_threads;
 	CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state, 50 * sizeof(uint32_t) * wsize));
 	size_t ctx_b_size = 4 * sizeof(uint32_t) * wsize;
 	if(
-		cryptonight_heavy == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
-		cryptonight_haven == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
-		cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
-		cryptonight_superfast == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()
+		std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end() ||
+		std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_haven) != neededAlgorithms.end() ||
+		std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_bittube2) != neededAlgorithms.end() ||
+		std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end()
 	)
 	{
 		// extent ctx_b to hold the state of idx0
@@ -307,8 +310,7 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 		// create a double buffer for the state to exchange the mixed state to phase1
 		CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state2, 50 * sizeof(uint32_t) * wsize));
 	}
-	else if(cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
-			cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
+	else if(std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end())
 	{
 		// bx1 (16byte), division_result (8byte) and sqrt_result (8byte)
 		ctx_b_size = 3 * 4 * sizeof(uint32_t) * wsize;
@@ -409,7 +411,7 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce,
 			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
 			cryptonight_extra_gpu_final<cryptonight_superfast><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
 		);
-	} 
+	}
 	else if(miner_algo == cryptonight_bittube2)
 	{
 		CUDA_CHECK_MSG_KERNEL(
@@ -656,10 +658,13 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		ctx->total_device_memory = totalMemory;
 		ctx->free_device_memory = freeMemory;
 
-		size_t hashMemSize = std::max(
-			cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()),
-			cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot())
-		);
+		auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
+
+		size_t hashMemSize = 0;
+		for(const auto algo : neededAlgorithms)
+		{
+			hashMemSize = std::max(hashMemSize, cn_select_memory(algo));
+		}
 
 #ifdef WIN32
 		/* We use in windows bfactor (split slow kernel into smaller parts) to avoid
@@ -688,10 +693,10 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		// 680bytes are extra meta data memory per hash
 		size_t perThread = hashMemSize + 16192u + 680u;
 		if(
-			cryptonight_heavy == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
-			cryptonight_haven == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
-			cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
-			cryptonight_superfast == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()
+			std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end() ||
+			std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_haven) != neededAlgorithms.end() ||
+			std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_bittube2) != neededAlgorithms.end() ||
+			std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end()
 		)
 			perThread += 50 * 4; // state double buffer
 
@@ -707,9 +712,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		}
 
 		// check if cryptonight_monero_v8 is selected for the user pool
-		bool useCryptonight_v8 =
-			::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 ||
-			::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8;
+		bool useCryptonight_v8 = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end();
 
 		// overwrite default config if cryptonight_monero_v8 is mined and GPU has at least compute capability 5.0
 		if(useCryptonight_v8 && gpuArch >= 50)
diff --git a/xmrstak/misc/coinDescription.hpp b/xmrstak/misc/coinDescription.hpp
index 55e86f4e2..26688aeea 100644
--- a/xmrstak/misc/coinDescription.hpp
+++ b/xmrstak/misc/coinDescription.hpp
@@ -4,7 +4,8 @@
 
 #include <stdlib.h>
 #include <string>
-
+#include <vector>
+#include <algorithm>
 
 namespace xmrstak
 {
@@ -56,5 +57,27 @@ namespace xmrstak
 			coinDescription tmp = (poolId == 0 ? pool_coin[1] : pool_coin[0]);
 			return tmp;
 		}
+
+		/** return all POW algorithm for the current selected currency
+		 *
+		 * @return required POW algorithms without duplicated entries
+		 */
+		inline std::vector<xmrstak_algo> GetAllAlgorithms()
+		{
+			std::vector<xmrstak_algo> allAlgos = {
+				GetDescription(0).GetMiningAlgo(),
+				GetDescription(0).GetMiningAlgoRoot(),
+				GetDescription(1).GetMiningAlgo(),
+				GetDescription(1).GetMiningAlgoRoot()
+			};
+
+			std::sort(allAlgos.begin(), allAlgos.end());
+			std::remove(allAlgos.begin(), allAlgos.end(), invalid_algo);
+			auto last = std::unique(allAlgos.begin(), allAlgos.end());
+			// remove duplicated algorithms
+			allAlgos.erase(last, allAlgos.end());
+
+			return allAlgos;
+		}
 	};
 } // namespace xmrstak

From 0643f60177aa473d4772e887582ae9a4b6613c0c Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sat, 29 Dec 2018 22:26:43 +0100
Subject: [PATCH 06/19] OpenCl: avoid multiple map lookups

Avoid that we do multiple lookups to `std::map` to find the OpenCL
kernel binaries.
---
 xmrstak/backend/amd/amd_gpu/gpu.cpp | 52 ++++++++++++++++-------------
 1 file changed, 28 insertions(+), 24 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index e9a1447ca..e0f3bcf1f 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -973,6 +973,8 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo)
 {
 
+	const auto & Kernels = ctx->Kernels[miner_algo];
+
 	cl_int ret;
 
 	if(input_len > 84)
@@ -989,28 +991,28 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 		return ERR_OCL_API;
 	}
 
-	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(Kernels[0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Scratchpads
-	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(Kernels[0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
-	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(Kernels[0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
-	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][0], 3, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(Kernels[0], 3, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret));
 		return(ERR_OCL_API);
@@ -1019,21 +1021,21 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 	// CN1 Kernel
 
 	// Scratchpads
-	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(Kernels[1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
-	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(Kernels[1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
-	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][1], 2, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(Kernels[1], 2, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret));
 		return(ERR_OCL_API);
@@ -1042,7 +1044,7 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 	if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon || miner_algo == cryptonight_ipbc || miner_algo == cryptonight_stellite || miner_algo == cryptonight_masari || miner_algo == cryptonight_bittube2)
 	{
 		// Input
-		if ((ret = clSetKernelArg(ctx->Kernels[miner_algo][1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
+		if ((ret = clSetKernelArg(Kernels[1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 4(input buffer).", err_to_str(ret));
 			return ERR_OCL_API;
@@ -1051,49 +1053,49 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 
 	// CN3 Kernel
 	// Scratchpads
-	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
-	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(Kernels[2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 0
-	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 1
-	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 2
-	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 3
-	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
-	if((ret = clSetKernelArg(ctx->Kernels[miner_algo][2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(Kernels[2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret));
 		return(ERR_OCL_API);
@@ -1102,34 +1104,34 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 	for(int i = 0; i < 4; ++i)
 	{
 		// States
-		if((ret = clSetKernelArg(ctx->Kernels[miner_algo][i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+		if((ret = clSetKernelArg(Kernels[i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0);
 			return ERR_OCL_API;
 		}
 
 		// Nonce buffer
-		if((ret = clSetKernelArg(ctx->Kernels[miner_algo][i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS)
+		if((ret = clSetKernelArg(Kernels[i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1);
 			return ERR_OCL_API;
 		}
 
 		// Output
-		if((ret = clSetKernelArg(ctx->Kernels[miner_algo][i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS)
+		if((ret = clSetKernelArg(Kernels[i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2);
 			return ERR_OCL_API;
 		}
 
 		// Target
-		if((ret = clSetKernelArg(ctx->Kernels[miner_algo][i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS)
+		if((ret = clSetKernelArg(Kernels[i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3);
 			return ERR_OCL_API;
 		}
 
-		if((clSetKernelArg(ctx->Kernels[miner_algo][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
+		if((clSetKernelArg(Kernels[i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4);
 			return(ERR_OCL_API);
@@ -1232,6 +1234,8 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment)
 
 size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 {
+	const auto & Kernels = ctx->Kernels[miner_algo];
+
 	cl_int ret;
 	cl_uint zero = 0;
 	size_t BranchNonces[4];
@@ -1265,7 +1269,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 	}
 
 	size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { 8, 8 };
-	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[miner_algo][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
+	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0);
 		return ERR_OCL_API;
@@ -1273,13 +1277,13 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 
 	size_t tmpNonce = ctx->Nonce;
 
-	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[miner_algo][1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
+	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1);
 		return ERR_OCL_API;
 	}
 
-	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[miner_algo][2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
+	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2);
 		return ERR_OCL_API;
@@ -1288,7 +1292,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 	for(int i = 0; i < 4; ++i)
 	{
 		size_t tmpNonce = ctx->Nonce;
-		if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[miner_algo][i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
+		if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
 		{
 			printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
 			return ERR_OCL_API;

From 749751e3c01f5f1da977b3f49ff0d7061c1965fa Mon Sep 17 00:00:00 2001
From: Brandon Lehmann <brandonlehmann@gmail.com>
Date: Sun, 6 Jan 2019 14:59:31 -0500
Subject: [PATCH 07/19] Add CryptoNight Turtle Support. Special thanks to
 @DaveLong for his hard work in getting this done.

---
 README.md                                     |  4 +-
 xmrstak/backend/amd/amd_gpu/gpu.cpp           |  2 +-
 .../backend/amd/amd_gpu/opencl/cryptonight.cl | 20 ++++----
 .../amd/amd_gpu/opencl/fast_int_math_v2.cl    |  2 +-
 xmrstak/backend/amd/autoAdjust.hpp            |  7 ++-
 .../backend/cpu/crypto/cryptonight_aesni.h    | 12 ++---
 xmrstak/backend/cpu/minethd.cpp               | 48 ++++++++++++++++++-
 xmrstak/backend/cryptonight.hpp               | 22 ++++++++-
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 21 ++++----
 .../backend/nvidia/nvcc_code/cuda_extra.cu    | 15 ++++--
 xmrstak/jconf.cpp                             |  4 +-
 xmrstak/misc/executor.cpp                     |  1 +
 xmrstak/net/jpsock.cpp                        |  3 ++
 xmrstak/pools.tpl                             |  3 ++
 14 files changed, 128 insertions(+), 36 deletions(-)

diff --git a/README.md b/README.md
index 046a930e1..e0b62503c 100644
--- a/README.md
+++ b/README.md
@@ -48,12 +48,14 @@ Besides [Monero](https://getmonero.org), following coins can be mined using this
 - [QRL](https://theqrl.org)
 - **[Ryo](https://ryo-currency.com) - Upcoming xmr-stak-gui is sponsored by Ryo**
 - [TurtleCoin](https://turtlecoin.lol)
+- [Plenteum](https://www.plenteum.com/)
 
 Ryo currency is a way for us to implement the ideas that we were unable to in
 Monero. See [here](https://github.com/fireice-uk/cryptonote-speedup-demo/) for details.
 
 If your prefered coin is not listed, you can choose one of the following algorithms:
-
+- 256Kib scratchpad memory
+    - cryptonight_turtle
 - 1MiB scratchpad memory
     - cryptonight_lite
     - cryptonight_lite_v7
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index e0f3bcf1f..4ad4d59e9 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -406,7 +406,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		 * this is required if the dev pool is mining monero
 		 * but the user tuned there settings for another currency
 		 */
-		if(miner_algo == cryptonight_monero_v8)
+		if(miner_algo == cryptonight_monero_v8 || miner_algo == cryptonight_turtle)
 		{
 			if(ctx->memChunk < 2)
 				mem_chunk_exp = 1u << 2;
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 6a3def72c..f647bcafc 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -538,7 +538,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 R"===(
 
 // cryptonight_monero_v8 && NVIDIA
-#if(ALGO==11 && defined(__NV_CL_C_VERSION))
+#if((ALGO==11  || ALGO==13) && defined(__NV_CL_C_VERSION))
 #	define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4))))
 #	define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4)))))
 #else
@@ -556,7 +556,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 	ulong a[2];
 
 // cryptonight_monero_v8
-#if(ALGO==11)
+#if(ALGO==11 || ALGO==13)
 	ulong b[4];
 	uint4 b_x[2];
 // NVIDIA
@@ -571,7 +571,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
     __local uint AES0[256], AES1[256];
 
 // cryptonight_monero_v8
-#if(ALGO==11)
+#if(ALGO==11 || ALGO==13)
 #	if defined(__clang__) && !defined(__NV_CL_C_VERSION)
     __local uint RCP[256];
 #	endif
@@ -587,7 +587,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
         AES0[i] = tmp;
         AES1[i] = rotate(tmp, 8U);
 // cryptonight_monero_v8
-#if(ALGO==11 && (defined(__clang__) && !defined(__NV_CL_C_VERSION)))
+#if((ALGO==11 || ALGO==13) && (defined(__clang__) && !defined(__NV_CL_C_VERSION)))
 		RCP[i] = RCP_C[i];
 #endif
     }
@@ -622,7 +622,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 		b_x[0] = ((uint4 *)b)[0];
 
 // cryptonight_monero_v8
-#if(ALGO==11)
+#if(ALGO==11 || ALGO==13)
         a[1] = states[1] ^ states[5];
         b[2] = states[8] ^ states[10];
         b[3] = states[9] ^ states[11];
@@ -654,7 +654,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
     {
 			ulong c[2];
 // cryptonight_monero_v8 && NVIDIA
-#if(ALGO==11 && defined(__NV_CL_C_VERSION))
+#if((ALGO==11 || ALGO==13) && defined(__NV_CL_C_VERSION))
 			uint idxS = idx0 & 0x30U;
  			*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
 #endif
@@ -668,7 +668,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #endif
 
 // cryptonight_monero_v8
-#if(ALGO==11)
+#if(ALGO==11 || ALGO==13)
         {
 			ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
 			ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
@@ -693,7 +693,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			SCRATCHPAD_CHUNK(0) = b_x[0];
 			idx0 = as_uint2(c[0]).s0 & MASK;
 // cryptonight_monero_v8
-#elif(ALGO==11)
+#elif(ALGO==11 || ALGO==13)
 			SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0];
 #	ifdef __NV_CL_C_VERSION
 			// flush shuffled data
@@ -712,7 +712,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			uint4 tmp;
 			tmp = SCRATCHPAD_CHUNK(0);
 // cryptonight_monero_v8
-#if(ALGO==11)
+#if(ALGO==11 || ALGO==13)
 			// Use division and square root results from the _previous_ iteration to hide the latency
             tmp.s0 ^= division_result.s0;
             tmp.s1 ^= division_result.s1 ^ sqrt_result;
@@ -770,7 +770,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
         ((uint4 *)a)[0] ^= tmp;
 
 // cryptonight_monero_v8
-#if (ALGO == 11)
+#if (ALGO == 11 || ALGO==13)
 #	if defined(__NV_CL_C_VERSION)
 			// flush shuffled data
 			SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
index c170387b4..4205a67c3 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
@@ -4,7 +4,7 @@ R"===(
  */
 
 // cryptonight_monero_v8
-#if(ALGO==11)
+#if(ALGO==11 || ALGO==13)
 
 static const __constant uint RCP_C[256] =
 {
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index 0442fde91..f2dce7f90 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -131,7 +131,8 @@ class autoAdjust
 			}
 
 			// check if cryptonight_monero_v8 is selected for the user or dev pool
-			bool useCryptonight_v8 = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end();
+			bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end() ||
+			                          std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_turtle) != neededAlgorithms.end());
 
 			// true for all cryptonight_heavy derivates since we check the user and dev pool
 			bool useCryptonight_heavy = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end();
@@ -153,6 +154,10 @@ class autoAdjust
 			if(hashMemSize <= CRYPTONIGHT_LITE_MEMORY)
 				maxThreads *= 2u;
 
+			// increase all intensity limits by eight for turtle (*2u shadowed from lite)
+			if (hashMemSize <= CRYPTONIGHT_TURTLE_MEMORY)
+				maxThreads *= 4u;
+
 			// keep 128MiB memory free (value is randomly chosen) from the max available memory
 			const size_t maxAvailableFreeMem = ctx.freeMem - minFreeMem;
 
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 06cbe8740..2218bf88a 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -545,7 +545,7 @@ inline void set_float_rounding_mode()
 
 #define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1) \
 	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \
-	if(ALGO == cryptonight_monero_v8) \
+	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) \
 	{ \
 		const uint64_t idx1 = idx0 & MASK; \
 		const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \
@@ -558,7 +558,7 @@ inline void set_float_rounding_mode()
 
 #define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi) \
 	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \
-	if(ALGO == cryptonight_monero_v8) \
+	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) \
 	{ \
 		const uint64_t idx1 = idx0 & MASK; \
 		const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \
@@ -572,7 +572,7 @@ inline void set_float_rounding_mode()
 	}
 
 #define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \
-	if(ALGO == cryptonight_monero_v8) \
+	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) \
 	{ \
 		uint64_t sqrt_result_tmp; \
 		assign(sqrt_result_tmp, sqrt_result); \
@@ -627,7 +627,7 @@ inline void set_float_rounding_mode()
 		idx0 = h0[0] ^ h0[4]; \
 		ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \
 		bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \
-		if(ALGO == cryptonight_monero_v8) \
+		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) \
 		{ \
 			bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \
 			division_result_xmm = _mm_cvtsi64_si128(h0[12]); \
@@ -664,7 +664,7 @@ inline void set_float_rounding_mode()
 	ptr0 = (__m128i *)&l0[idx0 & MASK]; \
 	if(PREFETCH) \
 		_mm_prefetch((const char*)ptr0, _MM_HINT_T0); \
-	if(ALGO != cryptonight_monero_v8) \
+	if(ALGO != cryptonight_monero_v8 && ALGO != cryptonight_turtle) \
 		bx0 = cx
 
 #define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm) \
@@ -681,7 +681,7 @@ inline void set_float_rounding_mode()
 		ah0 += lo; \
 		al0 += hi; \
 	} \
-	if(ALGO == cryptonight_monero_v8) \
+	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) \
 	{ \
 		bx1 = bx0; \
 		bx0 = cx; \
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 975f2d4a2..2327bed1d 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -403,6 +403,15 @@ bool minethd::self_test()
 			hashf("\x03\x05\xa0\xdb\xd6\xbf\x05\xcf\x16\xe5\x03\xf3\xa6\x6f\x78\x00\x7c\xbf\x34\x14\x43\x32\xec\xbf\xc2\x2e\xd9\x5c\x87\x00\x38\x3b\x30\x9a\xce\x19\x23\xa0\x96\x4b\x00\x00\x00\x08\xba\x93\x9a\x62\x72\x4c\x0d\x75\x81\xfc\xe5\x76\x1e\x9d\x8a\x0e\x6a\x1c\x3f\x92\x4f\xdd\x84\x93\xd1\x11\x56\x49\xc0\x5e\xb6\x01", 76, out, ctx);
 			bResult = bResult &&  memcmp(out, "\x40\x86\x5a\xa8\x87\x41\xec\x1d\xcc\xbd\x2b\xc6\xff\x36\xb9\x4d\x54\x71\x58\xdb\x94\x69\x8e\x3c\xa0\x3d\xe4\x81\x9a\x65\x9f\xef", 32) == 0;
 		}
+    else if (algo == cryptonight_turtle)
+		{
+		hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_turtle);
+		hashf("This is a test This is a test This is a test", 44, out, ctx);
+		bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0;
+		hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_turtle);
+		hashf("This is a test This is a test This is a test", 44, out, ctx);
+		bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0;
+		}
 
 
 		if(!bResult)
@@ -532,6 +541,9 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 	case cryptonight_superfast:
 		algv = 11;
 		break;
+	case cryptonight_turtle:
+		algv = 12;
+		break;
 	default:
 		algv = 2;
 		break;
@@ -596,7 +608,12 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 		Cryptonight_hash<N>::template hash<cryptonight_superfast, false, false>,
 		Cryptonight_hash<N>::template hash<cryptonight_superfast, true, false>,
 		Cryptonight_hash<N>::template hash<cryptonight_superfast, false, true>,
-		Cryptonight_hash<N>::template hash<cryptonight_superfast, true, true>
+		Cryptonight_hash<N>::template hash<cryptonight_superfast, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_turtle, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_turtle, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_turtle, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_turtle, true, true>
 	};
 
 	std::bitset<2> digit;
@@ -636,6 +653,35 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 		}
 	}
 
+	if (N <= 2 && (algo == cryptonight_turtle) && bHaveAes)
+	{
+		std::string selected_asm = asm_version_str;
+		if (selected_asm == "auto")
+			selected_asm = cpu::getAsmName(N);
+
+		if (selected_asm != "off")
+		{
+			if (selected_asm == "intel_avx" && asm_version_str != "auto")
+			{
+				// Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx)
+				if (N == 1)
+					selected_function = Cryptonight_hash_asm<1u, 0u>::template hash<cryptonight_turtle>;
+				else if (N == 2)
+					selected_function = Cryptonight_hash_asm<2u, 0u>::template hash<cryptonight_turtle>;
+			}
+			// supports only 1 thread per hash
+			if (N == 1 && selected_asm == "amd_avx")
+			{
+				// AMD Ryzen (1xxx and 2xxx series)
+				selected_function = Cryptonight_hash_asm<1u, 1u>::template hash<cryptonight_turtle>;
+			}
+			if (asm_version_str == "auto" && (selected_asm != "intel_avx" || selected_asm != "amd_avx"))
+				printer::inst()->print_msg(L3, "Switch to assembler version for '%s' cpu's", selected_asm.c_str());
+			else if (selected_asm != "intel_avx" && selected_asm != "amd_avx") // unknown asm type
+				printer::inst()->print_msg(L1, "Assembler '%s' unknown, fallback to non asm version of cryptonight_v8", selected_asm.c_str());
+		}
+	}
+
 	return selected_function;
 }
 
diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp
index e905caa9f..2dd922f91 100644
--- a/xmrstak/backend/cryptonight.hpp
+++ b/xmrstak/backend/cryptonight.hpp
@@ -17,7 +17,8 @@ enum xmrstak_algo
 	cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak
 	cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks
 	cryptonight_monero_v8 = 11,
-	cryptonight_superfast = 12
+	cryptonight_superfast = 12,
+	cryptonight_turtle = 13
 };
 
 // define aeon settings
@@ -37,6 +38,10 @@ constexpr uint32_t CRYPTONIGHT_MASARI_ITER = 0x40000;
 
 constexpr uint32_t CRYPTONIGHT_SUPERFAST_ITER = 0x20000; 
 
+constexpr size_t CRYPTONIGHT_TURTLE_MEMORY = 256 * 1024;
+constexpr uint32_t CRYPTONIGHT_TURTLE_MASK = 0x1FFF0;
+constexpr uint32_t CRYPTONIGHT_TURTLE_ITER = 0x10000;
+
 template<xmrstak_algo ALGO>
 inline constexpr size_t cn_select_memory() { return 0; }
 
@@ -76,6 +81,9 @@ inline constexpr size_t cn_select_memory<cryptonight_bittube2>() { return CRYPTO
 template<>
 inline constexpr size_t cn_select_memory<cryptonight_superfast>() { return CRYPTONIGHT_MEMORY; } 
 
+template<>
+inline constexpr size_t cn_select_memory<cryptonight_turtle>() { return CRYPTONIGHT_TURTLE_MEMORY; }
+
 inline size_t cn_select_memory(xmrstak_algo algo)
 {
 	switch(algo)
@@ -95,6 +103,8 @@ inline size_t cn_select_memory(xmrstak_algo algo)
 	case cryptonight_haven:
 	case cryptonight_heavy:
 		return CRYPTONIGHT_HEAVY_MEMORY;
+	case cryptonight_turtle:
+		return CRYPTONIGHT_TURTLE_MEMORY;
 	default:
 		return 0;
 	}
@@ -139,6 +149,9 @@ inline constexpr uint32_t cn_select_mask<cryptonight_bittube2>() { return CRYPTO
 template<>
 inline constexpr uint32_t cn_select_mask<cryptonight_superfast>() { return CRYPTONIGHT_MASK; } 
 
+template<>
+inline constexpr uint32_t cn_select_mask<cryptonight_turtle>() { return CRYPTONIGHT_TURTLE_MASK; }
+
 inline size_t cn_select_mask(xmrstak_algo algo)
 {
 	switch(algo)
@@ -158,6 +171,8 @@ inline size_t cn_select_mask(xmrstak_algo algo)
 	case cryptonight_haven:
 	case cryptonight_heavy:
 		return CRYPTONIGHT_HEAVY_MASK;
+	case cryptonight_turtle:
+		return CRYPTONIGHT_TURTLE_MASK;
 	default:
 		return 0;
 	}
@@ -202,6 +217,9 @@ inline constexpr uint32_t cn_select_iter<cryptonight_bittube2>() { return CRYPTO
 template<>
 inline constexpr uint32_t cn_select_iter<cryptonight_superfast>() { return CRYPTONIGHT_SUPERFAST_ITER; } 
 
+template<>
+inline constexpr uint32_t cn_select_iter<cryptonight_turtle>() { return CRYPTONIGHT_TURTLE_ITER; }
+
 inline size_t cn_select_iter(xmrstak_algo algo)
 {
 	switch(algo)
@@ -223,6 +241,8 @@ inline size_t cn_select_iter(xmrstak_algo algo)
 		return CRYPTONIGHT_MASARI_ITER;
 	case cryptonight_superfast:
 		return CRYPTONIGHT_SUPERFAST_ITER;
+	case cryptonight_turtle:
+		return CRYPTONIGHT_TURTLE_ITER;
 	default:
 		return 0;
 	}
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 87c1befa8..c3a97808c 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -310,7 +310,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 	uint64_t bx1;
 	uint32_t sqrt_result;
 	uint64_t division_result;
-	if(ALGO == cryptonight_monero_v8)
+	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
 	{
 		bx0 = ((uint64_t*)(d_ctx_b + thread * 12))[sub];
 		bx1 = ((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub];
@@ -350,7 +350,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.y >> 16) & 0xff ) ^ t_fn1( (cx.x >> 24 ) ))
 		);
 
-		if(ALGO == cryptonight_monero_v8)
+		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
 		{
 
 			const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ];
@@ -393,14 +393,14 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 		else
 			((ulonglong4*)myChunks)[sub] = ((ulonglong4*)ptr0)[sub];
 
-		if(ALGO != cryptonight_monero_v8)
+		if(ALGO != cryptonight_monero_v8 && ALGO != cryptonight_turtle)
 			bx0 = cx_aes;
 
 		uint64_t cx_mul;
 		((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x , 0);
 		((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y , 0);
 
-		if(ALGO == cryptonight_monero_v8 && sub == 1)
+		if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) && sub == 1)
 		{
 			// Use division and square root results from the _previous_ iteration to hide the latency
 			((uint32_t*)&division_result)[1] ^= sqrt_result;
@@ -424,7 +424,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			uint64_t cl = ((uint64_t*)myChunks)[ idx1 ];
 			// sub 0 -> hi, sub 1 -> lo
 			uint64_t res = sub == 0 ? __umul64hi( cx_mul, cl ) : cx_mul * cl;
-			if(ALGO == cryptonight_monero_v8)
+			if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
 			{
 				const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ] ^ res;
 				uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ];
@@ -441,7 +441,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			}
 			ax0 += res;
 		}
-		if(ALGO == cryptonight_monero_v8)
+		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
 		{
 			bx1 = bx0;
 			bx0 = cx_aes;
@@ -464,7 +464,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 	if ( bfactor > 0 )
 	{
 		((uint64_t*)(d_ctx_a + thread * 4))[sub] = ax0;
-		if(ALGO == cryptonight_monero_v8)
+		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
 		{
 			((uint64_t*)(d_ctx_b + thread * 12))[sub] = bx0;
 			((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub] = bx1;
@@ -771,7 +771,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 
 	for ( int i = 0; i < partcount; i++ )
 	{
-		if(ALGO == cryptonight_monero_v8)
+		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
 		{
 			// two threads per block
 			CUDA_CHECK_MSG_KERNEL(
@@ -884,7 +884,10 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t
 		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8, 1>,
     
 		cryptonight_core_gpu_hash<CRYPTONIGHT_SUPERFAST_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_superfast, 0>,
-		cryptonight_core_gpu_hash<CRYPTONIGHT_SUPERFAST_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_superfast, 1>
+		cryptonight_core_gpu_hash<CRYPTONIGHT_SUPERFAST_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_superfast, 1>,
+
+		cryptonight_core_gpu_hash<CRYPTONIGHT_TURTLE_ITER, CRYPTONIGHT_TURTLE_MASK, CRYPTONIGHT_TURTLE_MEMORY/4, cryptonight_turtle, 0>,
+		cryptonight_core_gpu_hash<CRYPTONIGHT_TURTLE_ITER, CRYPTONIGHT_TURTLE_MASK, CRYPTONIGHT_TURTLE_MEMORY/4, cryptonight_turtle, 1>
 	};
 
 	std::bitset<1> digit;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index f98de2c14..7149f37a6 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -142,7 +142,7 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 	XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a );
 	XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b );
 	memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 );
-	if(ALGO == cryptonight_monero_v8)
+	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
 	{
 		memcpy( d_ctx_b + thread * 12, ctx_b, 4 * 4 );
 		// bx1
@@ -310,7 +310,8 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 		// create a double buffer for the state to exchange the mixed state to phase1
 		CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state2, 50 * sizeof(uint32_t) * wsize));
 	}
-	else if(std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end())
+	else if(std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end() ||
+	        std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_turtle) != neededAlgorithms.end() )
 	{
 		// bx1 (16byte), division_result (8byte) and sqrt_result (8byte)
 		ctx_b_size = 3 * 4 * sizeof(uint32_t) * wsize;
@@ -362,11 +363,16 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_bittube2><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
 			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
 	}
-	if(miner_algo == cryptonight_monero_v8)
+	else if(miner_algo == cryptonight_monero_v8)
 	{
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_monero_v8><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
 			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
 	}
+	else if (miner_algo == cryptonight_turtle)
+	{
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_turtle> << <grid, block >> > (wsize, ctx->d_input, ctx->inputlen, startNonce,
+			ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
+	}
 	else
 	{
 		/* pass two times d_ctx_state because the second state is used later in phase1,
@@ -712,7 +718,8 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		}
 
 		// check if cryptonight_monero_v8 is selected for the user pool
-		bool useCryptonight_v8 = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end();
+		bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end() ||
+		                          std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_turtle) != neededAlgorithms.end());
 
 		// overwrite default config if cryptonight_monero_v8 is mined and GPU has at least compute capability 5.0
 		if(useCryptonight_v8 && gpuArch >= 50)
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index 2a2dc8dbc..165595c5f 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -99,6 +99,7 @@ xmrstak::coin_selection coins[] = {
 	{ "cryptonight_lite_v7", {cryptonight_aeon, cryptonight_aeon, 0u},            {cryptonight_aeon, cryptonight_aeon, 0u},     nullptr },
 	{ "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u},      {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr },
 	{ "cryptonight_superfast",   {cryptonight_heavy, cryptonight_superfast, 255u},{cryptonight_heavy, cryptonight_superfast, 0u},   nullptr },
+	{ "cryptonight_turtle",  {cryptonight_turtle, cryptonight_turtle, 0u},      {cryptonight_turtle, cryptonight_turtle, 0u},   nullptr },
 	{ "cryptonight_v7",      {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "cryptonight_v8",      {cryptonight_monero_v8, cryptonight_monero_v8, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "cryptonight_v7_stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
@@ -111,7 +112,8 @@ xmrstak::coin_selection coins[] = {
 	{ "qrl",             	 {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "ryo",                 {cryptonight_heavy, cryptonight_heavy, 0u},          {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
 	{ "stellite",            {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
-	{ "turtlecoin",          {cryptonight_aeon, cryptonight_aeon, 0u},            {cryptonight_aeon, cryptonight_aeon, 0u},     nullptr }
+	{ "turtlecoin",          {cryptonight_turtle, cryptonight_aeon, 5u},            {cryptonight_aeon, cryptonight_aeon, 0u},     nullptr },
+	{ "plenteum",			 {cryptonight_turtle, cryptonight_aeon, 5u},            {cryptonight_aeon, cryptonight_aeon, 0u},     nullptr }
 };
 
 constexpr size_t coin_algo_size = (sizeof(coins)/sizeof(coins[0]));
diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp
index a303b34cd..58d5b0e83 100644
--- a/xmrstak/misc/executor.cpp
+++ b/xmrstak/misc/executor.cpp
@@ -562,6 +562,7 @@ void executor::ex_main()
 		break;
 	case cryptonight_monero_v8:
 	case cryptonight_monero:
+	case cryptonight_turtle:
 		if(dev_tls)
 			pools.emplace_front(0, "donate.xmr-stak.net:8800", "", "", "", 0.0, true, true, "", false);
 		else
diff --git a/xmrstak/net/jpsock.cpp b/xmrstak/net/jpsock.cpp
index 406c535d2..cbdf1d0c1 100644
--- a/xmrstak/net/jpsock.cpp
+++ b/xmrstak/net/jpsock.cpp
@@ -709,6 +709,9 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes
 		case cryptonight_superfast:
 			algo_name = "cryptonight_superfast";
 			break;
+		case cryptonight_turtle:
+			algo_name = "cryptonight_turtle";
+			break;
 		default:
 			algo_name = "unknown";
 			break;
diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl
index 58762de56..2019f2b86 100644
--- a/xmrstak/pools.tpl
+++ b/xmrstak/pools.tpl
@@ -33,9 +33,12 @@ POOLCONF],
  *    qrl - Quantum Resistant Ledger
  *    ryo
  *    turtlecoin
+ *    plenteum
  *
  * Native algorithms which not depends on any block versions:
  *
+ *    # 256KiB scratchpad memory
+ *    cryptonight_turtle
  *    # 1MiB scratchpad memory
  *    cryptonight_lite
  *    cryptonight_lite_v7

From 346933d1e0977434944a26e313d3150880507b5c Mon Sep 17 00:00:00 2001
From: fireice-uk <fireice-uk@users.noreply.github.com>
Date: Wed, 30 Jan 2019 08:47:57 +0000
Subject: [PATCH 08/19] Implement CN-GPU Proof-of-Work Algo

Co-authored-by: psychocrypt <psychocryptHPC@gmail.com>
Co-authored-by: fireice-uk <fireice-uk@users.noreply.github.com>
---
 CMakeLists.txt                                |  11 +
 xmrstak/backend/amd/amd_gpu/gpu.cpp           | 179 ++++--
 xmrstak/backend/amd/amd_gpu/gpu.hpp           |   2 +-
 .../backend/amd/amd_gpu/opencl/cryptonight.cl | 193 +++---
 .../amd/amd_gpu/opencl/cryptonight_gpu.cl     | 383 ++++++++++++
 xmrstak/backend/amd/autoAdjust.hpp            |  13 +-
 xmrstak/backend/cpu/crypto/cn_gpu.hpp         |  43 ++
 xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp     | 176 ++++++
 xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp   | 182 ++++++
 .../backend/cpu/crypto/cryptonight_aesni.h    |  72 ++-
 xmrstak/backend/cpu/minethd.cpp               |  38 +-
 xmrstak/backend/cryptonight.hpp               |  20 +-
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu |  77 ++-
 .../nvidia/nvcc_code/cuda_cryptonight_gpu.hpp | 552 ++++++++++++++++++
 .../backend/nvidia/nvcc_code/cuda_extra.cu    |  92 +--
 .../backend/nvidia/nvcc_code/cuda_keccak.hpp  |  20 +
 xmrstak/jconf.cpp                             |   1 +
 xmrstak/misc/executor.cpp                     |   6 +
 18 files changed, 1866 insertions(+), 194 deletions(-)
 create mode 100644 xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
 create mode 100644 xmrstak/backend/cpu/crypto/cn_gpu.hpp
 create mode 100644 xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp
 create mode 100644 xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp
 create mode 100644 xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b714ee0ce..7d21fa928 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -152,6 +152,9 @@ if(CUDA_ENABLE)
                 set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11")
             endif()
 
+            # required for cryptonight_gpu (fast floating point operations are not allowed)
+            set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --fmad=false --prec-div=true --ftz=false")
+
             # avoid that nvcc in CUDA 8 complains about sm_20 pending removal
             if(CUDA_VERSION VERSION_EQUAL 8.0)
                 set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Wno-deprecated-gpu-targets")
@@ -279,6 +282,14 @@ else()
     list(APPEND BACKEND_TYPES "cpu")
 endif()
 
+################################################################################
+# Explicit march setting for Clang
+################################################################################
+
+if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+	set_source_files_properties(xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp PROPERTIES COMPILE_FLAGS "-mavx2")
+endif()
+
 ################################################################################
 # Find PThreads
 ################################################################################
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 4ad4d59e9..857abc138 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -413,6 +413,10 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 			if(strided_index == 1)
 				strided_index = 0;
 		}
+		if(miner_algo == cryptonight_gpu)
+		{
+			strided_index = 0;
+		}
 
 		// if intensity is a multiple of worksize than comp mode is not needed
 		int needCompMode = ctx->compMode && ctx->rawIntensity % ctx->workSize != 0 ? 1 : 0;
@@ -433,6 +437,9 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		 */
 		options += " -DOPENCL_DRIVER_MAJOR=" + std::to_string(std::stoi(openCLDriverVer.data()) / 100);
 
+		if(miner_algo == cryptonight_gpu)
+			options += " -cl-fp32-correctly-rounded-divide-sqrt";
+
 		/* create a hash for the compile time cache
 		 * used data:
 		 *   - source code
@@ -579,12 +586,23 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 			}
 		}
 
-		std::vector<std::string> KernelNames = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein" };
+		std::vector<std::string> KernelNames = { "cn2", "Blake", "Groestl", "JH", "Skein" };
+		if(miner_algo == cryptonight_gpu)
+		{
+			KernelNames.insert(KernelNames.begin(), "cn1_cn_gpu");
+			KernelNames.insert(KernelNames.begin(), "cn0_cn_gpu");
+		}
+		else
+		{
+			KernelNames.insert(KernelNames.begin(), "cn1");
+			KernelNames.insert(KernelNames.begin(), "cn0");
+		}
+
 		// append algorithm number to kernel name
 		for(int k = 0; k < 3; k++)
 			KernelNames[k] += std::to_string(miner_algo);
 
-		for(int i = 0; i < 7; ++i)
+		for(int i = 0; i < KernelNames.size(); ++i)
 		{
 			ctx->Kernels[miner_algo][i] = clCreateKernel(ctx->Program[miner_algo], KernelNames[i].c_str(), &ret);
 			if(ret != CL_SUCCESS)
@@ -919,6 +937,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	const char *wolfSkeinCL =
 			#include "./opencl/wolf-skein.cl"
 	;
+	const char *cryptonight_gpu =
+			#include "./opencl/cryptonight_gpu.cl"
+	;
 
 	std::string source_code(cryptonightCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL);
@@ -928,6 +949,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_BLAKE256"), blake256CL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_GROESTL256"), groestl256CL);
+	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_CN_GPU"), cryptonight_gpu);
 
 	// create a directory  for the OpenCL compile cache
 	create_directory(get_home() + "/.openclcache");
@@ -1066,76 +1088,102 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 		return ERR_OCL_API;
 	}
 
-	// Branch 0
-	if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
+	if(miner_algo == cryptonight_gpu)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret));
-		return ERR_OCL_API;
-	}
-
-	// Branch 1
-	if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
-	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret));
-		return ERR_OCL_API;
-	}
-
-	// Branch 2
-	if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
-	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret));
-		return ERR_OCL_API;
-	}
+		// Output
+		if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS)
+		{
+			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 2);
+			return ERR_OCL_API;
+		}
 
-	// Branch 3
-	if((ret = clSetKernelArg(Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
-	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret));
-		return ERR_OCL_API;
-	}
+		// Target
+		if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS)
+		{
+			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 3);
+			return ERR_OCL_API;
+		}
 
-	// Threads
-	if((ret = clSetKernelArg(Kernels[2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
-	{
-		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret));
-		return(ERR_OCL_API);
+		// Threads
+		if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
+		{
+			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret));
+			return(ERR_OCL_API);
+		}
 	}
-
-	for(int i = 0; i < 4; ++i)
-	{
-		// States
-		if((ret = clSetKernelArg(Kernels[i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+	else
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0);
+		// Branch 0
+		if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
+		{
+			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret));
 			return ERR_OCL_API;
 		}
 
-		// Nonce buffer
-		if((ret = clSetKernelArg(Kernels[i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS)
+		// Branch 1
+		if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1);
+			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret));
 			return ERR_OCL_API;
 		}
 
-		// Output
-		if((ret = clSetKernelArg(Kernels[i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS)
+		// Branch 2
+		if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2);
+			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret));
 			return ERR_OCL_API;
 		}
 
-		// Target
-		if((ret = clSetKernelArg(Kernels[i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS)
+		// Branch 3
+		if((ret = clSetKernelArg(Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3);
+			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret));
 			return ERR_OCL_API;
 		}
 
-		if((clSetKernelArg(Kernels[i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
+		// Threads
+		if((ret = clSetKernelArg(Kernels[2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4);
+			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret));
 			return(ERR_OCL_API);
 		}
+
+		for(int i = 0; i < 4; ++i)
+		{
+			// States
+			if((ret = clSetKernelArg(Kernels[i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+			{
+				printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0);
+				return ERR_OCL_API;
+			}
+
+			// Nonce buffer
+			if((ret = clSetKernelArg(Kernels[i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS)
+			{
+				printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1);
+				return ERR_OCL_API;
+			}
+
+			// Output
+			if((ret = clSetKernelArg(Kernels[i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS)
+			{
+				printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2);
+				return ERR_OCL_API;
+			}
+
+			// Target
+			if((ret = clSetKernelArg(Kernels[i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS)
+			{
+				printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3);
+				return ERR_OCL_API;
+			}
+
+			if((clSetKernelArg(Kernels[i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
+			{
+				printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4);
+				return(ERR_OCL_API);
+			}
+		}
 	}
 
 	return ERR_SUCCESS;
@@ -1277,10 +1325,24 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 
 	size_t tmpNonce = ctx->Nonce;
 
-	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
+	if(miner_algo == cryptonight_gpu)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1);
-		return ERR_OCL_API;
+		size_t w_size_cn_gpu = w_size * 16;
+		size_t g_thd_cn_gpu = g_thd * 16;
+
+		if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, 0, &g_thd_cn_gpu, &w_size_cn_gpu, 0, NULL, NULL)) != CL_SUCCESS)
+		{
+			printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1);
+			return ERR_OCL_API;
+		}
+	}
+	else
+	{
+		if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
+		{
+			printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1);
+			return ERR_OCL_API;
+		}
 	}
 
 	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
@@ -1289,13 +1351,16 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 		return ERR_OCL_API;
 	}
 
-	for(int i = 0; i < 4; ++i)
+	if(miner_algo != cryptonight_gpu)
 	{
-		size_t tmpNonce = ctx->Nonce;
-		if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
+		for(int i = 0; i < 4; ++i)
 		{
-			printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
-			return ERR_OCL_API;
+			size_t tmpNonce = ctx->Nonce;
+			if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
+			{
+				printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
+				return ERR_OCL_API;
+			}
 		}
 	}
 
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index 5e9f618ed..5b95e9865 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -53,7 +53,7 @@ struct GpuContext
 	cl_mem OutputBuffer;
 	cl_mem ExtraBuffers[6];
 	std::map<xmrstak_algo, cl_program> Program;
-	std::map<xmrstak_algo, std::array<cl_kernel,8>> Kernels;
+	std::map<xmrstak_algo, std::array<cl_kernel,7>> Kernels;
 	size_t freeMem;
 	size_t maxMemPerAlloc;
 	int computeUnits;
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index f647bcafc..ea07d6a8b 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -78,21 +78,6 @@ inline int amd_bfe(const uint src0, const uint offset, const uint width)
 }
 #endif
 
-//#include "opencl/fast_int_math_v2.cl"
-XMRSTAK_INCLUDE_FAST_INT_MATH_V2
-//#include "fast_div_heavy.cl"
-XMRSTAK_INCLUDE_FAST_DIV_HEAVY
-//#include "opencl/wolf-aes.cl"
-XMRSTAK_INCLUDE_WOLF_AES
-//#include "opencl/wolf-skein.cl"
-XMRSTAK_INCLUDE_WOLF_SKEIN
-//#include "opencl/jh.cl"
-XMRSTAK_INCLUDE_JH
-//#include "opencl/blake256.cl"
-XMRSTAK_INCLUDE_BLAKE256
-//#include "opencl/groestl256.cl"
-XMRSTAK_INCLUDE_GROESTL256
-
 static const __constant ulong keccakf_rndc[24] =
 {
     0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
@@ -186,31 +171,49 @@ static const __constant uint keccakf_piln[24] =
     15, 23, 19, 13, 12, 2, 20, 14, 22, 9,  6,  1
 };
 
-void keccakf1600_1(ulong *st)
+inline void keccakf1600_1(ulong st[25])
 {
     int i, round;
     ulong t, bc[5];
 
     #pragma unroll 1
-    for(round = 0; round < 24; ++round)
+    for (round = 0; round < 24; ++round)
     {
+        bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20] ^ rotate(st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22], 1UL);
+        bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21] ^ rotate(st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23], 1UL);
+        bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22] ^ rotate(st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24], 1UL);
+        bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23] ^ rotate(st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20], 1UL);
+        bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24] ^ rotate(st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21], 1UL);
 
-        // Theta
-        bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
-        bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
-        bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
-        bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
-        bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
-
-        #pragma unroll 1
-        for (i = 0; i < 5; ++i) {
-            t = bc[(i + 4) % 5] ^ rotate(bc[(i + 1) % 5], 1UL);
-            st[i     ] ^= t;
-            st[i +  5] ^= t;
-            st[i + 10] ^= t;
-            st[i + 15] ^= t;
-            st[i + 20] ^= t;
-        }
+        st[0] ^= bc[4];
+        st[5] ^= bc[4];
+        st[10] ^= bc[4];
+        st[15] ^= bc[4];
+        st[20] ^= bc[4];
+
+        st[1] ^= bc[0];
+        st[6] ^= bc[0];
+        st[11] ^= bc[0];
+        st[16] ^= bc[0];
+        st[21] ^= bc[0];
+
+        st[2] ^= bc[1];
+        st[7] ^= bc[1];
+        st[12] ^= bc[1];
+        st[17] ^= bc[1];
+        st[22] ^= bc[1];
+
+        st[3] ^= bc[2];
+        st[8] ^= bc[2];
+        st[13] ^= bc[2];
+        st[18] ^= bc[2];
+        st[23] ^= bc[2];
+
+        st[4] ^= bc[3];
+        st[9] ^= bc[3];
+        st[14] ^= bc[3];
+        st[19] ^= bc[3];
+        st[24] ^= bc[3];
 
         // Rho Pi
         t = st[1];
@@ -221,17 +224,16 @@ void keccakf1600_1(ulong *st)
             t = bc[0];
         }
 
-        #pragma unroll 1
+        #pragma unroll
         for(int i = 0; i < 25; i += 5)
         {
-            ulong tmp[5];
-
-            #pragma unroll 1
-            for(int x = 0; x < 5; ++x)
-                tmp[x] = bitselect(st[i + x] ^ st[i + ((x + 2) % 5)], st[i + x], st[i + ((x + 1) % 5)]);
+            ulong tmp1 = st[i], tmp2 = st[i + 1];
 
-            #pragma unroll 1
-            for(int x = 0; x < 5; ++x) st[i + x] = tmp[x];
+            st[i] = bitselect(st[i] ^ st[i + 2], st[i], st[i + 1]);
+            st[i + 1] = bitselect(st[i + 1] ^ st[i + 3], st[i + 1], st[i + 2]);
+            st[i + 2] = bitselect(st[i + 2] ^ st[i + 4], st[i + 2], st[i + 3]);
+            st[i + 3] = bitselect(st[i + 3] ^ tmp1, st[i + 3], st[i + 4]);
+            st[i + 4] = bitselect(st[i + 4] ^ tmp2, st[i + 4], tmp1);
         }
 
         //  Iota
@@ -311,6 +313,43 @@ void keccakf1600_2(__local ulong *st)
     }
 }
 
+#define MEM_CHUNK (1<<MEM_CHUNK_EXPONENT)
+
+#if(STRIDED_INDEX==0)
+#   define IDX(x)	(x)
+#elif(STRIDED_INDEX==1)
+#	define IDX(x)   (mul24(((uint)(x)), Threads))
+#elif(STRIDED_INDEX==2)
+#   define IDX(x)	(((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK)
+#elif(STRIDED_INDEX==3)
+#	define IDX(x)   ((x) * WORKSIZE)
+#endif
+
+#define JOIN_DO(x,y) x##y
+#define JOIN(x,y) JOIN_DO(x,y)
+
+inline uint getIdx()
+{
+    return get_global_id(0) - get_global_offset(0);
+}
+
+//#include "opencl/fast_int_math_v2.cl"
+XMRSTAK_INCLUDE_FAST_INT_MATH_V2
+//#include "fast_div_heavy.cl"
+XMRSTAK_INCLUDE_FAST_DIV_HEAVY
+//#include "opencl/wolf-aes.cl"
+XMRSTAK_INCLUDE_WOLF_AES
+//#include "opencl/wolf-skein.cl"
+XMRSTAK_INCLUDE_WOLF_SKEIN
+//#include "opencl/jh.cl"
+XMRSTAK_INCLUDE_JH
+//#include "opencl/blake256.cl"
+XMRSTAK_INCLUDE_BLAKE256
+//#include "opencl/groestl256.cl"
+XMRSTAK_INCLUDE_GROESTL256
+//#include "opencl/cryptonight_gpu.cl"
+XMRSTAK_INCLUDE_CN_GPU
+
 )==="
 R"===(
 
@@ -360,28 +399,8 @@ void AESExpandKey256(uint *keybuf)
 )==="
 R"===(
 
-#define MEM_CHUNK (1<<MEM_CHUNK_EXPONENT)
-
-#if(STRIDED_INDEX==0)
-#   define IDX(x)	(x)
-#elif(STRIDED_INDEX==1)
-#	define IDX(x)   (mul24(((uint)(x)), Threads))
-#elif(STRIDED_INDEX==2)
-#   define IDX(x)	(((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK)
-#elif(STRIDED_INDEX==3)
-#	define IDX(x)   ((x) * WORKSIZE)
-#endif
-
-inline uint getIdx()
-{
-    return get_global_id(0) - get_global_offset(0);
-}
-
 #define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)]
 
-        #define JOIN_DO(x,y) x##y
-#define JOIN(x,y) JOIN_DO(x,y)
-
 __attribute__((reqd_work_group_size(8, 8, 1)))
 __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, uint Threads)
 {
@@ -538,7 +557,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 R"===(
 
 // cryptonight_monero_v8 && NVIDIA
-#if((ALGO==11  || ALGO==13) && defined(__NV_CL_C_VERSION))
+#if((ALGO==11 || ALGO==14) && defined(__NV_CL_C_VERSION))
 #	define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4))))
 #	define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4)))))
 #else
@@ -556,7 +575,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 	ulong a[2];
 
 // cryptonight_monero_v8
-#if(ALGO==11 || ALGO==13)
+#if(ALGO==11 || ALGO==14)
 	ulong b[4];
 	uint4 b_x[2];
 // NVIDIA
@@ -571,7 +590,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
     __local uint AES0[256], AES1[256];
 
 // cryptonight_monero_v8
-#if(ALGO==11 || ALGO==13)
+#if(ALGO==11 || ALGO==14)
 #	if defined(__clang__) && !defined(__NV_CL_C_VERSION)
     __local uint RCP[256];
 #	endif
@@ -587,7 +606,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
         AES0[i] = tmp;
         AES1[i] = rotate(tmp, 8U);
 // cryptonight_monero_v8
-#if((ALGO==11 || ALGO==13) && (defined(__clang__) && !defined(__NV_CL_C_VERSION)))
+#if((ALGO==11 || ALGO==14) && (defined(__clang__) && !defined(__NV_CL_C_VERSION)))
 		RCP[i] = RCP_C[i];
 #endif
     }
@@ -622,7 +641,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 		b_x[0] = ((uint4 *)b)[0];
 
 // cryptonight_monero_v8
-#if(ALGO==11 || ALGO==13)
+#if(ALGO==11 || ALGO==14)
         a[1] = states[1] ^ states[5];
         b[2] = states[8] ^ states[10];
         b[3] = states[9] ^ states[11];
@@ -654,7 +673,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
     {
 			ulong c[2];
 // cryptonight_monero_v8 && NVIDIA
-#if((ALGO==11 || ALGO==13) && defined(__NV_CL_C_VERSION))
+#if((ALGO==11 || ALGO==14) && defined(__NV_CL_C_VERSION))
 			uint idxS = idx0 & 0x30U;
  			*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
 #endif
@@ -668,7 +687,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #endif
 
 // cryptonight_monero_v8
-#if(ALGO==11 || ALGO==13)
+#if(ALGO==11 || ALGO==14)
         {
 			ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
 			ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
@@ -693,7 +712,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			SCRATCHPAD_CHUNK(0) = b_x[0];
 			idx0 = as_uint2(c[0]).s0 & MASK;
 // cryptonight_monero_v8
-#elif(ALGO==11 || ALGO==13)
+#elif(ALGO==11 || ALGO==14)
 			SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0];
 #	ifdef __NV_CL_C_VERSION
 			// flush shuffled data
@@ -712,7 +731,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			uint4 tmp;
 			tmp = SCRATCHPAD_CHUNK(0);
 // cryptonight_monero_v8
-#if(ALGO==11 || ALGO==13)
+#if(ALGO==11 || ALGO==14)
 			// Use division and square root results from the _previous_ iteration to hide the latency
             tmp.s0 ^= division_result.s0;
             tmp.s1 ^= division_result.s1 ^ sqrt_result;
@@ -805,7 +824,13 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 R"===(
 
 __attribute__((reqd_work_group_size(8, 8, 1)))
-__kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, uint Threads)
+__kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states,
+// cryptonight_gpu
+#if (ALGO == 13)
+	__global uint *output, ulong Target, uint Threads)
+#else
+	__global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, uint Threads)
+#endif
 {
     __local uint AES0[256], AES1[256], AES2[256], AES3[256];
     uint ExpandedKey2[40];
@@ -823,8 +848,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
-#if (ALGO == 4 || ALGO == 9 || ALGO == 10  || ALGO == 12)
+// cryptonight_gpu || cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
+#if (ALGO == 13 || ALGO == 4 || ALGO == 9 || ALGO == 10  || ALGO == 12)
     __local uint4 xin1[8][8];
     __local uint4 xin2[8][8];
 #endif
@@ -862,8 +887,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
-#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
+// cryptonight_gpu || cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
+#if (ALGO == 13 || ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
     __local uint4* xin1_store = &xin1[get_local_id(1)][get_local_id(0)];
     __local uint4* xin1_load = &xin1[(get_local_id(1) + 1) % 8][get_local_id(0)];
     __local uint4* xin2_store = &xin2[get_local_id(1)][get_local_id(0)];
@@ -876,7 +901,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
     if (gIdx < Threads)
 #endif
     {
-#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
+// cryptonight_gpu || cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
+#if (ALGO == 13 || ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
         #pragma unroll 2
         for(int i = 0, i1 = get_local_id(1); i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4))
         {
@@ -916,8 +942,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #endif
     }
 
-// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
-#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
+// cryptonight_gpu || cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
+#if (ALGO == 13 || ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
     /* Also left over threads performe this loop.
      * The left over thread results will be ignored
      */
@@ -959,7 +985,15 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
             for(int i = 0; i < 25; ++i) State[i] = states[i];
 
             keccakf1600_2(State);
-
+#if (ALGO == 13)
+			if(State[3] <= Target)
+			{
+                //printf("gt %lu\n", State[3]);
+				ulong outIdx = atomic_inc(output + 0xFF);
+				if(outIdx < 0xFF)
+					output[outIdx] = get_global_id(0);
+			}
+#else
             for(int i = 0; i < 25; ++i) states[i] = State[i];
 
             uint StateSwitch = State[0] & 3;
@@ -967,6 +1001,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
             __global uint *destinationBranch2 = StateSwitch == 2 ? Branch2 : Branch3;
             __global uint *destinationBranch = StateSwitch < 2 ? destinationBranch1 : destinationBranch2;
             destinationBranch[atomic_inc(destinationBranch + Threads)] = gIdx;
+#endif
         }
     }
     mem_fence(CLK_GLOBAL_MEM_FENCE);
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
new file mode 100644
index 000000000..a99243e44
--- /dev/null
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
@@ -0,0 +1,383 @@
+R"===(
+
+
+inline float4 _mm_add_ps(float4 a, float4 b)
+{
+	return a + b;
+}
+
+inline float4 _mm_sub_ps(float4 a, float4 b)
+{
+	return a - b;
+}
+
+inline float4 _mm_mul_ps(float4 a, float4 b)
+{
+
+	//#pragma OPENCL SELECT_ROUNDING_MODE rte
+	return a * b;
+}
+
+inline float4 _mm_div_ps(float4 a, float4 b)
+{
+	return a / b;
+}
+
+inline float4 _mm_and_ps(float4 a, int b)
+{
+	return as_float4(as_int4(a) & (int4)(b));
+}
+
+inline float4 _mm_or_ps(float4 a, int b)
+{
+	return as_float4(as_int4(a) | (int4)(b));
+}
+
+inline float4 _mm_fmod_ps(float4 v, float dc)
+{
+	float4 d = (float4)(dc);
+	float4 c = _mm_div_ps(v, d);
+	c = trunc(c);
+	c = _mm_mul_ps(c, d);
+	return _mm_sub_ps(v, c);
+}
+
+inline int4 _mm_xor_si128(int4 a, int4 b)
+{
+	return a ^ b;
+}
+
+inline float4 _mm_xor_ps(float4 a, int b)
+{
+	return as_float4(as_int4(a) ^ (int4)(b));
+}
+
+inline int4 _mm_alignr_epi8(int4 a, const uint rot)
+{
+	const uint right = 8 * rot;
+	const uint left = (32 - 8 * rot);
+	return (int4)(
+		((uint)a.x >> right) | ( a.y << left ),
+		((uint)a.y >> right) | ( a.z << left ),
+		((uint)a.z >> right) | ( a.w << left ),
+		((uint)a.w >> right) | ( a.x << left )
+	);
+}
+
+
+inline global int4* scratchpad_ptr(uint idx, uint n, __global int *lpad) { return (__global int4*)((__global char*)lpad + (idx & MASK) + n * 16); }
+
+inline float4 fma_break(float4 x)
+{
+	// Break the dependency chain by setitng the exp to ?????01
+	x = _mm_and_ps(x, 0xFEFFFFFF);
+	return _mm_or_ps(x, 0x00800000);
+}
+
+inline void sub_round(float4 n0, float4 n1, float4 n2, float4 n3, float4 rnd_c, float4* n, float4* d, float4* c)
+{
+	n1 = _mm_add_ps(n1, *c);
+	float4 nn = _mm_mul_ps(n0, *c);
+	nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn));
+	nn = fma_break(nn);
+	*n = _mm_add_ps(*n, nn);
+
+	n3 = _mm_sub_ps(n3, *c);
+	float4 dd = _mm_mul_ps(n2, *c);
+	dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd));
+	dd = fma_break(dd);
+	*d = _mm_add_ps(*d, dd);
+
+	//Constant feedback
+	*c = _mm_add_ps(*c, rnd_c);
+	*c = _mm_add_ps(*c, (float4)(0.734375f));
+	float4 r = _mm_add_ps(nn, dd);
+	r = _mm_and_ps(r, 0x807FFFFF);
+	r = _mm_or_ps(r, 0x40000000);
+	*c = _mm_add_ps(*c, r);
+
+}
+
+// 9*8 + 2 = 74
+inline void round_compute(float4 n0, float4 n1, float4 n2, float4 n3, float4 rnd_c, float4* c, float4* r)
+{
+	float4 n = (float4)(0.0f);
+	float4 d = (float4)(0.0f);
+
+	sub_round(n0, n1, n2, n3, rnd_c, &n, &d, c);
+	sub_round(n1, n2, n3, n0, rnd_c, &n, &d, c);
+	sub_round(n2, n3, n0, n1, rnd_c, &n, &d, c);
+	sub_round(n3, n0, n1, n2, rnd_c, &n, &d, c);
+	sub_round(n3, n2, n1, n0, rnd_c, &n, &d, c);
+	sub_round(n2, n1, n0, n3, rnd_c, &n, &d, c);
+	sub_round(n1, n0, n3, n2, rnd_c, &n, &d, c);
+	sub_round(n0, n3, n2, n1, rnd_c, &n, &d, c);
+
+	// Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0
+	d = _mm_and_ps(d, 0xFF7FFFFF);
+	d = _mm_or_ps(d, 0x40000000);
+	*r =_mm_add_ps(*r, _mm_div_ps(n,d));
+}
+
+inline int4 single_comupte(float4 n0, float4 n1, float4 n2, float4 n3, float cnt, float4 rnd_c, __local float4* sum)
+{
+	float4 c= (float4)(cnt);
+	// 35 maths calls follow (140 FLOPS)
+	float4 r = (float4)(0.0f);
+
+	for(int i = 0; i < 4; ++i)
+		round_compute(n0, n1, n2, n3, rnd_c, &c, &r);
+
+	// do a quick fmod by setting exp to 2
+	r = _mm_and_ps(r, 0x807FFFFF);
+	r = _mm_or_ps(r, 0x40000000);
+	*sum = r; // 34
+	float4 x = (float4)(536870880.0f);
+	r = _mm_mul_ps(r, x); // 35
+	return convert_int4_rte(r);
+}
+
+inline void single_comupte_wrap(const uint rot, int4 v0, int4 v1, int4 v2, int4 v3, float cnt, float4 rnd_c, __local float4* sum, __local int4* out)
+{
+	float4 n0 = convert_float4_rte(v0);
+	float4 n1 = convert_float4_rte(v1);
+	float4 n2 = convert_float4_rte(v2);
+	float4 n3 = convert_float4_rte(v3);
+
+	int4 r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum);
+	*out = rot == 0 ? r : _mm_alignr_epi8(r, rot);
+}
+
+)==="
+R"===(
+
+static const __constant uint look[16][4] = {
+	{0, 1, 2, 3},
+	{0, 2, 3, 1},
+	{0, 3, 1, 2},
+	{0, 3, 2, 1},
+
+	{1, 0, 2, 3},
+	{1, 2, 3, 0},
+	{1, 3, 0, 2},
+	{1, 3, 2, 0},
+
+	{2, 1, 0, 3},
+	{2, 0, 3, 1},
+	{2, 3, 1, 0},
+	{2, 3, 0, 1},
+
+	{3, 1, 2, 0},
+	{3, 2, 0, 1},
+	{3, 0, 1, 2},
+	{3, 0, 2, 1}
+};
+
+static const __constant float ccnt[16] = {
+	1.34375f,
+	1.28125f,
+	1.359375f,
+	1.3671875f,
+
+	1.4296875f,
+	1.3984375f,
+	1.3828125f,
+	1.3046875f,
+
+	1.4140625f,
+	1.2734375f,
+	1.2578125f,
+	1.2890625f,
+
+	1.3203125f,
+	1.3515625f,
+	1.3359375f,
+	1.4609375f
+};
+
+__attribute__((reqd_work_group_size(WORKSIZE * 16, 1, 1)))
+__kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, uint numThreads)
+{
+	const uint gIdx = getIdx();
+
+#if(COMP_MODE==1)
+	if(gIdx < Threads)
+		return;
+#endif
+
+	uint chunk = get_local_id(0) / 16;
+
+#if(STRIDED_INDEX==0)
+	__global int* lpad = (__global int*)((__global char*)lpad_in + MEMORY * (gIdx/16));
+#endif
+
+	__local int4 smem2[1 * 4 * WORKSIZE];
+	__local int4 smemOut2[1 * 16 * WORKSIZE];
+	__local float4 smemVa2[1 * 16 * WORKSIZE];
+
+	__local int4* smem = smem2 + 4 * chunk;
+	__local int4* smemOut = smemOut2 + 16 * chunk;
+	__local float4* smemVa = smemVa2 + 16 * chunk;
+
+	uint tid = get_local_id(0) % 16;
+
+	uint idxHash = gIdx/16;
+	uint s = ((__global uint*)spad)[idxHash * 50] >> 8;
+	float4 vs = (float4)(0);
+
+	for(size_t i = 0; i < ITERATIONS; i++)
+	{
+		mem_fence(CLK_LOCAL_MEM_FENCE);
+		((__local int*)smem)[tid] = ((__global int*)scratchpad_ptr(s, tid/4, lpad))[tid%4];
+		mem_fence(CLK_LOCAL_MEM_FENCE);
+
+		float4 rc = vs;
+
+		{
+			single_comupte_wrap(
+				tid%4,
+				*(smem + look[tid][0]),
+				*(smem + look[tid][1]),
+				*(smem + look[tid][2]),
+				*(smem + look[tid][3]),
+				ccnt[tid], rc, smemVa + tid,
+				smemOut + tid
+			);
+		}
+		mem_fence(CLK_LOCAL_MEM_FENCE);
+
+		int4 tmp2;
+		if(tid % 4 == 0)
+		{
+			int4 out = _mm_xor_si128(smemOut[tid], smemOut[tid + 1]);
+			int4 out2 = _mm_xor_si128(smemOut[tid + 2], smemOut[tid + 3]);
+			out = _mm_xor_si128(out, out2);
+			tmp2 = out;
+			*scratchpad_ptr(s , tid/4, lpad) = _mm_xor_si128(smem[tid/4], out);
+		}
+		mem_fence(CLK_LOCAL_MEM_FENCE);
+		if(tid % 4 == 0)
+			smemOut[tid] = tmp2;
+		mem_fence(CLK_LOCAL_MEM_FENCE);
+		int4 out2 = smemOut[0] ^ smemOut[4] ^ smemOut[8] ^ smemOut[12];
+
+		if(tid%2 == 0)
+			smemVa[tid] = smemVa[tid] + smemVa[tid + 1];
+		if(tid%4 == 0)
+			smemVa[tid] = smemVa[tid] + smemVa[tid + 2];
+		if(tid%8 == 0)
+			smemVa[tid] = smemVa[tid] + smemVa[tid + 4];
+		if(tid%16 == 0)
+			smemVa[tid] = smemVa[tid] + smemVa[tid + 8];
+		vs = smemVa[0];
+
+		vs = fabs(vs); // take abs(va) by masking the float sign bit
+		float4 xx = _mm_mul_ps(vs, (float4)(16777216.0f));
+		// vs range 0 - 64
+		int4 tmp = convert_int4_rte(xx);
+		tmp = _mm_xor_si128(tmp, out2);
+		// vs is now between 0 and 1
+		vs = _mm_div_ps(vs, (float4)(64.0f));
+		s = tmp.x ^ tmp.y ^ tmp.z ^ tmp.w;
+	}
+}
+
+)==="
+R"===(
+
+inline void generate_512(ulong idx, __local ulong* in, __global ulong* out)
+{
+	ulong hash[25];
+
+	hash[0] = in[0] ^ idx;
+	for(int i = 1; i < 25; ++i)
+		hash[i] = in[i];
+
+	keccakf1600_1(hash);
+	for(int i = 0; i < 20; ++i)
+		out[i] = hash[i];
+	out+=160/8;
+
+	keccakf1600_1(hash);
+	for(int i = 0; i < 22; ++i)
+		out[i] = hash[i];
+	out+=176/8;
+
+	keccakf1600_1(hash);
+	for(int i = 0; i < 22; ++i)
+		out[i] = hash[i];
+}
+
+__attribute__((reqd_work_group_size(8, 8, 1)))
+__kernel void JOIN(cn0_cn_gpu,ALGO)(__global ulong *input, __global int *Scratchpad, __global ulong *states, uint Threads)
+{
+    const uint gIdx = getIdx();
+    __local ulong State_buf[8 * 25];
+	__local ulong* State = State_buf + get_local_id(0) * 25;
+
+#if(COMP_MODE==1)
+    // do not use early return here
+	if(gIdx < Threads)
+#endif
+    {
+        states += 25 * gIdx;
+
+#if(STRIDED_INDEX==0)
+        Scratchpad = (__global int*)((__global char*)Scratchpad + MEMORY * gIdx);
+#endif
+
+        if (get_local_id(1) == 0)
+        {
+
+// NVIDIA
+#ifdef __NV_CL_C_VERSION
+			for(uint i = 0; i < 8; ++i)
+				State[i] = input[i];
+#else
+            ((__local ulong8 *)State)[0] = vload8(0, input);
+#endif
+            State[8]  = input[8];
+            State[9]  = input[9];
+            State[10] = input[10];
+
+            ((__local uint *)State)[9]  &= 0x00FFFFFFU;
+            ((__local uint *)State)[9]  |= (((uint)get_global_id(0)) & 0xFF) << 24;
+            ((__local uint *)State)[10] &= 0xFF000000U;
+            /* explicit cast to `uint` is required because some OpenCL implementations (e.g. NVIDIA)
+             * handle get_global_id and get_global_offset as signed long long int and add
+             * 0xFFFFFFFF... to `get_global_id` if we set on host side a 32bit offset where the first bit is `1`
+             * (even if it is correct casted to unsigned on the host)
+             */
+            ((__local uint *)State)[10] |= (((uint)get_global_id(0) >> 8));
+
+            for (int i = 11; i < 25; ++i) {
+                State[i] = 0x00UL;
+            }
+
+            // Last bit of padding
+            State[16] = 0x8000000000000000UL;
+
+            keccakf1600_2(State);
+
+            #pragma unroll
+            for (int i = 0; i < 25; ++i) {
+                states[i] = State[i];
+            }
+        }
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+#if(COMP_MODE==1)
+    // do not use early return here
+	if(gIdx < Threads)
+#endif
+	{
+		for(ulong i = get_local_id(1); i < MEMORY / 512; i += get_local_size(1))
+		{
+			generate_512(i, State, (__global ulong*)((__global uchar*)Scratchpad + i*512));
+		}
+	}
+}
+
+)==="
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index f2dce7f90..7ca072c95 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -137,6 +137,9 @@ class autoAdjust
 			// true for all cryptonight_heavy derivates since we check the user and dev pool
 			bool useCryptonight_heavy = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end();
 
+			// true for all cryptonight_gpu derivates since we check the user and dev pool
+			bool useCryptonight_gpu = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_gpu) != neededAlgorithms.end();
+
 			// set strided index to default
 			ctx.stridedIndex = 1;
 
@@ -158,13 +161,21 @@ class autoAdjust
 			if (hashMemSize <= CRYPTONIGHT_TURTLE_MEMORY)
 				maxThreads *= 4u;
 
+			if(useCryptonight_gpu)
+			{
+				// 6 waves per compute unit are a good value (based on profiling)
+				// @todo check again after all optimizations
+				maxThreads = ctx.computeUnits * 6 * 8;
+				ctx.stridedIndex = 0;
+			}
+
 			// keep 128MiB memory free (value is randomly chosen) from the max available memory
 			const size_t maxAvailableFreeMem = ctx.freeMem - minFreeMem;
 
 			size_t memPerThread = std::min(ctx.maxMemPerAlloc, maxAvailableFreeMem);
 
 			uint32_t numThreads = 1u;
-			if(ctx.isAMD)
+			if(ctx.isAMD && !useCryptonight_gpu)
 			{
 				numThreads = 2;
 				size_t memDoubleThread = maxAvailableFreeMem / numThreads;
diff --git a/xmrstak/backend/cpu/crypto/cn_gpu.hpp b/xmrstak/backend/cpu/crypto/cn_gpu.hpp
new file mode 100644
index 000000000..4a7697b02
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/cn_gpu.hpp
@@ -0,0 +1,43 @@
+#pragma once
+
+#include <stdint.h>
+
+#if defined(_WIN32) || defined(_WIN64)
+#include <malloc.h>
+#include <intrin.h>
+#define HAS_WIN_INTRIN_API
+#endif
+
+#ifdef __GNUC__
+#include <x86intrin.h>
+#if !defined(HAS_WIN_INTRIN_API)
+#include <cpuid.h>
+#endif // !defined(HAS_WIN_INTRIN_API)
+#endif // __GNUC__
+
+inline void cngpu_cpuid(uint32_t eax, int32_t ecx, int32_t val[4])
+{
+	val[0] = 0;
+	val[1] = 0;
+	val[2] = 0;
+	val[3] = 0;
+
+#if defined(HAS_WIN_INTRIN_API)
+	__cpuidex(val, eax, ecx);
+#else
+	__cpuid_count(eax, ecx, val[0], val[1], val[2], val[3]);
+#endif
+}
+
+inline bool cngpu_check_avx2()
+{
+	int32_t cpu_info[4];
+	cngpu_cpuid(7, 0, cpu_info);
+	return (cpu_info[1] & (1 << 5)) != 0;
+}
+
+template<size_t ITER, uint32_t MASK>
+void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad);
+
+template<size_t ITER, uint32_t MASK>
+void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad);
diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp
new file mode 100644
index 000000000..e46705fd0
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp
@@ -0,0 +1,176 @@
+#include "cn_gpu.hpp"
+#include "../../cryptonight.hpp"
+
+#pragma GCC target ("avx2")
+
+inline void prep_dv_avx(__m256i* idx, __m256i& v, __m256& n01)
+{
+	v = _mm256_load_si256(idx);
+	n01 = _mm256_cvtepi32_ps(v);
+}
+
+inline __m256 fma_break(const __m256& x) 
+{ 
+	// Break the dependency chain by setitng the exp to ?????01 
+	__m256 xx = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0xFEFFFFFF)), x); 
+	return _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x00800000)), xx); 
+}
+
+// 14
+inline void sub_round(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, const __m256& rnd_c, __m256& n, __m256& d, __m256& c)
+{
+	__m256 nn = _mm256_mul_ps(n0, c);
+	nn = _mm256_mul_ps(_mm256_add_ps(n1, c), _mm256_mul_ps(nn, nn));
+	nn = fma_break(nn);
+	n = _mm256_add_ps(n, nn);
+
+	__m256 dd = _mm256_mul_ps(n2, c);
+	dd = _mm256_mul_ps(_mm256_sub_ps(n3, c), _mm256_mul_ps(dd, dd));
+	dd = fma_break(dd);
+	d = _mm256_add_ps(d, dd);
+
+	//Constant feedback
+	c = _mm256_add_ps(c, rnd_c);
+	c = _mm256_add_ps(c, _mm256_set1_ps(0.734375f));
+	__m256 r = _mm256_add_ps(nn, dd);
+	r = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x807FFFFF)), r);
+	r = _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x40000000)), r);
+	c = _mm256_add_ps(c, r);
+}
+
+// 14*8 + 2 = 112
+inline void round_compute(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, const __m256& rnd_c, __m256& c, __m256& r)
+{
+	__m256 n = _mm256_setzero_ps(), d = _mm256_setzero_ps();
+
+	sub_round(n0, n1, n2, n3, rnd_c, n, d, c);
+	sub_round(n1, n2, n3, n0, rnd_c, n, d, c);
+	sub_round(n2, n3, n0, n1, rnd_c, n, d, c);
+	sub_round(n3, n0, n1, n2, rnd_c, n, d, c);
+	sub_round(n3, n2, n1, n0, rnd_c, n, d, c);
+	sub_round(n2, n1, n0, n3, rnd_c, n, d, c);
+	sub_round(n1, n0, n3, n2, rnd_c, n, d, c);
+	sub_round(n0, n3, n2, n1, rnd_c, n, d, c);
+
+	// Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0
+	d = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0xFF7FFFFF)), d);
+	d = _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x40000000)), d);
+	r = _mm256_add_ps(r, _mm256_div_ps(n, d));
+}
+
+// 112×4 = 448
+template <bool add>
+inline __m256i double_comupte(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, 
+							  float lcnt, float hcnt, const __m256& rnd_c, __m256& sum)
+{
+	__m256 c = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_set1_ps(lcnt)), _mm_set1_ps(hcnt), 1);
+	__m256 r = _mm256_setzero_ps();
+
+	round_compute(n0, n1, n2, n3, rnd_c, c, r);
+	round_compute(n0, n1, n2, n3, rnd_c, c, r);
+	round_compute(n0, n1, n2, n3, rnd_c, c, r);
+	round_compute(n0, n1, n2, n3, rnd_c, c, r);
+
+	// do a quick fmod by setting exp to 2
+	r = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x807FFFFF)), r);
+	r = _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x40000000)), r);
+
+	if(add)
+		sum = _mm256_add_ps(sum, r);
+	else
+		sum = r;
+
+	r = _mm256_mul_ps(r, _mm256_set1_ps(536870880.0f)); // 35
+	return _mm256_cvttps_epi32(r);
+}
+
+template <size_t rot>
+inline void double_comupte_wrap(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, 
+								float lcnt, float hcnt, const __m256& rnd_c, __m256& sum, __m256i& out)
+{
+	__m256i r = double_comupte<rot % 2 != 0>(n0, n1, n2, n3, lcnt, hcnt, rnd_c, sum);
+	if(rot != 0)
+		r = _mm256_or_si256(_mm256_bslli_epi128(r, 16 - rot), _mm256_bsrli_epi128(r, rot));
+
+	out = _mm256_xor_si256(out, r);
+}
+
+template<uint32_t MASK>
+inline __m256i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n) { return reinterpret_cast<__m256i*>(lpad + (idx & MASK) + n*16); }
+
+template<size_t ITER, uint32_t MASK>
+void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad)
+{
+	uint32_t s = reinterpret_cast<const uint32_t*>(spad)[0] >> 8;
+	__m256i* idx0 = scratchpad_ptr<MASK>(lpad, s, 0);
+	__m256i* idx2 = scratchpad_ptr<MASK>(lpad, s, 2);
+	__m256 sum0 = _mm256_setzero_ps();
+
+	for(size_t i = 0; i < ITER; i++)
+	{
+		__m256i v01, v23;
+		__m256 suma, sumb, sum1;
+		__m256 rc = sum0;
+
+		__m256 n01, n23;
+		__m256 d01, d23;
+		prep_dv_avx(idx0, v01, n01);
+		prep_dv_avx(idx2, v23, n23);
+		
+		__m256i out, out2;
+		__m256 n10, n22, n33;
+		n10 = _mm256_permute2f128_ps(n01, n01, 0x01);
+		n22 = _mm256_permute2f128_ps(n23, n23, 0x00);
+		n33 = _mm256_permute2f128_ps(n23, n23, 0x11);
+		
+		out = _mm256_setzero_si256();
+		double_comupte_wrap<0>(n01, n10, n22, n33, 1.3437500f, 1.4296875f, rc, suma, out);
+		double_comupte_wrap<1>(n01, n22, n33, n10, 1.2812500f, 1.3984375f, rc, suma, out);
+		double_comupte_wrap<2>(n01, n33, n10, n22, 1.3593750f, 1.3828125f, rc, sumb, out);
+		double_comupte_wrap<3>(n01, n33, n22, n10, 1.3671875f, 1.3046875f, rc, sumb, out);
+		_mm256_store_si256(idx0, _mm256_xor_si256(v01, out));
+		sum0 = _mm256_add_ps(suma, sumb);
+		out2 = out;
+		
+		__m256 n11, n02, n30;
+		n11 = _mm256_permute2f128_ps(n01, n01, 0x11);
+		n02 = _mm256_permute2f128_ps(n01, n23, 0x20);
+		n30 = _mm256_permute2f128_ps(n01, n23, 0x03);
+
+		out = _mm256_setzero_si256();
+		double_comupte_wrap<0>(n23, n11, n02, n30, 1.4140625f, 1.3203125f, rc, suma, out);
+		double_comupte_wrap<1>(n23, n02, n30, n11, 1.2734375f, 1.3515625f, rc, suma, out);
+		double_comupte_wrap<2>(n23, n30, n11, n02, 1.2578125f, 1.3359375f, rc, sumb, out);
+		double_comupte_wrap<3>(n23, n30, n02, n11, 1.2890625f, 1.4609375f, rc, sumb, out);
+		_mm256_store_si256(idx2, _mm256_xor_si256(v23, out));
+		sum1 = _mm256_add_ps(suma, sumb);
+
+		out2 = _mm256_xor_si256(out2, out);
+		out2 = _mm256_xor_si256(_mm256_permute2x128_si256(out2,out2,0x41), out2);
+		suma = _mm256_permute2f128_ps(sum0, sum1, 0x30);
+		sumb = _mm256_permute2f128_ps(sum0, sum1, 0x21);
+		sum0 = _mm256_add_ps(suma, sumb);
+		sum0 = _mm256_add_ps(sum0, _mm256_permute2f128_ps(sum0, sum0, 0x41));
+
+		// Clear the high 128 bits
+		__m128 sum = _mm256_castps256_ps128(sum0);
+
+		sum = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)), sum); // take abs(va) by masking the float sign bit
+		// vs range 0 - 64 
+		__m128i v0 = _mm_cvttps_epi32(_mm_mul_ps(sum, _mm_set1_ps(16777216.0f)));
+		v0 = _mm_xor_si128(v0, _mm256_castsi256_si128(out2));
+		__m128i v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 2, 3));
+		v0 = _mm_xor_si128(v0, v1);
+		v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 0, 1));
+		v0 = _mm_xor_si128(v0, v1);
+
+		// vs is now between 0 and 1
+		sum = _mm_div_ps(sum, _mm_set1_ps(64.0f));
+		sum0 = _mm256_insertf128_ps(_mm256_castps128_ps256(sum), sum, 1);
+		uint32_t n = _mm_cvtsi128_si32(v0);
+		idx0 = scratchpad_ptr<MASK>(lpad, n, 0);
+		idx2 = scratchpad_ptr<MASK>(lpad, n, 2);
+	}
+}
+
+template void cn_gpu_inner_avx<CRYPTONIGHT_GPU_ITER, CRYPTONIGHT_GPU_MASK>(const uint8_t* spad, uint8_t* lpad);
diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp
new file mode 100644
index 000000000..bde34162a
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp
@@ -0,0 +1,182 @@
+#include "cn_gpu.hpp"
+#include "../../cryptonight.hpp"
+
+#pragma GCC target ("sse2")
+
+inline void prep_dv(__m128i* idx, __m128i& v, __m128& n)
+{
+	v = _mm_load_si128(idx);
+	n = _mm_cvtepi32_ps(v);
+}
+
+inline __m128 fma_break(__m128 x) 
+{ 
+	// Break the dependency chain by setitng the exp to ?????01 
+	x = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0xFEFFFFFF)), x); 
+	return _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x00800000)), x); 
+}
+
+// 14
+inline void sub_round(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& n, __m128& d, __m128& c)
+{
+	n1 = _mm_add_ps(n1, c);
+	__m128 nn = _mm_mul_ps(n0, c);
+	nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn));
+	nn = fma_break(nn);
+	n = _mm_add_ps(n, nn);
+
+	n3 = _mm_sub_ps(n3, c);
+	__m128 dd = _mm_mul_ps(n2, c);
+	dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd));
+	dd = fma_break(dd);
+	d = _mm_add_ps(d, dd);
+
+	//Constant feedback
+	c = _mm_add_ps(c, rnd_c);
+	c = _mm_add_ps(c, _mm_set1_ps(0.734375f));
+	__m128 r = _mm_add_ps(nn, dd);
+	r = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x807FFFFF)), r);
+	r = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), r);
+	c = _mm_add_ps(c, r);
+}
+
+// 14*8 + 2 = 112
+inline void round_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& c, __m128& r)
+{
+	__m128 n = _mm_setzero_ps(), d = _mm_setzero_ps();
+
+	sub_round(n0, n1, n2, n3, rnd_c, n, d, c);
+	sub_round(n1, n2, n3, n0, rnd_c, n, d, c);
+	sub_round(n2, n3, n0, n1, rnd_c, n, d, c);
+	sub_round(n3, n0, n1, n2, rnd_c, n, d, c);
+	sub_round(n3, n2, n1, n0, rnd_c, n, d, c);
+	sub_round(n2, n1, n0, n3, rnd_c, n, d, c);
+	sub_round(n1, n0, n3, n2, rnd_c, n, d, c);
+	sub_round(n0, n3, n2, n1, rnd_c, n, d, c);
+
+	// Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0
+	d = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0xFF7FFFFF)), d);
+	d = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), d);
+	r =_mm_add_ps(r, _mm_div_ps(n,d));
+}
+
+// 112×4 = 448
+template<bool add>
+inline __m128i single_comupte(__m128 n0, __m128 n1,  __m128 n2,  __m128 n3, float cnt, __m128 rnd_c, __m128& sum)
+{
+	__m128 c = _mm_set1_ps(cnt);
+	__m128 r = _mm_setzero_ps();
+
+	round_compute(n0, n1, n2, n3, rnd_c, c, r);
+	round_compute(n0, n1, n2, n3, rnd_c, c, r);
+	round_compute(n0, n1, n2, n3, rnd_c, c, r);
+	round_compute(n0, n1, n2, n3, rnd_c, c, r);
+
+	// do a quick fmod by setting exp to 2
+	r = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x807FFFFF)), r);
+	r = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), r);
+
+	if(add)
+		sum = _mm_add_ps(sum, r);
+	else
+		sum = r;
+
+	r = _mm_mul_ps(r, _mm_set1_ps(536870880.0f)); // 35
+	return _mm_cvttps_epi32(r);
+}
+
+template<size_t rot>
+inline void single_comupte_wrap(__m128 n0, __m128 n1, __m128 n2,  __m128 n3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out)
+{
+	__m128i r = single_comupte<rot % 2 != 0>(n0, n1, n2, n3, cnt, rnd_c, sum);
+	if(rot != 0)
+		r = _mm_or_si128(_mm_slli_si128(r, 16 - rot), _mm_srli_si128(r, rot));
+	out = _mm_xor_si128(out, r);
+}
+
+template<uint32_t MASK>
+inline __m128i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n) { return reinterpret_cast<__m128i*>(lpad + (idx & MASK) + n*16); }
+
+template<size_t ITER, uint32_t MASK>
+void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad)
+{
+	uint32_t s = reinterpret_cast<const uint32_t*>(spad)[0] >> 8;
+	__m128i* idx0 = scratchpad_ptr<MASK>(lpad, s, 0);
+	__m128i* idx1 = scratchpad_ptr<MASK>(lpad, s, 1);
+	__m128i* idx2 = scratchpad_ptr<MASK>(lpad, s, 2);
+	__m128i* idx3 = scratchpad_ptr<MASK>(lpad, s, 3);
+	__m128 sum0 = _mm_setzero_ps();
+	
+	for(size_t i = 0; i < ITER; i++)
+	{
+		__m128 n0, n1, n2, n3;
+		__m128i v0, v1, v2, v3;
+		__m128 suma, sumb, sum1, sum2, sum3;
+		
+		prep_dv(idx0, v0, n0);
+		prep_dv(idx1, v1, n1);
+		prep_dv(idx2, v2, n2);
+		prep_dv(idx3, v3, n3);
+		__m128 rc = sum0;
+
+		__m128i out, out2;
+		out = _mm_setzero_si128();
+		single_comupte_wrap<0>(n0, n1, n2, n3, 1.3437500f, rc, suma, out);
+		single_comupte_wrap<1>(n0, n2, n3, n1, 1.2812500f, rc, suma, out);
+		single_comupte_wrap<2>(n0, n3, n1, n2, 1.3593750f, rc, sumb, out);
+		single_comupte_wrap<3>(n0, n3, n2, n1, 1.3671875f, rc, sumb, out);
+		sum0 = _mm_add_ps(suma, sumb);
+		_mm_store_si128(idx0, _mm_xor_si128(v0, out));
+		out2 = out;
+	
+		out = _mm_setzero_si128();
+		single_comupte_wrap<0>(n1, n0, n2, n3, 1.4296875f, rc, suma, out);
+		single_comupte_wrap<1>(n1, n2, n3, n0, 1.3984375f, rc, suma, out);
+		single_comupte_wrap<2>(n1, n3, n0, n2, 1.3828125f, rc, sumb, out);
+		single_comupte_wrap<3>(n1, n3, n2, n0, 1.3046875f, rc, sumb, out);
+		sum1 = _mm_add_ps(suma, sumb);
+		_mm_store_si128(idx1, _mm_xor_si128(v1, out));
+		out2 = _mm_xor_si128(out2, out);
+
+		out = _mm_setzero_si128();
+		single_comupte_wrap<0>(n2, n1, n0, n3, 1.4140625f, rc, suma, out);
+		single_comupte_wrap<1>(n2, n0, n3, n1, 1.2734375f, rc, suma, out);
+		single_comupte_wrap<2>(n2, n3, n1, n0, 1.2578125f, rc, sumb, out);
+		single_comupte_wrap<3>(n2, n3, n0, n1, 1.2890625f, rc, sumb, out);
+		sum2 = _mm_add_ps(suma, sumb);
+		_mm_store_si128(idx2, _mm_xor_si128(v2, out));
+		out2 = _mm_xor_si128(out2, out);
+
+		out = _mm_setzero_si128();
+		single_comupte_wrap<0>(n3, n1, n2, n0, 1.3203125f, rc, suma, out);
+		single_comupte_wrap<1>(n3, n2, n0, n1, 1.3515625f, rc, suma, out);
+		single_comupte_wrap<2>(n3, n0, n1, n2, 1.3359375f, rc, sumb, out);
+		single_comupte_wrap<3>(n3, n0, n2, n1, 1.4609375f, rc, sumb, out);
+		sum3 = _mm_add_ps(suma, sumb);
+		_mm_store_si128(idx3, _mm_xor_si128(v3, out));
+		out2 = _mm_xor_si128(out2, out);
+		sum0 = _mm_add_ps(sum0, sum1);
+		sum2 = _mm_add_ps(sum2, sum3);
+		sum0 = _mm_add_ps(sum0, sum2);
+
+		sum0 = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)), sum0); // take abs(va) by masking the float sign bit
+		// vs range 0 - 64 
+		n0 = _mm_mul_ps(sum0, _mm_set1_ps(16777216.0f));
+		v0 = _mm_cvttps_epi32(n0);
+		v0 = _mm_xor_si128(v0, out2);
+		v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 2, 3));
+		v0 = _mm_xor_si128(v0, v1);
+		v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 0, 1));
+		v0 = _mm_xor_si128(v0, v1);
+
+		// vs is now between 0 and 1
+		sum0 = _mm_div_ps(sum0, _mm_set1_ps(64.0f));
+		uint32_t n = _mm_cvtsi128_si32(v0);
+		idx0 = scratchpad_ptr<MASK>(lpad, n, 0);
+		idx1 = scratchpad_ptr<MASK>(lpad, n, 1);
+		idx2 = scratchpad_ptr<MASK>(lpad, n, 2);
+		idx3 = scratchpad_ptr<MASK>(lpad, n, 3);
+	}
+}
+
+template void cn_gpu_inner_ssse3<CRYPTONIGHT_GPU_ITER, CRYPTONIGHT_GPU_MASK>(const uint8_t* spad, uint8_t* lpad);
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 2218bf88a..c75eff8ff 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -17,6 +17,7 @@
 
 #include "cryptonight.h"
 #include "xmrstak/backend/cryptonight.hpp"
+#include "cn_gpu.hpp"
 #include <memory.h>
 #include <stdio.h>
 #include <cfenv>
@@ -167,6 +168,8 @@ inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3
 template<size_t MEM, bool SOFT_AES, bool PREFETCH, xmrstak_algo ALGO>
 void cn_explode_scratchpad(const __m128i* input, __m128i* output)
 {
+	constexpr bool HEAVY_MIX = ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast;
+	
 	// This is more than we have registers, compiler will assign 2 keys on the stack
 	__m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7;
 	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
@@ -182,7 +185,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output)
 	xin6 = _mm_load_si128(input + 10);
 	xin7 = _mm_load_si128(input + 11);
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
+	if(HEAVY_MIX)
 	{
 		for(size_t i=0; i < 16; i++)
 		{
@@ -263,9 +266,45 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output)
 	}
 }
 
+template<size_t MEM, bool PREFETCH, xmrstak_algo ALGO>
+void cn_explode_scratchpad_gpu(const uint8_t* input, uint8_t* output)
+{
+	constexpr size_t hash_size = 200; // 25x8 bytes
+	alignas(128) uint64_t hash[25];
+
+	for (uint64_t i = 0; i < MEM / 512; i++)
+	{
+		memcpy(hash, input, hash_size);
+		hash[0] ^= i;
+
+		keccakf(hash, 24);
+		memcpy(output, hash, 160);
+		output+=160;
+
+		keccakf(hash, 24);
+		memcpy(output, hash, 176);
+		output+=176;
+
+		keccakf(hash, 24);
+		memcpy(output, hash, 176);
+		output+=176;
+		
+		if(PREFETCH)
+		{
+			_mm_prefetch((const char*)output - 512, _MM_HINT_T2);
+			_mm_prefetch((const char*)output - 384, _MM_HINT_T2);
+			_mm_prefetch((const char*)output - 256, _MM_HINT_T2);
+			_mm_prefetch((const char*)output - 128, _MM_HINT_T2);
+		}
+	}
+}
+
 template<size_t MEM, bool SOFT_AES, bool PREFETCH, xmrstak_algo ALGO>
 void cn_implode_scratchpad(const __m128i* input, __m128i* output)
 {
+	constexpr bool HEAVY_MIX = ALGO == cryptonight_heavy || ALGO == cryptonight_haven || 
+		ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast || ALGO == cryptonight_gpu;
+
 	// This is more than we have registers, compiler will assign 2 keys on the stack
 	__m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7;
 	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
@@ -326,11 +365,11 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
 			aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
 		}
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
+		if(HEAVY_MIX)
 			mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
 	}
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
+	if(HEAVY_MIX)
 	{
 		for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
 		{
@@ -377,7 +416,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
 				aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
 			}
 
-			if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
+			if(HEAVY_MIX)
 				mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
 		}
 
@@ -1000,3 +1039,28 @@ struct Cryptonight_hash_asm<2, 0>
 		}
 	}
 };
+
+struct Cryptonight_hash_gpu
+{
+	static constexpr size_t N = 1;
+
+	template<xmrstak_algo ALGO, bool SOFT_AES, bool PREFETCH>
+	static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
+	{
+		constexpr size_t MASK = cn_select_mask<ALGO>();
+		constexpr size_t ITERATIONS = cn_select_iter<ALGO>();
+		constexpr size_t MEM = cn_select_memory<ALGO>();
+
+		keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
+		cn_explode_scratchpad_gpu<MEM, PREFETCH, ALGO>(ctx[0]->hash_state, ctx[0]->long_state);
+
+		if(cngpu_check_avx2())
+			cn_gpu_inner_avx<ITERATIONS, MASK>(ctx[0]->hash_state, ctx[0]->long_state);
+		else
+			cn_gpu_inner_ssse3<ITERATIONS, MASK>(ctx[0]->hash_state, ctx[0]->long_state);
+
+		cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH, ALGO>((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
+		keccakf((uint64_t*)ctx[0]->hash_state, 24);
+		memcpy(output, ctx[0]->hash_state, 32);
+	}
+};
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 2327bed1d..e1af701e8 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -397,22 +397,32 @@ bool minethd::self_test()
 			hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx);
 			bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0;
 		}
-    else if(algo == cryptonight_superfast)
+		else if(algo == cryptonight_superfast)
 		{
 			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_superfast);
 			hashf("\x03\x05\xa0\xdb\xd6\xbf\x05\xcf\x16\xe5\x03\xf3\xa6\x6f\x78\x00\x7c\xbf\x34\x14\x43\x32\xec\xbf\xc2\x2e\xd9\x5c\x87\x00\x38\x3b\x30\x9a\xce\x19\x23\xa0\x96\x4b\x00\x00\x00\x08\xba\x93\x9a\x62\x72\x4c\x0d\x75\x81\xfc\xe5\x76\x1e\x9d\x8a\x0e\x6a\x1c\x3f\x92\x4f\xdd\x84\x93\xd1\x11\x56\x49\xc0\x5e\xb6\x01", 76, out, ctx);
 			bResult = bResult &&  memcmp(out, "\x40\x86\x5a\xa8\x87\x41\xec\x1d\xcc\xbd\x2b\xc6\xff\x36\xb9\x4d\x54\x71\x58\xdb\x94\x69\x8e\x3c\xa0\x3d\xe4\x81\x9a\x65\x9f\xef", 32) == 0;
 		}
-    else if (algo == cryptonight_turtle)
+		else if(algo == cryptonight_gpu)
 		{
-		hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_turtle);
-		hashf("This is a test This is a test This is a test", 44, out, ctx);
-		bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0;
-		hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_turtle);
-		hashf("This is a test This is a test This is a test", 44, out, ctx);
-		bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0;
-		}
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_gpu);
+			hashf("", 0, out, ctx);
+			bResult = bResult &&  memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0;
 
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_gpu);
+			hashf("", 0, out, ctx);
+			bResult = bResult &&  memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0;
+		}
+		else if (algo == cryptonight_turtle)
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_turtle);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0;
+		
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_turtle);
+			hashf("This is a test This is a test This is a test", 44, out, ctx);
+			bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0;
+		}
 
 		if(!bResult)
 			printer::inst()->print_msg(L0,
@@ -541,9 +551,12 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 	case cryptonight_superfast:
 		algv = 11;
 		break;
-	case cryptonight_turtle:
+	case cryptonight_gpu:
 		algv = 12;
 		break;
+	case cryptonight_turtle:
+		algv = 13;
+		break;
 	default:
 		algv = 2;
 		break;
@@ -609,6 +622,11 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 		Cryptonight_hash<N>::template hash<cryptonight_superfast, true, false>,
 		Cryptonight_hash<N>::template hash<cryptonight_superfast, false, true>,
 		Cryptonight_hash<N>::template hash<cryptonight_superfast, true, true>,
+		
+		Cryptonight_hash_gpu::template hash<cryptonight_gpu, false, false>,
+		Cryptonight_hash_gpu::template hash<cryptonight_gpu, true, false>,
+		Cryptonight_hash_gpu::template hash<cryptonight_gpu, false, true>,
+		Cryptonight_hash_gpu::template hash<cryptonight_gpu, true, true>,
 
 		Cryptonight_hash<N>::template hash<cryptonight_turtle, false, false>,
 		Cryptonight_hash<N>::template hash<cryptonight_turtle, true, false>,
diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp
index 2dd922f91..ae862abae 100644
--- a/xmrstak/backend/cryptonight.hpp
+++ b/xmrstak/backend/cryptonight.hpp
@@ -18,7 +18,8 @@ enum xmrstak_algo
 	cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks
 	cryptonight_monero_v8 = 11,
 	cryptonight_superfast = 12,
-	cryptonight_turtle = 13
+	cryptonight_gpu = 13,
+	cryptonight_turtle = 14
 };
 
 // define aeon settings
@@ -34,6 +35,9 @@ constexpr size_t CRYPTONIGHT_HEAVY_MEMORY = 4 * 1024 * 1024;
 constexpr uint32_t CRYPTONIGHT_HEAVY_MASK = 0x3FFFF0;
 constexpr uint32_t CRYPTONIGHT_HEAVY_ITER = 0x40000;
 
+constexpr uint32_t CRYPTONIGHT_GPU_MASK = 0x1FFFC0;
+constexpr uint32_t CRYPTONIGHT_GPU_ITER = 0xC000;
+
 constexpr uint32_t CRYPTONIGHT_MASARI_ITER = 0x40000;
 
 constexpr uint32_t CRYPTONIGHT_SUPERFAST_ITER = 0x20000; 
@@ -81,6 +85,9 @@ inline constexpr size_t cn_select_memory<cryptonight_bittube2>() { return CRYPTO
 template<>
 inline constexpr size_t cn_select_memory<cryptonight_superfast>() { return CRYPTONIGHT_MEMORY; } 
 
+template<>
+inline constexpr size_t cn_select_memory<cryptonight_gpu>() { return CRYPTONIGHT_MEMORY; } 
+
 template<>
 inline constexpr size_t cn_select_memory<cryptonight_turtle>() { return CRYPTONIGHT_TURTLE_MEMORY; }
 
@@ -94,6 +101,7 @@ inline size_t cn_select_memory(xmrstak_algo algo)
 	case cryptonight_masari:
 	case cryptonight:
 	case cryptonight_superfast: 
+	case cryptonight_gpu:
 		return CRYPTONIGHT_MEMORY;
 	case cryptonight_ipbc:
 	case cryptonight_aeon:
@@ -149,6 +157,9 @@ inline constexpr uint32_t cn_select_mask<cryptonight_bittube2>() { return CRYPTO
 template<>
 inline constexpr uint32_t cn_select_mask<cryptonight_superfast>() { return CRYPTONIGHT_MASK; } 
 
+template<>
+inline constexpr uint32_t cn_select_mask<cryptonight_gpu>() { return CRYPTONIGHT_GPU_MASK; } 
+
 template<>
 inline constexpr uint32_t cn_select_mask<cryptonight_turtle>() { return CRYPTONIGHT_TURTLE_MASK; }
 
@@ -171,6 +182,8 @@ inline size_t cn_select_mask(xmrstak_algo algo)
 	case cryptonight_haven:
 	case cryptonight_heavy:
 		return CRYPTONIGHT_HEAVY_MASK;
+	case cryptonight_gpu:
+		return CRYPTONIGHT_GPU_MASK;
 	case cryptonight_turtle:
 		return CRYPTONIGHT_TURTLE_MASK;
 	default:
@@ -217,6 +230,9 @@ inline constexpr uint32_t cn_select_iter<cryptonight_bittube2>() { return CRYPTO
 template<>
 inline constexpr uint32_t cn_select_iter<cryptonight_superfast>() { return CRYPTONIGHT_SUPERFAST_ITER; } 
 
+template<>
+inline constexpr uint32_t cn_select_iter<cryptonight_gpu>() { return CRYPTONIGHT_GPU_ITER; } 
+
 template<>
 inline constexpr uint32_t cn_select_iter<cryptonight_turtle>() { return CRYPTONIGHT_TURTLE_ITER; }
 
@@ -241,6 +257,8 @@ inline size_t cn_select_iter(xmrstak_algo algo)
 		return CRYPTONIGHT_MASARI_ITER;
 	case cryptonight_superfast:
 		return CRYPTONIGHT_SUPERFAST_ITER;
+	case cryptonight_gpu:
+		return CRYPTONIGHT_GPU_ITER;
 	case cryptonight_turtle:
 		return CRYPTONIGHT_TURTLE_ITER;
 	default:
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index c3a97808c..2acf1a387 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -10,6 +10,7 @@
 #include "xmrstak/jconf.hpp"
 #include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp"
 #include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp"
+#include "xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp"
 
 
 #ifdef _WIN32
@@ -724,7 +725,8 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
 
 		cn_aes_pseudo_round_mut( sharedMemory, text, key );
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
+		if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven ||
+			ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 		{
 			#pragma unroll
 			for ( int j = 0; j < 4; ++j )
@@ -843,6 +845,73 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 	}
 }
 
+template<size_t ITERATIONS, uint32_t MASK, uint32_t MEMORY, xmrstak_algo ALGO, uint32_t MEM_MODE>
+void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce)
+{
+	dim3 grid( ctx->device_blocks );
+	dim3 block( ctx->device_threads );
+	dim3 block2( ctx->device_threads << 1 );
+	dim3 block4( ctx->device_threads << 2 );
+	dim3 block8( ctx->device_threads << 3 );
+
+	size_t intensity = ctx->device_blocks * ctx->device_threads;
+
+	CUDA_CHECK_KERNEL(
+		ctx->device_id,
+		xmrstak::nvidia::cn_explode_gpu<MEMORY><<<intensity,32>>>((int*)ctx->d_ctx_state, (int*)ctx->d_long_state)
+	);
+
+	int partcount = 1 << ctx->device_bfactor;
+	for(int i = 0; i < partcount; i++)
+	{
+		CUDA_CHECK_KERNEL(
+			ctx->device_id,
+			// 36 x 16byte x numThreads
+			xmrstak::nvidia::cryptonight_core_gpu_phase2_gpu<ITERATIONS, MEMORY>
+				<<<ctx->device_blocks, ctx->device_threads * 16,  36 * 16 * ctx->device_threads>>>
+				(
+					(int*)ctx->d_ctx_state,
+					(int*)ctx->d_long_state,
+					ctx->device_bfactor,
+					i,
+					ctx->d_ctx_a,
+					ctx->d_ctx_b
+				)
+		);
+	}
+
+	/* bfactor for phase 3
+	 *
+	 * 3 consume less time than phase 2, therefore we begin with the
+	 * kernel splitting if the user defined a `bfactor >= 5`
+	 */
+	int bfactorOneThree = ctx->device_bfactor - 4;
+	if( bfactorOneThree < 0 )
+		bfactorOneThree = 0;
+
+	int partcountOneThree = 1 << bfactorOneThree;
+	int roundsPhase3 = partcountOneThree;
+
+	if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven ||
+		ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast )
+	{
+		// cryptonight_heavy used two full rounds over the scratchpad memory
+		roundsPhase3 *= 2;
+	}
+
+	for ( int i = 0; i < roundsPhase3; i++ )
+	{
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ITERATIONS,MEMORY/4, ALGO><<<
+			grid,
+			block8,
+			block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 )
+		>>>( ctx->device_blocks*ctx->device_threads,
+			bfactorOneThree, i,
+			ctx->d_long_state,
+			ctx->d_ctx_state, ctx->d_ctx_key2 ));
+	}
+}
+
 void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce)
 {
 	typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce);
@@ -882,10 +951,13 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t
 
 		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8, 0>,
 		cryptonight_core_gpu_hash<CRYPTONIGHT_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_monero_v8, 1>,
-    
+
 		cryptonight_core_gpu_hash<CRYPTONIGHT_SUPERFAST_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_superfast, 0>,
 		cryptonight_core_gpu_hash<CRYPTONIGHT_SUPERFAST_ITER, CRYPTONIGHT_MASK, CRYPTONIGHT_MEMORY/4, cryptonight_superfast, 1>,
 
+		cryptonight_core_gpu_hash_gpu<CRYPTONIGHT_GPU_ITER, CRYPTONIGHT_GPU_MASK, CRYPTONIGHT_MEMORY, cryptonight_gpu, 0>,
+		cryptonight_core_gpu_hash_gpu<CRYPTONIGHT_GPU_ITER, CRYPTONIGHT_GPU_MASK, CRYPTONIGHT_MEMORY, cryptonight_gpu, 1>,
+
 		cryptonight_core_gpu_hash<CRYPTONIGHT_TURTLE_ITER, CRYPTONIGHT_TURTLE_MASK, CRYPTONIGHT_TURTLE_MEMORY/4, cryptonight_turtle, 0>,
 		cryptonight_core_gpu_hash<CRYPTONIGHT_TURTLE_ITER, CRYPTONIGHT_TURTLE_MASK, CRYPTONIGHT_TURTLE_MEMORY/4, cryptonight_turtle, 1>
 	};
@@ -895,4 +967,5 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t
 
 	cuda_hash_fn selected_function = func_table[ ((miner_algo - 1u) << 1) | digit.to_ulong() ];
 	selected_function(ctx, startNonce);
+
 }
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
new file mode 100644
index 000000000..a0fe53418
--- /dev/null
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
@@ -0,0 +1,552 @@
+#pragma once
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <cstdint>
+
+#include "cuda_keccak.hpp"
+#include "cuda_extra.hpp"
+
+namespace xmrstak
+{
+namespace nvidia
+{
+
+struct __m128i : public int4
+{
+
+	__forceinline__ __device__ __m128i(){}
+
+	__forceinline__ __device__ __m128i(
+		const uint32_t x0, const uint32_t x1,
+		const uint32_t x2, const uint32_t x3)
+	{
+		x = x0;
+		y = x1;
+		z = x2;
+		w = x3;
+	}
+
+	__forceinline__ __device__ __m128i( const int x0)
+	{
+		x = x0;
+		y = x0;
+		z = x0;
+		w = x0;
+	}
+
+	__forceinline__ __device__ __m128i operator|(const __m128i& other)
+	{
+		return __m128i(
+			x | other.x,
+			y | other.y,
+			z | other.z,
+			w | other.w
+		);
+	}
+
+	__forceinline__ __device__ __m128i operator^(const __m128i& other)
+	{
+		return __m128i(
+			x ^ other.x,
+			y ^ other.y,
+			z ^ other.z,
+			w ^ other.w
+		);
+	}
+};
+
+struct __m128 : public float4
+{
+
+	__forceinline__ __device__ __m128(){}
+
+	__forceinline__ __device__ __m128(
+		const float x0, const float x1,
+		const float x2, const float x3)
+	{
+		float4::x = x0;
+		float4::y = x1;
+		float4::z = x2;
+		float4::w = x3;
+	}
+
+	__forceinline__ __device__ __m128( const float x0)
+	{
+		float4::x = x0;
+		float4::y = x0;
+		float4::z = x0;
+		float4::w = x0;
+	}
+
+	__forceinline__ __device__ __m128( const __m128i& x0)
+	{
+		float4::x = int2float(x0.x);
+		float4::y = int2float(x0.y);
+		float4::z = int2float(x0.z);
+		float4::w = int2float(x0.w);
+	}
+
+	__forceinline__ __device__ __m128i get_int( )
+	{
+		return __m128i(
+			(int)x,
+			(int)y,
+			(int)z,
+			(int)w
+		);
+	}
+
+	__forceinline__ __device__ __m128 operator+(const __m128& other)
+	{
+		return __m128(
+			x + other.x,
+			y + other.y,
+			z + other.z,
+			w + other.w
+		);
+	}
+
+	__forceinline__ __device__ __m128 operator-(const __m128& other)
+	{
+		return __m128(
+			x - other.x,
+			y - other.y,
+			z - other.z,
+			w - other.w
+		);
+	}
+
+	__forceinline__ __device__ __m128 operator*(const __m128& other)
+	{
+		return __m128(
+			x * other.x,
+			y * other.y,
+			z * other.z,
+			w * other.w
+		);
+	}
+
+	__forceinline__ __device__ __m128 operator/(const __m128& other)
+	{
+		return __m128(
+			x / other.x,
+			y / other.y,
+			z / other.z,
+			w / other.w
+		);
+	}
+
+	__forceinline__ __device__ __m128& trunc()
+	{
+		x=::truncf(x);
+		y=::truncf(y);
+		z=::truncf(z);
+		w=::truncf(w);
+
+		return *this;
+	}
+
+	__forceinline__ __device__ __m128& abs()
+	{
+		x=::fabsf(x);
+		y=::fabsf(y);
+		z=::fabsf(z);
+		w=::fabsf(w);
+
+		return *this;
+	}
+
+	__forceinline__ __device__ __m128& floor()
+	{
+		x=::floorf(x);
+		y=::floorf(y);
+		z=::floorf(z);
+		w=::floorf(w);
+
+		return *this;
+	}
+};
+
+
+template<typename T>
+__device__ void print(const char* name, T value)
+{
+	printf("g %s: ", name);
+	for(int i = 0; i < 4; ++i)
+	{
+		printf("%08X ",((uint32_t*)&value)[i]);
+	}
+	printf("\n");
+}
+
+template<>
+__device__ void print<__m128>(const char* name, __m128 value)
+{
+	printf("g %s: ", name);
+	for(int i = 0; i < 4; ++i)
+	{
+		printf("%f ",((float*)&value)[i]);
+	}
+	printf("\n");
+}
+
+#define SHOW(name) print(#name, name)
+
+
+__forceinline__ __device__ __m128 _mm_add_ps(__m128 a, __m128 b)
+{
+	return a + b;
+}
+
+__forceinline__ __device__ __m128 _mm_sub_ps(__m128 a, __m128 b)
+{
+	return a - b;
+}
+
+__forceinline__ __device__ __m128 _mm_mul_ps(__m128 a, __m128 b)
+{
+	return a * b;
+}
+
+__forceinline__ __device__ __m128 _mm_div_ps(__m128 a, __m128 b)
+{
+	return a / b;
+}
+
+__forceinline__ __device__ __m128 _mm_and_ps(__m128 a, int b)
+{
+	return __m128(
+		int_as_float(float_as_int(a.x) & b),
+		int_as_float(float_as_int(a.y) & b),
+		int_as_float(float_as_int(a.z) & b),
+		int_as_float(float_as_int(a.w) & b)
+	);
+}
+
+__forceinline__ __device__ __m128 _mm_or_ps(__m128 a, int b)
+{
+	return __m128(
+		int_as_float(float_as_int(a.x) | b),
+		int_as_float(float_as_int(a.y) | b),
+		int_as_float(float_as_int(a.z) | b),
+		int_as_float(float_as_int(a.w) | b)
+	);
+}
+
+__forceinline__ __device__ __m128 _mm_xor_ps(__m128 a, int b)
+{
+	return __m128(
+		int_as_float(float_as_int(a.x) ^ b),
+		int_as_float(float_as_int(a.y) ^ b),
+		int_as_float(float_as_int(a.z) ^ b),
+		int_as_float(float_as_int(a.w) ^ b)
+	);
+}
+
+__forceinline__ __device__ __m128 _mm_fmod_ps(__m128 v, float dc)
+{
+	__m128 d(dc);
+	__m128 c = _mm_div_ps(v, d);
+	c.trunc();//_mm_round_ps(c, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
+	// c = _mm_cvtepi32_ps(_mm_cvttps_epi32(c)); - sse2
+	c = _mm_mul_ps(c, d);
+	return _mm_sub_ps(v, c);
+
+
+	//return a.fmodf(b);
+}
+
+__forceinline__ __device__ __m128i _mm_xor_si128(__m128i a, __m128i b)
+{
+	return a ^ b;
+}
+
+
+__forceinline__ __device__ __m128i _mm_alignr_epi8(__m128i a, const uint32_t rot)
+{
+	const uint32_t right = 8 * rot;
+	const uint32_t left = (32 - 8 * rot);
+	return __m128i(
+		((uint32_t)a.x >> right) | ( a.y << left ),
+		((uint32_t)a.y >> right) | ( a.z << left ),
+		((uint32_t)a.z >> right) | ( a.w << left ),
+		((uint32_t)a.w >> right) | ( a.x << left )
+	);
+}
+
+template<uint32_t MASK>
+__device__ __m128i* scratchpad_ptr(uint32_t idx, uint32_t n, int *lpad) { return (__m128i*)((uint8_t*)lpad + (idx & MASK) + n * 16); }
+
+
+__forceinline__ __device__  __m128 fma_break(__m128 x)
+{
+	// Break the dependency chain by setitng the exp to ?????01
+	x = _mm_and_ps(x, 0xFEFFFFFF);
+	return _mm_or_ps(x, 0x00800000);
+}
+
+// 9
+__forceinline__ __device__ void sub_round(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& n, __m128& d, __m128& c)
+{
+	n1 = _mm_add_ps(n1, c);
+	__m128 nn = _mm_mul_ps(n0, c);
+	nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn));
+	nn = fma_break(nn);
+	n = _mm_add_ps(n, nn);
+
+	n3 = _mm_sub_ps(n3, c);
+	__m128 dd = _mm_mul_ps(n2, c);
+	dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd));
+	dd = fma_break(dd);
+	d = _mm_add_ps(d, dd);
+
+	//Constant feedback
+	c = _mm_add_ps(c, rnd_c);
+	c = _mm_add_ps(c, 0.734375f);
+	__m128 r = _mm_add_ps(nn, dd);
+	r = _mm_and_ps(r, 0x807FFFFF);
+	r = _mm_or_ps(r, 0x40000000);
+	c = _mm_add_ps(c, r);
+}
+
+// 9*8 + 2 = 74
+__forceinline__ __device__ void round_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& c, __m128& r)
+{
+	__m128 n(0.0f), d(0.0f);
+
+	sub_round(n0, n1, n2, n3, rnd_c, n, d, c);
+	sub_round(n1, n2, n3, n0, rnd_c, n, d, c);
+	sub_round(n2, n3, n0, n1, rnd_c, n, d, c);
+	sub_round(n3, n0, n1, n2, rnd_c, n, d, c);
+	sub_round(n3, n2, n1, n0, rnd_c, n, d, c);
+	sub_round(n2, n1, n0, n3, rnd_c, n, d, c);
+	sub_round(n1, n0, n3, n2, rnd_c, n, d, c);
+	sub_round(n0, n3, n2, n1, rnd_c, n, d, c);
+
+	// Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0
+	d = _mm_and_ps(d, 0xFF7FFFFF);
+	d = _mm_or_ps(d, 0x40000000);
+	r =_mm_add_ps(r, _mm_div_ps(n,d));
+}
+
+// 74*8 = 595
+__forceinline__ __device__ __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum)
+{
+	__m128 c(cnt);
+	// 35 maths calls follow (140 FLOPS)
+	__m128 r = __m128(0.0f);
+	for(int i=0; i< 4; ++i)
+		round_compute(n0, n1, n2, n3, rnd_c, c, r);
+	// do a quick fmod by setting exp to 2
+	r = _mm_and_ps(r, 0x807FFFFF);
+	r = _mm_or_ps(r, 0x40000000);
+	sum = r; // 34
+	r = _mm_mul_ps(r, __m128(536870880.0f)); // 35
+	return r.get_int();
+
+}
+
+__forceinline__ __device__ void single_comupte_wrap(const uint32_t rot, __m128i v0, __m128i v1, __m128i v2, __m128i v3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out)
+{
+	__m128 n0(v0);
+	__m128 n1(v1);
+	__m128 n2(v2);
+	__m128 n3(v3);
+
+	__m128i r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum);
+	out = rot == 0 ? r : _mm_alignr_epi8(r, rot);
+}
+
+__constant__ uint32_t look[16][4] = {
+	{0, 1, 2, 3},
+	{0, 2, 3, 1},
+	{0, 3, 1, 2},
+	{0, 3, 2, 1},
+
+	{1, 0, 2, 3},
+	{1, 2, 3, 0},
+	{1, 3, 0, 2},
+	{1, 3, 2, 0},
+
+	{2, 1, 0, 3},
+	{2, 0, 3, 1},
+	{2, 3, 1, 0},
+	{2, 3, 0, 1},
+
+	{3, 1, 2, 0},
+	{3, 2, 0, 1},
+	{3, 0, 1, 2},
+	{3, 0, 2, 1}
+};
+
+__constant__ float ccnt[16] = {
+	1.34375f,
+	1.28125f,
+	1.359375f,
+	1.3671875f,
+
+	1.4296875f,
+	1.3984375f,
+	1.3828125f,
+	1.3046875f,
+
+	1.4140625f,
+	1.2734375f,
+	1.2578125f,
+	1.2890625f,
+
+	1.3203125f,
+	1.3515625f,
+	1.3359375f,
+	1.4609375f
+};
+
+template<size_t ITERATIONS, uint32_t MEMORY>
+__global__ void cryptonight_core_gpu_phase2_gpu(int32_t *spad, int *lpad_in, int bfactor, int partidx, uint32_t * roundVs, uint32_t * roundS)
+{
+	static constexpr uint32_t MASK = ((MEMORY-1) >> 6) << 6;
+
+	const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor );
+
+	extern __shared__ __m128i smemExtern_in[];
+
+	const uint32_t chunk = threadIdx.x / 16;
+	const uint32_t numHashPerBlock = blockDim.x / 16;
+
+	int* lpad = (int*)((uint8_t*)lpad_in + size_t(MEMORY) * (blockIdx.x * numHashPerBlock + chunk));
+
+	__m128i* smem = smemExtern_in + 4 * chunk;
+
+	__m128i* smemExtern = smemExtern_in + numHashPerBlock * 4;
+	__m128i* smemOut = smemExtern + 16 * chunk;
+
+	smemExtern = smemExtern + numHashPerBlock * 16;
+	__m128* smemVa = (__m128*)smemExtern + 16 * chunk;
+
+	uint32_t tid = threadIdx.x % 16;
+
+	const uint32_t idxHash = blockIdx.x * numHashPerBlock + threadIdx.x/16;
+	uint32_t s = 0;
+
+	__m128 vs(0);
+	if(partidx != 0)
+	{
+		vs = ((__m128*)roundVs)[idxHash];
+		s = roundS[idxHash];
+	}
+	else
+	{
+		s = ((uint32_t*)spad)[idxHash * 50] >> 8;
+	}
+
+	for(size_t i = 0; i < batchsize; i++)
+	{
+		__syncthreads();
+
+		((int*)smem)[tid] = ((int*)scratchpad_ptr<MASK>(s, tid/4, lpad))[tid%4];
+		__syncthreads();
+
+		__m128 rc = vs;
+
+
+		single_comupte_wrap(
+			tid%4,
+			*(smem + look[tid][0]),
+			*(smem + look[tid][1]),
+			*(smem + look[tid][2]),
+			*(smem + look[tid][3]),
+			ccnt[tid], rc, smemVa[tid],
+			smemOut[tid]
+		);
+
+		__syncthreads();
+
+		if(tid % 4 == 0)
+		{
+			__m128i out = _mm_xor_si128(smemOut[tid], smemOut[tid + 1]);
+			__m128i out2 = _mm_xor_si128(smemOut[tid + 2], smemOut[tid + 3]);
+			out = _mm_xor_si128(out, out2);
+			smemOut[tid] = out;
+			*scratchpad_ptr<MASK>(s , tid/4, lpad) = _mm_xor_si128(smem[tid/4], out);
+		}
+		__syncthreads();
+
+
+		__m128i out2 = smemOut[0] ^ smemOut[4] ^ smemOut[8] ^ smemOut[12];
+
+		if(tid%2 == 0)
+			smemVa[tid] = smemVa[tid] + smemVa[tid + 1];
+
+		if(tid%4 == 0)
+			smemVa[tid] = smemVa[tid] + smemVa[tid + 2];
+		if(tid%8 == 0)
+			smemVa[tid] = smemVa[tid] + smemVa[tid + 4];
+		if(tid%16 == 0)
+			smemVa[tid] = smemVa[tid] + smemVa[tid + 8];
+		vs = smemVa[0];
+
+		vs.abs(); // take abs(va) by masking the float sign bit
+		auto xx = _mm_mul_ps(vs, __m128(16777216.0f));
+		// vs range 0 - 64
+		*smem = xx.get_int();
+		*smem = _mm_xor_si128(*smem, out2);
+		// vs is now between 0 and 1
+		vs = _mm_div_ps(vs, __m128(64.0f));
+		s = smem->x ^ smem->y ^ smem->z ^ smem->w;
+	}
+	if(partidx != ((1<<bfactor) - 1) && threadIdx.x % 16 == 0)
+	{
+		const uint32_t numHashPerBlock2 = blockDim.x / 16;
+		const uint32_t idxHash2 = blockIdx.x * numHashPerBlock2 + threadIdx.x/16;
+		((__m128*)roundVs)[idxHash2] = vs;
+		roundS[idxHash2] = s;
+	}
+}
+
+__forceinline__ __device__ void generate_512(uint64_t idx, const uint64_t* in, uint8_t* out)
+{
+	uint64_t hash[25];
+
+	hash[0] = in[0] ^ idx;
+	#pragma unroll 24
+	for(int i = 1; i < 25; ++i)
+		hash[i] = in[i];
+
+	cn_keccakf2(hash);
+	#pragma unroll 10
+	for(int i = 0; i < 10; ++i)
+		((ulonglong2*)out)[i] = ((ulonglong2*)hash)[i];
+	out+=160;
+
+	cn_keccakf2(hash);
+	#pragma unroll 11
+	for(int i = 0; i < 11; ++i)
+		((ulonglong2*)out)[i] = ((ulonglong2*)hash)[i];
+	out+=176;
+
+	cn_keccakf2(hash);
+	#pragma unroll 11
+	for(int i = 0; i < 11; ++i)
+		((ulonglong2*)out)[i] = ((ulonglong2*)hash)[i];
+}
+
+template<size_t MEMORY>
+__global__ void cn_explode_gpu(int32_t *spad_in, int *lpad_in)
+{
+	__shared__ uint64_t state[25];
+
+	uint8_t* lpad = (uint8_t*)lpad_in + blockIdx.x * MEMORY;
+	uint64_t* spad = (uint64_t*)((uint8_t*)spad_in + blockIdx.x * 200);
+
+	constexpr size_t hash_size = 200; // 25x8 bytes
+	memcpy(state, spad, hash_size);
+
+	for(uint64_t i = threadIdx.x; i < MEMORY / 512; i+=blockDim.x)
+	{
+		generate_512(i, state, (uint8_t*)lpad + i*512);
+	}
+}
+
+} // namespace xmrstak
+} // namespace nvidia
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index 7149f37a6..e4574e20a 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -9,21 +9,6 @@
 #include  <algorithm>
 #include "xmrstak/jconf.hpp"
 
-#ifdef __CUDACC__
-__constant__
-#else
-const
-#endif
-uint64_t keccakf_rndc[24] ={
-	0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
-	0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
-	0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
-	0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
-	0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
-	0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
-	0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
-	0x8000000000008080, 0x0000000080000001, 0x8000000080008008
-};
 
 typedef unsigned char BitSequence;
 typedef unsigned long long DataLength;
@@ -184,7 +169,8 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 
 	__shared__ uint32_t sharedMemory[1024];
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
+	if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven ||
+		ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		cn_aes_gpu_init( sharedMemory );
 		__syncthreads( );
@@ -201,7 +187,8 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 	for ( i = 0; i < 50; i++ )
 		state[i] = ctx_state[i];
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
+	if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven ||
+		ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		uint32_t key[40];
 
@@ -220,33 +207,46 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 	}
 	cn_keccakf2( (uint64_t *) state );
 
-	switch ( ( (uint8_t *) state )[0] & 0x03 )
+	if(ALGO == cryptonight_gpu)
 	{
-	case 0:
-		cn_blake( (const uint8_t *) state, 200, (uint8_t *) hash );
-		break;
-	case 1:
-		cn_groestl( (const BitSequence *) state, 200, (BitSequence *) hash );
-		break;
-	case 2:
-		cn_jh( (const BitSequence *) state, 200, (BitSequence *) hash );
-		break;
-	case 3:
-		cn_skein( (const BitSequence *) state, 200, (BitSequence *) hash );
-		break;
-	default:
-		break;
+		if ( ((uint64_t*)state)[3] < target )
+		{
+			uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF );
+
+			if(idx < 10)
+				d_res_nonce[idx] = thread;
+		}
 	}
+	else
+	{
+		switch ( ( (uint8_t *) state )[0] & 0x03 )
+		{
+		case 0:
+			cn_blake( (const uint8_t *) state, 200, (uint8_t *) hash );
+			break;
+		case 1:
+			cn_groestl( (const BitSequence *) state, 200, (BitSequence *) hash );
+			break;
+		case 2:
+			cn_jh( (const BitSequence *) state, 200, (BitSequence *) hash );
+			break;
+		case 3:
+			cn_skein( (const BitSequence *) state, 200, (BitSequence *) hash );
+			break;
+		default:
+			break;
+		}
 
-	// Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values
-	// and expect an accurate result for target > 32-bit without implementing carries
+		// Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values
+		// and expect an accurate result for target > 32-bit without implementing carries
 
-	if ( hash[3] < target )
-	{
-		uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF );
+		if ( hash[3] < target )
+		{
+			uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF );
 
-		if(idx < 10)
-			d_res_nonce[idx] = thread;
+			if(idx < 10)
+				d_res_nonce[idx] = thread;
+		}
 	}
 }
 
@@ -373,6 +373,11 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_turtle> << <grid, block >> > (wsize, ctx->d_input, ctx->inputlen, startNonce,
 			ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2));
 	}
+	else if(miner_algo == cryptonight_gpu)
+	{
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_gpu><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
+			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+	}
 	else
 	{
 		/* pass two times d_ctx_state because the second state is used later in phase1,
@@ -426,6 +431,15 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce,
 			cryptonight_extra_gpu_final<cryptonight_bittube2><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
 		);
 	}
+	else if(miner_algo == cryptonight_gpu)
+	{
+		// fallback for all other algorithms
+		CUDA_CHECK_MSG_KERNEL(
+			ctx->device_id,
+			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
+			cryptonight_extra_gpu_final<cryptonight_gpu><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
+		);
+	}
 	else
 	{
 		// fallback for all other algorithms
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp
index 99c651645..c75c74964 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp
@@ -1,3 +1,23 @@
+#pragma once
+
+#include "cuda_extra.hpp"
+
+#ifdef __CUDACC__
+__constant__
+#else
+const
+#endif
+uint64_t keccakf_rndc[24] ={
+	0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
+	0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
+	0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
+	0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
+	0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
+	0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
+	0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
+	0x8000000000008080, 0x0000000080000001, 0x8000000080008008
+};
+
 #if __CUDA_ARCH__ >= 350
 	__forceinline__ __device__ uint64_t cuda_rotl64(const uint64_t value, const int offset)
 	{
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index 165595c5f..80e6002d7 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -103,6 +103,7 @@ xmrstak::coin_selection coins[] = {
 	{ "cryptonight_v7",      {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "cryptonight_v8",      {cryptonight_monero_v8, cryptonight_monero_v8, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "cryptonight_v7_stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
+	{ "cryptonight_gpu",     {cryptonight_gpu, cryptonight_gpu, 255u},            {cryptonight_gpu, cryptonight_gpu, 0u}, nullptr },
 	{ "freehaven",           {cryptonight_heavy, cryptonight_superfast, 255u},    {cryptonight_heavy, cryptonight_superfast, 0u},   nullptr },
 	{ "graft",               {cryptonight_monero_v8, cryptonight_monero_v8, 0u},    {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "haven",               {cryptonight_heavy, cryptonight_haven, 255u},        {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp
index 2f4e2a11f..c475c4129 100644
--- a/xmrstak/misc/executor.cpp
+++ b/xmrstak/misc/executor.cpp
@@ -560,6 +560,12 @@ void executor::ex_main()
 		else
 			pools.emplace_front(0, "donate.xmr-stak.net:5555", "", "", "", 0.0, true, false, "", true);
 		break;
+	case cryptonight_gpu:
+		if(dev_tls)
+			pools.emplace_front(0, "donate.xmr-stak.net:8811", "", "", "", 0.0, true, true, "", false);
+		else
+			pools.emplace_front(0, "donate.xmr-stak.net:5511", "", "", "", 0.0, true, false, "", false);
+		break;
 	case cryptonight_monero_v8:
 	case cryptonight_monero:
 	case cryptonight_turtle:

From 17f3aef08bd21e820d0c58b288dbc7f46b207f79 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Wed, 30 Jan 2019 21:23:25 +0100
Subject: [PATCH 09/19] fix compile

- fix broken trutle coin
- fix non cn_gpu algorithms
---
 xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl      | 9 ++++++---
 xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index ea07d6a8b..53394037b 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -347,8 +347,11 @@ XMRSTAK_INCLUDE_JH
 XMRSTAK_INCLUDE_BLAKE256
 //#include "opencl/groestl256.cl"
 XMRSTAK_INCLUDE_GROESTL256
-//#include "opencl/cryptonight_gpu.cl"
-XMRSTAK_INCLUDE_CN_GPU
+
+#if (ALGO == 13)
+	//#include "opencl/cryptonight_gpu.cl"
+	XMRSTAK_INCLUDE_CN_GPU
+#endif
 
 )==="
 R"===(
@@ -789,7 +792,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
         ((uint4 *)a)[0] ^= tmp;
 
 // cryptonight_monero_v8
-#if (ALGO == 11 || ALGO==13)
+#if (ALGO == 11 || ALGO==14)
 #	if defined(__NV_CL_C_VERSION)
 			// flush shuffled data
 			SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
index 4205a67c3..93e304aee 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
@@ -4,7 +4,7 @@ R"===(
  */
 
 // cryptonight_monero_v8
-#if(ALGO==11 || ALGO==13)
+#if(ALGO==11 || ALGO==14)
 
 static const __constant uint RCP_C[256] =
 {

From adeeab6fe7dc4f5fbb89da0790dc2dee8fae3aff Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Wed, 30 Jan 2019 21:51:55 +0100
Subject: [PATCH 10/19] fix cuda 10

- fix race condition during shared memory access
- optimize memory access
---
 .../nvidia/nvcc_code/cuda_cryptonight_gpu.hpp | 38 +++++++++----------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
index a0fe53418..d3df0fed2 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
@@ -462,30 +462,30 @@ __global__ void cryptonight_core_gpu_phase2_gpu(int32_t *spad, int *lpad_in, int
 
 		__syncthreads();
 
-		if(tid % 4 == 0)
-		{
-			__m128i out = _mm_xor_si128(smemOut[tid], smemOut[tid + 1]);
-			__m128i out2 = _mm_xor_si128(smemOut[tid + 2], smemOut[tid + 3]);
-			out = _mm_xor_si128(out, out2);
-			smemOut[tid] = out;
-			*scratchpad_ptr<MASK>(s , tid/4, lpad) = _mm_xor_si128(smem[tid/4], out);
-		}
-		__syncthreads();
+		const uint32_t b = tid / 4;
+		const uint32_t bb = tid % 4;
 
+		int outXor = ((int*)smemOut)[b * 16 + bb];
+		for(uint32_t dd = b * 16 + 4 + bb; dd < (b + 1) * 16; dd += 4)
+			outXor ^= ((int*)smemOut)[dd];
 
-		__m128i out2 = smemOut[0] ^ smemOut[4] ^ smemOut[8] ^ smemOut[12];
+		((int*)scratchpad_ptr<MASK>(s, tid/4, lpad))[tid%4] = outXor ^ ((int*)smem)[tid];
+		((int*)smemOut)[tid] = outXor;
 
-		if(tid%2 == 0)
-			smemVa[tid] = smemVa[tid] + smemVa[tid + 1];
+		float va_tmp1 = ((float*)smemVa)[b * 16 + bb] + ((float*)smemVa)[b * 16 + bb + 4];
+		float va_tmp2 = ((float*)smemVa)[b * 16 + bb + 8] + ((float*)smemVa)[b * 16 + bb + 12];
+		((float*)smemVa)[tid] = va_tmp1 + va_tmp2;
 
-		if(tid%4 == 0)
-			smemVa[tid] = smemVa[tid] + smemVa[tid + 2];
-		if(tid%8 == 0)
-			smemVa[tid] = smemVa[tid] + smemVa[tid + 4];
-		if(tid%16 == 0)
-			smemVa[tid] = smemVa[tid] + smemVa[tid + 8];
-		vs = smemVa[0];
+		__syncthreads();
+
+		__m128i out2 = smemOut[0] ^ smemOut[1] ^ smemOut[2] ^ smemOut[3];
+		va_tmp1 = ((float*)smemVa)[b * 16 + bb] + ((float*)smemVa)[b * 16 + bb + 4];
+		va_tmp2 = ((float*)smemVa)[b * 16 + bb + 8] + ((float*)smemVa)[b * 16 + bb + 12];
+		((float*)smemVa)[tid] = va_tmp1 + va_tmp2;
 
+		__syncthreads();
+
+		vs = smemVa[0];
 		vs.abs(); // take abs(va) by masking the float sign bit
 		auto xx = _mm_mul_ps(vs, __m128(16777216.0f));
 		// vs range 0 - 64

From 88ea7f3693a04c8b66df10dca2d4be2d60b26e01 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Fri, 1 Feb 2019 10:23:07 +0100
Subject: [PATCH 11/19] OpenCL: use algorithm names instead of number

Use the algorithm names from `cryptonight.hpp` instead if number within the OpenCL kernel.
---
 .../backend/amd/amd_gpu/opencl/cryptonight.cl | 114 +++++++++---------
 .../amd/amd_gpu/opencl/fast_div_heavy.cl      |   4 +-
 .../amd/amd_gpu/opencl/fast_int_math_v2.cl    |   3 +-
 3 files changed, 61 insertions(+), 60 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 53394037b..faea409ed 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -14,6 +14,23 @@ R"===(
   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
   */
 
+// defines to translate algorithm names int a same number used within cryptonight.h
+#define invalid_algo 0
+#define cryptonight 1
+#define cryptonight_lite 2
+#define cryptonight_monero 3
+#define cryptonight_heavy 4
+#define cryptonight_aeon 5
+#define cryptonight_ipbc 6
+#define cryptonight_stellite 7
+#define cryptonight_masari 8
+#define cryptonight_haven 9
+#define cryptonight_bittube2 10
+#define cryptonight_monero_v8 11
+#define cryptonight_superfast 12
+#define cryptonight_gpu 13
+#define cryptonight_turtle 14
+
 /* For Mesa clover support */
 #ifdef cl_clang_storage_class_specifiers
 #   pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
@@ -348,7 +365,7 @@ XMRSTAK_INCLUDE_BLAKE256
 //#include "opencl/groestl256.cl"
 XMRSTAK_INCLUDE_GROESTL256
 
-#if (ALGO == 13)
+#if (ALGO == cryptonight_gpu)
 	//#include "opencl/cryptonight_gpu.cl"
 	XMRSTAK_INCLUDE_CN_GPU
 #endif
@@ -504,8 +521,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 
     mem_fence(CLK_LOCAL_MEM_FENCE);
 
-// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
-#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
+#if (ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
     __local uint4 xin[8][8];
     {
 
@@ -559,8 +575,8 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 )==="
 R"===(
 
-// cryptonight_monero_v8 && NVIDIA
-#if((ALGO==11 || ALGO==14) && defined(__NV_CL_C_VERSION))
+// __NV_CL_C_VERSION checks if NVIDIA opencl is used
+#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) && defined(__NV_CL_C_VERSION))
 #	define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4))))
 #	define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4)))))
 #else
@@ -569,16 +585,15 @@ R"===(
 
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, uint Threads
-// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
-#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
+
+#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
 , __global ulong *input
 #endif
 )
 {
 	ulong a[2];
 
-// cryptonight_monero_v8
-#if(ALGO==11 || ALGO==14)
+#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
 	ulong b[4];
 	uint4 b_x[2];
 // NVIDIA
@@ -592,8 +607,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #endif
     __local uint AES0[256], AES1[256];
 
-// cryptonight_monero_v8
-#if(ALGO==11 || ALGO==14)
+#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
 #	if defined(__clang__) && !defined(__NV_CL_C_VERSION)
     __local uint RCP[256];
 #	endif
@@ -608,15 +622,15 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
         const uint tmp = AES0_C[i];
         AES0[i] = tmp;
         AES1[i] = rotate(tmp, 8U);
-// cryptonight_monero_v8
-#if((ALGO==11 || ALGO==14) && (defined(__clang__) && !defined(__NV_CL_C_VERSION)))
+
+#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) && (defined(__clang__) && !defined(__NV_CL_C_VERSION)))
 		RCP[i] = RCP_C[i];
 #endif
     }
 
     barrier(CLK_LOCAL_MEM_FENCE);
-// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
-#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
+
+#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
     uint2 tweak1_2;
 #endif
 
@@ -643,8 +657,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 
 		b_x[0] = ((uint4 *)b)[0];
 
-// cryptonight_monero_v8
-#if(ALGO==11 || ALGO==14)
+#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
         a[1] = states[1] ^ states[5];
         b[2] = states[8] ^ states[10];
         b[3] = states[9] ^ states[11];
@@ -652,8 +665,8 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 		division_result = as_uint2(states[12]);
 		sqrt_result = as_uint2(states[13]).s0;
 #endif
-// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
-#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
+
+#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
 		tweak1_2 = as_uint2(input[4]);
 		tweak1_2.s0 >>= 24;
 		tweak1_2.s0 |= tweak1_2.s1 << 8;
@@ -675,22 +688,21 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
     for(int i = 0; i < ITERATIONS; ++i)
     {
 			ulong c[2];
-// cryptonight_monero_v8 && NVIDIA
-#if((ALGO==11 || ALGO==14) && defined(__NV_CL_C_VERSION))
+
+#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle) && defined(__NV_CL_C_VERSION))
 			uint idxS = idx0 & 0x30U;
  			*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
 #endif
 
 			((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0);
-// cryptonight_bittube2
-#if(ALGO == 10)
+
+#if(ALGO == cryptonight_bittube2)
 			((uint4 *)c)[0] = AES_Round2_bittube2(AES0, AES1, ~((uint4 *)c)[0], ((uint4 *)a)[0]);
 #else
 			((uint4 *)c)[0] = AES_Round2(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);
 #endif
 
-// cryptonight_monero_v8
-#if(ALGO==11 || ALGO==14)
+#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
         {
 			ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1));
 			ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
@@ -701,12 +713,11 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
         }
 #endif
 
-// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
-#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
+#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
 			uint table = 0x75310U;
 			b_x[0] ^= ((uint4 *)c)[0];
-// cryptonight_stellite
-#	if(ALGO == 7)
+
+#	if(ALGO == cryptonight_stellite)
 			uint index = ((b_x[0].s2 >> 27) & 12) | ((b_x[0].s2 >> 23) & 2);
 #	else
 			uint index = ((b_x[0].s2 >> 26) & 12) | ((b_x[0].s2 >> 23) & 2);
@@ -714,8 +725,8 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			b_x[0].s2 ^= ((table >> index) & 0x30U) << 24;
 			SCRATCHPAD_CHUNK(0) = b_x[0];
 			idx0 = as_uint2(c[0]).s0 & MASK;
-// cryptonight_monero_v8
-#elif(ALGO==11 || ALGO==14)
+
+#elif(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
 			SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0];
 #	ifdef __NV_CL_C_VERSION
 			// flush shuffled data
@@ -733,8 +744,8 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #endif
 			uint4 tmp;
 			tmp = SCRATCHPAD_CHUNK(0);
-// cryptonight_monero_v8
-#if(ALGO==11 || ALGO==14)
+
+#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
 			// Use division and square root results from the _previous_ iteration to hide the latency
             tmp.s0 ^= division_result.s0;
             tmp.s1 ^= division_result.s1 ^ sqrt_result;
@@ -770,11 +781,10 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			a[1] += c[0] * as_ulong2(tmp).s0;
 			a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
 #endif
-// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2
-#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10)
 
-// cryptonight_ipbc || cryptonight_bittube2
-#	if(ALGO == 6 || ALGO == 10)
+#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
+
+#	if(ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2)
 			uint2 ipbc_tmp = tweak1_2 ^ ((uint2 *)&(a[0]))[0];
 			((uint2 *)&(a[1]))[0] ^= ipbc_tmp;
 			SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];
@@ -791,8 +801,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 
         ((uint4 *)a)[0] ^= tmp;
 
-// cryptonight_monero_v8
-#if (ALGO == 11 || ALGO==14)
+#if (ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
 #	if defined(__NV_CL_C_VERSION)
 			// flush shuffled data
 			SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
@@ -802,15 +811,13 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			b_x[0] = ((uint4 *)c)[0];
 			idx0 = as_uint2(a[0]).s0 & MASK;
 
-// cryptonight_heavy || cryptonight_bittube2
-#if (ALGO == 4 || ALGO == 10)
+#if (ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2)
 			long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
 			int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
             long q = fast_div_heavy(n, d | 0x5);
 			*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
 			idx0 = (d ^ as_int2(q).s0) & MASK;
-// cryptonight_haven || cryptonight_superfast
-#elif (ALGO == 9 || ALGO == 12)
+#elif (ALGO == cryptonight_haven || ALGO == cryptonight_superfast)
 			long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
 			int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
 			long q = fast_div_heavy(n, d | 0x5);
@@ -828,8 +835,8 @@ R"===(
 
 __attribute__((reqd_work_group_size(8, 8, 1)))
 __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states,
-// cryptonight_gpu
-#if (ALGO == 13)
+
+#if (ALGO == cryptonight_gpu)
 	__global uint *output, ulong Target, uint Threads)
 #else
 	__global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, uint Threads)
@@ -851,8 +858,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-// cryptonight_gpu || cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
-#if (ALGO == 13 || ALGO == 4 || ALGO == 9 || ALGO == 10  || ALGO == 12)
+#if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2  || ALGO == cryptonight_superfast)
     __local uint4 xin1[8][8];
     __local uint4 xin2[8][8];
 #endif
@@ -890,8 +896,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-// cryptonight_gpu || cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
-#if (ALGO == 13 || ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
+#if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
     __local uint4* xin1_store = &xin1[get_local_id(1)][get_local_id(0)];
     __local uint4* xin1_load = &xin1[(get_local_id(1) + 1) % 8][get_local_id(0)];
     __local uint4* xin2_store = &xin2[get_local_id(1)][get_local_id(0)];
@@ -904,8 +909,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
     if (gIdx < Threads)
 #endif
     {
-// cryptonight_gpu || cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
-#if (ALGO == 13 || ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
+
+#if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
         #pragma unroll 2
         for(int i = 0, i1 = get_local_id(1); i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4))
         {
@@ -945,8 +950,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #endif
     }
 
-// cryptonight_gpu || cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast
-#if (ALGO == 13 || ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12)
+#if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
     /* Also left over threads performe this loop.
      * The left over thread results will be ignored
      */
@@ -988,10 +992,10 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
             for(int i = 0; i < 25; ++i) State[i] = states[i];
 
             keccakf1600_2(State);
-#if (ALGO == 13)
+
+#if (ALGO == cryptonight_gpu)
 			if(State[3] <= Target)
 			{
-                //printf("gt %lu\n", State[3]);
 				ulong outIdx = atomic_inc(output + 0xFF);
 				if(outIdx < 0xFF)
 					output[outIdx] = get_global_id(0);
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl
index 161f2f55d..4469b0670 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl
@@ -1,7 +1,6 @@
 R"===(
-#ifndef FAST_DIV_HEAVY_CL
-#define FAST_DIV_HEAVY_CL
 
+#if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 inline long fast_div_heavy(long _a, int _b)
 {
 	long a = abs(_a);
@@ -19,6 +18,5 @@ inline long fast_div_heavy(long _a, int _b)
 	const long q = q1 + q2 + q3;
 	return ((as_int2(_a).s1 ^ _b) < 0) ? -q : q;
 }
-
 #endif
 )==="
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
index 93e304aee..b34e68294 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
@@ -3,8 +3,7 @@ R"===(
  * @author SChernykh
  */
 
-// cryptonight_monero_v8
-#if(ALGO==11 || ALGO==14)
+#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_turtle)
 
 static const __constant uint RCP_C[256] =
 {

From e8ec992155db76194662230552ca986d27222968 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Thu, 31 Jan 2019 22:19:06 +0100
Subject: [PATCH 12/19] cuda: optimize cn-gpu

psychocrypt committed 9 minutes ago
 - use precomuted indicies within the loop
 - `cn_explode_gpu` use all threads to load the state
---
 .../nvidia/nvcc_code/cuda_cryptonight_gpu.hpp | 62 +++++++++++--------
 1 file changed, 36 insertions(+), 26 deletions(-)

diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
index d3df0fed2..94750560c 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
@@ -347,7 +347,7 @@ __forceinline__ __device__ __m128i single_comupte(__m128 n0, __m128 n1, __m128 n
 
 }
 
-__forceinline__ __device__ void single_comupte_wrap(const uint32_t rot, __m128i v0, __m128i v1, __m128i v2, __m128i v3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out)
+__forceinline__ __device__ void single_comupte_wrap(const uint32_t rot, const __m128i& v0, const __m128i& v1, const __m128i& v2, const __m128i& v3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out)
 {
 	__m128 n0(v0);
 	__m128 n1(v1);
@@ -402,6 +402,16 @@ __constant__ float ccnt[16] = {
 	1.4609375f
 };
 
+
+__forceinline__ __device__ void sync()
+{
+#if (__CUDACC_VER_MAJOR__ >= 9)
+	__syncwarp();
+#else
+	__syncthreads( );
+#endif
+}
+
 template<size_t ITERATIONS, uint32_t MEMORY>
 __global__ void cryptonight_core_gpu_phase2_gpu(int32_t *spad, int *lpad_in, int bfactor, int partidx, uint32_t * roundVs, uint32_t * roundS)
 {
@@ -440,18 +450,19 @@ __global__ void cryptonight_core_gpu_phase2_gpu(int32_t *spad, int *lpad_in, int
 		s = ((uint32_t*)spad)[idxHash * 50] >> 8;
 	}
 
+	const uint32_t b = tid / 4;
+	const uint32_t bb = tid % 4;
+	const uint32_t block = b * 16 + bb;
+
 	for(size_t i = 0; i < batchsize; i++)
 	{
-		__syncthreads();
-
-		((int*)smem)[tid] = ((int*)scratchpad_ptr<MASK>(s, tid/4, lpad))[tid%4];
-		__syncthreads();
+		sync();
+		((int*)smem)[tid] = ((int*)scratchpad_ptr<MASK>(s, b, lpad))[bb];
+		sync();
 
 		__m128 rc = vs;
-
-
 		single_comupte_wrap(
-			tid%4,
+			bb,
 			*(smem + look[tid][0]),
 			*(smem + look[tid][1]),
 			*(smem + look[tid][2]),
@@ -460,40 +471,37 @@ __global__ void cryptonight_core_gpu_phase2_gpu(int32_t *spad, int *lpad_in, int
 			smemOut[tid]
 		);
 
-		__syncthreads();
+		sync();
 
-		const uint32_t b = tid / 4;
-		const uint32_t bb = tid % 4;
-
-		int outXor = ((int*)smemOut)[b * 16 + bb];
-		for(uint32_t dd = b * 16 + 4 + bb; dd < (b + 1) * 16; dd += 4)
+		int outXor = ((int*)smemOut)[block];
+		for(uint32_t dd = block + 4; dd < (b + 1) * 16; dd += 4)
 			outXor ^= ((int*)smemOut)[dd];
 
-		((int*)scratchpad_ptr<MASK>(s, tid/4, lpad))[tid%4] = outXor ^ ((int*)smem)[tid];
+		((int*)scratchpad_ptr<MASK>(s, b, lpad))[bb] = outXor ^ ((int*)smem)[tid];
 		((int*)smemOut)[tid] = outXor;
 
-		float va_tmp1 = ((float*)smemVa)[b * 16 + bb] + ((float*)smemVa)[b * 16 + bb + 4];
-		float va_tmp2 = ((float*)smemVa)[b * 16 + bb + 8] + ((float*)smemVa)[b * 16 + bb + 12];
+		float va_tmp1 = ((float*)smemVa)[block] + ((float*)smemVa)[block + 4];
+		float va_tmp2 = ((float*)smemVa)[block+ 8] + ((float*)smemVa)[block + 12];
 		((float*)smemVa)[tid] = va_tmp1 + va_tmp2;
 
-		__syncthreads();
+		sync();
 
 		__m128i out2 = smemOut[0] ^ smemOut[1] ^ smemOut[2] ^ smemOut[3];
-		va_tmp1 = ((float*)smemVa)[b * 16 + bb] + ((float*)smemVa)[b * 16 + bb + 4];
-		va_tmp2 = ((float*)smemVa)[b * 16 + bb + 8] + ((float*)smemVa)[b * 16 + bb + 12];
+		va_tmp1 = ((float*)smemVa)[block] + ((float*)smemVa)[block + 4];
+		va_tmp2 = ((float*)smemVa)[block + 8] + ((float*)smemVa)[block + 12];
 		((float*)smemVa)[tid] = va_tmp1 + va_tmp2;
 
-		__syncthreads();
+		sync();
 
 		vs = smemVa[0];
 		vs.abs(); // take abs(va) by masking the float sign bit
 		auto xx = _mm_mul_ps(vs, __m128(16777216.0f));
 		// vs range 0 - 64
-		*smem = xx.get_int();
-		*smem = _mm_xor_si128(*smem, out2);
+		auto xx_int = xx.get_int();
+		out2 = _mm_xor_si128(xx_int, out2);
 		// vs is now between 0 and 1
 		vs = _mm_div_ps(vs, __m128(64.0f));
-		s = smem->x ^ smem->y ^ smem->z ^ smem->w;
+		s = out2.x ^ out2.y ^ out2.z ^ out2.w;
 	}
 	if(partidx != ((1<<bfactor) - 1) && threadIdx.x % 16 == 0)
 	{
@@ -539,8 +547,10 @@ __global__ void cn_explode_gpu(int32_t *spad_in, int *lpad_in)
 	uint8_t* lpad = (uint8_t*)lpad_in + blockIdx.x * MEMORY;
 	uint64_t* spad = (uint64_t*)((uint8_t*)spad_in + blockIdx.x * 200);
 
-	constexpr size_t hash_size = 200; // 25x8 bytes
-	memcpy(state, spad, hash_size);
+	for(int i = threadIdx.x; i < 25; i += blockDim.x)
+		state[i] = spad[i];
+
+	sync();
 
 	for(uint64_t i = threadIdx.x; i < MEMORY / 512; i+=blockDim.x)
 	{

From 6e4e8d938c2b57a5c05807462c2b898bccd86163 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Fri, 1 Feb 2019 20:55:06 +0100
Subject: [PATCH 13/19] add ryo algorithm fork

switch with block 6 to cryptonigh_gpu
(https://github.com/ryo-currency/ryo-currency/blob/7bc5dc40864e518753807d04c17d617f728da156/src/cryptonote_config.h#L185)
---
 xmrstak/jconf.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index 80e6002d7..6c2de4b3b 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -111,7 +111,7 @@ xmrstak::coin_selection coins[] = {
 	{ "masari",              {cryptonight_monero_v8, cryptonight_masari, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u},nullptr },
 	{ "monero",              {cryptonight_monero_v8, cryptonight_monero_v8, 0u},     {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, "pool.usxmrpool.com:3333" },
 	{ "qrl",             	 {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
-	{ "ryo",                 {cryptonight_heavy, cryptonight_heavy, 0u},          {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
+	{ "ryo",                 {cryptonight_gpu, cryptonight_heavy, 6u},            {cryptonight_gpu, cryptonight_heavy, 6u},   nullptr },
 	{ "stellite",            {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "turtlecoin",          {cryptonight_turtle, cryptonight_aeon, 5u},            {cryptonight_aeon, cryptonight_aeon, 0u},     nullptr },
 	{ "plenteum",			 {cryptonight_turtle, cryptonight_aeon, 5u},            {cryptonight_aeon, cryptonight_aeon, 0u},     nullptr }

From 3f6bd5a25e5b6c0e22a99f5d0b296be61b766a63 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Fri, 1 Feb 2019 22:30:33 +0100
Subject: [PATCH 14/19] CUDA: optimze cn_gpu auto suggestion

optimize the algorithm for cryptonight_gpu autosuggestion
---
 .../backend/nvidia/nvcc_code/cuda_extra.cu    | 45 +++++++++++++++----
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index e4574e20a..a37ecc8a0 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -593,6 +593,10 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		}
 	}
 
+	auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
+	bool useCryptonight_gpu = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_gpu) != neededAlgorithms.end();
+
+
 	// set all device option those marked as auto (-1) to a valid value
 	if(ctx->device_blocks == -1)
 	{
@@ -600,8 +604,11 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		 *   - 3 * SMX count for >=sm_30
 		 *   - 2 * SMX count for  <sm_30
 		 */
-		ctx->device_blocks = props.multiProcessorCount *
-			( props.major < 3 ? 2 : 3 );
+		ctx->device_blocks = props.multiProcessorCount * (props.major < 3 ? 2 : 3);
+
+		// use 6 blocks per SM for sm_2X else 8 blocks
+		if(useCryptonight_gpu)
+			ctx->device_blocks = props.multiProcessorCount * (props.major < 3 ? 6 : 8);
 
 		// increase bfactor for low end devices to avoid that the miner is killed by the OS
 		if(props.multiProcessorCount <= 6)
@@ -613,7 +620,16 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		 * `cryptonight_core_gpu_phase1` and `cryptonight_core_gpu_phase3` starts
 		 * `8 * ctx->device_threads` threads per block
 		 */
-		ctx->device_threads = 64;
+		const uint32_t maxThreadsPerBlock = props.major < 3 ? 512 : 1024;
+
+		// for the most algorithms we are using 8 threads per hash
+		uint32_t threadsPerHash = 8;
+
+		// phase2_gpu uses 16 threads per hash
+		if(useCryptonight_gpu)
+			threadsPerHash = 16;
+
+		ctx->device_threads = maxThreadsPerBlock / threadsPerHash;
 		constexpr size_t byteToMiB = 1024u * 1024u;
 
 		// no limit by default 1TiB
@@ -678,8 +694,6 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		ctx->total_device_memory = totalMemory;
 		ctx->free_device_memory = freeMemory;
 
-		auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
-
 		size_t hashMemSize = 0;
 		for(const auto algo : neededAlgorithms)
 		{
@@ -725,10 +739,9 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		// use only odd number of threads
 		ctx->device_threads = ctx->device_threads & 0xFFFFFFFE;
 
-		if(props.major == 2 && ctx->device_threads > 64)
+		if(ctx->device_threads > maxThreadsPerBlock / threadsPerHash)
 		{
-			// Fermi gpus only support 512 threads per block (we need start 4 * configured threads)
-			ctx->device_threads = 64;
+			ctx->device_threads = maxThreadsPerBlock / threadsPerHash;
 		}
 
 		// check if cryptonight_monero_v8 is selected for the user pool
@@ -749,6 +762,22 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 				ctx->device_blocks = blockOptimal;
 			}
 		}
+		else if(useCryptonight_gpu)
+		{
+			// 8 based on my profiling sessions maybe it must be adjusted later
+			size_t threads = 8;
+			// 8 is chosen by checking the occupancy calculator
+			size_t blockOptimal = 8 * ctx->device_mpcount;
+			if(gpuArch >= 70)
+				blockOptimal = 5 *  ctx->device_mpcount;
+			
+			if(blockOptimal * threads * hashMemSize < limitedMemory)
+			{
+				ctx->device_threads = threads;
+				ctx->device_blocks = blockOptimal;
+			}
+
+		}
 	}
 	printf("device init succeeded\n");
 

From 4524b875e9397a5856444090ff7f9a930f8a690a Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Fri, 1 Feb 2019 22:32:55 +0100
Subject: [PATCH 15/19] OpenCL: fix work size message

Fix message with the maximal allowed worksize if cryptonight_gpu is
used.
---
 xmrstak/backend/amd/amd_gpu/gpu.cpp | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 857abc138..f80c37a8b 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -284,10 +284,21 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		return ERR_OCL_API;
 	}
 
-	/* Some kernel spawn 8 times more threads than the user is configuring.
-	 * To give the user the correct maximum work size we divide the hardware specific max by 8.
-	 */
-	MaximumWorkSize /= 8;
+	auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
+	bool useCryptonight_gpu = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_gpu) != neededAlgorithms.end();
+
+	if(useCryptonight_gpu)
+	{
+		// work cn_1 we use 16x more threads than configured by the user
+		MaximumWorkSize /= 16;
+	}
+	else
+	{
+		/* Some kernel spawn 8 times more threads than the user is configuring.
+		 * To give the user the correct maximum work size we divide the hardware specific max by 8.
+		 */
+		MaximumWorkSize /= 8;
+	}
 	printer::inst()->print_msg(L1,"Device %lu work size %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize);
 #if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2)
 	const cl_queue_properties CommandQueueProperties[] = { 0, 0, 0 };
@@ -316,8 +327,6 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		return ERR_OCL_API;
 	}
 
-	auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms();
-
 	size_t scratchPadSize = 0;
 	for(const auto algo : neededAlgorithms)
 	{

From e274dbcc8ef19665cb913777837fa0ac7e44ff26 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sat, 2 Feb 2019 22:37:19 +0100
Subject: [PATCH 16/19] OpenCL: fix Blake hashing

Windows driver creates wrong code if unroll is used.
---
 xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index faea409ed..2c775e77e 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -1173,7 +1173,6 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u
 
         ((uint8 *)h)[0] = vload8(0U, c_IV256);
 
-        #pragma unroll 4
         for(uint i = 0, bitlen = 0; i < 4; ++i)
         {
             if(i < 3)
@@ -1295,4 +1294,4 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
         }
     }
 
-)==="
+)==="
\ No newline at end of file

From 1ca0b951a2a7ef2c489b40aa94b00538592adcf5 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sat, 2 Feb 2019 22:55:45 +0100
Subject: [PATCH 17/19] coin cleanup

- rename `Intense` to `Lethean`
- remove `masari` and `stellite`, both hase forked to a currently
unsupported algorithm
---
 README.md         | 3 +--
 xmrstak/jconf.cpp | 4 +---
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index e0b62503c..1040e56ea 100644
--- a/README.md
+++ b/README.md
@@ -43,8 +43,7 @@ Besides [Monero](https://getmonero.org), following coins can be mined using this
 - [BitTube](https://coin.bit.tube/)
 - [Graft](https://www.graft.network)
 - [Haven](https://havenprotocol.com)
-- [Intense](https://intensecoin.com)
-- [Masari](https://getmasari.org)
+- [Lethean](https://lethean.io)
 - [QRL](https://theqrl.org)
 - **[Ryo](https://ryo-currency.com) - Upcoming xmr-stak-gui is sponsored by Ryo**
 - [TurtleCoin](https://turtlecoin.lol)
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index 6c2de4b3b..a16910552 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -107,12 +107,10 @@ xmrstak::coin_selection coins[] = {
 	{ "freehaven",           {cryptonight_heavy, cryptonight_superfast, 255u},    {cryptonight_heavy, cryptonight_superfast, 0u},   nullptr },
 	{ "graft",               {cryptonight_monero_v8, cryptonight_monero_v8, 0u},    {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "haven",               {cryptonight_heavy, cryptonight_haven, 255u},        {cryptonight_heavy, cryptonight_heavy, 0u},   nullptr },
-	{ "intense",             {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
-	{ "masari",              {cryptonight_monero_v8, cryptonight_masari, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u},nullptr },
+	{ "lethean",             {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "monero",              {cryptonight_monero_v8, cryptonight_monero_v8, 0u},     {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, "pool.usxmrpool.com:3333" },
 	{ "qrl",             	 {cryptonight_monero_v8, cryptonight_monero, 255u},   {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "ryo",                 {cryptonight_gpu, cryptonight_heavy, 6u},            {cryptonight_gpu, cryptonight_heavy, 6u},   nullptr },
-	{ "stellite",            {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "turtlecoin",          {cryptonight_turtle, cryptonight_aeon, 5u},            {cryptonight_aeon, cryptonight_aeon, 0u},     nullptr },
 	{ "plenteum",			 {cryptonight_turtle, cryptonight_aeon, 5u},            {cryptonight_aeon, cryptonight_aeon, 0u},     nullptr }
 };

From d834e9e0f6f2eaf15dfc8b426daaa4a40b18c1c5 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sat, 2 Feb 2019 23:24:14 +0100
Subject: [PATCH 18/19] update documentation

- remove driver warning
- add link to supported card y the ROCm driver
---
 doc/compile_Linux.md   | 4 +---
 doc/compile_Windows.md | 3 ---
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/doc/compile_Linux.md b/doc/compile_Linux.md
index ebf115430..6c80bc56a 100644
--- a/doc/compile_Linux.md
+++ b/doc/compile_Linux.md
@@ -9,10 +9,8 @@
 - run `./amdgpu-pro-install --opencl=legacy,pal` from the unzipped folder
 - set the environment variable to opencl `export AMDAPPSDKROOT=/opt/amdgpu-pro/`
 
-**ATTENTION** The linux driver 18.3 creating invalid shares. 
-If you have an issue with `invalid shares` please downgrade your driver or switch to ROCm.
-
 For linux also the OpenSource driver ROCm 1.9.X+ is a well working alternative, see https://rocm.github.io/ROCmInstall.html
+ROCm is not supporting old GPUs please check if your GPU is supported https://rocm.github.io/hardware.html.
 
 ### Cuda 8.0+ (only needed to use NVIDIA GPUs)
 
diff --git a/doc/compile_Windows.md b/doc/compile_Windows.md
index 8fe4dcf53..64d68bab1 100644
--- a/doc/compile_Windows.md
+++ b/doc/compile_Windows.md
@@ -34,9 +34,6 @@
 
 - Download & install the AMD driver: https://www.amd.com/en/support
 
-**ATTENTION** Many windows driver 18.5+ creating invalid shares.
-If you have an issue with `invalid shares` please downgrade your driver.
-
 - Download and install the latest version of the OCL-SDK from https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/releases 
 
 Do not follow old information that you need the AMD APP SDK. AMD has removed the APP SDK and is now shipping the OCL-SDK_light.

From 62b305a73cf8be50c863d8cdb356141b8d2925bb Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sat, 2 Feb 2019 23:28:09 +0100
Subject: [PATCH 19/19] update version to 2.8.0

---
 xmrstak/version.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp
index 88e3a5cad..84fd048bd 100644
--- a/xmrstak/version.cpp
+++ b/xmrstak/version.cpp
@@ -18,7 +18,7 @@
 #endif
 
 #define XMR_STAK_NAME "xmr-stak"
-#define XMR_STAK_VERSION "2.7.1"
+#define XMR_STAK_VERSION "2.8.0"
 
 #if defined(_WIN32)
 #define OS_TYPE "win"

Thread ID	10s	60s	15m
Rig ID	%s
Pool address	%s
Connected since	%s
Pool ping time	%u ms
%u	%s	%s	%s
%s	%s	%s	%s
Totals:	%s	%s	%s
Currency	%s
Difficulty	%u
Good results	%u / %u (%.1f %%)
Avg result time	%.1f sec