From faa6f7c50a05edda3069144269fb0225b226306b Mon Sep 17 00:00:00 2001 From: Alexander Lyashevsky Date: Tue, 18 Jul 2017 23:20:16 -0700 Subject: [PATCH 01/11] fixed large image --- src/kernels/MIOpenConv1x1.cl | 52 ++++++++++++++++++++++++++++-------- src/mlo_dir_conv.cpp | 7 +++-- 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/src/kernels/MIOpenConv1x1.cl b/src/kernels/MIOpenConv1x1.cl index a673227290..fb49c822c2 100644 --- a/src/kernels/MIOpenConv1x1.cl +++ b/src/kernels/MIOpenConv1x1.cl @@ -39,12 +39,13 @@ // calculating the size of the area for weights prefetch + #if MLO_N_MAPS_PERGROUP > 1 -#define MLO_WEIGHTS_PER_LOOP_MAX 8 +#define MLO_WEIGHTS_PER_LOOP_MAX (8) #else -#define MLO_WEIGHTS_PER_LOOP_MAX 16 +#define MLO_WEIGHTS_PER_LOOP_MAX (16) #endif -#if((MLO_N_MAPS_PERGROUP * MLO_N_LCL_IN_MAPS) < MLO_N_INPUTS) +#if(MLO_N_MAPS_PERGROUP * MLO_N_LCL_IN_MAPS < MLO_N_INPUTS) #define MLO_LCL_IN_ROW (MLO_N_MAPS_PERGROUP * MLO_N_LCL_IN_MAPS) #else #define MLO_LCL_IN_ROW (MLO_N_INPUTS) @@ -52,12 +53,18 @@ #define MLO_WEIGHTS_PER_LOOP_TMP ((MLO_N_INPUTS + MLO_LCL_IN_ROW - 1) / MLO_LCL_IN_ROW) + #if(MLO_WEIGHTS_PER_LOOP_TMP < MLO_WEIGHTS_PER_LOOP_MAX) #define MLO_WEIGHTS_PER_LOOP (MLO_WEIGHTS_PER_LOOP_TMP) #else #define MLO_WEIGHTS_PER_LOOP (MLO_WEIGHTS_PER_LOOP_MAX) #endif + + #define MLO_LCL_WEIGHTS_ROW (MLO_WEIGHTS_PER_LOOP * MLO_LCL_IN_ROW) + +#define MLO_IN_LOOP ((MLO_N_INPUTS + MLO_LCL_WEIGHTS_ROW - 1)/MLO_LCL_WEIGHTS_ROW) + #define MLO_WEIGHTS_ROW (MLO_LCL_WEIGHTS_ROW * MLO_WEI_CHANNEL_STRIDE) // size of the area for weights prefetch @@ -112,6 +119,7 @@ After completion of the main MLO_IN_LOOP loop partial sums have been summed up i */ +__attribute__((reqd_work_group_size(MLO_GRP_SZ0, MLO_GRP_SZ1, MLO_GRP_SZ2))) __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr, const __global _FLOAT* __restrict wei_ptr, #if MLO_CONV_BIAS @@ -128,6 +136,7 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr, __private _FLOAT out_tiles[MLO_N_LCL_BATCHS][MLO_N_LCL_OUT_MAPS][MLO_READ_UNIT]; __local _FLOAT lcl_wei_stage[MLO_LCL_MEM_SZ]; + #if MLO_N_MAPS_PERGROUP > 1 __local _FLOAT* lcl_out_stage = lcl_wei_stage; @@ -167,7 +176,7 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr, } } // over all input maps; with step == MLO_N_LCL_IN_MAPS * MLO_N_MAPS_PERGROUP; MLO_IN_LOOP - for(uint wc = 0; wc < MLO_IN_LOOP; wc += MLO_WEIGHTS_PER_LOOP) + for(uint wc = 0; wc < MLO_IN_LOOP; ++wc) { // read array of weights barrier(CLK_LOCAL_MEM_FENCE); @@ -184,7 +193,7 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr, uint lwi = (w & (MLO_WEIGHTS_ROW - 1)); #endif - uint wi = (wc * (MLO_N_LCL_IN_MAPS * MLO_N_MAPS_PERGROUP) + lwi) * + uint wi = (wc * MLO_LCL_WEIGHTS_ROW + lwi) * #if MLO_DIR_FORWARD == 1 MLO_WEI_CHANNEL_STRIDE; #else @@ -192,7 +201,7 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr, #endif // out of range check - uint wei_off_r = wei_off + wi + + uint wei_off1 = wei_off + wi + oi * #if MLO_DIR_FORWARD == 1 MLO_WEI_BSTRIDE; @@ -200,29 +209,30 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr, MLO_WEI_CHANNEL_STRIDE; #endif - wei_off_r = (wei_off_r < MLO_N_OUTPUTS * MLO_N_INPUTS) ? wei_off_r : 0; + uint wei_off_r = (wei_off1 < MLO_N_OUTPUTS * MLO_N_INPUTS) ? wei_off1 : 0; _FLOAT wei_val = wei_ptr[wei_off_r]; - wei_val = (wei_off_r < MLO_N_OUTPUTS * MLO_N_INPUTS) ? wei_val : 0; + wei_val = (wei_off1 < MLO_N_OUTPUTS * MLO_N_INPUTS) ? wei_val : 0; lcl_wei_stage[w] = wei_val; } barrier(CLK_LOCAL_MEM_FENCE); - + uint c = 0; #if MLO_WEIGHTS_PER_LOOP > 7 #pragma unroll(MLO_WEIGHTS_PER_LOOP / 8) #endif for(uint ci = 0; ci < MLO_WEIGHTS_PER_LOOP; ++ci, in_off += MLO_IN_CHANNEL_STRIDE * MLO_N_LCL_IN_MAPS * MLO_N_MAPS_PERGROUP) { - uint c = wc + ci; + c = wc + ci; uint wei_indx = ci; // read data // over all local batchs uint in_off1 = in_off; + uint in_off2 = 0; for(uint ib = 0; ib < MLO_N_LCL_BATCHS; ++ib, in_off1 += MLO_IN_BATCH_STRIDE) { - uint in_off2 = in_off1; + in_off2 = in_off1; // lcl in maps (in data tiles) is has the stride = MLO_N_MAPS_PERGROUP for(uint ilc = 0; ilc < MLO_N_LCL_IN_MAPS; ++ilc, in_off2 += MLO_IN_CHANNEL_STRIDE * MLO_N_MAPS_PERGROUP) @@ -234,6 +244,9 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr, c * MLO_N_LCL_IN_MAPS * MLO_N_MAPS_PERGROUP + in_map_id + ilc * MLO_N_MAPS_PERGROUP < MLO_N_INPUTS; +#ifndef __AMDGCN__ + in_off2 = (v) ? in_off2 : 0; +#endif __global const _FLOAT* in_p = &in_ptr[in_off2]; #if MLO_C1x1_PIXLEFT > 0 // if the last one @@ -301,6 +314,23 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr, for(uint i = 0; i < MLO_READ_UNIT; ++i) { out_tiles[ib][olc][i] += in_stage[ib][ilc][i] * wei_stage; + +#if 0 //MLO_DIR_FORWARD == 0 + if ( /*in_stage[ib][ilc][i] * wei_stage!= 0 && */out_grp_block * MLO_N_LCL_OUT_MAPS + olc == 8 && i == 0 && get_global_id(0) == 0 && get_global_id(1) == 0 && get_global_id(2) == 0) + { + printf("K:c: %d %d %d %d %f %f %f %f\n", + wc, + MLO_IN_LOOP, + MLO_WEIGHTS_PER_LOOP, + MLO_WEIGHTS_ROW, + + out_tiles[ib][olc][i], + in_stage[ib][ilc][i] * wei_stage, + in_stage[ib][ilc][i], + wei_stage + ); + } +#endif } } } diff --git a/src/mlo_dir_conv.cpp b/src/mlo_dir_conv.cpp index ac6b225f5d..5e04007b41 100644 --- a/src/mlo_dir_conv.cpp +++ b/src/mlo_dir_conv.cpp @@ -1395,7 +1395,7 @@ int mlo_construct_direct2D::mloConstructDirect2D1x1() // n of input map per group N_MAPS_PERGROUP = std::min(N_MAPS_PERGROUP, n_input_scaled); // number of input loops - int n_in_loop = (n_input_scaled + N_MAPS_PERGROUP - 1) / N_MAPS_PERGROUP; + // int n_in_loop = (n_input_scaled + N_MAPS_PERGROUP - 1) / N_MAPS_PERGROUP; // number of batches inside wk_item _n_stacks = std::min(_batch_sz, _n_stacks); @@ -1434,10 +1434,13 @@ int mlo_construct_direct2D::mloConstructDirect2D1x1() std::to_string(static_cast(wei_cstride)) // algorithm parameters + std::string(" -DMLO_GRP_SZ0=") + std::to_string(static_cast(GRP_SZ)) + + std::string(" -DMLO_GRP_SZ1=") + std::to_string(1) + + std::string(" -DMLO_GRP_SZ2=") + std::to_string(1) + + std::string(" -DMLO_MAP_SZ4=") + std::to_string(static_cast(MAP_SZ4)) + std::string(" -DMLO_C1x1_PIXLEFT=") + std::to_string(static_cast(C1x1_PIXLEFT)) + std::string(" -DMLO_DIVBY4=") + std::to_string(static_cast(DIVBY4)) + - std::string(" -DMLO_IN_LOOP=") + std::to_string(static_cast(n_in_loop)) + + //std::string(" -DMLO_IN_LOOP=") + std::to_string(static_cast(n_in_loop)) + std::string(" -DMLO_N_LCL_BATCHS=") + std::to_string(static_cast(_n_stacks)) // # of diff stacks (part of batch). + std::string(" -DMLO_N_LCL_OUT_MAPS=") + From 5764b68015cba722177d7b44792bf88c14f9ab75 Mon Sep 17 00:00:00 2001 From: "AMD\\alyashev" Date: Wed, 19 Jul 2017 22:08:22 -0700 Subject: [PATCH 02/11] pool driver fix --- driver/pool_driver.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/pool_driver.hpp b/driver/pool_driver.hpp index c4daca6f87..7ce341f71b 100644 --- a/driver/pool_driver.hpp +++ b/driver/pool_driver.hpp @@ -445,7 +445,7 @@ int PoolDriver::VerifyForward() do_backward, maskhost.data(), mask.data(), - (10e-6)); + 1); printf(match ? "Forward Pooling Verifies on CPU and GPU\n" : "Forward Pooling Verification Failed !!\n"); From 60211041468ac5bce5ad0f22ee500682fd5649da Mon Sep 17 00:00:00 2001 From: Alexander Lyashevsky Date: Thu, 20 Jul 2017 15:21:20 -0700 Subject: [PATCH 03/11] fixed 1x1 fwd/bwd --- src/kernels/MIOpenConv1x1.cl | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/kernels/MIOpenConv1x1.cl b/src/kernels/MIOpenConv1x1.cl index fb49c822c2..b1c757e2c5 100644 --- a/src/kernels/MIOpenConv1x1.cl +++ b/src/kernels/MIOpenConv1x1.cl @@ -223,7 +223,7 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr, for(uint ci = 0; ci < MLO_WEIGHTS_PER_LOOP; ++ci, in_off += MLO_IN_CHANNEL_STRIDE * MLO_N_LCL_IN_MAPS * MLO_N_MAPS_PERGROUP) { - c = wc + ci; + c = wc*MLO_WEIGHTS_PER_LOOP + ci; uint wei_indx = ci; // read data @@ -264,7 +264,8 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr, #if DBG_OUT_OF_RNGE if(in_off2 + i >= MLO_IN_BATCH_STRIDE * MLO_BATCH_SZ) { - printf("k:err:in-of-range\n"); + printf("k:err:in-of-range %d %d %d\n"); + } #endif } @@ -282,7 +283,8 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr, in_stage[ib][ilc][i] = v ? val : 0.0f; #endif #if DBG_OUT_OF_RNGE - if(in_off2 + i >= MLO_IN_BATCH_STRIDE * MLO_BATCH_SZ) + + if(in_off2 + i >= MLO_IN_BATCH_STRIDE * MLO_BATCH_SZ) { printf("k:err:in-of-range\n"); } @@ -316,7 +318,7 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr, out_tiles[ib][olc][i] += in_stage[ib][ilc][i] * wei_stage; #if 0 //MLO_DIR_FORWARD == 0 - if ( /*in_stage[ib][ilc][i] * wei_stage!= 0 && */out_grp_block * MLO_N_LCL_OUT_MAPS + olc == 8 && i == 0 && get_global_id(0) == 0 && get_global_id(1) == 0 && get_global_id(2) == 0) + if ( in_stage[ib][ilc][i] * wei_stage!= 0 && out_grp_block * MLO_N_LCL_OUT_MAPS + olc == 0 && i == 0 && get_global_id(0) == 0 && get_global_id(1) == 0 && get_global_id(2) == 0) { printf("K:c: %d %d %d %d %f %f %f %f\n", wc, From 996e0d9de9c09c5524b8759c4dea92e2e2601015 Mon Sep 17 00:00:00 2001 From: alyashev Date: Thu, 20 Jul 2017 15:54:24 -0700 Subject: [PATCH 04/11] fixed 1x1 fwd/bwd for large maps --- src/kernels/MIOpenConv1x1.cl | 41 ++++++++++++++++-------------------- src/mlo_dir_conv.cpp | 8 +++---- 2 files changed, 22 insertions(+), 27 deletions(-) diff --git a/src/kernels/MIOpenConv1x1.cl b/src/kernels/MIOpenConv1x1.cl index b1c757e2c5..369643bf17 100644 --- a/src/kernels/MIOpenConv1x1.cl +++ b/src/kernels/MIOpenConv1x1.cl @@ -39,7 +39,6 @@ // calculating the size of the area for weights prefetch - #if MLO_N_MAPS_PERGROUP > 1 #define MLO_WEIGHTS_PER_LOOP_MAX (8) #else @@ -53,17 +52,15 @@ #define MLO_WEIGHTS_PER_LOOP_TMP ((MLO_N_INPUTS + MLO_LCL_IN_ROW - 1) / MLO_LCL_IN_ROW) - #if(MLO_WEIGHTS_PER_LOOP_TMP < MLO_WEIGHTS_PER_LOOP_MAX) #define MLO_WEIGHTS_PER_LOOP (MLO_WEIGHTS_PER_LOOP_TMP) #else #define MLO_WEIGHTS_PER_LOOP (MLO_WEIGHTS_PER_LOOP_MAX) #endif - #define MLO_LCL_WEIGHTS_ROW (MLO_WEIGHTS_PER_LOOP * MLO_LCL_IN_ROW) -#define MLO_IN_LOOP ((MLO_N_INPUTS + MLO_LCL_WEIGHTS_ROW - 1)/MLO_LCL_WEIGHTS_ROW) +#define MLO_IN_LOOP ((MLO_N_INPUTS + MLO_LCL_WEIGHTS_ROW - 1) / MLO_LCL_WEIGHTS_ROW) #define MLO_WEIGHTS_ROW (MLO_LCL_WEIGHTS_ROW * MLO_WEI_CHANNEL_STRIDE) @@ -119,15 +116,15 @@ After completion of the main MLO_IN_LOOP loop partial sums have been summed up i */ -__attribute__((reqd_work_group_size(MLO_GRP_SZ0, MLO_GRP_SZ1, MLO_GRP_SZ2))) -__kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr, - const __global _FLOAT* __restrict wei_ptr, +__attribute__((reqd_work_group_size(MLO_GRP_SZ0, MLO_GRP_SZ1, MLO_GRP_SZ2))) __kernel void +MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr, + const __global _FLOAT* __restrict wei_ptr, #if MLO_CONV_BIAS - const __global _FLOAT* __restrict bias, + const __global _FLOAT* __restrict bias, #endif - __global _FLOAT* __restrict out_ptr, - UNUSED _FLOAT dummy_val // nothing - ) + __global _FLOAT* __restrict out_ptr, + UNUSED _FLOAT dummy_val // nothing + ) { // KERNEL // private buffers @@ -136,7 +133,6 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr, __private _FLOAT out_tiles[MLO_N_LCL_BATCHS][MLO_N_LCL_OUT_MAPS][MLO_READ_UNIT]; __local _FLOAT lcl_wei_stage[MLO_LCL_MEM_SZ]; - #if MLO_N_MAPS_PERGROUP > 1 __local _FLOAT* lcl_out_stage = lcl_wei_stage; @@ -202,34 +198,34 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr, // out of range check uint wei_off1 = wei_off + wi + - oi * + oi * #if MLO_DIR_FORWARD == 1 - MLO_WEI_BSTRIDE; + MLO_WEI_BSTRIDE; #else - MLO_WEI_CHANNEL_STRIDE; + MLO_WEI_CHANNEL_STRIDE; #endif - uint wei_off_r = (wei_off1 < MLO_N_OUTPUTS * MLO_N_INPUTS) ? wei_off1 : 0; + uint wei_off_r = (wei_off1 < MLO_N_OUTPUTS * MLO_N_INPUTS) ? wei_off1 : 0; _FLOAT wei_val = wei_ptr[wei_off_r]; wei_val = (wei_off1 < MLO_N_OUTPUTS * MLO_N_INPUTS) ? wei_val : 0; lcl_wei_stage[w] = wei_val; } barrier(CLK_LOCAL_MEM_FENCE); - uint c = 0; + uint c = 0; #if MLO_WEIGHTS_PER_LOOP > 7 #pragma unroll(MLO_WEIGHTS_PER_LOOP / 8) #endif for(uint ci = 0; ci < MLO_WEIGHTS_PER_LOOP; ++ci, in_off += MLO_IN_CHANNEL_STRIDE * MLO_N_LCL_IN_MAPS * MLO_N_MAPS_PERGROUP) { - c = wc*MLO_WEIGHTS_PER_LOOP + ci; + c = wc * MLO_WEIGHTS_PER_LOOP + ci; uint wei_indx = ci; // read data // over all local batchs uint in_off1 = in_off; - uint in_off2 = 0; + uint in_off2 = 0; for(uint ib = 0; ib < MLO_N_LCL_BATCHS; ++ib, in_off1 += MLO_IN_BATCH_STRIDE) { in_off2 = in_off1; @@ -245,7 +241,7 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr, ilc * MLO_N_MAPS_PERGROUP < MLO_N_INPUTS; #ifndef __AMDGCN__ - in_off2 = (v) ? in_off2 : 0; + in_off2 = (v) ? in_off2 : 0; #endif __global const _FLOAT* in_p = &in_ptr[in_off2]; #if MLO_C1x1_PIXLEFT > 0 @@ -265,7 +261,6 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr, if(in_off2 + i >= MLO_IN_BATCH_STRIDE * MLO_BATCH_SZ) { printf("k:err:in-of-range %d %d %d\n"); - } #endif } @@ -284,7 +279,7 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr, #endif #if DBG_OUT_OF_RNGE - if(in_off2 + i >= MLO_IN_BATCH_STRIDE * MLO_BATCH_SZ) + if(in_off2 + i >= MLO_IN_BATCH_STRIDE * MLO_BATCH_SZ) { printf("k:err:in-of-range\n"); } @@ -317,7 +312,7 @@ __kernel void MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr, { out_tiles[ib][olc][i] += in_stage[ib][ilc][i] * wei_stage; -#if 0 //MLO_DIR_FORWARD == 0 +#if 0 // MLO_DIR_FORWARD == 0 if ( in_stage[ib][ilc][i] * wei_stage!= 0 && out_grp_block * MLO_N_LCL_OUT_MAPS + olc == 0 && i == 0 && get_global_id(0) == 0 && get_global_id(1) == 0 && get_global_id(2) == 0) { printf("K:c: %d %d %d %d %f %f %f %f\n", diff --git a/src/mlo_dir_conv.cpp b/src/mlo_dir_conv.cpp index 5e04007b41..85f78dbf86 100644 --- a/src/mlo_dir_conv.cpp +++ b/src/mlo_dir_conv.cpp @@ -1395,7 +1395,7 @@ int mlo_construct_direct2D::mloConstructDirect2D1x1() // n of input map per group N_MAPS_PERGROUP = std::min(N_MAPS_PERGROUP, n_input_scaled); // number of input loops - // int n_in_loop = (n_input_scaled + N_MAPS_PERGROUP - 1) / N_MAPS_PERGROUP; + // int n_in_loop = (n_input_scaled + N_MAPS_PERGROUP - 1) / N_MAPS_PERGROUP; // number of batches inside wk_item _n_stacks = std::min(_batch_sz, _n_stacks); @@ -1434,13 +1434,13 @@ int mlo_construct_direct2D::mloConstructDirect2D1x1() std::to_string(static_cast(wei_cstride)) // algorithm parameters + std::string(" -DMLO_GRP_SZ0=") + std::to_string(static_cast(GRP_SZ)) + - std::string(" -DMLO_GRP_SZ1=") + std::to_string(1) + - std::string(" -DMLO_GRP_SZ2=") + std::to_string(1) + + std::string(" -DMLO_GRP_SZ1=") + std::to_string(1) + std::string(" -DMLO_GRP_SZ2=") + + std::to_string(1) + std::string(" -DMLO_MAP_SZ4=") + std::to_string(static_cast(MAP_SZ4)) + std::string(" -DMLO_C1x1_PIXLEFT=") + std::to_string(static_cast(C1x1_PIXLEFT)) + std::string(" -DMLO_DIVBY4=") + std::to_string(static_cast(DIVBY4)) + - //std::string(" -DMLO_IN_LOOP=") + std::to_string(static_cast(n_in_loop)) + + // std::string(" -DMLO_IN_LOOP=") + std::to_string(static_cast(n_in_loop)) + std::string(" -DMLO_N_LCL_BATCHS=") + std::to_string(static_cast(_n_stacks)) // # of diff stacks (part of batch). + std::string(" -DMLO_N_LCL_OUT_MAPS=") + From 97a19d198baaa5501abd5f14320068e176537fd8 Mon Sep 17 00:00:00 2001 From: Alexander Lyashevsky Date: Thu, 20 Jul 2017 16:31:16 -0700 Subject: [PATCH 05/11] minor fix --- src/kernels/MIOpenConv1x1.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/MIOpenConv1x1.cl b/src/kernels/MIOpenConv1x1.cl index 369643bf17..06146dea1d 100644 --- a/src/kernels/MIOpenConv1x1.cl +++ b/src/kernels/MIOpenConv1x1.cl @@ -260,7 +260,7 @@ MIOpenConv1x1(const __global _FLOAT* __restrict in_ptr, #if DBG_OUT_OF_RNGE if(in_off2 + i >= MLO_IN_BATCH_STRIDE * MLO_BATCH_SZ) { - printf("k:err:in-of-range %d %d %d\n"); + printf("k:err:in-of-range\n"); } #endif } From 93382d35217402aa8507a7d1799ec160f4fdb018 Mon Sep 17 00:00:00 2001 From: Alexander Lyashevsky Date: Mon, 24 Jul 2017 13:13:00 -0700 Subject: [PATCH 06/11] fix MIOpenDriver conv_driver.hpp --- driver/conv_driver.hpp | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/driver/conv_driver.hpp b/driver/conv_driver.hpp index bfc6cab7f6..4bde4aa173 100644 --- a/driver/conv_driver.hpp +++ b/driver/conv_driver.hpp @@ -92,6 +92,9 @@ class ConvDriver : public Driver miopenCreateTensorDescriptor(&biasTensor); miopenCreateConvolutionDescriptor(&convDesc); + + workspace_bwd_dev = 0; + workspace_fwd_dev = 0; } int AddCmdLineArgs(); @@ -373,10 +376,20 @@ int ConvDriver::AllocateBuffersAndCopy() dwei_dev = std::unique_ptr(new GPUMem(ctx, wei_sz, sizeof(float))); dout_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(float))); out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(float))); - workspace_bwd_dev = - std::unique_ptr(new GPUMem(ctx, workSpaceSize_bwd / sizeof(T), sizeof(T))); - workspace_fwd_dev = - std::unique_ptr(new GPUMem(ctx, workSpaceSize_fwd / sizeof(T), sizeof(T))); + if (workSpaceSize_bwd != 0) + { + workspace_bwd_dev = + std::unique_ptr(new GPUMem(ctx, workSpaceSize_bwd / sizeof(T), sizeof(T))); + workspace_bwd = std::vector(workSpaceSize_bwd / sizeof(T), 0); + workspace_bwd_host = std::vector(workSpaceSize_bwd / sizeof(T), 0); + } + if (workSpaceSize_fwd != 0) + { + workspace_fwd_dev = + std::unique_ptr(new GPUMem(ctx, workSpaceSize_fwd / sizeof(T), sizeof(T))); + workspace_fwd = std::vector(workSpaceSize_fwd / sizeof(T), 0); + workspace_fwd_host = std::vector(workSpaceSize_fwd / sizeof(T), 0); + } in = std::vector(in_sz); din = std::vector(in_sz); @@ -384,11 +397,10 @@ int ConvDriver::AllocateBuffersAndCopy() dwei = std::vector(wei_sz, 0); dout = std::vector(out_sz, 0); out = std::vector(out_sz, 0); - workspace_bwd = std::vector(workSpaceSize_bwd / sizeof(T), 0); - workspace_fwd = std::vector(workSpaceSize_fwd / sizeof(T), 0); + outhost = std::vector(out_sz, 0); - workspace_bwd_host = std::vector(workSpaceSize_bwd / sizeof(T), 0); - workspace_fwd_host = std::vector(workSpaceSize_fwd / sizeof(T), 0); + + dwei_host = std::vector(wei_sz, 0); din_host = std::vector(in_sz, 0); @@ -498,8 +510,8 @@ int ConvDriver::FindForward(int& ret_algo_count, request_algo_count, &ret_algo_count, perf_results.data(), - workspace_fwd_dev->GetMem(), - workspace_fwd_dev->GetSize(), + (workspace_fwd_dev != 0) ? workspace_fwd_dev->GetMem() : nullptr, + (workspace_fwd_dev != 0) ? workspace_fwd_dev->GetSize() : 0, (inflags.GetValueInt("search") == 1) ? true : false); } @@ -536,8 +548,9 @@ int ConvDriver::RunForwardGPU() &beta, outputTensor, out_dev->GetMem(), - workspace_fwd_dev->GetMem(), - workspace_fwd_dev->GetSize()); + (workspace_fwd_dev != 0) ? workspace_fwd_dev->GetMem() : nullptr, + (workspace_fwd_dev != 0) ? workspace_fwd_dev->GetSize() : 0 + ); } if(inflags.GetValueInt("time") == 1) From e040483322b939e9b3ff55f51caab2a4fe5d148b Mon Sep 17 00:00:00 2001 From: alyashev Date: Mon, 24 Jul 2017 13:34:38 -0700 Subject: [PATCH 07/11] formatting --- driver/conv_driver.hpp | 92 +++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 47 deletions(-) diff --git a/driver/conv_driver.hpp b/driver/conv_driver.hpp index 4bde4aa173..8ac750b14e 100644 --- a/driver/conv_driver.hpp +++ b/driver/conv_driver.hpp @@ -93,8 +93,8 @@ class ConvDriver : public Driver miopenCreateConvolutionDescriptor(&convDesc); - workspace_bwd_dev = 0; - workspace_fwd_dev = 0; + workspace_bwd_dev = 0; + workspace_fwd_dev = 0; } int AddCmdLineArgs(); @@ -376,33 +376,32 @@ int ConvDriver::AllocateBuffersAndCopy() dwei_dev = std::unique_ptr(new GPUMem(ctx, wei_sz, sizeof(float))); dout_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(float))); out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(float))); - if (workSpaceSize_bwd != 0) - { - workspace_bwd_dev = - std::unique_ptr(new GPUMem(ctx, workSpaceSize_bwd / sizeof(T), sizeof(T))); - workspace_bwd = std::vector(workSpaceSize_bwd / sizeof(T), 0); - workspace_bwd_host = std::vector(workSpaceSize_bwd / sizeof(T), 0); - } - if (workSpaceSize_fwd != 0) - { - workspace_fwd_dev = - std::unique_ptr(new GPUMem(ctx, workSpaceSize_fwd / sizeof(T), sizeof(T))); - workspace_fwd = std::vector(workSpaceSize_fwd / sizeof(T), 0); - workspace_fwd_host = std::vector(workSpaceSize_fwd / sizeof(T), 0); - } - - in = std::vector(in_sz); - din = std::vector(in_sz); - wei = std::vector(wei_sz); - dwei = std::vector(wei_sz, 0); - dout = std::vector(out_sz, 0); - out = std::vector(out_sz, 0); - - outhost = std::vector(out_sz, 0); - - - dwei_host = std::vector(wei_sz, 0); - din_host = std::vector(in_sz, 0); + if(workSpaceSize_bwd != 0) + { + workspace_bwd_dev = + std::unique_ptr(new GPUMem(ctx, workSpaceSize_bwd / sizeof(T), sizeof(T))); + workspace_bwd = std::vector(workSpaceSize_bwd / sizeof(T), 0); + workspace_bwd_host = std::vector(workSpaceSize_bwd / sizeof(T), 0); + } + if(workSpaceSize_fwd != 0) + { + workspace_fwd_dev = + std::unique_ptr(new GPUMem(ctx, workSpaceSize_fwd / sizeof(T), sizeof(T))); + workspace_fwd = std::vector(workSpaceSize_fwd / sizeof(T), 0); + workspace_fwd_host = std::vector(workSpaceSize_fwd / sizeof(T), 0); + } + + in = std::vector(in_sz); + din = std::vector(in_sz); + wei = std::vector(wei_sz); + dwei = std::vector(wei_sz, 0); + dout = std::vector(out_sz, 0); + out = std::vector(out_sz, 0); + + outhost = std::vector(out_sz, 0); + + dwei_host = std::vector(wei_sz, 0); + din_host = std::vector(in_sz, 0); std::string inFileName = inflags.GetValueStr("in_data"); std::string weiFileName = inflags.GetValueStr("weights"); @@ -499,21 +498,21 @@ int ConvDriver::FindForward(int& ret_algo_count, std::vector& perf_results) { - return miopenFindConvolutionForwardAlgorithm(GetHandle(), - inputTensor, - in_dev->GetMem(), - weightTensor, - wei_dev->GetMem(), - convDesc, - outputTensor, - out_dev->GetMem(), - request_algo_count, - &ret_algo_count, - perf_results.data(), - (workspace_fwd_dev != 0) ? workspace_fwd_dev->GetMem() : nullptr, - (workspace_fwd_dev != 0) ? workspace_fwd_dev->GetSize() : 0, - (inflags.GetValueInt("search") == 1) ? true - : false); + return miopenFindConvolutionForwardAlgorithm( + GetHandle(), + inputTensor, + in_dev->GetMem(), + weightTensor, + wei_dev->GetMem(), + convDesc, + outputTensor, + out_dev->GetMem(), + request_algo_count, + &ret_algo_count, + perf_results.data(), + (workspace_fwd_dev != 0) ? workspace_fwd_dev->GetMem() : nullptr, + (workspace_fwd_dev != 0) ? workspace_fwd_dev->GetSize() : 0, + (inflags.GetValueInt("search") == 1) ? true : false); } template @@ -548,9 +547,8 @@ int ConvDriver::RunForwardGPU() &beta, outputTensor, out_dev->GetMem(), - (workspace_fwd_dev != 0) ? workspace_fwd_dev->GetMem() : nullptr, - (workspace_fwd_dev != 0) ? workspace_fwd_dev->GetSize() : 0 - ); + (workspace_fwd_dev != 0) ? workspace_fwd_dev->GetMem() : nullptr, + (workspace_fwd_dev != 0) ? workspace_fwd_dev->GetSize() : 0); } if(inflags.GetValueInt("time") == 1) From aab32ccd739c2fcfb95eaad7eaec0b54fd5e5491 Mon Sep 17 00:00:00 2001 From: Mayank Daga Date: Tue, 25 Jul 2017 12:39:31 -0500 Subject: [PATCH 08/11] disable a config WrW asm --- src/mlo_dir_conv.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/mlo_dir_conv.cpp b/src/mlo_dir_conv.cpp index 85f78dbf86..3b058826f9 100644 --- a/src/mlo_dir_conv.cpp +++ b/src/mlo_dir_conv.cpp @@ -3383,8 +3383,9 @@ bool mlo_construct_BwdWrW2D::mloIsFastAsmDirect3x3WrW() const // They work fine on gfx8 // /todo fix memory faults on gfx9 const std::string name = _stream->GetDeviceName(); - return !(name == "gfx900" && (_in_width == 13 || _in_width == 27 || _in_width == 54 || - _in_width == 57 || _in_width == 17 || _in_width == 250)); + return !(name == "gfx900" && + (_in_width == 13 || _in_width == 27 || _in_width == 54 || _in_width == 57 || + _in_width == 17 || _in_width == 250 || _in_width == 175)); } int mlo_construct_BwdWrW2D::mloConstructAsmDirect3x3WrW() From d9c54c70704788bd65417e3d8224082c0a4d8562 Mon Sep 17 00:00:00 2001 From: Mayank Daga Date: Tue, 25 Jul 2017 22:45:23 -0500 Subject: [PATCH 09/11] more tidy fixes --- driver/conv_driver.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/conv_driver.hpp b/driver/conv_driver.hpp index 8ac750b14e..9719f90694 100644 --- a/driver/conv_driver.hpp +++ b/driver/conv_driver.hpp @@ -510,8 +510,8 @@ int ConvDriver::FindForward(int& ret_algo_count, request_algo_count, &ret_algo_count, perf_results.data(), - (workspace_fwd_dev != 0) ? workspace_fwd_dev->GetMem() : nullptr, - (workspace_fwd_dev != 0) ? workspace_fwd_dev->GetSize() : 0, + (workspace_fwd_dev != nullptr) ? workspace_fwd_dev->GetMem() : nullptr, + (workspace_fwd_dev != nullptr) ? workspace_fwd_dev->GetSize() : 0, (inflags.GetValueInt("search") == 1) ? true : false); } From 0b862047cbdf50e375884c970f3b2e62da638463 Mon Sep 17 00:00:00 2001 From: Mayank Daga Date: Tue, 25 Jul 2017 22:11:26 -0500 Subject: [PATCH 10/11] fixed tidy warnings --- driver/conv_driver.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/driver/conv_driver.hpp b/driver/conv_driver.hpp index 9719f90694..808982595b 100644 --- a/driver/conv_driver.hpp +++ b/driver/conv_driver.hpp @@ -93,8 +93,8 @@ class ConvDriver : public Driver miopenCreateConvolutionDescriptor(&convDesc); - workspace_bwd_dev = 0; - workspace_fwd_dev = 0; + workspace_bwd_dev = nullptr; + workspace_fwd_dev = nullptr; } int AddCmdLineArgs(); @@ -547,8 +547,9 @@ int ConvDriver::RunForwardGPU() &beta, outputTensor, out_dev->GetMem(), - (workspace_fwd_dev != 0) ? workspace_fwd_dev->GetMem() : nullptr, - (workspace_fwd_dev != 0) ? workspace_fwd_dev->GetSize() : 0); + (workspace_fwd_dev != nullptr) ? workspace_fwd_dev->GetMem() + : nullptr, + (workspace_fwd_dev != nullptr) ? workspace_fwd_dev->GetSize() : 0); } if(inflags.GetValueInt("time") == 1) From f1ed0c831cf653cdf1a9fcd759829fbcb211c38c Mon Sep 17 00:00:00 2001 From: Mayank Daga Date: Tue, 25 Jul 2017 22:23:27 -0500 Subject: [PATCH 11/11] disabled 1x1 winograd for hip --- src/mlo_dir_conv.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mlo_dir_conv.cpp b/src/mlo_dir_conv.cpp index 3b058826f9..0aee0d74f8 100644 --- a/src/mlo_dir_conv.cpp +++ b/src/mlo_dir_conv.cpp @@ -239,12 +239,14 @@ int mlo_construct_winograd::mloConstruct() { return (mloConstructBinaryWinograd3x3U(rmv)); } +#if MIOPEN_BACKEND_OPENCL if(mloIsCorrectBinaryWinogradRxSFwd(rmv) && !miopen::IsDisabled(MIOPEN_DEBUG_AMD_WINOGRAD_RXS{}) && (no_perf_filtering || mloIsFastBinaryWinogradRxSFwd())) { return (mloConstructBinaryWinogradRxSFwd()); } +#endif } } #endif