From 63b95338cf1661de9bbe3658731097f5bf488359 Mon Sep 17 00:00:00 2001 From: tk-yoshimura Date: Sun, 18 Sep 2022 23:06:25 +0900 Subject: [PATCH] fix error-free arithmetic --- README.md | 2 +- TensorShaderCudaBackend/ShaderDefines.cs | 14 +++++++------- .../FloatFloatPrecision/Convolution1D.cs | 2 +- .../FloatFloatPrecision/Convolution2D.cs | 2 +- .../FloatFloatPrecision/Convolution3D.cs | 2 +- .../FloatFloatPrecision/Deconvolution1D.cs | 2 +- .../FloatFloatPrecision/Deconvolution2D.cs | 2 +- .../FloatFloatPrecision/Deconvolution3D.cs | 2 +- .../Convolution/FloatFloatPrecision/Dense.cs | 2 +- .../FloatFloatPrecision/TransposeDense.cs | 2 +- .../ChannelwiseConvolution1D.cs | 2 +- .../ChannelwiseConvolution2D.cs | 2 +- .../ChannelwiseConvolution3D.cs | 2 +- .../ChannelwiseDeconvolution1D.cs | 2 +- .../ChannelwiseDeconvolution2D.cs | 2 +- .../ChannelwiseDeconvolution3D.cs | 2 +- .../FloatFloatPrecision/Convolution1D.cs | 2 +- .../FloatFloatPrecision/Convolution2D.cs | 2 +- .../FloatFloatPrecision/Convolution3D.cs | 2 +- .../FloatFloatPrecision/Deconvolution1D.cs | 2 +- .../FloatFloatPrecision/Deconvolution2D.cs | 2 +- .../FloatFloatPrecision/Deconvolution3D.cs | 2 +- .../Convolution/FloatFloatPrecision/Dense.cs | 2 +- .../FloatFloatPrecision/PointwiseConvolution.cs | 2 +- .../FloatFloatPrecision/PointwiseDeconvolution.cs | 2 +- .../FloatFloatPrecision/TransposeDense.cs | 2 +- .../FloatFloatPrecision/Convolution1D.cs | 2 +- .../FloatFloatPrecision/Convolution2D.cs | 2 +- .../FloatFloatPrecision/Convolution3D.cs | 2 +- .../FloatFloatPrecision/Deconvolution1D.cs | 2 +- .../FloatFloatPrecision/Deconvolution2D.cs | 2 +- .../FloatFloatPrecision/Deconvolution3D.cs | 2 +- .../Convolution/FloatFloatPrecision/Dense.cs | 2 +- .../FloatFloatPrecision/TransposeDense.cs | 2 +- .../Shaders/Transform/ColumnToImage1D.cs | 2 +- .../Shaders/Transform/ColumnToImage2D.cs | 2 +- .../Shaders/Transform/ColumnToImage3D.cs | 2 +- .../FloatFloatPrecision/Convolution1D.cs | 2 +- .../FloatFloatPrecision/Convolution2D.cs | 2 +- .../FloatFloatPrecision/Convolution3D.cs | 2 +- .../FloatFloatPrecision/Deconvolution1D.cs | 2 +- .../FloatFloatPrecision/Deconvolution2D.cs | 2 +- .../FloatFloatPrecision/Deconvolution3D.cs | 2 +- .../Convolution/FloatFloatPrecision/Dense.cs | 2 +- .../FloatFloatPrecision/TransposeDense.cs | 2 +- TensorShaderSample/MNIST/MnistDownloader.cs | 4 ++-- 46 files changed, 53 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index 2de45171..cf3a744d 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ **Supports High-Dimensional Convolution Neural Networks.** (Complex, Quaternion, Vector3D) -For regression problems: **High precision calculate** by FP32-FP32 arithmetic (1/8 error of FP32 arithmetic) +For regression problems: **High precision calculate** by FP32-FP32 arithmetic ## Requirement .NET 6.0 diff --git a/TensorShaderCudaBackend/ShaderDefines.cs b/TensorShaderCudaBackend/ShaderDefines.cs index 1c1c0a1e..c7766b02 100644 --- a/TensorShaderCudaBackend/ShaderDefines.cs +++ b/TensorShaderCudaBackend/ShaderDefines.cs @@ -402,7 +402,7 @@ static __inline__ __device__ void floatfloat_hilo_add(float &hi, float &lo, floa public static string AtomicAdd => $@" static __inline__ __device__ void floatfloat_atomicadd(float *ptr, float hi, float lo){{ - float tmp = atomicAdd(ptr, hi + lo); + float tmp = atomicAdd(ptr, hi); atomicAdd(ptr + 1, lo - (((tmp + hi) - tmp) - hi)); }}"; @@ -465,9 +465,9 @@ static __inline__ __device__ void complex_mulgrad(float2 &hi, float2 &lo, float2 static __inline__ __device__ void floatfloat_atomicadd(float2 *ptr, float2 hi, float2 lo){{ float *ptr_float = (float*)ptr; - float tmpx = atomicAdd(ptr_float, hi.x + lo.x); + float tmpx = atomicAdd(ptr_float, hi.x); atomicAdd(ptr_float + 1, lo.x - (((tmpx + hi.x) - tmpx) - hi.x)); - float tmpy = atomicAdd(ptr_float + 2, hi.y + lo.y); + float tmpy = atomicAdd(ptr_float + 2, hi.y); atomicAdd(ptr_float + 3, lo.y - (((tmpy + hi.y) - tmpy) - hi.y)); }}"; } @@ -585,13 +585,13 @@ static __inline__ __device__ void quaternion_mulgrad(float4 &hi, float4 &lo, flo static __inline__ __device__ void floatfloat_atomicadd(float4 *ptr, float4 hi, float4 lo){{ float *ptr_float = (float*)ptr; - float tmpx = atomicAdd(ptr_float, hi.x + lo.x); + float tmpx = atomicAdd(ptr_float, hi.x); atomicAdd(ptr_float + 1, lo.x - (((tmpx + hi.x) - tmpx) - hi.x)); - float tmpy = atomicAdd(ptr_float + 2, hi.y + lo.y); + float tmpy = atomicAdd(ptr_float + 2, hi.y); atomicAdd(ptr_float + 3, lo.y - (((tmpy + hi.y) - tmpy) - hi.y)); - float tmpz = atomicAdd(ptr_float + 4, hi.z + lo.z); + float tmpz = atomicAdd(ptr_float + 4, hi.z); atomicAdd(ptr_float + 5, lo.z - (((tmpz + hi.z) - tmpz) - hi.z)); - float tmpw = atomicAdd(ptr_float + 6, hi.w + lo.w); + float tmpw = atomicAdd(ptr_float + 6, hi.w); atomicAdd(ptr_float + 7, lo.w - (((tmpw + hi.w) - tmpw) - hi.w)); }}"; } diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution1D.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution1D.cs index a36180d9..5c1a6352 100644 --- a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution1D.cs +++ b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution1D.cs @@ -90,7 +90,7 @@ __global__ void complex_convolution_1d(const float2* __restrict__ inmap, float2* { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * ox; - outmap[outmap_idx] = ctor_float2(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y); + outmap[outmap_idx] = ctor_float2(uv_hi.x, uv_hi.y); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution2D.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution2D.cs index 0ce5d160..41fd8704 100644 --- a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution2D.cs +++ b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution2D.cs @@ -103,7 +103,7 @@ __global__ void complex_convolution_2d(const float2* __restrict__ inmap, float2* { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy); - outmap[outmap_idx] = ctor_float2(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y); + outmap[outmap_idx] = ctor_float2(uv_hi.x, uv_hi.y); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution3D.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution3D.cs index a7cfbd6b..1b63f06b 100644 --- a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution3D.cs +++ b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution3D.cs @@ -111,7 +111,7 @@ __global__ void complex_convolution_3d(const float2* __restrict__ inmap, float2* { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz)); - outmap[outmap_idx] = ctor_float2(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y); + outmap[outmap_idx] = ctor_float2(uv_hi.x, uv_hi.y); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution1D.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution1D.cs index 32df67cb..365f2fd9 100644 --- a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution1D.cs +++ b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution1D.cs @@ -90,7 +90,7 @@ __global__ void complex_deconvolution_1d(const float2* __restrict__ inmap, float { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * ox; - outmap[outmap_idx] = ctor_float2(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y); + outmap[outmap_idx] = ctor_float2(uv_hi.x, uv_hi.y); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution2D.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution2D.cs index bb66b172..3f624333 100644 --- a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution2D.cs +++ b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution2D.cs @@ -106,7 +106,7 @@ __global__ void complex_deconvolution_2d(const float2* __restrict__ inmap, float { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy); - outmap[outmap_idx] = ctor_float2(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y); + outmap[outmap_idx] = ctor_float2(uv_hi.x, uv_hi.y); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution3D.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution3D.cs index a3c0cbe7..d821a34e 100644 --- a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution3D.cs +++ b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution3D.cs @@ -118,7 +118,7 @@ __global__ void complex_deconvolution_3d(const float2* __restrict__ inmap, float { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz)); - outmap[outmap_idx] = ctor_float2(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y); + outmap[outmap_idx] = ctor_float2(uv_hi.x, uv_hi.y); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Dense.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Dense.cs index 18e5f833..9386b773 100644 --- a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Dense.cs +++ b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Dense.cs @@ -76,7 +76,7 @@ __global__ void complex_dense(const float2* __restrict__ inmap, float2* __restri filter_idx += {OutChannels}; }} - outmap[outch] = ctor_float2(vu_hi.x + vu_lo.x, vu_hi.y + vu_lo.y); + outmap[outch] = ctor_float2(vu_hi.x, vu_hi.y); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/TransposeDense.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/TransposeDense.cs index c33cd19b..0f9bbb4f 100644 --- a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/TransposeDense.cs +++ b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/TransposeDense.cs @@ -74,7 +74,7 @@ __global__ void complex_transpose_dense(const float2* __restrict__ inmap, float2 filter_idx += {OutChannels}; }} - outmap[outch] = ctor_float2(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y); + outmap[outch] = ctor_float2(uv_hi.x, uv_hi.y); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution1D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution1D.cs index 44ae15c3..cf754252 100644 --- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution1D.cs +++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution1D.cs @@ -59,7 +59,7 @@ __global__ void chwise_convolution_1d(const float* __restrict__ inmap, float* __ unsigned int outmap_idx = ch + {Channels} * ox; - outmap[outmap_idx] = uv_hi + uv_lo; + outmap[outmap_idx] = uv_hi; }}"; this.Kernel = new Kernel(code, "chwise_convolution_1d"); diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution2D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution2D.cs index 33cbf6cf..a720203b 100644 --- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution2D.cs +++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution2D.cs @@ -72,7 +72,7 @@ __global__ void chwise_convolution_2d(const float* __restrict__ inmap, float* __ unsigned int outmap_idx = ch + {Channels} * (ox + outwidth * oy); - outmap[outmap_idx] = uv_hi + uv_lo; + outmap[outmap_idx] = uv_hi; }}"; this.Kernel = new Kernel(code, "chwise_convolution_2d"); diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution3D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution3D.cs index f55ebe98..e06eb3e6 100644 --- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution3D.cs +++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution3D.cs @@ -79,7 +79,7 @@ __global__ void chwise_convolution_3d(const float* __restrict__ inmap, float* __ unsigned int outmap_idx = ch + {Channels} * (ox + outwidth * (oy + outheight * oz)); - outmap[outmap_idx] = uv_hi + uv_lo; + outmap[outmap_idx] = uv_hi; }}"; this.Kernel = new Kernel(code, "chwise_convolution_3d"); diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution1D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution1D.cs index 09c015f1..5ce6b61f 100644 --- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution1D.cs +++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution1D.cs @@ -61,7 +61,7 @@ __global__ void chwise_deconvolution_1d(const float* __restrict__ inmap, float* unsigned int outmap_idx = ch + {Channels} * ox; - outmap[outmap_idx] = uv_hi + uv_lo; + outmap[outmap_idx] = uv_hi; }}"; this.Kernel = new Kernel(code, "chwise_deconvolution_1d"); diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution2D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution2D.cs index f0853720..e2a6b6c2 100644 --- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution2D.cs +++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution2D.cs @@ -77,7 +77,7 @@ __global__ void chwise_deconvolution_2d(const float* __restrict__ inmap, float* unsigned int outmap_idx = ch + {Channels} * (ox + outwidth * oy); - outmap[outmap_idx] = uv_hi + uv_lo; + outmap[outmap_idx] = uv_hi; }}"; this.Kernel = new Kernel(code, "chwise_deconvolution_2d"); diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution3D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution3D.cs index 03e6ac99..45e05b85 100644 --- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution3D.cs +++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution3D.cs @@ -88,7 +88,7 @@ __global__ void chwise_deconvolution_3d(const float* __restrict__ inmap, float* unsigned int outmap_idx = ch + {Channels} * (ox + outwidth * (oy + outheight * oz)); - outmap[outmap_idx] = uv_hi + uv_lo; + outmap[outmap_idx] = uv_hi; }}"; this.Kernel = new Kernel(code, "chwise_deconvolution_3d"); diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution1D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution1D.cs index e801701f..d322b93d 100644 --- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution1D.cs +++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution1D.cs @@ -81,7 +81,7 @@ __global__ void convolution_1d(const float* __restrict__ inmap, float* __restric { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * ox; - outmap[outmap_idx] = uv_hi + uv_lo; + outmap[outmap_idx] = uv_hi; { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution2D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution2D.cs index 52627fa6..8ca3460b 100644 --- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution2D.cs +++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution2D.cs @@ -94,7 +94,7 @@ __global__ void convolution_2d(const float* __restrict__ inmap, float* __restric { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy); - outmap[outmap_idx] = uv_hi + uv_lo; + outmap[outmap_idx] = uv_hi; { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution3D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution3D.cs index db229a9d..001cc9b3 100644 --- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution3D.cs +++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution3D.cs @@ -101,7 +101,7 @@ __global__ void convolution_3d(const float* __restrict__ inmap, float* __restric { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz)); - outmap[outmap_idx] = uv_hi + uv_lo; + outmap[outmap_idx] = uv_hi; { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution1D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution1D.cs index 7c1f80f8..d610587a 100644 --- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution1D.cs +++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution1D.cs @@ -81,7 +81,7 @@ __global__ void deconvolution_1d(const float* __restrict__ inmap, float* __restr { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * ox; - outmap[outmap_idx] = uv_hi + uv_lo; + outmap[outmap_idx] = uv_hi; { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution2D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution2D.cs index fc62484b..309de596 100644 --- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution2D.cs +++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution2D.cs @@ -97,7 +97,7 @@ __global__ void deconvolution_2d(const float* __restrict__ inmap, float* __restr { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy); - outmap[outmap_idx] = uv_hi + uv_lo; + outmap[outmap_idx] = uv_hi; { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution3D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution3D.cs index beb6bb56..b96c65bb 100644 --- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution3D.cs +++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution3D.cs @@ -108,7 +108,7 @@ __global__ void deconvolution_3d(const float* __restrict__ inmap, float* __restr { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz)); - outmap[outmap_idx] = uv_hi + uv_lo; + outmap[outmap_idx] = uv_hi; { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Dense.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Dense.cs index 0c35cddb..636b9781 100644 --- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Dense.cs +++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Dense.cs @@ -70,7 +70,7 @@ __global__ void dense(const float* __restrict__ inmap, float* __restrict__ outma filter_idx += {OutChannels}; }} - outmap[outch] = uv_hi + uv_lo; + outmap[outch] = uv_hi; { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/PointwiseConvolution.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/PointwiseConvolution.cs index b4ebd5c8..db861d70 100644 --- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/PointwiseConvolution.cs +++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/PointwiseConvolution.cs @@ -70,7 +70,7 @@ __global__ void ptwise_convolution(const float* __restrict__ inmap, float* __res unsigned int outmap_idx = outch + {OutChannels} * i; - outmap[outmap_idx] = uv_hi + uv_lo; + outmap[outmap_idx] = uv_hi; { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/PointwiseDeconvolution.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/PointwiseDeconvolution.cs index c1d219ce..9e2f2bfc 100644 --- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/PointwiseDeconvolution.cs +++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/PointwiseDeconvolution.cs @@ -68,7 +68,7 @@ __global__ void ptwise_deconvolution(const float* __restrict__ inmap, float* __r unsigned int outmap_idx = outch + {OutChannels} * i; - outmap[outmap_idx] = uv_hi + uv_lo; + outmap[outmap_idx] = uv_hi; { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/TransposeDense.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/TransposeDense.cs index 6fbd5e77..0975ecf1 100644 --- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/TransposeDense.cs +++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/TransposeDense.cs @@ -65,7 +65,7 @@ __global__ void transpose_dense(const float* __restrict__ inmap, float* __restri filter_idx += {OutChannels}; }} - outmap[outch] = uv_hi + uv_lo; + outmap[outch] = uv_hi; { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution1D.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution1D.cs index af3db82a..2c1ac2e6 100644 --- a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution1D.cs +++ b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution1D.cs @@ -90,7 +90,7 @@ __global__ void quaternion_convolution_1d(const float4* __restrict__ inmap, floa { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * ox; - outmap[outmap_idx] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w); + outmap[outmap_idx] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution2D.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution2D.cs index df4016c5..1c0ff920 100644 --- a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution2D.cs +++ b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution2D.cs @@ -103,7 +103,7 @@ __global__ void quaternion_convolution_2d(const float4* __restrict__ inmap, floa { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy); - outmap[outmap_idx] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w); + outmap[outmap_idx] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution3D.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution3D.cs index 40eef3f4..aba29543 100644 --- a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution3D.cs +++ b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution3D.cs @@ -111,7 +111,7 @@ __global__ void quaternion_convolution_3d(const float4* __restrict__ inmap, floa { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz)); - outmap[outmap_idx] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w); + outmap[outmap_idx] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution1D.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution1D.cs index f2a7472b..379ac1e0 100644 --- a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution1D.cs +++ b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution1D.cs @@ -90,7 +90,7 @@ __global__ void quaternion_deconvolution_1d(const float4* __restrict__ inmap, fl { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * ox; - outmap[outmap_idx] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w); + outmap[outmap_idx] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution2D.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution2D.cs index 366fe997..f205f3fc 100644 --- a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution2D.cs +++ b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution2D.cs @@ -106,7 +106,7 @@ __global__ void quaternion_deconvolution_2d(const float4* __restrict__ inmap, fl { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy); - outmap[outmap_idx] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w); + outmap[outmap_idx] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution3D.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution3D.cs index 24e370a4..04799ef5 100644 --- a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution3D.cs +++ b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution3D.cs @@ -118,7 +118,7 @@ __global__ void quaternion_deconvolution_3d(const float4* __restrict__ inmap, fl { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz)); - outmap[outmap_idx] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w); + outmap[outmap_idx] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Dense.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Dense.cs index cd4eef2e..af0d079b 100644 --- a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Dense.cs +++ b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Dense.cs @@ -77,7 +77,7 @@ __global__ void quaternion_dense(const float4* __restrict__ inmap, float4* __res filter_idx += {OutChannels}; }} - outmap[outch] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w); + outmap[outch] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/TransposeDense.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/TransposeDense.cs index 990e8841..f1a52f3c 100644 --- a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/TransposeDense.cs +++ b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/TransposeDense.cs @@ -75,7 +75,7 @@ __global__ void quaternion_transpose_dense(const float4* __restrict__ inmap, flo filter_idx += {OutChannels}; }} - outmap[outch] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w); + outmap[outch] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage1D.cs b/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage1D.cs index b0fe8270..7237184a 100644 --- a/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage1D.cs +++ b/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage1D.cs @@ -57,7 +57,7 @@ __global__ void column_to_image_1d(const float* __restrict__ inmap, float* __res }} unsigned int outmap_idx = ch + {Channels} * ox; - outmap[outmap_idx] = hi + lo; + outmap[outmap_idx] = hi; }}"; this.Kernel = new Kernel(code, "column_to_image_1d"); diff --git a/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage2D.cs b/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage2D.cs index 7eb9a103..ff6268d7 100644 --- a/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage2D.cs +++ b/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage2D.cs @@ -69,7 +69,7 @@ __global__ void column_to_image_2d(const float* __restrict__ inmap, float* __res }} unsigned int outmap_idx = ch + {Channels} * (ox + outwidth * oy); - outmap[outmap_idx] = hi + lo; + outmap[outmap_idx] = hi; }}"; this.Kernel = new Kernel(code, "column_to_image_2d"); diff --git a/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage3D.cs b/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage3D.cs index 9ea7a51c..4c946062 100644 --- a/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage3D.cs +++ b/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage3D.cs @@ -81,7 +81,7 @@ __global__ void column_to_image_3d(const float* __restrict__ inmap, float* __res }} unsigned int outmap_idx = ch + {Channels} * (ox + outwidth * (oy + outheight * oz)); - outmap[outmap_idx] = hi + lo; + outmap[outmap_idx] = hi; }}"; this.Kernel = new Kernel(code, "column_to_image_3d"); diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution1D.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution1D.cs index f0134446..5e4fa2dc 100644 --- a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution1D.cs +++ b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution1D.cs @@ -89,7 +89,7 @@ __global__ void trivector_convolution_1d(const float3* __restrict__ inmap, float { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * ox; - outmap[outmap_idx] = ctor_float3(vq_hi.x + vq_lo.x, vq_hi.y + vq_lo.y, vq_hi.z + vq_lo.z); + outmap[outmap_idx] = ctor_float3(vq_hi.x, vq_hi.y, vq_hi.z); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution2D.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution2D.cs index d8cbcad4..d29da203 100644 --- a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution2D.cs +++ b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution2D.cs @@ -102,7 +102,7 @@ __global__ void trivector_convolution_2d(const float3* __restrict__ inmap, float { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy); - outmap[outmap_idx] = ctor_float3(vq_hi.x + vq_lo.x, vq_hi.y + vq_lo.y, vq_hi.z + vq_lo.z); + outmap[outmap_idx] = ctor_float3(vq_hi.x, vq_hi.y, vq_hi.z); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution3D.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution3D.cs index 613d74e5..e4a66e5a 100644 --- a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution3D.cs +++ b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution3D.cs @@ -110,7 +110,7 @@ __global__ void trivector_convolution_3d(const float3* __restrict__ inmap, float { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz)); - outmap[outmap_idx] = ctor_float3(vq_hi.x + vq_lo.x, vq_hi.y + vq_lo.y, vq_hi.z + vq_lo.z); + outmap[outmap_idx] = ctor_float3(vq_hi.x, vq_hi.y, vq_hi.z); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution1D.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution1D.cs index beea3f03..59870c93 100644 --- a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution1D.cs +++ b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution1D.cs @@ -89,7 +89,7 @@ __global__ void trivector_deconvolution_1d(const float3* __restrict__ inmap, flo { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * ox; - outmap[outmap_idx] = ctor_float3(vq_hi.x + vq_lo.x, vq_hi.y + vq_lo.y, vq_hi.z + vq_lo.z); + outmap[outmap_idx] = ctor_float3(vq_hi.x, vq_hi.y, vq_hi.z); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution2D.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution2D.cs index 9f1afaa6..5e2023fe 100644 --- a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution2D.cs +++ b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution2D.cs @@ -105,7 +105,7 @@ __global__ void trivector_deconvolution_2d(const float3* __restrict__ inmap, flo { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy); - outmap[outmap_idx] = ctor_float3(vq_hi.x + vq_lo.x, vq_hi.y + vq_lo.y, vq_hi.z + vq_lo.z); + outmap[outmap_idx] = ctor_float3(vq_hi.x, vq_hi.y, vq_hi.z); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution3D.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution3D.cs index 6a1c711d..18f99b64 100644 --- a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution3D.cs +++ b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution3D.cs @@ -117,7 +117,7 @@ __global__ void trivector_deconvolution_3d(const float3* __restrict__ inmap, flo { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") } unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz)); - outmap[outmap_idx] = ctor_float3(vq_hi.x + vq_lo.x, vq_hi.y + vq_lo.y, vq_hi.z + vq_lo.z); + outmap[outmap_idx] = ctor_float3(vq_hi.x, vq_hi.y, vq_hi.z); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Dense.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Dense.cs index 5b37c532..8aaccc37 100644 --- a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Dense.cs +++ b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Dense.cs @@ -76,7 +76,7 @@ __global__ void trivector_dense(const float3* __restrict__ inmap, float3* __rest filter_idx += {OutChannels}; }} - outmap[outch] = ctor_float3(vq_hi.x + vq_lo.x, vq_hi.y + vq_lo.y, vq_hi.z + vq_lo.z); + outmap[outch] = ctor_float3(vq_hi.x, vq_hi.y, vq_hi.z); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/TransposeDense.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/TransposeDense.cs index 593facf4..fd7f02bb 100644 --- a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/TransposeDense.cs +++ b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/TransposeDense.cs @@ -74,7 +74,7 @@ __global__ void trivector_transpose_dense(const float3* __restrict__ inmap, floa filter_idx += {OutChannels}; }} - outmap[outch] = ctor_float3(vq_hi.x + vq_lo.x, vq_hi.y + vq_lo.y, vq_hi.z + vq_lo.z); + outmap[outch] = ctor_float3(vq_hi.x, vq_hi.y, vq_hi.z); { (OutChannels % ThreadsX != 0 ? "}" : "") } }}"; diff --git a/TensorShaderSample/MNIST/MnistDownloader.cs b/TensorShaderSample/MNIST/MnistDownloader.cs index 57acf2c6..0583eda2 100644 --- a/TensorShaderSample/MNIST/MnistDownloader.cs +++ b/TensorShaderSample/MNIST/MnistDownloader.cs @@ -30,8 +30,8 @@ public static async void Download(string dirpath_dataset) { } if (!File.Exists(filepath)) { - using HttpResponseMessage res = await client.GetAsync($"{url}{filename}"); - using (Stream stream = await res.Content.ReadAsStreamAsync()) { + using HttpResponseMessage res = client.GetAsync($"{url}{filename}").Result; + using (Stream stream = res.Content.ReadAsStreamAsync().Result) { using FileStream fs = new(filepath_temp, FileMode.CreateNew); stream.CopyTo(fs); }