From 63b95338cf1661de9bbe3658731097f5bf488359 Mon Sep 17 00:00:00 2001
From: tk-yoshimura <yoshimura8192@gmail.com>
Date: Sun, 18 Sep 2022 23:06:25 +0900
Subject: [PATCH] fix error-free arithmetic

---
 README.md                                          |  2 +-
 TensorShaderCudaBackend/ShaderDefines.cs           | 14 +++++++-------
 .../FloatFloatPrecision/Convolution1D.cs           |  2 +-
 .../FloatFloatPrecision/Convolution2D.cs           |  2 +-
 .../FloatFloatPrecision/Convolution3D.cs           |  2 +-
 .../FloatFloatPrecision/Deconvolution1D.cs         |  2 +-
 .../FloatFloatPrecision/Deconvolution2D.cs         |  2 +-
 .../FloatFloatPrecision/Deconvolution3D.cs         |  2 +-
 .../Convolution/FloatFloatPrecision/Dense.cs       |  2 +-
 .../FloatFloatPrecision/TransposeDense.cs          |  2 +-
 .../ChannelwiseConvolution1D.cs                    |  2 +-
 .../ChannelwiseConvolution2D.cs                    |  2 +-
 .../ChannelwiseConvolution3D.cs                    |  2 +-
 .../ChannelwiseDeconvolution1D.cs                  |  2 +-
 .../ChannelwiseDeconvolution2D.cs                  |  2 +-
 .../ChannelwiseDeconvolution3D.cs                  |  2 +-
 .../FloatFloatPrecision/Convolution1D.cs           |  2 +-
 .../FloatFloatPrecision/Convolution2D.cs           |  2 +-
 .../FloatFloatPrecision/Convolution3D.cs           |  2 +-
 .../FloatFloatPrecision/Deconvolution1D.cs         |  2 +-
 .../FloatFloatPrecision/Deconvolution2D.cs         |  2 +-
 .../FloatFloatPrecision/Deconvolution3D.cs         |  2 +-
 .../Convolution/FloatFloatPrecision/Dense.cs       |  2 +-
 .../FloatFloatPrecision/PointwiseConvolution.cs    |  2 +-
 .../FloatFloatPrecision/PointwiseDeconvolution.cs  |  2 +-
 .../FloatFloatPrecision/TransposeDense.cs          |  2 +-
 .../FloatFloatPrecision/Convolution1D.cs           |  2 +-
 .../FloatFloatPrecision/Convolution2D.cs           |  2 +-
 .../FloatFloatPrecision/Convolution3D.cs           |  2 +-
 .../FloatFloatPrecision/Deconvolution1D.cs         |  2 +-
 .../FloatFloatPrecision/Deconvolution2D.cs         |  2 +-
 .../FloatFloatPrecision/Deconvolution3D.cs         |  2 +-
 .../Convolution/FloatFloatPrecision/Dense.cs       |  2 +-
 .../FloatFloatPrecision/TransposeDense.cs          |  2 +-
 .../Shaders/Transform/ColumnToImage1D.cs           |  2 +-
 .../Shaders/Transform/ColumnToImage2D.cs           |  2 +-
 .../Shaders/Transform/ColumnToImage3D.cs           |  2 +-
 .../FloatFloatPrecision/Convolution1D.cs           |  2 +-
 .../FloatFloatPrecision/Convolution2D.cs           |  2 +-
 .../FloatFloatPrecision/Convolution3D.cs           |  2 +-
 .../FloatFloatPrecision/Deconvolution1D.cs         |  2 +-
 .../FloatFloatPrecision/Deconvolution2D.cs         |  2 +-
 .../FloatFloatPrecision/Deconvolution3D.cs         |  2 +-
 .../Convolution/FloatFloatPrecision/Dense.cs       |  2 +-
 .../FloatFloatPrecision/TransposeDense.cs          |  2 +-
 TensorShaderSample/MNIST/MnistDownloader.cs        |  4 ++--
 46 files changed, 53 insertions(+), 53 deletions(-)

diff --git a/README.md b/README.md
index 2de45171..cf3a744d 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@
 
 **Supports High-Dimensional Convolution Neural Networks.** (Complex, Quaternion, Vector3D)
 
-For regression problems: **High precision calculate** by FP32-FP32 arithmetic (1/8 error of FP32 arithmetic)
+For regression problems: **High-precision calculation** using FP32-FP32 arithmetic
 
 ## Requirement  
 .NET 6.0  
diff --git a/TensorShaderCudaBackend/ShaderDefines.cs b/TensorShaderCudaBackend/ShaderDefines.cs
index 1c1c0a1e..c7766b02 100644
--- a/TensorShaderCudaBackend/ShaderDefines.cs
+++ b/TensorShaderCudaBackend/ShaderDefines.cs
@@ -402,7 +402,7 @@ static __inline__ __device__ void floatfloat_hilo_add(float &hi, float &lo, floa
                 public static string AtomicAdd =>
                 $@"
                 static __inline__ __device__ void floatfloat_atomicadd(float *ptr, float hi, float lo){{
-                    float tmp = atomicAdd(ptr, hi + lo);
+                    float tmp = atomicAdd(ptr, hi);
                     atomicAdd(ptr + 1, lo - (((tmp + hi) - tmp) - hi));
                 }}";
 
@@ -465,9 +465,9 @@ static __inline__ __device__ void complex_mulgrad(float2 &hi, float2 &lo, float2
                     static __inline__ __device__ void floatfloat_atomicadd(float2 *ptr, float2 hi, float2 lo){{
                         float *ptr_float = (float*)ptr;
 
-                        float tmpx = atomicAdd(ptr_float, hi.x + lo.x);
+                        float tmpx = atomicAdd(ptr_float, hi.x);
                         atomicAdd(ptr_float + 1, lo.x - (((tmpx + hi.x) - tmpx) - hi.x));
-                        float tmpy = atomicAdd(ptr_float + 2, hi.y + lo.y);
+                        float tmpy = atomicAdd(ptr_float + 2, hi.y);
                         atomicAdd(ptr_float + 3, lo.y - (((tmpy + hi.y) - tmpy) - hi.y));
                     }}";
                 }
@@ -585,13 +585,13 @@ static __inline__ __device__ void quaternion_mulgrad(float4 &hi, float4 &lo, flo
                     static __inline__ __device__ void floatfloat_atomicadd(float4 *ptr, float4 hi, float4 lo){{
                         float *ptr_float = (float*)ptr;
 
-                        float tmpx = atomicAdd(ptr_float, hi.x + lo.x);
+                        float tmpx = atomicAdd(ptr_float, hi.x);
                         atomicAdd(ptr_float + 1, lo.x - (((tmpx + hi.x) - tmpx) - hi.x));
-                        float tmpy = atomicAdd(ptr_float + 2, hi.y + lo.y);
+                        float tmpy = atomicAdd(ptr_float + 2, hi.y);
                         atomicAdd(ptr_float + 3, lo.y - (((tmpy + hi.y) - tmpy) - hi.y));
-                        float tmpz = atomicAdd(ptr_float + 4, hi.z + lo.z);
+                        float tmpz = atomicAdd(ptr_float + 4, hi.z);
                         atomicAdd(ptr_float + 5, lo.z - (((tmpz + hi.z) - tmpz) - hi.z));
-                        float tmpw = atomicAdd(ptr_float + 6, hi.w + lo.w);
+                        float tmpw = atomicAdd(ptr_float + 6, hi.w);
                         atomicAdd(ptr_float + 7, lo.w - (((tmpw + hi.w) - tmpw) - hi.w));
                     }}";
                 }
diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution1D.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution1D.cs
index a36180d9..5c1a6352 100644
--- a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution1D.cs
+++ b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution1D.cs
@@ -90,7 +90,7 @@ __global__ void complex_convolution_1d(const float2* __restrict__ inmap, float2*
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * ox;
 
-                    outmap[outmap_idx] = ctor_float2(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y);
+                    outmap[outmap_idx] = ctor_float2(uv_hi.x, uv_hi.y);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution2D.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution2D.cs
index 0ce5d160..41fd8704 100644
--- a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution2D.cs
+++ b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution2D.cs
@@ -103,7 +103,7 @@ __global__ void complex_convolution_2d(const float2* __restrict__ inmap, float2*
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy);
 
-                    outmap[outmap_idx] = ctor_float2(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y);
+                    outmap[outmap_idx] = ctor_float2(uv_hi.x, uv_hi.y);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution3D.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution3D.cs
index a7cfbd6b..1b63f06b 100644
--- a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution3D.cs
+++ b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution3D.cs
@@ -111,7 +111,7 @@ __global__ void complex_convolution_3d(const float2* __restrict__ inmap, float2*
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz));
 
-                    outmap[outmap_idx] = ctor_float2(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y);
+                    outmap[outmap_idx] = ctor_float2(uv_hi.x, uv_hi.y);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution1D.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution1D.cs
index 32df67cb..365f2fd9 100644
--- a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution1D.cs
+++ b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution1D.cs
@@ -90,7 +90,7 @@ __global__ void complex_deconvolution_1d(const float2* __restrict__ inmap, float
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * ox;
 
-                    outmap[outmap_idx] = ctor_float2(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y);
+                    outmap[outmap_idx] = ctor_float2(uv_hi.x, uv_hi.y);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution2D.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution2D.cs
index bb66b172..3f624333 100644
--- a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution2D.cs
+++ b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution2D.cs
@@ -106,7 +106,7 @@ __global__ void complex_deconvolution_2d(const float2* __restrict__ inmap, float
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy);
 
-                    outmap[outmap_idx] = ctor_float2(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y);
+                    outmap[outmap_idx] = ctor_float2(uv_hi.x, uv_hi.y);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution3D.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution3D.cs
index a3c0cbe7..d821a34e 100644
--- a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution3D.cs
+++ b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution3D.cs
@@ -118,7 +118,7 @@ __global__ void complex_deconvolution_3d(const float2* __restrict__ inmap, float
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz));
 
-                    outmap[outmap_idx] = ctor_float2(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y);
+                    outmap[outmap_idx] = ctor_float2(uv_hi.x, uv_hi.y);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Dense.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Dense.cs
index 18e5f833..9386b773 100644
--- a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Dense.cs
+++ b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Dense.cs
@@ -76,7 +76,7 @@ __global__ void complex_dense(const float2* __restrict__ inmap, float2* __restri
 
                         filter_idx += {OutChannels};
                     }}
-                    outmap[outch] = ctor_float2(vu_hi.x + vu_lo.x, vu_hi.y + vu_lo.y);
+                    outmap[outch] = ctor_float2(vu_hi.x, vu_hi.y);
 
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/TransposeDense.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/TransposeDense.cs
index c33cd19b..0f9bbb4f 100644
--- a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/TransposeDense.cs
+++ b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/TransposeDense.cs
@@ -74,7 +74,7 @@ __global__ void complex_transpose_dense(const float2* __restrict__ inmap, float2
 
                         filter_idx += {OutChannels};
                     }}
-                    outmap[outch] = ctor_float2(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y);
+                    outmap[outch] = ctor_float2(uv_hi.x, uv_hi.y);
 
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution1D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution1D.cs
index 44ae15c3..cf754252 100644
--- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution1D.cs
+++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution1D.cs
@@ -59,7 +59,7 @@ __global__ void chwise_convolution_1d(const float* __restrict__ inmap, float* __
 
                 unsigned int outmap_idx = ch + {Channels} * ox;
 
-                outmap[outmap_idx] = uv_hi + uv_lo;
+                outmap[outmap_idx] = uv_hi;
             }}";
 
             this.Kernel = new Kernel(code, "chwise_convolution_1d");
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution2D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution2D.cs
index 33cbf6cf..a720203b 100644
--- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution2D.cs
+++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution2D.cs
@@ -72,7 +72,7 @@ __global__ void chwise_convolution_2d(const float* __restrict__ inmap, float* __
 
                 unsigned int outmap_idx = ch + {Channels} * (ox + outwidth * oy);
 
-                outmap[outmap_idx] = uv_hi + uv_lo;
+                outmap[outmap_idx] = uv_hi;
             }}";
 
             this.Kernel = new Kernel(code, "chwise_convolution_2d");
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution3D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution3D.cs
index f55ebe98..e06eb3e6 100644
--- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution3D.cs
+++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution3D.cs
@@ -79,7 +79,7 @@ __global__ void chwise_convolution_3d(const float* __restrict__ inmap, float* __
 
                 unsigned int outmap_idx = ch + {Channels} * (ox + outwidth * (oy + outheight * oz));
 
-                outmap[outmap_idx] = uv_hi + uv_lo;
+                outmap[outmap_idx] = uv_hi;
             }}";
 
             this.Kernel = new Kernel(code, "chwise_convolution_3d");
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution1D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution1D.cs
index 09c015f1..5ce6b61f 100644
--- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution1D.cs
+++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution1D.cs
@@ -61,7 +61,7 @@ __global__ void chwise_deconvolution_1d(const float* __restrict__ inmap, float*
 
                 unsigned int outmap_idx = ch + {Channels} * ox;
 
-                outmap[outmap_idx] = uv_hi + uv_lo;
+                outmap[outmap_idx] = uv_hi;
             }}";
 
             this.Kernel = new Kernel(code, "chwise_deconvolution_1d");
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution2D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution2D.cs
index f0853720..e2a6b6c2 100644
--- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution2D.cs
+++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution2D.cs
@@ -77,7 +77,7 @@ __global__ void chwise_deconvolution_2d(const float* __restrict__ inmap, float*
 
                 unsigned int outmap_idx = ch + {Channels} * (ox + outwidth * oy);
 
-                outmap[outmap_idx] = uv_hi + uv_lo;
+                outmap[outmap_idx] = uv_hi;
             }}";
 
             this.Kernel = new Kernel(code, "chwise_deconvolution_2d");
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution3D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution3D.cs
index 03e6ac99..45e05b85 100644
--- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution3D.cs
+++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution3D.cs
@@ -88,7 +88,7 @@ __global__ void chwise_deconvolution_3d(const float* __restrict__ inmap, float*
 
                 unsigned int outmap_idx = ch + {Channels} * (ox + outwidth * (oy + outheight * oz));
 
-                outmap[outmap_idx] = uv_hi + uv_lo;
+                outmap[outmap_idx] = uv_hi;
             }}";
 
             this.Kernel = new Kernel(code, "chwise_deconvolution_3d");
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution1D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution1D.cs
index e801701f..d322b93d 100644
--- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution1D.cs
+++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution1D.cs
@@ -81,7 +81,7 @@ __global__ void convolution_1d(const float* __restrict__ inmap, float* __restric
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * ox;
 
-                    outmap[outmap_idx] = uv_hi + uv_lo;
+                    outmap[outmap_idx] = uv_hi;
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution2D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution2D.cs
index 52627fa6..8ca3460b 100644
--- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution2D.cs
+++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution2D.cs
@@ -94,7 +94,7 @@ __global__ void convolution_2d(const float* __restrict__ inmap, float* __restric
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy);
 
-                    outmap[outmap_idx] = uv_hi + uv_lo;
+                    outmap[outmap_idx] = uv_hi;
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution3D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution3D.cs
index db229a9d..001cc9b3 100644
--- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution3D.cs
+++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution3D.cs
@@ -101,7 +101,7 @@ __global__ void convolution_3d(const float* __restrict__ inmap, float* __restric
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz));
 
-                    outmap[outmap_idx] = uv_hi + uv_lo;
+                    outmap[outmap_idx] = uv_hi;
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution1D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution1D.cs
index 7c1f80f8..d610587a 100644
--- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution1D.cs
+++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution1D.cs
@@ -81,7 +81,7 @@ __global__ void deconvolution_1d(const float* __restrict__ inmap, float* __restr
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * ox;
 
-                    outmap[outmap_idx] = uv_hi + uv_lo;
+                    outmap[outmap_idx] = uv_hi;
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution2D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution2D.cs
index fc62484b..309de596 100644
--- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution2D.cs
+++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution2D.cs
@@ -97,7 +97,7 @@ __global__ void deconvolution_2d(const float* __restrict__ inmap, float* __restr
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy);
 
-                    outmap[outmap_idx] = uv_hi + uv_lo;
+                    outmap[outmap_idx] = uv_hi;
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution3D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution3D.cs
index beb6bb56..b96c65bb 100644
--- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution3D.cs
+++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution3D.cs
@@ -108,7 +108,7 @@ __global__ void deconvolution_3d(const float* __restrict__ inmap, float* __restr
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz));
 
-                    outmap[outmap_idx] = uv_hi + uv_lo;
+                    outmap[outmap_idx] = uv_hi;
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Dense.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Dense.cs
index 0c35cddb..636b9781 100644
--- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Dense.cs
+++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Dense.cs
@@ -70,7 +70,7 @@ __global__ void dense(const float* __restrict__ inmap, float* __restrict__ outma
                         filter_idx += {OutChannels};
                     }}
 
-                    outmap[outch] = uv_hi + uv_lo;
+                    outmap[outch] = uv_hi;
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/PointwiseConvolution.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/PointwiseConvolution.cs
index b4ebd5c8..db861d70 100644
--- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/PointwiseConvolution.cs
+++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/PointwiseConvolution.cs
@@ -70,7 +70,7 @@ __global__ void ptwise_convolution(const float* __restrict__ inmap, float* __res
 
                     unsigned int outmap_idx = outch + {OutChannels} * i;
 
-                    outmap[outmap_idx] = uv_hi + uv_lo;
+                    outmap[outmap_idx] = uv_hi;
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/PointwiseDeconvolution.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/PointwiseDeconvolution.cs
index c1d219ce..9e2f2bfc 100644
--- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/PointwiseDeconvolution.cs
+++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/PointwiseDeconvolution.cs
@@ -68,7 +68,7 @@ __global__ void ptwise_deconvolution(const float* __restrict__ inmap, float* __r
 
                     unsigned int outmap_idx = outch + {OutChannels} * i;
 
-                    outmap[outmap_idx] = uv_hi + uv_lo;
+                    outmap[outmap_idx] = uv_hi;
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/TransposeDense.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/TransposeDense.cs
index 6fbd5e77..0975ecf1 100644
--- a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/TransposeDense.cs
+++ b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/TransposeDense.cs
@@ -65,7 +65,7 @@ __global__ void transpose_dense(const float* __restrict__ inmap, float* __restri
                         filter_idx += {OutChannels};
                     }}
 
-                    outmap[outch] = uv_hi + uv_lo;
+                    outmap[outch] = uv_hi;
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution1D.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution1D.cs
index af3db82a..2c1ac2e6 100644
--- a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution1D.cs
+++ b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution1D.cs
@@ -90,7 +90,7 @@ __global__ void quaternion_convolution_1d(const float4* __restrict__ inmap, floa
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * ox;
 
-                    outmap[outmap_idx] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w);
+                    outmap[outmap_idx] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution2D.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution2D.cs
index df4016c5..1c0ff920 100644
--- a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution2D.cs
+++ b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution2D.cs
@@ -103,7 +103,7 @@ __global__ void quaternion_convolution_2d(const float4* __restrict__ inmap, floa
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy);
 
-                    outmap[outmap_idx] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w);
+                    outmap[outmap_idx] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution3D.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution3D.cs
index 40eef3f4..aba29543 100644
--- a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution3D.cs
+++ b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution3D.cs
@@ -111,7 +111,7 @@ __global__ void quaternion_convolution_3d(const float4* __restrict__ inmap, floa
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz));
 
-                    outmap[outmap_idx] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w);
+                    outmap[outmap_idx] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution1D.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution1D.cs
index f2a7472b..379ac1e0 100644
--- a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution1D.cs
+++ b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution1D.cs
@@ -90,7 +90,7 @@ __global__ void quaternion_deconvolution_1d(const float4* __restrict__ inmap, fl
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * ox;
 
-                    outmap[outmap_idx] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w);
+                    outmap[outmap_idx] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution2D.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution2D.cs
index 366fe997..f205f3fc 100644
--- a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution2D.cs
+++ b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution2D.cs
@@ -106,7 +106,7 @@ __global__ void quaternion_deconvolution_2d(const float4* __restrict__ inmap, fl
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy);
 
-                    outmap[outmap_idx] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w);
+                    outmap[outmap_idx] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution3D.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution3D.cs
index 24e370a4..04799ef5 100644
--- a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution3D.cs
+++ b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution3D.cs
@@ -118,7 +118,7 @@ __global__ void quaternion_deconvolution_3d(const float4* __restrict__ inmap, fl
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz));
 
-                    outmap[outmap_idx] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w);
+                    outmap[outmap_idx] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Dense.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Dense.cs
index cd4eef2e..af0d079b 100644
--- a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Dense.cs
+++ b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Dense.cs
@@ -77,7 +77,7 @@ __global__ void quaternion_dense(const float4* __restrict__ inmap, float4* __res
                         filter_idx += {OutChannels};
                     }}
 
-                    outmap[outch] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w);
+                    outmap[outch] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/TransposeDense.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/TransposeDense.cs
index 990e8841..f1a52f3c 100644
--- a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/TransposeDense.cs
+++ b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/TransposeDense.cs
@@ -75,7 +75,7 @@ __global__ void quaternion_transpose_dense(const float4* __restrict__ inmap, flo
                         filter_idx += {OutChannels};
                     }}
 
-                    outmap[outch] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w);
+                    outmap[outch] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage1D.cs b/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage1D.cs
index b0fe8270..7237184a 100644
--- a/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage1D.cs
+++ b/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage1D.cs
@@ -57,7 +57,7 @@ __global__ void column_to_image_1d(const float* __restrict__ inmap, float* __res
                 }}
 
                 unsigned int outmap_idx = ch + {Channels} * ox;
-                outmap[outmap_idx] = hi + lo;
+                outmap[outmap_idx] = hi;
             }}";
 
             this.Kernel = new Kernel(code, "column_to_image_1d");
diff --git a/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage2D.cs b/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage2D.cs
index 7eb9a103..ff6268d7 100644
--- a/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage2D.cs
+++ b/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage2D.cs
@@ -69,7 +69,7 @@ __global__ void column_to_image_2d(const float* __restrict__ inmap, float* __res
                 }}
 
                 unsigned int outmap_idx = ch + {Channels} * (ox + outwidth * oy);
-                outmap[outmap_idx] = hi + lo;
+                outmap[outmap_idx] = hi;
             }}";
 
             this.Kernel = new Kernel(code, "column_to_image_2d");
diff --git a/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage3D.cs b/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage3D.cs
index 9ea7a51c..4c946062 100644
--- a/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage3D.cs
+++ b/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage3D.cs
@@ -81,7 +81,7 @@ __global__ void column_to_image_3d(const float* __restrict__ inmap, float* __res
                 }}
 
                 unsigned int outmap_idx = ch + {Channels} * (ox + outwidth * (oy + outheight * oz));
-                outmap[outmap_idx] = hi + lo;
+                outmap[outmap_idx] = hi;
             }}";
 
             this.Kernel = new Kernel(code, "column_to_image_3d");
diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution1D.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution1D.cs
index f0134446..5e4fa2dc 100644
--- a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution1D.cs
+++ b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution1D.cs
@@ -89,7 +89,7 @@ __global__ void trivector_convolution_1d(const float3* __restrict__ inmap, float
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * ox;
 
-                    outmap[outmap_idx] = ctor_float3(vq_hi.x + vq_lo.x, vq_hi.y + vq_lo.y, vq_hi.z + vq_lo.z);
+                    outmap[outmap_idx] = ctor_float3(vq_hi.x, vq_hi.y, vq_hi.z);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution2D.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution2D.cs
index d8cbcad4..d29da203 100644
--- a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution2D.cs
+++ b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution2D.cs
@@ -102,7 +102,7 @@ __global__ void trivector_convolution_2d(const float3* __restrict__ inmap, float
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy);
 
-                    outmap[outmap_idx] = ctor_float3(vq_hi.x + vq_lo.x, vq_hi.y + vq_lo.y, vq_hi.z + vq_lo.z);
+                    outmap[outmap_idx] = ctor_float3(vq_hi.x, vq_hi.y, vq_hi.z);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution3D.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution3D.cs
index 613d74e5..e4a66e5a 100644
--- a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution3D.cs
+++ b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution3D.cs
@@ -110,7 +110,7 @@ __global__ void trivector_convolution_3d(const float3* __restrict__ inmap, float
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz));
 
-                    outmap[outmap_idx] = ctor_float3(vq_hi.x + vq_lo.x, vq_hi.y + vq_lo.y, vq_hi.z + vq_lo.z);
+                    outmap[outmap_idx] = ctor_float3(vq_hi.x, vq_hi.y, vq_hi.z);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution1D.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution1D.cs
index beea3f03..59870c93 100644
--- a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution1D.cs
+++ b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution1D.cs
@@ -89,7 +89,7 @@ __global__ void trivector_deconvolution_1d(const float3* __restrict__ inmap, flo
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * ox;
 
-                    outmap[outmap_idx] = ctor_float3(vq_hi.x + vq_lo.x, vq_hi.y + vq_lo.y, vq_hi.z + vq_lo.z);
+                    outmap[outmap_idx] = ctor_float3(vq_hi.x, vq_hi.y, vq_hi.z);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution2D.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution2D.cs
index 9f1afaa6..5e2023fe 100644
--- a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution2D.cs
+++ b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution2D.cs
@@ -105,7 +105,7 @@ __global__ void trivector_deconvolution_2d(const float3* __restrict__ inmap, flo
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy);
 
-                    outmap[outmap_idx] = ctor_float3(vq_hi.x + vq_lo.x, vq_hi.y + vq_lo.y, vq_hi.z + vq_lo.z);
+                    outmap[outmap_idx] = ctor_float3(vq_hi.x, vq_hi.y, vq_hi.z);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution3D.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution3D.cs
index 6a1c711d..18f99b64 100644
--- a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution3D.cs
+++ b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution3D.cs
@@ -117,7 +117,7 @@ __global__ void trivector_deconvolution_3d(const float3* __restrict__ inmap, flo
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz));
 
-                    outmap[outmap_idx] = ctor_float3(vq_hi.x + vq_lo.x, vq_hi.y + vq_lo.y, vq_hi.z + vq_lo.z);
+                    outmap[outmap_idx] = ctor_float3(vq_hi.x, vq_hi.y, vq_hi.z);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Dense.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Dense.cs
index 5b37c532..8aaccc37 100644
--- a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Dense.cs
+++ b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Dense.cs
@@ -76,7 +76,7 @@ __global__ void trivector_dense(const float3* __restrict__ inmap, float3* __rest
                         filter_idx += {OutChannels};
                     }}
 
-                    outmap[outch] = ctor_float3(vq_hi.x + vq_lo.x, vq_hi.y + vq_lo.y, vq_hi.z + vq_lo.z);
+                    outmap[outch] = ctor_float3(vq_hi.x, vq_hi.y, vq_hi.z);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/TransposeDense.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/TransposeDense.cs
index 593facf4..fd7f02bb 100644
--- a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/TransposeDense.cs
+++ b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/TransposeDense.cs
@@ -74,7 +74,7 @@ __global__ void trivector_transpose_dense(const float3* __restrict__ inmap, floa
                         filter_idx += {OutChannels};
                     }}
 
-                    outmap[outch] = ctor_float3(vq_hi.x + vq_lo.x, vq_hi.y + vq_lo.y, vq_hi.z + vq_lo.z);
+                    outmap[outch] = ctor_float3(vq_hi.x, vq_hi.y, vq_hi.z);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderSample/MNIST/MnistDownloader.cs b/TensorShaderSample/MNIST/MnistDownloader.cs
index 57acf2c6..0583eda2 100644
--- a/TensorShaderSample/MNIST/MnistDownloader.cs
+++ b/TensorShaderSample/MNIST/MnistDownloader.cs
@@ -30,8 +30,8 @@ public static async void Download(string dirpath_dataset) {
                     }
 
                     if (!File.Exists(filepath)) {
-                        using HttpResponseMessage res = await client.GetAsync($"{url}{filename}");
-                        using (Stream stream = await res.Content.ReadAsStreamAsync()) {
+                        using HttpResponseMessage res = client.GetAsync($"{url}{filename}").Result;
+                        using (Stream stream = res.Content.ReadAsStreamAsync().Result) {
                             using FileStream fs = new(filepath_temp, FileMode.CreateNew);
                             stream.CopyTo(fs);
                         }