Simd API C++ wrappers.
diff --git a/docs/help/namespacemembers_func_s.html b/docs/help/namespacemembers_func_s.html
index 4acddcfd1a..4c66a60401 100644
--- a/docs/help/namespacemembers_func_s.html
+++ b/docs/help/namespacemembers_func_s.html
@@ -62,7 +62,7 @@
- s -
- SquaredDistance() : Simd
- SquareSum() : Simd
- StretchGray2x2() : Simd
-- SynetSetInput() : Simd
+- SynetSetInput() : Simd
diff --git a/docs/help/namespacemembers_s.html b/docs/help/namespacemembers_s.html
index 988d569ae6..d825ca00d7 100644
--- a/docs/help/namespacemembers_s.html
+++ b/docs/help/namespacemembers_s.html
@@ -64,7 +64,7 @@
- s -
diff --git a/py/SimdPy/ b/py/SimdPy/
index ec2fa755d4..dad41b959a 100644
--- a/py/SimdPy/
+++ b/py/SimdPy/
@@ -759,15 +759,12 @@ def Resized(src : Image, width :int, height: int, method = Simd.ResizeMethod.Bil
## @ingroup python
# Sets image to the input of neural network of
Synet Framework.
-# @param src - an original input image.
-# @param dst - a resized output image.
-# @param method - a resizing method. By default it is equal to Simd.ResizeMethod.Bilinear.
- # @param src - an input image. There are following supported pixel format: aSimd.PixelFormat.Gray8, Simd.PixelFormat.Bgr24, Simd.PixelFormat.Bgra32, Simd.PixelFormat.Rgb24.
- # @param lower - an array with lower bound of values of the output tensor. The size of the array have to correspond number of channels in the output image tensor.
- # @param upper - an array with upper bound of values of the output tensor. The size of the array have to correspond number of channels in the output image tensor.
- # @param dst - a pointer to the output 32-bit float image tensor.
- # @param channels - a number of channels in the output image tensor. It can be 1 or 3.
- # @param format - a format of output image tensor. There are supported following tensor formats: Simd.TensorFormat.Nchw, Simd.TensorFormat.Nhwc.
+# @param src - an input image. The following pixel formats are supported: Simd.PixelFormat.Gray8, Simd.PixelFormat.Bgr24, Simd.PixelFormat.Bgra32, Simd.PixelFormat.Rgb24.
+# @param lower - an array with the lower bounds of the values of the output tensor. The size of the array has to correspond to the number of channels in the output image tensor.
+# @param upper - an array with the upper bounds of the values of the output tensor. The size of the array has to correspond to the number of channels in the output image tensor.
+# @param dst - a pointer to the output 32-bit float image tensor.
+# @param channels - a number of channels in the output image tensor. It can be 1 or 3.
+# @param format - the format of the output image tensor. The following tensor formats are supported: Simd.TensorFormat.Nchw, Simd.TensorFormat.Nhwc.
def SynetSetInput(src : Image, lower, upper, dst : ctypes.c_void_p, channels : int, format : Simd.TensorFormat) :
if src.Format() != PixelFormat.Gray8 and src.Format() != PixelFormat.Bgr24 and src.Format() != PixelFormat.Bgra32 and src.Format() != PixelFormat.Rgb24 :
raise Exception("Incompatible image pixel format: {0}!".format(src.Format()))
diff --git a/src/Simd/SimdAvx2SynetConversion.cpp b/src/Simd/SimdAvx2SynetConversion.cpp
index 7170a0a052..eeb29e361f 100644
--- a/src/Simd/SimdAvx2SynetConversion.cpp
+++ b/src/Simd/SimdAvx2SynetConversion.cpp
@@ -355,6 +355,16 @@ namespace Simd
return _mm256_srli_epi32(_mm256_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT);
+ const __m256i K16_RED_BLUE = SIMD_MM256_SET2_EPI16(Base::RED_TO_GRAY_WEIGHT, Base::BLUE_TO_GRAY_WEIGHT); // Red and blue gray weights; multiplied against r0b0 in RgbaToGray32 (presumably stored as interleaved 16-bit pairs - confirm against the macro).
+ SIMD_INLINE __m256i RgbaToGray32(__m256i rgba) // Weighted gray value per 4-byte pixel: (weightedSum + K32_ROUND_TERM) >> Base::BGR_TO_GRAY_AVERAGING_SHIFT in every 32-bit lane.
+ {
+ const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(rgba, 1), K16_00FF); // Bytes moved down by one position, then masked with K16_00FF; the name indicates green/alpha bytes.
+ const __m256i r0b0 = _mm256_and_si256(rgba, K16_00FF); // Unshifted input masked with K16_00FF; the name indicates red/blue bytes.
+ const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_0000), _mm256_madd_epi16(r0b0, K16_RED_BLUE)); // Sum of the green multiply-add term and the red/blue multiply-add term.
+ return _mm256_srli_epi32(_mm256_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT); // Add the rounding term before the logical right shift.
+ }
SIMD_INLINE void SynetSetInput1(const uint8_t * src, __m256 scale, __m256 shift, float * dst);
SIMD_INLINE void SynetSetInput1Gray8(__m128i gray8, __m256 scale, __m256 shift, float * dst)
@@ -399,6 +409,14 @@ namespace Simd
StoreScaled(dst + 3 * F, BgraToGray32(RgbToBgra(Load((__m256i*)(src + 64)), K32_01000000)), scale, shift);
+ template<> SIMD_INLINE void SynetSetInput1(const uint8_t* src, __m256 scale, __m256 shift, float* dst) // Explicit specialization; its template argument is not visible in this view (presumably the RGBA32 pixel format - confirm). Converts four consecutive __m256i vectors of src.
+ {
+ StoreScaled(dst + 0 * F, RgbaToGray32(Load((__m256i*)src + 0)), scale, shift); // Vector k of src -> RgbaToGray32 -> StoreScaled at dst + k * F, with the same scale and shift for all four.
+ StoreScaled(dst + 1 * F, RgbaToGray32(Load((__m256i*)src + 1)), scale, shift);
+ StoreScaled(dst + 2 * F, RgbaToGray32(Load((__m256i*)src + 2)), scale, shift);
+ StoreScaled(dst + 3 * F, RgbaToGray32(Load((__m256i*)src + 3)), scale, shift);
+ }
template void SynetSetInput1(const uint8_t * src, size_t width, size_t height, size_t stride, const float * scale, const float * shift, float * dst)
__m256 _scale = _mm256_set1_ps(scale[0]);
@@ -479,6 +497,22 @@ namespace Simd
SynetSetInput1Gray8(BgrToBlue(_rgb), scale[2], shift[2], dst + 2 * channel);
+ SIMD_INLINE void SynetSetInputNchw3Rgba32(const uint8_t* src, const __m256* scale, const __m256* shift, float* dst, size_t channel) // Splits one __m256i of 4-byte pixels into three outputs placed 'channel' floats apart in dst, each with its own scale[i]/shift[i].
+ {
+ __m256i rgba = Load((__m256i*)src);
+ StoreScaled(dst + 0 * channel, _mm256_and_si256(_mm256_srli_si256(rgba, 2), K32_000000FF), scale[0], shift[0]); // Byte 2 of every 4-byte pixel goes to plane 0 (assuming K32_000000FF keeps only the low byte of each 32-bit lane).
+ StoreScaled(dst + 1 * channel, _mm256_and_si256(_mm256_srli_si256(rgba, 1), K32_000000FF), scale[1], shift[1]); // Byte 1 of every pixel goes to plane 1.
+ StoreScaled(dst + 2 * channel, _mm256_and_si256(_mm256_srli_si256(rgba, 0), K32_000000FF), scale[2], shift[2]); // Byte 0 of every pixel goes to plane 2; the zero shift keeps the three lines symmetric.
+ }
+ template<> SIMD_INLINE void SynetSetInputNchw3(const uint8_t* src, const __m256* scale, const __m256* shift, float* dst, size_t channel) // Explicit specialization; its template argument is not visible in this view (presumably the RGBA32 pixel format - confirm). Runs the Rgba32 helper on four consecutive source vectors.
+ {
+ SynetSetInputNchw3Rgba32(src + 0 * A, scale, shift, dst + 0 * F, channel); // Each call advances src by A bytes and dst by F floats.
+ SynetSetInputNchw3Rgba32(src + 1 * A, scale, shift, dst + 1 * F, channel);
+ SynetSetInputNchw3Rgba32(src + 2 * A, scale, shift, dst + 2 * F, channel);
+ SynetSetInputNchw3Rgba32(src + 3 * A, scale, shift, dst + 3 * F, channel);
+ }
template void SynetSetInputNchw3(const uint8_t * src, size_t width, size_t height, size_t stride, const float * scale, const float * shift, float * dst)
size_t aligned = AlignLo(width, A), channel = width * height;
@@ -587,6 +621,26 @@ namespace Simd
StoreScaled(dst + 0xB * F, _mm256_cvtepi16_epi32(_mm_shuffle_epi8(Sse41::Load((__m128i*)(src + 80)), K8_RGB_UNPACK_2)), scale[2], shift[2]);
+ const __m128i K8_RGBA_TO_BGR_0 = SIMD_MM_SETR_EPI8(0x2, 0x1, 0x0, 0x6, 0x5, 0x4, 0xA, 0x9, -1, -1, -1, -1, -1, -1, -1, -1); // Shuffle mask: picks 8 source bytes into the low half and zeroes the high half (-1). Indices 0x3/0x7/0xB/0xF are never selected, and the three bytes of each pixel are taken in reverse order.
+ const __m128i K8_RGBA_TO_BGR_1 = SIMD_MM_SETR_EPI8(0x0, 0x6, 0x5, 0x4, 0xA, 0x9, 0x8, 0xE, -1, -1, -1, -1, -1, -1, -1, -1); // Continuation mask, applied to a load that starts 8 bytes after the one used with mask 0.
+ const __m128i K8_RGBA_TO_BGR_2 = SIMD_MM_SETR_EPI8(0x5, 0x4, 0xA, 0x9, 0x8, 0xE, 0xD, 0xC, -1, -1, -1, -1, -1, -1, -1, -1); // Continuation mask, applied to a load that starts 16 bytes after the one used with mask 0.
+ template<> SIMD_INLINE void SynetSetInputNhwc3(const uint8_t* src, const __m256* scale, const __m256* shift, float* dst) // Explicit specialization; its template argument is not visible in this view (presumably the RGBA32 pixel format - confirm). Writes 12 vectors of F floats from source offsets 0..127.
+ {
+ StoreScaled(dst + 0x0 * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse41::Load((__m128i*)(src + 0)), K8_RGBA_TO_BGR_0)), scale[0], shift[0]); // 16-byte load -> shuffle -> zero-extend the low 8 bytes to 32-bit lanes -> StoreScaled.
+ StoreScaled(dst + 0x1 * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse41::Load((__m128i*)(src + 8)), K8_RGBA_TO_BGR_1)), scale[1], shift[1]); // Mask and scale/shift index cycle 0, 1, 2 across consecutive output vectors.
+ StoreScaled(dst + 0x2 * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse41::Load((__m128i*)(src + 16)), K8_RGBA_TO_BGR_2)), scale[2], shift[2]);
+ StoreScaled(dst + 0x3 * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse41::Load((__m128i*)(src + 32)), K8_RGBA_TO_BGR_0)), scale[0], shift[0]); // Each following group of three stores starts 32 source bytes after the previous group.
+ StoreScaled(dst + 0x4 * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse41::Load((__m128i*)(src + 40)), K8_RGBA_TO_BGR_1)), scale[1], shift[1]);
+ StoreScaled(dst + 0x5 * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse41::Load((__m128i*)(src + 48)), K8_RGBA_TO_BGR_2)), scale[2], shift[2]);
+ StoreScaled(dst + 0x6 * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse41::Load((__m128i*)(src + 64)), K8_RGBA_TO_BGR_0)), scale[0], shift[0]);
+ StoreScaled(dst + 0x7 * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse41::Load((__m128i*)(src + 72)), K8_RGBA_TO_BGR_1)), scale[1], shift[1]);
+ StoreScaled(dst + 0x8 * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse41::Load((__m128i*)(src + 80)), K8_RGBA_TO_BGR_2)), scale[2], shift[2]);
+ StoreScaled(dst + 0x9 * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse41::Load((__m128i*)(src + 96)), K8_RGBA_TO_BGR_0)), scale[0], shift[0]);
+ StoreScaled(dst + 0xA * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse41::Load((__m128i*)(src + 104)), K8_RGBA_TO_BGR_1)), scale[1], shift[1]);
+ StoreScaled(dst + 0xB * F, _mm256_cvtepu8_epi32(_mm_shuffle_epi8(Sse41::Load((__m128i*)(src + 112)), K8_RGBA_TO_BGR_2)), scale[2], shift[2]); // Last load covers source bytes 112..127.
+ }
template void SynetSetInputNhwc3(const uint8_t * src, size_t width, size_t height, size_t stride, const float * scale, const float * shift, float * dst)
size_t aligned = AlignLo(width, A);
@@ -625,6 +679,7 @@ namespace Simd
case SimdPixelFormatBgr24: SynetSetInput1(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatBgra32: SynetSetInput1(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatRgb24: SynetSetInput1(src, width, height, stride, scale, lower, dst); return;
+ case SimdPixelFormatRgba32: SynetSetInput1(src, width, height, stride, scale, lower, dst); return;
default: assert(0);
@@ -638,6 +693,7 @@ namespace Simd
case SimdPixelFormatBgr24: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatBgra32: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatRgb24: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return;
+ case SimdPixelFormatRgba32: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return;
default: assert(0);
@@ -648,6 +704,7 @@ namespace Simd
case SimdPixelFormatBgr24: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatBgra32: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatRgb24: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return;
+ case SimdPixelFormatRgba32: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return;
default: assert(0);
diff --git a/src/Simd/SimdBaseSynetConversion.cpp b/src/Simd/SimdBaseSynetConversion.cpp
index 302226bc06..29359a7168 100644
--- a/src/Simd/SimdBaseSynetConversion.cpp
+++ b/src/Simd/SimdBaseSynetConversion.cpp
@@ -199,6 +199,7 @@ namespace Simd
case SimdPixelFormatBgr24: SynetSetInput1(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatBgra32: SynetSetInput1(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatRgb24: SynetSetInput1(src, width, height, stride, scale, lower, dst); return;
+ case SimdPixelFormatRgba32: SynetSetInput1(src, width, height, stride, scale, lower, dst); return;
default: assert(0);
@@ -212,6 +213,7 @@ namespace Simd
case SimdPixelFormatBgr24: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatBgra32: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatRgb24: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return;
+ case SimdPixelFormatRgba32: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return;
default: assert(0);
@@ -222,6 +224,7 @@ namespace Simd
case SimdPixelFormatBgr24: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatBgra32: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatRgb24: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return;
+ case SimdPixelFormatRgba32: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return;
default: assert(0);
diff --git a/src/Simd/SimdSse41SynetConversion.cpp b/src/Simd/SimdSse41SynetConversion.cpp
index b9adf74f71..1b48c94f80 100644
--- a/src/Simd/SimdSse41SynetConversion.cpp
+++ b/src/Simd/SimdSse41SynetConversion.cpp
@@ -275,7 +275,8 @@ namespace Simd
+ const __m128i K16_GREEN_0 = SIMD_MM_SET2_EPI16(Base::GREEN_TO_GRAY_WEIGHT, 0);
+ const __m128i K32_ROUND_TERM = SIMD_MM_SET1_EPI32(Base::BGR_TO_GRAY_ROUND_TERM);
const __m128i K8_BGR_TO_BGRA = SIMD_MM_SETR_EPI8(0x0, 0x1, 0x2, -1, 0x3, 0x4, 0x5, -1, 0x6, 0x7, 0x8, -1, 0x9, 0xA, 0xB, -1);
const __m128i K8_RGB_TO_BGRA = SIMD_MM_SETR_EPI8(0x2, 0x1, 0x0, -1, 0x5, 0x4, 0x3, -1, 0x8, 0x7, 0x6, -1, 0xB, 0xA, 0x9, -1);
@@ -288,8 +289,18 @@ namespace Simd
const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(bgra, 1), K16_00FF);
const __m128i b0r0 = _mm_and_si128(bgra, K16_00FF);
- const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_ROUND), _mm_madd_epi16(b0r0, K16_BLUE_RED));
- return _mm_srli_epi32(weightedSum, Base::BGR_TO_GRAY_AVERAGING_SHIFT);
+ const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_0), _mm_madd_epi16(b0r0, K16_BLUE_RED));
+ return _mm_srli_epi32(_mm_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT);
+ }
+ SIMD_INLINE __m128i RgbaToGray32(__m128i rgba)
+ {
+ const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(rgba, 1), K16_00FF);
+ const __m128i r0b0 = _mm_and_si128(rgba, K16_00FF);
+ const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_0), _mm_madd_epi16(r0b0, K16_RED_BLUE));
+ return _mm_srli_epi32(_mm_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT);
template SIMD_INLINE void SynetSetInput1(const uint8_t * src, __m128 scale, __m128 shift, float * dst);
@@ -309,26 +320,34 @@ namespace Simd
template<> SIMD_INLINE void SynetSetInput1(const uint8_t * src, __m128 scale, __m128 shift, float * dst)
- StoreScaled(dst + 0 * F, BgraToGray32(_mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(src + 0)), K8_BGR_TO_BGRA))), scale, shift);
- StoreScaled(dst + 1 * F, BgraToGray32(_mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(src + 12)), K8_BGR_TO_BGRA))), scale, shift);
- StoreScaled(dst + 2 * F, BgraToGray32(_mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(src + 24)), K8_BGR_TO_BGRA))), scale, shift);
- StoreScaled(dst + 3 * F, BgraToGray32(_mm_or_si128(K32_01000000, _mm_shuffle_epi8(_mm_srli_si128(Load((__m128i*)(src + 32)), 4), K8_BGR_TO_BGRA))), scale, shift);
+ StoreScaled(dst + 0 * F, BgraToGray32(_mm_shuffle_epi8(Load((__m128i*)(src + 0)), K8_BGR_TO_BGRA)), scale, shift);
+ StoreScaled(dst + 1 * F, BgraToGray32(_mm_shuffle_epi8(Load((__m128i*)(src + 12)), K8_BGR_TO_BGRA)), scale, shift);
+ StoreScaled(dst + 2 * F, BgraToGray32(_mm_shuffle_epi8(Load((__m128i*)(src + 24)), K8_BGR_TO_BGRA)), scale, shift);
+ StoreScaled(dst + 3 * F, BgraToGray32(_mm_shuffle_epi8(_mm_srli_si128(Load((__m128i*)(src + 32)), 4), K8_BGR_TO_BGRA)), scale, shift);
template<> SIMD_INLINE void SynetSetInput1(const uint8_t * src, __m128 scale, __m128 shift, float * dst)
- StoreScaled(dst + 0 * F, BgraToGray32(_mm_or_si128(K32_01000000, _mm_and_si128(K32_00FFFFFF, Load((__m128i*)src + 0)))), scale, shift);
- StoreScaled(dst + 1 * F, BgraToGray32(_mm_or_si128(K32_01000000, _mm_and_si128(K32_00FFFFFF, Load((__m128i*)src + 1)))), scale, shift);
- StoreScaled(dst + 2 * F, BgraToGray32(_mm_or_si128(K32_01000000, _mm_and_si128(K32_00FFFFFF, Load((__m128i*)src + 2)))), scale, shift);
- StoreScaled(dst + 3 * F, BgraToGray32(_mm_or_si128(K32_01000000, _mm_and_si128(K32_00FFFFFF, Load((__m128i*)src + 3)))), scale, shift);
+ StoreScaled(dst + 0 * F, BgraToGray32(Load((__m128i*)src + 0)), scale, shift);
+ StoreScaled(dst + 1 * F, BgraToGray32(Load((__m128i*)src + 1)), scale, shift);
+ StoreScaled(dst + 2 * F, BgraToGray32(Load((__m128i*)src + 2)), scale, shift);
+ StoreScaled(dst + 3 * F, BgraToGray32(Load((__m128i*)src + 3)), scale, shift);
template<> SIMD_INLINE void SynetSetInput1(const uint8_t * src, __m128 scale, __m128 shift, float * dst)
- StoreScaled(dst + 0 * F, BgraToGray32(_mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(src + 0)), K8_RGB_TO_BGRA))), scale, shift);
- StoreScaled(dst + 1 * F, BgraToGray32(_mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(src + 12)), K8_RGB_TO_BGRA))), scale, shift);
- StoreScaled(dst + 2 * F, BgraToGray32(_mm_or_si128(K32_01000000, _mm_shuffle_epi8(Load((__m128i*)(src + 24)), K8_RGB_TO_BGRA))), scale, shift);
- StoreScaled(dst + 3 * F, BgraToGray32(_mm_or_si128(K32_01000000, _mm_shuffle_epi8(_mm_srli_si128(Load((__m128i*)(src + 32)), 4), K8_RGB_TO_BGRA))), scale, shift);
+ StoreScaled(dst + 0 * F, BgraToGray32(_mm_shuffle_epi8(Load((__m128i*)(src + 0)), K8_RGB_TO_BGRA)), scale, shift);
+ StoreScaled(dst + 1 * F, BgraToGray32(_mm_shuffle_epi8(Load((__m128i*)(src + 12)), K8_RGB_TO_BGRA)), scale, shift);
+ StoreScaled(dst + 2 * F, BgraToGray32(_mm_shuffle_epi8(Load((__m128i*)(src + 24)), K8_RGB_TO_BGRA)), scale, shift);
+ StoreScaled(dst + 3 * F, BgraToGray32(_mm_shuffle_epi8(_mm_srli_si128(Load((__m128i*)(src + 32)), 4), K8_RGB_TO_BGRA)), scale, shift);
+ }
+ template<> SIMD_INLINE void SynetSetInput1(const uint8_t* src, __m128 scale, __m128 shift, float* dst)
+ {
+ StoreScaled(dst + 0 * F, RgbaToGray32(Load((__m128i*)src + 0)), scale, shift);
+ StoreScaled(dst + 1 * F, RgbaToGray32(Load((__m128i*)src + 1)), scale, shift);
+ StoreScaled(dst + 2 * F, RgbaToGray32(Load((__m128i*)src + 2)), scale, shift);
+ StoreScaled(dst + 3 * F, RgbaToGray32(Load((__m128i*)src + 3)), scale, shift);
template void SynetSetInput1(const uint8_t * src, size_t width, size_t height, size_t stride, const float * scale, const float * shift, float * dst)
@@ -410,6 +429,22 @@ namespace Simd
SynetSetInput1Gray8(BgrToBlue(_rgb), scale[2], shift[2], dst + 2 * channel);
+ SIMD_INLINE void SynetSetInputNchw3Rgba32(const uint8_t* src, const __m128* scale, const __m128* shift, float* dst, size_t channel) // Splits one __m128i of 4-byte pixels into three outputs placed 'channel' floats apart in dst, each with its own scale[i]/shift[i].
+ {
+ __m128i rgba = Load((__m128i*)src);
+ StoreScaled(dst + 0 * channel, _mm_and_si128(_mm_srli_si128(rgba, 2), K32_000000FF), scale[0], shift[0]); // Byte 2 of every 4-byte pixel goes to plane 0 (assuming K32_000000FF keeps only the low byte of each 32-bit lane).
+ StoreScaled(dst + 1 * channel, _mm_and_si128(_mm_srli_si128(rgba, 1), K32_000000FF), scale[1], shift[1]); // Byte 1 of every pixel goes to plane 1.
+ StoreScaled(dst + 2 * channel, _mm_and_si128(_mm_srli_si128(rgba, 0), K32_000000FF), scale[2], shift[2]); // Byte 0 of every pixel goes to plane 2; the zero shift keeps the three lines symmetric.
+ }
+ template<> SIMD_INLINE void SynetSetInputNchw3(const uint8_t* src, const __m128* scale, const __m128* shift, float* dst, size_t channel) // Explicit specialization; its template argument is not visible in this view (presumably the RGBA32 pixel format - confirm). Runs the Rgba32 helper on four consecutive source vectors.
+ {
+ SynetSetInputNchw3Rgba32(src + 0 * A, scale, shift, dst + 0 * F, channel); // Each call advances src by A bytes and dst by F floats.
+ SynetSetInputNchw3Rgba32(src + 1 * A, scale, shift, dst + 1 * F, channel);
+ SynetSetInputNchw3Rgba32(src + 2 * A, scale, shift, dst + 2 * F, channel);
+ SynetSetInputNchw3Rgba32(src + 3 * A, scale, shift, dst + 3 * F, channel);
+ }
template void SynetSetInputNchw3(const uint8_t * src, size_t width, size_t height, size_t stride, const float * scale, const float * shift, float * dst)
size_t aligned = AlignLo(width, A), channel = width * height;
@@ -471,34 +506,28 @@ namespace Simd
StoreScaled(dst + 0xB * F, _mm_cvtepu8_epi32(_mm_srli_si128(bgr2, 0xC)), scale[2], shift[2]);
- const __m128i K8_BGRA_TO_BGR_00 = SIMD_MM_SETR_EPI8(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1);
- const __m128i K8_BGRA_TO_BGR_01 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4);
- const __m128i K8_BGRA_TO_BGR_10 = SIMD_MM_SETR_EPI8(0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, -1, -1, -1, -1);
- const __m128i K8_BGRA_TO_BGR_11 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9);
- const __m128i K8_BGRA_TO_BGR_20 = SIMD_MM_SETR_EPI8(0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
- const __m128i K8_BGRA_TO_BGR_21 = SIMD_MM_SETR_EPI8(-1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE);
+ const __m128i K8_BGRA_UNPACK_0 = SIMD_MM_SETR_EPI8(0x0, -1, -1, -1, 0x1, -1, -1, -1, 0x2, -1, -1, -1, 0x4, -1, -1, -1);
+ const __m128i K8_BGRA_UNPACK_1 = SIMD_MM_SETR_EPI8(0x5, -1, -1, -1, 0x6, -1, -1, -1, 0x8, -1, -1, -1, 0x9, -1, -1, -1);
+ const __m128i K8_BGRA_UNPACK_2 = SIMD_MM_SETR_EPI8(0xA, -1, -1, -1, 0xC, -1, -1, -1, 0xD, -1, -1, -1, 0xE, -1, -1, -1);
template<> SIMD_INLINE void SynetSetInputNhwc3(const uint8_t * src, const __m128 * scale, const __m128 * shift, float * dst)
__m128i bgra0 = Load((__m128i*)src + 0);
+ StoreScaled(dst + 0x0 * F, _mm_shuffle_epi8(bgra0, K8_BGRA_UNPACK_0), scale[0], shift[0]);
+ StoreScaled(dst + 0x1 * F, _mm_shuffle_epi8(bgra0, K8_BGRA_UNPACK_1), scale[1], shift[1]);
+ StoreScaled(dst + 0x2 * F, _mm_shuffle_epi8(bgra0, K8_BGRA_UNPACK_2), scale[2], shift[2]);
__m128i bgra1 = Load((__m128i*)src + 1);
+ StoreScaled(dst + 0x3 * F, _mm_shuffle_epi8(bgra1, K8_BGRA_UNPACK_0), scale[0], shift[0]);
+ StoreScaled(dst + 0x4 * F, _mm_shuffle_epi8(bgra1, K8_BGRA_UNPACK_1), scale[1], shift[1]);
+ StoreScaled(dst + 0x5 * F, _mm_shuffle_epi8(bgra1, K8_BGRA_UNPACK_2), scale[2], shift[2]);
__m128i bgra2 = Load((__m128i*)src + 2);
+ StoreScaled(dst + 0x6 * F, _mm_shuffle_epi8(bgra2, K8_BGRA_UNPACK_0), scale[0], shift[0]);
+ StoreScaled(dst + 0x7 * F, _mm_shuffle_epi8(bgra2, K8_BGRA_UNPACK_1), scale[1], shift[1]);
+ StoreScaled(dst + 0x8 * F, _mm_shuffle_epi8(bgra2, K8_BGRA_UNPACK_2), scale[2], shift[2]);
__m128i bgra3 = Load((__m128i*)src + 3);
- __m128i bgr0 = _mm_or_si128(_mm_shuffle_epi8(bgra0, K8_BGRA_TO_BGR_00), _mm_shuffle_epi8(bgra1, K8_BGRA_TO_BGR_01));
- StoreScaled(dst + 0x0 * F, _mm_cvtepu8_epi32(_mm_srli_si128(bgr0, 0x0)), scale[0], shift[0]);
- StoreScaled(dst + 0x1 * F, _mm_cvtepu8_epi32(_mm_srli_si128(bgr0, 0x4)), scale[1], shift[1]);
- StoreScaled(dst + 0x2 * F, _mm_cvtepu8_epi32(_mm_srli_si128(bgr0, 0x8)), scale[2], shift[2]);
- StoreScaled(dst + 0x3 * F, _mm_cvtepu8_epi32(_mm_srli_si128(bgr0, 0xC)), scale[0], shift[0]);
- __m128i bgr1 = _mm_or_si128(_mm_shuffle_epi8(bgra1, K8_BGRA_TO_BGR_10), _mm_shuffle_epi8(bgra2, K8_BGRA_TO_BGR_11));
- StoreScaled(dst + 0x4 * F, _mm_cvtepu8_epi32(_mm_srli_si128(bgr1, 0x0)), scale[1], shift[1]);
- StoreScaled(dst + 0x5 * F, _mm_cvtepu8_epi32(_mm_srli_si128(bgr1, 0x4)), scale[2], shift[2]);
- StoreScaled(dst + 0x6 * F, _mm_cvtepu8_epi32(_mm_srli_si128(bgr1, 0x8)), scale[0], shift[0]);
- StoreScaled(dst + 0x7 * F, _mm_cvtepu8_epi32(_mm_srli_si128(bgr1, 0xC)), scale[1], shift[1]);
- __m128i bgr2 = _mm_or_si128(_mm_shuffle_epi8(bgra2, K8_BGRA_TO_BGR_20), _mm_shuffle_epi8(bgra3, K8_BGRA_TO_BGR_21));
- StoreScaled(dst + 0x8 * F, _mm_cvtepu8_epi32(_mm_srli_si128(bgr2, 0x0)), scale[2], shift[2]);
- StoreScaled(dst + 0x9 * F, _mm_cvtepu8_epi32(_mm_srli_si128(bgr2, 0x4)), scale[0], shift[0]);
- StoreScaled(dst + 0xA * F, _mm_cvtepu8_epi32(_mm_srli_si128(bgr2, 0x8)), scale[1], shift[1]);
- StoreScaled(dst + 0xB * F, _mm_cvtepu8_epi32(_mm_srli_si128(bgr2, 0xC)), scale[2], shift[2]);
+ StoreScaled(dst + 0x9 * F, _mm_shuffle_epi8(bgra3, K8_BGRA_UNPACK_0), scale[0], shift[0]);
+ StoreScaled(dst + 0xA * F, _mm_shuffle_epi8(bgra3, K8_BGRA_UNPACK_1), scale[1], shift[1]);
+ StoreScaled(dst + 0xB * F, _mm_shuffle_epi8(bgra3, K8_BGRA_UNPACK_2), scale[2], shift[2]);
const __m128i K8_RGB_UNPACK_0 = SIMD_MM_SETR_EPI8(0x2, -1, -1, -1, 0x1, -1, -1, -1, 0x0, -1, -1, -1, 0x5, -1, -1, -1);
@@ -525,6 +554,30 @@ namespace Simd
StoreScaled(dst + 0xB * F, _mm_shuffle_epi8(bgr3, K8_RGB_UNPACK_2), scale[2], shift[2]);
+ const __m128i K8_RGBA_UNPACK_0 = SIMD_MM_SETR_EPI8(0x2, -1, -1, -1, 0x1, -1, -1, -1, 0x0, -1, -1, -1, 0x6, -1, -1, -1); // Shuffle mask: puts one source byte in the low byte of each 32-bit lane and zeroes the other three (-1). Selects bytes 2, 1, 0, 6.
+ const __m128i K8_RGBA_UNPACK_1 = SIMD_MM_SETR_EPI8(0x5, -1, -1, -1, 0x4, -1, -1, -1, 0xA, -1, -1, -1, 0x9, -1, -1, -1); // Selects bytes 5, 4, 0xA, 9.
+ const __m128i K8_RGBA_UNPACK_2 = SIMD_MM_SETR_EPI8(0x8, -1, -1, -1, 0xE, -1, -1, -1, 0xD, -1, -1, -1, 0xC, -1, -1, -1); // Selects bytes 8, 0xE, 0xD, 0xC. Across the three masks, indices 0x3/0x7/0xB/0xF are never selected.
+ template<> SIMD_INLINE void SynetSetInputNhwc3(const uint8_t* src, const __m128* scale, const __m128* shift, float* dst) // Explicit specialization; its template argument is not visible in this view (presumably the RGBA32 pixel format - confirm). Turns four __m128i loads into 12 stored vectors of F floats.
+ {
+ __m128i rgba0 = Load((__m128i*)src + 0);
+ StoreScaled(dst + 0x0 * F, _mm_shuffle_epi8(rgba0, K8_RGBA_UNPACK_0), scale[0], shift[0]); // Each loaded vector is shuffled three times; mask index and scale/shift index both cycle 0, 1, 2.
+ StoreScaled(dst + 0x1 * F, _mm_shuffle_epi8(rgba0, K8_RGBA_UNPACK_1), scale[1], shift[1]);
+ StoreScaled(dst + 0x2 * F, _mm_shuffle_epi8(rgba0, K8_RGBA_UNPACK_2), scale[2], shift[2]);
+ __m128i rgba1 = Load((__m128i*)src + 1);
+ StoreScaled(dst + 0x3 * F, _mm_shuffle_epi8(rgba1, K8_RGBA_UNPACK_0), scale[0], shift[0]);
+ StoreScaled(dst + 0x4 * F, _mm_shuffle_epi8(rgba1, K8_RGBA_UNPACK_1), scale[1], shift[1]);
+ StoreScaled(dst + 0x5 * F, _mm_shuffle_epi8(rgba1, K8_RGBA_UNPACK_2), scale[2], shift[2]);
+ __m128i rgba2 = Load((__m128i*)src + 2);
+ StoreScaled(dst + 0x6 * F, _mm_shuffle_epi8(rgba2, K8_RGBA_UNPACK_0), scale[0], shift[0]);
+ StoreScaled(dst + 0x7 * F, _mm_shuffle_epi8(rgba2, K8_RGBA_UNPACK_1), scale[1], shift[1]);
+ StoreScaled(dst + 0x8 * F, _mm_shuffle_epi8(rgba2, K8_RGBA_UNPACK_2), scale[2], shift[2]);
+ __m128i rgba3 = Load((__m128i*)src + 3);
+ StoreScaled(dst + 0x9 * F, _mm_shuffle_epi8(rgba3, K8_RGBA_UNPACK_0), scale[0], shift[0]);
+ StoreScaled(dst + 0xA * F, _mm_shuffle_epi8(rgba3, K8_RGBA_UNPACK_1), scale[1], shift[1]);
+ StoreScaled(dst + 0xB * F, _mm_shuffle_epi8(rgba3, K8_RGBA_UNPACK_2), scale[2], shift[2]);
+ }
template void SynetSetInputNhwc3(const uint8_t * src, size_t width, size_t height, size_t stride, const float * scale, const float * shift, float * dst)
size_t aligned = AlignLo(width, A);
@@ -563,6 +616,7 @@ namespace Simd
case SimdPixelFormatBgr24: SynetSetInput1(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatBgra32: SynetSetInput1(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatRgb24: SynetSetInput1(src, width, height, stride, scale, lower, dst); return;
+ case SimdPixelFormatRgba32: SynetSetInput1(src, width, height, stride, scale, lower, dst); return;
default: assert(0);
@@ -576,6 +630,7 @@ namespace Simd
case SimdPixelFormatBgr24: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatBgra32: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatRgb24: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return;
+ case SimdPixelFormatRgba32: SynetSetInputNchw3(src, width, height, stride, scale, lower, dst); return;
default: assert(0);
@@ -583,9 +638,10 @@ namespace Simd
switch (srcFormat)
case SimdPixelFormatGray8: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return;
- case SimdPixelFormatBgr24: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); break;
+ case SimdPixelFormatBgr24: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatBgra32: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return;
case SimdPixelFormatRgb24: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return;
+ case SimdPixelFormatRgba32: SynetSetInputNhwc3(src, width, height, stride, scale, lower, dst); return;
default: assert(0);
diff --git a/src/Test/TestSynetConversion.cpp b/src/Test/TestSynetConversion.cpp
index 7ae2b2a7a2..09a81eaff8 100644
--- a/src/Test/TestSynetConversion.cpp
+++ b/src/Test/TestSynetConversion.cpp
@@ -283,7 +283,7 @@ namespace Test
bool result = true;
- View::Format srcFormat[4] = { View::Gray8, View::Bgr24, View::Bgra32, View::Rgb24 };
+ View::Format srcFormat[5] = { View::Gray8, View::Bgr24, View::Bgra32, View::Rgb24, View::Rgba32 };
size_t channels[2] = { 1, 3 };
SimdTensorFormatType dstFormat[2] = { SimdTensorFormatNchw, SimdTensorFormatNhwc };