From 57adab64d127555705a5f4997425ddda79ac50a5 Mon Sep 17 00:00:00 2001 From: Yoh Date: Wed, 19 Oct 2022 01:44:59 +0800 Subject: [PATCH 001/127] pnnx support grid_sample op --- tools/pnnx/src/CMakeLists.txt | 1 + tools/pnnx/src/pass_level5/F_grid_sample.cpp | 72 ++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 tools/pnnx/src/pass_level5/F_grid_sample.cpp diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index 49b79d9f7d9..6198c2aefcc 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -366,6 +366,7 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/F_elu.cpp pass_ncnn/F_embedding.cpp pass_ncnn/F_gelu.cpp + pass_ncnn/F_grid_sample.cpp pass_ncnn/F_group_norm.cpp pass_ncnn/F_hardsigmoid.cpp pass_ncnn/F_hardswish.cpp diff --git a/tools/pnnx/src/pass_level5/F_grid_sample.cpp b/tools/pnnx/src/pass_level5/F_grid_sample.cpp new file mode 100644 index 00000000000..584e67dc18f --- /dev/null +++ b/tools/pnnx/src/pass_level5/F_grid_sample.cpp @@ -0,0 +1,72 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_ncnn.h" +#include + +namespace pnnx { + + namespace ncnn { + + class F_grid_sample : public GraphRewriterPass + { + public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input_0 0 1 input0 +pnnx.Input input_1 0 1 input1 +F.grid_sample op_0 2 1 input0 input1 out mode=%mode padding_mode=%padding_mode align_corners=%align_corners +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Grid_sample"; + } + + const char* name_str() const + { + return "grid_sample"; + } + + void write(Operator* op, const std::map& captured_params) const + { + const std::string& mode = captured_params.at("mode").s; + const std::string& padding_mode = captured_params.at("padding_mode").s; + + if (mode == "bilinear") + op->params["0"] = 1; + if (mode == "nearest") + op->params["0"] = 2; + if (mode == "bicubic") + op->params["0"] = 3; + + if (padding_mode == "zeors") + op->params["1"] = 1; + if (padding_mode == "border") + op->params["1"] = 2; + if (padding_mode == "reflection") + op->params["1"] = 3; + + op->params["6"] = captured_params.at("align_corners").b ? 
1 : 0; // align_corners + } + }; + + REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_grid_sample, 20) + } // namespace ncnn + +} // namespace pnnx From 14b1d3ba7776aa3d9cd2b981ed11841ddb1c7946 Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Tue, 18 Oct 2022 18:38:45 +0000 Subject: [PATCH 002/127] apply code-format changes --- tools/pnnx/src/pass_level5/F_grid_sample.cpp | 74 ++++++++++---------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/tools/pnnx/src/pass_level5/F_grid_sample.cpp b/tools/pnnx/src/pass_level5/F_grid_sample.cpp index 584e67dc18f..f2e8152bacb 100644 --- a/tools/pnnx/src/pass_level5/F_grid_sample.cpp +++ b/tools/pnnx/src/pass_level5/F_grid_sample.cpp @@ -17,56 +17,56 @@ namespace pnnx { - namespace ncnn { +namespace ncnn { - class F_grid_sample : public GraphRewriterPass - { - public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 +class F_grid_sample : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 4 3 pnnx.Input input_0 0 1 input0 pnnx.Input input_1 0 1 input1 F.grid_sample op_0 2 1 input0 input1 out mode=%mode padding_mode=%padding_mode align_corners=%align_corners pnnx.Output output 1 0 out )PNNXIR"; - } + } - const char* type_str() const - { - return "Grid_sample"; - } + const char* type_str() const + { + return "Grid_sample"; + } - const char* name_str() const - { - return "grid_sample"; - } + const char* name_str() const + { + return "grid_sample"; + } - void write(Operator* op, const std::map& captured_params) const - { - const std::string& mode = captured_params.at("mode").s; - const std::string& padding_mode = captured_params.at("padding_mode").s; + void write(Operator* op, const std::map& captured_params) const + { + const std::string& mode = captured_params.at("mode").s; + const std::string& padding_mode = captured_params.at("padding_mode").s; - if (mode == "bilinear") - op->params["0"] = 1; - if (mode == "nearest") - op->params["0"] = 2; - if (mode == "bicubic") - op->params["0"] = 3; + if (mode == "bilinear") + op->params["0"] = 1; + if (mode == "nearest") + op->params["0"] = 2; + if (mode == "bicubic") + op->params["0"] = 3; - if (padding_mode == "zeors") - op->params["1"] = 1; - if (padding_mode == "border") - op->params["1"] = 2; - if (padding_mode == "reflection") - op->params["1"] = 3; + if (padding_mode == "zeors") + op->params["1"] = 1; + if (padding_mode == "border") + op->params["1"] = 2; + if (padding_mode == "reflection") + op->params["1"] = 3; - op->params["6"] = captured_params.at("align_corners").b ? 1 : 0; // align_corners - } - }; + op->params["6"] = captured_params.at("align_corners").b ? 
1 : 0; // align_corners + } +}; - REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_grid_sample, 20) - } // namespace ncnn +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_grid_sample, 20) +} // namespace ncnn } // namespace pnnx From 4f6ebdac0e2e3e9ab8cd9fa272e08261989cb1b4 Mon Sep 17 00:00:00 2001 From: Yoh Date: Thu, 20 Oct 2022 20:42:09 +0800 Subject: [PATCH 003/127] [WIP]add naive gridsample --- src/CMakeLists.txt | 1 + src/layer/gridsample.cpp | 151 ++++++++++++++++++ src/layer/gridsample.h | 40 +++++ tests/test_grid_sample.cpp | 0 .../F_grid_sample.cpp | 0 5 files changed, 192 insertions(+) create mode 100644 src/layer/gridsample.cpp create mode 100644 src/layer/gridsample.h create mode 100644 tests/test_grid_sample.cpp rename tools/pnnx/src/{pass_level5 => pass_ncnn}/F_grid_sample.cpp (100%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 11b8573462a..2af11faab4b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -156,6 +156,7 @@ ncnn_add_layer(Deconvolution3D) ncnn_add_layer(DeconvolutionDepthWise3D) ncnn_add_layer(Einsum) ncnn_add_layer(DeformableConv2D) +ncnn_add_layer(GridSample) if(NCNN_VULKAN) ncnn_add_shader(${CMAKE_CURRENT_SOURCE_DIR}/convert_ycbcr.comp) diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp new file mode 100644 index 00000000000..d83a5c149eb --- /dev/null +++ b/src/layer/gridsample.cpp @@ -0,0 +1,151 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "gridsample.h" +#include + +namespace ncnn { + enum InterpolationMode + { + Bilinear = 1, + Nearest = 2, + Bicubic = 3 + }; + + enum PaddingMode + { + Zeros = 1, + Border = 2, + Reflection = 3 + }; + + static inline int64_t clip_coordinates(int64_t in, int64_t clip_limit) { + return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); + } + + static inline int64_t reflect_coordinates(int64_t in, int64_t twice_low, + int64_t twice_high) { + if (twice_low == twice_high) { + return static_cast(0); + } + int64_t min = static_cast(twice_low) / 2; + int64_t span = static_cast(twice_high - twice_low) / 2; + in = std::fabs(in - min); + // `fmod` returns same sign as `in`, which is positive after the `fabs` above. 
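+        // fold the distance from `min` back into [0, span]: an even flip count keeps
+        // the offset as-is, an odd flip count mirrors it back as (span - extra).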
+ int64_t extra = std::fmod(in, span); + int flips = static_cast(std::floor(in / span)); + if (flips % 2 == 0) { + return extra + min; + } + else { + return span - extra + min; + } + } + + static inline int64_t compute_coordinates(int64_t coord, int64_t size, + PaddingMode padding_mode, + bool align_corners) { + if (padding_mode == PaddingMode::Border) { + // clip coordinates to image borders + coord = clip_coordinates(coord, size); + } + else if (padding_mode == PaddingMode::Reflection) { + // reflect coordinates by image borders + if (align_corners) { + coord = reflect_coordinates(coord, 0, 2 * (size - 1)); + } + else { + coord = reflect_coordinates(coord, -1, 2 * size - 1); + } + // clip coordinates to image borders + coord = clip_coordinates(coord, size); + } + return coord; + } + + template + struct ApplyGridSample; + + template + struct ApplyGridSample + { + inline void forward(const ncnn::Mat& input, const ncnn::Mat& grid, ncnn::Mat& output) + { + + } + }; + + template + struct ApplyGridSample + { + inline void forward(const ncnn::Mat& input, const ncnn::Mat& grid, ncnn::Mat& output) + { + + } + }; + + template + struct ApplyGridSample + { + inline void forward(const ncnn::Mat& input, const ncnn::Mat& grid, ncnn::Mat& output) + { + + } + }; + + GridSample::GridSample() + { + one_blob_only = false; + support_inplace = false; + } + + int GridSample::load_param(const ParamDict& pd) + { + mode = pd.get(0, 0); + padding_mode = pd.get(1, 0); + align_corners = pd.get(6, 0); + + return 0; + } + + int GridSample::forward(const std::vector& bottom_blobs, Mat& top_blobs, const Option& opt) const + { + #define HANDLE_PADDING(interp, padding, align_corners) \ + case padding:{ \ + printf("mode: %d, padding_mode: %d, align: %d", interp, padding, align_corners); \ + break; \ + } + + #define HANDLE_INTERP(interp, align_corners) \ + case interp:{ \ + switch(static_cast(padding_mode)) { \ + HANDLE_PADDING(interp, PaddingMode::Zeros, align_corners) \ + HANDLE_PADDING(interp, PaddingMode::Border, align_corners) \ + HANDLE_PADDING(interp, PaddingMode::Reflection, align_corners) \ + } \ + break; \ + } + + + + switch (static_cast(mode)) { + HANDLE_INTERP(InterpolationMode::Bilinear, align_corners); + HANDLE_INTERP(InterpolationMode::Nearest, align_corners); + HANDLE_INTERP(InterpolationMode::Bicubic, align_corners); + } +#undef HANDLE_PADDING +#undef HANDLE_INTERP + } + +} // namespace ncnn diff --git a/src/layer/gridsample.h b/src/layer/gridsample.h new file mode 100644 index 00000000000..4d9eb326152 --- /dev/null +++ b/src/layer/gridsample.h @@ -0,0 +1,40 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
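+
+// GridSample samples the first bottom blob at the locations given by the second
+// (a grid of normalized [-1, 1] coordinates), mirroring torch.nn.functional.grid_sample.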
+ +#ifndef LAYER_UNARYOP_H +#define LAYER_UNARYOP_H + +#include "layer.h" + +namespace ncnn { + + class GridSample : public Layer + { + public: + GridSample(); + + virtual int load_param(const ParamDict& pd); + + virtual int forward(const std::vector& bottom_blob, Mat& top_blob, const Option& opt) const; + + public: + // param + int mode; //1 bilinear 2 nearest 3 bicubic + int padding_mode; //1 zeros 2 border 3 reflection + bool align_corners; + }; + +} // namespace ncnn + +#endif // LAYER_UNARYOP_H diff --git a/tests/test_grid_sample.cpp b/tests/test_grid_sample.cpp new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tools/pnnx/src/pass_level5/F_grid_sample.cpp b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp similarity index 100% rename from tools/pnnx/src/pass_level5/F_grid_sample.cpp rename to tools/pnnx/src/pass_ncnn/F_grid_sample.cpp From 1b9c7097658c06d9f8bbef1feb874622cdecca63 Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Thu, 20 Oct 2022 12:45:07 +0000 Subject: [PATCH 004/127] apply code-format changes --- src/layer/gridsample.cpp | 221 ++++++++++++++++++++------------------- src/layer/gridsample.h | 24 ++--- 2 files changed, 127 insertions(+), 118 deletions(-) diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index d83a5c149eb..ace5d90e3bf 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -16,136 +16,145 @@ #include namespace ncnn { - enum InterpolationMode +enum InterpolationMode +{ + Bilinear = 1, + Nearest = 2, + Bicubic = 3 +}; + +enum PaddingMode +{ + Zeros = 1, + Border = 2, + Reflection = 3 +}; + +static inline int64_t clip_coordinates(int64_t in, int64_t clip_limit) +{ + return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); +} + +static inline int64_t reflect_coordinates(int64_t in, int64_t twice_low, + int64_t twice_high) +{ + if (twice_low == twice_high) { - Bilinear = 1, - Nearest = 2, - Bicubic = 3 - }; - - enum PaddingMode + return static_cast(0); + } + int64_t min = static_cast(twice_low) / 2; + int64_t span = static_cast(twice_high - twice_low) / 2; + in = std::fabs(in - min); + // `fmod` returns same sign as `in`, which is positive after the `fabs` above. + int64_t extra = std::fmod(in, span); + int flips = static_cast(std::floor(in / span)); + if (flips % 2 == 0) { - Zeros = 1, - Border = 2, - Reflection = 3 - }; - - static inline int64_t clip_coordinates(int64_t in, int64_t clip_limit) { - return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); + return extra + min; } - - static inline int64_t reflect_coordinates(int64_t in, int64_t twice_low, - int64_t twice_high) { - if (twice_low == twice_high) { - return static_cast(0); - } - int64_t min = static_cast(twice_low) / 2; - int64_t span = static_cast(twice_high - twice_low) / 2; - in = std::fabs(in - min); - // `fmod` returns same sign as `in`, which is positive after the `fabs` above. 
- int64_t extra = std::fmod(in, span); - int flips = static_cast(std::floor(in / span)); - if (flips % 2 == 0) { - return extra + min; - } - else { - return span - extra + min; - } + else + { + return span - extra + min; } +} - static inline int64_t compute_coordinates(int64_t coord, int64_t size, +static inline int64_t compute_coordinates(int64_t coord, int64_t size, PaddingMode padding_mode, - bool align_corners) { - if (padding_mode == PaddingMode::Border) { - // clip coordinates to image borders - coord = clip_coordinates(coord, size); - } - else if (padding_mode == PaddingMode::Reflection) { - // reflect coordinates by image borders - if (align_corners) { - coord = reflect_coordinates(coord, 0, 2 * (size - 1)); - } - else { - coord = reflect_coordinates(coord, -1, 2 * size - 1); - } - // clip coordinates to image borders - coord = clip_coordinates(coord, size); - } - return coord; + bool align_corners) +{ + if (padding_mode == PaddingMode::Border) + { + // clip coordinates to image borders + coord = clip_coordinates(coord, size); } - - template - struct ApplyGridSample; - - template - struct ApplyGridSample + else if (padding_mode == PaddingMode::Reflection) { - inline void forward(const ncnn::Mat& input, const ncnn::Mat& grid, ncnn::Mat& output) + // reflect coordinates by image borders + if (align_corners) { - + coord = reflect_coordinates(coord, 0, 2 * (size - 1)); } - }; - - template - struct ApplyGridSample - { - inline void forward(const ncnn::Mat& input, const ncnn::Mat& grid, ncnn::Mat& output) + else { - + coord = reflect_coordinates(coord, -1, 2 * size - 1); } - }; - - template - struct ApplyGridSample - { - inline void forward(const ncnn::Mat& input, const ncnn::Mat& grid, ncnn::Mat& output) - { + // clip coordinates to image borders + coord = clip_coordinates(coord, size); + } + return coord; +} - } - }; +template +struct ApplyGridSample; - GridSample::GridSample() +template +struct ApplyGridSample +{ + inline void forward(const ncnn::Mat& input, const ncnn::Mat& grid, ncnn::Mat& output) { - one_blob_only = false; - support_inplace = false; } +}; - int GridSample::load_param(const ParamDict& pd) +template +struct ApplyGridSample +{ + inline void forward(const ncnn::Mat& input, const ncnn::Mat& grid, ncnn::Mat& output) { - mode = pd.get(0, 0); - padding_mode = pd.get(1, 0); - align_corners = pd.get(6, 0); - - return 0; } +}; - int GridSample::forward(const std::vector& bottom_blobs, Mat& top_blobs, const Option& opt) const +template +struct ApplyGridSample +{ + inline void forward(const ncnn::Mat& input, const ncnn::Mat& grid, ncnn::Mat& output) { - #define HANDLE_PADDING(interp, padding, align_corners) \ - case padding:{ \ - printf("mode: %d, padding_mode: %d, align: %d", interp, padding, align_corners); \ - break; \ - } - - #define HANDLE_INTERP(interp, align_corners) \ - case interp:{ \ - switch(static_cast(padding_mode)) { \ - HANDLE_PADDING(interp, PaddingMode::Zeros, align_corners) \ - HANDLE_PADDING(interp, PaddingMode::Border, align_corners) \ - HANDLE_PADDING(interp, PaddingMode::Reflection, align_corners) \ - } \ - break; \ - } - + } +}; + +GridSample::GridSample() +{ + one_blob_only = false; + support_inplace = false; +} + +int GridSample::load_param(const ParamDict& pd) +{ + mode = pd.get(0, 0); + padding_mode = pd.get(1, 0); + align_corners = pd.get(6, 0); + + return 0; +} + +int GridSample::forward(const std::vector& bottom_blobs, Mat& top_blobs, const Option& opt) const +{ +#define HANDLE_PADDING(interp, padding, align_corners) \ + case padding: \ + { \ + 
printf("mode: %d, padding_mode: %d, align: %d", interp, padding, align_corners); \ + break; \ + } +#define HANDLE_INTERP(interp, align_corners) \ + case interp: \ + { \ + switch (static_cast(padding_mode)) \ + { \ + HANDLE_PADDING(interp, PaddingMode::Zeros, align_corners) \ + HANDLE_PADDING(interp, PaddingMode::Border, align_corners) \ + HANDLE_PADDING(interp, PaddingMode::Reflection, align_corners) \ + } \ + break; \ + } - switch (static_cast(mode)) { - HANDLE_INTERP(InterpolationMode::Bilinear, align_corners); - HANDLE_INTERP(InterpolationMode::Nearest, align_corners); - HANDLE_INTERP(InterpolationMode::Bicubic, align_corners); - } + switch (static_cast(mode)) + { + HANDLE_INTERP(InterpolationMode::Bilinear, align_corners); + HANDLE_INTERP(InterpolationMode::Nearest, align_corners); + HANDLE_INTERP(InterpolationMode::Bicubic, align_corners); + } #undef HANDLE_PADDING #undef HANDLE_INTERP - } +} } // namespace ncnn diff --git a/src/layer/gridsample.h b/src/layer/gridsample.h index 4d9eb326152..28bbedf6c66 100644 --- a/src/layer/gridsample.h +++ b/src/layer/gridsample.h @@ -19,21 +19,21 @@ namespace ncnn { - class GridSample : public Layer - { - public: - GridSample(); +class GridSample : public Layer +{ +public: + GridSample(); - virtual int load_param(const ParamDict& pd); + virtual int load_param(const ParamDict& pd); - virtual int forward(const std::vector& bottom_blob, Mat& top_blob, const Option& opt) const; + virtual int forward(const std::vector& bottom_blob, Mat& top_blob, const Option& opt) const; - public: - // param - int mode; //1 bilinear 2 nearest 3 bicubic - int padding_mode; //1 zeros 2 border 3 reflection - bool align_corners; - }; +public: + // param + int mode; //1 bilinear 2 nearest 3 bicubic + int padding_mode; //1 zeros 2 border 3 reflection + bool align_corners; +}; } // namespace ncnn From 73ff5100d49357dadb4c2e0536de6984dead3c03 Mon Sep 17 00:00:00 2001 From: Yoh Date: Fri, 21 Oct 2022 19:23:56 +0800 Subject: [PATCH 005/127] [WIP]add gridsample, add bilinear interpolation --- src/layer/gridsample.cpp | 155 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 144 insertions(+), 11 deletions(-) diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index d83a5c149eb..0980d547c69 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -14,6 +14,7 @@ #include "gridsample.h" #include +#include namespace ncnn { enum InterpolationMode @@ -30,20 +31,22 @@ namespace ncnn { Reflection = 3 }; - static inline int64_t clip_coordinates(int64_t in, int64_t clip_limit) { - return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); + template + static inline elem_type clip_coordinates(elem_type in, int64_t clip_limit) { + return std::min(static_cast(clip_limit - static_cast(1)), std::max(in, static_cast(0))); } - static inline int64_t reflect_coordinates(int64_t in, int64_t twice_low, + template + static inline elem_type reflect_coordinates(elem_type in, int64_t twice_low, int64_t twice_high) { if (twice_low == twice_high) { - return static_cast(0); + return static_cast(0); } - int64_t min = static_cast(twice_low) / 2; - int64_t span = static_cast(twice_high - twice_low) / 2; + elem_type min = static_cast(twice_low) / 2; + elem_type span = static_cast(twice_high - twice_low) / 2; in = std::fabs(in - min); // `fmod` returns same sign as `in`, which is positive after the `fabs` above. 
- int64_t extra = std::fmod(in, span); + elem_type extra = std::fmod(in, span); int flips = static_cast(std::floor(in / span)); if (flips % 2 == 0) { return extra + min; @@ -53,7 +56,8 @@ namespace ncnn { } } - static inline int64_t compute_coordinates(int64_t coord, int64_t size, + template + static inline elem_type compute_coordinates(elem_type coord, int64_t size, PaddingMode padding_mode, bool align_corners) { if (padding_mode == PaddingMode::Border) { @@ -74,22 +78,151 @@ namespace ncnn { return coord; } + template + static inline elem_type grid_sampler_unnormalize(elem_type coord, int64_t size, + bool align_corners) { + if (align_corners) { + // unnormalize coord from [-1, 1] to [0, size - 1] + return ((coord + 1) / 2) * (size - 1); + } + else { + // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] + return ((coord + 1) * size - 1) / 2; + } + } + + template + static inline elem_type grid_sampler_compute_source_index( + elem_type coord, + int64_t size, + PaddingMode padding_mode, + bool align_corners) { + coord = grid_sampler_unnormalize(coord, size, align_corners); + coord = compute_coordinates(coord, size, padding_mode, align_corners); + return coord; + } + template struct ApplyGridSample; template struct ApplyGridSample { - inline void forward(const ncnn::Mat& input, const ncnn::Mat& grid, ncnn::Mat& output) + const bool must_in_bound = padding != PaddingMode::Zeros; + inline std::tuple compute_interp_params_d3(float x, float y) const { + auto x_w = std::floor(x); + auto y_n = std::floor(y); + + auto w = x - x_w; + auto e = 1.0f - w; + auto n = y - y_n; + auto s = 1.0f - n; + + auto nw = s * e; + auto ne = s * w; + auto sw = n * e; + auto se = n * w; + + return std::make_tuple(nw, ne, sw, se); + } + + inline int forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) + { + int dims = input.dims; + int w = input.w; + int h = input.h; + int channels = input.c; + + if (dims == 3) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* output_ptr = output.data; + const Mat image = input.channel(q); + + const float* gx_ptr = grid.channel(0); + const float* gy_ptr = grid.channel(1); + + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + auto gx = grid_sampler_compute_source_index(*gx_ptr, w, padding, align_corners); + auto gy = grid_sampler_compute_source_index(*gy_ptr, h, padding, align_corners); + + auto interp_params = compute_interp_params_d3(gx, gy); + + auto nw = std::get<0>(interp_params); + auto ne = std::get<1>(interp_params); + auto sw = std::get<2>(interp_params); + auto se = std::get<3>(interp_params); + + auto i_x = static_cast(std::floor(gx)); + auto i_y = static_cast(std::floor(gy)); + + float v = 0.0f; + if (must_in_bound) + { + //out of range, val is 0 https://github.com/pytorch/pytorch/blob/435e78e5237d9fb3e433fff6ce028569db937264/aten/src/ATen/native/cpu/GridSamplerKernel.cpp#L520 + auto nw_val = image.row(i_x)[i_y]; + auto ne_val = i_y + 1 < h ? image.row(i_x)[i_y + 1] : 0; + auto sw_val = i_x + 1 < w ? image.row(i_x + 1)[i_y] : 0; + auto se_val = i_x + 1 < w && i_y + 1 < h ? 
image.row(i_x + 1)[i_y + 1] : 0; + + v = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; + } + else //PaddingMode::Zeors + { + auto x0 = i_x; + auto x1 = i_x + 1; + auto y0 = i_y; + auto y1 = i_y + 1; + + auto x0_in_range = (x0 > -1) & (x0 < w); + auto x1_in_range = (x1 > -1) & (x1 < w); + auto y0_in_range = (y0 > -1) & (y0 < w); + auto y1_in_range = (y1 > -1) & (y1 < w); + + auto v00_in_range = x0_in_range & y0_in_range; + auto v01_in_range = x0_in_range & y1_in_range; + auto v10_in_range = x1_in_range & y0_in_range; + auto v11_in_range = x1_in_range & y1_in_range; + + auto nw_val = v00_in_range ? image.row(x0)[y0] : 0; + auto ne_val = v01_in_range ? image.row(x0)[y1] : 0; + auto sw_val = v10_in_range ? image.row(x1)[y0] : 0; + auto se_val = v11_in_range ? image.row(x1)[y1] : 0; + + v = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; + } + + *output = v; + + output++; + fxptr++; + fyptr++; + } + } + } + } + else if (dims == 4) + { + + } + else + { + return -100; + } } }; template struct ApplyGridSample { - inline void forward(const ncnn::Mat& input, const ncnn::Mat& grid, ncnn::Mat& output) + inline void forward(const Mat& input, const Mat& grid, Mat& output) { } @@ -98,7 +231,7 @@ namespace ncnn { template struct ApplyGridSample { - inline void forward(const ncnn::Mat& input, const ncnn::Mat& grid, ncnn::Mat& output) + inline void forward(const Mat& input, const Mat& grid, Mat& output) { } From 390bbfb13238358fb111cd77634d82a21830b187 Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Fri, 21 Oct 2022 11:32:02 +0000 Subject: [PATCH 006/127] apply code-format changes --- src/layer/gridsample.cpp | 357 ++++++++++++++++++++------------------- 1 file changed, 184 insertions(+), 173 deletions(-) diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index 82b122999ce..79f613b6736 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -24,216 +24,227 @@ enum InterpolationMode Bicubic = 3 }; - enum PaddingMode +enum PaddingMode +{ + Zeros = 1, + Border = 2, + Reflection = 3 +}; + +static inline int64_t clip_coordinates(int64_t in, int64_t clip_limit) +{ + return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); +} + +static inline int64_t reflect_coordinates(int64_t in, int64_t twice_low, + int64_t twice_high) +{ + if (twice_low == twice_high) { - Zeros = 1, - Border = 2, - Reflection = 3 - }; - - static inline int64_t clip_coordinates(int64_t in, int64_t clip_limit) { - return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); + return static_cast(0); } - - static inline int64_t reflect_coordinates(int64_t in, int64_t twice_low, - int64_t twice_high) { - if (twice_low == twice_high) { - return static_cast(0); - } - int64_t min = static_cast(twice_low) / 2; - int64_t span = static_cast(twice_high - twice_low) / 2; - in = std::fabs(in - min); - // `fmod` returns same sign as `in`, which is positive after the `fabs` above. - int64_t extra = std::fmod(in, span); - int flips = static_cast(std::floor(in / span)); - if (flips % 2 == 0) { - return extra + min; - } - else { - return span - extra + min; - } + int64_t min = static_cast(twice_low) / 2; + int64_t span = static_cast(twice_high - twice_low) / 2; + in = std::fabs(in - min); + // `fmod` returns same sign as `in`, which is positive after the `fabs` above. 
+ int64_t extra = std::fmod(in, span); + int flips = static_cast(std::floor(in / span)); + if (flips % 2 == 0) + { + return extra + min; } + else + { + return span - extra + min; + } +} - template - static inline elem_type compute_coordinates(elem_type coord, int64_t size, +template +static inline elem_type compute_coordinates(elem_type coord, int64_t size, PaddingMode padding_mode, - bool align_corners) { - if (padding_mode == PaddingMode::Border) { - // clip coordinates to image borders - coord = clip_coordinates(coord, size); - } - else if (padding_mode == PaddingMode::Reflection) { - // reflect coordinates by image borders - if (align_corners) { - coord = reflect_coordinates(coord, 0, 2 * (size - 1)); - } - else { - coord = reflect_coordinates(coord, -1, 2 * size - 1); - } - // clip coordinates to image borders - coord = clip_coordinates(coord, size); - } - return coord; + bool align_corners) +{ + if (padding_mode == PaddingMode::Border) + { + // clip coordinates to image borders + coord = clip_coordinates(coord, size); } - - template - static inline elem_type grid_sampler_unnormalize(elem_type coord, int64_t size, - bool align_corners) { - if (align_corners) { - // unnormalize coord from [-1, 1] to [0, size - 1] - return ((coord + 1) / 2) * (size - 1); + else if (padding_mode == PaddingMode::Reflection) + { + // reflect coordinates by image borders + if (align_corners) + { + coord = reflect_coordinates(coord, 0, 2 * (size - 1)); } - else { - // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] - return ((coord + 1) * size - 1) / 2; + else + { + coord = reflect_coordinates(coord, -1, 2 * size - 1); } + // clip coordinates to image borders + coord = clip_coordinates(coord, size); } + return coord; +} - template - static inline elem_type grid_sampler_compute_source_index( - elem_type coord, - int64_t size, - PaddingMode padding_mode, - bool align_corners) { - coord = grid_sampler_unnormalize(coord, size, align_corners); - coord = compute_coordinates(coord, size, padding_mode, align_corners); - return coord; +template +static inline elem_type grid_sampler_unnormalize(elem_type coord, int64_t size, + bool align_corners) +{ + if (align_corners) + { + // unnormalize coord from [-1, 1] to [0, size - 1] + return ((coord + 1) / 2) * (size - 1); + } + else + { + // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] + return ((coord + 1) * size - 1) / 2; } +} + +template +static inline elem_type grid_sampler_compute_source_index( + elem_type coord, + int64_t size, + PaddingMode padding_mode, + bool align_corners) +{ + coord = grid_sampler_unnormalize(coord, size, align_corners); + coord = compute_coordinates(coord, size, padding_mode, align_corners); + return coord; +} template struct ApplyGridSample; - template - struct ApplyGridSample +template +struct ApplyGridSample +{ + const bool must_in_bound = padding != PaddingMode::Zeros; + inline std::tuple compute_interp_params_d3(float x, float y) const { - const bool must_in_bound = padding != PaddingMode::Zeros; - inline std::tuple compute_interp_params_d3(float x, float y) const - { - auto x_w = std::floor(x); - auto y_n = std::floor(y); + auto x_w = std::floor(x); + auto y_n = std::floor(y); - auto w = x - x_w; - auto e = 1.0f - w; - auto n = y - y_n; - auto s = 1.0f - n; + auto w = x - x_w; + auto e = 1.0f - w; + auto n = y - y_n; + auto s = 1.0f - n; - auto nw = s * e; - auto ne = s * w; - auto sw = n * e; - auto se = n * w; + auto nw = s * e; + auto ne = s * w; + auto sw = n * e; + auto se = n * w; - return std::make_tuple(nw, ne, sw, 
se); - } + return std::make_tuple(nw, ne, sw, se); + } - inline int forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) - { - int dims = input.dims; - int w = input.w; - int h = input.h; - int channels = input.c; + inline int forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) + { + int dims = input.dims; + int w = input.w; + int h = input.h; + int channels = input.c; - if (dims == 3) + if (dims == 3) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - float* output_ptr = output.data; + float* output_ptr = output.data; - const Mat image = input.channel(q); + const Mat image = input.channel(q); - const float* gx_ptr = grid.channel(0); - const float* gy_ptr = grid.channel(1); + const float* gx_ptr = grid.channel(0); + const float* gy_ptr = grid.channel(1); - for (int y = 0; y < h; y++) + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) { - for (int x = 0; x < w; x++) + auto gx = grid_sampler_compute_source_index(*gx_ptr, w, padding, align_corners); + auto gy = grid_sampler_compute_source_index(*gy_ptr, h, padding, align_corners); + + auto interp_params = compute_interp_params_d3(gx, gy); + + auto nw = std::get<0>(interp_params); + auto ne = std::get<1>(interp_params); + auto sw = std::get<2>(interp_params); + auto se = std::get<3>(interp_params); + + auto i_x = static_cast(std::floor(gx)); + auto i_y = static_cast(std::floor(gy)); + + float v = 0.0f; + if (must_in_bound) { - auto gx = grid_sampler_compute_source_index(*gx_ptr, w, padding, align_corners); - auto gy = grid_sampler_compute_source_index(*gy_ptr, h, padding, align_corners); - - auto interp_params = compute_interp_params_d3(gx, gy); - - auto nw = std::get<0>(interp_params); - auto ne = std::get<1>(interp_params); - auto sw = std::get<2>(interp_params); - auto se = std::get<3>(interp_params); - - auto i_x = static_cast(std::floor(gx)); - auto i_y = static_cast(std::floor(gy)); - - float v = 0.0f; - if (must_in_bound) - { - //out of range, val is 0 https://github.com/pytorch/pytorch/blob/435e78e5237d9fb3e433fff6ce028569db937264/aten/src/ATen/native/cpu/GridSamplerKernel.cpp#L520 - auto nw_val = image.row(i_x)[i_y]; - auto ne_val = i_y + 1 < h ? image.row(i_x)[i_y + 1] : 0; - auto sw_val = i_x + 1 < w ? image.row(i_x + 1)[i_y] : 0; - auto se_val = i_x + 1 < w && i_y + 1 < h ? image.row(i_x + 1)[i_y + 1] : 0; - - v = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; - } - else //PaddingMode::Zeors - { - auto x0 = i_x; - auto x1 = i_x + 1; - auto y0 = i_y; - auto y1 = i_y + 1; - - auto x0_in_range = (x0 > -1) & (x0 < w); - auto x1_in_range = (x1 > -1) & (x1 < w); - auto y0_in_range = (y0 > -1) & (y0 < w); - auto y1_in_range = (y1 > -1) & (y1 < w); - - auto v00_in_range = x0_in_range & y0_in_range; - auto v01_in_range = x0_in_range & y1_in_range; - auto v10_in_range = x1_in_range & y0_in_range; - auto v11_in_range = x1_in_range & y1_in_range; - - auto nw_val = v00_in_range ? image.row(x0)[y0] : 0; - auto ne_val = v01_in_range ? image.row(x0)[y1] : 0; - auto sw_val = v10_in_range ? image.row(x1)[y0] : 0; - auto se_val = v11_in_range ? 
image.row(x1)[y1] : 0; - - v = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; - } - - *output = v; - - output++; - fxptr++; - fyptr++; + //out of range, val is 0 https://github.com/pytorch/pytorch/blob/435e78e5237d9fb3e433fff6ce028569db937264/aten/src/ATen/native/cpu/GridSamplerKernel.cpp#L520 + auto nw_val = image.row(i_x)[i_y]; + auto ne_val = i_y + 1 < h ? image.row(i_x)[i_y + 1] : 0; + auto sw_val = i_x + 1 < w ? image.row(i_x + 1)[i_y] : 0; + auto se_val = i_x + 1 < w && i_y + 1 < h ? image.row(i_x + 1)[i_y + 1] : 0; + + v = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; } + else //PaddingMode::Zeors + { + auto x0 = i_x; + auto x1 = i_x + 1; + auto y0 = i_y; + auto y1 = i_y + 1; + + auto x0_in_range = (x0 > -1) & (x0 < w); + auto x1_in_range = (x1 > -1) & (x1 < w); + auto y0_in_range = (y0 > -1) & (y0 < w); + auto y1_in_range = (y1 > -1) & (y1 < w); + + auto v00_in_range = x0_in_range & y0_in_range; + auto v01_in_range = x0_in_range & y1_in_range; + auto v10_in_range = x1_in_range & y0_in_range; + auto v11_in_range = x1_in_range & y1_in_range; + + auto nw_val = v00_in_range ? image.row(x0)[y0] : 0; + auto ne_val = v01_in_range ? image.row(x0)[y1] : 0; + auto sw_val = v10_in_range ? image.row(x1)[y0] : 0; + auto se_val = v11_in_range ? image.row(x1)[y1] : 0; + + v = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; + } + + *output = v; + + output++; + fxptr++; + fyptr++; } } } - else if (dims == 4) - { - - } - else - { - return -100; - } } - }; - - template - struct ApplyGridSample - { - inline void forward(const Mat& input, const Mat& grid, Mat& output) + else if (dims == 4) { - } - }; + else + { + return -100; + } + } +}; - template - struct ApplyGridSample +template +struct ApplyGridSample +{ + inline void forward(const Mat& input, const Mat& grid, Mat& output) { - inline void forward(const Mat& input, const Mat& grid, Mat& output) - { + } +}; - } - }; +template +struct ApplyGridSample +{ + inline void forward(const Mat& input, const Mat& grid, Mat& output) + { + } +}; GridSample::GridSample() { From e16a3f48f5d4d483eaaaca55c70914e7a5bef739 Mon Sep 17 00:00:00 2001 From: Yoh Date: Thu, 27 Oct 2022 20:38:36 +0800 Subject: [PATCH 007/127] add bilinear --- src/layer/gridsample.cpp | 202 ++++++++++-------- src/layer/gridsample.h | 6 +- tools/pnnx/src/pass_ncnn/F_grid_sample.cpp | 4 +- .../pnnx/src/pass_ncnn/solve_batch_index.cpp | 6 + 4 files changed, 122 insertions(+), 96 deletions(-) diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index 82b122999ce..008bcdf4531 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -17,12 +17,12 @@ #include namespace ncnn { -enum InterpolationMode -{ - Bilinear = 1, - Nearest = 2, - Bicubic = 3 -}; + enum InterpolationMode + { + Bilinear = 1, + Nearest = 2, + Bicubic = 3 + }; enum PaddingMode { @@ -30,21 +30,21 @@ enum InterpolationMode Border = 2, Reflection = 3 }; - - static inline int64_t clip_coordinates(int64_t in, int64_t clip_limit) { - return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); + + static inline float clip_coordinates(float in, int64_t clip_limit) { + return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); } - static inline int64_t reflect_coordinates(int64_t in, int64_t twice_low, + static inline float reflect_coordinates(float in, int64_t twice_low, int64_t twice_high) { if (twice_low == twice_high) { - return static_cast(0); + return static_cast(0); } - int64_t min = static_cast(twice_low) / 2; - int64_t span = static_cast(twice_high 
- twice_low) / 2; + float min = static_cast(twice_low) / 2; + float span = static_cast(twice_high - twice_low) / 2; in = std::fabs(in - min); // `fmod` returns same sign as `in`, which is positive after the `fabs` above. - int64_t extra = std::fmod(in, span); + float extra = std::fmod(in, span); int flips = static_cast(std::floor(in / span)); if (flips % 2 == 0) { return extra + min; @@ -54,8 +54,7 @@ enum InterpolationMode } } - template - static inline elem_type compute_coordinates(elem_type coord, int64_t size, + static inline float compute_coordinates(float coord, int64_t size, PaddingMode padding_mode, bool align_corners) { if (padding_mode == PaddingMode::Border) { @@ -76,8 +75,7 @@ enum InterpolationMode return coord; } - template - static inline elem_type grid_sampler_unnormalize(elem_type coord, int64_t size, + static inline float grid_sampler_unnormalize(float coord, int64_t size, bool align_corners) { if (align_corners) { // unnormalize coord from [-1, 1] to [0, size - 1] @@ -89,9 +87,8 @@ enum InterpolationMode } } - template - static inline elem_type grid_sampler_compute_source_index( - elem_type coord, + static inline float grid_sampler_compute_source_index( + float coord, int64_t size, PaddingMode padding_mode, bool align_corners) { @@ -100,8 +97,8 @@ enum InterpolationMode return coord; } -template -struct ApplyGridSample; + template + struct ApplyGridSample; template struct ApplyGridSample @@ -127,32 +124,36 @@ struct ApplyGridSample; inline int forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) { - int dims = input.dims; - int w = input.w; - int h = input.h; - int channels = input.c; + const int dims = input.dims; + const int w = input.w; + const int h = input.h; + const int outW = grid.h; + const int outH = grid.c; + const int channels = input.c; if (dims == 3) { - #pragma omp parallel for num_threads(opt.num_threads) + output.create(outW, outH, input.c); +#pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { - float* output_ptr = output.data; + float* output_ptr = static_cast(output.channel(q).data); const Mat image = input.channel(q); - const float* gx_ptr = grid.channel(0); - const float* gy_ptr = grid.channel(1); + //const float* gxy_ptr = static_cast(grid.data); - for (int y = 0; y < h; y++) + for (int y = 0; y < outH; y++) { - for (int x = 0; x < w; x++) + for (int x = 0; x < outW; x++) { - auto gx = grid_sampler_compute_source_index(*gx_ptr, w, padding, align_corners); - auto gy = grid_sampler_compute_source_index(*gy_ptr, h, padding, align_corners); + const float* gxy_ptr = grid.channel(y).row(x); + auto gx = grid_sampler_compute_source_index(gxy_ptr[0], w, padding, align_corners); + auto gy = grid_sampler_compute_source_index(gxy_ptr[1], h, padding, align_corners); auto interp_params = compute_interp_params_d3(gx, gy); + auto nw = std::get<0>(interp_params); auto ne = std::get<1>(interp_params); auto sw = std::get<2>(interp_params); @@ -162,13 +163,13 @@ struct ApplyGridSample; auto i_y = static_cast(std::floor(gy)); float v = 0.0f; - if (must_in_bound) + if (must_in_bound) { //out of range, val is 0 https://github.com/pytorch/pytorch/blob/435e78e5237d9fb3e433fff6ce028569db937264/aten/src/ATen/native/cpu/GridSamplerKernel.cpp#L520 - auto nw_val = image.row(i_x)[i_y]; - auto ne_val = i_y + 1 < h ? image.row(i_x)[i_y + 1] : 0; - auto sw_val = i_x + 1 < w ? image.row(i_x + 1)[i_y] : 0; - auto se_val = i_x + 1 < w && i_y + 1 < h ? 
image.row(i_x + 1)[i_y + 1] : 0; + auto nw_val = image.row(i_y)[i_x]; + auto ne_val = i_x + 1 < w ? image.row(i_y)[i_x + 1] : 0; + auto sw_val = i_y + 1 < h ? image.row(i_y + 1)[i_x] : 0; + auto se_val = ((i_x + 1 < w) & (i_y + 1 < h)) ? image.row(i_y + 1)[i_x + 1] : 0; v = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; } @@ -181,27 +182,28 @@ struct ApplyGridSample; auto x0_in_range = (x0 > -1) & (x0 < w); auto x1_in_range = (x1 > -1) & (x1 < w); - auto y0_in_range = (y0 > -1) & (y0 < w); - auto y1_in_range = (y1 > -1) & (y1 < w); + auto y0_in_range = (y0 > -1) & (y0 < h); + auto y1_in_range = (y1 > -1) & (y1 < h); auto v00_in_range = x0_in_range & y0_in_range; auto v01_in_range = x0_in_range & y1_in_range; auto v10_in_range = x1_in_range & y0_in_range; auto v11_in_range = x1_in_range & y1_in_range; - auto nw_val = v00_in_range ? image.row(x0)[y0] : 0; - auto ne_val = v01_in_range ? image.row(x0)[y1] : 0; - auto sw_val = v10_in_range ? image.row(x1)[y0] : 0; - auto se_val = v11_in_range ? image.row(x1)[y1] : 0; + auto nw_val = v00_in_range ? image.row(y0)[x0] : 0; + auto ne_val = v10_in_range ? image.row(y0)[x1] : 0; + auto sw_val = v01_in_range ? image.row(y1)[x0] : 0; + auto se_val = v11_in_range ? image.row(y1)[x1] : 0; v = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; } - *output = v; - output++; - fxptr++; - fyptr++; + + *output_ptr = v; + + output_ptr++; + gxy_ptr += 2; } } } @@ -220,7 +222,7 @@ struct ApplyGridSample; template struct ApplyGridSample { - inline void forward(const Mat& input, const Mat& grid, Mat& output) + inline void forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) { } @@ -229,56 +231,74 @@ struct ApplyGridSample; template struct ApplyGridSample { - inline void forward(const Mat& input, const Mat& grid, Mat& output) + inline void forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) { } }; -GridSample::GridSample() -{ - one_blob_only = false; - support_inplace = false; -} - -int GridSample::load_param(const ParamDict& pd) -{ - mode = pd.get(0, 0); - padding_mode = pd.get(1, 0); - align_corners = pd.get(6, 0); - - return 0; -} - -int GridSample::forward(const std::vector& bottom_blobs, Mat& top_blobs, const Option& opt) const -{ -#define HANDLE_PADDING(interp, padding, align_corners) \ - case padding: \ - { \ - printf("mode: %d, padding_mode: %d, align: %d", interp, padding, align_corners); \ - break; \ + GridSample::GridSample() + { + one_blob_only = false; + support_inplace = false; } -#define HANDLE_INTERP(interp, align_corners) \ - case interp: \ - { \ - switch (static_cast(padding_mode)) \ - { \ - HANDLE_PADDING(interp, PaddingMode::Zeros, align_corners) \ - HANDLE_PADDING(interp, PaddingMode::Border, align_corners) \ - HANDLE_PADDING(interp, PaddingMode::Reflection, align_corners) \ - } \ - break; \ + int GridSample::load_param(const ParamDict& pd) + { + mode = pd.get(0, 0); + padding_mode = pd.get(1, 0); + align_corners = pd.get(6, 0); + + return 0; } - switch (static_cast(mode)) + int GridSample::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { - HANDLE_INTERP(InterpolationMode::Bilinear, align_corners); - HANDLE_INTERP(InterpolationMode::Nearest, align_corners); - HANDLE_INTERP(InterpolationMode::Bicubic, align_corners); + #define HANDLE_PADDING(interp, padding, align_corners) \ + case padding: \ + { \ + ApplyGridSample func; \ + func.forward(bottom_blobs[0], bottom_blobs[1], top_blobs[0], opt); \ + break; \ + } + + #define HANDLE_INTERP(interp, 
align_corners) \ + case interp: \ + { \ + switch (static_cast(padding_mode)) \ + { \ + HANDLE_PADDING(interp, PaddingMode::Zeros, align_corners) \ + HANDLE_PADDING(interp, PaddingMode::Border, align_corners) \ + HANDLE_PADDING(interp, PaddingMode::Reflection, align_corners) \ + } \ + break; \ + } + + + + if (align_corners == true) + { + switch (static_cast(mode)) + { + HANDLE_INTERP(InterpolationMode::Bilinear, true); + HANDLE_INTERP(InterpolationMode::Nearest, true); + HANDLE_INTERP(InterpolationMode::Bicubic, true); + } + } + else + { + switch (static_cast(mode)) + { + HANDLE_INTERP(InterpolationMode::Bilinear, false); + HANDLE_INTERP(InterpolationMode::Nearest, false); + HANDLE_INTERP(InterpolationMode::Bicubic, false); + } + } + #undef HANDLE_PADDING + #undef HANDLE_INTERP + + + return 0; } -#undef HANDLE_PADDING -#undef HANDLE_INTERP -} } // namespace ncnn diff --git a/src/layer/gridsample.h b/src/layer/gridsample.h index 28bbedf6c66..0d51b548e70 100644 --- a/src/layer/gridsample.h +++ b/src/layer/gridsample.h @@ -12,8 +12,8 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#ifndef LAYER_UNARYOP_H -#define LAYER_UNARYOP_H +#ifndef LAYER_GRIDSAMPLE_H +#define LAYER_GRIDSAMPLE_H #include "layer.h" @@ -26,7 +26,7 @@ class GridSample : public Layer virtual int load_param(const ParamDict& pd); - virtual int forward(const std::vector& bottom_blob, Mat& top_blob, const Option& opt) const; + virtual int forward(const std::vector& bottom_blob, std::vector& top_blobs, const Option& opt) const; public: // param diff --git a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp index f2e8152bacb..2c61c245473 100644 --- a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp +++ b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp @@ -35,7 +35,7 @@ pnnx.Output output 1 0 out const char* type_str() const { - return "Grid_sample"; + return "GridSample"; } const char* name_str() const @@ -55,7 +55,7 @@ pnnx.Output output 1 0 out if (mode == "bicubic") op->params["0"] = 3; - if (padding_mode == "zeors") + if (padding_mode == "zeros") op->params["1"] = 1; if (padding_mode == "border") op->params["1"] = 2; diff --git a/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp b/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp index 13049e5f05b..dfe6d65f663 100644 --- a/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp +++ b/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp @@ -283,6 +283,12 @@ void solve_batch_index(Graph& graph) { if (is_known_operator_with_batch_index_0(op)) { + if (op->type == std::string("F.grid_sample")) + { + // grid_sample's grid input may be a 5d tensor :( + op->inputs[1]->params["__batch_index"] = 0; + } + op->inputs[0]->params["__batch_index"] = 0; op->outputs[0]->params["__batch_index"] = 0; } From 54f3bd7cd8842406342b46325581d25a4b0e4aa6 Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Thu, 27 Oct 2022 12:44:20 +0000 Subject: [PATCH 008/127] apply code-format changes --- src/layer/gridsample.cpp | 350 +++++++++++++++++++-------------------- 1 file changed, 175 insertions(+), 175 deletions(-) diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index 42736d2eefb..ba8acaf6fcb 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -17,46 +17,51 @@ #include namespace ncnn { - enum InterpolationMode - { - Bilinear = 1, - Nearest = 2, - Bicubic = 3 - }; +enum InterpolationMode +{ + Bilinear = 1, + Nearest = 2, + Bicubic = 3 +}; - enum PaddingMode - { - 
Zeros = 1, - Border = 2, - Reflection = 3 - }; +enum PaddingMode +{ + Zeros = 1, + Border = 2, + Reflection = 3 +}; - static inline float clip_coordinates(float in, int64_t clip_limit) { - return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); - } +static inline float clip_coordinates(float in, int64_t clip_limit) +{ + return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); +} - static inline float reflect_coordinates(float in, int64_t twice_low, - int64_t twice_high) { - if (twice_low == twice_high) { - return static_cast(0); - } - float min = static_cast(twice_low) / 2; - float span = static_cast(twice_high - twice_low) / 2; - in = std::fabs(in - min); - // `fmod` returns same sign as `in`, which is positive after the `fabs` above. - float extra = std::fmod(in, span); - int flips = static_cast(std::floor(in / span)); - if (flips % 2 == 0) { - return extra + min; - } - else { - return span - extra + min; - } +static inline float reflect_coordinates(float in, int64_t twice_low, + int64_t twice_high) +{ + if (twice_low == twice_high) + { + return static_cast(0); + } + float min = static_cast(twice_low) / 2; + float span = static_cast(twice_high - twice_low) / 2; + in = std::fabs(in - min); + // `fmod` returns same sign as `in`, which is positive after the `fabs` above. + float extra = std::fmod(in, span); + int flips = static_cast(std::floor(in / span)); + if (flips % 2 == 0) + { + return extra + min; + } + else + { + return span - extra + min; } +} - static inline float compute_coordinates(float coord, int64_t size, - PaddingMode padding_mode, - bool align_corners) +static inline float compute_coordinates(float coord, int64_t size, + PaddingMode padding_mode, + bool align_corners) { if (padding_mode == PaddingMode::Border) { @@ -80,30 +85,34 @@ namespace ncnn { return coord; } - static inline float grid_sampler_unnormalize(float coord, int64_t size, - bool align_corners) { - if (align_corners) { - // unnormalize coord from [-1, 1] to [0, size - 1] - return ((coord + 1) / 2) * (size - 1); - } - else { - // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] - return ((coord + 1) * size - 1) / 2; - } +static inline float grid_sampler_unnormalize(float coord, int64_t size, + bool align_corners) +{ + if (align_corners) + { + // unnormalize coord from [-1, 1] to [0, size - 1] + return ((coord + 1) / 2) * (size - 1); } - - static inline float grid_sampler_compute_source_index( - float coord, - int64_t size, - PaddingMode padding_mode, - bool align_corners) { - coord = grid_sampler_unnormalize(coord, size, align_corners); - coord = compute_coordinates(coord, size, padding_mode, align_corners); - return coord; + else + { + // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] + return ((coord + 1) * size - 1) / 2; } +} + +static inline float grid_sampler_compute_source_index( + float coord, + int64_t size, + PaddingMode padding_mode, + bool align_corners) +{ + coord = grid_sampler_unnormalize(coord, size, align_corners); + coord = compute_coordinates(coord, size, padding_mode, align_corners); + return coord; +} - template - struct ApplyGridSample; +template +struct ApplyGridSample; template struct ApplyGridSample @@ -127,54 +136,53 @@ struct ApplyGridSample return std::make_tuple(nw, ne, sw, se); } - inline int forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) + inline int forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) + { + const int dims = input.dims; + const int w = input.w; + const int h = input.h; 
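+        // the grid blob packs (x, y) pairs along w, so grid.h and grid.c give the output width and height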
+ const int outW = grid.h; + const int outH = grid.c; + const int channels = input.c; + + if (dims == 3) { - const int dims = input.dims; - const int w = input.w; - const int h = input.h; - const int outW = grid.h; - const int outH = grid.c; - const int channels = input.c; - - if (dims == 3) + output.create(outW, outH, input.c); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - output.create(outW, outH, input.c); -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - float* output_ptr = static_cast(output.channel(q).data); + float* output_ptr = static_cast(output.channel(q).data); const Mat image = input.channel(q); - //const float* gxy_ptr = static_cast(grid.data); + //const float* gxy_ptr = static_cast(grid.data); - for (int y = 0; y < outH; y++) + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - const float* gxy_ptr = grid.channel(y).row(x); - auto gx = grid_sampler_compute_source_index(gxy_ptr[0], w, padding, align_corners); - auto gy = grid_sampler_compute_source_index(gxy_ptr[1], h, padding, align_corners); + const float* gxy_ptr = grid.channel(y).row(x); + auto gx = grid_sampler_compute_source_index(gxy_ptr[0], w, padding, align_corners); + auto gy = grid_sampler_compute_source_index(gxy_ptr[1], h, padding, align_corners); auto interp_params = compute_interp_params_d3(gx, gy); - - auto nw = std::get<0>(interp_params); - auto ne = std::get<1>(interp_params); - auto sw = std::get<2>(interp_params); - auto se = std::get<3>(interp_params); + auto nw = std::get<0>(interp_params); + auto ne = std::get<1>(interp_params); + auto sw = std::get<2>(interp_params); + auto se = std::get<3>(interp_params); auto i_x = static_cast(std::floor(gx)); auto i_y = static_cast(std::floor(gy)); - float v = 0.0f; - if (must_in_bound) - { - //out of range, val is 0 https://github.com/pytorch/pytorch/blob/435e78e5237d9fb3e433fff6ce028569db937264/aten/src/ATen/native/cpu/GridSamplerKernel.cpp#L520 - auto nw_val = image.row(i_y)[i_x]; - auto ne_val = i_x + 1 < w ? image.row(i_y)[i_x + 1] : 0; - auto sw_val = i_y + 1 < h ? image.row(i_y + 1)[i_x] : 0; - auto se_val = ((i_x + 1 < w) & (i_y + 1 < h)) ? image.row(i_y + 1)[i_x + 1] : 0; + float v = 0.0f; + if (must_in_bound) + { + //out of range, val is 0 https://github.com/pytorch/pytorch/blob/435e78e5237d9fb3e433fff6ce028569db937264/aten/src/ATen/native/cpu/GridSamplerKernel.cpp#L520 + auto nw_val = image.row(i_y)[i_x]; + auto ne_val = i_x + 1 < w ? image.row(i_y)[i_x + 1] : 0; + auto sw_val = i_y + 1 < h ? image.row(i_y + 1)[i_x] : 0; + auto se_val = ((i_x + 1 < w) & (i_y + 1 < h)) ? image.row(i_y + 1)[i_x + 1] : 0; v = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; } @@ -185,125 +193,117 @@ struct ApplyGridSample auto y0 = i_y; auto y1 = i_y + 1; - auto x0_in_range = (x0 > -1) & (x0 < w); - auto x1_in_range = (x1 > -1) & (x1 < w); - auto y0_in_range = (y0 > -1) & (y0 < h); - auto y1_in_range = (y1 > -1) & (y1 < h); + auto x0_in_range = (x0 > -1) & (x0 < w); + auto x1_in_range = (x1 > -1) & (x1 < w); + auto y0_in_range = (y0 > -1) & (y0 < h); + auto y1_in_range = (y1 > -1) & (y1 < h); auto v00_in_range = x0_in_range & y0_in_range; auto v01_in_range = x0_in_range & y1_in_range; auto v10_in_range = x1_in_range & y0_in_range; auto v11_in_range = x1_in_range & y1_in_range; - auto nw_val = v00_in_range ? image.row(y0)[x0] : 0; - auto ne_val = v10_in_range ? image.row(y0)[x1] : 0; - auto sw_val = v01_in_range ? 
image.row(y1)[x0] : 0; - auto se_val = v11_in_range ? image.row(y1)[x1] : 0; + auto nw_val = v00_in_range ? image.row(y0)[x0] : 0; + auto ne_val = v10_in_range ? image.row(y0)[x1] : 0; + auto sw_val = v01_in_range ? image.row(y1)[x0] : 0; + auto se_val = v11_in_range ? image.row(y1)[x1] : 0; v = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; } + *output_ptr = v; - - *output_ptr = v; - - output_ptr++; - gxy_ptr += 2; - } + output_ptr++; + gxy_ptr += 2; } } } - else if (dims == 4) - { - - } - else - { - return -100; - } } - }; - - template - struct ApplyGridSample - { - inline void forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) + else if (dims == 4) { - } - }; - - template - struct ApplyGridSample - { - inline void forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) + else { - + return -100; } - }; + } +}; - GridSample::GridSample() +template +struct ApplyGridSample +{ + inline void forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) { - one_blob_only = false; - support_inplace = false; } +}; - int GridSample::load_param(const ParamDict& pd) +template +struct ApplyGridSample +{ + inline void forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) { - mode = pd.get(0, 0); - padding_mode = pd.get(1, 0); - align_corners = pd.get(6, 0); - - return 0; } +}; - int GridSample::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const - { - #define HANDLE_PADDING(interp, padding, align_corners) \ - case padding: \ - { \ - ApplyGridSample func; \ - func.forward(bottom_blobs[0], bottom_blobs[1], top_blobs[0], opt); \ - break; \ - } +GridSample::GridSample() +{ + one_blob_only = false; + support_inplace = false; +} - #define HANDLE_INTERP(interp, align_corners) \ - case interp: \ - { \ - switch (static_cast(padding_mode)) \ - { \ - HANDLE_PADDING(interp, PaddingMode::Zeros, align_corners) \ - HANDLE_PADDING(interp, PaddingMode::Border, align_corners) \ - HANDLE_PADDING(interp, PaddingMode::Reflection, align_corners) \ - } \ - break; \ - } +int GridSample::load_param(const ParamDict& pd) +{ + mode = pd.get(0, 0); + padding_mode = pd.get(1, 0); + align_corners = pd.get(6, 0); + + return 0; +} +int GridSample::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ +#define HANDLE_PADDING(interp, padding, align_corners) \ + case padding: \ + { \ + ApplyGridSample func; \ + func.forward(bottom_blobs[0], bottom_blobs[1], top_blobs[0], opt); \ + break; \ + } +#define HANDLE_INTERP(interp, align_corners) \ + case interp: \ + { \ + switch (static_cast(padding_mode)) \ + { \ + HANDLE_PADDING(interp, PaddingMode::Zeros, align_corners) \ + HANDLE_PADDING(interp, PaddingMode::Border, align_corners) \ + HANDLE_PADDING(interp, PaddingMode::Reflection, align_corners) \ + } \ + break; \ + } - if (align_corners == true) + if (align_corners == true) + { + switch (static_cast(mode)) { - switch (static_cast(mode)) - { - HANDLE_INTERP(InterpolationMode::Bilinear, true); - HANDLE_INTERP(InterpolationMode::Nearest, true); - HANDLE_INTERP(InterpolationMode::Bicubic, true); - } + HANDLE_INTERP(InterpolationMode::Bilinear, true); + HANDLE_INTERP(InterpolationMode::Nearest, true); + HANDLE_INTERP(InterpolationMode::Bicubic, true); } - else + } + else + { + switch (static_cast(mode)) { - switch (static_cast(mode)) - { - HANDLE_INTERP(InterpolationMode::Bilinear, false); - HANDLE_INTERP(InterpolationMode::Nearest, false); - 
HANDLE_INTERP(InterpolationMode::Bicubic, false); - } + HANDLE_INTERP(InterpolationMode::Bilinear, false); + HANDLE_INTERP(InterpolationMode::Nearest, false); + HANDLE_INTERP(InterpolationMode::Bicubic, false); } - #undef HANDLE_PADDING - #undef HANDLE_INTERP - - - return 0; } +#undef HANDLE_PADDING +#undef HANDLE_INTERP + + return 0; +} } // namespace ncnn From bc1967f86dbc3583861ce5fa09cea09800f53f1d Mon Sep 17 00:00:00 2001 From: Yoh Date: Fri, 28 Oct 2022 17:06:25 +0800 Subject: [PATCH 009/127] add nearest interpolation --- src/layer/gridsample.cpp | 154 ++++++++++++++++++++++++++------------- 1 file changed, 102 insertions(+), 52 deletions(-) diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index 42736d2eefb..303b5ee7df0 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -127,54 +127,54 @@ struct ApplyGridSample return std::make_tuple(nw, ne, sw, se); } - inline int forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) + inline int forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) + { + const int dims = input.dims; + const int w = input.w; + const int h = input.h; + const int outW = grid.h; + const int outH = grid.c; + const int channels = input.c; + + if (dims == 3) { - const int dims = input.dims; - const int w = input.w; - const int h = input.h; - const int outW = grid.h; - const int outH = grid.c; - const int channels = input.c; - - if (dims == 3) - { - output.create(outW, outH, input.c); + output.create(outW, outH, input.c); #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - float* output_ptr = static_cast(output.channel(q).data); + for (int q = 0; q < channels; q++) + { + float* output_ptr = static_cast(output.channel(q).data); const Mat image = input.channel(q); - //const float* gxy_ptr = static_cast(grid.data); + //const float* gxy_ptr = static_cast(grid.data); - for (int y = 0; y < outH; y++) + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - const float* gxy_ptr = grid.channel(y).row(x); - auto gx = grid_sampler_compute_source_index(gxy_ptr[0], w, padding, align_corners); - auto gy = grid_sampler_compute_source_index(gxy_ptr[1], h, padding, align_corners); + const float* gxy_ptr = grid.channel(y).row(x); + auto gx = grid_sampler_compute_source_index(gxy_ptr[0], w, padding, align_corners); + auto gy = grid_sampler_compute_source_index(gxy_ptr[1], h, padding, align_corners); auto interp_params = compute_interp_params_d3(gx, gy); - auto nw = std::get<0>(interp_params); - auto ne = std::get<1>(interp_params); - auto sw = std::get<2>(interp_params); - auto se = std::get<3>(interp_params); + auto nw = std::get<0>(interp_params); + auto ne = std::get<1>(interp_params); + auto sw = std::get<2>(interp_params); + auto se = std::get<3>(interp_params); auto i_x = static_cast(std::floor(gx)); auto i_y = static_cast(std::floor(gy)); - float v = 0.0f; - if (must_in_bound) - { - //out of range, val is 0 https://github.com/pytorch/pytorch/blob/435e78e5237d9fb3e433fff6ce028569db937264/aten/src/ATen/native/cpu/GridSamplerKernel.cpp#L520 - auto nw_val = image.row(i_y)[i_x]; - auto ne_val = i_x + 1 < w ? image.row(i_y)[i_x + 1] : 0; - auto sw_val = i_y + 1 < h ? image.row(i_y + 1)[i_x] : 0; - auto se_val = ((i_x + 1 < w) & (i_y + 1 < h)) ? 
image.row(i_y + 1)[i_x + 1] : 0; + float v = 0.0f; + if (must_in_bound) + { + //out of range, val is 0 https://github.com/pytorch/pytorch/blob/435e78e5237d9fb3e433fff6ce028569db937264/aten/src/ATen/native/cpu/GridSamplerKernel.cpp#L520 + auto nw_val = image.row(i_y)[i_x]; + auto ne_val = i_x + 1 < w ? image.row(i_y)[i_x + 1] : 0; + auto sw_val = i_y + 1 < h ? image.row(i_y + 1)[i_x] : 0; + auto se_val = ((i_x + 1 < w) & (i_y + 1 < h)) ? image.row(i_y + 1)[i_x + 1] : 0; v = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; } @@ -185,30 +185,89 @@ struct ApplyGridSample auto y0 = i_y; auto y1 = i_y + 1; - auto x0_in_range = (x0 > -1) & (x0 < w); - auto x1_in_range = (x1 > -1) & (x1 < w); - auto y0_in_range = (y0 > -1) & (y0 < h); - auto y1_in_range = (y1 > -1) & (y1 < h); + auto x0_in_range = (x0 > -1) & (x0 < w); + auto x1_in_range = (x1 > -1) & (x1 < w); + auto y0_in_range = (y0 > -1) & (y0 < h); + auto y1_in_range = (y1 > -1) & (y1 < h); auto v00_in_range = x0_in_range & y0_in_range; auto v01_in_range = x0_in_range & y1_in_range; auto v10_in_range = x1_in_range & y0_in_range; auto v11_in_range = x1_in_range & y1_in_range; - auto nw_val = v00_in_range ? image.row(y0)[x0] : 0; - auto ne_val = v10_in_range ? image.row(y0)[x1] : 0; - auto sw_val = v01_in_range ? image.row(y1)[x0] : 0; - auto se_val = v11_in_range ? image.row(y1)[x1] : 0; + auto nw_val = v00_in_range ? image.row(y0)[x0] : 0; + auto ne_val = v10_in_range ? image.row(y0)[x1] : 0; + auto sw_val = v01_in_range ? image.row(y1)[x0] : 0; + auto se_val = v11_in_range ? image.row(y1)[x1] : 0; v = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; } + *output_ptr = v; + + output_ptr++; + } + } + } + } + else if (dims == 4) + { + + } + else + { + return -100; + } + } +}; + + template + struct ApplyGridSample + { + const bool must_in_bound = padding != PaddingMode::Zeros; + inline void forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) + { + const int dims = input.dims; + const int w = input.w; + const int h = input.h; + const int outW = grid.h; + const int outH = grid.c; + const int channels = input.c; + + if (dims == 3) + { + output.create(outW, outH, input.c); +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* output_ptr = static_cast(output.channel(q).data); + + const Mat image = input.channel(q); + + //const float* gxy_ptr = static_cast(grid.data); + + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gxy_ptr = grid.channel(y).row(x); + auto gx = grid_sampler_compute_source_index(gxy_ptr[0], w, padding, align_corners); + auto gy = grid_sampler_compute_source_index(gxy_ptr[1], h, padding, align_corners); + + auto x_nearest = static_cast(std::round(gx)); + auto y_nearest = static_cast(std::round(gy)); + + float v = image.row(y_nearest)[x_nearest]; + if (!must_in_bound) + { + v = ((x_nearest < w) & (x_nearest > -1) & (y_nearest < h) & (y_nearest > -1)) ? 
v : 0; + } + *output_ptr = v; output_ptr++; - gxy_ptr += 2; } } } @@ -219,17 +278,8 @@ struct ApplyGridSample } else { - return -100; - } - } - }; - - template - struct ApplyGridSample - { - inline void forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) - { + } } }; From 973b80bf8dc1ccd0f499ee24e506f9e198b2e047 Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Fri, 28 Oct 2022 09:08:17 +0000 Subject: [PATCH 010/127] apply code-format changes --- src/layer/gridsample.cpp | 323 +++++++++++++++++++-------------------- 1 file changed, 161 insertions(+), 162 deletions(-) diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index 303b5ee7df0..24f0472b4f8 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -17,46 +17,51 @@ #include namespace ncnn { - enum InterpolationMode - { - Bilinear = 1, - Nearest = 2, - Bicubic = 3 - }; +enum InterpolationMode +{ + Bilinear = 1, + Nearest = 2, + Bicubic = 3 +}; - enum PaddingMode - { - Zeros = 1, - Border = 2, - Reflection = 3 - }; +enum PaddingMode +{ + Zeros = 1, + Border = 2, + Reflection = 3 +}; - static inline float clip_coordinates(float in, int64_t clip_limit) { - return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); - } +static inline float clip_coordinates(float in, int64_t clip_limit) +{ + return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); +} - static inline float reflect_coordinates(float in, int64_t twice_low, - int64_t twice_high) { - if (twice_low == twice_high) { - return static_cast(0); - } - float min = static_cast(twice_low) / 2; - float span = static_cast(twice_high - twice_low) / 2; - in = std::fabs(in - min); - // `fmod` returns same sign as `in`, which is positive after the `fabs` above. - float extra = std::fmod(in, span); - int flips = static_cast(std::floor(in / span)); - if (flips % 2 == 0) { - return extra + min; - } - else { - return span - extra + min; - } +static inline float reflect_coordinates(float in, int64_t twice_low, + int64_t twice_high) +{ + if (twice_low == twice_high) + { + return static_cast(0); } + float min = static_cast(twice_low) / 2; + float span = static_cast(twice_high - twice_low) / 2; + in = std::fabs(in - min); + // `fmod` returns same sign as `in`, which is positive after the `fabs` above. 
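+    // e.g. reflect_coordinates(5.5f, 0, 8): min = 0, span = 4, extra = 1.5,
+    // flips = 1 (odd), so the result is 4 - 1.5 = 2.5; a point 1.5 past the
+    // right edge of [0, 4] folds back to 2.5.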
+ float extra = std::fmod(in, span); + int flips = static_cast(std::floor(in / span)); + if (flips % 2 == 0) + { + return extra + min; + } + else + { + return span - extra + min; + } +} - static inline float compute_coordinates(float coord, int64_t size, - PaddingMode padding_mode, - bool align_corners) +static inline float compute_coordinates(float coord, int64_t size, + PaddingMode padding_mode, + bool align_corners) { if (padding_mode == PaddingMode::Border) { @@ -80,30 +85,34 @@ namespace ncnn { return coord; } - static inline float grid_sampler_unnormalize(float coord, int64_t size, - bool align_corners) { - if (align_corners) { - // unnormalize coord from [-1, 1] to [0, size - 1] - return ((coord + 1) / 2) * (size - 1); - } - else { - // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] - return ((coord + 1) * size - 1) / 2; - } +static inline float grid_sampler_unnormalize(float coord, int64_t size, + bool align_corners) +{ + if (align_corners) + { + // unnormalize coord from [-1, 1] to [0, size - 1] + return ((coord + 1) / 2) * (size - 1); } - - static inline float grid_sampler_compute_source_index( - float coord, - int64_t size, - PaddingMode padding_mode, - bool align_corners) { - coord = grid_sampler_unnormalize(coord, size, align_corners); - coord = compute_coordinates(coord, size, padding_mode, align_corners); - return coord; + else + { + // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] + return ((coord + 1) * size - 1) / 2; } +} - template - struct ApplyGridSample; +static inline float grid_sampler_compute_source_index( + float coord, + int64_t size, + PaddingMode padding_mode, + bool align_corners) +{ + coord = grid_sampler_unnormalize(coord, size, align_corners); + coord = compute_coordinates(coord, size, padding_mode, align_corners); + return coord; +} + +template +struct ApplyGridSample; template struct ApplyGridSample @@ -139,7 +148,7 @@ struct ApplyGridSample if (dims == 3) { output.create(outW, outH, input.c); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { float* output_ptr = static_cast(output.channel(q).data); @@ -158,7 +167,6 @@ struct ApplyGridSample auto interp_params = compute_interp_params_d3(gx, gy); - auto nw = std::get<0>(interp_params); auto ne = std::get<1>(interp_params); auto sw = std::get<2>(interp_params); @@ -203,8 +211,6 @@ struct ApplyGridSample v = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; } - - *output_ptr = v; output_ptr++; @@ -214,7 +220,6 @@ struct ApplyGridSample } else if (dims == 4) { - } else { @@ -223,137 +228,131 @@ struct ApplyGridSample } }; - template - struct ApplyGridSample +template +struct ApplyGridSample +{ + const bool must_in_bound = padding != PaddingMode::Zeros; + inline void forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) { - const bool must_in_bound = padding != PaddingMode::Zeros; - inline void forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) + const int dims = input.dims; + const int w = input.w; + const int h = input.h; + const int outW = grid.h; + const int outH = grid.c; + const int channels = input.c; + + if (dims == 3) { - const int dims = input.dims; - const int w = input.w; - const int h = input.h; - const int outW = grid.h; - const int outH = grid.c; - const int channels = input.c; - - if (dims == 3) + output.create(outW, outH, input.c); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - 
output.create(outW, outH, input.c); -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - float* output_ptr = static_cast(output.channel(q).data); + float* output_ptr = static_cast(output.channel(q).data); - const Mat image = input.channel(q); + const Mat image = input.channel(q); - //const float* gxy_ptr = static_cast(grid.data); + //const float* gxy_ptr = static_cast(grid.data); - for (int y = 0; y < outH; y++) + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - const float* gxy_ptr = grid.channel(y).row(x); - auto gx = grid_sampler_compute_source_index(gxy_ptr[0], w, padding, align_corners); - auto gy = grid_sampler_compute_source_index(gxy_ptr[1], h, padding, align_corners); + const float* gxy_ptr = grid.channel(y).row(x); + auto gx = grid_sampler_compute_source_index(gxy_ptr[0], w, padding, align_corners); + auto gy = grid_sampler_compute_source_index(gxy_ptr[1], h, padding, align_corners); - auto x_nearest = static_cast(std::round(gx)); - auto y_nearest = static_cast(std::round(gy)); + auto x_nearest = static_cast(std::round(gx)); + auto y_nearest = static_cast(std::round(gy)); - float v = image.row(y_nearest)[x_nearest]; - if (!must_in_bound) - { - v = ((x_nearest < w) & (x_nearest > -1) & (y_nearest < h) & (y_nearest > -1)) ? v : 0; - } + float v = image.row(y_nearest)[x_nearest]; + if (!must_in_bound) + { + v = ((x_nearest < w) & (x_nearest > -1) & (y_nearest < h) & (y_nearest > -1)) ? v : 0; + } - *output_ptr = v; + *output_ptr = v; - output_ptr++; - } + output_ptr++; } } } - else if (dims == 4) - { - - } - else - { - - } } - }; - - template - struct ApplyGridSample - { - inline void forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) + else if (dims == 4) + { + } + else { - } - }; - - GridSample::GridSample() - { - one_blob_only = false; - support_inplace = false; } +}; - int GridSample::load_param(const ParamDict& pd) +template +struct ApplyGridSample +{ + inline void forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) { - mode = pd.get(0, 0); - padding_mode = pd.get(1, 0); - align_corners = pd.get(6, 0); - - return 0; } +}; - int GridSample::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const - { - #define HANDLE_PADDING(interp, padding, align_corners) \ - case padding: \ - { \ - ApplyGridSample func; \ - func.forward(bottom_blobs[0], bottom_blobs[1], top_blobs[0], opt); \ - break; \ - } +GridSample::GridSample() +{ + one_blob_only = false; + support_inplace = false; +} - #define HANDLE_INTERP(interp, align_corners) \ - case interp: \ - { \ - switch (static_cast(padding_mode)) \ - { \ - HANDLE_PADDING(interp, PaddingMode::Zeros, align_corners) \ - HANDLE_PADDING(interp, PaddingMode::Border, align_corners) \ - HANDLE_PADDING(interp, PaddingMode::Reflection, align_corners) \ - } \ - break; \ - } +int GridSample::load_param(const ParamDict& pd) +{ + mode = pd.get(0, 0); + padding_mode = pd.get(1, 0); + align_corners = pd.get(6, 0); + return 0; +} + +int GridSample::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ +#define HANDLE_PADDING(interp, padding, align_corners) \ + case padding: \ + { \ + ApplyGridSample func; \ + func.forward(bottom_blobs[0], bottom_blobs[1], top_blobs[0], opt); \ + break; \ + } +#define HANDLE_INTERP(interp, align_corners) \ + case interp: \ + { \ + switch (static_cast(padding_mode)) \ + { \ + HANDLE_PADDING(interp, 
PaddingMode::Zeros, align_corners) \ + HANDLE_PADDING(interp, PaddingMode::Border, align_corners) \ + HANDLE_PADDING(interp, PaddingMode::Reflection, align_corners) \ + } \ + break; \ + } - if (align_corners == true) + if (align_corners == true) + { + switch (static_cast(mode)) { - switch (static_cast(mode)) - { - HANDLE_INTERP(InterpolationMode::Bilinear, true); - HANDLE_INTERP(InterpolationMode::Nearest, true); - HANDLE_INTERP(InterpolationMode::Bicubic, true); - } + HANDLE_INTERP(InterpolationMode::Bilinear, true); + HANDLE_INTERP(InterpolationMode::Nearest, true); + HANDLE_INTERP(InterpolationMode::Bicubic, true); } - else + } + else + { + switch (static_cast(mode)) { - switch (static_cast(mode)) - { - HANDLE_INTERP(InterpolationMode::Bilinear, false); - HANDLE_INTERP(InterpolationMode::Nearest, false); - HANDLE_INTERP(InterpolationMode::Bicubic, false); - } + HANDLE_INTERP(InterpolationMode::Bilinear, false); + HANDLE_INTERP(InterpolationMode::Nearest, false); + HANDLE_INTERP(InterpolationMode::Bicubic, false); } - #undef HANDLE_PADDING - #undef HANDLE_INTERP - - - return 0; } +#undef HANDLE_PADDING +#undef HANDLE_INTERP + + return 0; +} } // namespace ncnn From 959698223cf2bbf102741488341061c020b9de3c Mon Sep 17 00:00:00 2001 From: Yoh Date: Wed, 2 Nov 2022 20:56:40 +0800 Subject: [PATCH 011/127] add x86 optimize --- src/layer/x86/gridsample_x86.cpp | 348 +++++++++++++++++++++ src/layer/x86/gridsample_x86.h | 32 ++ tools/pnnx/src/pass_ncnn/F_grid_sample.cpp | 2 +- 3 files changed, 381 insertions(+), 1 deletion(-) create mode 100644 src/layer/x86/gridsample_x86.cpp create mode 100644 src/layer/x86/gridsample_x86.h diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp new file mode 100644 index 00000000000..ffbef87730b --- /dev/null +++ b/src/layer/x86/gridsample_x86.cpp @@ -0,0 +1,348 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
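+
+// x86 optimization of the GridSample layer. The AVX branch below rebuilds the
+// sampling coordinates with 256-bit vectors: unnormalize from [-1, 1] to pixel
+// space, clamp (border) or fold (reflection) according to padding_mode, then
+// take the bilinear weights nw/ne/sw/se for packed (elempack == 8) blobs.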
+ +#include "gridsample_x86.h" + +#if __SSE2__ +#include +#include "sse_mathfun.h" +#if __AVX__ +#include +#include "avx_mathfun.h" +#endif // __AVX__ +#endif // __SSE2__ +#include "x86_usability.h" + +namespace ncnn { + + GridSample_x86::GridSample_x86() + { +#if __SSE2__ + support_packing = true; +#endif // __SSE2__ + } + +#if __SSE2__ +#if __AVX__ + const __m256 v1f = *(__m256*)_ps256_1; + + static __m256 NCNN_FORCEINLINE + grid_sample_unormalize(__m256 w, __m256 coordx, int align_corner) + { + __m256 two = _mm256_set1_ps(2.f); + + if (align_corner) + return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coordx, v1f), two), _mm256_sub_ps(w, v1f)); + else + return _mm256_div_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(coordx, v1f), w), v1f), two); + } + + static NCNN_FORCEINLINE __m256 border_coord(__m256 coord, __m256 border) + { + return _mm256_min_ps(border, _mm256_max_ps(coord, _mm256_setzero_ps())); + } + + static __m256 reflect_coord(__m256 x, __m256 high) + { + /* take the absolute value */ + x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflect_v = _mm256_and_ps(_mm256_sub_ps(x, high), *(__m256*)_ps256_inv_sign_mask); + x = _mm256_sub_ps(x, reflect_v); + return x; + } + + static __m256 compute_coord(__m256 sx, __m256 w, int padding_mode, int align_corner) + { + if (padding_mode == 2) // border + { + sx = border_coord(sx, _mm256_sub_ps(w, v1f)); + } + else if (padding_mode == 3) // reflection + { + if (align_corner) + { + sx = reflect_coord(sx, _mm256_sub_ps(w, v1f)); + } + else + { + __m256 v0p5f = *(__m256*)_ps256_0p5; + sx = _mm256_sub_ps(reflect_coord(_mm256_add_ps(sx, v0p5f), w), v0p5f); + sx = border_coord(sx, _mm256_sub_ps(w, v1f)); + } + } + + return sx; + } + + static __m256 get_coord(__m256 x, __m256 w, int padding_mode, int align_corner) + { + // compute the origin coordinates + __m256 sx = grid_sample_unormalize(w, x, align_corner); + + // correct the coordinates according to the padding_mode + __m256 coord = compute_coord(sx, w, padding_mode, align_corner); + + return coord; + } + + + +#endif // __AVX__ + +#endif // __SSE2__ + + int GridSample_x86::forward_inplace(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const + { + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& grid = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + const int elempack = bottom_blob.elempack; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + +#if __SSE2__ +#if __AVX__ + if (elempack == 8) + { + + if (dims == 3) + { + const int outW = grid.h; + const int outH = grid.c; + + top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (resize_type == 1) + { + if (padding_mode == 1) //zeros + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(outH).row(outW); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[1]); + + __m256 vecH = _mm256_set1_ps(outH); + __m256 vecW = _mm256_set1_ps(outW); + + gx = get_coord(gx, vecW, padding_mode, align_corner); + gx = get_coord(gy, vecH, padding_mode, align_corner); + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = 
_mm256_sub_ps(v1f, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1f, n); + + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + + + outptr++; + } + } + } + } + else //border bilinear + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(outH).row(outW); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[1]); + + __m256 vecH = _mm256_set1_ps(outH); + __m256 vecW = _mm256_set1_ps(outW); + + gx = get_coord(gx, vecW, padding_mode, align_corner); + gx = get_coord(gy, vecH, padding_mode, align_corner); + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1f, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1f, n); + + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + + + outptr++; + } + } + } + } + + } + + if (resize_type == 2) + { + for (int q = 0; q < channels; q++) + { + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + + + + outptr++; + } + } + } + } + + if (resize_type == 3) + { + for (int q = 0; q < channels; q++) + { + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + + + + outptr++; + } + } + } + } + } + + if (dims == 4) + { + + } + } + +#endif // __AVX__ + + if (elempack == 4) + { + if (dims == 3) + { + if (dims == 3) + { + const int outW = grid.h; + const int outH = grid.c; + + top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (resize_type == 1) + { + for (int q = 0; q < channels; q++) + { + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + + + + outptr++; + } + } + } + } + + if (resize_type == 2) + { + for (int q = 0; q < channels; q++) + { + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + + + + outptr++; + } + } + } + } + + if (resize_type == 3) + { + for (int q = 0; q < channels; q++) + { + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + + + + outptr++; + } + } + } + } + } + + if (dims == 4) + { + + } + } + +#endif // __SSE2__ + + if (elempack == 1) + { + return forward(bottom_blobs, top_blobs, opt); + } + + return 0; + } + +} // namespace ncnn diff --git a/src/layer/x86/gridsample_x86.h b/src/layer/x86/gridsample_x86.h new file mode 100644 index 00000000000..ed9b117c769 --- /dev/null +++ b/src/layer/x86/gridsample_x86.h @@ -0,0 +1,32 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_GRIDSAMPLE_X86_H +#define LAYER_GRIDSAMPLE_X86_H + +#include "gridsample.h" + +namespace ncnn { + + class GridSample_x86 : virtual public GridSample + { + public: + GridSample_x86(); + + virtual int forward_inplace(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + }; + +} // namespace ncnn + +#endif // LAYER_GRIDSAMPLE_X86_H diff --git a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp index 2c61c245473..0e382c2de6b 100644 --- a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp +++ b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp @@ -62,7 +62,7 @@ pnnx.Output output 1 0 out if (padding_mode == "reflection") op->params["1"] = 3; - op->params["6"] = captured_params.at("align_corners").b ? 1 : 0; // align_corners + op->params["3"] = captured_params.at("align_corners").b ? 1 : 0; // align_corners } }; From 697f32d0f7cfb68f99497b157e494c772b1dedd3 Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Wed, 2 Nov 2022 12:59:21 +0000 Subject: [PATCH 012/127] apply code-format changes --- src/layer/gridsample.cpp | 323 +++++++++++++------------- src/layer/x86/gridsample_x86.cpp | 381 +++++++++++++++---------------- src/layer/x86/gridsample_x86.h | 12 +- 3 files changed, 345 insertions(+), 371 deletions(-) diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index 303b5ee7df0..24f0472b4f8 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -17,46 +17,51 @@ #include namespace ncnn { - enum InterpolationMode - { - Bilinear = 1, - Nearest = 2, - Bicubic = 3 - }; +enum InterpolationMode +{ + Bilinear = 1, + Nearest = 2, + Bicubic = 3 +}; - enum PaddingMode - { - Zeros = 1, - Border = 2, - Reflection = 3 - }; +enum PaddingMode +{ + Zeros = 1, + Border = 2, + Reflection = 3 +}; - static inline float clip_coordinates(float in, int64_t clip_limit) { - return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); - } +static inline float clip_coordinates(float in, int64_t clip_limit) +{ + return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); +} - static inline float reflect_coordinates(float in, int64_t twice_low, - int64_t twice_high) { - if (twice_low == twice_high) { - return static_cast(0); - } - float min = static_cast(twice_low) / 2; - float span = static_cast(twice_high - twice_low) / 2; - in = std::fabs(in - min); - // `fmod` returns same sign as `in`, which is positive after the `fabs` above. - float extra = std::fmod(in, span); - int flips = static_cast(std::floor(in / span)); - if (flips % 2 == 0) { - return extra + min; - } - else { - return span - extra + min; - } +static inline float reflect_coordinates(float in, int64_t twice_low, + int64_t twice_high) +{ + if (twice_low == twice_high) + { + return static_cast(0); } + float min = static_cast(twice_low) / 2; + float span = static_cast(twice_high - twice_low) / 2; + in = std::fabs(in - min); + // `fmod` returns same sign as `in`, which is positive after the `fabs` above. 
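+    // An even number of folds lands on a forward-running segment, so the
+    // offset from the low edge is simply `extra`; an odd count lands on a
+    // reversed segment, hence `span - extra`.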
+ float extra = std::fmod(in, span); + int flips = static_cast(std::floor(in / span)); + if (flips % 2 == 0) + { + return extra + min; + } + else + { + return span - extra + min; + } +} - static inline float compute_coordinates(float coord, int64_t size, - PaddingMode padding_mode, - bool align_corners) +static inline float compute_coordinates(float coord, int64_t size, + PaddingMode padding_mode, + bool align_corners) { if (padding_mode == PaddingMode::Border) { @@ -80,30 +85,34 @@ namespace ncnn { return coord; } - static inline float grid_sampler_unnormalize(float coord, int64_t size, - bool align_corners) { - if (align_corners) { - // unnormalize coord from [-1, 1] to [0, size - 1] - return ((coord + 1) / 2) * (size - 1); - } - else { - // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] - return ((coord + 1) * size - 1) / 2; - } +static inline float grid_sampler_unnormalize(float coord, int64_t size, + bool align_corners) +{ + if (align_corners) + { + // unnormalize coord from [-1, 1] to [0, size - 1] + return ((coord + 1) / 2) * (size - 1); } - - static inline float grid_sampler_compute_source_index( - float coord, - int64_t size, - PaddingMode padding_mode, - bool align_corners) { - coord = grid_sampler_unnormalize(coord, size, align_corners); - coord = compute_coordinates(coord, size, padding_mode, align_corners); - return coord; + else + { + // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] + return ((coord + 1) * size - 1) / 2; } +} - template - struct ApplyGridSample; +static inline float grid_sampler_compute_source_index( + float coord, + int64_t size, + PaddingMode padding_mode, + bool align_corners) +{ + coord = grid_sampler_unnormalize(coord, size, align_corners); + coord = compute_coordinates(coord, size, padding_mode, align_corners); + return coord; +} + +template +struct ApplyGridSample; template struct ApplyGridSample @@ -139,7 +148,7 @@ struct ApplyGridSample if (dims == 3) { output.create(outW, outH, input.c); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { float* output_ptr = static_cast(output.channel(q).data); @@ -158,7 +167,6 @@ struct ApplyGridSample auto interp_params = compute_interp_params_d3(gx, gy); - auto nw = std::get<0>(interp_params); auto ne = std::get<1>(interp_params); auto sw = std::get<2>(interp_params); @@ -203,8 +211,6 @@ struct ApplyGridSample v = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; } - - *output_ptr = v; output_ptr++; @@ -214,7 +220,6 @@ struct ApplyGridSample } else if (dims == 4) { - } else { @@ -223,137 +228,131 @@ struct ApplyGridSample } }; - template - struct ApplyGridSample +template +struct ApplyGridSample +{ + const bool must_in_bound = padding != PaddingMode::Zeros; + inline void forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) { - const bool must_in_bound = padding != PaddingMode::Zeros; - inline void forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) + const int dims = input.dims; + const int w = input.w; + const int h = input.h; + const int outW = grid.h; + const int outH = grid.c; + const int channels = input.c; + + if (dims == 3) { - const int dims = input.dims; - const int w = input.w; - const int h = input.h; - const int outW = grid.h; - const int outH = grid.c; - const int channels = input.c; - - if (dims == 3) + output.create(outW, outH, input.c); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - 
output.create(outW, outH, input.c); -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - float* output_ptr = static_cast(output.channel(q).data); + float* output_ptr = static_cast(output.channel(q).data); - const Mat image = input.channel(q); + const Mat image = input.channel(q); - //const float* gxy_ptr = static_cast(grid.data); + //const float* gxy_ptr = static_cast(grid.data); - for (int y = 0; y < outH; y++) + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - const float* gxy_ptr = grid.channel(y).row(x); - auto gx = grid_sampler_compute_source_index(gxy_ptr[0], w, padding, align_corners); - auto gy = grid_sampler_compute_source_index(gxy_ptr[1], h, padding, align_corners); + const float* gxy_ptr = grid.channel(y).row(x); + auto gx = grid_sampler_compute_source_index(gxy_ptr[0], w, padding, align_corners); + auto gy = grid_sampler_compute_source_index(gxy_ptr[1], h, padding, align_corners); - auto x_nearest = static_cast(std::round(gx)); - auto y_nearest = static_cast(std::round(gy)); + auto x_nearest = static_cast(std::round(gx)); + auto y_nearest = static_cast(std::round(gy)); - float v = image.row(y_nearest)[x_nearest]; - if (!must_in_bound) - { - v = ((x_nearest < w) & (x_nearest > -1) & (y_nearest < h) & (y_nearest > -1)) ? v : 0; - } + float v = image.row(y_nearest)[x_nearest]; + if (!must_in_bound) + { + v = ((x_nearest < w) & (x_nearest > -1) & (y_nearest < h) & (y_nearest > -1)) ? v : 0; + } - *output_ptr = v; + *output_ptr = v; - output_ptr++; - } + output_ptr++; } } } - else if (dims == 4) - { - - } - else - { - - } } - }; - - template - struct ApplyGridSample - { - inline void forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) + else if (dims == 4) + { + } + else { - } - }; - - GridSample::GridSample() - { - one_blob_only = false; - support_inplace = false; } +}; - int GridSample::load_param(const ParamDict& pd) +template +struct ApplyGridSample +{ + inline void forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) { - mode = pd.get(0, 0); - padding_mode = pd.get(1, 0); - align_corners = pd.get(6, 0); - - return 0; } +}; - int GridSample::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const - { - #define HANDLE_PADDING(interp, padding, align_corners) \ - case padding: \ - { \ - ApplyGridSample func; \ - func.forward(bottom_blobs[0], bottom_blobs[1], top_blobs[0], opt); \ - break; \ - } +GridSample::GridSample() +{ + one_blob_only = false; + support_inplace = false; +} - #define HANDLE_INTERP(interp, align_corners) \ - case interp: \ - { \ - switch (static_cast(padding_mode)) \ - { \ - HANDLE_PADDING(interp, PaddingMode::Zeros, align_corners) \ - HANDLE_PADDING(interp, PaddingMode::Border, align_corners) \ - HANDLE_PADDING(interp, PaddingMode::Reflection, align_corners) \ - } \ - break; \ - } +int GridSample::load_param(const ParamDict& pd) +{ + mode = pd.get(0, 0); + padding_mode = pd.get(1, 0); + align_corners = pd.get(6, 0); + return 0; +} + +int GridSample::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ +#define HANDLE_PADDING(interp, padding, align_corners) \ + case padding: \ + { \ + ApplyGridSample func; \ + func.forward(bottom_blobs[0], bottom_blobs[1], top_blobs[0], opt); \ + break; \ + } +#define HANDLE_INTERP(interp, align_corners) \ + case interp: \ + { \ + switch (static_cast(padding_mode)) \ + { \ + HANDLE_PADDING(interp, 
PaddingMode::Zeros, align_corners) \ + HANDLE_PADDING(interp, PaddingMode::Border, align_corners) \ + HANDLE_PADDING(interp, PaddingMode::Reflection, align_corners) \ + } \ + break; \ + } - if (align_corners == true) + if (align_corners == true) + { + switch (static_cast(mode)) { - switch (static_cast(mode)) - { - HANDLE_INTERP(InterpolationMode::Bilinear, true); - HANDLE_INTERP(InterpolationMode::Nearest, true); - HANDLE_INTERP(InterpolationMode::Bicubic, true); - } + HANDLE_INTERP(InterpolationMode::Bilinear, true); + HANDLE_INTERP(InterpolationMode::Nearest, true); + HANDLE_INTERP(InterpolationMode::Bicubic, true); } - else + } + else + { + switch (static_cast(mode)) { - switch (static_cast(mode)) - { - HANDLE_INTERP(InterpolationMode::Bilinear, false); - HANDLE_INTERP(InterpolationMode::Nearest, false); - HANDLE_INTERP(InterpolationMode::Bicubic, false); - } + HANDLE_INTERP(InterpolationMode::Bilinear, false); + HANDLE_INTERP(InterpolationMode::Nearest, false); + HANDLE_INTERP(InterpolationMode::Bicubic, false); } - #undef HANDLE_PADDING - #undef HANDLE_INTERP - - - return 0; } +#undef HANDLE_PADDING +#undef HANDLE_INTERP + + return 0; +} } // namespace ncnn diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index ffbef87730b..52117d085c7 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -26,312 +26,287 @@ namespace ncnn { - GridSample_x86::GridSample_x86() - { +GridSample_x86::GridSample_x86() +{ #if __SSE2__ - support_packing = true; + support_packing = true; #endif // __SSE2__ - } +} #if __SSE2__ #if __AVX__ - const __m256 v1f = *(__m256*)_ps256_1; - - static __m256 NCNN_FORCEINLINE - grid_sample_unormalize(__m256 w, __m256 coordx, int align_corner) +const __m256 v1f = *(__m256*)_ps256_1; + +static __m256 NCNN_FORCEINLINE +grid_sample_unormalize(__m256 w, __m256 coordx, int align_corner) +{ + __m256 two = _mm256_set1_ps(2.f); + + if (align_corner) + return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coordx, v1f), two), _mm256_sub_ps(w, v1f)); + else + return _mm256_div_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(coordx, v1f), w), v1f), two); +} + +static NCNN_FORCEINLINE __m256 border_coord(__m256 coord, __m256 border) +{ + return _mm256_min_ps(border, _mm256_max_ps(coord, _mm256_setzero_ps())); +} + +static __m256 reflect_coord(__m256 x, __m256 high) +{ + /* take the absolute value */ + x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflect_v = _mm256_and_ps(_mm256_sub_ps(x, high), *(__m256*)_ps256_inv_sign_mask); + x = _mm256_sub_ps(x, reflect_v); + return x; +} + +static __m256 compute_coord(__m256 sx, __m256 w, int padding_mode, int align_corner) +{ + if (padding_mode == 2) // border { - __m256 two = _mm256_set1_ps(2.f); - - if (align_corner) - return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coordx, v1f), two), _mm256_sub_ps(w, v1f)); - else - return _mm256_div_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(coordx, v1f), w), v1f), two); + sx = border_coord(sx, _mm256_sub_ps(w, v1f)); } - - static NCNN_FORCEINLINE __m256 border_coord(__m256 coord, __m256 border) + else if (padding_mode == 3) // reflection { - return _mm256_min_ps(border, _mm256_max_ps(coord, _mm256_setzero_ps())); - } - - static __m256 reflect_coord(__m256 x, __m256 high) - { - /* take the absolute value */ - x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflect_v = _mm256_and_ps(_mm256_sub_ps(x, high), *(__m256*)_ps256_inv_sign_mask); - x = _mm256_sub_ps(x, reflect_v); - return x; - } - - static __m256 
compute_coord(__m256 sx, __m256 w, int padding_mode, int align_corner) - { - if (padding_mode == 2) // border + if (align_corner) { - sx = border_coord(sx, _mm256_sub_ps(w, v1f)); + sx = reflect_coord(sx, _mm256_sub_ps(w, v1f)); } - else if (padding_mode == 3) // reflection + else { - if (align_corner) - { - sx = reflect_coord(sx, _mm256_sub_ps(w, v1f)); - } - else - { - __m256 v0p5f = *(__m256*)_ps256_0p5; - sx = _mm256_sub_ps(reflect_coord(_mm256_add_ps(sx, v0p5f), w), v0p5f); - sx = border_coord(sx, _mm256_sub_ps(w, v1f)); - } + __m256 v0p5f = *(__m256*)_ps256_0p5; + sx = _mm256_sub_ps(reflect_coord(_mm256_add_ps(sx, v0p5f), w), v0p5f); + sx = border_coord(sx, _mm256_sub_ps(w, v1f)); } - - return sx; } - static __m256 get_coord(__m256 x, __m256 w, int padding_mode, int align_corner) - { - // compute the origin coordinates - __m256 sx = grid_sample_unormalize(w, x, align_corner); + return sx; +} - // correct the coordinates according to the padding_mode - __m256 coord = compute_coord(sx, w, padding_mode, align_corner); +static __m256 get_coord(__m256 x, __m256 w, int padding_mode, int align_corner) +{ + // compute the origin coordinates + __m256 sx = grid_sample_unormalize(w, x, align_corner); - return coord; - } - + // correct the coordinates according to the padding_mode + __m256 coord = compute_coord(sx, w, padding_mode, align_corner); + return coord; +} #endif // __AVX__ #endif // __SSE2__ - int GridSample_x86::forward_inplace(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const - { - const Mat& bottom_blob = bottom_blobs[0]; - const Mat& grid = bottom_blobs[1]; - Mat& top_blob = top_blobs[0]; - const int elempack = bottom_blob.elempack; - - int w = bottom_blob.w; - int h = bottom_blob.h; - int d = bottom_blob.d; - int channels = bottom_blob.c; - int dims = bottom_blob.dims; - size_t elemsize = bottom_blob.elemsize; +int GridSample_x86::forward_inplace(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& grid = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + const int elempack = bottom_blob.elempack; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; #if __SSE2__ #if __AVX__ - if (elempack == 8) + if (elempack == 8) + { + if (dims == 3) { + const int outW = grid.h; + const int outH = grid.c; - if (dims == 3) - { - const int outW = grid.h; - const int outH = grid.c; + top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) + return -100; - top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (resize_type == 1) + if (resize_type == 1) + { + if (padding_mode == 1) //zeros { - if (padding_mode == 1) //zeros + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(outH).row(outW); - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[1]); - - __m256 vecH = _mm256_set1_ps(outH); - __m256 vecW = _mm256_set1_ps(outW); + const 
float* gridptr = grid.channel(outH).row(outW); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[1]); - gx = get_coord(gx, vecW, padding_mode, align_corner); - gx = get_coord(gy, vecH, padding_mode, align_corner); + __m256 vecH = _mm256_set1_ps(outH); + __m256 vecW = _mm256_set1_ps(outW); - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); + gx = get_coord(gx, vecW, padding_mode, align_corner); + gx = get_coord(gy, vecH, padding_mode, align_corner); - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1f, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1f, n); + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1f, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1f, n); + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); - - outptr++; - } + outptr++; } } } - else //border bilinear + } + else //border bilinear + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(outH).row(outW); - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[1]); - - __m256 vecH = _mm256_set1_ps(outH); - __m256 vecW = _mm256_set1_ps(outW); - - gx = get_coord(gx, vecW, padding_mode, align_corner); - gx = get_coord(gy, vecH, padding_mode, align_corner); + const float* gridptr = grid.channel(outH).row(outW); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[1]); - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); + __m256 vecH = _mm256_set1_ps(outH); + __m256 vecW = _mm256_set1_ps(outW); - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1f, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1f, n); + gx = get_coord(gx, vecW, padding_mode, align_corner); + gx = get_coord(gy, vecH, padding_mode, align_corner); - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1f, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1f, n); + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); - outptr++; - } + outptr++; } } } - } + } - if (resize_type == 2) + if (resize_type == 2) + { + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - - - - outptr++; - } + outptr++; } } } + } - if (resize_type == 3) + if (resize_type == 3) + { + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + const float* 
outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - - - - outptr++; - } + outptr++; } } } } + } - if (dims == 4) - { - - } + if (dims == 4) + { } + } #endif // __AVX__ - if (elempack == 4) + if (elempack == 4) + { + if (dims == 3) { if (dims == 3) { - if (dims == 3) - { - const int outW = grid.h; - const int outH = grid.c; + const int outW = grid.h; + const int outH = grid.c; - top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); - if (top_blob.empty()) - return -100; + top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) + return -100; - if (resize_type == 1) + if (resize_type == 1) + { + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - - - - outptr++; - } + outptr++; } } } + } - if (resize_type == 2) + if (resize_type == 2) + { + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - - - - outptr++; - } + outptr++; } } } + } - if (resize_type == 3) + if (resize_type == 3) + { + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - - - - outptr++; - } + outptr++; } } } + } } if (dims == 4) { - } } diff --git a/src/layer/x86/gridsample_x86.h b/src/layer/x86/gridsample_x86.h index ed9b117c769..88407d05cbb 100644 --- a/src/layer/x86/gridsample_x86.h +++ b/src/layer/x86/gridsample_x86.h @@ -19,13 +19,13 @@ namespace ncnn { - class GridSample_x86 : virtual public GridSample - { - public: - GridSample_x86(); +class GridSample_x86 : virtual public GridSample +{ +public: + GridSample_x86(); - virtual int forward_inplace(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; - }; + virtual int forward_inplace(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +}; } // namespace ncnn From ea336f00d7f47a689d7eed80b4473da93908d651 Mon Sep 17 00:00:00 2001 From: Yoh Date: Fri, 4 Nov 2022 20:03:08 +0800 Subject: [PATCH 013/127] add x86 bilinear --- src/layer/x86/gridsample_x86.cpp | 442 ++++++++++++++++++------------- src/layer/x86/gridsample_x86.h | 2 +- 2 files changed, 265 insertions(+), 179 deletions(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 52117d085c7..429fa13a58c 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -26,151 +26,258 @@ namespace ncnn { -GridSample_x86::GridSample_x86() -{ + GridSample_x86::GridSample_x86() + { #if __SSE2__ - support_packing = true; + support_packing = true; #endif // __SSE2__ -} + } #if __SSE2__ #if __AVX__ -const __m256 v1f = *(__m256*)_ps256_1; - -static __m256 NCNN_FORCEINLINE -grid_sample_unormalize(__m256 w, __m256 coordx, int 
align_corner) -{ - __m256 two = _mm256_set1_ps(2.f); - - if (align_corner) - return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coordx, v1f), two), _mm256_sub_ps(w, v1f)); - else - return _mm256_div_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(coordx, v1f), w), v1f), two); -} - -static NCNN_FORCEINLINE __m256 border_coord(__m256 coord, __m256 border) -{ - return _mm256_min_ps(border, _mm256_max_ps(coord, _mm256_setzero_ps())); -} - -static __m256 reflect_coord(__m256 x, __m256 high) -{ - /* take the absolute value */ - x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflect_v = _mm256_and_ps(_mm256_sub_ps(x, high), *(__m256*)_ps256_inv_sign_mask); - x = _mm256_sub_ps(x, reflect_v); - return x; -} - -static __m256 compute_coord(__m256 sx, __m256 w, int padding_mode, int align_corner) -{ - if (padding_mode == 2) // border + const __m256 v1fp8 = *(__m256*)_ps256_1; + const __m256i v1ip8 = _mm256_set1_epi32(1); + const __m256i vn1ip8 = _mm256_set1_epi32(-1); + + static __m256 NCNN_FORCEINLINE + grid_sample_unormalize(__m256 w, __m256 coordx, int align_corner) { - sx = border_coord(sx, _mm256_sub_ps(w, v1f)); + __m256 two = _mm256_set1_ps(2.f); + + if (align_corner) + return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coordx, v1fp8), two), _mm256_sub_ps(w, v1fp8)); + else + return _mm256_div_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(coordx, v1fp8), w), v1fp8), two); } - else if (padding_mode == 3) // reflection + + static NCNN_FORCEINLINE __m256 border_coord(__m256 coord, __m256 border) { - if (align_corner) + return _mm256_min_ps(border, _mm256_max_ps(coord, _mm256_setzero_ps())); + } + + static __m256 reflect_coord(__m256 x, __m256 high) + { + /* take the absolute value */ + x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflect_v = _mm256_and_ps(_mm256_sub_ps(x, high), *(__m256*)_ps256_inv_sign_mask); + x = _mm256_sub_ps(x, reflect_v); + return x; + } + + static __m256 compute_coord(__m256 sx, __m256 w, int padding_mode, int align_corner) + { + if (padding_mode == 2) // border { - sx = reflect_coord(sx, _mm256_sub_ps(w, v1f)); + sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); } - else + else if (padding_mode == 3) // reflection { - __m256 v0p5f = *(__m256*)_ps256_0p5; - sx = _mm256_sub_ps(reflect_coord(_mm256_add_ps(sx, v0p5f), w), v0p5f); - sx = border_coord(sx, _mm256_sub_ps(w, v1f)); + if (align_corner) + { + sx = reflect_coord(sx, _mm256_sub_ps(w, v1fp8)); + } + else + { + __m256 v0p5f = *(__m256*)_ps256_0p5; + sx = _mm256_sub_ps(reflect_coord(_mm256_add_ps(sx, v0p5f), w), v0p5f); + sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); + } } + + return sx; } - return sx; -} + static __m256 get_coord(__m256 x, __m256 w, int padding_mode, int align_corner) + { + // compute the origin coordinates + __m256 sx = grid_sample_unormalize(w, x, align_corner); + + // correct the coordinates according to the padding_mode + __m256 coord = compute_coord(sx, w, padding_mode, align_corner); -static __m256 get_coord(__m256 x, __m256 w, int padding_mode, int align_corner) -{ - // compute the origin coordinates - __m256 sx = grid_sample_unormalize(w, x, align_corner); + return coord; + } - // correct the coordinates according to the padding_mode - __m256 coord = compute_coord(sx, w, padding_mode, align_corner); - return coord; -} #endif // __AVX__ #endif // __SSE2__ -int GridSample_x86::forward_inplace(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const -{ - const Mat& bottom_blob = bottom_blobs[0]; - const Mat& grid = bottom_blobs[1]; - Mat& 
top_blob = top_blobs[0]; - const int elempack = bottom_blob.elempack; - - int w = bottom_blob.w; - int h = bottom_blob.h; - int d = bottom_blob.d; - int channels = bottom_blob.c; - int dims = bottom_blob.dims; - size_t elemsize = bottom_blob.elemsize; + int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const + { + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& grid = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + const int elempack = bottom_blob.elempack; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; #if __SSE2__ #if __AVX__ - if (elempack == 8) - { - if (dims == 3) + if (elempack == 8) { - const int outW = grid.h; - const int outH = grid.c; + const auto vElemsizei = _mm256_set1_epi32(elemsize); + if (dims == 3) + { + const int outW = grid.h; + const int outH = grid.c * elempack; - top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); - if (top_blob.empty()) - return -100; + const auto vWi = _mm256_set1_epi32(outW); + const auto vHi = _mm256_set1_epi32(outH); - if (resize_type == 1) - { - if (padding_mode == 1) //zeros + top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (resize_type == 1) //zeros { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (padding_mode == 1) //zeros { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* gridptr = grid.channel(outH).row(outW); - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[1]); + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y).row(x); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[1]); + + __m256 vecH = _mm256_set1_ps(outH); + __m256 vecW = _mm256_set1_ps(outW); + + gx = get_coord(gx, vecW, padding_mode, align_corner); + gx = get_coord(gy, vecH, padding_mode, align_corner); + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + + auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vWi, x0)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); + auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vHi, y0)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); + + auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + auto i_nw_offset = 
_mm256_mul_epi32(_mm256_add_epi32(_mm256_mul_epi32(x0, vWi), y0), vElemsizei); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElemsizei); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mul_epi32(vWi, vElemsizei)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElemsizei); + + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), 1); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), 1); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), 1); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), 1); + + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(outptr, _v); + + outptr += 8; + } + } + } + } + else //border bilinear + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(outH).row(outW); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[1]); - __m256 vecH = _mm256_set1_ps(outH); - __m256 vecW = _mm256_set1_ps(outW); + __m256 vecH = _mm256_set1_ps(outH); + __m256 vecW = _mm256_set1_ps(outW); - gx = get_coord(gx, vecW, padding_mode, align_corner); - gx = get_coord(gy, vecH, padding_mode, align_corner); + gx = get_coord(gx, vecW, padding_mode, align_corner); + gx = get_coord(gy, vecH, padding_mode, align_corner); - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1f, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1f, n); + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); - outptr++; + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); + + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + auto i_nw_offset = _mm256_mul_epi32(_mm256_add_epi32(_mm256_mul_epi32(x0, vWi), y0), vElemsizei); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElemsizei); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mul_epi32(vWi, vElemsizei)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElemsizei); + + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_nw_offset, v1fp8, 1); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), 
i_ne_offset, *reinterpret_cast<__m256*>(&y1_in_range), 1); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_sw_offset, *reinterpret_cast<__m256*>(&x1_in_range), 1); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), 1); + + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(outptr, _v); + + outptr++; + } } } } + } - else //border bilinear + + if (resize_type == 2) { - #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* outptr = bottom_blob.channel(q); @@ -178,127 +285,105 @@ int GridSample_x86::forward_inplace(const std::vector& bottom_blobs, std::v { for (int x = 0; x < outW; x++) { - const float* gridptr = grid.channel(outH).row(outW); - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[1]); - - __m256 vecH = _mm256_set1_ps(outH); - __m256 vecW = _mm256_set1_ps(outW); - - gx = get_coord(gx, vecW, padding_mode, align_corner); - gx = get_coord(gy, vecH, padding_mode, align_corner); - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1f, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1f, n); - - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); outptr++; } } } } - } - if (resize_type == 2) - { - for (int q = 0; q < channels; q++) + if (resize_type == 3) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - outptr++; + for (int x = 0; x < outW; x++) + { + + + + outptr++; + } } } } } - if (resize_type == 3) + if (dims == 4) { - for (int q = 0; q < channels; q++) - { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - outptr++; - } - } - } - } - } - if (dims == 4) - { + } } - } #endif // __AVX__ - if (elempack == 4) - { - if (dims == 3) + if (elempack == 4) { if (dims == 3) { - const int outW = grid.h; - const int outH = grid.c; + if (dims == 3) + { + const int outW = grid.h; + const int outH = grid.c; - top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); - if (top_blob.empty()) - return -100; + top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) + return -100; - if (resize_type == 1) - { - for (int q = 0; q < channels; q++) + if (resize_type == 1) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - outptr++; + for (int x = 0; x < outW; x++) + { + + + + outptr++; + } } } } - } - if (resize_type == 2) - { - for (int q = 0; q < channels; q++) + if (resize_type == 2) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - outptr++; + for (int x = 0; x < outW; x++) + { + + + + outptr++; + } } 
} } - } - if (resize_type == 3) - { - for (int q = 0; q < channels; q++) + if (resize_type == 3) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - outptr++; + for (int x = 0; x < outW; x++) + { + + + + outptr++; + } } } } @@ -307,6 +392,7 @@ int GridSample_x86::forward_inplace(const std::vector& bottom_blobs, std::v if (dims == 4) { + } } diff --git a/src/layer/x86/gridsample_x86.h b/src/layer/x86/gridsample_x86.h index 88407d05cbb..826414eefc9 100644 --- a/src/layer/x86/gridsample_x86.h +++ b/src/layer/x86/gridsample_x86.h @@ -24,7 +24,7 @@ class GridSample_x86 : virtual public GridSample public: GridSample_x86(); - virtual int forward_inplace(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; }; } // namespace ncnn From be7d02db89fd82fa608ff114da3ff04aad95e021 Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Fri, 4 Nov 2022 12:04:49 +0000 Subject: [PATCH 014/127] apply code-format changes --- src/layer/x86/gridsample_x86.cpp | 520 +++++++++++++++---------------- 1 file changed, 250 insertions(+), 270 deletions(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 429fa13a58c..84af9c88727 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -26,384 +26,364 @@ namespace ncnn { - GridSample_x86::GridSample_x86() - { +GridSample_x86::GridSample_x86() +{ #if __SSE2__ - support_packing = true; + support_packing = true; #endif // __SSE2__ - } +} #if __SSE2__ #if __AVX__ - const __m256 v1fp8 = *(__m256*)_ps256_1; - const __m256i v1ip8 = _mm256_set1_epi32(1); - const __m256i vn1ip8 = _mm256_set1_epi32(-1); - - static __m256 NCNN_FORCEINLINE - grid_sample_unormalize(__m256 w, __m256 coordx, int align_corner) - { - __m256 two = _mm256_set1_ps(2.f); - - if (align_corner) - return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coordx, v1fp8), two), _mm256_sub_ps(w, v1fp8)); - else - return _mm256_div_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(coordx, v1fp8), w), v1fp8), two); - } - - static NCNN_FORCEINLINE __m256 border_coord(__m256 coord, __m256 border) - { - return _mm256_min_ps(border, _mm256_max_ps(coord, _mm256_setzero_ps())); - } - - static __m256 reflect_coord(__m256 x, __m256 high) +const __m256 v1fp8 = *(__m256*)_ps256_1; +const __m256i v1ip8 = _mm256_set1_epi32(1); +const __m256i vn1ip8 = _mm256_set1_epi32(-1); + +static __m256 NCNN_FORCEINLINE +grid_sample_unormalize(__m256 w, __m256 coordx, int align_corner) +{ + __m256 two = _mm256_set1_ps(2.f); + + if (align_corner) + return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coordx, v1fp8), two), _mm256_sub_ps(w, v1fp8)); + else + return _mm256_div_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(coordx, v1fp8), w), v1fp8), two); +} + +static NCNN_FORCEINLINE __m256 border_coord(__m256 coord, __m256 border) +{ + return _mm256_min_ps(border, _mm256_max_ps(coord, _mm256_setzero_ps())); +} + +static __m256 reflect_coord(__m256 x, __m256 high) +{ + /* take the absolute value */ + x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflect_v = _mm256_and_ps(_mm256_sub_ps(x, high), *(__m256*)_ps256_inv_sign_mask); + x = _mm256_sub_ps(x, reflect_v); + return x; +} + +static __m256 compute_coord(__m256 sx, __m256 w, int padding_mode, int align_corner) +{ + if 
(padding_mode == 2) // border { - /* take the absolute value */ - x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflect_v = _mm256_and_ps(_mm256_sub_ps(x, high), *(__m256*)_ps256_inv_sign_mask); - x = _mm256_sub_ps(x, reflect_v); - return x; + sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); } - - static __m256 compute_coord(__m256 sx, __m256 w, int padding_mode, int align_corner) + else if (padding_mode == 3) // reflection { - if (padding_mode == 2) // border + if (align_corner) { - sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); + sx = reflect_coord(sx, _mm256_sub_ps(w, v1fp8)); } - else if (padding_mode == 3) // reflection + else { - if (align_corner) - { - sx = reflect_coord(sx, _mm256_sub_ps(w, v1fp8)); - } - else - { - __m256 v0p5f = *(__m256*)_ps256_0p5; - sx = _mm256_sub_ps(reflect_coord(_mm256_add_ps(sx, v0p5f), w), v0p5f); - sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); - } + __m256 v0p5f = *(__m256*)_ps256_0p5; + sx = _mm256_sub_ps(reflect_coord(_mm256_add_ps(sx, v0p5f), w), v0p5f); + sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); } - - return sx; } - static __m256 get_coord(__m256 x, __m256 w, int padding_mode, int align_corner) - { - // compute the origin coordinates - __m256 sx = grid_sample_unormalize(w, x, align_corner); - - // correct the coordinates according to the padding_mode - __m256 coord = compute_coord(sx, w, padding_mode, align_corner); + return sx; +} - return coord; - } +static __m256 get_coord(__m256 x, __m256 w, int padding_mode, int align_corner) +{ + // compute the origin coordinates + __m256 sx = grid_sample_unormalize(w, x, align_corner); + // correct the coordinates according to the padding_mode + __m256 coord = compute_coord(sx, w, padding_mode, align_corner); + return coord; +} #endif // __AVX__ #endif // __SSE2__ - int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const - { - const Mat& bottom_blob = bottom_blobs[0]; - const Mat& grid = bottom_blobs[1]; - Mat& top_blob = top_blobs[0]; - const int elempack = bottom_blob.elempack; - - int w = bottom_blob.w; - int h = bottom_blob.h; - int d = bottom_blob.d; - int channels = bottom_blob.c; - int dims = bottom_blob.dims; - size_t elemsize = bottom_blob.elemsize; +int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& grid = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + const int elempack = bottom_blob.elempack; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; #if __SSE2__ #if __AVX__ - if (elempack == 8) + if (elempack == 8) + { + const auto vElemsizei = _mm256_set1_epi32(elemsize); + if (dims == 3) { - const auto vElemsizei = _mm256_set1_epi32(elemsize); - if (dims == 3) - { - const int outW = grid.h; - const int outH = grid.c * elempack; + const int outW = grid.h; + const int outH = grid.c * elempack; - const auto vWi = _mm256_set1_epi32(outW); - const auto vHi = _mm256_set1_epi32(outH); + const auto vWi = _mm256_set1_epi32(outW); + const auto vHi = _mm256_set1_epi32(outH); - top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); - if (top_blob.empty()) - return -100; + top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) + return -100; - if (resize_type == 1) //zeros + if (resize_type == 1) //zeros + { + if (padding_mode == 1) 
//zeros { - if (padding_mode == 1) //zeros + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) { - float* outptr = top_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y).row(x); - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[1]); - - __m256 vecH = _mm256_set1_ps(outH); - __m256 vecW = _mm256_set1_ps(outW); - - gx = get_coord(gx, vecW, padding_mode, align_corner); - gx = get_coord(gy, vecH, padding_mode, align_corner); - - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); - - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - - auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vWi, x0)); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); - auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vHi, y0)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); - - auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - auto i_nw_offset = _mm256_mul_epi32(_mm256_add_epi32(_mm256_mul_epi32(x0, vWi), y0), vElemsizei); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElemsizei); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mul_epi32(vWi, vElemsizei)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElemsizei); - - auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), 1); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), 1); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), 1); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), 1); - - auto _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(outptr, _v); - - outptr += 8; - } + const float* gridptr = grid.channel(y).row(x); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[1]); + + __m256 vecH = _mm256_set1_ps(outH); + __m256 vecW = _mm256_set1_ps(outW); + + gx = get_coord(gx, vecW, padding_mode, align_corner); + gx = get_coord(gy, vecH, padding_mode, align_corner); + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = 
_mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + + auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vWi, x0)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); + auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vHi, y0)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); + + auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + auto i_nw_offset = _mm256_mul_epi32(_mm256_add_epi32(_mm256_mul_epi32(x0, vWi), y0), vElemsizei); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElemsizei); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mul_epi32(vWi, vElemsizei)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElemsizei); + + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), 1); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), 1); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), 1); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), 1); + + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(outptr, _v); + + outptr += 8; } } } - else //border bilinear + } + else //border bilinear + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) { - float* outptr = top_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(outH).row(outW); - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[1]); + const float* gridptr = grid.channel(outH).row(outW); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[1]); - __m256 vecH = _mm256_set1_ps(outH); - __m256 vecW = _mm256_set1_ps(outW); + __m256 vecH = _mm256_set1_ps(outH); + __m256 vecW = _mm256_set1_ps(outW); - gx = get_coord(gx, vecW, padding_mode, align_corner); - gx = get_coord(gy, vecH, padding_mode, align_corner); + gx = get_coord(gx, vecW, padding_mode, align_corner); + gx = get_coord(gy, vecH, padding_mode, align_corner); - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = 
_mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - auto i_nw_offset = _mm256_mul_epi32(_mm256_add_epi32(_mm256_mul_epi32(x0, vWi), y0), vElemsizei); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElemsizei); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mul_epi32(vWi, vElemsizei)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElemsizei); + auto i_nw_offset = _mm256_mul_epi32(_mm256_add_epi32(_mm256_mul_epi32(x0, vWi), y0), vElemsizei); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElemsizei); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mul_epi32(vWi, vElemsizei)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElemsizei); - auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_nw_offset, v1fp8, 1); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_ne_offset, *reinterpret_cast<__m256*>(&y1_in_range), 1); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_sw_offset, *reinterpret_cast<__m256*>(&x1_in_range), 1); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), 1); + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_nw_offset, v1fp8, 1); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_ne_offset, *reinterpret_cast<__m256*>(&y1_in_range), 1); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_sw_offset, *reinterpret_cast<__m256*>(&x1_in_range), 1); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), 1); - auto _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); - _mm256_storeu_ps(outptr, _v); + _mm256_storeu_ps(outptr, _v); - outptr++; - } + outptr++; } } } - } + } - if 
(resize_type == 2) + if (resize_type == 2) + { + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - - - - outptr++; - } + outptr++; } } } + } - if (resize_type == 3) + if (resize_type == 3) + { + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - - - - outptr++; - } + outptr++; } } } } + } - if (dims == 4) - { - - } + if (dims == 4) + { } + } #endif // __AVX__ - if (elempack == 4) + if (elempack == 4) + { + if (dims == 3) { if (dims == 3) { - if (dims == 3) - { - const int outW = grid.h; - const int outH = grid.c; + const int outW = grid.h; + const int outH = grid.c; - top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); - if (top_blob.empty()) - return -100; + top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) + return -100; - if (resize_type == 1) + if (resize_type == 1) + { + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - - - - outptr++; - } + outptr++; } } } + } - if (resize_type == 2) + if (resize_type == 2) + { + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - - - - outptr++; - } + outptr++; } } } + } - if (resize_type == 3) + if (resize_type == 3) + { + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - - - - outptr++; - } + outptr++; } } } } } - - if (dims == 4) - { - - } } -#endif // __SSE2__ - - if (elempack == 1) + if (dims == 4) { - return forward(bottom_blobs, top_blobs, opt); } + } - return 0; +#endif // __SSE2__ + + if (elempack == 1) + { + return forward(bottom_blobs, top_blobs, opt); } + return 0; +} + } // namespace ncnn From 3c2636dd2f1c2f044f8faedf785c4bed85494629 Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Tue, 8 Nov 2022 20:43:45 +0800 Subject: [PATCH 015/127] support dims=3 mode=bilinear&nearest --- src/layer/x86/gridsample_x86.cpp | 583 ++++++++++++++++++------------- 1 file changed, 332 insertions(+), 251 deletions(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 84af9c88727..5e7dacc6f69 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -26,364 +26,445 @@ namespace ncnn { -GridSample_x86::GridSample_x86() -{ + GridSample_x86::GridSample_x86() + { #if __SSE2__ - support_packing = true; + support_packing = true; #endif // __SSE2__ -} + } #if 
__SSE2__ #if __AVX__ -const __m256 v1fp8 = *(__m256*)_ps256_1; -const __m256i v1ip8 = _mm256_set1_epi32(1); -const __m256i vn1ip8 = _mm256_set1_epi32(-1); - -static __m256 NCNN_FORCEINLINE -grid_sample_unormalize(__m256 w, __m256 coordx, int align_corner) -{ - __m256 two = _mm256_set1_ps(2.f); - - if (align_corner) - return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coordx, v1fp8), two), _mm256_sub_ps(w, v1fp8)); - else - return _mm256_div_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(coordx, v1fp8), w), v1fp8), two); -} - -static NCNN_FORCEINLINE __m256 border_coord(__m256 coord, __m256 border) -{ - return _mm256_min_ps(border, _mm256_max_ps(coord, _mm256_setzero_ps())); -} - -static __m256 reflect_coord(__m256 x, __m256 high) -{ - /* take the absolute value */ - x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflect_v = _mm256_and_ps(_mm256_sub_ps(x, high), *(__m256*)_ps256_inv_sign_mask); - x = _mm256_sub_ps(x, reflect_v); - return x; -} - -static __m256 compute_coord(__m256 sx, __m256 w, int padding_mode, int align_corner) -{ - if (padding_mode == 2) // border + const __m256 v1fp8 = *(__m256*)_ps256_1; + const __m256i v1ip8 = _mm256_set1_epi32(1); + const __m256i vn1ip8 = _mm256_set1_epi32(-1); + + static __m256 NCNN_FORCEINLINE + grid_sample_unormalize(__m256 w, __m256 coordx, int align_corner) { - sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); + __m256 two = _mm256_set1_ps(2.f); + + if (align_corner) + return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coordx, v1fp8), two), _mm256_sub_ps(w, v1fp8)); + else + return _mm256_div_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(coordx, v1fp8), w), v1fp8), two); } - else if (padding_mode == 3) // reflection + + static NCNN_FORCEINLINE __m256 border_coord(__m256 coord, __m256 border) { - if (align_corner) + return _mm256_min_ps(border, _mm256_max_ps(coord, _mm256_setzero_ps())); + } + + static __m256 reflect_coord(__m256 x, __m256 high) + { + /* take the absolute value */ + x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflect_v = _mm256_and_ps(_mm256_sub_ps(x, high), *(__m256*)_ps256_inv_sign_mask); + x = _mm256_sub_ps(high, reflect_v); + return x; + } + + static __m256 compute_coord(__m256 sx, __m256 w, int padding_mode, int align_corner) + { + if (padding_mode == 2) // border { - sx = reflect_coord(sx, _mm256_sub_ps(w, v1fp8)); + sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); } - else + else if (padding_mode == 3) // reflection { - __m256 v0p5f = *(__m256*)_ps256_0p5; - sx = _mm256_sub_ps(reflect_coord(_mm256_add_ps(sx, v0p5f), w), v0p5f); - sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); + if (align_corner) + { + sx = reflect_coord(sx, _mm256_sub_ps(w, v1fp8)); + } + else + { + __m256 v0p5f = *(__m256*)_ps256_0p5; + sx = _mm256_sub_ps(reflect_coord(_mm256_add_ps(sx, v0p5f), w), v0p5f); + sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); + } } + + return sx; } - return sx; -} + static __m256 get_coord(__m256 x, __m256 w, int padding_mode, int align_corner) + { + // compute the origin coordinates + __m256 sx = grid_sample_unormalize(w, x, align_corner); -static __m256 get_coord(__m256 x, __m256 w, int padding_mode, int align_corner) -{ - // compute the origin coordinates - __m256 sx = grid_sample_unormalize(w, x, align_corner); + // correct the coordinates according to the padding_mode + __m256 coord = compute_coord(sx, w, padding_mode, align_corner); - // correct the coordinates according to the padding_mode - __m256 coord = compute_coord(sx, w, padding_mode, align_corner); + return coord; + } + - return 
coord; -} #endif // __AVX__ #endif // __SSE2__ -int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const -{ - const Mat& bottom_blob = bottom_blobs[0]; - const Mat& grid = bottom_blobs[1]; - Mat& top_blob = top_blobs[0]; - const int elempack = bottom_blob.elempack; - - int w = bottom_blob.w; - int h = bottom_blob.h; - int d = bottom_blob.d; - int channels = bottom_blob.c; - int dims = bottom_blob.dims; - size_t elemsize = bottom_blob.elemsize; + int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const + { + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& grid = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + const int elempack = bottom_blob.elempack; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; #if __SSE2__ #if __AVX__ - if (elempack == 8) - { - const auto vElemsizei = _mm256_set1_epi32(elemsize); - if (dims == 3) + if (elempack == 8) { - const int outW = grid.h; - const int outH = grid.c * elempack; + const auto vElemsizei = _mm256_set1_epi32(elemsize / 8); + if (dims == 3) + { + const auto outW = grid.h; + const auto outH = grid.c * grid.elempack; - const auto vWi = _mm256_set1_epi32(outW); - const auto vHi = _mm256_set1_epi32(outH); + const auto vWi = _mm256_set1_epi32(outW); + const auto vHi = _mm256_set1_epi32(outH); - top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); - if (top_blob.empty()) - return -100; + const auto vHf = _mm256_set1_ps(outH); + const auto vWf = _mm256_set1_ps(outW); - if (resize_type == 1) //zeros - { - if (padding_mode == 1) //zeros + top_blob.create(outW, outH, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const auto vElempacki = _mm256_set1_epi32(elempack); + + if (resize_type == 1) //zeros { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (padding_mode == 1) //zeros { - float* outptr = top_blob.channel(q); - for (int y = 0; y < outH; y++) +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* gridptr = grid.channel(y).row(x); - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[1]); - - __m256 vecH = _mm256_set1_ps(outH); - __m256 vecW = _mm256_set1_ps(outW); - - gx = get_coord(gx, vecW, padding_mode, align_corner); - gx = get_coord(gy, vecH, padding_mode, align_corner); - - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); + for (int x = 0; x < outW; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + gx = get_coord(gx, vWf, padding_mode, align_corner); + gy = get_coord(gy, vHf, padding_mode, align_corner); + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + 
auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + + auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vWi, x0)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); + auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vHi, y0)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); + + auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vWi), x0), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vWi, vElempacki)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(outptr, _v); + + outptr += 8; + } + } + } + } + else //border reflection + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + gx = get_coord(gx, vWf, padding_mode, align_corner); + gy = get_coord(gy, vHf, padding_mode, align_corner); + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); + + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vWi), x0), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + 
auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vWi, vElempacki)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_nw_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(outptr, _v); + + outptr += 8; + } + } + } + } - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); + } - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); + if (resize_type == 2) + { + if (padding_mode == 1) //zeros + { + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); + gx = get_coord(gx, vWf, padding_mode, align_corner); + gy = get_coord(gy, vHf, padding_mode, align_corner); - auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vWi, x0)); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); - auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vHi, y0)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); + gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); - auto i_nw_offset = _mm256_mul_epi32(_mm256_add_epi32(_mm256_mul_epi32(x0, vWi), y0), vElemsizei); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElemsizei); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mul_epi32(vWi, vElemsizei)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElemsizei); + auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vWi, ix)), + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vHi, iy))); - auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), 
static_cast(bottom_blob.data), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), 1); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), 1); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), 1); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), 1); + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vWi), ix), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), + i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); - _mm256_storeu_ps(outptr, _v); + _mm256_storeu_ps(outptr, _v); - outptr += 8; + outptr += 8; + } } } } - } - else //border bilinear - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + else //border reflection { - float* outptr = top_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* gridptr = grid.channel(outH).row(outW); - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[1]); - - __m256 vecH = _mm256_set1_ps(outH); - __m256 vecW = _mm256_set1_ps(outW); - - gx = get_coord(gx, vecW, padding_mode, align_corner); - gx = get_coord(gy, vecH, padding_mode, align_corner); - - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); - - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); + gx = get_coord(gx, vWf, padding_mode, align_corner); + gy = get_coord(gy, vHf, padding_mode, align_corner); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - auto i_nw_offset = _mm256_mul_epi32(_mm256_add_epi32(_mm256_mul_epi32(x0, vWi), y0), vElemsizei); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElemsizei); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mul_epi32(vWi, vElemsizei)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElemsizei); + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); - auto nw_val = 
_mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_nw_offset, v1fp8, 1); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_ne_offset, *reinterpret_cast<__m256*>(&y1_in_range), 1); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_sw_offset, *reinterpret_cast<__m256*>(&x1_in_range), 1); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.data), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), 1); + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vWi), ix), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), + i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); - _mm256_storeu_ps(outptr, _v); + _mm256_storeu_ps(outptr, _v); - outptr++; + outptr += 8; + } } } } } - } - if (resize_type == 2) - { - for (int q = 0; q < channels; q++) + if (resize_type == 3) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - outptr++; + for (int x = 0; x < outW; x++) + { + + + + outptr += 8; + } } } } } - if (resize_type == 3) + if (dims == 4) { - for (int q = 0; q < channels; q++) - { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - outptr++; - } - } - } - } - } - if (dims == 4) - { + } } - } #endif // __AVX__ - if (elempack == 4) - { - if (dims == 3) + if (elempack == 4) { if (dims == 3) { - const int outW = grid.h; - const int outH = grid.c; + if (dims == 3) + { + const int outW = grid.h; + const int outH = grid.c; - top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); - if (top_blob.empty()) - return -100; + top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) + return -100; - if (resize_type == 1) - { - for (int q = 0; q < channels; q++) + if (resize_type == 1) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - outptr++; + for (int x = 0; x < outW; x++) + { + + + + outptr += 8; + } } } } - } - if (resize_type == 2) - { - for (int q = 0; q < channels; q++) + if (resize_type == 2) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - outptr++; + for (int x = 0; x < outW; x++) + { + + + + outptr++; + } } } } - } - if (resize_type == 3) - { - for (int q = 0; q < channels; q++) + if (resize_type == 3) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - outptr++; + for (int x = 0; x < outW; x++) + { + + + + outptr += 8; + } } } } } } - } - if (dims == 4) - { + if (dims == 4) + 
{ + + } } - } #endif // __SSE2__ - if (elempack == 1) - { - return forward(bottom_blobs, top_blobs, opt); - } + if (elempack == 1) + { + return GridSample::forward(bottom_blobs, top_blobs, opt); + } - return 0; -} + return 0; + } } // namespace ncnn From 1dba62ea61201008c563c28bcc5044fd5f2e8c84 Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Tue, 8 Nov 2022 12:45:23 +0000 Subject: [PATCH 016/127] apply code-format changes --- src/layer/x86/gridsample_x86.cpp | 615 +++++++++++++++---------------- 1 file changed, 299 insertions(+), 316 deletions(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 5e7dacc6f69..d463ee8f4ca 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -26,346 +26,324 @@ namespace ncnn { - GridSample_x86::GridSample_x86() - { +GridSample_x86::GridSample_x86() +{ #if __SSE2__ - support_packing = true; + support_packing = true; #endif // __SSE2__ - } +} #if __SSE2__ #if __AVX__ - const __m256 v1fp8 = *(__m256*)_ps256_1; - const __m256i v1ip8 = _mm256_set1_epi32(1); - const __m256i vn1ip8 = _mm256_set1_epi32(-1); - - static __m256 NCNN_FORCEINLINE - grid_sample_unormalize(__m256 w, __m256 coordx, int align_corner) - { - __m256 two = _mm256_set1_ps(2.f); - - if (align_corner) - return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coordx, v1fp8), two), _mm256_sub_ps(w, v1fp8)); - else - return _mm256_div_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(coordx, v1fp8), w), v1fp8), two); - } - - static NCNN_FORCEINLINE __m256 border_coord(__m256 coord, __m256 border) - { - return _mm256_min_ps(border, _mm256_max_ps(coord, _mm256_setzero_ps())); - } - - static __m256 reflect_coord(__m256 x, __m256 high) +const __m256 v1fp8 = *(__m256*)_ps256_1; +const __m256i v1ip8 = _mm256_set1_epi32(1); +const __m256i vn1ip8 = _mm256_set1_epi32(-1); + +static __m256 NCNN_FORCEINLINE +grid_sample_unormalize(__m256 w, __m256 coordx, int align_corner) +{ + __m256 two = _mm256_set1_ps(2.f); + + if (align_corner) + return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coordx, v1fp8), two), _mm256_sub_ps(w, v1fp8)); + else + return _mm256_div_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(coordx, v1fp8), w), v1fp8), two); +} + +static NCNN_FORCEINLINE __m256 border_coord(__m256 coord, __m256 border) +{ + return _mm256_min_ps(border, _mm256_max_ps(coord, _mm256_setzero_ps())); +} + +static __m256 reflect_coord(__m256 x, __m256 high) +{ + /* take the absolute value */ + x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflect_v = _mm256_and_ps(_mm256_sub_ps(x, high), *(__m256*)_ps256_inv_sign_mask); + x = _mm256_sub_ps(high, reflect_v); + return x; +} + +static __m256 compute_coord(__m256 sx, __m256 w, int padding_mode, int align_corner) +{ + if (padding_mode == 2) // border { - /* take the absolute value */ - x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflect_v = _mm256_and_ps(_mm256_sub_ps(x, high), *(__m256*)_ps256_inv_sign_mask); - x = _mm256_sub_ps(high, reflect_v); - return x; + sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); } - - static __m256 compute_coord(__m256 sx, __m256 w, int padding_mode, int align_corner) + else if (padding_mode == 3) // reflection { - if (padding_mode == 2) // border + if (align_corner) { - sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); + sx = reflect_coord(sx, _mm256_sub_ps(w, v1fp8)); } - else if (padding_mode == 3) // reflection + else { - if (align_corner) - { - sx = reflect_coord(sx, _mm256_sub_ps(w, v1fp8)); - } - else - { - __m256 v0p5f = *(__m256*)_ps256_0p5; - sx = 
_mm256_sub_ps(reflect_coord(_mm256_add_ps(sx, v0p5f), w), v0p5f); - sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); - } + __m256 v0p5f = *(__m256*)_ps256_0p5; + sx = _mm256_sub_ps(reflect_coord(_mm256_add_ps(sx, v0p5f), w), v0p5f); + sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); } - - return sx; } - static __m256 get_coord(__m256 x, __m256 w, int padding_mode, int align_corner) - { - // compute the origin coordinates - __m256 sx = grid_sample_unormalize(w, x, align_corner); + return sx; +} - // correct the coordinates according to the padding_mode - __m256 coord = compute_coord(sx, w, padding_mode, align_corner); +static __m256 get_coord(__m256 x, __m256 w, int padding_mode, int align_corner) +{ + // compute the origin coordinates + __m256 sx = grid_sample_unormalize(w, x, align_corner); - return coord; - } - + // correct the coordinates according to the padding_mode + __m256 coord = compute_coord(sx, w, padding_mode, align_corner); + return coord; +} #endif // __AVX__ #endif // __SSE2__ - int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const - { - const Mat& bottom_blob = bottom_blobs[0]; - const Mat& grid = bottom_blobs[1]; - Mat& top_blob = top_blobs[0]; - const int elempack = bottom_blob.elempack; - - int w = bottom_blob.w; - int h = bottom_blob.h; - int d = bottom_blob.d; - int channels = bottom_blob.c; - int dims = bottom_blob.dims; - size_t elemsize = bottom_blob.elemsize; +int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& grid = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + const int elempack = bottom_blob.elempack; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; #if __SSE2__ #if __AVX__ - if (elempack == 8) + if (elempack == 8) + { + const auto vElemsizei = _mm256_set1_epi32(elemsize / 8); + if (dims == 3) { - const auto vElemsizei = _mm256_set1_epi32(elemsize / 8); - if (dims == 3) - { - const auto outW = grid.h; - const auto outH = grid.c * grid.elempack; + const auto outW = grid.h; + const auto outH = grid.c * grid.elempack; - const auto vWi = _mm256_set1_epi32(outW); - const auto vHi = _mm256_set1_epi32(outH); + const auto vWi = _mm256_set1_epi32(outW); + const auto vHi = _mm256_set1_epi32(outH); - const auto vHf = _mm256_set1_ps(outH); - const auto vWf = _mm256_set1_ps(outW); + const auto vHf = _mm256_set1_ps(outH); + const auto vWf = _mm256_set1_ps(outW); - top_blob.create(outW, outH, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + top_blob.create(outW, outH, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - const auto vElempacki = _mm256_set1_epi32(elempack); + const auto vElempacki = _mm256_set1_epi32(elempack); - if (resize_type == 1) //zeros + if (resize_type == 1) //zeros + { + if (padding_mode == 1) //zeros { - if (padding_mode == 1) //zeros - { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - float* outptr = top_blob.channel(q); - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - - gx = 
get_coord(gx, vWf, padding_mode, align_corner); - gy = get_coord(gy, vHf, padding_mode, align_corner); - - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); - - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - - auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vWi, x0)); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); - auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vHi, y0)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); - - auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vWi, vElempacki)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); - - auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); - - auto _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(outptr, _v); - - outptr += 8; - } - } - } - } - else //border reflection + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) { - float* outptr = top_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - - gx = get_coord(gx, vWf, padding_mode, align_corner); - gy = get_coord(gy, vHf, padding_mode, align_corner); - - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - - auto 
nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); - - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); - - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vWi, vElempacki)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); - - auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_nw_offset, _mm256_set1_ps(-1.0f), sizeof(float)); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); - - auto _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(outptr, _v); - - outptr += 8; - } + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + gx = get_coord(gx, vWf, padding_mode, align_corner); + gy = get_coord(gy, vHf, padding_mode, align_corner); + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + + auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vWi, x0)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); + auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vHi, y0)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); + + auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vWi), x0), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_ne_offset = 
_mm256_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vWi, vElempacki)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(outptr, _v); + + outptr += 8; } } } - } - - if (resize_type == 2) + else //border reflection { - if (padding_mode == 1) //zeros + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) { - float* outptr = top_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + gx = get_coord(gx, vWf, padding_mode, align_corner); + gy = get_coord(gy, vHf, padding_mode, align_corner); + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); - gx = get_coord(gx, vWf, padding_mode, align_corner); - gy = get_coord(gy, vHf, padding_mode, align_corner); + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); - gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); - auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vHi, iy))); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - auto _v = 
_mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), - i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vWi), x0), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vWi, vElempacki)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); - _mm256_storeu_ps(outptr, _v); + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_nw_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); - outptr += 8; - } + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(outptr, _v); + + outptr += 8; } } } - else //border reflection + } + } + + if (resize_type == 2) + { + if (padding_mode == 1) //zeros + { + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) { - float* outptr = top_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + gx = get_coord(gx, vWf, padding_mode, align_corner); + gy = get_coord(gy, vHf, padding_mode, align_corner); - gx = get_coord(gx, vWf, padding_mode, align_corner); - gy = get_coord(gy, vHf, padding_mode, align_corner); + gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); + auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vWi, ix)), + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vHi, iy))); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vWi), ix), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), - 
i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), + i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); - _mm256_storeu_ps(outptr, _v); + _mm256_storeu_ps(outptr, _v); - outptr += 8; - } + outptr += 8; } } } } - - if (resize_type == 3) + else //border reflection { for (int q = 0; q < channels; q++) { - const float* outptr = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); for (int y = 0; y < outH; y++) { for (int x = 0; x < outW; x++) { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); + + gx = get_coord(gx, vWf, padding_mode, align_corner); + gy = get_coord(gy, vHf, padding_mode, align_corner); + + gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); + + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vWi), ix), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), + i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + _mm256_storeu_ps(outptr, _v); outptr += 8; } @@ -374,97 +352,102 @@ namespace ncnn { } } - if (dims == 4) + if (resize_type == 3) { - + for (int q = 0; q < channels; q++) + { + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + outptr += 8; + } + } + } } } + if (dims == 4) + { + } + } + #endif // __AVX__ - if (elempack == 4) + if (elempack == 4) + { + if (dims == 3) { if (dims == 3) { - if (dims == 3) - { - const int outW = grid.h; - const int outH = grid.c; + const int outW = grid.h; + const int outH = grid.c; - top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); - if (top_blob.empty()) - return -100; + top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) + return -100; - if (resize_type == 1) + if (resize_type == 1) + { + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - - - - outptr += 8; - } + outptr += 8; } } } + } - if (resize_type == 2) + if (resize_type == 2) + { + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - - - - outptr++; - } + outptr++; } } } + } - if (resize_type == 3) + if (resize_type == 3) + { + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - - - - outptr += 8; - } + outptr += 8; } } } } } - - if (dims == 4) - { - - } } -#endif // __SSE2__ - - if (elempack == 1) + if (dims == 4) { - return 
GridSample::forward(bottom_blobs, top_blobs, opt); } + } - return 0; +#endif // __SSE2__ + + if (elempack == 1) + { + return GridSample::forward(bottom_blobs, top_blobs, opt); } + return 0; +} + } // namespace ncnn From 13482ccd533c7df5755696dba13ad4e010d50a8d Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Wed, 9 Nov 2022 19:32:19 +0800 Subject: [PATCH 017/127] finish dims=3 pack=8 --- src/layer/x86/gridsample_x86.cpp | 784 +++++++++++++++++++------------ 1 file changed, 485 insertions(+), 299 deletions(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index d463ee8f4ca..9e6613d2fd0 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -26,428 +26,614 @@ namespace ncnn { -GridSample_x86::GridSample_x86() -{ + GridSample_x86::GridSample_x86() + { #if __SSE2__ - support_packing = true; + support_packing = true; #endif // __SSE2__ -} + } #if __SSE2__ #if __AVX__ -const __m256 v1fp8 = *(__m256*)_ps256_1; -const __m256i v1ip8 = _mm256_set1_epi32(1); -const __m256i vn1ip8 = _mm256_set1_epi32(-1); - -static __m256 NCNN_FORCEINLINE -grid_sample_unormalize(__m256 w, __m256 coordx, int align_corner) -{ - __m256 two = _mm256_set1_ps(2.f); - - if (align_corner) - return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coordx, v1fp8), two), _mm256_sub_ps(w, v1fp8)); - else - return _mm256_div_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(coordx, v1fp8), w), v1fp8), two); -} - -static NCNN_FORCEINLINE __m256 border_coord(__m256 coord, __m256 border) -{ - return _mm256_min_ps(border, _mm256_max_ps(coord, _mm256_setzero_ps())); -} - -static __m256 reflect_coord(__m256 x, __m256 high) -{ - /* take the absolute value */ - x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflect_v = _mm256_and_ps(_mm256_sub_ps(x, high), *(__m256*)_ps256_inv_sign_mask); - x = _mm256_sub_ps(high, reflect_v); - return x; -} - -static __m256 compute_coord(__m256 sx, __m256 w, int padding_mode, int align_corner) -{ - if (padding_mode == 2) // border + const __m256 v1fp8 = *(__m256*)_ps256_1; + const __m256 vn1fp8 = _mm256_set1_ps(-1.0f); + const __m256i v1ip8 = _mm256_set1_epi32(1); + const __m256i vn1ip8 = _mm256_set1_epi32(-1); + + static __m256 NCNN_FORCEINLINE + grid_sample_unormalize(const __m256& w, const __m256& coordx, int align_corner) { - sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); + __m256 two = _mm256_set1_ps(2.f); + + if (align_corner) + return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coordx, v1fp8), two), _mm256_sub_ps(w, v1fp8)); + else + return _mm256_div_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(coordx, v1fp8), w), v1fp8), two); } - else if (padding_mode == 3) // reflection + + static NCNN_FORCEINLINE __m256 border_coord(const __m256& coord, const __m256& border) { - if (align_corner) + return _mm256_min_ps(border, _mm256_max_ps(coord, _mm256_setzero_ps())); + } + + static NCNN_FORCEINLINE __m256 reflect_coord(__m256 x, const __m256& high) + { + /* take the absolute value */ + x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflect_v = _mm256_and_ps(_mm256_sub_ps(x, high), *(__m256*)_ps256_inv_sign_mask); + x = _mm256_sub_ps(high, reflect_v); + return x; + } + + static NCNN_FORCEINLINE __m256 compute_coord(__m256 sx, const __m256& w, int padding_mode, int align_corner) + { + if (padding_mode == 2) // border { - sx = reflect_coord(sx, _mm256_sub_ps(w, v1fp8)); + sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); } - else + else if (padding_mode == 3) // reflection { - __m256 v0p5f = 
*(__m256*)_ps256_0p5; - sx = _mm256_sub_ps(reflect_coord(_mm256_add_ps(sx, v0p5f), w), v0p5f); - sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); + if (align_corner) + { + sx = reflect_coord(sx, _mm256_sub_ps(w, v1fp8)); + } + else + { + __m256 v0p5f = *(__m256*)_ps256_0p5; + sx = _mm256_sub_ps(reflect_coord(_mm256_add_ps(sx, v0p5f), w), v0p5f); + sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); + } } + + return sx; } - return sx; -} + static NCNN_FORCEINLINE __m256 get_coord(const __m256& x, const __m256& w, int padding_mode, int align_corner) + { + // compute the origin coordinates + __m256 sx = grid_sample_unormalize(w, x, align_corner); + + // correct the coordinates according to the padding_mode + __m256 coord = compute_coord(sx, w, padding_mode, align_corner); -static __m256 get_coord(__m256 x, __m256 w, int padding_mode, int align_corner) -{ - // compute the origin coordinates - __m256 sx = grid_sample_unormalize(w, x, align_corner); + return coord; + } + + static NCNN_FORCEINLINE __m256 cubic_interp1d_p0(const __m256& x0_v, const __m256& x1_v, const __m256& x2_v, const __m256& x3_v, const __m256& tx) + { + const auto A = _mm256_set1_ps(-0.75f); - // correct the coordinates according to the padding_mode - __m256 coord = compute_coord(sx, w, padding_mode, align_corner); + const auto x0 = _mm256_add_ps(tx, v1fp8); + const auto& x1 = tx; + const auto x2 = _mm256_sub_ps(vn1fp8, tx); + const auto x3 = _mm256_add_ps(x2, v1fp8); + + //should be optimized? :( + const __m256 coeffs0 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x0), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(4), A)); + const __m256 coeffs1 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A , _mm256_set1_ps(2.0f)), x1), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x1), x1), v1fp8); + const __m256 coeffs2 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x2), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x2), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x2), _mm256_mul_ps(_mm256_set1_ps(4), A)); + const __m256 coeffs3 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x3), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x3), x3), v1fp8); + + + auto _v = _mm256_mul_ps(coeffs0, x0_v); + _v = _mm256_comp_fmadd_ps(coeffs1, x1, _v); + _v = _mm256_comp_fmadd_ps(coeffs2, x2, _v); + _v = _mm256_comp_fmadd_ps(coeffs3, x3, _v); + + return _v; + } - return coord; -} #endif // __AVX__ #endif // __SSE2__ -int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const -{ - const Mat& bottom_blob = bottom_blobs[0]; - const Mat& grid = bottom_blobs[1]; - Mat& top_blob = top_blobs[0]; - const int elempack = bottom_blob.elempack; - - int w = bottom_blob.w; - int h = bottom_blob.h; - int d = bottom_blob.d; - int channels = bottom_blob.c; - int dims = bottom_blob.dims; - size_t elemsize = bottom_blob.elemsize; + int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const + { + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& grid = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + const int elempack = bottom_blob.elempack; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; #if 
__SSE2__ #if __AVX__ - if (elempack == 8) - { - const auto vElemsizei = _mm256_set1_epi32(elemsize / 8); - if (dims == 3) + if (elempack == 8) { - const auto outW = grid.h; - const auto outH = grid.c * grid.elempack; + const auto vElemsizei = _mm256_set1_epi32(elemsize / 8); + if (dims == 3) + { + const auto outW = grid.h; + const auto outH = grid.c * grid.elempack; - const auto vWi = _mm256_set1_epi32(outW); - const auto vHi = _mm256_set1_epi32(outH); + const auto vWi = _mm256_set1_epi32(outW); + const auto vHi = _mm256_set1_epi32(outH); - const auto vHf = _mm256_set1_ps(outH); - const auto vWf = _mm256_set1_ps(outW); + const auto vHf = _mm256_set1_ps(outH); + const auto vWf = _mm256_set1_ps(outW); - top_blob.create(outW, outH, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + top_blob.create(outW, outH, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - const auto vElempacki = _mm256_set1_epi32(elempack); + const auto vElempacki = _mm256_set1_epi32(elempack); + const auto vElempackf = _mm256_set1_ps(elempack); - if (resize_type == 1) //zeros - { - if (padding_mode == 1) //zeros + if (resize_type == 1) //zeros { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (padding_mode == 1) //zeros { - float* outptr = top_blob.channel(q); - for (int y = 0; y < outH; y++) +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - - gx = get_coord(gx, vWf, padding_mode, align_corner); - gy = get_coord(gy, vHf, padding_mode, align_corner); - - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); - - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - - auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vWi, x0)); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); - auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vHi, y0)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); - - auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vWi, vElempacki)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, 
vElempacki); - - auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); - - auto _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(outptr, _v); - - outptr += 8; + for (int x = 0; x < outW; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + gx = get_coord(gx, vWf, padding_mode, align_corner); + gy = get_coord(gy, vHf, padding_mode, align_corner); + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + + auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vWi, x0)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); + auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vHi, y0)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); + + auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vWi), x0), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vWi, vElempacki)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + 
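                        // Bilinear blend of the four gathered corner values. Writing fx = gx - floor(gx)
                        // and fy = gy - floor(gy), the weights computed above are
                        //   nw = (1 - fy) * (1 - fx),  ne = (1 - fy) * fx,
                        //   sw = fy * (1 - fx),        se = fy * fx,
                        // so each of the 8 packed lanes evaluates, in scalar terms,
                        //   out = nw * I(y0, x0) + ne * I(y0, x1) + sw * I(y1, x0) + se * I(y1, x1)
                        // where I(y, x) is only shorthand for the input sample at that location, and
                        // corners that fall outside the feature map contribute 0 under zeros padding.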
_v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(outptr, _v); + + outptr += 8; + } } } } - } - else //border reflection - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + else //border reflection { - float* outptr = top_blob.channel(q); - for (int y = 0; y < outH; y++) +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - - gx = get_coord(gx, vWf, padding_mode, align_corner); - gy = get_coord(gy, vHf, padding_mode, align_corner); - - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + gx = get_coord(gx, vWf, padding_mode, align_corner); + gy = get_coord(gy, vHf, padding_mode, align_corner); + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); + + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vWi), x0), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vWi, vElempacki)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_nw_offset, vn1fp8, sizeof(float)); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(outptr, _v); + + outptr += 8; + } + } + } + } - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); + } - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); + if (resize_type == 2) + { + if (padding_mode == 1) //zeros + { + 
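                // Nearest-neighbour path (resize_type == 2) with zeros padding: gx/gy are rounded to
                // the nearest sample (_MM_FROUND_TO_NEAREST_INT rounds ties to even), one combined
                // in-range mask is built per lane, and a single masked gather fetches the value, so
                // out-of-range lanes keep 0. Scalar sketch of one lane, with W/H the valid sample bounds:
                //   out = (ix >= 0 && ix < W && iy >= 0 && iy < H) ? I(iy, ix) : 0.f;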
for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); + gx = get_coord(gx, vWf, padding_mode, align_corner); + gy = get_coord(gy, vHf, padding_mode, align_corner); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); + gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); - auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vWi, vElempacki)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vWi, ix)), + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vHi, iy))); - auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_nw_offset, _mm256_set1_ps(-1.0f), sizeof(float)); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vWi), ix), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), + i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); - _mm256_storeu_ps(outptr, _v); + _mm256_storeu_ps(outptr, _v); - outptr += 8; + outptr += 8; + } } } } - } - } - - if (resize_type == 2) - { - if (padding_mode == 1) //zeros - { - for (int q = 0; q < channels; q++) + else //border reflection { - float* outptr = top_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - - gx = get_coord(gx, vWf, padding_mode, align_corner); - gy 
= get_coord(gy, vHf, padding_mode, align_corner); + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gx = get_coord(gx, vWf, padding_mode, align_corner); + gy = get_coord(gy, vHf, padding_mode, align_corner); - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); + gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vHi, iy))); + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vWi), ix), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), - i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), + i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); - _mm256_storeu_ps(outptr, _v); + _mm256_storeu_ps(outptr, _v); - outptr += 8; + outptr += 8; + } } } } } - else //border reflection + + if (resize_type == 3) { - for (int q = 0; q < channels; q++) + if (padding_mode == 1) + { + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + gx = grid_sample_unormalize(vWf, gx, align_corner); + gy = grid_sample_unormalize(vHf, gy, align_corner); + + auto gx_floor = _mm256_floor_ps(gx); + auto gy_floor = _mm256_floor_ps(gy); + + const auto tx = _mm256_sub_ps(gx, gx_floor); + const auto ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + for (int i = 0; i < 4; i++) + { + auto gx0 = compute_coord(_mm256_add_ps(gy_floor, vn1fp8), vWf, padding_mode, align_corner); + auto gx1 = compute_coord(gy_floor, vWf, padding_mode, align_corner); + auto gx2 = compute_coord(_mm256_add_ps(gy_floor, v1fp8), vWf, padding_mode, align_corner); + auto gx3 = compute_coord(_mm256_add_ps(gy_floor, _mm256_set1_ps(2.0f)), vWf, padding_mode, align_corner); + + gy = compute_coord(_mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)), vHf, padding_mode, align_corner); + + auto x0 = _mm256_cvtps_epi32(gx0); + auto x1 = _mm256_cvtps_epi32(gx1); + auto x2 = _mm256_cvtps_epi32(gx2); + auto x3 = _mm256_cvtps_epi32(gx3); + + auto y = _mm256_cvtps_epi32(gy); + + auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vWi, x0)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); + auto x2_in_range = 
_mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vWi, x2)); + auto x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vWi, x3)); + + auto y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vHi, y)); + + auto v0_in_range = _mm256_and_si256(x0_in_range, y_in_range); + auto v1_in_range = _mm256_and_si256(x1_in_range, y_in_range); + auto v2_in_range = _mm256_and_si256(x2_in_range, y_in_range); + auto v3_in_range = _mm256_and_si256(x3_in_range, y_in_range); + + auto x0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx0), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx1), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx2), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx3), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + auto x0_offset = _mm256_cvtps_epi32(x0_offset_f); + auto x1_offset = _mm256_cvtps_epi32(x1_offset_f); + auto x2_offset = _mm256_cvtps_epi32(x2_offset_f); + auto x3_offset = _mm256_cvtps_epi32(x3_offset_f); + + auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x0_offset, *reinterpret_cast<__m256*>(&v0_in_range), sizeof(float)); + auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x1_offset, *reinterpret_cast<__m256*>(&v1_in_range), sizeof(float)); + auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x2_offset, *reinterpret_cast<__m256*>(&v2_in_range), sizeof(float)); + auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x3_offset, *reinterpret_cast<__m256*>(&v3_in_range), sizeof(float)); + + coefficients[i] = cubic_interp1d_p0(x0_val, x1_val, x2_val, x3_val, tx); + } + + auto _v = cubic_interp1d_p0(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(outptr, _v); + + outptr += 8; + } + } + } + } + else { - float* outptr = top_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + gx = grid_sample_unormalize(vWf, gx, align_corner); + gy = grid_sample_unormalize(vHf, gy, align_corner); + + auto gx_floor = _mm256_floor_ps(gx); + auto gy_floor = _mm256_floor_ps(gy); + + const auto tx = _mm256_sub_ps(gx, gx_floor); + const auto ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + for (int i = 0; i < 4; i++) + { + auto gx0 = compute_coord(_mm256_add_ps(gy_floor, vn1fp8), vWf, padding_mode, align_corner); + auto gx1 = compute_coord(gy_floor, vWf, padding_mode, align_corner); + auto gx2 = compute_coord(_mm256_add_ps(gy_floor, v1fp8), vWf, padding_mode, align_corner); + auto gx3 = 
compute_coord(_mm256_add_ps(gy_floor, _mm256_set1_ps(2.0f)), vWf, padding_mode, align_corner); + + gy = compute_coord(_mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)), vHf, padding_mode, align_corner); + + auto x0 = _mm256_cvtps_epi32(gx0); + auto x1 = _mm256_cvtps_epi32(gx1); + auto x2 = _mm256_cvtps_epi32(gx2); + auto x3 = _mm256_cvtps_epi32(gx3); - gx = get_coord(gx, vWf, padding_mode, align_corner); - gy = get_coord(gy, vHf, padding_mode, align_corner); + auto y = _mm256_cvtps_epi32(gy); - gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + auto x0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx0), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx1), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx2), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx3), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); + auto x0_offset = _mm256_cvtps_epi32(x0_offset_f); + auto x1_offset = _mm256_cvtps_epi32(x1_offset_f); + auto x2_offset = _mm256_cvtps_epi32(x2_offset_f); + auto x3_offset = _mm256_cvtps_epi32(x3_offset_f); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x0_offset, vn1fp8, sizeof(float)); + auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x1_offset, vn1fp8, sizeof(float)); + auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x2_offset, vn1fp8, sizeof(float)); + auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x3_offset, vn1fp8, sizeof(float)); - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), - i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + coefficients[i] = cubic_interp1d_p0(x0_val, x1_val, x2_val, x3_val, tx); + } - _mm256_storeu_ps(outptr, _v); + auto _v = cubic_interp1d_p0(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - outptr += 8; + _mm256_storeu_ps(outptr, _v); + + outptr += 8; + } } } } } } - if (resize_type == 3) + if (dims == 4) { - for (int q = 0; q < channels; q++) - { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - outptr += 8; - } - } - } + } } - if (dims == 4) - { - } - } - #endif // __AVX__ - if (elempack == 4) - { - if (dims == 3) + if (elempack == 4) { if (dims == 3) { - const int outW = grid.h; - const int outH = grid.c; + if (dims == 3) + { + const int outW = grid.h; + const int outH = grid.c; - top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); - if (top_blob.empty()) - return -100; + top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) + return -100; - if (resize_type == 1) - { - for (int q = 0; q < channels; q++) + if (resize_type == 1) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int q = 0; q < channels; 
q++) { - for (int x = 0; x < outW; x++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - outptr += 8; + for (int x = 0; x < outW; x++) + { + + + + outptr += 8; + } } } } - } - if (resize_type == 2) - { - for (int q = 0; q < channels; q++) + if (resize_type == 2) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - outptr++; + for (int x = 0; x < outW; x++) + { + + + + outptr++; + } } } } - } - if (resize_type == 3) - { - for (int q = 0; q < channels; q++) + if (resize_type == 3) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - outptr += 8; + for (int x = 0; x < outW; x++) + { + + + + outptr += 8; + } } } } } } - } - if (dims == 4) - { + if (dims == 4) + { + + } } - } #endif // __SSE2__ - if (elempack == 1) - { - return GridSample::forward(bottom_blobs, top_blobs, opt); - } + if (elempack == 1) + { + return GridSample::forward(bottom_blobs, top_blobs, opt); + } - return 0; -} + return 0; + } } // namespace ncnn From 11d4c3661c3ad9ecb37ae1666248c990801a33b5 Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Wed, 9 Nov 2022 11:34:24 +0000 Subject: [PATCH 018/127] apply code-format changes --- src/layer/x86/gridsample_x86.cpp | 872 +++++++++++++++---------------- 1 file changed, 429 insertions(+), 443 deletions(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 9e6613d2fd0..bbdd3930c8c 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -26,614 +26,600 @@ namespace ncnn { - GridSample_x86::GridSample_x86() - { +GridSample_x86::GridSample_x86() +{ #if __SSE2__ - support_packing = true; + support_packing = true; #endif // __SSE2__ - } +} #if __SSE2__ #if __AVX__ - const __m256 v1fp8 = *(__m256*)_ps256_1; - const __m256 vn1fp8 = _mm256_set1_ps(-1.0f); - const __m256i v1ip8 = _mm256_set1_epi32(1); - const __m256i vn1ip8 = _mm256_set1_epi32(-1); - - static __m256 NCNN_FORCEINLINE - grid_sample_unormalize(const __m256& w, const __m256& coordx, int align_corner) +const __m256 v1fp8 = *(__m256*)_ps256_1; +const __m256 vn1fp8 = _mm256_set1_ps(-1.0f); +const __m256i v1ip8 = _mm256_set1_epi32(1); +const __m256i vn1ip8 = _mm256_set1_epi32(-1); + +static __m256 NCNN_FORCEINLINE +grid_sample_unormalize(const __m256& w, const __m256& coordx, int align_corner) +{ + __m256 two = _mm256_set1_ps(2.f); + + if (align_corner) + return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coordx, v1fp8), two), _mm256_sub_ps(w, v1fp8)); + else + return _mm256_div_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(coordx, v1fp8), w), v1fp8), two); +} + +static NCNN_FORCEINLINE __m256 border_coord(const __m256& coord, const __m256& border) +{ + return _mm256_min_ps(border, _mm256_max_ps(coord, _mm256_setzero_ps())); +} + +static NCNN_FORCEINLINE __m256 reflect_coord(__m256 x, const __m256& high) +{ + /* take the absolute value */ + x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflect_v = _mm256_and_ps(_mm256_sub_ps(x, high), *(__m256*)_ps256_inv_sign_mask); + x = _mm256_sub_ps(high, reflect_v); + return x; +} + +static NCNN_FORCEINLINE __m256 compute_coord(__m256 sx, const __m256& w, int padding_mode, int align_corner) +{ + if (padding_mode == 
2) // border { - __m256 two = _mm256_set1_ps(2.f); - - if (align_corner) - return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coordx, v1fp8), two), _mm256_sub_ps(w, v1fp8)); - else - return _mm256_div_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(coordx, v1fp8), w), v1fp8), two); + sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); } - - static NCNN_FORCEINLINE __m256 border_coord(const __m256& coord, const __m256& border) + else if (padding_mode == 3) // reflection { - return _mm256_min_ps(border, _mm256_max_ps(coord, _mm256_setzero_ps())); - } - - static NCNN_FORCEINLINE __m256 reflect_coord(__m256 x, const __m256& high) - { - /* take the absolute value */ - x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflect_v = _mm256_and_ps(_mm256_sub_ps(x, high), *(__m256*)_ps256_inv_sign_mask); - x = _mm256_sub_ps(high, reflect_v); - return x; - } - - static NCNN_FORCEINLINE __m256 compute_coord(__m256 sx, const __m256& w, int padding_mode, int align_corner) - { - if (padding_mode == 2) // border + if (align_corner) { - sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); + sx = reflect_coord(sx, _mm256_sub_ps(w, v1fp8)); } - else if (padding_mode == 3) // reflection + else { - if (align_corner) - { - sx = reflect_coord(sx, _mm256_sub_ps(w, v1fp8)); - } - else - { - __m256 v0p5f = *(__m256*)_ps256_0p5; - sx = _mm256_sub_ps(reflect_coord(_mm256_add_ps(sx, v0p5f), w), v0p5f); - sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); - } + __m256 v0p5f = *(__m256*)_ps256_0p5; + sx = _mm256_sub_ps(reflect_coord(_mm256_add_ps(sx, v0p5f), w), v0p5f); + sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); } - - return sx; } - static NCNN_FORCEINLINE __m256 get_coord(const __m256& x, const __m256& w, int padding_mode, int align_corner) - { - // compute the origin coordinates - __m256 sx = grid_sample_unormalize(w, x, align_corner); + return sx; +} - // correct the coordinates according to the padding_mode - __m256 coord = compute_coord(sx, w, padding_mode, align_corner); +static NCNN_FORCEINLINE __m256 get_coord(const __m256& x, const __m256& w, int padding_mode, int align_corner) +{ + // compute the origin coordinates + __m256 sx = grid_sample_unormalize(w, x, align_corner); - return coord; - } - - static NCNN_FORCEINLINE __m256 cubic_interp1d_p0(const __m256& x0_v, const __m256& x1_v, const __m256& x2_v, const __m256& x3_v, const __m256& tx) - { - const auto A = _mm256_set1_ps(-0.75f); + // correct the coordinates according to the padding_mode + __m256 coord = compute_coord(sx, w, padding_mode, align_corner); - const auto x0 = _mm256_add_ps(tx, v1fp8); - const auto& x1 = tx; - const auto x2 = _mm256_sub_ps(vn1fp8, tx); - const auto x3 = _mm256_add_ps(x2, v1fp8); + return coord; +} - //should be optimized? 
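    // Cubic convolution (bicubic) tap weights with A = -0.75, the same kernel constant PyTorch
    // uses for mode='bicubic'. For a fractional offset t in [0, 1) the four taps sit at distances
    // 1+t, t, 1-t and 2-t from the sample point, and the reference weights are
    //   W(d) = (A + 2)|d|^3 - (A + 3)|d|^2 + 1          for |d| <= 1
    //   W(d) = A|d|^3 - 5A|d|^2 + 8A|d| - 4A            for 1 < |d| < 2
    // (Keys' cubic convolution kernel).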
:( - const __m256 coeffs0 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x0), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(4), A)); - const __m256 coeffs1 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A , _mm256_set1_ps(2.0f)), x1), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x1), x1), v1fp8); - const __m256 coeffs2 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x2), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x2), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x2), _mm256_mul_ps(_mm256_set1_ps(4), A)); - const __m256 coeffs3 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x3), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x3), x3), v1fp8); +static NCNN_FORCEINLINE __m256 cubic_interp1d_p0(const __m256& x0_v, const __m256& x1_v, const __m256& x2_v, const __m256& x3_v, const __m256& tx) +{ + const auto A = _mm256_set1_ps(-0.75f); + const auto x0 = _mm256_add_ps(tx, v1fp8); + const auto& x1 = tx; + const auto x2 = _mm256_sub_ps(vn1fp8, tx); + const auto x3 = _mm256_add_ps(x2, v1fp8); - auto _v = _mm256_mul_ps(coeffs0, x0_v); - _v = _mm256_comp_fmadd_ps(coeffs1, x1, _v); - _v = _mm256_comp_fmadd_ps(coeffs2, x2, _v); - _v = _mm256_comp_fmadd_ps(coeffs3, x3, _v); + //should be optimized? :( + const __m256 coeffs0 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x0), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(4), A)); + const __m256 coeffs1 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x1), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x1), x1), v1fp8); + const __m256 coeffs2 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x2), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x2), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x2), _mm256_mul_ps(_mm256_set1_ps(4), A)); + const __m256 coeffs3 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x3), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x3), x3), v1fp8); - return _v; - } + auto _v = _mm256_mul_ps(coeffs0, x0_v); + _v = _mm256_comp_fmadd_ps(coeffs1, x1, _v); + _v = _mm256_comp_fmadd_ps(coeffs2, x2, _v); + _v = _mm256_comp_fmadd_ps(coeffs3, x3, _v); + return _v; +} #endif // __AVX__ #endif // __SSE2__ - int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const - { - const Mat& bottom_blob = bottom_blobs[0]; - const Mat& grid = bottom_blobs[1]; - Mat& top_blob = top_blobs[0]; - const int elempack = bottom_blob.elempack; - - int w = bottom_blob.w; - int h = bottom_blob.h; - int d = bottom_blob.d; - int channels = bottom_blob.c; - int dims = bottom_blob.dims; - size_t elemsize = bottom_blob.elemsize; +int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& grid = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + const int elempack = bottom_blob.elempack; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; #if __SSE2__ #if __AVX__ - if (elempack == 8) + if 
(elempack == 8) + { + const auto vElemsizei = _mm256_set1_epi32(elemsize / 8); + if (dims == 3) { - const auto vElemsizei = _mm256_set1_epi32(elemsize / 8); - if (dims == 3) - { - const auto outW = grid.h; - const auto outH = grid.c * grid.elempack; + const auto outW = grid.h; + const auto outH = grid.c * grid.elempack; - const auto vWi = _mm256_set1_epi32(outW); - const auto vHi = _mm256_set1_epi32(outH); + const auto vWi = _mm256_set1_epi32(outW); + const auto vHi = _mm256_set1_epi32(outH); - const auto vHf = _mm256_set1_ps(outH); - const auto vWf = _mm256_set1_ps(outW); + const auto vHf = _mm256_set1_ps(outH); + const auto vWf = _mm256_set1_ps(outW); - top_blob.create(outW, outH, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + top_blob.create(outW, outH, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - const auto vElempacki = _mm256_set1_epi32(elempack); - const auto vElempackf = _mm256_set1_ps(elempack); + const auto vElempacki = _mm256_set1_epi32(elempack); + const auto vElempackf = _mm256_set1_ps(elempack); - if (resize_type == 1) //zeros + if (resize_type == 1) //zeros + { + if (padding_mode == 1) //zeros { - if (padding_mode == 1) //zeros + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - - gx = get_coord(gx, vWf, padding_mode, align_corner); - gy = get_coord(gy, vHf, padding_mode, align_corner); - - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); - - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - - auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vWi, x0)); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); - auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vHi, y0)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); - - auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = 
_mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vWi, vElempacki)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); - - auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); - - auto _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(outptr, _v); - - outptr += 8; - } + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + gx = get_coord(gx, vWf, padding_mode, align_corner); + gy = get_coord(gy, vHf, padding_mode, align_corner); + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + + auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vWi, x0)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); + auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vHi, y0)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); + + auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vWi), x0), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vWi, vElempacki)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + 
_mm256_storeu_ps(outptr, _v); + + outptr += 8; } } } - else //border reflection + } + else //border reflection + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - gx = get_coord(gx, vWf, padding_mode, align_corner); - gy = get_coord(gy, vHf, padding_mode, align_corner); + gx = get_coord(gx, vWf, padding_mode, align_corner); + gy = get_coord(gy, vHf, padding_mode, align_corner); - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vWi, vElempacki)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vWi), x0), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vWi, vElempacki)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); - auto nw_val = 
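The masked gathers work because _mm256_mask_i32gather_ps loads only the lanes whose mask sign bit is set and keeps the src value (zero here) elsewhere; the integer compare results are simply reinterpreted as that float mask, and vn1fp8 (-1.0f in every lane) has every sign bit set, so it acts as an all-true mask in the border branch. A standalone sketch of the idiom, assuming an AVX2 target (compile with -mavx2); names and values are illustrative only:

#include <immintrin.h>
#include <cstdio>

int main()
{
    const float data[8] = {10, 11, 12, 13, 14, 15, 16, 17};

    __m256i idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); // lane i reads data[i]
    __m256i limit = _mm256_set1_epi32(4);

    // lanes with idx < 4 compare true (all-ones), the rest compare false (all-zeros)
    __m256i in_range = _mm256_cmpgt_epi32(limit, idx);
    // same bit reinterpretation as the reinterpret_cast<__m256*> used in the patch
    __m256 mask = _mm256_castsi256_ps(in_range);

    // masked lanes load from data, unmasked lanes keep the zero fallback
    __m256 v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), data, idx, mask, sizeof(float));

    float out[8];
    _mm256_storeu_ps(out, v);
    for (int i = 0; i < 8; i++)
        printf("%.0f ", out[i]); // expected: 10 11 12 13 0 0 0 0
    printf("\n");
    return 0;
}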
_mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_nw_offset, vn1fp8, sizeof(float)); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_nw_offset, vn1fp8, sizeof(float)); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); - auto _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); - _mm256_storeu_ps(outptr, _v); + _mm256_storeu_ps(outptr, _v); - outptr += 8; - } + outptr += 8; } } } - } + } - if (resize_type == 2) + if (resize_type == 2) + { + if (padding_mode == 1) //zeros { - if (padding_mode == 1) //zeros + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) { - float* outptr = top_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - gx = get_coord(gx, vWf, padding_mode, align_corner); - gy = get_coord(gy, vHf, padding_mode, align_corner); + gx = get_coord(gx, vWf, padding_mode, align_corner); + gy = get_coord(gy, vHf, padding_mode, align_corner); - gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); - auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vHi, iy))); + auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vWi, ix)), + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vHi, iy))); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_offset = 
_mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vWi), ix), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), - i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), + i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); - _mm256_storeu_ps(outptr, _v); + _mm256_storeu_ps(outptr, _v); - outptr += 8; - } + outptr += 8; } } } - else //border reflection + } + else //border reflection + { + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) { - float* outptr = top_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - gx = get_coord(gx, vWf, padding_mode, align_corner); - gy = get_coord(gy, vHf, padding_mode, align_corner); + gx = get_coord(gx, vWf, padding_mode, align_corner); + gy = get_coord(gy, vHf, padding_mode, align_corner); - gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vWi), ix), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), - i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), + i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); - _mm256_storeu_ps(outptr, _v); + _mm256_storeu_ps(outptr, _v); - outptr += 8; - } + outptr += 8; } } } } + } - if (resize_type == 3) + if (resize_type == 3) + { + if (padding_mode == 1) { - if (padding_mode == 1) + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - - gx = grid_sample_unormalize(vWf, gx, align_corner); - gy = grid_sample_unormalize(vHf, gy, align_corner); - - auto gx_floor = 
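One subtlety in this nearest branch: _MM_FROUND_TO_NEAREST_INT rounds halves to the nearest even integer, while std::round in <cmath> rounds halves away from zero, which appears to be the mismatch the later commit subject in this series calls "cmath&avx2 round diff". A small demo of where the two conventions disagree (illustrative, not part of the patch):

#include <cmath>
#include <cstdio>

int main()
{
    const float samples[] = {0.5f, 1.5f, 2.5f, -0.5f, -1.5f};
    for (float x : samples)
    {
        // std::nearbyint follows the current FP rounding mode
        // (round-to-nearest-even by default), like _MM_FROUND_TO_NEAREST_INT;
        // std::round always rounds halves away from zero.
        printf("x=%5.1f  round=%5.1f  nearbyint=%5.1f\n", x, std::round(x), std::nearbyint(x));
    }
    return 0;
}

On 0.5, 2.5 and -0.5 the two results differ by one pixel, which is enough to make a scalar reference path and this vector path pick different source texels.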
_mm256_floor_ps(gx); - auto gy_floor = _mm256_floor_ps(gy); - - const auto tx = _mm256_sub_ps(gx, gx_floor); - const auto ty = _mm256_sub_ps(gy, gy_floor); + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - __m256 coefficients[4]; + gx = grid_sample_unormalize(vWf, gx, align_corner); + gy = grid_sample_unormalize(vHf, gy, align_corner); - for (int i = 0; i < 4; i++) - { - auto gx0 = compute_coord(_mm256_add_ps(gy_floor, vn1fp8), vWf, padding_mode, align_corner); - auto gx1 = compute_coord(gy_floor, vWf, padding_mode, align_corner); - auto gx2 = compute_coord(_mm256_add_ps(gy_floor, v1fp8), vWf, padding_mode, align_corner); - auto gx3 = compute_coord(_mm256_add_ps(gy_floor, _mm256_set1_ps(2.0f)), vWf, padding_mode, align_corner); + auto gx_floor = _mm256_floor_ps(gx); + auto gy_floor = _mm256_floor_ps(gy); - gy = compute_coord(_mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)), vHf, padding_mode, align_corner); + const auto tx = _mm256_sub_ps(gx, gx_floor); + const auto ty = _mm256_sub_ps(gy, gy_floor); - auto x0 = _mm256_cvtps_epi32(gx0); - auto x1 = _mm256_cvtps_epi32(gx1); - auto x2 = _mm256_cvtps_epi32(gx2); - auto x3 = _mm256_cvtps_epi32(gx3); + __m256 coefficients[4]; - auto y = _mm256_cvtps_epi32(gy); - - auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vWi, x0)); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); - auto x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vWi, x2)); - auto x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vWi, x3)); - - auto y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vHi, y)); - - auto v0_in_range = _mm256_and_si256(x0_in_range, y_in_range); - auto v1_in_range = _mm256_and_si256(x1_in_range, y_in_range); - auto v2_in_range = _mm256_and_si256(x2_in_range, y_in_range); - auto v3_in_range = _mm256_and_si256(x3_in_range, y_in_range); + for (int i = 0; i < 4; i++) + { + auto gx0 = compute_coord(_mm256_add_ps(gy_floor, vn1fp8), vWf, padding_mode, align_corner); + auto gx1 = compute_coord(gy_floor, vWf, padding_mode, align_corner); + auto gx2 = compute_coord(_mm256_add_ps(gy_floor, v1fp8), vWf, padding_mode, align_corner); + auto gx3 = compute_coord(_mm256_add_ps(gy_floor, _mm256_set1_ps(2.0f)), vWf, padding_mode, align_corner); - auto x0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + gy = compute_coord(_mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)), vHf, padding_mode, align_corner); - auto x0_offset = _mm256_cvtps_epi32(x0_offset_f); - auto x1_offset = _mm256_cvtps_epi32(x1_offset_f); - auto x2_offset = _mm256_cvtps_epi32(x2_offset_f); - auto x3_offset = _mm256_cvtps_epi32(x3_offset_f); + auto x0 = _mm256_cvtps_epi32(gx0); 
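The coefficient chains feeding cubic_interp1d_p0 are Keys' cubic convolution weights with A = -0.75, evaluated at the four tap distances t+1, t, 1-t and 2-t. A scalar sketch of the same polynomials (illustrative helper names; as far as I can tell from the diff, the later cubic_interp1d_p8 in this series pairs the (A+2) polynomial with 1-t for coeffs2 and the A polynomial with 2-t for coeffs3, and accumulates against the sample values, which is the conventional form followed here):

// Scalar version of the cubic convolution weights with A = -0.75 (Keys kernel).
// `t` is the fractional offset in [0,1); the four weights multiply the samples
// at x-1, x, x+1, x+2 and sum to approximately 1.
static void cubic_coeffs(float t, float coeffs[4])
{
    const float A = -0.75f;
    const float x0 = t + 1.f;  // distance to the tap at x-1
    const float x1 = t;        // distance to the tap at x
    const float x2 = 1.f - t;  // distance to the tap at x+1
    const float x3 = x2 + 1.f; // distance to the tap at x+2

    coeffs[0] = ((A * x0 - 5.f * A) * x0 + 8.f * A) * x0 - 4.f * A;
    coeffs[1] = ((A + 2.f) * x1 - (A + 3.f)) * x1 * x1 + 1.f;
    coeffs[2] = ((A + 2.f) * x2 - (A + 3.f)) * x2 * x2 + 1.f;
    coeffs[3] = ((A * x3 - 5.f * A) * x3 + 8.f * A) * x3 - 4.f * A;
}

static float cubic_interp1d(float p0, float p1, float p2, float p3, float t)
{
    float c[4];
    cubic_coeffs(t, c);
    return c[0] * p0 + c[1] * p1 + c[2] * p2 + c[3] * p3;
}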
+ auto x1 = _mm256_cvtps_epi32(gx1); + auto x2 = _mm256_cvtps_epi32(gx2); + auto x3 = _mm256_cvtps_epi32(gx3); - auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x0_offset, *reinterpret_cast<__m256*>(&v0_in_range), sizeof(float)); - auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x1_offset, *reinterpret_cast<__m256*>(&v1_in_range), sizeof(float)); - auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x2_offset, *reinterpret_cast<__m256*>(&v2_in_range), sizeof(float)); - auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x3_offset, *reinterpret_cast<__m256*>(&v3_in_range), sizeof(float)); + auto y = _mm256_cvtps_epi32(gy); - coefficients[i] = cubic_interp1d_p0(x0_val, x1_val, x2_val, x3_val, tx); - } + auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vWi, x0)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); + auto x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vWi, x2)); + auto x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vWi, x3)); + + auto y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vHi, y)); + + auto v0_in_range = _mm256_and_si256(x0_in_range, y_in_range); + auto v1_in_range = _mm256_and_si256(x1_in_range, y_in_range); + auto v2_in_range = _mm256_and_si256(x2_in_range, y_in_range); + auto v3_in_range = _mm256_and_si256(x3_in_range, y_in_range); + + auto x0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx0), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx1), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx2), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx3), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + auto x0_offset = _mm256_cvtps_epi32(x0_offset_f); + auto x1_offset = _mm256_cvtps_epi32(x1_offset_f); + auto x2_offset = _mm256_cvtps_epi32(x2_offset_f); + auto x3_offset = _mm256_cvtps_epi32(x3_offset_f); + + auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x0_offset, *reinterpret_cast<__m256*>(&v0_in_range), sizeof(float)); + auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x1_offset, *reinterpret_cast<__m256*>(&v1_in_range), sizeof(float)); + auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x2_offset, *reinterpret_cast<__m256*>(&v2_in_range), sizeof(float)); + auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x3_offset, *reinterpret_cast<__m256*>(&v3_in_range), sizeof(float)); + + coefficients[i] = cubic_interp1d_p0(x0_val, x1_val, x2_val, x3_val, tx); + } - auto _v = cubic_interp1d_p0(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + auto _v = cubic_interp1d_p0(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - _mm256_storeu_ps(outptr, _v); + _mm256_storeu_ps(outptr, _v); - outptr += 8; - } + outptr += 8; } } } - else + } + else + { + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + const float* ptr = 
static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - - gx = grid_sample_unormalize(vWf, gx, align_corner); - gy = grid_sample_unormalize(vHf, gy, align_corner); - - auto gx_floor = _mm256_floor_ps(gx); - auto gy_floor = _mm256_floor_ps(gy); + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - const auto tx = _mm256_sub_ps(gx, gx_floor); - const auto ty = _mm256_sub_ps(gy, gy_floor); + gx = grid_sample_unormalize(vWf, gx, align_corner); + gy = grid_sample_unormalize(vHf, gy, align_corner); - __m256 coefficients[4]; + auto gx_floor = _mm256_floor_ps(gx); + auto gy_floor = _mm256_floor_ps(gy); - for (int i = 0; i < 4; i++) - { - auto gx0 = compute_coord(_mm256_add_ps(gy_floor, vn1fp8), vWf, padding_mode, align_corner); - auto gx1 = compute_coord(gy_floor, vWf, padding_mode, align_corner); - auto gx2 = compute_coord(_mm256_add_ps(gy_floor, v1fp8), vWf, padding_mode, align_corner); - auto gx3 = compute_coord(_mm256_add_ps(gy_floor, _mm256_set1_ps(2.0f)), vWf, padding_mode, align_corner); + const auto tx = _mm256_sub_ps(gx, gx_floor); + const auto ty = _mm256_sub_ps(gy, gy_floor); - gy = compute_coord(_mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)), vHf, padding_mode, align_corner); + __m256 coefficients[4]; - auto x0 = _mm256_cvtps_epi32(gx0); - auto x1 = _mm256_cvtps_epi32(gx1); - auto x2 = _mm256_cvtps_epi32(gx2); - auto x3 = _mm256_cvtps_epi32(gx3); - - auto y = _mm256_cvtps_epi32(gy); - - auto x0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - auto x0_offset = _mm256_cvtps_epi32(x0_offset_f); - auto x1_offset = _mm256_cvtps_epi32(x1_offset_f); - auto x2_offset = _mm256_cvtps_epi32(x2_offset_f); - auto x3_offset = _mm256_cvtps_epi32(x3_offset_f); - - auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x0_offset, vn1fp8, sizeof(float)); - auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x1_offset, vn1fp8, sizeof(float)); - auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x2_offset, vn1fp8, sizeof(float)); - auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x3_offset, vn1fp8, sizeof(float)); - - coefficients[i] = cubic_interp1d_p0(x0_val, x1_val, x2_val, x3_val, tx); - } + for (int i = 0; i < 4; i++) + { + auto gx0 = compute_coord(_mm256_add_ps(gy_floor, vn1fp8), vWf, padding_mode, align_corner); + auto gx1 = compute_coord(gy_floor, vWf, padding_mode, align_corner); + auto gx2 = 
compute_coord(_mm256_add_ps(gy_floor, v1fp8), vWf, padding_mode, align_corner); + auto gx3 = compute_coord(_mm256_add_ps(gy_floor, _mm256_set1_ps(2.0f)), vWf, padding_mode, align_corner); + + gy = compute_coord(_mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)), vHf, padding_mode, align_corner); + + auto x0 = _mm256_cvtps_epi32(gx0); + auto x1 = _mm256_cvtps_epi32(gx1); + auto x2 = _mm256_cvtps_epi32(gx2); + auto x3 = _mm256_cvtps_epi32(gx3); + + auto y = _mm256_cvtps_epi32(gy); + + auto x0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx0), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx1), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx2), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx3), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + auto x0_offset = _mm256_cvtps_epi32(x0_offset_f); + auto x1_offset = _mm256_cvtps_epi32(x1_offset_f); + auto x2_offset = _mm256_cvtps_epi32(x2_offset_f); + auto x3_offset = _mm256_cvtps_epi32(x3_offset_f); + + auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x0_offset, vn1fp8, sizeof(float)); + auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x1_offset, vn1fp8, sizeof(float)); + auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x2_offset, vn1fp8, sizeof(float)); + auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x3_offset, vn1fp8, sizeof(float)); + + coefficients[i] = cubic_interp1d_p0(x0_val, x1_val, x2_val, x3_val, tx); + } - auto _v = cubic_interp1d_p0(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + auto _v = cubic_interp1d_p0(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - _mm256_storeu_ps(outptr, _v); + _mm256_storeu_ps(outptr, _v); - outptr += 8; - } + outptr += 8; } } } } } + } - if (dims == 4) - { - - } + if (dims == 4) + { } + } #endif // __AVX__ - if (elempack == 4) + if (elempack == 4) + { + if (dims == 3) { if (dims == 3) { - if (dims == 3) - { - const int outW = grid.h; - const int outH = grid.c; + const int outW = grid.h; + const int outH = grid.c; - top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); - if (top_blob.empty()) - return -100; + top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) + return -100; - if (resize_type == 1) + if (resize_type == 1) + { + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - - - - outptr += 8; - } + outptr += 8; } } } + } - if (resize_type == 2) + if (resize_type == 2) + { + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - - - - outptr++; - } + outptr++; } } } + } - if (resize_type == 3) + if 
(resize_type == 3) + { + for (int q = 0; q < channels; q++) { - for (int q = 0; q < channels; q++) + const float* outptr = bottom_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - - - - outptr += 8; - } + outptr += 8; } } } } } - - if (dims == 4) - { - - } } -#endif // __SSE2__ - - if (elempack == 1) + if (dims == 4) { - return GridSample::forward(bottom_blobs, top_blobs, opt); } + } - return 0; +#endif // __SSE2__ + + if (elempack == 1) + { + return GridSample::forward(bottom_blobs, top_blobs, opt); } + return 0; +} + } // namespace ncnn From 7f6193a2035168fc83abf312f60c0f3c083aca0a Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Thu, 10 Nov 2022 23:20:29 +0800 Subject: [PATCH 019/127] x86 optimize, mode=nearest(cmath&avx2 round diff) --- src/layer/x86/gridsample_x86.cpp | 1986 +++++++++++++++++++++++------- 1 file changed, 1568 insertions(+), 418 deletions(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index bbdd3930c8c..a6b1f16c516 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -26,565 +26,1213 @@ namespace ncnn { -GridSample_x86::GridSample_x86() -{ + GridSample_x86::GridSample_x86() + { #if __SSE2__ - support_packing = true; + support_packing = true; #endif // __SSE2__ -} + } #if __SSE2__ #if __AVX__ -const __m256 v1fp8 = *(__m256*)_ps256_1; -const __m256 vn1fp8 = _mm256_set1_ps(-1.0f); -const __m256i v1ip8 = _mm256_set1_epi32(1); -const __m256i vn1ip8 = _mm256_set1_epi32(-1); - -static __m256 NCNN_FORCEINLINE -grid_sample_unormalize(const __m256& w, const __m256& coordx, int align_corner) -{ - __m256 two = _mm256_set1_ps(2.f); - - if (align_corner) - return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coordx, v1fp8), two), _mm256_sub_ps(w, v1fp8)); - else - return _mm256_div_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(coordx, v1fp8), w), v1fp8), two); -} - -static NCNN_FORCEINLINE __m256 border_coord(const __m256& coord, const __m256& border) -{ - return _mm256_min_ps(border, _mm256_max_ps(coord, _mm256_setzero_ps())); -} - -static NCNN_FORCEINLINE __m256 reflect_coord(__m256 x, const __m256& high) -{ - /* take the absolute value */ - x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflect_v = _mm256_and_ps(_mm256_sub_ps(x, high), *(__m256*)_ps256_inv_sign_mask); - x = _mm256_sub_ps(high, reflect_v); - return x; -} - -static NCNN_FORCEINLINE __m256 compute_coord(__m256 sx, const __m256& w, int padding_mode, int align_corner) -{ - if (padding_mode == 2) // border + const __m256 v1fp8 = *(__m256*)_ps256_1; + const __m256 vn1fp8 = _mm256_set1_ps(-1.0f); + const __m256i v1ip8 = _mm256_set1_epi32(1); + const __m256i vn1ip8 = _mm256_set1_epi32(-1); + + static __m256 NCNN_FORCEINLINE + grid_sample_unormalize_p8(const __m256& w, const __m256& coordx, int align_corner) + { + __m256 two = _mm256_set1_ps(2.f); + + if (align_corner) + return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coordx, v1fp8), two), _mm256_sub_ps(w, v1fp8)); + else + return _mm256_div_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(coordx, v1fp8), w), v1fp8), two); + } + + static NCNN_FORCEINLINE __m256 border_coord_p8(const __m256& coord, const __m256& border) { - sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); + return _mm256_min_ps(border, _mm256_max_ps(coord, _mm256_setzero_ps())); } - else if (padding_mode == 3) // reflection + + static NCNN_FORCEINLINE __m256 
reflect_coord_p8(__m256 x, const __m256& high) { - if (align_corner) + /* take the absolute value */ + x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflect_v = _mm256_and_ps(_mm256_sub_ps(x, high), *(__m256*)_ps256_inv_sign_mask); + x = _mm256_sub_ps(high, reflect_v); + return x; + } + + static NCNN_FORCEINLINE __m256 compute_coord_p8(__m256 sx, const __m256& w, int padding_mode, int align_corner) + { + if (padding_mode == 2) // border { - sx = reflect_coord(sx, _mm256_sub_ps(w, v1fp8)); + sx = border_coord_p8(sx, _mm256_sub_ps(w, v1fp8)); } - else + else if (padding_mode == 3) // reflection { - __m256 v0p5f = *(__m256*)_ps256_0p5; - sx = _mm256_sub_ps(reflect_coord(_mm256_add_ps(sx, v0p5f), w), v0p5f); - sx = border_coord(sx, _mm256_sub_ps(w, v1fp8)); + if (align_corner) + { + sx = reflect_coord_p8(sx, _mm256_sub_ps(w, v1fp8)); + } + else + { + __m256 v0p5f = *(__m256*)_ps256_0p5; + sx = _mm256_sub_ps(reflect_coord_p8(_mm256_add_ps(sx, v0p5f), w), v0p5f); + sx = border_coord_p8(sx, _mm256_sub_ps(w, v1fp8)); + } } + + return sx; } - return sx; -} + static NCNN_FORCEINLINE __m256 get_coord_p8(const __m256& x, const __m256& w, int padding_mode, int align_corner) + { + // compute the origin coordinates + __m256 sx = grid_sample_unormalize_p8(w, x, align_corner); -static NCNN_FORCEINLINE __m256 get_coord(const __m256& x, const __m256& w, int padding_mode, int align_corner) -{ - // compute the origin coordinates - __m256 sx = grid_sample_unormalize(w, x, align_corner); + // correct the coordinates according to the padding_mode + __m256 coord = compute_coord_p8(sx, w, padding_mode, align_corner); - // correct the coordinates according to the padding_mode - __m256 coord = compute_coord(sx, w, padding_mode, align_corner); + return coord; + } - return coord; -} + static NCNN_FORCEINLINE __m256 cubic_interp1d_p8(const __m256& x0_v, const __m256& x1_v, const __m256& x2_v, const __m256& x3_v, const __m256& tx) + { + const auto A = _mm256_set1_ps(-0.75f); + + const auto x0 = _mm256_add_ps(tx, v1fp8); + const auto& x1 = tx; + const auto x2 = _mm256_sub_ps(v1fp8, tx); + const auto x3 = _mm256_add_ps(x2, v1fp8); -static NCNN_FORCEINLINE __m256 cubic_interp1d_p0(const __m256& x0_v, const __m256& x1_v, const __m256& x2_v, const __m256& x3_v, const __m256& tx) -{ - const auto A = _mm256_set1_ps(-0.75f); + //should be optimized? :( + const __m256 coeffs0 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x0), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(4), A)); + const __m256 coeffs1 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x1), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x1), x1), v1fp8); + const __m256 coeffs2 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x2), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x2), x2), v1fp8); + const __m256 coeffs3 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x3), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x3), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x3), _mm256_mul_ps(_mm256_set1_ps(4), A)); - const auto x0 = _mm256_add_ps(tx, v1fp8); - const auto& x1 = tx; - const auto x2 = _mm256_sub_ps(vn1fp8, tx); - const auto x3 = _mm256_add_ps(x2, v1fp8); - //should be optimized? 
:( - const __m256 coeffs0 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x0), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(4), A)); - const __m256 coeffs1 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x1), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x1), x1), v1fp8); - const __m256 coeffs2 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x2), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x2), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x2), _mm256_mul_ps(_mm256_set1_ps(4), A)); - const __m256 coeffs3 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x3), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x3), x3), v1fp8); + auto _v = _mm256_mul_ps(coeffs0, x0_v); + _v = _mm256_comp_fmadd_ps(coeffs1, x1_v, _v); + _v = _mm256_comp_fmadd_ps(coeffs2, x2_v, _v); + _v = _mm256_comp_fmadd_ps(coeffs3, x3_v, _v); - auto _v = _mm256_mul_ps(coeffs0, x0_v); - _v = _mm256_comp_fmadd_ps(coeffs1, x1, _v); - _v = _mm256_comp_fmadd_ps(coeffs2, x2, _v); - _v = _mm256_comp_fmadd_ps(coeffs3, x3, _v); + return _v; + } - return _v; -} #endif // __AVX__ -#endif // __SSE2__ + const __m128 v1fp4 = _mm_set1_ps(1.0f); + const __m128 vn1fp4 = _mm_set1_ps(-1.0f); + const __m128i v1ip4 = _mm_set1_epi32(1); + const __m128i vn1ip4 = _mm_set1_epi32(-1); -int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const -{ - const Mat& bottom_blob = bottom_blobs[0]; - const Mat& grid = bottom_blobs[1]; - Mat& top_blob = top_blobs[0]; - const int elempack = bottom_blob.elempack; + static __m128 NCNN_FORCEINLINE + grid_sample_unormalize_p4(const __m128& w, const __m128& coordx, int align_corner) + { + __m128 two = _mm_set1_ps(2.f); - int w = bottom_blob.w; - int h = bottom_blob.h; - int d = bottom_blob.d; - int channels = bottom_blob.c; - int dims = bottom_blob.dims; - size_t elemsize = bottom_blob.elemsize; + if (align_corner) + return _mm_mul_ps(_mm_div_ps(_mm_add_ps(coordx, v1fp4), two), _mm_sub_ps(w, v1fp4)); + else + return _mm_div_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(coordx, v1fp4), w), v1fp4), two); + } -#if __SSE2__ -#if __AVX__ - if (elempack == 8) + static NCNN_FORCEINLINE __m128 border_coord_p4(const __m128& coord, const __m128& border) + { + return _mm_min_ps(border, _mm_max_ps(coord, _mm_setzero_ps())); + } + + static NCNN_FORCEINLINE __m128 reflect_coord_p4(__m128 x, const __m128& high) + { + /* take the absolute value */ + x = _mm_and_ps(x, *(__m128*)_ps256_inv_sign_mask); + + __m128 reflect_v = _mm_and_ps(_mm_sub_ps(x, high), *(__m128*)_ps256_inv_sign_mask); + x = _mm_sub_ps(high, reflect_v); + return x; + } + + static NCNN_FORCEINLINE __m128 compute_coord_p4(__m128 sx, const __m128& w, int padding_mode, int align_corner) { - const auto vElemsizei = _mm256_set1_epi32(elemsize / 8); - if (dims == 3) + if (padding_mode == 2) // border { - const auto outW = grid.h; - const auto outH = grid.c * grid.elempack; + sx = border_coord_p4(sx, _mm_sub_ps(w, v1fp4)); + } + else if (padding_mode == 3) // reflection + { + if (align_corner) + { + sx = reflect_coord_p4(sx, _mm_sub_ps(w, v1fp4)); + } + else + { + __m128 v0p5f = *(__m128*)_ps256_0p5; + sx = _mm_sub_ps(reflect_coord_p4(_mm_add_ps(sx, v0p5f), w), v0p5f); + sx = border_coord_p4(sx, _mm_sub_ps(w, v1fp4)); + } + } + + return sx; + } + + static NCNN_FORCEINLINE 
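Stripped of the SIMD plumbing, the _p8/_p4 coordinate helpers implement the following scalar logic (illustrative one-float versions, not part of the patch; padding_mode 1/2/3 = zeros/border/reflection as elsewhere in this layer):

#include <algorithm>
#include <cmath>

// Map a normalized grid coordinate in [-1, 1] to pixel space of size `length`.
static float unnormalize(float coord, float length, bool align_corner)
{
    return align_corner ? (coord + 1.f) / 2.f * (length - 1.f)
                        : ((coord + 1.f) * length - 1.f) / 2.f;
}

// Clamp to [0, border].
static float border_coord(float x, float border)
{
    return std::min(border, std::max(x, 0.f));
}

// Reflect into [0, high] using the absolute-value trick from the vector code.
static float reflect_coord(float x, float high)
{
    x = std::fabs(x);
    return high - std::fabs(x - high);
}

static float compute_coord(float sx, float length, int padding_mode, bool align_corner)
{
    if (padding_mode == 2) // border
        return border_coord(sx, length - 1.f);
    if (padding_mode == 3) // reflection
    {
        if (align_corner)
            return reflect_coord(sx, length - 1.f);
        sx = reflect_coord(sx + 0.5f, length) - 0.5f;
        return border_coord(sx, length - 1.f);
    }
    return sx; // zeros: out-of-range taps are masked out at gather time
}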
__m128 get_coord_p4(const __m128& x, const __m128& w, int padding_mode, int align_corner) + { + // compute the origin coordinates + __m128 sx = grid_sample_unormalize_p4(w, x, align_corner); + + // correct the coordinates according to the padding_mode + __m128 coord = compute_coord_p4(sx, w, padding_mode, align_corner); + + return coord; + } + + static NCNN_FORCEINLINE __m128 cubic_interp1d_p4(const __m128& x0_v, const __m128& x1_v, const __m128& x2_v, const __m128& x3_v, const __m128& tx) + { + const auto A = _mm_set1_ps(-0.75f); - const auto vWi = _mm256_set1_epi32(outW); - const auto vHi = _mm256_set1_epi32(outH); + const auto x0 = _mm_add_ps(tx, v1fp4); + const auto& x1 = tx; + const auto x2 = _mm_sub_ps(v1fp4, tx); + const auto x3 = _mm_add_ps(x2, v1fp4); - const auto vHf = _mm256_set1_ps(outH); - const auto vWf = _mm256_set1_ps(outW); + //should be optimized? :( + const __m128 coeffs0 = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(A, x0), _mm_mul_ps(_mm_set1_ps(5.0f), A)), x0), _mm_mul_ps(_mm_set1_ps(8.0f), A)), x0), _mm_mul_ps(_mm_set1_ps(4), A)); + const __m128 coeffs1 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set1_ps(2.0f)), x1), _mm_add_ps(A, _mm_set1_ps(3.0f))), x1), x1), v1fp4); + const __m128 coeffs2 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set1_ps(2.0f)), x2), _mm_add_ps(A, _mm_set1_ps(3.0f))), x2), x2), v1fp4); + const __m128 coeffs3 = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(A, x3), _mm_mul_ps(_mm_set1_ps(5.0f), A)), x3), _mm_mul_ps(_mm_set1_ps(8.0f), A)), x3), _mm_mul_ps(_mm_set1_ps(4), A)); - top_blob.create(outW, outH, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + auto _v = _mm_mul_ps(coeffs0, x0_v); + _v = _mm_comp_fmadd_ps(coeffs1, x1_v, _v); + _v = _mm_comp_fmadd_ps(coeffs2, x2_v, _v); + _v = _mm_comp_fmadd_ps(coeffs3, x3_v, _v); + + return _v; + } + +#endif // __SSE2__ + + int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const + { + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& grid = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + const int elempack = bottom_blob.elempack; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + +#if __SSE2__ +#if __AVX__ + const auto vImgWf = _mm256_set1_ps(w); + const auto vImgHf = _mm256_set1_ps(h); + const auto vImgWi = _mm256_set1_epi32(w); + const auto vImgHi = _mm256_set1_epi32(h); + + if (elempack == 8) + { + const auto vElemsizei = _mm256_set1_epi32(elemsize / 8); const auto vElempacki = _mm256_set1_epi32(elempack); const auto vElempackf = _mm256_set1_ps(elempack); - if (resize_type == 1) //zeros + if (dims == 3) { - if (padding_mode == 1) //zeros + const auto outW = grid.h; + const auto outH = grid.c * grid.elempack; + + top_blob.create(outW, outH, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (resize_type == 1) //zeros { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (padding_mode == 1) //zeros { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int y = 0; y < outH; y++) +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + float* outptr = 
top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - - gx = get_coord(gx, vWf, padding_mode, align_corner); - gy = get_coord(gy, vHf, padding_mode, align_corner); - - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); - - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - - auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vWi, x0)); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); - auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vHi, y0)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); - - auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vWi, vElempacki)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); - - auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); - - auto _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(outptr, _v); - - outptr += 8; + for (int x = 0; x < outW; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); + gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = 
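The offset vectors built here encode the pack-8 memory layout: each pixel of a channel stores elempack consecutive floats, so the gather index for lane k of pixel (x, y) is (y*W + x)*elempack + k, and the east/south bilinear taps are reached by adding elempack and W*elempack respectively. A scalar sketch of that index arithmetic (hypothetical helpers, not part of the patch):

#include <cstddef>

// Float index of lane `lane` of pixel (x, y) in a packed channel.
static inline size_t packed_index(int x, int y, int W, int elempack, int lane)
{
    return (size_t)(y * W + x) * elempack + lane;
}

// The vector code derives the other bilinear taps from the north-west one:
//   east  neighbour (x+1, y): +elempack
//   south neighbour (x, y+1): +W * elempack
static inline size_t east_of(size_t nw, int elempack) { return nw + elempack; }
static inline size_t south_of(size_t nw, int W, int elempack) { return nw + (size_t)W * elempack; }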
_mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + + auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + + auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(outptr, _v); + + outptr += elempack; + } } } } - } - else //border reflection - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + else //border reflection { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int y = 0; y < outH; y++) +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - gx = get_coord(gx, vWf, padding_mode, align_corner); - gy = get_coord(gy, vHf, padding_mode, align_corner); + gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); + gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = 
_mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vHi, y1)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vWi, vElempacki)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); - auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_nw_offset, vn1fp8, sizeof(float)); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_nw_offset, vn1fp8, sizeof(float)); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); - auto _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); - _mm256_storeu_ps(outptr, _v); + _mm256_storeu_ps(outptr, _v); - outptr += 8; + outptr += elempack; + } } } } + } - } - if (resize_type == 2) - { - if (padding_mode == 1) //zeros + if (resize_type == 2) { - for (int q = 
0; q < channels; q++) + if (padding_mode == 1) //zeros { - float* outptr = top_blob.channel(q); - for (int y = 0; y < outH; y++) +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - gx = get_coord(gx, vWf, padding_mode, align_corner); - gy = get_coord(gy, vHf, padding_mode, align_corner); + gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); + gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); - gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); - auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vHi, iy))); + auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), - i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), + i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); - _mm256_storeu_ps(outptr, _v); + _mm256_storeu_ps(outptr, _v); - outptr += 8; + outptr += elempack; + } } } } - } - else //border reflection - { - for (int q = 0; q < channels; q++) + else //border reflection { - float* outptr = top_blob.channel(q); - for (int y = 0; y < outH; y++) +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - gx = get_coord(gx, vWf, padding_mode, align_corner); - gy = get_coord(gy, vHf, padding_mode, align_corner); + gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); + gy = get_coord_p8(gy, vImgHf, 
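Equivalently, one nearest-neighbour sample with zeros padding reduces to the scalar sketch below (illustrative names; std::nearbyint is used so the rounding matches the round-to-nearest-even behaviour of the vector path under the default FP environment):

#include <cmath>

// Scalar sketch of one nearest sample with zeros padding; (gx, gy) are
// already unnormalized pixel coordinates, `image` is H x W row-major.
static float nearest_zeros(const float* image, int W, int H, float gx, float gy)
{
    const int ix = (int)std::nearbyint(gx);
    const int iy = (int)std::nearbyint(gy);
    return (ix >= 0 && ix < W && iy >= 0 && iy < H) ? image[iy * W + ix] : 0.f;
}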
padding_mode, align_corner); - gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), - i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), + i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); - _mm256_storeu_ps(outptr, _v); + _mm256_storeu_ps(outptr, _v); - outptr += 8; + outptr += elempack; + } } } } } - } - if (resize_type == 3) - { - if (padding_mode == 1) + if (resize_type == 3) { - for (int q = 0; q < channels; q++) + if (padding_mode == 1) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int y = 0; y < outH; y++) +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + gx = grid_sample_unormalize_p8(vImgWf, gx, align_corner); + gy = grid_sample_unormalize_p8(vImgHf, gy, align_corner); + + auto gx_floor = _mm256_floor_ps(gx); + auto gy_floor = _mm256_floor_ps(gy); + + const auto tx = _mm256_sub_ps(gx, gx_floor); + const auto ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + for (int i = 0; i < 4; i++) + { + auto gx0 = compute_coord_p8(_mm256_add_ps(gx_floor, vn1fp8), vImgWf, padding_mode, align_corner); + auto gx1 = compute_coord_p8(gx_floor, vImgWf, padding_mode, align_corner); + auto gx2 = compute_coord_p8(_mm256_add_ps(gx_floor, v1fp8), vImgWf, padding_mode, align_corner); + auto gx3 = compute_coord_p8(_mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)), vImgWf, padding_mode, align_corner); + + gy = compute_coord_p8(_mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)), vImgHf, padding_mode, align_corner); + + auto x0 = _mm256_cvtps_epi32(gx0); + auto x1 = _mm256_cvtps_epi32(gx1); + auto x2 = _mm256_cvtps_epi32(gx2); + auto x3 = _mm256_cvtps_epi32(gx3); + + auto y = _mm256_cvtps_epi32(gy); - gx = grid_sample_unormalize(vWf, gx, align_corner); - gy = grid_sample_unormalize(vHf, gy, align_corner); + auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto x2_in_range = 
_mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x2)); + auto x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); - auto gx_floor = _mm256_floor_ps(gx); - auto gy_floor = _mm256_floor_ps(gy); + auto y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y)); - const auto tx = _mm256_sub_ps(gx, gx_floor); - const auto ty = _mm256_sub_ps(gy, gy_floor); + auto v0_in_range = _mm256_and_si256(x0_in_range, y_in_range); + auto v1_in_range = _mm256_and_si256(x1_in_range, y_in_range); + auto v2_in_range = _mm256_and_si256(x2_in_range, y_in_range); + auto v3_in_range = _mm256_and_si256(x3_in_range, y_in_range); - __m256 coefficients[4]; + auto x0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - for (int i = 0; i < 4; i++) + auto x0_offset = _mm256_cvtps_epi32(x0_offset_f); + auto x1_offset = _mm256_cvtps_epi32(x1_offset_f); + auto x2_offset = _mm256_cvtps_epi32(x2_offset_f); + auto x3_offset = _mm256_cvtps_epi32(x3_offset_f); + + auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x0_offset, *reinterpret_cast<__m256*>(&v0_in_range), sizeof(float)); + auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x1_offset, *reinterpret_cast<__m256*>(&v1_in_range), sizeof(float)); + auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x2_offset, *reinterpret_cast<__m256*>(&v2_in_range), sizeof(float)); + auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x3_offset, *reinterpret_cast<__m256*>(&v3_in_range), sizeof(float)); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(outptr, _v); + + outptr += elempack; + } + } + } + } + else + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) { - auto gx0 = compute_coord(_mm256_add_ps(gy_floor, vn1fp8), vWf, padding_mode, align_corner); - auto gx1 = compute_coord(gy_floor, vWf, padding_mode, align_corner); - auto gx2 = compute_coord(_mm256_add_ps(gy_floor, v1fp8), vWf, padding_mode, align_corner); - auto gx3 = compute_coord(_mm256_add_ps(gy_floor, _mm256_set1_ps(2.0f)), vWf, padding_mode, align_corner); - - gy = compute_coord(_mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)), vHf, padding_mode, align_corner); - - auto x0 = _mm256_cvtps_epi32(gx0); - auto x1 = _mm256_cvtps_epi32(gx1); - auto x2 = _mm256_cvtps_epi32(gx2); - auto x3 = _mm256_cvtps_epi32(gx3); - - auto y = _mm256_cvtps_epi32(gy); - - auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vWi, x0)); - auto x1_in_range = 
_mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vWi, x1)); - auto x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vWi, x2)); - auto x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vWi, x3)); - - auto y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vHi, y)); - - auto v0_in_range = _mm256_and_si256(x0_in_range, y_in_range); - auto v1_in_range = _mm256_and_si256(x1_in_range, y_in_range); - auto v2_in_range = _mm256_and_si256(x2_in_range, y_in_range); - auto v3_in_range = _mm256_and_si256(x3_in_range, y_in_range); - - auto x0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - auto x0_offset = _mm256_cvtps_epi32(x0_offset_f); - auto x1_offset = _mm256_cvtps_epi32(x1_offset_f); - auto x2_offset = _mm256_cvtps_epi32(x2_offset_f); - auto x3_offset = _mm256_cvtps_epi32(x3_offset_f); - - auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x0_offset, *reinterpret_cast<__m256*>(&v0_in_range), sizeof(float)); - auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x1_offset, *reinterpret_cast<__m256*>(&v1_in_range), sizeof(float)); - auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x2_offset, *reinterpret_cast<__m256*>(&v2_in_range), sizeof(float)); - auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x3_offset, *reinterpret_cast<__m256*>(&v3_in_range), sizeof(float)); - - coefficients[i] = cubic_interp1d_p0(x0_val, x1_val, x2_val, x3_val, tx); + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + gx = grid_sample_unormalize_p8(vImgWf, gx, align_corner); + gy = grid_sample_unormalize_p8(vImgHf, gy, align_corner); + + auto gx_floor = _mm256_floor_ps(gx); + auto gy_floor = _mm256_floor_ps(gy); + + const auto tx = _mm256_sub_ps(gx, gx_floor); + const auto ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + for (int i = 0; i < 4; i++) + { + auto gx0 = compute_coord_p8(_mm256_add_ps(gx_floor, vn1fp8), vImgWf, padding_mode, align_corner); + auto gx1 = compute_coord_p8(gx_floor, vImgWf, padding_mode, align_corner); + auto gx2 = compute_coord_p8(_mm256_add_ps(gx_floor, v1fp8), vImgWf, padding_mode, align_corner); + auto gx3 = compute_coord_p8(_mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)), vImgWf, padding_mode, align_corner); + + gy = compute_coord_p8(_mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)), vImgHf, padding_mode, align_corner); + + auto x0 = _mm256_cvtps_epi32(gx0); + auto x1 = _mm256_cvtps_epi32(gx1); + auto x2 = _mm256_cvtps_epi32(gx2); + auto x3 = _mm256_cvtps_epi32(gx3); + + auto y = _mm256_cvtps_epi32(gy); + + auto x0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 
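The chains of _mm256_mul_ps/_mm256_sub_ps in this bicubic branch implement the standard cubic convolution kernel with A = -0.75; written out per lane, cubic_interp1d_p8 computes the following (a scalar transcription for reference):

// Scalar form of cubic_interp1d_p8: Keys cubic convolution with A = -0.75.
// x0_v..x3_v are the four neighbouring samples, tx the fractional offset.
static float cubic_interp1d(float x0_v, float x1_v, float x2_v, float x3_v, float tx)
{
    const float A = -0.75f;

    const float x0 = tx + 1.f; // distance to the leftmost tap
    const float x1 = tx;
    const float x2 = 1.f - tx;
    const float x3 = x2 + 1.f; // distance to the rightmost tap

    const float coeffs0 = ((A * x0 - 5.f * A) * x0 + 8.f * A) * x0 - 4.f * A;
    const float coeffs1 = ((A + 2.f) * x1 - (A + 3.f)) * x1 * x1 + 1.f;
    const float coeffs2 = ((A + 2.f) * x2 - (A + 3.f)) * x2 * x2 + 1.f;
    const float coeffs3 = ((A * x3 - 5.f * A) * x3 + 8.f * A) * x3 - 4.f * A;

    return coeffs0 * x0_v + coeffs1 * x1_v + coeffs2 * x2_v + coeffs3 * x3_v;
}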
1.0f, 0.0f)); + auto x1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + auto x0_offset = _mm256_cvtps_epi32(x0_offset_f); + auto x1_offset = _mm256_cvtps_epi32(x1_offset_f); + auto x2_offset = _mm256_cvtps_epi32(x2_offset_f); + auto x3_offset = _mm256_cvtps_epi32(x3_offset_f); + + auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x0_offset, vn1fp8, sizeof(float)); + auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x1_offset, vn1fp8, sizeof(float)); + auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x2_offset, vn1fp8, sizeof(float)); + auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x3_offset, vn1fp8, sizeof(float)); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(outptr, _v); + + outptr += elempack; } + } + } + } + } + } + + if (dims == 4) + { + const int outW = grid.h; + const int outH = grid.d; + const int outD = grid.c * grid.elempack; - auto _v = cubic_interp1d_p0(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + top_blob.create(outW, outH, outD, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - _mm256_storeu_ps(outptr, _v); + const auto vImgDf = _mm256_set1_ps(d); + const auto vImgDi = _mm256_set1_epi32(d); - outptr += 8; + if (resize_type == 1) + { + if (padding_mode == 1) + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int z = 0; z < outD; z++) + { + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + + gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); + gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); + gz = get_coord_p8(gz, vImgDf, padding_mode, align_corner); + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + auto z_t = _mm256_floor_ps(gz); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + auto t = _mm256_sub_ps(gz, z_t); + auto b = _mm256_sub_ps(v1fp8, t); + + __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + tnw = _mm256_mul_ps(b, nw); + tne = _mm256_mul_ps(b, ne); + tsw = _mm256_mul_ps(b, sw); + tse = _mm256_mul_ps(b, se); + + bnw = _mm256_mul_ps(t, nw); + bne = _mm256_mul_ps(t, ne); + bsw = _mm256_mul_ps(t, sw); + bse = _mm256_mul_ps(t, se); + } + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = 
_mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + auto z0 = _mm256_cvtps_epi32(z_t); + auto z1 = _mm256_add_epi32(z0, v1ip8); + + auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + auto z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z0)); + auto z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); + + __m256i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + v000_in_range = _mm256_and_si256(v00_in_range, z0_in_range); + v010_in_range = _mm256_and_si256(v01_in_range, z0_in_range); + v100_in_range = _mm256_and_si256(v10_in_range, z0_in_range); + v110_in_range = _mm256_and_si256(v11_in_range, z0_in_range); + + v001_in_range = _mm256_and_si256(v00_in_range, z1_in_range); + v011_in_range = _mm256_and_si256(v01_in_range, z1_in_range); + v101_in_range = _mm256_and_si256(v10_in_range, z1_in_range); + v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); + } + + // (W*H*z + W*y + x) * elempack + vec(8) + auto i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0) + , _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); + auto i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); + + auto i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + auto i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); + auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + + auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tnw_offset, *reinterpret_cast<__m256*>(&v000_in_range), sizeof(float)); + auto tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tne_offset, *reinterpret_cast<__m256*>(&v100_in_range), sizeof(float)); + auto tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tsw_offset, *reinterpret_cast<__m256*>(&v010_in_range), sizeof(float)); + auto tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); + + auto bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bnw_offset, *reinterpret_cast<__m256*>(&v001_in_range), sizeof(float)); + auto bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); + auto bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); + auto 
bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(tnw_val, tnw); + _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); + + _mm256_storeu_ps(outptr, _v); + + outptr += elempack; + } + } + } + } + } + else + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int z = 0; z < outD; z++) + { + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + + gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); + gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); + gz = get_coord_p8(gz, vImgDf, padding_mode, align_corner); + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + auto z_t = _mm256_floor_ps(gz); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + auto t = _mm256_sub_ps(gz, z_t); + auto b = _mm256_sub_ps(v1fp8, t); + + __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + tnw = _mm256_mul_ps(b, nw); + tne = _mm256_mul_ps(b, ne); + tsw = _mm256_mul_ps(b, sw); + tse = _mm256_mul_ps(b, se); + + bnw = _mm256_mul_ps(t, nw); + bne = _mm256_mul_ps(t, ne); + bsw = _mm256_mul_ps(t, sw); + bse = _mm256_mul_ps(t, se); + } + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + auto z0 = _mm256_cvtps_epi32(z_t); + auto z1 = _mm256_add_epi32(z0, v1ip8); + + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + auto z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); + + __m256i v110_in_range, v011_in_range, v101_in_range, v111_in_range; + { + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + v110_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + v011_in_range = _mm256_and_si256(y1_in_range, z1_in_range); + v101_in_range = _mm256_and_si256(x1_in_range, z1_in_range); + v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); + } + + // (W*H*z + W*y + x) * elempack + vec(8) + auto i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0) + , _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); + auto i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_tse_offset = _mm256_add_epi32(i_tsw_offset, 
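The tnw/tne/.../bse products above are the eight trilinear corner weights, and the commented offset formula (W*H*z + W*y + x) * elempack locates each corner inside the packed blob. A scalar sketch of the same blend, ignoring the per-lane packing, could look like this (names are illustrative):

#include <math.h> // floorf

// Out-of-range taps contribute zero, like the masked gathers above.
static float at3d(const float* v, int w, int h, int d, int x, int y, int z)
{
    if (x < 0 || x >= w || y < 0 || y >= h || z < 0 || z >= d)
        return 0.f;
    return v[w * h * z + w * y + x];
}

// Illustrative scalar trilinear blend over a plain W x H x D float volume.
static float trilinear_sample(const float* volume, int w, int h, int d,
                              float gx, float gy, float gz)
{
    int x0 = (int)floorf(gx);
    int y0 = (int)floorf(gy);
    int z0 = (int)floorf(gz);

    float fx = gx - x0; // fraction along x ("w" in the SIMD code)
    float fy = gy - y0; // fraction along y ("n")
    float fz = gz - z0; // fraction along z ("t")

    float c00 = at3d(volume, w, h, d, x0, y0, z0) * (1 - fx) + at3d(volume, w, h, d, x0 + 1, y0, z0) * fx;
    float c10 = at3d(volume, w, h, d, x0, y0 + 1, z0) * (1 - fx) + at3d(volume, w, h, d, x0 + 1, y0 + 1, z0) * fx;
    float c01 = at3d(volume, w, h, d, x0, y0, z0 + 1) * (1 - fx) + at3d(volume, w, h, d, x0 + 1, y0, z0 + 1) * fx;
    float c11 = at3d(volume, w, h, d, x0, y0 + 1, z0 + 1) * (1 - fx) + at3d(volume, w, h, d, x0 + 1, y0 + 1, z0 + 1) * fx;

    float c0 = c00 * (1 - fy) + c10 * fy;
    float c1 = c01 * (1 - fy) + c11 * fy;

    return c0 * (1 - fz) + c1 * fz;
}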
vElempacki); + + auto i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + auto i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); + auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + + auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tnw_offset, vn1fp8, sizeof(float)); + auto tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + auto tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + auto tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); + + auto bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range), sizeof(float)); + auto bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); + auto bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); + auto bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(tnw_val, tnw); + _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); + + _mm256_storeu_ps(outptr, _v); + + outptr += elempack; + } + } } } } } - else + + if (resize_type == 2) { - for (int q = 0; q < channels; q++) + if (padding_mode == 1) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int y = 0; y < outH; y++) +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int z = 0; z < outD; z++) { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - gx = grid_sample_unormalize(vWf, gx, align_corner); - gy = grid_sample_unormalize(vHf, gy, align_corner); + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); - auto gx_floor = _mm256_floor_ps(gx); - auto gy_floor = _mm256_floor_ps(gy); + gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); + gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); + gz = get_coord_p8(gz, vImgDf, padding_mode, align_corner); - const auto tx = _mm256_sub_ps(gx, gx_floor); - const auto ty = _mm256_sub_ps(gy, gy_floor); + gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gz = _mm256_round_ps(gz, 
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - __m256 coefficients[4]; + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); + auto iz = _mm256_cvtps_epi32(gz); - for (int i = 0; i < 4; i++) - { - auto gx0 = compute_coord(_mm256_add_ps(gy_floor, vn1fp8), vWf, padding_mode, align_corner); - auto gx1 = compute_coord(gy_floor, vWf, padding_mode, align_corner); - auto gx2 = compute_coord(_mm256_add_ps(gy_floor, v1fp8), vWf, padding_mode, align_corner); - auto gx3 = compute_coord(_mm256_add_ps(gy_floor, _mm256_set1_ps(2.0f)), vWf, padding_mode, align_corner); - - gy = compute_coord(_mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)), vHf, padding_mode, align_corner); - - auto x0 = _mm256_cvtps_epi32(gx0); - auto x1 = _mm256_cvtps_epi32(gx1); - auto x2 = _mm256_cvtps_epi32(gx2); - auto x3 = _mm256_cvtps_epi32(gx3); - - auto y = _mm256_cvtps_epi32(gy); - - auto x0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - auto x0_offset = _mm256_cvtps_epi32(x0_offset_f); - auto x1_offset = _mm256_cvtps_epi32(x1_offset_f); - auto x2_offset = _mm256_cvtps_epi32(x2_offset_f); - auto x3_offset = _mm256_cvtps_epi32(x3_offset_f); - - auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x0_offset, vn1fp8, sizeof(float)); - auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x1_offset, vn1fp8, sizeof(float)); - auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x2_offset, vn1fp8, sizeof(float)); - auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x3_offset, vn1fp8, sizeof(float)); - - coefficients[i] = cubic_interp1d_p0(x0_val, x1_val, x2_val, x3_val, tx); + auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, vn1ip8), _mm256_cmpgt_epi32(vImgDi, iz))); + + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz) + , _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), + i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + + _mm256_storeu_ps(outptr, _v); + + outptr += elempack; + } } + } + } + } + else + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int z = 0; z < outD; z++) + { + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + + auto gx = _mm256_set1_ps(gridptr[0]); + auto 
gy = _mm256_set1_ps(gridptr[grid.elempack]); + auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + + gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); + gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); + gz = get_coord_p8(gz, vImgDf, padding_mode, align_corner); + + gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gz = _mm256_round_ps(gz, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); + auto iz = _mm256_cvtps_epi32(gz); + + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz) + , _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto _v = cubic_interp1d_p0(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), + i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); - _mm256_storeu_ps(outptr, _v); + _mm256_storeu_ps(outptr, _v); - outptr += 8; + outptr += elempack; + } + } } } } } - } - } - if (dims == 4) - { + if (resize_type == 3) + { + NCNN_LOGE("unsupported bicubic when dims == 4"); + return -1; + } + } } - } #endif // __AVX__ - if (elempack == 4) - { - if (dims == 3) + const auto vImgWfp4 = _mm_set1_ps(w); + const auto vImgHfp4 = _mm_set1_ps(h); + const auto vImgWip4 = _mm_set1_epi32(w); + const auto vImgHip4 = _mm_set1_epi32(h); + + if (elempack == 4) { + const auto vElemsizei = _mm_set1_epi32(elemsize / 8); + const auto vElempacki = _mm_set1_epi32(elempack); + const auto vElempackf = _mm_set1_ps(elempack); + if (dims == 3) { - const int outW = grid.h; - const int outH = grid.c; + const auto outW = grid.h; + const auto outH = grid.c * grid.elempack; - top_blob.create(outW, outH, channels, elemsize, opt.blob_allocator); + top_blob.create(outW, outH, channels, elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - if (resize_type == 1) + if (resize_type == 1) //zeros { - for (int q = 0; q < channels; q++) + if (padding_mode == 1) //zeros + { +#pragma parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); + gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); + + auto x_w = _mm_floor_ps(gx); + auto y_n = _mm_floor_ps(gy); + + auto w = _mm_sub_ps(gx, x_w); + auto e = _mm_sub_ps(v1fp4, w); + auto n = _mm_sub_ps(gy, y_n); + auto s = _mm_sub_ps(v1fp4, n); + + auto nw = _mm_mul_ps(s, e); + auto ne = _mm_mul_ps(s, w); + auto sw = _mm_mul_ps(n, e); + auto se = _mm_mul_ps(n, w); + + auto x0 = _mm_cvtps_epi32(x_w); + auto x1 = _mm_add_epi32(x0, v1ip4); + auto y0 = _mm_cvtps_epi32(y_n); + auto y1 = _mm_add_epi32(y0, v1ip4); + + auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x0)); + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x1)); + auto y0_in_range = 
_mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y0)); + auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y1)); + + auto v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); + auto v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); + auto v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); + auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWip4), x0), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); + auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); + + auto nw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_nw_offset, *reinterpret_cast<__m128*>(&v00_in_range), sizeof(float)); + auto ne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m128*>(&v10_in_range), sizeof(float)); + auto sw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m128*>(&v01_in_range), sizeof(float)); + auto se_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range), sizeof(float)); + + auto _v = _mm_mul_ps(nw_val, nw); + _v = _mm_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm_comp_fmadd_ps(se_val, se, _v); + + _mm_storeu_ps(outptr, _v); + + outptr += elempack; + } + } + } + } + else //border reflection { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) { - outptr += 8; + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); + gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); + + auto x_w = _mm_floor_ps(gx); + auto y_n = _mm_floor_ps(gy); + + auto w = _mm_sub_ps(gx, x_w); + auto e = _mm_sub_ps(v1fp4, w); + auto n = _mm_sub_ps(gy, y_n); + auto s = _mm_sub_ps(v1fp4, n); + + auto nw = _mm_mul_ps(s, e); + auto ne = _mm_mul_ps(s, w); + auto sw = _mm_mul_ps(n, e); + auto se = _mm_mul_ps(n, w); + + auto x0 = _mm_cvtps_epi32(x_w); + auto x1 = _mm_add_epi32(x0, v1ip4); + auto y0 = _mm_cvtps_epi32(y_n); + auto y1 = _mm_add_epi32(y0, v1ip4); + + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x1)); + auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y1)); + + auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWip4), x0), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); + auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); + + auto nw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_nw_offset, vn1fp4, sizeof(float)); + auto ne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, 
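In this pack4 branch the offsets add _mm_set_epi32(3, 2, 1, 0) so that the four interleaved floats of one packed pixel are gathered together; per lane, the nw/ne/sw/se weights form the usual four-tap bilinear blend. A scalar sketch of a single lane, under the assumption of a simple (y * w + x) * elempack + lane layout and zeros padding:

#include <math.h> // floorf

// Illustrative single-lane bilinear sample from an elempack-interleaved blob.
static float bilinear_sample_pack(const float* data, int w, int h, int elempack, int lane,
                                  float gx, float gy)
{
    int x0 = (int)floorf(gx);
    int y0 = (int)floorf(gy);
    float fx = gx - x0; // "w" in the SIMD code
    float fy = gy - y0; // "n" in the SIMD code

    // zero padding for out-of-range taps, matching the masked gathers
    auto tap = [&](int x, int y) -> float {
        if (x < 0 || x >= w || y < 0 || y >= h)
            return 0.f;
        return data[(y * w + x) * elempack + lane];
    };

    float nw_val = tap(x0, y0);
    float ne_val = tap(x0 + 1, y0);
    float sw_val = tap(x0, y0 + 1);
    float se_val = tap(x0 + 1, y0 + 1);

    return nw_val * (1 - fx) * (1 - fy) + ne_val * fx * (1 - fy)
           + sw_val * (1 - fx) * fy + se_val * fx * fy;
}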
i_ne_offset, *reinterpret_cast<__m128*>(&x1_in_range), sizeof(float)); + auto sw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range), sizeof(float)); + auto se_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range), sizeof(float)); + + auto _v = _mm_mul_ps(nw_val, nw); + _v = _mm_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm_comp_fmadd_ps(se_val, se, _v); + + _mm_storeu_ps(outptr, _v); + + outptr += elempack; + } } } } + } if (resize_type == 2) { - for (int q = 0; q < channels; q++) + if (padding_mode == 1) //zeros + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); + gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); + + gx = _mm_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gy = _mm_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + auto ix = _mm_cvtps_epi32(gx); + auto iy = _mm_cvtps_epi32(gy); + + auto v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWip4, ix)), + _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHip4, iy))); + + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWip4), ix), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + + auto _v = _mm_mask_i32gather_ps(_mm_setzero_ps(), static_cast(bottom_blob.channel(q).data), + i_offset, *reinterpret_cast<__m128*>(&v_in_range), sizeof(float)); + + _mm_storeu_ps(outptr, _v); + + outptr += elempack; + } + } + } + } + else //border reflection { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) { - outptr++; + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); + + gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); + gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); + + gx = _mm_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gy = _mm_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + auto ix = _mm_cvtps_epi32(gx); + auto iy = _mm_cvtps_epi32(gy); + + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWip4), ix), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + + auto _v = _mm_mask_i32gather_ps(_mm_setzero_ps(), static_cast(bottom_blob.channel(q).data), + i_offset, _mm_set1_ps(-1.0f), sizeof(float)); + + _mm_storeu_ps(outptr, _v); + + outptr += elempack; + } } } } @@ -592,34 +1240,536 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = 
_mm_set1_ps(gridptr[grid.elempack]); + + gx = grid_sample_unormalize_p4(vImgWfp4, gx, align_corner); + gy = grid_sample_unormalize_p4(vImgHfp4, gy, align_corner); + + auto gx_floor = _mm_floor_ps(gx); + auto gy_floor = _mm_floor_ps(gy); + + const auto tx = _mm_sub_ps(gx, gx_floor); + const auto ty = _mm_sub_ps(gy, gy_floor); + + __m128 coefficients[4]; + + for (int i = 0; i < 4; i++) + { + auto gx0 = compute_coord_p4(_mm_add_ps(gx_floor, vn1fp4), vImgWfp4, padding_mode, align_corner); + auto gx1 = compute_coord_p4(gx_floor, vImgWfp4, padding_mode, align_corner); + auto gx2 = compute_coord_p4(_mm_add_ps(gx_floor, v1fp4), vImgWfp4, padding_mode, align_corner); + auto gx3 = compute_coord_p4(_mm_add_ps(gx_floor, _mm_set1_ps(2.0f)), vImgWfp4, padding_mode, align_corner); + + gy = compute_coord_p4(_mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)), vImgHfp4, padding_mode, align_corner); + + auto x0 = _mm_cvtps_epi32(gx0); + auto x1 = _mm_cvtps_epi32(gx1); + auto x2 = _mm_cvtps_epi32(gx2); + auto x3 = _mm_cvtps_epi32(gx3); + + auto y = _mm_cvtps_epi32(gy); + + auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x0)); + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x1)); + auto x2_in_range = _mm_and_si128(_mm_cmpgt_epi32(x2, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x2)); + auto x3_in_range = _mm_and_si128(_mm_cmpgt_epi32(x3, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x3)); + + auto y_in_range = _mm_and_si128(_mm_cmpgt_epi32(y, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y)); + + auto v0_in_range = _mm_and_si128(x0_in_range, y_in_range); + auto v1_in_range = _mm_and_si128(x1_in_range, y_in_range); + auto v2_in_range = _mm_and_si128(x2_in_range, y_in_range); + auto v3_in_range = _mm_and_si128(x3_in_range, y_in_range); + + auto x0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx0), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto x1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx1), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto x2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx2), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto x3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx3), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + + auto x0_offset = _mm_cvtps_epi32(x0_offset_f); + auto x1_offset = _mm_cvtps_epi32(x1_offset_f); + auto x2_offset = _mm_cvtps_epi32(x2_offset_f); + auto x3_offset = _mm_cvtps_epi32(x3_offset_f); + + auto x0_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x0_offset, *reinterpret_cast<__m128*>(&v0_in_range), sizeof(float)); + auto x1_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x1_offset, *reinterpret_cast<__m128*>(&v1_in_range), sizeof(float)); + auto x2_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x2_offset, *reinterpret_cast<__m128*>(&v2_in_range), sizeof(float)); + auto x3_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x3_offset, *reinterpret_cast<__m128*>(&v3_in_range), sizeof(float)); + + coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); + } + + auto _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm_storeu_ps(outptr, _v); + + outptr += elempack; + } + } + } + } + else + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; 
y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + gx = grid_sample_unormalize_p4(vImgWfp4, gx, align_corner); + gy = grid_sample_unormalize_p4(vImgHfp4, gy, align_corner); + + auto gx_floor = _mm_floor_ps(gx); + auto gy_floor = _mm_floor_ps(gy); + + const auto tx = _mm_sub_ps(gx, gx_floor); + const auto ty = _mm_sub_ps(gy, gy_floor); + + __m128 coefficients[4]; + + for (int i = 0; i < 4; i++) + { + auto gx0 = compute_coord_p4(_mm_add_ps(gy_floor, vn1fp4), vImgWfp4, padding_mode, align_corner); + auto gx1 = compute_coord_p4(gy_floor, vImgWfp4, padding_mode, align_corner); + auto gx2 = compute_coord_p4(_mm_add_ps(gy_floor, v1fp4), vImgWfp4, padding_mode, align_corner); + auto gx3 = compute_coord_p4(_mm_add_ps(gy_floor, _mm_set1_ps(2.0f)), vImgWfp4, padding_mode, align_corner); + + gy = compute_coord_p4(_mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)), vImgHfp4, padding_mode, align_corner); + + auto x0 = _mm_cvtps_epi32(gx0); + auto x1 = _mm_cvtps_epi32(gx1); + auto x2 = _mm_cvtps_epi32(gx2); + auto x3 = _mm_cvtps_epi32(gx3); + + auto y = _mm_cvtps_epi32(gy); + + auto x0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx0), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto x1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx1), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto x2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx2), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto x3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx3), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + + auto x0_offset = _mm_cvtps_epi32(x0_offset_f); + auto x1_offset = _mm_cvtps_epi32(x1_offset_f); + auto x2_offset = _mm_cvtps_epi32(x2_offset_f); + auto x3_offset = _mm_cvtps_epi32(x3_offset_f); + + auto x0_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x0_offset, vn1fp4, sizeof(float)); + auto x1_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x1_offset, vn1fp4, sizeof(float)); + auto x2_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x2_offset, vn1fp4, sizeof(float)); + auto x3_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x3_offset, vn1fp4, sizeof(float)); + + coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); + } + + auto _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm_storeu_ps(outptr, _v); + + outptr += elempack; + } + } + } + } + } + } + + if (dims == 4) + { + const int outW = grid.h; + const int outH = grid.d; + const int outD = grid.c * grid.elempack; + + top_blob.create(outW, outH, outD, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const auto vImgDfp4 = _mm_set1_ps(d); + const auto vImgDip4 = _mm_set1_epi32(d); + + if (resize_type == 1) + { + if (padding_mode == 1) + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int z = 0; z < outD; z++) + { + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + auto gz = 
_mm_set1_ps(gridptr[grid.elempack * 2]); + + gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); + gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); + gz = get_coord_p4(gz, vImgDfp4, padding_mode, align_corner); + + auto x_w = _mm_floor_ps(gx); + auto y_n = _mm_floor_ps(gy); + auto z_t = _mm_floor_ps(gz); + + auto w = _mm_sub_ps(gx, x_w); + auto e = _mm_sub_ps(v1fp4, w); + auto n = _mm_sub_ps(gy, y_n); + auto s = _mm_sub_ps(v1fp4, n); + auto t = _mm_sub_ps(gz, z_t); + auto b = _mm_sub_ps(v1fp4, t); + + __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + auto nw = _mm_mul_ps(s, e); + auto ne = _mm_mul_ps(s, w); + auto sw = _mm_mul_ps(n, e); + auto se = _mm_mul_ps(n, w); + + tnw = _mm_mul_ps(b, nw); + tne = _mm_mul_ps(b, ne); + tsw = _mm_mul_ps(b, sw); + tse = _mm_mul_ps(b, se); + + bnw = _mm_mul_ps(t, nw); + bne = _mm_mul_ps(t, ne); + bsw = _mm_mul_ps(t, sw); + bse = _mm_mul_ps(t, se); + } + + auto x0 = _mm_cvtps_epi32(x_w); + auto x1 = _mm_add_epi32(x0, v1ip4); + auto y0 = _mm_cvtps_epi32(y_n); + auto y1 = _mm_add_epi32(y0, v1ip4); + auto z0 = _mm_cvtps_epi32(z_t); + auto z1 = _mm_add_epi32(z0, v1ip4); + + auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x0)); + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x1)); + auto y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y0)); + auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y1)); + auto z0_in_range = _mm_and_si128(_mm_cmpgt_epi32(z0, vn1ip4), _mm_cmpgt_epi32(vImgDip4, z0)); + auto z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDip4, z1)); + + __m128i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + auto v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); + auto v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); + auto v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); + auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + v000_in_range = _mm_and_si128(v00_in_range, z0_in_range); + v010_in_range = _mm_and_si128(v01_in_range, z0_in_range); + v100_in_range = _mm_and_si128(v10_in_range, z0_in_range); + v110_in_range = _mm_and_si128(v11_in_range, z0_in_range); + + v001_in_range = _mm_and_si128(v00_in_range, z1_in_range); + v011_in_range = _mm_and_si128(v01_in_range, z1_in_range); + v101_in_range = _mm_and_si128(v10_in_range, z1_in_range); + v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); + } + + // (W*H*z + W*y + x) * elempack + vec(8) + auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), z0) + , _mm_add_epi32(_mm_mullo_epi32(y0, vImgWip4), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); + auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); + auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); + + auto i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), vElempacki), i_tnw_offset); + auto i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); + auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); + auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); + + auto tnw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tnw_offset, *reinterpret_cast<__m128*>(&v000_in_range), sizeof(float)); + auto 
tne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tne_offset, *reinterpret_cast<__m128*>(&v100_in_range), sizeof(float)); + auto tsw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tsw_offset, *reinterpret_cast<__m128*>(&v010_in_range), sizeof(float)); + auto tse_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range), sizeof(float)); + + auto bnw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bnw_offset, *reinterpret_cast<__m128*>(&v001_in_range), sizeof(float)); + auto bne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range), sizeof(float)); + auto bsw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range), sizeof(float)); + auto bse_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range), sizeof(float)); + + auto _v = _mm_mul_ps(tnw_val, tnw); + _v = _mm_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm_comp_fmadd_ps(bse_val, bse, _v); + + _mm_storeu_ps(outptr, _v); + + outptr += elempack; + } + } + } + } + } + else + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int z = 0; z < outD; z++) + { + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + + gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); + gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); + gz = get_coord_p4(gz, vImgDfp4, padding_mode, align_corner); + + auto x_w = _mm_floor_ps(gx); + auto y_n = _mm_floor_ps(gy); + auto z_t = _mm_floor_ps(gz); + + auto w = _mm_sub_ps(gx, x_w); + auto e = _mm_sub_ps(v1fp4, w); + auto n = _mm_sub_ps(gy, y_n); + auto s = _mm_sub_ps(v1fp4, n); + auto t = _mm_sub_ps(gz, z_t); + auto b = _mm_sub_ps(v1fp4, t); + + __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + auto nw = _mm_mul_ps(s, e); + auto ne = _mm_mul_ps(s, w); + auto sw = _mm_mul_ps(n, e); + auto se = _mm_mul_ps(n, w); + + tnw = _mm_mul_ps(b, nw); + tne = _mm_mul_ps(b, ne); + tsw = _mm_mul_ps(b, sw); + tse = _mm_mul_ps(b, se); + + bnw = _mm_mul_ps(t, nw); + bne = _mm_mul_ps(t, ne); + bsw = _mm_mul_ps(t, sw); + bse = _mm_mul_ps(t, se); + } + + auto x0 = _mm_cvtps_epi32(x_w); + auto x1 = _mm_add_epi32(x0, v1ip4); + auto y0 = _mm_cvtps_epi32(y_n); + auto y1 = _mm_add_epi32(y0, v1ip4); + auto z0 = _mm_cvtps_epi32(z_t); + auto z1 = _mm_add_epi32(z0, v1ip4); + + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x1)); + auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y1)); + auto z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDip4, z1)); + + __m128i v110_in_range, v011_in_range, v101_in_range, v111_in_range; + { + auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + v110_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + v011_in_range = 
_mm_and_si128(y1_in_range, z1_in_range); + v101_in_range = _mm_and_si128(x1_in_range, z1_in_range); + v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); + } + + // (W*H*z + W*y + x) * elempack + vec(8) + auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), z0) + , _mm_add_epi32(_mm_mullo_epi32(y0, vImgWip4), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); + auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); + auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); + + auto i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), vElempacki), i_tnw_offset); + auto i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); + auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); + auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); + + auto tnw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tnw_offset, vn1fp4, sizeof(float)); + auto tne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tne_offset, *reinterpret_cast<__m128*>(&x1_in_range), sizeof(float)); + auto tsw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range), sizeof(float)); + auto tse_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range), sizeof(float)); + + auto bnw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range), sizeof(float)); + auto bne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range), sizeof(float)); + auto bsw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range), sizeof(float)); + auto bse_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range), sizeof(float)); + + auto _v = _mm_mul_ps(tnw_val, tnw); + _v = _mm_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm_comp_fmadd_ps(bse_val, bse, _v); + + _mm_storeu_ps(outptr, _v); + + outptr += elempack; + } + } + } + } + } + } + + if (resize_type == 2) + { + if (padding_mode == 1) + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int z = 0; z < outD; z++) + { + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + + gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); + gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); + gz = get_coord_p4(gz, vImgDfp4, padding_mode, align_corner); + + gx = _mm_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gy = _mm_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gz = _mm_round_ps(gz, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + auto ix = _mm_cvtps_epi32(gx); + auto iy = _mm_cvtps_epi32(gy); + auto iz = 
_mm_cvtps_epi32(gz); + + auto v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWip4, ix)), + _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHip4, iy))); + v_in_range = _mm_and_si128(v_in_range, _mm_and_si128(_mm_cmpgt_epi32(iz, vn1ip4), _mm_cmpgt_epi32(vImgDip4, iz))); + + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), iz) + , _mm_add_epi32(_mm_mullo_epi32(iy, vImgWip4), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + + auto _v = _mm_mask_i32gather_ps(_mm_setzero_ps(), static_cast(bottom_blob.channel(q).data), + i_offset, *reinterpret_cast<__m128*>(&v_in_range), sizeof(float)); + + _mm_storeu_ps(outptr, _v); + + outptr += elempack; + } + } + } + } + } + else { - const float* outptr = bottom_blob.channel(q); - for (int y = 0; y < outH; y++) +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - for (int x = 0; x < outW; x++) + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int z = 0; z < outD; z++) { - outptr += 8; + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + + gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); + gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); + gz = get_coord_p4(gz, vImgDfp4, padding_mode, align_corner); + + gx = _mm_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gy = _mm_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gz = _mm_round_ps(gz, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + auto ix = _mm_cvtps_epi32(gx); + auto iy = _mm_cvtps_epi32(gy); + auto iz = _mm_cvtps_epi32(gz); + + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), iz) + , _mm_add_epi32(_mm_mullo_epi32(iy, vImgWip4), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + + auto _v = _mm_mask_i32gather_ps(_mm_setzero_ps(), static_cast(bottom_blob.channel(q).data), + i_offset, _mm_set1_ps(-1.0f), sizeof(float)); + + _mm_storeu_ps(outptr, _v); + + outptr += elempack; + } + } } } } } + + if (resize_type == 3) + { + NCNN_LOGE("unsupported bicubic when dims == 4"); + return -1; + } } } - if (dims == 4) +#endif // __SSE2__ + + if (elempack == 1) { + return GridSample::forward(bottom_blobs, top_blobs, opt); } - } - -#endif // __SSE2__ - if (elempack == 1) - { - return GridSample::forward(bottom_blobs, top_blobs, opt); + return 0; } - return 0; -} - } // namespace ncnn From 1e32b457172c568519beeb52b482c9fb8ee6f487 Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Thu, 10 Nov 2022 15:22:17 +0000 Subject: [PATCH 020/127] apply code-format changes --- src/layer/x86/gridsample_x86.cpp | 2613 +++++++++++++++--------------- 1 file changed, 1300 insertions(+), 1313 deletions(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index a6b1f16c516..c0fb9432028 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -26,299 +26,741 @@ namespace ncnn { - GridSample_x86::GridSample_x86() - { +GridSample_x86::GridSample_x86() +{ #if __SSE2__ - support_packing = true; + support_packing = true; #endif // __SSE2__ - } +} #if __SSE2__ #if __AVX__ - const __m256 v1fp8 = 
*(__m256*)_ps256_1; - const __m256 vn1fp8 = _mm256_set1_ps(-1.0f); - const __m256i v1ip8 = _mm256_set1_epi32(1); - const __m256i vn1ip8 = _mm256_set1_epi32(-1); - - static __m256 NCNN_FORCEINLINE - grid_sample_unormalize_p8(const __m256& w, const __m256& coordx, int align_corner) +const __m256 v1fp8 = *(__m256*)_ps256_1; +const __m256 vn1fp8 = _mm256_set1_ps(-1.0f); +const __m256i v1ip8 = _mm256_set1_epi32(1); +const __m256i vn1ip8 = _mm256_set1_epi32(-1); + +static __m256 NCNN_FORCEINLINE +grid_sample_unormalize_p8(const __m256& w, const __m256& coordx, int align_corner) +{ + __m256 two = _mm256_set1_ps(2.f); + + if (align_corner) + return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coordx, v1fp8), two), _mm256_sub_ps(w, v1fp8)); + else + return _mm256_div_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(coordx, v1fp8), w), v1fp8), two); +} + +static NCNN_FORCEINLINE __m256 border_coord_p8(const __m256& coord, const __m256& border) +{ + return _mm256_min_ps(border, _mm256_max_ps(coord, _mm256_setzero_ps())); +} + +static NCNN_FORCEINLINE __m256 reflect_coord_p8(__m256 x, const __m256& high) +{ + /* take the absolute value */ + x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflect_v = _mm256_and_ps(_mm256_sub_ps(x, high), *(__m256*)_ps256_inv_sign_mask); + x = _mm256_sub_ps(high, reflect_v); + return x; +} + +static NCNN_FORCEINLINE __m256 compute_coord_p8(__m256 sx, const __m256& w, int padding_mode, int align_corner) +{ + if (padding_mode == 2) // border + { + sx = border_coord_p8(sx, _mm256_sub_ps(w, v1fp8)); + } + else if (padding_mode == 3) // reflection { - __m256 two = _mm256_set1_ps(2.f); - if (align_corner) - return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coordx, v1fp8), two), _mm256_sub_ps(w, v1fp8)); + { + sx = reflect_coord_p8(sx, _mm256_sub_ps(w, v1fp8)); + } else - return _mm256_div_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(coordx, v1fp8), w), v1fp8), two); + { + __m256 v0p5f = *(__m256*)_ps256_0p5; + sx = _mm256_sub_ps(reflect_coord_p8(_mm256_add_ps(sx, v0p5f), w), v0p5f); + sx = border_coord_p8(sx, _mm256_sub_ps(w, v1fp8)); + } } - static NCNN_FORCEINLINE __m256 border_coord_p8(const __m256& coord, const __m256& border) - { - return _mm256_min_ps(border, _mm256_max_ps(coord, _mm256_setzero_ps())); - } + return sx; +} - static NCNN_FORCEINLINE __m256 reflect_coord_p8(__m256 x, const __m256& high) - { - /* take the absolute value */ - x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask); +static NCNN_FORCEINLINE __m256 get_coord_p8(const __m256& x, const __m256& w, int padding_mode, int align_corner) +{ + // compute the origin coordinates + __m256 sx = grid_sample_unormalize_p8(w, x, align_corner); - __m256 reflect_v = _mm256_and_ps(_mm256_sub_ps(x, high), *(__m256*)_ps256_inv_sign_mask); - x = _mm256_sub_ps(high, reflect_v); - return x; - } + // correct the coordinates according to the padding_mode + __m256 coord = compute_coord_p8(sx, w, padding_mode, align_corner); + + return coord; +} + +static NCNN_FORCEINLINE __m256 cubic_interp1d_p8(const __m256& x0_v, const __m256& x1_v, const __m256& x2_v, const __m256& x3_v, const __m256& tx) +{ + const auto A = _mm256_set1_ps(-0.75f); + + const auto x0 = _mm256_add_ps(tx, v1fp8); + const auto& x1 = tx; + const auto x2 = _mm256_sub_ps(v1fp8, tx); + const auto x3 = _mm256_add_ps(x2, v1fp8); + + //should be optimized? 
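The helpers above are lane-wise versions of the usual grid_sample coordinate handling. In scalar form they amount to the following (a transcription for reference, with the _p8 suffixes dropped):

#include <math.h>    // fabsf
#include <algorithm> // std::min, std::max

// Map a normalized coordinate in [-1, 1] to pixel space (w = width or height).
static float grid_sample_unormalize(float w, float coordx, int align_corner)
{
    return align_corner ? (coordx + 1.f) / 2.f * (w - 1.f)
                        : ((coordx + 1.f) * w - 1.f) / 2.f;
}

// Clamp to [0, border] -- the border padding mode.
static float border_coord(float coord, float border)
{
    return std::min(border, std::max(coord, 0.f));
}

// Fold a coordinate back into [0, high] by mirroring -- one reflection step.
static float reflect_coord(float x, float high)
{
    x = fabsf(x);
    return high - fabsf(x - high);
}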
:( + const __m256 coeffs0 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x0), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(4), A)); + const __m256 coeffs1 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x1), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x1), x1), v1fp8); + const __m256 coeffs2 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x2), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x2), x2), v1fp8); + const __m256 coeffs3 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x3), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x3), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x3), _mm256_mul_ps(_mm256_set1_ps(4), A)); + + auto _v = _mm256_mul_ps(coeffs0, x0_v); + _v = _mm256_comp_fmadd_ps(coeffs1, x1_v, _v); + _v = _mm256_comp_fmadd_ps(coeffs2, x2_v, _v); + _v = _mm256_comp_fmadd_ps(coeffs3, x3_v, _v); + + return _v; +} - static NCNN_FORCEINLINE __m256 compute_coord_p8(__m256 sx, const __m256& w, int padding_mode, int align_corner) +#endif // __AVX__ + +const __m128 v1fp4 = _mm_set1_ps(1.0f); +const __m128 vn1fp4 = _mm_set1_ps(-1.0f); +const __m128i v1ip4 = _mm_set1_epi32(1); +const __m128i vn1ip4 = _mm_set1_epi32(-1); + +static __m128 NCNN_FORCEINLINE +grid_sample_unormalize_p4(const __m128& w, const __m128& coordx, int align_corner) +{ + __m128 two = _mm_set1_ps(2.f); + + if (align_corner) + return _mm_mul_ps(_mm_div_ps(_mm_add_ps(coordx, v1fp4), two), _mm_sub_ps(w, v1fp4)); + else + return _mm_div_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(coordx, v1fp4), w), v1fp4), two); +} + +static NCNN_FORCEINLINE __m128 border_coord_p4(const __m128& coord, const __m128& border) +{ + return _mm_min_ps(border, _mm_max_ps(coord, _mm_setzero_ps())); +} + +static NCNN_FORCEINLINE __m128 reflect_coord_p4(__m128 x, const __m128& high) +{ + /* take the absolute value */ + x = _mm_and_ps(x, *(__m128*)_ps256_inv_sign_mask); + + __m128 reflect_v = _mm_and_ps(_mm_sub_ps(x, high), *(__m128*)_ps256_inv_sign_mask); + x = _mm_sub_ps(high, reflect_v); + return x; +} + +static NCNN_FORCEINLINE __m128 compute_coord_p4(__m128 sx, const __m128& w, int padding_mode, int align_corner) +{ + if (padding_mode == 2) // border + { + sx = border_coord_p4(sx, _mm_sub_ps(w, v1fp4)); + } + else if (padding_mode == 3) // reflection { - if (padding_mode == 2) // border + if (align_corner) { - sx = border_coord_p8(sx, _mm256_sub_ps(w, v1fp8)); + sx = reflect_coord_p4(sx, _mm_sub_ps(w, v1fp4)); } - else if (padding_mode == 3) // reflection + else { - if (align_corner) - { - sx = reflect_coord_p8(sx, _mm256_sub_ps(w, v1fp8)); - } - else - { - __m256 v0p5f = *(__m256*)_ps256_0p5; - sx = _mm256_sub_ps(reflect_coord_p8(_mm256_add_ps(sx, v0p5f), w), v0p5f); - sx = border_coord_p8(sx, _mm256_sub_ps(w, v1fp8)); - } + __m128 v0p5f = *(__m128*)_ps256_0p5; + sx = _mm_sub_ps(reflect_coord_p4(_mm_add_ps(sx, v0p5f), w), v0p5f); + sx = border_coord_p4(sx, _mm_sub_ps(w, v1fp4)); } - - return sx; } - static NCNN_FORCEINLINE __m256 get_coord_p8(const __m256& x, const __m256& w, int padding_mode, int align_corner) - { - // compute the origin coordinates - __m256 sx = grid_sample_unormalize_p8(w, x, align_corner); - - // correct the coordinates according to the padding_mode - __m256 coord = compute_coord_p8(sx, w, padding_mode, align_corner); + return sx; +} - return coord; - } +static 
NCNN_FORCEINLINE __m128 get_coord_p4(const __m128& x, const __m128& w, int padding_mode, int align_corner) +{ + // compute the origin coordinates + __m128 sx = grid_sample_unormalize_p4(w, x, align_corner); - static NCNN_FORCEINLINE __m256 cubic_interp1d_p8(const __m256& x0_v, const __m256& x1_v, const __m256& x2_v, const __m256& x3_v, const __m256& tx) - { - const auto A = _mm256_set1_ps(-0.75f); + // correct the coordinates according to the padding_mode + __m128 coord = compute_coord_p4(sx, w, padding_mode, align_corner); - const auto x0 = _mm256_add_ps(tx, v1fp8); - const auto& x1 = tx; - const auto x2 = _mm256_sub_ps(v1fp8, tx); - const auto x3 = _mm256_add_ps(x2, v1fp8); + return coord; +} - //should be optimized? :( - const __m256 coeffs0 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x0), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(4), A)); - const __m256 coeffs1 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x1), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x1), x1), v1fp8); - const __m256 coeffs2 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x2), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x2), x2), v1fp8); - const __m256 coeffs3 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x3), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x3), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x3), _mm256_mul_ps(_mm256_set1_ps(4), A)); +static NCNN_FORCEINLINE __m128 cubic_interp1d_p4(const __m128& x0_v, const __m128& x1_v, const __m128& x2_v, const __m128& x3_v, const __m128& tx) +{ + const auto A = _mm_set1_ps(-0.75f); + const auto x0 = _mm_add_ps(tx, v1fp4); + const auto& x1 = tx; + const auto x2 = _mm_sub_ps(v1fp4, tx); + const auto x3 = _mm_add_ps(x2, v1fp4); - auto _v = _mm256_mul_ps(coeffs0, x0_v); - _v = _mm256_comp_fmadd_ps(coeffs1, x1_v, _v); - _v = _mm256_comp_fmadd_ps(coeffs2, x2_v, _v); - _v = _mm256_comp_fmadd_ps(coeffs3, x3_v, _v); + //should be optimized? 
:( + const __m128 coeffs0 = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(A, x0), _mm_mul_ps(_mm_set1_ps(5.0f), A)), x0), _mm_mul_ps(_mm_set1_ps(8.0f), A)), x0), _mm_mul_ps(_mm_set1_ps(4), A)); + const __m128 coeffs1 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set1_ps(2.0f)), x1), _mm_add_ps(A, _mm_set1_ps(3.0f))), x1), x1), v1fp4); + const __m128 coeffs2 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set1_ps(2.0f)), x2), _mm_add_ps(A, _mm_set1_ps(3.0f))), x2), x2), v1fp4); + const __m128 coeffs3 = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(A, x3), _mm_mul_ps(_mm_set1_ps(5.0f), A)), x3), _mm_mul_ps(_mm_set1_ps(8.0f), A)), x3), _mm_mul_ps(_mm_set1_ps(4), A)); - return _v; - } + auto _v = _mm_mul_ps(coeffs0, x0_v); + _v = _mm_comp_fmadd_ps(coeffs1, x1_v, _v); + _v = _mm_comp_fmadd_ps(coeffs2, x2_v, _v); + _v = _mm_comp_fmadd_ps(coeffs3, x3_v, _v); + return _v; +} -#endif // __AVX__ +#endif // __SSE2__ - const __m128 v1fp4 = _mm_set1_ps(1.0f); - const __m128 vn1fp4 = _mm_set1_ps(-1.0f); - const __m128i v1ip4 = _mm_set1_epi32(1); - const __m128i vn1ip4 = _mm_set1_epi32(-1); +int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& grid = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + const int elempack = bottom_blob.elempack; - static __m128 NCNN_FORCEINLINE - grid_sample_unormalize_p4(const __m128& w, const __m128& coordx, int align_corner) - { - __m128 two = _mm_set1_ps(2.f); + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; - if (align_corner) - return _mm_mul_ps(_mm_div_ps(_mm_add_ps(coordx, v1fp4), two), _mm_sub_ps(w, v1fp4)); - else - return _mm_div_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(coordx, v1fp4), w), v1fp4), two); - } +#if __SSE2__ +#if __AVX__ + const auto vImgWf = _mm256_set1_ps(w); + const auto vImgHf = _mm256_set1_ps(h); + const auto vImgWi = _mm256_set1_epi32(w); + const auto vImgHi = _mm256_set1_epi32(h); - static NCNN_FORCEINLINE __m128 border_coord_p4(const __m128& coord, const __m128& border) + if (elempack == 8) { - return _mm_min_ps(border, _mm_max_ps(coord, _mm_setzero_ps())); - } + const auto vElemsizei = _mm256_set1_epi32(elemsize / 8); + const auto vElempacki = _mm256_set1_epi32(elempack); + const auto vElempackf = _mm256_set1_ps(elempack); - static NCNN_FORCEINLINE __m128 reflect_coord_p4(__m128 x, const __m128& high) - { - /* take the absolute value */ - x = _mm_and_ps(x, *(__m128*)_ps256_inv_sign_mask); + if (dims == 3) + { + const auto outW = grid.h; + const auto outH = grid.c * grid.elempack; - __m128 reflect_v = _mm_and_ps(_mm_sub_ps(x, high), *(__m128*)_ps256_inv_sign_mask); - x = _mm_sub_ps(high, reflect_v); - return x; - } + top_blob.create(outW, outH, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - static NCNN_FORCEINLINE __m128 compute_coord_p4(__m128 sx, const __m128& w, int padding_mode, int align_corner) - { - if (padding_mode == 2) // border - { - sx = border_coord_p4(sx, _mm_sub_ps(w, v1fp4)); - } - else if (padding_mode == 3) // reflection - { - if (align_corner) + if (resize_type == 1) //zeros { - sx = reflect_coord_p4(sx, _mm_sub_ps(w, v1fp4)); + if (padding_mode == 1) //zeros + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < 
channels; q++) + { + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); + gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + + auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + + auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(outptr, _v); + + outptr += elempack; + } + } + } + } + else //border reflection + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); + gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); + 
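For reference, here is a minimal scalar sketch of what one iteration of this bilinear path computes: unnormalize the grid coordinate, then blend the four neighbouring pixels with the nw/ne/sw/se weights. It is illustrative only and not ncnn code; the helper names and the single-channel row-major float* layout are assumptions for the example, and the zero-padding variant is shown (the border/reflection variant clamps or mirrors the coordinate first, as get_coord_p8 does above).

#include <cmath>

// Hypothetical scalar reference for one sample, single-channel image src[y * w + x].
static inline float unnormalize(float coord, int size, bool align_corner)
{
    // align_corner: [-1, 1] maps to [0, size - 1]
    // otherwise:    [-1, 1] maps to [-0.5, size - 0.5]
    if (align_corner)
        return (coord + 1.f) / 2.f * (size - 1);
    return ((coord + 1.f) * size - 1.f) / 2.f;
}

static inline float sample_bilinear_zeros(const float* src, int w, int h,
                                          float gx, float gy, bool align_corner)
{
    float sx = unnormalize(gx, w, align_corner);
    float sy = unnormalize(gy, h, align_corner);

    int x0 = (int)std::floor(sx);
    int y0 = (int)std::floor(sy);
    int x1 = x0 + 1;
    int y1 = y0 + 1;

    float tx = sx - x0; // fraction toward x1
    float ty = sy - y0; // fraction toward y1

    // out-of-range taps read zero, which is the effect of the in-range masks
    // fed into _mm256_mask_i32gather_ps in the vector code
    auto at = [&](int x, int y) -> float {
        return (x < 0 || x >= w || y < 0 || y >= h) ? 0.f : src[y * w + x];
    };

    float nw = (1.f - tx) * (1.f - ty); // weight of (x0, y0)
    float ne = tx * (1.f - ty);         // weight of (x1, y0)
    float sw = (1.f - tx) * ty;         // weight of (x0, y1)
    float se = tx * ty;                 // weight of (x1, y1)

    return at(x0, y0) * nw + at(x1, y0) * ne + at(x0, y1) * sw + at(x1, y1) * se;
}

In the packed layout the vector code turns each of these taps into a gather offset of the form (w * y + x) * elempack + lane, which is what i_nw_offset and the three offsets derived from it encode.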
+ auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_nw_offset, vn1fp8, sizeof(float)); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(outptr, _v); + + outptr += elempack; + } + } + } + } } - else + + if (resize_type == 2) { - __m128 v0p5f = *(__m128*)_ps256_0p5; - sx = _mm_sub_ps(reflect_coord_p4(_mm_add_ps(sx, v0p5f), w), v0p5f); - sx = border_coord_p4(sx, _mm_sub_ps(w, v1fp4)); + if (padding_mode == 1) //zeros + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); + gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); + + gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); + + auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), + i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + + _mm256_storeu_ps(outptr, _v); + + outptr += elempack; + } + } + } + } + else //border 
reflection + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); + + gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); + gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); + + gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); + + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), + i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + + _mm256_storeu_ps(outptr, _v); + + outptr += elempack; + } + } + } + } } - } - return sx; - } + if (resize_type == 3) + { + if (padding_mode == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - static NCNN_FORCEINLINE __m128 get_coord_p4(const __m128& x, const __m128& w, int padding_mode, int align_corner) - { - // compute the origin coordinates - __m128 sx = grid_sample_unormalize_p4(w, x, align_corner); + gx = grid_sample_unormalize_p8(vImgWf, gx, align_corner); + gy = grid_sample_unormalize_p8(vImgHf, gy, align_corner); - // correct the coordinates according to the padding_mode - __m128 coord = compute_coord_p4(sx, w, padding_mode, align_corner); + auto gx_floor = _mm256_floor_ps(gx); + auto gy_floor = _mm256_floor_ps(gy); - return coord; - } + const auto tx = _mm256_sub_ps(gx, gx_floor); + const auto ty = _mm256_sub_ps(gy, gy_floor); - static NCNN_FORCEINLINE __m128 cubic_interp1d_p4(const __m128& x0_v, const __m128& x1_v, const __m128& x2_v, const __m128& x3_v, const __m128& tx) - { - const auto A = _mm_set1_ps(-0.75f); + __m256 coefficients[4]; - const auto x0 = _mm_add_ps(tx, v1fp4); - const auto& x1 = tx; - const auto x2 = _mm_sub_ps(v1fp4, tx); - const auto x3 = _mm_add_ps(x2, v1fp4); + for (int i = 0; i < 4; i++) + { + auto gx0 = compute_coord_p8(_mm256_add_ps(gx_floor, vn1fp8), vImgWf, padding_mode, align_corner); + auto gx1 = compute_coord_p8(gx_floor, vImgWf, padding_mode, align_corner); + auto gx2 = compute_coord_p8(_mm256_add_ps(gx_floor, v1fp8), vImgWf, padding_mode, align_corner); + auto gx3 = compute_coord_p8(_mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)), vImgWf, padding_mode, align_corner); - //should be optimized? 
:( - const __m128 coeffs0 = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(A, x0), _mm_mul_ps(_mm_set1_ps(5.0f), A)), x0), _mm_mul_ps(_mm_set1_ps(8.0f), A)), x0), _mm_mul_ps(_mm_set1_ps(4), A)); - const __m128 coeffs1 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set1_ps(2.0f)), x1), _mm_add_ps(A, _mm_set1_ps(3.0f))), x1), x1), v1fp4); - const __m128 coeffs2 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set1_ps(2.0f)), x2), _mm_add_ps(A, _mm_set1_ps(3.0f))), x2), x2), v1fp4); - const __m128 coeffs3 = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(A, x3), _mm_mul_ps(_mm_set1_ps(5.0f), A)), x3), _mm_mul_ps(_mm_set1_ps(8.0f), A)), x3), _mm_mul_ps(_mm_set1_ps(4), A)); + gy = compute_coord_p8(_mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)), vImgHf, padding_mode, align_corner); + auto x0 = _mm256_cvtps_epi32(gx0); + auto x1 = _mm256_cvtps_epi32(gx1); + auto x2 = _mm256_cvtps_epi32(gx2); + auto x3 = _mm256_cvtps_epi32(gx3); - auto _v = _mm_mul_ps(coeffs0, x0_v); - _v = _mm_comp_fmadd_ps(coeffs1, x1_v, _v); - _v = _mm_comp_fmadd_ps(coeffs2, x2_v, _v); - _v = _mm_comp_fmadd_ps(coeffs3, x3_v, _v); + auto y = _mm256_cvtps_epi32(gy); - return _v; - } + auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x2)); + auto x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); + + auto y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y)); + + auto v0_in_range = _mm256_and_si256(x0_in_range, y_in_range); + auto v1_in_range = _mm256_and_si256(x1_in_range, y_in_range); + auto v2_in_range = _mm256_and_si256(x2_in_range, y_in_range); + auto v3_in_range = _mm256_and_si256(x3_in_range, y_in_range); + + auto x0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + auto x0_offset = _mm256_cvtps_epi32(x0_offset_f); + auto x1_offset = _mm256_cvtps_epi32(x1_offset_f); + auto x2_offset = _mm256_cvtps_epi32(x2_offset_f); + auto x3_offset = _mm256_cvtps_epi32(x3_offset_f); + + auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x0_offset, *reinterpret_cast<__m256*>(&v0_in_range), sizeof(float)); + auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x1_offset, *reinterpret_cast<__m256*>(&v1_in_range), sizeof(float)); + auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x2_offset, *reinterpret_cast<__m256*>(&v2_in_range), sizeof(float)); + auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x3_offset, *reinterpret_cast<__m256*>(&v3_in_range), sizeof(float)); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, 
x3_val, tx); + } -#endif // __SSE2__ + auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const - { - const Mat& bottom_blob = bottom_blobs[0]; - const Mat& grid = bottom_blobs[1]; - Mat& top_blob = top_blobs[0]; - const int elempack = bottom_blob.elempack; - - int w = bottom_blob.w; - int h = bottom_blob.h; - int d = bottom_blob.d; - int channels = bottom_blob.c; - int dims = bottom_blob.dims; - size_t elemsize = bottom_blob.elemsize; + _mm256_storeu_ps(outptr, _v); -#if __SSE2__ -#if __AVX__ - const auto vImgWf = _mm256_set1_ps(w); - const auto vImgHf = _mm256_set1_ps(h); - const auto vImgWi = _mm256_set1_epi32(w); - const auto vImgHi = _mm256_set1_epi32(h); + outptr += elempack; + } + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x < outW; x++) + { + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + gx = grid_sample_unormalize_p8(vImgWf, gx, align_corner); + gy = grid_sample_unormalize_p8(vImgHf, gy, align_corner); + + auto gx_floor = _mm256_floor_ps(gx); + auto gy_floor = _mm256_floor_ps(gy); + + const auto tx = _mm256_sub_ps(gx, gx_floor); + const auto ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + for (int i = 0; i < 4; i++) + { + auto gx0 = compute_coord_p8(_mm256_add_ps(gx_floor, vn1fp8), vImgWf, padding_mode, align_corner); + auto gx1 = compute_coord_p8(gx_floor, vImgWf, padding_mode, align_corner); + auto gx2 = compute_coord_p8(_mm256_add_ps(gx_floor, v1fp8), vImgWf, padding_mode, align_corner); + auto gx3 = compute_coord_p8(_mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)), vImgWf, padding_mode, align_corner); + + gy = compute_coord_p8(_mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)), vImgHf, padding_mode, align_corner); + + auto x0 = _mm256_cvtps_epi32(gx0); + auto x1 = _mm256_cvtps_epi32(gx1); + auto x2 = _mm256_cvtps_epi32(gx2); + auto x3 = _mm256_cvtps_epi32(gx3); + + auto y = _mm256_cvtps_epi32(gy); + + auto x0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto x3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + auto x0_offset = _mm256_cvtps_epi32(x0_offset_f); + auto x1_offset = _mm256_cvtps_epi32(x1_offset_f); + auto x2_offset = _mm256_cvtps_epi32(x2_offset_f); + auto x3_offset = _mm256_cvtps_epi32(x3_offset_f); + + auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x0_offset, vn1fp8, sizeof(float)); + auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x1_offset, vn1fp8, sizeof(float)); + auto x2_val = 
_mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x2_offset, vn1fp8, sizeof(float)); + auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x3_offset, vn1fp8, sizeof(float)); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } - if (elempack == 8) + auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(outptr, _v); + + outptr += elempack; + } + } + } + } + } + } + + if (dims == 4) { - const auto vElemsizei = _mm256_set1_epi32(elemsize / 8); - const auto vElempacki = _mm256_set1_epi32(elempack); - const auto vElempackf = _mm256_set1_ps(elempack); + const int outW = grid.h; + const int outH = grid.d; + const int outD = grid.c * grid.elempack; - if (dims == 3) - { - const auto outW = grid.h; - const auto outH = grid.c * grid.elempack; + top_blob.create(outW, outH, outD, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - top_blob.create(outW, outH, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + const auto vImgDf = _mm256_set1_ps(d); + const auto vImgDi = _mm256_set1_epi32(d); - if (resize_type == 1) //zeros + if (resize_type == 1) + { + if (padding_mode == 1) { - if (padding_mode == 1) //zeros + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int z = 0; z < outD; z++) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); for (int y = 0; y < outH; y++) { for (int x = 0; x < outW; x++) { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); + gz = get_coord_p8(gz, vImgDf, padding_mode, align_corner); auto x_w = _mm256_floor_ps(gx); auto y_n = _mm256_floor_ps(gy); + auto z_t = _mm256_floor_ps(gz); auto w = _mm256_sub_ps(gx, x_w); auto e = _mm256_sub_ps(v1fp8, w); auto n = _mm256_sub_ps(gy, y_n); auto s = _mm256_sub_ps(v1fp8, n); + auto t = _mm256_sub_ps(gz, z_t); + auto b = _mm256_sub_ps(v1fp8, t); - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); + __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + tnw = _mm256_mul_ps(b, nw); + tne = _mm256_mul_ps(b, ne); + tsw = _mm256_mul_ps(b, sw); + tse = _mm256_mul_ps(b, se); + + bnw = _mm256_mul_ps(t, nw); + bne = _mm256_mul_ps(t, ne); + bsw = _mm256_mul_ps(t, sw); + bse = _mm256_mul_ps(t, se); + } auto x0 = _mm256_cvtps_epi32(x_w); auto x1 = _mm256_add_epi32(x0, v1ip8); auto y0 = _mm256_cvtps_epi32(y_n); auto y1 = _mm256_add_epi32(y0, v1ip8); + auto z0 = _mm256_cvtps_epi32(z_t); + auto z1 = _mm256_add_epi32(z0, v1ip8); auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); auto x1_in_range = 
_mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + auto z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z0)); + auto z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); - auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - // (W*y + x) * elempack + vec(8) - auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); - - auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + __m256i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + v000_in_range = _mm256_and_si256(v00_in_range, z0_in_range); + v010_in_range = _mm256_and_si256(v01_in_range, z0_in_range); + v100_in_range = _mm256_and_si256(v10_in_range, z0_in_range); + v110_in_range = _mm256_and_si256(v11_in_range, z0_in_range); + + v001_in_range = _mm256_and_si256(v00_in_range, z1_in_range); + v011_in_range = _mm256_and_si256(v01_in_range, z1_in_range); + v101_in_range = _mm256_and_si256(v10_in_range, z1_in_range); + v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); + } - auto _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); + // (W*H*z + W*y + x) * elempack + vec(8) + auto i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); + auto i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); + + auto i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + auto i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); + auto i_bsw_offset = 
_mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + + auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tnw_offset, *reinterpret_cast<__m256*>(&v000_in_range), sizeof(float)); + auto tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tne_offset, *reinterpret_cast<__m256*>(&v100_in_range), sizeof(float)); + auto tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tsw_offset, *reinterpret_cast<__m256*>(&v010_in_range), sizeof(float)); + auto tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); + + auto bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bnw_offset, *reinterpret_cast<__m256*>(&v001_in_range), sizeof(float)); + auto bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); + auto bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); + auto bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(tnw_val, tnw); + _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); _mm256_storeu_ps(outptr, _v); @@ -327,62 +769,111 @@ namespace ncnn { } } } - else //border reflection + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int z = 0; z < outD; z++) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); for (int y = 0; y < outH; y++) { for (int x = 0; x < outW; x++) { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); + gz = get_coord_p8(gz, vImgDf, padding_mode, align_corner); auto x_w = _mm256_floor_ps(gx); auto y_n = _mm256_floor_ps(gy); + auto z_t = _mm256_floor_ps(gz); auto w = _mm256_sub_ps(gx, x_w); auto e = _mm256_sub_ps(v1fp8, w); auto n = _mm256_sub_ps(gy, y_n); auto s = _mm256_sub_ps(v1fp8, n); + auto t = _mm256_sub_ps(gz, z_t); + auto b = _mm256_sub_ps(v1fp8, t); - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); + __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + tnw = _mm256_mul_ps(b, nw); + tne = _mm256_mul_ps(b, ne); + tsw = _mm256_mul_ps(b, sw); + tse = _mm256_mul_ps(b, se); + + bnw = _mm256_mul_ps(t, nw); + bne = 
_mm256_mul_ps(t, ne); + bsw = _mm256_mul_ps(t, sw); + bse = _mm256_mul_ps(t, se); + } auto x0 = _mm256_cvtps_epi32(x_w); auto x1 = _mm256_add_epi32(x0, v1ip8); auto y0 = _mm256_cvtps_epi32(y_n); auto y1 = _mm256_add_epi32(y0, v1ip8); + auto z0 = _mm256_cvtps_epi32(z_t); + auto z1 = _mm256_add_epi32(z0, v1ip8); auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + auto z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + __m256i v110_in_range, v011_in_range, v101_in_range, v111_in_range; + { + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + v110_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_nw_offset, vn1fp8, sizeof(float)); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + v011_in_range = _mm256_and_si256(y1_in_range, z1_in_range); + v101_in_range = _mm256_and_si256(x1_in_range, z1_in_range); + v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); + } - auto _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); + // (W*H*z + W*y + x) * elempack + vec(8) + auto i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); + auto i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); + + auto i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + auto i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); + auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + + auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tnw_offset, vn1fp8, sizeof(float)); + auto tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + auto tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + auto tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); + 
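The eight masked gathers in this branch (tnw/tne/tsw/tse above, bnw/bne/bsw/bse just below) fetch the corners of the voxel cube around the sample point, and the FMA chain that follows blends them with the trilinear weights computed earlier. A minimal scalar sketch of that blend, assuming the corner values and the fractional offsets tx/ty/tz are already gathered (function and array names are illustrative, not ncnn API):

// c[z][y][x] holds the 8 corner samples of the surrounding cube;
// tx/ty/tz are the fractional parts of the sample position inside it.
static inline float trilinear_blend(const float c[2][2][2], float tx, float ty, float tz)
{
    // per-slice bilinear weights, same naming as the vector path:
    // n/s split along y, w/e split along x
    float nw = (1.f - tx) * (1.f - ty);
    float ne = tx * (1.f - ty);
    float sw = (1.f - tx) * ty;
    float se = tx * ty;

    float top    = c[0][0][0] * nw + c[0][0][1] * ne + c[0][1][0] * sw + c[0][1][1] * se; // z0 slice
    float bottom = c[1][0][0] * nw + c[1][0][1] * ne + c[1][1][0] * sw + c[1][1][1] * se; // z1 slice

    // tnw..bse in the vector code are these slice weights folded into the corner weights
    return top * (1.f - tz) + bottom * tz;
}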
+ auto bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range), sizeof(float)); + auto bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); + auto bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); + auto bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(tnw_val, tnw); + _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); _mm256_storeu_ps(outptr, _v); @@ -391,42 +882,50 @@ namespace ncnn { } } } - } + } - if (resize_type == 2) + if (resize_type == 2) + { + if (padding_mode == 1) { - if (padding_mode == 1) //zeros + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int z = 0; z < outD; z++) { - float* outptr = top_blob.channel(q); for (int y = 0; y < outH; y++) { for (int x = 0; x < outW; x++) { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); + gz = get_coord_p8(gz, vImgDf, padding_mode, align_corner); gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gz = _mm256_round_ps(gz, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); auto ix = _mm256_cvtps_epi32(gx); auto iy = _mm256_cvtps_epi32(gy); + auto iz = _mm256_cvtps_epi32(gz); auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, vn1ip8), _mm256_cmpgt_epi32(vImgDi, iz))); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), - i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); _mm256_storeu_ps(outptr, _v); @@ -435,34 +934,42 @@ namespace ncnn { } } } - else //border reflection + 
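In nearest mode each output position rounds the unnormalized coordinate and gathers a single value; with zero padding the gather is masked off whenever the rounded coordinate falls outside the volume. A scalar sketch of the same lookup, assuming a single-channel row-major volume src[(z * h + y) * w + x] (names are illustrative, not ncnn code):

#include <cmath>

// sx/sy/sz are pixel-space coordinates, i.e. already passed through the
// unnormalize/padding handling that get_coord_p8 performs in the vector path.
static inline float sample_nearest_3d_zeros(const float* src, int w, int h, int d,
                                            float sx, float sy, float sz)
{
    // round to nearest (ties to even under the default FP environment),
    // corresponding to _MM_FROUND_TO_NEAREST_INT above
    int x = (int)std::nearbyint(sx);
    int y = (int)std::nearbyint(sy);
    int z = (int)std::nearbyint(sz);

    // zero padding: outside the volume the sample is zero, the role played
    // by the v_in_range mask in the masked gather
    if (x < 0 || x >= w || y < 0 || y >= h || z < 0 || z >= d)
        return 0.f;

    return src[(z * h + y) * w + x];
}

The packed-layout equivalent of that single load is the gather offset ((w * h) * z + w * y + x) * elempack + lane computed as i_offset above.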
} + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int z = 0; z < outD; z++) { - float* outptr = top_blob.channel(q); for (int y = 0; y < outH; y++) { for (int x = 0; x < outW; x++) { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); + gz = get_coord_p8(gz, vImgDf, padding_mode, align_corner); gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gz = _mm256_round_ps(gz, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); auto ix = _mm256_cvtps_epi32(gx); auto iy = _mm256_cvtps_epi32(gy); + auto iz = _mm256_cvtps_epi32(gz); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), - i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); _mm256_storeu_ps(outptr, _v); @@ -472,618 +979,548 @@ namespace ncnn { } } } + } - if (resize_type == 3) - { - if (padding_mode == 1) - { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - - gx = grid_sample_unormalize_p8(vImgWf, gx, align_corner); - gy = grid_sample_unormalize_p8(vImgHf, gy, align_corner); - - auto gx_floor = _mm256_floor_ps(gx); - auto gy_floor = _mm256_floor_ps(gy); + if (resize_type == 3) + { + NCNN_LOGE("unsupported bicubic when dims == 4"); + return -1; + } + } + } - const auto tx = _mm256_sub_ps(gx, gx_floor); - const auto ty = _mm256_sub_ps(gy, gy_floor); +#endif // __AVX__ - __m256 coefficients[4]; + const auto vImgWfp4 = _mm_set1_ps(w); + const auto vImgHfp4 = _mm_set1_ps(h); + const auto vImgWip4 = _mm_set1_epi32(w); + const auto vImgHip4 = _mm_set1_epi32(h); - for (int i = 0; i < 4; i++) - { - auto gx0 = compute_coord_p8(_mm256_add_ps(gx_floor, vn1fp8), vImgWf, padding_mode, align_corner); - auto gx1 = compute_coord_p8(gx_floor, vImgWf, padding_mode, align_corner); - auto gx2 = compute_coord_p8(_mm256_add_ps(gx_floor, v1fp8), vImgWf, padding_mode, align_corner); - auto gx3 = compute_coord_p8(_mm256_add_ps(gx_floor, 
_mm256_set1_ps(2.0f)), vImgWf, padding_mode, align_corner); - - gy = compute_coord_p8(_mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)), vImgHf, padding_mode, align_corner); - - auto x0 = _mm256_cvtps_epi32(gx0); - auto x1 = _mm256_cvtps_epi32(gx1); - auto x2 = _mm256_cvtps_epi32(gx2); - auto x3 = _mm256_cvtps_epi32(gx3); - - auto y = _mm256_cvtps_epi32(gy); - - auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x2)); - auto x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); - - auto y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y)); - - auto v0_in_range = _mm256_and_si256(x0_in_range, y_in_range); - auto v1_in_range = _mm256_and_si256(x1_in_range, y_in_range); - auto v2_in_range = _mm256_and_si256(x2_in_range, y_in_range); - auto v3_in_range = _mm256_and_si256(x3_in_range, y_in_range); - - auto x0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - auto x0_offset = _mm256_cvtps_epi32(x0_offset_f); - auto x1_offset = _mm256_cvtps_epi32(x1_offset_f); - auto x2_offset = _mm256_cvtps_epi32(x2_offset_f); - auto x3_offset = _mm256_cvtps_epi32(x3_offset_f); - - auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x0_offset, *reinterpret_cast<__m256*>(&v0_in_range), sizeof(float)); - auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x1_offset, *reinterpret_cast<__m256*>(&v1_in_range), sizeof(float)); - auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x2_offset, *reinterpret_cast<__m256*>(&v2_in_range), sizeof(float)); - auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x3_offset, *reinterpret_cast<__m256*>(&v3_in_range), sizeof(float)); - - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } + if (elempack == 4) + { + const auto vElemsizei = _mm_set1_epi32(elemsize / 8); + const auto vElempacki = _mm_set1_epi32(elempack); + const auto vElempackf = _mm_set1_ps(elempack); - auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + if (dims == 3) + { + const auto outW = grid.h; + const auto outH = grid.c * grid.elempack; - _mm256_storeu_ps(outptr, _v); + top_blob.create(outW, outH, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - outptr += elempack; - } + if (resize_type == 1) //zeros + { + if (padding_mode == 1) //zeros + { +#pragma parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) + { + for (int x = 0; x 
< outW; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); + gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); + + auto x_w = _mm_floor_ps(gx); + auto y_n = _mm_floor_ps(gy); + + auto w = _mm_sub_ps(gx, x_w); + auto e = _mm_sub_ps(v1fp4, w); + auto n = _mm_sub_ps(gy, y_n); + auto s = _mm_sub_ps(v1fp4, n); + + auto nw = _mm_mul_ps(s, e); + auto ne = _mm_mul_ps(s, w); + auto sw = _mm_mul_ps(n, e); + auto se = _mm_mul_ps(n, w); + + auto x0 = _mm_cvtps_epi32(x_w); + auto x1 = _mm_add_epi32(x0, v1ip4); + auto y0 = _mm_cvtps_epi32(y_n); + auto y1 = _mm_add_epi32(y0, v1ip4); + + auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x0)); + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x1)); + auto y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y0)); + auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y1)); + + auto v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); + auto v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); + auto v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); + auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWip4), x0), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); + auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); + + auto nw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_nw_offset, *reinterpret_cast<__m128*>(&v00_in_range), sizeof(float)); + auto ne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m128*>(&v10_in_range), sizeof(float)); + auto sw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m128*>(&v01_in_range), sizeof(float)); + auto se_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range), sizeof(float)); + + auto _v = _mm_mul_ps(nw_val, nw); + _v = _mm_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm_comp_fmadd_ps(se_val, se, _v); + + _mm_storeu_ps(outptr, _v); + + outptr += elempack; } } } - else + } + else //border reflection + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int y = 0; y < outH; y++) + for (int x = 0; x < outW; x++) { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); - gx = 
grid_sample_unormalize_p8(vImgWf, gx, align_corner); - gy = grid_sample_unormalize_p8(vImgHf, gy, align_corner); + gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); + gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); - auto gx_floor = _mm256_floor_ps(gx); - auto gy_floor = _mm256_floor_ps(gy); + auto x_w = _mm_floor_ps(gx); + auto y_n = _mm_floor_ps(gy); - const auto tx = _mm256_sub_ps(gx, gx_floor); - const auto ty = _mm256_sub_ps(gy, gy_floor); + auto w = _mm_sub_ps(gx, x_w); + auto e = _mm_sub_ps(v1fp4, w); + auto n = _mm_sub_ps(gy, y_n); + auto s = _mm_sub_ps(v1fp4, n); - __m256 coefficients[4]; + auto nw = _mm_mul_ps(s, e); + auto ne = _mm_mul_ps(s, w); + auto sw = _mm_mul_ps(n, e); + auto se = _mm_mul_ps(n, w); - for (int i = 0; i < 4; i++) - { - auto gx0 = compute_coord_p8(_mm256_add_ps(gx_floor, vn1fp8), vImgWf, padding_mode, align_corner); - auto gx1 = compute_coord_p8(gx_floor, vImgWf, padding_mode, align_corner); - auto gx2 = compute_coord_p8(_mm256_add_ps(gx_floor, v1fp8), vImgWf, padding_mode, align_corner); - auto gx3 = compute_coord_p8(_mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)), vImgWf, padding_mode, align_corner); - - gy = compute_coord_p8(_mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)), vImgHf, padding_mode, align_corner); - - auto x0 = _mm256_cvtps_epi32(gx0); - auto x1 = _mm256_cvtps_epi32(gx1); - auto x2 = _mm256_cvtps_epi32(gx2); - auto x3 = _mm256_cvtps_epi32(gx3); - - auto y = _mm256_cvtps_epi32(gy); - - auto x0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - auto x0_offset = _mm256_cvtps_epi32(x0_offset_f); - auto x1_offset = _mm256_cvtps_epi32(x1_offset_f); - auto x2_offset = _mm256_cvtps_epi32(x2_offset_f); - auto x3_offset = _mm256_cvtps_epi32(x3_offset_f); - - auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x0_offset, vn1fp8, sizeof(float)); - auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x1_offset, vn1fp8, sizeof(float)); - auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x2_offset, vn1fp8, sizeof(float)); - auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x3_offset, vn1fp8, sizeof(float)); - - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } + auto x0 = _mm_cvtps_epi32(x_w); + auto x1 = _mm_add_epi32(x0, v1ip4); + auto y0 = _mm_cvtps_epi32(y_n); + auto y1 = _mm_add_epi32(y0, v1ip4); - auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x1)); + auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y1)); - _mm256_storeu_ps(outptr, _v); + auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - outptr += elempack; - } + auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWip4), x0), vElempacki), + 
_mm_set_epi32(3, 2, 1, 0)); + auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); + auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); + + auto nw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_nw_offset, vn1fp4, sizeof(float)); + auto ne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m128*>(&x1_in_range), sizeof(float)); + auto sw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range), sizeof(float)); + auto se_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range), sizeof(float)); + + auto _v = _mm_mul_ps(nw_val, nw); + _v = _mm_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm_comp_fmadd_ps(se_val, se, _v); + + _mm_storeu_ps(outptr, _v); + + outptr += elempack; } } } } } - if (dims == 4) + if (resize_type == 2) { - const int outW = grid.h; - const int outH = grid.d; - const int outD = grid.c * grid.elempack; - - top_blob.create(outW, outH, outD, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - const auto vImgDf = _mm256_set1_ps(d); - const auto vImgDi = _mm256_set1_epi32(d); - - if (resize_type == 1) + if (padding_mode == 1) //zeros { - if (padding_mode == 1) + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int z = 0; z < outD; z++) + for (int x = 0; x < outW; x++) { - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); - - gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); - gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); - gz = get_coord_p8(gz, vImgDf, padding_mode, align_corner); - - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - auto z_t = _mm256_floor_ps(gz); - - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - auto t = _mm256_sub_ps(gz, z_t); - auto b = _mm256_sub_ps(v1fp8, t); - - __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); - - tnw = _mm256_mul_ps(b, nw); - tne = _mm256_mul_ps(b, ne); - tsw = _mm256_mul_ps(b, sw); - tse = _mm256_mul_ps(b, se); - - bnw = _mm256_mul_ps(t, nw); - bne = _mm256_mul_ps(t, ne); - bsw = _mm256_mul_ps(t, sw); - bse = _mm256_mul_ps(t, se); - } - - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - auto z0 = _mm256_cvtps_epi32(z_t); - auto z1 = _mm256_add_epi32(z0, v1ip8); - - auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto 
y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - auto z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z0)); - auto z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); - - __m256i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - v000_in_range = _mm256_and_si256(v00_in_range, z0_in_range); - v010_in_range = _mm256_and_si256(v01_in_range, z0_in_range); - v100_in_range = _mm256_and_si256(v10_in_range, z0_in_range); - v110_in_range = _mm256_and_si256(v11_in_range, z0_in_range); - - v001_in_range = _mm256_and_si256(v00_in_range, z1_in_range); - v011_in_range = _mm256_and_si256(v01_in_range, z1_in_range); - v101_in_range = _mm256_and_si256(v10_in_range, z1_in_range); - v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); - } - - // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0) - , _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); - auto i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - - auto i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - auto i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); - auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); - - auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tnw_offset, *reinterpret_cast<__m256*>(&v000_in_range), sizeof(float)); - auto tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tne_offset, *reinterpret_cast<__m256*>(&v100_in_range), sizeof(float)); - auto tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tsw_offset, *reinterpret_cast<__m256*>(&v010_in_range), sizeof(float)); - auto tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); - - auto bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bnw_offset, *reinterpret_cast<__m256*>(&v001_in_range), sizeof(float)); - auto bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); - auto bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); - auto bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); - - auto _v = _mm256_mul_ps(tnw_val, tnw); - _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm256_comp_fmadd_ps(bnw_val, bnw, 
_v); - _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); - - _mm256_storeu_ps(outptr, _v); - - outptr += elempack; - } - } + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); + gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); + + gx = _mm_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gy = _mm_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + auto ix = _mm_cvtps_epi32(gx); + auto iy = _mm_cvtps_epi32(gy); + + auto v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWip4, ix)), + _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHip4, iy))); + + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWip4), ix), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + + auto _v = _mm_mask_i32gather_ps(_mm_setzero_ps(), static_cast(bottom_blob.channel(q).data), + i_offset, *reinterpret_cast<__m128*>(&v_in_range), sizeof(float)); + + _mm_storeu_ps(outptr, _v); + + outptr += elempack; } } } - else + } + else //border reflection + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + for (int y = 0; y < outH; y++) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int z = 0; z < outD; z++) + for (int x = 0; x < outW; x++) { - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); - - gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); - gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); - gz = get_coord_p8(gz, vImgDf, padding_mode, align_corner); - - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - auto z_t = _mm256_floor_ps(gz); - - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - auto t = _mm256_sub_ps(gz, z_t); - auto b = _mm256_sub_ps(v1fp8, t); - - __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); - - tnw = _mm256_mul_ps(b, nw); - tne = _mm256_mul_ps(b, ne); - tsw = _mm256_mul_ps(b, sw); - tse = _mm256_mul_ps(b, se); - - bnw = _mm256_mul_ps(t, nw); - bne = _mm256_mul_ps(t, ne); - bsw = _mm256_mul_ps(t, sw); - bse = _mm256_mul_ps(t, se); - } - - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - auto z0 = _mm256_cvtps_epi32(z_t); - auto z1 = _mm256_add_epi32(z0, v1ip8); - - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - auto z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, 
z1)); - - __m256i v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - v110_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - v011_in_range = _mm256_and_si256(y1_in_range, z1_in_range); - v101_in_range = _mm256_and_si256(x1_in_range, z1_in_range); - v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); - } - - // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0) - , _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); - auto i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - - auto i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - auto i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); - auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); - - auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tnw_offset, vn1fp8, sizeof(float)); - auto tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - auto tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - auto tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); - - auto bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range), sizeof(float)); - auto bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); - auto bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); - auto bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); - - auto _v = _mm256_mul_ps(tnw_val, tnw); - _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); - - _mm256_storeu_ps(outptr, _v); - - outptr += elempack; - } - } + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); + + gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); + gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); + + gx = _mm_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gy = _mm_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + auto ix = _mm_cvtps_epi32(gx); + auto iy = _mm_cvtps_epi32(gy); + + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWip4), ix), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + + auto _v = _mm_mask_i32gather_ps(_mm_setzero_ps(), 
static_cast(bottom_blob.channel(q).data), + i_offset, _mm_set1_ps(-1.0f), sizeof(float)); + + _mm_storeu_ps(outptr, _v); + + outptr += elempack; } } } } + } - if (resize_type == 2) + if (resize_type == 3) + { + if (padding_mode == 1) { - if (padding_mode == 1) + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int z = 0; z < outD; z++) + for (int x = 0; x < outW; x++) { - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + gx = grid_sample_unormalize_p4(vImgWfp4, gx, align_corner); + gy = grid_sample_unormalize_p4(vImgHfp4, gy, align_corner); - gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); - gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); - gz = get_coord_p8(gz, vImgDf, padding_mode, align_corner); + auto gx_floor = _mm_floor_ps(gx); + auto gy_floor = _mm_floor_ps(gy); - gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - gz = _mm256_round_ps(gz, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + const auto tx = _mm_sub_ps(gx, gx_floor); + const auto ty = _mm_sub_ps(gy, gy_floor); - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); - auto iz = _mm256_cvtps_epi32(gz); + __m128 coefficients[4]; - auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); - v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, vn1ip8), _mm256_cmpgt_epi32(vImgDi, iz))); + for (int i = 0; i < 4; i++) + { + auto gx0 = compute_coord_p4(_mm_add_ps(gx_floor, vn1fp4), vImgWfp4, padding_mode, align_corner); + auto gx1 = compute_coord_p4(gx_floor, vImgWfp4, padding_mode, align_corner); + auto gx2 = compute_coord_p4(_mm_add_ps(gx_floor, v1fp4), vImgWfp4, padding_mode, align_corner); + auto gx3 = compute_coord_p4(_mm_add_ps(gx_floor, _mm_set1_ps(2.0f)), vImgWfp4, padding_mode, align_corner); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz) - , _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + gy = compute_coord_p4(_mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)), vImgHfp4, padding_mode, align_corner); - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), - i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + auto x0 = _mm_cvtps_epi32(gx0); + auto x1 = _mm_cvtps_epi32(gx1); + auto x2 = _mm_cvtps_epi32(gx2); + auto x3 = _mm_cvtps_epi32(gx3); - _mm256_storeu_ps(outptr, _v); + auto y = _mm_cvtps_epi32(gy); - outptr += 
elempack; - } + auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x0)); + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x1)); + auto x2_in_range = _mm_and_si128(_mm_cmpgt_epi32(x2, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x2)); + auto x3_in_range = _mm_and_si128(_mm_cmpgt_epi32(x3, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x3)); + + auto y_in_range = _mm_and_si128(_mm_cmpgt_epi32(y, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y)); + + auto v0_in_range = _mm_and_si128(x0_in_range, y_in_range); + auto v1_in_range = _mm_and_si128(x1_in_range, y_in_range); + auto v2_in_range = _mm_and_si128(x2_in_range, y_in_range); + auto v3_in_range = _mm_and_si128(x3_in_range, y_in_range); + + auto x0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx0), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto x1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx1), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto x2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx2), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto x3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx3), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + + auto x0_offset = _mm_cvtps_epi32(x0_offset_f); + auto x1_offset = _mm_cvtps_epi32(x1_offset_f); + auto x2_offset = _mm_cvtps_epi32(x2_offset_f); + auto x3_offset = _mm_cvtps_epi32(x3_offset_f); + + auto x0_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x0_offset, *reinterpret_cast<__m128*>(&v0_in_range), sizeof(float)); + auto x1_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x1_offset, *reinterpret_cast<__m128*>(&v1_in_range), sizeof(float)); + auto x2_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x2_offset, *reinterpret_cast<__m128*>(&v2_in_range), sizeof(float)); + auto x3_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x3_offset, *reinterpret_cast<__m128*>(&v3_in_range), sizeof(float)); + + coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); } + + auto _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm_storeu_ps(outptr, _v); + + outptr += elempack; } } } - else + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int y = 0; y < outH; y++) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int z = 0; z < outD; z++) + for (int x = 0; x < outW; x++) { - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + gx = grid_sample_unormalize_p4(vImgWfp4, gx, align_corner); + gy = grid_sample_unormalize_p4(vImgHfp4, gy, align_corner); - gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); - gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); - gz = get_coord_p8(gz, vImgDf, padding_mode, 
align_corner); + auto gx_floor = _mm_floor_ps(gx); + auto gy_floor = _mm_floor_ps(gy); - gx = _mm256_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - gy = _mm256_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - gz = _mm256_round_ps(gz, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + const auto tx = _mm_sub_ps(gx, gx_floor); + const auto ty = _mm_sub_ps(gy, gy_floor); - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); - auto iz = _mm256_cvtps_epi32(gz); + __m128 coefficients[4]; - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz) - , _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + for (int i = 0; i < 4; i++) + { + auto gx0 = compute_coord_p4(_mm_add_ps(gy_floor, vn1fp4), vImgWfp4, padding_mode, align_corner); + auto gx1 = compute_coord_p4(gy_floor, vImgWfp4, padding_mode, align_corner); + auto gx2 = compute_coord_p4(_mm_add_ps(gy_floor, v1fp4), vImgWfp4, padding_mode, align_corner); + auto gx3 = compute_coord_p4(_mm_add_ps(gy_floor, _mm_set1_ps(2.0f)), vImgWfp4, padding_mode, align_corner); + + gy = compute_coord_p4(_mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)), vImgHfp4, padding_mode, align_corner); + + auto x0 = _mm_cvtps_epi32(gx0); + auto x1 = _mm_cvtps_epi32(gx1); + auto x2 = _mm_cvtps_epi32(gx2); + auto x3 = _mm_cvtps_epi32(gx3); + + auto y = _mm_cvtps_epi32(gy); + + auto x0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx0), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto x1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx1), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto x2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx2), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto x3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx3), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + + auto x0_offset = _mm_cvtps_epi32(x0_offset_f); + auto x1_offset = _mm_cvtps_epi32(x1_offset_f); + auto x2_offset = _mm_cvtps_epi32(x2_offset_f); + auto x3_offset = _mm_cvtps_epi32(x3_offset_f); + + auto x0_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x0_offset, vn1fp4, sizeof(float)); + auto x1_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x1_offset, vn1fp4, sizeof(float)); + auto x2_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x2_offset, vn1fp4, sizeof(float)); + auto x3_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x3_offset, vn1fp4, sizeof(float)); + + coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); + } - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), - i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + auto _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - _mm256_storeu_ps(outptr, _v); + _mm_storeu_ps(outptr, _v); - outptr += elempack; - } - } + outptr += elempack; } } } } - - if (resize_type == 3) - { - NCNN_LOGE("unsupported bicubic when dims == 4"); - return -1; - } } } -#endif // __AVX__ - - const auto vImgWfp4 = _mm_set1_ps(w); - const auto vImgHfp4 = _mm_set1_ps(h); - const auto vImgWip4 = _mm_set1_epi32(w); - const auto vImgHip4 = _mm_set1_epi32(h); - - if (elempack == 4) + if (dims == 4) { - const auto vElemsizei = _mm_set1_epi32(elemsize / 8); - const auto vElempacki = _mm_set1_epi32(elempack); - const auto vElempackf = 
_mm_set1_ps(elempack); + const int outW = grid.h; + const int outH = grid.d; + const int outD = grid.c * grid.elempack; - if (dims == 3) - { - const auto outW = grid.h; - const auto outH = grid.c * grid.elempack; + top_blob.create(outW, outH, outD, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - top_blob.create(outW, outH, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + const auto vImgDfp4 = _mm_set1_ps(d); + const auto vImgDip4 = _mm_set1_epi32(d); - if (resize_type == 1) //zeros + if (resize_type == 1) + { + if (padding_mode == 1) { - if (padding_mode == 1) //zeros + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { -#pragma parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int z = 0; z < outD; z++) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); for (int y = 0; y < outH; y++) { for (int x = 0; x < outW; x++) { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); auto gy = _mm_set1_ps(gridptr[grid.elempack]); + auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); + gz = get_coord_p4(gz, vImgDfp4, padding_mode, align_corner); auto x_w = _mm_floor_ps(gx); auto y_n = _mm_floor_ps(gy); + auto z_t = _mm_floor_ps(gz); auto w = _mm_sub_ps(gx, x_w); auto e = _mm_sub_ps(v1fp4, w); auto n = _mm_sub_ps(gy, y_n); auto s = _mm_sub_ps(v1fp4, n); + auto t = _mm_sub_ps(gz, z_t); + auto b = _mm_sub_ps(v1fp4, t); - auto nw = _mm_mul_ps(s, e); - auto ne = _mm_mul_ps(s, w); - auto sw = _mm_mul_ps(n, e); - auto se = _mm_mul_ps(n, w); + __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + auto nw = _mm_mul_ps(s, e); + auto ne = _mm_mul_ps(s, w); + auto sw = _mm_mul_ps(n, e); + auto se = _mm_mul_ps(n, w); + + tnw = _mm_mul_ps(b, nw); + tne = _mm_mul_ps(b, ne); + tsw = _mm_mul_ps(b, sw); + tse = _mm_mul_ps(b, se); + + bnw = _mm_mul_ps(t, nw); + bne = _mm_mul_ps(t, ne); + bsw = _mm_mul_ps(t, sw); + bse = _mm_mul_ps(t, se); + } auto x0 = _mm_cvtps_epi32(x_w); auto x1 = _mm_add_epi32(x0, v1ip4); auto y0 = _mm_cvtps_epi32(y_n); auto y1 = _mm_add_epi32(y0, v1ip4); + auto z0 = _mm_cvtps_epi32(z_t); + auto z1 = _mm_add_epi32(z0, v1ip4); auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x0)); auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x1)); auto y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y0)); auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y1)); + auto z0_in_range = _mm_and_si128(_mm_cmpgt_epi32(z0, vn1ip4), _mm_cmpgt_epi32(vImgDip4, z0)); + auto z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDip4, z1)); - auto v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); - auto v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); - auto v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); - auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - // (W*y + x) * elempack + vec(8) - auto 
i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWip4), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); - auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); - auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); - - auto nw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_nw_offset, *reinterpret_cast<__m128*>(&v00_in_range), sizeof(float)); - auto ne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m128*>(&v10_in_range), sizeof(float)); - auto sw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m128*>(&v01_in_range), sizeof(float)); - auto se_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range), sizeof(float)); + __m128i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + auto v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); + auto v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); + auto v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); + auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + v000_in_range = _mm_and_si128(v00_in_range, z0_in_range); + v010_in_range = _mm_and_si128(v01_in_range, z0_in_range); + v100_in_range = _mm_and_si128(v10_in_range, z0_in_range); + v110_in_range = _mm_and_si128(v11_in_range, z0_in_range); + + v001_in_range = _mm_and_si128(v00_in_range, z1_in_range); + v011_in_range = _mm_and_si128(v01_in_range, z1_in_range); + v101_in_range = _mm_and_si128(v10_in_range, z1_in_range); + v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); + } - auto _v = _mm_mul_ps(nw_val, nw); - _v = _mm_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm_comp_fmadd_ps(se_val, se, _v); + // (W*H*z + W*y + x) * elempack + vec(8) + auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWip4), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); + auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); + auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); + + auto i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), vElempacki), i_tnw_offset); + auto i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); + auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); + auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); + + auto tnw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tnw_offset, *reinterpret_cast<__m128*>(&v000_in_range), sizeof(float)); + auto tne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tne_offset, *reinterpret_cast<__m128*>(&v100_in_range), sizeof(float)); + auto tsw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tsw_offset, *reinterpret_cast<__m128*>(&v010_in_range), sizeof(float)); + auto tse_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range), sizeof(float)); + + auto bnw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bnw_offset, *reinterpret_cast<__m128*>(&v001_in_range), sizeof(float)); + auto bne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range), sizeof(float)); + auto 
bsw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range), sizeof(float)); + auto bse_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range), sizeof(float)); + + auto _v = _mm_mul_ps(tnw_val, tnw); + _v = _mm_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm_comp_fmadd_ps(bse_val, bse, _v); _mm_storeu_ps(outptr, _v); @@ -1092,62 +1529,111 @@ namespace ncnn { } } } - else //border reflection + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int z = 0; z < outD; z++) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); for (int y = 0; y < outH; y++) { for (int x = 0; x < outW; x++) { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); auto gy = _mm_set1_ps(gridptr[grid.elempack]); + auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); + gz = get_coord_p4(gz, vImgDfp4, padding_mode, align_corner); auto x_w = _mm_floor_ps(gx); auto y_n = _mm_floor_ps(gy); + auto z_t = _mm_floor_ps(gz); auto w = _mm_sub_ps(gx, x_w); auto e = _mm_sub_ps(v1fp4, w); auto n = _mm_sub_ps(gy, y_n); auto s = _mm_sub_ps(v1fp4, n); + auto t = _mm_sub_ps(gz, z_t); + auto b = _mm_sub_ps(v1fp4, t); - auto nw = _mm_mul_ps(s, e); - auto ne = _mm_mul_ps(s, w); - auto sw = _mm_mul_ps(n, e); - auto se = _mm_mul_ps(n, w); + __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + auto nw = _mm_mul_ps(s, e); + auto ne = _mm_mul_ps(s, w); + auto sw = _mm_mul_ps(n, e); + auto se = _mm_mul_ps(n, w); + + tnw = _mm_mul_ps(b, nw); + tne = _mm_mul_ps(b, ne); + tsw = _mm_mul_ps(b, sw); + tse = _mm_mul_ps(b, se); + + bnw = _mm_mul_ps(t, nw); + bne = _mm_mul_ps(t, ne); + bsw = _mm_mul_ps(t, sw); + bse = _mm_mul_ps(t, se); + } auto x0 = _mm_cvtps_epi32(x_w); auto x1 = _mm_add_epi32(x0, v1ip4); auto y0 = _mm_cvtps_epi32(y_n); auto y1 = _mm_add_epi32(y0, v1ip4); + auto z0 = _mm_cvtps_epi32(z_t); + auto z1 = _mm_add_epi32(z0, v1ip4); auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x1)); auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y1)); + auto z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDip4, z1)); - auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + __m128i v110_in_range, v011_in_range, v101_in_range, v111_in_range; + { + auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWip4), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); - auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); - auto i_se_offset = _mm_add_epi32(i_sw_offset, 
vElempacki); + v110_in_range = _mm_and_si128(x1_in_range, y1_in_range); - auto nw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_nw_offset, vn1fp4, sizeof(float)); - auto ne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m128*>(&x1_in_range), sizeof(float)); - auto sw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range), sizeof(float)); - auto se_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range), sizeof(float)); + v011_in_range = _mm_and_si128(y1_in_range, z1_in_range); + v101_in_range = _mm_and_si128(x1_in_range, z1_in_range); + v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); + } - auto _v = _mm_mul_ps(nw_val, nw); - _v = _mm_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm_comp_fmadd_ps(se_val, se, _v); + // (W*H*z + W*y + x) * elempack + vec(8) + auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWip4), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); + auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); + auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); + + auto i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), vElempacki), i_tnw_offset); + auto i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); + auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); + auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); + + auto tnw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tnw_offset, vn1fp4, sizeof(float)); + auto tne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tne_offset, *reinterpret_cast<__m128*>(&x1_in_range), sizeof(float)); + auto tsw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range), sizeof(float)); + auto tse_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range), sizeof(float)); + + auto bnw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range), sizeof(float)); + auto bne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range), sizeof(float)); + auto bsw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range), sizeof(float)); + auto bse_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range), sizeof(float)); + + auto _v = _mm_mul_ps(tnw_val, tnw); + _v = _mm_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm_comp_fmadd_ps(bse_val, bse, _v); _mm_storeu_ps(outptr, _v); @@ -1156,42 +1642,50 @@ namespace ncnn { } } } - } + } - if (resize_type == 2) + if (resize_type == 2) + { + if (padding_mode == 1) { - if (padding_mode == 1) //zeros + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + const 
float* ptr = static_cast(bottom_blob.channel(q).data); + for (int z = 0; z < outD; z++) { - float* outptr = top_blob.channel(q); for (int y = 0; y < outH; y++) { for (int x = 0; x < outW; x++) { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); auto gy = _mm_set1_ps(gridptr[grid.elempack]); + auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); + gz = get_coord_p4(gz, vImgDfp4, padding_mode, align_corner); gx = _mm_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); gy = _mm_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gz = _mm_round_ps(gz, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); auto ix = _mm_cvtps_epi32(gx); auto iy = _mm_cvtps_epi32(gy); + auto iz = _mm_cvtps_epi32(gz); auto v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWip4, ix)), - _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHip4, iy))); + _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHip4, iy))); + v_in_range = _mm_and_si128(v_in_range, _mm_and_si128(_mm_cmpgt_epi32(iz, vn1ip4), _mm_cmpgt_epi32(vImgDip4, iz))); - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWip4), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWip4), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); auto _v = _mm_mask_i32gather_ps(_mm_setzero_ps(), static_cast(bottom_blob.channel(q).data), - i_offset, *reinterpret_cast<__m128*>(&v_in_range), sizeof(float)); + i_offset, *reinterpret_cast<__m128*>(&v_in_range), sizeof(float)); _mm_storeu_ps(outptr, _v); @@ -1200,196 +1694,42 @@ namespace ncnn { } } } - else //border reflection + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); + for (int z = 0; z < outD; z++) { - float* outptr = top_blob.channel(q); for (int y = 0; y < outH; y++) { for (int x = 0; x < outW; x++) { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); + gz = get_coord_p4(gz, vImgDfp4, padding_mode, align_corner); gx = _mm_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); gy = _mm_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + gz = _mm_round_ps(gz, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); auto ix = _mm_cvtps_epi32(gx); auto iy = _mm_cvtps_epi32(gy); + auto iz = _mm_cvtps_epi32(gz); - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWip4), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); 
+ auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWip4), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); auto _v = _mm_mask_i32gather_ps(_mm_setzero_ps(), static_cast(bottom_blob.channel(q).data), - i_offset, _mm_set1_ps(-1.0f), sizeof(float)); - - _mm_storeu_ps(outptr, _v); - - outptr += elempack; - } - } - } - } - } - - if (resize_type == 3) - { - if (padding_mode == 1) - { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - - gx = grid_sample_unormalize_p4(vImgWfp4, gx, align_corner); - gy = grid_sample_unormalize_p4(vImgHfp4, gy, align_corner); - - auto gx_floor = _mm_floor_ps(gx); - auto gy_floor = _mm_floor_ps(gy); - - const auto tx = _mm_sub_ps(gx, gx_floor); - const auto ty = _mm_sub_ps(gy, gy_floor); - - __m128 coefficients[4]; - - for (int i = 0; i < 4; i++) - { - auto gx0 = compute_coord_p4(_mm_add_ps(gx_floor, vn1fp4), vImgWfp4, padding_mode, align_corner); - auto gx1 = compute_coord_p4(gx_floor, vImgWfp4, padding_mode, align_corner); - auto gx2 = compute_coord_p4(_mm_add_ps(gx_floor, v1fp4), vImgWfp4, padding_mode, align_corner); - auto gx3 = compute_coord_p4(_mm_add_ps(gx_floor, _mm_set1_ps(2.0f)), vImgWfp4, padding_mode, align_corner); - - gy = compute_coord_p4(_mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)), vImgHfp4, padding_mode, align_corner); - - auto x0 = _mm_cvtps_epi32(gx0); - auto x1 = _mm_cvtps_epi32(gx1); - auto x2 = _mm_cvtps_epi32(gx2); - auto x3 = _mm_cvtps_epi32(gx3); - - auto y = _mm_cvtps_epi32(gy); - - auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x0)); - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x1)); - auto x2_in_range = _mm_and_si128(_mm_cmpgt_epi32(x2, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x2)); - auto x3_in_range = _mm_and_si128(_mm_cmpgt_epi32(x3, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x3)); - - auto y_in_range = _mm_and_si128(_mm_cmpgt_epi32(y, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y)); - - auto v0_in_range = _mm_and_si128(x0_in_range, y_in_range); - auto v1_in_range = _mm_and_si128(x1_in_range, y_in_range); - auto v2_in_range = _mm_and_si128(x2_in_range, y_in_range); - auto v3_in_range = _mm_and_si128(x3_in_range, y_in_range); - - auto x0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto x1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto x2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto x3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - - auto x0_offset = _mm_cvtps_epi32(x0_offset_f); - auto x1_offset = _mm_cvtps_epi32(x1_offset_f); - auto x2_offset = _mm_cvtps_epi32(x2_offset_f); - auto x3_offset = _mm_cvtps_epi32(x3_offset_f); - - auto x0_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x0_offset, *reinterpret_cast<__m128*>(&v0_in_range), 
sizeof(float)); - auto x1_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x1_offset, *reinterpret_cast<__m128*>(&v1_in_range), sizeof(float)); - auto x2_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x2_offset, *reinterpret_cast<__m128*>(&v2_in_range), sizeof(float)); - auto x3_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x3_offset, *reinterpret_cast<__m128*>(&v3_in_range), sizeof(float)); - - coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); - } - - auto _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm_storeu_ps(outptr, _v); - - outptr += elempack; - } - } - } - } - else - { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - - gx = grid_sample_unormalize_p4(vImgWfp4, gx, align_corner); - gy = grid_sample_unormalize_p4(vImgHfp4, gy, align_corner); - - auto gx_floor = _mm_floor_ps(gx); - auto gy_floor = _mm_floor_ps(gy); - - const auto tx = _mm_sub_ps(gx, gx_floor); - const auto ty = _mm_sub_ps(gy, gy_floor); - - __m128 coefficients[4]; - - for (int i = 0; i < 4; i++) - { - auto gx0 = compute_coord_p4(_mm_add_ps(gy_floor, vn1fp4), vImgWfp4, padding_mode, align_corner); - auto gx1 = compute_coord_p4(gy_floor, vImgWfp4, padding_mode, align_corner); - auto gx2 = compute_coord_p4(_mm_add_ps(gy_floor, v1fp4), vImgWfp4, padding_mode, align_corner); - auto gx3 = compute_coord_p4(_mm_add_ps(gy_floor, _mm_set1_ps(2.0f)), vImgWfp4, padding_mode, align_corner); - - gy = compute_coord_p4(_mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)), vImgHfp4, padding_mode, align_corner); - - auto x0 = _mm_cvtps_epi32(gx0); - auto x1 = _mm_cvtps_epi32(gx1); - auto x2 = _mm_cvtps_epi32(gx2); - auto x3 = _mm_cvtps_epi32(gx3); - - auto y = _mm_cvtps_epi32(gy); - - auto x0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto x1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto x2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto x3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - - auto x0_offset = _mm_cvtps_epi32(x0_offset_f); - auto x1_offset = _mm_cvtps_epi32(x1_offset_f); - auto x2_offset = _mm_cvtps_epi32(x2_offset_f); - auto x3_offset = _mm_cvtps_epi32(x3_offset_f); - - auto x0_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x0_offset, vn1fp4, sizeof(float)); - auto x1_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x1_offset, vn1fp4, sizeof(float)); - auto x2_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x2_offset, vn1fp4, sizeof(float)); - auto x3_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x3_offset, vn1fp4, sizeof(float)); - - coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); - } - - auto _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + i_offset, _mm_set1_ps(-1.0f), sizeof(float)); _mm_storeu_ps(outptr, _v); @@ -1401,375 
+1741,22 @@ namespace ncnn { } } - if (dims == 4) + if (resize_type == 3) { - const int outW = grid.h; - const int outH = grid.d; - const int outD = grid.c * grid.elempack; - - top_blob.create(outW, outH, outD, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - const auto vImgDfp4 = _mm_set1_ps(d); - const auto vImgDip4 = _mm_set1_epi32(d); - - if (resize_type == 1) - { - if (padding_mode == 1) - { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int z = 0; z < outD; z++) - { - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - - gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); - gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); - gz = get_coord_p4(gz, vImgDfp4, padding_mode, align_corner); - - auto x_w = _mm_floor_ps(gx); - auto y_n = _mm_floor_ps(gy); - auto z_t = _mm_floor_ps(gz); - - auto w = _mm_sub_ps(gx, x_w); - auto e = _mm_sub_ps(v1fp4, w); - auto n = _mm_sub_ps(gy, y_n); - auto s = _mm_sub_ps(v1fp4, n); - auto t = _mm_sub_ps(gz, z_t); - auto b = _mm_sub_ps(v1fp4, t); - - __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - auto nw = _mm_mul_ps(s, e); - auto ne = _mm_mul_ps(s, w); - auto sw = _mm_mul_ps(n, e); - auto se = _mm_mul_ps(n, w); - - tnw = _mm_mul_ps(b, nw); - tne = _mm_mul_ps(b, ne); - tsw = _mm_mul_ps(b, sw); - tse = _mm_mul_ps(b, se); - - bnw = _mm_mul_ps(t, nw); - bne = _mm_mul_ps(t, ne); - bsw = _mm_mul_ps(t, sw); - bse = _mm_mul_ps(t, se); - } - - auto x0 = _mm_cvtps_epi32(x_w); - auto x1 = _mm_add_epi32(x0, v1ip4); - auto y0 = _mm_cvtps_epi32(y_n); - auto y1 = _mm_add_epi32(y0, v1ip4); - auto z0 = _mm_cvtps_epi32(z_t); - auto z1 = _mm_add_epi32(z0, v1ip4); - - auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x0)); - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x1)); - auto y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y0)); - auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y1)); - auto z0_in_range = _mm_and_si128(_mm_cmpgt_epi32(z0, vn1ip4), _mm_cmpgt_epi32(vImgDip4, z0)); - auto z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDip4, z1)); - - __m128i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - auto v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); - auto v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); - auto v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); - auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - v000_in_range = _mm_and_si128(v00_in_range, z0_in_range); - v010_in_range = _mm_and_si128(v01_in_range, z0_in_range); - v100_in_range = _mm_and_si128(v10_in_range, z0_in_range); - v110_in_range = _mm_and_si128(v11_in_range, z0_in_range); - - v001_in_range = _mm_and_si128(v00_in_range, z1_in_range); - v011_in_range = _mm_and_si128(v01_in_range, z1_in_range); - v101_in_range = _mm_and_si128(v10_in_range, z1_in_range); - v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); - } - - // 
(W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), z0) - , _mm_add_epi32(_mm_mullo_epi32(y0, vImgWip4), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); - auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); - auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); - auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); - - auto i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), vElempacki), i_tnw_offset); - auto i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); - auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); - auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); - - auto tnw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tnw_offset, *reinterpret_cast<__m128*>(&v000_in_range), sizeof(float)); - auto tne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tne_offset, *reinterpret_cast<__m128*>(&v100_in_range), sizeof(float)); - auto tsw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tsw_offset, *reinterpret_cast<__m128*>(&v010_in_range), sizeof(float)); - auto tse_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range), sizeof(float)); - - auto bnw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bnw_offset, *reinterpret_cast<__m128*>(&v001_in_range), sizeof(float)); - auto bne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range), sizeof(float)); - auto bsw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range), sizeof(float)); - auto bse_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range), sizeof(float)); - - auto _v = _mm_mul_ps(tnw_val, tnw); - _v = _mm_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm_comp_fmadd_ps(bse_val, bse, _v); - - _mm_storeu_ps(outptr, _v); - - outptr += elempack; - } - } - } - } - } - else - { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int z = 0; z < outD; z++) - { - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - - gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); - gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); - gz = get_coord_p4(gz, vImgDfp4, padding_mode, align_corner); - - auto x_w = _mm_floor_ps(gx); - auto y_n = _mm_floor_ps(gy); - auto z_t = _mm_floor_ps(gz); - - auto w = _mm_sub_ps(gx, x_w); - auto e = _mm_sub_ps(v1fp4, w); - auto n = _mm_sub_ps(gy, y_n); - auto s = _mm_sub_ps(v1fp4, n); - auto t = _mm_sub_ps(gz, z_t); - auto b = _mm_sub_ps(v1fp4, t); - - __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - auto nw = _mm_mul_ps(s, e); - auto ne = _mm_mul_ps(s, w); - auto sw = _mm_mul_ps(n, e); - auto se = _mm_mul_ps(n, w); - - tnw = 
_mm_mul_ps(b, nw); - tne = _mm_mul_ps(b, ne); - tsw = _mm_mul_ps(b, sw); - tse = _mm_mul_ps(b, se); - - bnw = _mm_mul_ps(t, nw); - bne = _mm_mul_ps(t, ne); - bsw = _mm_mul_ps(t, sw); - bse = _mm_mul_ps(t, se); - } - - auto x0 = _mm_cvtps_epi32(x_w); - auto x1 = _mm_add_epi32(x0, v1ip4); - auto y0 = _mm_cvtps_epi32(y_n); - auto y1 = _mm_add_epi32(y0, v1ip4); - auto z0 = _mm_cvtps_epi32(z_t); - auto z1 = _mm_add_epi32(z0, v1ip4); - - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x1)); - auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y1)); - auto z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDip4, z1)); - - __m128i v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - v110_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - v011_in_range = _mm_and_si128(y1_in_range, z1_in_range); - v101_in_range = _mm_and_si128(x1_in_range, z1_in_range); - v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); - } - - // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), z0) - , _mm_add_epi32(_mm_mullo_epi32(y0, vImgWip4), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); - auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); - auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); - auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); - - auto i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), vElempacki), i_tnw_offset); - auto i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); - auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); - auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); - - auto tnw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tnw_offset, vn1fp4, sizeof(float)); - auto tne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tne_offset, *reinterpret_cast<__m128*>(&x1_in_range), sizeof(float)); - auto tsw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range), sizeof(float)); - auto tse_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range), sizeof(float)); - - auto bnw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range), sizeof(float)); - auto bne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range), sizeof(float)); - auto bsw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range), sizeof(float)); - auto bse_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range), sizeof(float)); - - auto _v = _mm_mul_ps(tnw_val, tnw); - _v = _mm_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm_comp_fmadd_ps(bse_val, bse, _v); - - _mm_storeu_ps(outptr, _v); - - outptr += elempack; - } - } - } - } - } - } - - if (resize_type == 2) - { - if (padding_mode == 1) - { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { 
- float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int z = 0; z < outD; z++) - { - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - - gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); - gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); - gz = get_coord_p4(gz, vImgDfp4, padding_mode, align_corner); - - gx = _mm_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - gy = _mm_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - gz = _mm_round_ps(gz, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - - auto ix = _mm_cvtps_epi32(gx); - auto iy = _mm_cvtps_epi32(gy); - auto iz = _mm_cvtps_epi32(gz); - - auto v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWip4, ix)), - _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHip4, iy))); - v_in_range = _mm_and_si128(v_in_range, _mm_and_si128(_mm_cmpgt_epi32(iz, vn1ip4), _mm_cmpgt_epi32(vImgDip4, iz))); - - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), iz) - , _mm_add_epi32(_mm_mullo_epi32(iy, vImgWip4), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); - - auto _v = _mm_mask_i32gather_ps(_mm_setzero_ps(), static_cast(bottom_blob.channel(q).data), - i_offset, *reinterpret_cast<__m128*>(&v_in_range), sizeof(float)); - - _mm_storeu_ps(outptr, _v); - - outptr += elempack; - } - } - } - } - } - else - { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int z = 0; z < outD; z++) - { - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - - gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); - gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); - gz = get_coord_p4(gz, vImgDfp4, padding_mode, align_corner); - - gx = _mm_round_ps(gx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - gy = _mm_round_ps(gy, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - gz = _mm_round_ps(gz, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - - auto ix = _mm_cvtps_epi32(gx); - auto iy = _mm_cvtps_epi32(gy); - auto iz = _mm_cvtps_epi32(gz); - - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), iz) - , _mm_add_epi32(_mm_mullo_epi32(iy, vImgWip4), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); - - auto _v = _mm_mask_i32gather_ps(_mm_setzero_ps(), static_cast(bottom_blob.channel(q).data), - i_offset, _mm_set1_ps(-1.0f), sizeof(float)); - - _mm_storeu_ps(outptr, _v); - - outptr += elempack; - } - } - } - } - } - } - - if (resize_type == 3) - { - NCNN_LOGE("unsupported bicubic when dims == 4"); - return -1; - } + NCNN_LOGE("unsupported bicubic when dims == 4"); + return -1; } } + } #endif // __SSE2__ - if (elempack == 1) - { - return GridSample::forward(bottom_blobs, top_blobs, opt); - } - - return 0; + if (elempack == 1) + { + return 
GridSample::forward(bottom_blobs, top_blobs, opt); } + return 0; +} + } // namespace ncnn From f4e0e3cdc75a4570e1f3397d0cd69c433463d1f2 Mon Sep 17 00:00:00 2001 From: Yoh Date: Sun, 13 Nov 2022 18:19:05 +0800 Subject: [PATCH 021/127] gather function need avx2 --- src/CMakeLists.txt | 3 - src/layer/gridsample.cpp | 344 +----------------- src/layer/gridsample.h | 14 - src/layer/x86/gridsample_x86.cpp | 207 ++++++----- tools/pnnx/src/CMakeLists.txt | 3 - tools/pnnx/src/pass_ncnn/F_grid_sample.cpp | 26 -- .../pnnx/src/pass_ncnn/solve_batch_index.cpp | 4 - 7 files changed, 118 insertions(+), 483 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f25df3e19b9..bc04fa350ed 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -156,12 +156,9 @@ ncnn_add_layer(Deconvolution3D) ncnn_add_layer(DeconvolutionDepthWise3D) ncnn_add_layer(Einsum) ncnn_add_layer(DeformableConv2D) -<<<<<<< HEAD -======= ncnn_add_layer(GLU) ncnn_add_layer(Fold) ncnn_add_layer(Unfold) ->>>>>>> master ncnn_add_layer(GridSample) if(NCNN_VULKAN) diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index b93ab4e7bce..a5157cb4bb9 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -3,311 +3,20 @@ // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -<<<<<<< HEAD -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -======= // coord compliance with the License. You may obtain a copy of the License at // // https://opensource.org/licenses/BSD-3-Clause // // Unless required by applicable law or agreed to coord writing, software distributed ->>>>>>> master // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. #include "gridsample.h" -<<<<<<< HEAD -#include -#include - -namespace ncnn { -enum InterpolationMode -{ - Bilinear = 1, - Nearest = 2, - Bicubic = 3 -}; - -enum PaddingMode -{ - Zeros = 1, - Border = 2, - Reflection = 3 -}; - -static inline float clip_coordinates(float in, int64_t clip_limit) -{ - return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); -} - -static inline float reflect_coordinates(float in, int64_t twice_low, - int64_t twice_high) -{ - if (twice_low == twice_high) - { - return static_cast(0); - } - float min = static_cast(twice_low) / 2; - float span = static_cast(twice_high - twice_low) / 2; - in = std::fabs(in - min); - // `fmod` returns same sign as `in`, which is positive after the `fabs` above. 
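    // Reflection folds the coordinate back and forth across the valid span:
    // `extra` is the offset from `min` after removing whole spans, and the
    // parity of `flips` below decides whether that offset is kept as-is
    // (even number of flips) or mirrored to `span - extra` (odd).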
- float extra = std::fmod(in, span); - int flips = static_cast(std::floor(in / span)); - if (flips % 2 == 0) - { - return extra + min; - } - else - { - return span - extra + min; - } -} - -static inline float compute_coordinates(float coord, int64_t size, - PaddingMode padding_mode, - bool align_corners) -{ - if (padding_mode == PaddingMode::Border) - { - // clip coordinates to image borders - coord = clip_coordinates(coord, size); - } - else if (padding_mode == PaddingMode::Reflection) - { - // reflect coordinates by image borders - if (align_corners) - { - coord = reflect_coordinates(coord, 0, 2 * (size - 1)); - } - else - { - coord = reflect_coordinates(coord, -1, 2 * size - 1); - } - // clip coordinates to image borders - coord = clip_coordinates(coord, size); - } - return coord; -} - -static inline float grid_sampler_unnormalize(float coord, int64_t size, - bool align_corners) -{ - if (align_corners) - { - // unnormalize coord from [-1, 1] to [0, size - 1] - return ((coord + 1) / 2) * (size - 1); - } - else - { - // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] - return ((coord + 1) * size - 1) / 2; - } -} - -static inline float grid_sampler_compute_source_index( - float coord, - int64_t size, - PaddingMode padding_mode, - bool align_corners) -{ - coord = grid_sampler_unnormalize(coord, size, align_corners); - coord = compute_coordinates(coord, size, padding_mode, align_corners); - return coord; -} - -template -struct ApplyGridSample; - -template -struct ApplyGridSample -{ - const bool must_in_bound = padding != PaddingMode::Zeros; - inline std::tuple compute_interp_params_d3(float x, float y) const - { - auto x_w = std::floor(x); - auto y_n = std::floor(y); - - auto w = x - x_w; - auto e = 1.0f - w; - auto n = y - y_n; - auto s = 1.0f - n; - - auto nw = s * e; - auto ne = s * w; - auto sw = n * e; - auto se = n * w; - - return std::make_tuple(nw, ne, sw, se); - } - - inline int forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) - { - const int dims = input.dims; - const int w = input.w; - const int h = input.h; - const int outW = grid.h; - const int outH = grid.c; - const int channels = input.c; - - if (dims == 3) - { - output.create(outW, outH, input.c); - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - float* output_ptr = static_cast(output.channel(q).data); - - const Mat image = input.channel(q); - - //const float* gxy_ptr = static_cast(grid.data); - - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gxy_ptr = grid.channel(y).row(x); - auto gx = grid_sampler_compute_source_index(gxy_ptr[0], w, padding, align_corners); - auto gy = grid_sampler_compute_source_index(gxy_ptr[1], h, padding, align_corners); - - auto interp_params = compute_interp_params_d3(gx, gy); - - auto nw = std::get<0>(interp_params); - auto ne = std::get<1>(interp_params); - auto sw = std::get<2>(interp_params); - auto se = std::get<3>(interp_params); - - auto i_x = static_cast(std::floor(gx)); - auto i_y = static_cast(std::floor(gy)); - - float v = 0.0f; - if (must_in_bound) - { - //out of range, val is 0 https://github.com/pytorch/pytorch/blob/435e78e5237d9fb3e433fff6ce028569db937264/aten/src/ATen/native/cpu/GridSamplerKernel.cpp#L520 - auto nw_val = image.row(i_y)[i_x]; - auto ne_val = i_x + 1 < w ? image.row(i_y)[i_x + 1] : 0; - auto sw_val = i_y + 1 < h ? image.row(i_y + 1)[i_x] : 0; - auto se_val = ((i_x + 1 < w) & (i_y + 1 < h)) ? 
image.row(i_y + 1)[i_x + 1] : 0; - - v = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; - } - else //PaddingMode::Zeors - { - auto x0 = i_x; - auto x1 = i_x + 1; - auto y0 = i_y; - auto y1 = i_y + 1; - - auto x0_in_range = (x0 > -1) & (x0 < w); - auto x1_in_range = (x1 > -1) & (x1 < w); - auto y0_in_range = (y0 > -1) & (y0 < h); - auto y1_in_range = (y1 > -1) & (y1 < h); - - auto v00_in_range = x0_in_range & y0_in_range; - auto v01_in_range = x0_in_range & y1_in_range; - auto v10_in_range = x1_in_range & y0_in_range; - auto v11_in_range = x1_in_range & y1_in_range; - - auto nw_val = v00_in_range ? image.row(y0)[x0] : 0; - auto ne_val = v10_in_range ? image.row(y0)[x1] : 0; - auto sw_val = v01_in_range ? image.row(y1)[x0] : 0; - auto se_val = v11_in_range ? image.row(y1)[x1] : 0; - - v = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; - } - - *output_ptr = v; - - output_ptr++; - } - } - } - } - else if (dims == 4) - { - } - else - { - return -100; - } - } -}; - -template -struct ApplyGridSample -{ - const bool must_in_bound = padding != PaddingMode::Zeros; - inline void forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) - { - const int dims = input.dims; - const int w = input.w; - const int h = input.h; - const int outW = grid.h; - const int outH = grid.c; - const int channels = input.c; - - if (dims == 3) - { - output.create(outW, outH, input.c); - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - float* output_ptr = static_cast(output.channel(q).data); - - const Mat image = input.channel(q); - - //const float* gxy_ptr = static_cast(grid.data); - - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gxy_ptr = grid.channel(y).row(x); - auto gx = grid_sampler_compute_source_index(gxy_ptr[0], w, padding, align_corners); - auto gy = grid_sampler_compute_source_index(gxy_ptr[1], h, padding, align_corners); - - auto x_nearest = static_cast(std::round(gx)); - auto y_nearest = static_cast(std::round(gy)); - - float v = image.row(y_nearest)[x_nearest]; - if (!must_in_bound) - { - v = ((x_nearest < w) & (x_nearest > -1) & (y_nearest < h) & (y_nearest > -1)) ? 
v : 0; - } - - *output_ptr = v; - - output_ptr++; - } - } - } - } - else if (dims == 4) - { - } - else - { - } - } -}; - -template -struct ApplyGridSample -{ - inline void forward(const Mat& input, const Mat& grid, Mat& output, const Option& opt) - { - } -}; -======= - #include namespace ncnn { ->>>>>>> master + GridSample::GridSample() { @@ -317,11 +26,6 @@ GridSample::GridSample() int GridSample::load_param(const ParamDict& pd) { -<<<<<<< HEAD - mode = pd.get(0, 0); - padding_mode = pd.get(1, 0); - align_corners = pd.get(6, 0); -======= sample_type = pd.get(0, 1); padding_mode = pd.get(1, 1); align_corner = pd.get(2, 0); @@ -337,55 +41,10 @@ int GridSample::load_param(const ParamDict& pd) NCNN_LOGE("unsupported padding mode %d", padding_mode); return -1; } ->>>>>>> master return 0; } -<<<<<<< HEAD -int GridSample::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const -{ -#define HANDLE_PADDING(interp, padding, align_corners) \ - case padding: \ - { \ - ApplyGridSample func; \ - func.forward(bottom_blobs[0], bottom_blobs[1], top_blobs[0], opt); \ - break; \ - } - -#define HANDLE_INTERP(interp, align_corners) \ - case interp: \ - { \ - switch (static_cast(padding_mode)) \ - { \ - HANDLE_PADDING(interp, PaddingMode::Zeros, align_corners) \ - HANDLE_PADDING(interp, PaddingMode::Border, align_corners) \ - HANDLE_PADDING(interp, PaddingMode::Reflection, align_corners) \ - } \ - break; \ - } - - if (align_corners == true) - { - switch (static_cast(mode)) - { - HANDLE_INTERP(InterpolationMode::Bilinear, true); - HANDLE_INTERP(InterpolationMode::Nearest, true); - HANDLE_INTERP(InterpolationMode::Bicubic, true); - } - } - else - { - switch (static_cast(mode)) - { - HANDLE_INTERP(InterpolationMode::Bilinear, false); - HANDLE_INTERP(InterpolationMode::Nearest, false); - HANDLE_INTERP(InterpolationMode::Bicubic, false); - } - } -#undef HANDLE_PADDING -#undef HANDLE_INTERP -======= // Restore normalized location to acutal image location // When align_corners is true: // Normalized location (-1, -1) points to the top-left pixel. 
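// As a reference, the unnormalize step can be written in scalar form; this is
// only an illustrative sketch mirroring the grid_sampler_unnormalize helper
// removed above:
//
//   float unormalize(int size, float coord, bool align_corner)
//   {
//       return align_corner ? (coord + 1) / 2.f * (size - 1)   // [-1,1] -> [0, size-1]
//                           : ((coord + 1) * size - 1) / 2.f;  // [-1,1] -> [-0.5, size-0.5]
//   }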
@@ -785,7 +444,6 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& return -1; } } ->>>>>>> master return 0; } diff --git a/src/layer/gridsample.h b/src/layer/gridsample.h index bd44b8b0c19..0ea540eb4ba 100644 --- a/src/layer/gridsample.h +++ b/src/layer/gridsample.h @@ -26,15 +26,6 @@ class GridSample : public Layer virtual int load_param(const ParamDict& pd); -<<<<<<< HEAD - virtual int forward(const std::vector& bottom_blob, std::vector& top_blobs, const Option& opt) const; - -public: - // param - int mode; //1 bilinear 2 nearest 3 bicubic - int padding_mode; //1 zeros 2 border 3 reflection - bool align_corners; -======= virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; public: @@ -42,13 +33,8 @@ class GridSample : public Layer int sample_type; // 1=bilinear 2=nearest 3=bicubic int padding_mode; // 1=zeros 2=border 3=reflection int align_corner; ->>>>>>> master }; } // namespace ncnn -<<<<<<< HEAD -#endif // LAYER_UNARYOP_H -======= #endif // LAYER_GRIDSAMPLE_H ->>>>>>> master diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index c0fb9432028..132908c40eb 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -15,7 +15,7 @@ #include "gridsample_x86.h" #if __SSE2__ -#include +#include #include "sse_mathfun.h" #if __AVX__ #include @@ -149,9 +149,9 @@ static NCNN_FORCEINLINE __m128 border_coord_p4(const __m128& coord, const __m128 static NCNN_FORCEINLINE __m128 reflect_coord_p4(__m128 x, const __m128& high) { /* take the absolute value */ - x = _mm_and_ps(x, *(__m128*)_ps256_inv_sign_mask); + x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask); - __m128 reflect_v = _mm_and_ps(_mm_sub_ps(x, high), *(__m128*)_ps256_inv_sign_mask); + __m128 reflect_v = _mm_and_ps(_mm_sub_ps(x, high), *(__m128*)_ps_inv_sign_mask); x = _mm_sub_ps(high, reflect_v); return x; } @@ -170,7 +170,7 @@ static NCNN_FORCEINLINE __m128 compute_coord_p4(__m128 sx, const __m128& w, int } else { - __m128 v0p5f = *(__m128*)_ps256_0p5; + __m128 v0p5f = *(__m128*)_ps_0p5; sx = _mm_sub_ps(reflect_coord_p4(_mm_add_ps(sx, v0p5f), w), v0p5f); sx = border_coord_p4(sx, _mm_sub_ps(w, v1fp4)); } @@ -213,6 +213,30 @@ static NCNN_FORCEINLINE __m128 cubic_interp1d_p4(const __m128& x0_v, const __m12 return _v; } +static NCNN_FORCEINLINE __m128 mask_gather_ps(const float* ptr, __m128i offset, __m128 mask) +{ +#if __AVX__ + __m128 v = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, offset, mask, sizeof(float)); +#else + int offseti[4], maski[4]; + memcpy(offseti, &offset, 4 * sizeof(int)); + memcpy(maski, &mask, 4 * sizeof(int)); + + float data[4]; + for (int i = 0; i < 4; i++) + { + if (maski[i] & 0x01) + { + data[i] = *(ptr + offseti[i]); + } + } + + __m128 v = _mm_loadu_ps(data); +#endif // __AVX__ + + return v; +} + #endif // __SSE2__ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const @@ -231,13 +255,14 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&v00_in_range), sizeof(float)); - auto ne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m128*>(&v10_in_range), sizeof(float)); - auto sw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, 
i_sw_offset, *reinterpret_cast<__m128*>(&v01_in_range), sizeof(float)); - auto se_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range), sizeof(float)); + auto nw_val = mask_gather_ps(ptr, i_nw_offset, *reinterpret_cast<__m128*>(&v00_in_range)); + auto ne_val = mask_gather_ps(ptr, i_ne_offset, *reinterpret_cast<__m128*>(&v10_in_range)); + auto sw_val = mask_gather_ps(ptr, i_sw_offset, *reinterpret_cast<__m128*>(&v01_in_range)); + auto se_val = mask_gather_ps(ptr, i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); auto _v = _mm_mul_ps(nw_val, nw); _v = _mm_comp_fmadd_ps(ne_val, ne, _v); @@ -1086,7 +1112,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&x1_in_range), sizeof(float)); - auto sw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range), sizeof(float)); - auto se_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range), sizeof(float)); + auto nw_val = mask_gather_ps(ptr, i_nw_offset, vn1fp4); + auto ne_val = mask_gather_ps(ptr, i_ne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); + auto sw_val = mask_gather_ps(ptr, i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); + auto se_val = mask_gather_ps(ptr, i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); auto _v = _mm_mul_ps(nw_val, nw); _v = _mm_comp_fmadd_ps(ne_val, ne, _v); @@ -1150,11 +1176,11 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data), - i_offset, *reinterpret_cast<__m128*>(&v_in_range), sizeof(float)); + auto _v = mask_gather_ps(static_cast(bottom_blob.channel(q).data), + i_offset, *reinterpret_cast<__m128*>(&v_in_range)); _mm_storeu_ps(outptr, _v); @@ -1193,7 +1219,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data), - i_offset, _mm_set1_ps(-1.0f), sizeof(float)); + auto _v = mask_gather_ps(static_cast(bottom_blob.channel(q).data), + i_offset, _mm_set1_ps(-1.0f)); _mm_storeu_ps(outptr, _v); @@ -1229,11 +1255,11 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&v0_in_range), sizeof(float)); - auto x1_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x1_offset, *reinterpret_cast<__m128*>(&v1_in_range), sizeof(float)); - auto x2_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x2_offset, *reinterpret_cast<__m128*>(&v2_in_range), sizeof(float)); - auto x3_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, x3_offset, *reinterpret_cast<__m128*>(&v3_in_range), sizeof(float)); + auto x0_val = mask_gather_ps(ptr, x0_offset, *reinterpret_cast<__m128*>(&v0_in_range)); + auto x1_val = mask_gather_ps(ptr, x1_offset, *reinterpret_cast<__m128*>(&v1_in_range)); + auto x2_val = mask_gather_ps(ptr, x2_offset, *reinterpret_cast<__m128*>(&v2_in_range)); + auto x3_val = mask_gather_ps(ptr, x3_offset, *reinterpret_cast<__m128*>(&v3_in_range)); coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); } @@ -1318,7 +1344,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&v000_in_range), sizeof(float)); - auto tne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tne_offset, *reinterpret_cast<__m128*>(&v100_in_range), sizeof(float)); - auto tsw_val = 
_mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tsw_offset, *reinterpret_cast<__m128*>(&v010_in_range), sizeof(float)); - auto tse_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range), sizeof(float)); - - auto bnw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bnw_offset, *reinterpret_cast<__m128*>(&v001_in_range), sizeof(float)); - auto bne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range), sizeof(float)); - auto bsw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range), sizeof(float)); - auto bse_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range), sizeof(float)); + auto tnw_val = mask_gather_ps(ptr, i_tnw_offset, *reinterpret_cast<__m128*>(&v000_in_range)); + auto tne_val = mask_gather_ps(ptr, i_tne_offset, *reinterpret_cast<__m128*>(&v100_in_range)); + auto tsw_val = mask_gather_ps(ptr, i_tsw_offset, *reinterpret_cast<__m128*>(&v010_in_range)); + auto tse_val = mask_gather_ps(ptr, i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); + + auto bnw_val = mask_gather_ps(ptr, i_bnw_offset, *reinterpret_cast<__m128*>(&v001_in_range)); + auto bne_val = mask_gather_ps(ptr, i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); + auto bsw_val = mask_gather_ps(ptr, i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); + auto bse_val = mask_gather_ps(ptr, i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); auto _v = _mm_mul_ps(tnw_val, tnw); _v = _mm_comp_fmadd_ps(tne_val, tne, _v); @@ -1532,7 +1558,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&x1_in_range), sizeof(float)); - auto tsw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range), sizeof(float)); - auto tse_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range), sizeof(float)); - - auto bnw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range), sizeof(float)); - auto bne_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range), sizeof(float)); - auto bsw_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range), sizeof(float)); - auto bse_val = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range), sizeof(float)); + auto tnw_val = mask_gather_ps(ptr, i_tnw_offset, vn1fp4); + auto tne_val = mask_gather_ps(ptr, i_tne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); + auto tsw_val = mask_gather_ps(ptr, i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); + auto tse_val = mask_gather_ps(ptr, i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); + + auto bnw_val = mask_gather_ps(ptr, i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range)); + auto bne_val = mask_gather_ps(ptr, i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); + auto bsw_val = mask_gather_ps(ptr, i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); + auto bse_val = mask_gather_ps(ptr, i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); auto _v = _mm_mul_ps(tnw_val, tnw); _v = _mm_comp_fmadd_ps(tne_val, tne, _v); @@ -1645,11 +1671,11 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, 
std::vector(bottom_blob.channel(q).data), - i_offset, *reinterpret_cast<__m128*>(&v_in_range), sizeof(float)); + auto _v = mask_gather_ps(static_cast(bottom_blob.channel(q).data), + i_offset, *reinterpret_cast<__m128*>(&v_in_range)); _mm_storeu_ps(outptr, _v); @@ -1697,7 +1723,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data), - i_offset, _mm_set1_ps(-1.0f), sizeof(float)); + auto _v = mask_gather_ps(static_cast(bottom_blob.channel(q).data), + i_offset, _mm_set1_ps(-1.0f)); _mm_storeu_ps(outptr, _v); @@ -1741,12 +1767,13 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector>>>>>> master pass_ncnn/F_grid_sample.cpp pass_ncnn/F_group_norm.cpp pass_ncnn/F_hardsigmoid.cpp diff --git a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp index 46122ef4b39..85ce0b3ed88 100644 --- a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp +++ b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp @@ -33,17 +33,10 @@ class F_grid_sample : public GraphRewriterPass { return R"PNNXIR(7767517 4 3 -<<<<<<< HEAD -pnnx.Input input_0 0 1 input0 -pnnx.Input input_1 0 1 input1 -F.grid_sample op_0 2 1 input0 input1 out mode=%mode padding_mode=%padding_mode align_corners=%align_corners -pnnx.Output output 1 0 out -======= pnnx.Input input_0 0 1 input0 pnnx.Input input_1 0 1 input1 F.grid_sample op_0 2 1 input0 input1 out mode=%mode padding_mode=%padding_mode align_corners=%align_corners pnnx.Output output 1 0 out ->>>>>>> master )PNNXIR"; } @@ -54,21 +47,12 @@ pnnx.Output output 1 0 out const char* name_str() const { -<<<<<<< HEAD - return "grid_sample"; -======= return "gridsample"; ->>>>>>> master } void write(Operator* op, const std::map& captured_params) const { const std::string& mode = captured_params.at("mode").s; -<<<<<<< HEAD - const std::string& padding_mode = captured_params.at("padding_mode").s; - -======= ->>>>>>> master if (mode == "bilinear") op->params["0"] = 1; if (mode == "nearest") @@ -76,10 +60,7 @@ pnnx.Output output 1 0 out if (mode == "bicubic") op->params["0"] = 3; -<<<<<<< HEAD -======= const std::string& padding_mode = captured_params.at("padding_mode").s; ->>>>>>> master if (padding_mode == "zeros") op->params["1"] = 1; if (padding_mode == "border") @@ -87,19 +68,12 @@ pnnx.Output output 1 0 out if (padding_mode == "reflection") op->params["1"] = 3; -<<<<<<< HEAD - op->params["3"] = captured_params.at("align_corners").b ? 1 : 0; // align_corners -======= op->params["2"] = captured_params.at("align_corners").b ? 
1 : 0; ->>>>>>> master } }; REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_grid_sample, 20) -<<<<<<< HEAD -======= ->>>>>>> master } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp b/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp index 97f6321acad..73e8e08eb39 100644 --- a/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp +++ b/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp @@ -290,10 +290,6 @@ void solve_batch_index(Graph& graph) { if (op->type == std::string("F.grid_sample")) { -<<<<<<< HEAD - // grid_sample's grid input may be a 5d tensor :( -======= ->>>>>>> master op->inputs[1]->params["__batch_index"] = 0; } From 0d1e40327dded23c5c0b52514dd2973c8b3d2353 Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Sun, 13 Nov 2022 10:20:52 +0000 Subject: [PATCH 022/127] apply code-format changes --- src/layer/gridsample.cpp | 1 - src/layer/x86/gridsample_x86.cpp | 60 +++++++++--------- tools/pnnx/src/pass_ncnn/F_grid_sample.cpp | 72 +++++++++++----------- 3 files changed, 65 insertions(+), 68 deletions(-) diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index a5157cb4bb9..993f645029c 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -17,7 +17,6 @@ namespace ncnn { - GridSample::GridSample() { one_blob_only = false; diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 132908c40eb..4e103ca0370 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -1018,7 +1018,6 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data), - i_offset, *reinterpret_cast<__m128*>(&v_in_range)); + i_offset, *reinterpret_cast<__m128*>(&v_in_range)); _mm_storeu_ps(outptr, _v); @@ -1219,7 +1218,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data), - i_offset, _mm_set1_ps(-1.0f)); + i_offset, _mm_set1_ps(-1.0f)); _mm_storeu_ps(outptr, _v); @@ -1259,7 +1258,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&v100_in_range)); auto tsw_val = mask_gather_ps(ptr, i_tsw_offset, *reinterpret_cast<__m128*>(&v010_in_range)); auto tse_val = mask_gather_ps(ptr, i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); - + auto bnw_val = mask_gather_ps(ptr, i_bnw_offset, *reinterpret_cast<__m128*>(&v001_in_range)); auto bne_val = mask_gather_ps(ptr, i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); auto bsw_val = mask_gather_ps(ptr, i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); @@ -1558,7 +1557,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&x1_in_range)); auto tsw_val = mask_gather_ps(ptr, i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); auto tse_val = mask_gather_ps(ptr, i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); - + auto bnw_val = mask_gather_ps(ptr, i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range)); auto bne_val = mask_gather_ps(ptr, i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); auto bsw_val = mask_gather_ps(ptr, i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); @@ -1675,7 +1674,7 @@ int GridSample_x86::forward(const std::vector& 
bottom_blobs, std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data), - i_offset, *reinterpret_cast<__m128*>(&v_in_range)); + i_offset, *reinterpret_cast<__m128*>(&v_in_range)); _mm_storeu_ps(outptr, _v); @@ -1723,7 +1722,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data), - i_offset, _mm_set1_ps(-1.0f)); + i_offset, _mm_set1_ps(-1.0f)); _mm_storeu_ps(outptr, _v); @@ -1773,7 +1772,6 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector>>>>>> master + >>>>>>> master // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at @@ -17,13 +17,13 @@ // specific language governing permissions and limitations under the License. #include "pass_ncnn.h" -<<<<<<< HEAD + <<<<<<< HEAD #include -======= ->>>>>>> master - -namespace pnnx { + ======= + >>>>>>> master + namespace pnnx +{ namespace ncnn { class F_grid_sample : public GraphRewriterPass @@ -38,42 +38,42 @@ pnnx.Input input_1 0 1 input1 F.grid_sample op_0 2 1 input0 input1 out mode=%mode padding_mode=%padding_mode align_corners=%align_corners pnnx.Output output 1 0 out )PNNXIR"; - } + } - const char* type_str() const - { - return "GridSample"; - } + const char* type_str() const + { + return "GridSample"; + } - const char* name_str() const - { - return "gridsample"; - } + const char* name_str() const + { + return "gridsample"; + } - void write(Operator* op, const std::map& captured_params) const - { - const std::string& mode = captured_params.at("mode").s; - if (mode == "bilinear") - op->params["0"] = 1; - if (mode == "nearest") - op->params["0"] = 2; - if (mode == "bicubic") - op->params["0"] = 3; + void write(Operator* op, const std::map& captured_params) const + { + const std::string& mode = captured_params.at("mode").s; + if (mode == "bilinear") + op->params["0"] = 1; + if (mode == "nearest") + op->params["0"] = 2; + if (mode == "bicubic") + op->params["0"] = 3; - const std::string& padding_mode = captured_params.at("padding_mode").s; - if (padding_mode == "zeros") - op->params["1"] = 1; - if (padding_mode == "border") - op->params["1"] = 2; - if (padding_mode == "reflection") - op->params["1"] = 3; + const std::string& padding_mode = captured_params.at("padding_mode").s; + if (padding_mode == "zeros") + op->params["1"] = 1; + if (padding_mode == "border") + op->params["1"] = 2; + if (padding_mode == "reflection") + op->params["1"] = 3; - op->params["2"] = captured_params.at("align_corners").b ? 1 : 0; - } -}; + op->params["2"] = captured_params.at("align_corners").b ? 
1 : 0; + } + }; -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_grid_sample, 20) + REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_grid_sample, 20) -} // namespace ncnn + } // namespace ncnn } // namespace pnnx From 7c2ae50b98a2110dbe6d94c03045ef194ea430de Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Fri, 18 Nov 2022 21:37:43 +0800 Subject: [PATCH 023/127] solve round can't align and strongthen unit test --- src/layer/gridsample.cpp | 15 ++- src/layer/x86/gridsample_x86.cpp | 197 ++++++++++++++++--------------- src/layer/x86/x86_usability.h | 35 ++++++ tests/test_gridsample.cpp | 122 +++++++++---------- 4 files changed, 209 insertions(+), 160 deletions(-) diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index 993f645029c..32dce144c8e 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -236,8 +236,8 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& sample_x = grid_sample_unormalize(w, sample_x, align_corner); sample_y = grid_sample_unormalize(h, sample_y, align_corner); - int x0 = static_cast(round(sample_x)); - int y0 = static_cast(round(sample_y)); + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); float v = get_value_bounded(image, x0, y0, padding_mode, align_corner); @@ -418,13 +418,18 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& float sample_y = gridptr[1]; float sample_z = gridptr[2]; + if (padding_mode == 2) + { + int a = 10; + } + sample_x = grid_sample_unormalize(w, sample_x, align_corner); sample_y = grid_sample_unormalize(h, sample_y, align_corner); sample_z = grid_sample_unormalize(d, sample_z, align_corner); - int x0 = static_cast(round(sample_x)); - int y0 = static_cast(round(sample_y)); - int z0 = static_cast(round(sample_z)); + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); + int z0 = static_cast(floor(sample_z + 0.5f)); float v = get_value_bounded(image, x0, y0, z0, padding_mode, align_corner); diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 4e103ca0370..eb4065d79a5 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -36,9 +36,6 @@ GridSample_x86::GridSample_x86() #if __SSE2__ #if __AVX__ const __m256 v1fp8 = *(__m256*)_ps256_1; -const __m256 vn1fp8 = _mm256_set1_ps(-1.0f); -const __m256i v1ip8 = _mm256_set1_epi32(1); -const __m256i vn1ip8 = _mm256_set1_epi32(-1); static __m256 NCNN_FORCEINLINE grid_sample_unormalize_p8(const __m256& w, const __m256& coordx, int align_corner) @@ -48,7 +45,7 @@ grid_sample_unormalize_p8(const __m256& w, const __m256& coordx, int align_corne if (align_corner) return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coordx, v1fp8), two), _mm256_sub_ps(w, v1fp8)); else - return _mm256_div_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(coordx, v1fp8), w), v1fp8), two); + return _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(coordx, v1fp8), w, v1fp8), two); } static NCNN_FORCEINLINE __m256 border_coord_p8(const __m256& coord, const __m256& border) @@ -80,7 +77,7 @@ static NCNN_FORCEINLINE __m256 compute_coord_p8(__m256 sx, const __m256& w, int } else { - __m256 v0p5f = *(__m256*)_ps256_0p5; + __m256 v0p5f = _mm256_set1_ps(0.5f); sx = _mm256_sub_ps(reflect_coord_p8(_mm256_add_ps(sx, v0p5f), w), v0p5f); sx = border_coord_p8(sx, _mm256_sub_ps(w, v1fp8)); } @@ -107,13 +104,12 @@ static NCNN_FORCEINLINE __m256 cubic_interp1d_p8(const __m256& x0_v, const __m25 const auto x0 = _mm256_add_ps(tx, 
v1fp8); const auto& x1 = tx; const auto x2 = _mm256_sub_ps(v1fp8, tx); - const auto x3 = _mm256_add_ps(x2, v1fp8); + //const auto x3 = _mm256_add_ps(x2, v1fp8); - //should be optimized? :( const __m256 coeffs0 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x0), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(4), A)); const __m256 coeffs1 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x1), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x1), x1), v1fp8); const __m256 coeffs2 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x2), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x2), x2), v1fp8); - const __m256 coeffs3 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x3), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x3), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x3), _mm256_mul_ps(_mm256_set1_ps(4), A)); + const __m256 coeffs3 = _mm256_sub_ps(_mm256_sub_ps(_mm256_sub_ps(v1fp8, coeffs0), coeffs1), coeffs2); auto _v = _mm256_mul_ps(coeffs0, x0_v); _v = _mm256_comp_fmadd_ps(coeffs1, x1_v, _v); @@ -126,9 +122,6 @@ static NCNN_FORCEINLINE __m256 cubic_interp1d_p8(const __m256& x0_v, const __m25 #endif // __AVX__ const __m128 v1fp4 = _mm_set1_ps(1.0f); -const __m128 vn1fp4 = _mm_set1_ps(-1.0f); -const __m128i v1ip4 = _mm_set1_epi32(1); -const __m128i vn1ip4 = _mm_set1_epi32(-1); static __m128 NCNN_FORCEINLINE grid_sample_unormalize_p4(const __m128& w, const __m128& coordx, int align_corner) @@ -138,7 +131,7 @@ grid_sample_unormalize_p4(const __m128& w, const __m128& coordx, int align_corne if (align_corner) return _mm_mul_ps(_mm_div_ps(_mm_add_ps(coordx, v1fp4), two), _mm_sub_ps(w, v1fp4)); else - return _mm_div_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(coordx, v1fp4), w), v1fp4), two); + return _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(coordx, v1fp4), w, v1fp4), two); } static NCNN_FORCEINLINE __m128 border_coord_p4(const __m128& coord, const __m128& border) @@ -197,13 +190,12 @@ static NCNN_FORCEINLINE __m128 cubic_interp1d_p4(const __m128& x0_v, const __m12 const auto x0 = _mm_add_ps(tx, v1fp4); const auto& x1 = tx; const auto x2 = _mm_sub_ps(v1fp4, tx); - const auto x3 = _mm_add_ps(x2, v1fp4); + //const auto x3 = _mm_add_ps(x2, v1fp4); - //should be optimized? 
:( const __m128 coeffs0 = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(A, x0), _mm_mul_ps(_mm_set1_ps(5.0f), A)), x0), _mm_mul_ps(_mm_set1_ps(8.0f), A)), x0), _mm_mul_ps(_mm_set1_ps(4), A)); const __m128 coeffs1 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set1_ps(2.0f)), x1), _mm_add_ps(A, _mm_set1_ps(3.0f))), x1), x1), v1fp4); const __m128 coeffs2 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set1_ps(2.0f)), x2), _mm_add_ps(A, _mm_set1_ps(3.0f))), x2), x2), v1fp4); - const __m128 coeffs3 = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(A, x3), _mm_mul_ps(_mm_set1_ps(5.0f), A)), x3), _mm_mul_ps(_mm_set1_ps(8.0f), A)), x3), _mm_mul_ps(_mm_set1_ps(4), A)); + const __m128 coeffs3 = _mm_sub_ps(_mm_sub_ps(_mm_sub_ps(v1fp4, coeffs0), coeffs1), coeffs2); auto _v = _mm_mul_ps(coeffs0, x0_v); _v = _mm_comp_fmadd_ps(coeffs1, x1_v, _v); @@ -258,6 +250,10 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data), - i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); _mm256_storeu_ps(outptr, _v); @@ -458,7 +454,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data), - i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); _mm256_storeu_ps(outptr, _v); @@ -498,7 +497,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data), - i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); _mm256_storeu_ps(outptr, _v); @@ -962,7 +961,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data), - i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); _mm256_storeu_ps(outptr, _v); @@ -1018,6 +1021,10 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data), - i_offset, *reinterpret_cast<__m128*>(&v_in_range)); + i_offset, *reinterpret_cast<__m128*>(&v_in_range)); _mm_storeu_ps(outptr, _v); @@ -1218,7 +1225,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data), - i_offset, _mm_set1_ps(-1.0f)); + i_offset, _mm_set1_ps(-1.0f)); _mm_storeu_ps(outptr, _v); @@ -1258,7 +1265,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data), - i_offset, *reinterpret_cast<__m128*>(&v_in_range)); + i_offset, *reinterpret_cast<__m128*>(&v_in_range)); _mm_storeu_ps(outptr, _v); 
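                    // In this patch the scalar nearest path switches from std::round to
                    // floor(x + 0.5f), and the bicubic weights above exploit the fact that
                    // the four cubic-convolution coefficients sum to one, so coeffs3 can be
                    // derived instead of recomputed from x3. A scalar sketch of the same
                    // simplification (A is the bicubic constant, typically -0.75):
                    //   float c0 = ((A * (t + 1) - 5 * A) * (t + 1) + 8 * A) * (t + 1) - 4 * A;
                    //   float c1 = ((A + 2) * t - (A + 3)) * t * t + 1;
                    //   float c2 = ((A + 2) * (1 - t) - (A + 3)) * (1 - t) * (1 - t) + 1;
                    //   float c3 = 1.f - c0 - c1 - c2;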
@@ -1722,7 +1729,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data), - i_offset, _mm_set1_ps(-1.0f)); + i_offset, _mm_set1_ps(-1.0f)); _mm_storeu_ps(outptr, _v); diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h index 669cec0a738..57f2c1d36f9 100644 --- a/src/layer/x86/x86_usability.h +++ b/src/layer/x86/x86_usability.h @@ -163,6 +163,14 @@ static NCNN_FORCEINLINE __m128 _mm_comp_fnmadd_ps(const __m128& _a, const __m128 { return _mm_sub_ps(_c, _mm_mul_ps(_a, _b)); } +static NCNN_FORCEINLINE __m128 _mm_comp_fmsub_ps(const __m128& _a, const __m128& _b, const __m128& _c) +{ + return _mm_sub_ps(_mm_mul_ps(_a, _b), _c); +} +static NCNN_FORCEINLINE __m128 _mm_comp_fnmsub_ps(const __m128& _a, const __m128& _b, const __m128& _c) +{ + return _mm_sub_ps(_c, _mm_mul_ps(_mm_mul_ps(_a, _b), _mm_set1_ps(-1))); +} #else static NCNN_FORCEINLINE __m128 _mm_comp_fmadd_ps(const __m128& _a, const __m128& _b, const __m128& _c) { @@ -173,6 +181,14 @@ static NCNN_FORCEINLINE __m128 _mm_comp_fnmadd_ps(const __m128& _a, const __m128 // return -a * b + c return _mm_fnmadd_ps(_a, _b, _c); } +static NCNN_FORCEINLINE __m128 _mm_comp_fmsub_ps(const __m128& _a, const __m128& _b, const __m128& _c) +{ + return _mm_fmsub_ps(_a, _b, _c); +} +static NCNN_FORCEINLINE __m128 _mm_comp_fnmsub_ps(const __m128& _a, const __m128& _b, const __m128& _c) +{ + return _mm_fnmsub_ps(_a, _b, _c); +} #endif // !__FMA__ #if __AVX__ @@ -185,9 +201,18 @@ static NCNN_FORCEINLINE __m256 _mm256_comp_fnmadd_ps(const __m256& _a, const __m { return _mm256_sub_ps(_c, _mm256_mul_ps(_a, _b)); } +static NCNN_FORCEINLINE __m256 _mm256_comp_fmsub_ps(const __m256& _a, const __m256& _b, const __m256& _c) +{ + return _mm256_sub_ps(_mm256_mul_ps(_a, _b), _c); +} +static NCNN_FORCEINLINE __m256 _mm256_comp_fnmsub_ps(const __m256& _a, const __m256& _b, const __m256& _c) +{ + return _mm256_sub_ps(_c, _mm256_mul_ps(_mm256_mul_ps(_a, _b), _mm256_set1_ps(-1))); +} #else static NCNN_FORCEINLINE __m256 _mm256_comp_fmadd_ps(const __m256& _a, const __m256& _b, const __m256& _c) { + // return a * b + c return _mm256_fmadd_ps(_a, _b, _c); } static NCNN_FORCEINLINE __m256 _mm256_comp_fnmadd_ps(const __m256& _a, const __m256& _b, const __m256& _c) @@ -195,6 +220,16 @@ static NCNN_FORCEINLINE __m256 _mm256_comp_fnmadd_ps(const __m256& _a, const __m // return -a * b + c return _mm256_fnmadd_ps(_a, _b, _c); } +static NCNN_FORCEINLINE __m256 _mm256_comp_fmsub_ps(const __m256& _a, const __m256& _b, const __m256& _c) +{ + // return a * b - c + return _mm256_fmsub_ps(_a, _b, _c); +} +static NCNN_FORCEINLINE __m256 _mm256_comp_fnmsub_ps(const __m256& _a, const __m256& _b, const __m256& _c) +{ + // return -(a * b) - c + return _mm256_fnmsub_ps(_a, _b, _c); +} #endif static NCNN_FORCEINLINE __m256 _mm256_fmadd_1_ps(const __m256& a, const __m256& b, float c) diff --git a/tests/test_gridsample.cpp b/tests/test_gridsample.cpp index 70c96b30480..7763d3c6441 100644 --- a/tests/test_gridsample.cpp +++ b/tests/test_gridsample.cpp @@ -15,6 +15,8 @@ #include "layer/gridsample.h" #include "testutil.h" +#include + static int test_gridsample(const ncnn::Mat& a, const ncnn::Mat& grid, int sample_type, int padding_mode, int align_corner) { ncnn::ParamDict pd; @@ -42,81 +44,81 @@ static int test_gridsample(const ncnn::Mat& a, const ncnn::Mat& grid, int sample static int test_gridsample_0() { return 0 - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 
1, 1, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 1, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 2, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 2, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 3, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 3, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 1, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 1, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 2, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 2, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 3, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 3, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 1, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 1, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 2, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 2, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 3, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 3, 1); + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 1, 1, 0) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 1, 1, 1) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 1, 2, 0) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 1, 2, 1) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 1, 3, 0) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 1, 3, 1) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 2, 1, 0) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 2, 1, 1) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 2, 2, 0) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 2, 2, 1) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 2, 3, 0) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 2, 3, 1) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 3, 1, 0) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 3, 1, 1) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 3, 2, 0) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 3, 2, 1) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 3, 3, 0) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 3, 3, 1); } static int test_gridsample_1() { return 0 - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 1, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 1, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 2, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 2, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 3, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 3, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 1, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 1, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 2, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 2, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 3, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 3, 1) - 
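           // trailing arguments are (sample_type, padding_mode, align_corner):
           // sample_type 1=bilinear 2=nearest 3=bicubic, padding_mode 1=zeros 2=border 3=reflection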
|| test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 1, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 1, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 2, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 2, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 3, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 3, 1); + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 1, 1, 0) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 1, 1, 1) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 1, 2, 0) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 1, 2, 1) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 1, 3, 0) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 1, 3, 1) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 2, 1, 0) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 2, 1, 1) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 2, 2, 0) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 2, 2, 1) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 2, 3, 0) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 2, 3, 1) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 3, 1, 0) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 3, 1, 1) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 3, 2, 0) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 3, 2, 1) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 3, 3, 0) + || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 3, 3, 1); } static int test_gridsample_2() { return 0 - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 1, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 1, 1) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 2, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 2, 1) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 3, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 3, 1) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 1, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 1, 1) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 2, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 2, 1) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 3, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 3, 1); + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 2, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 2, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 3, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 3, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 
21, 10), 2, 2, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 2, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 3, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 3, 1); } static int test_gridsample_3() { return 0 - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 1, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 1, 1) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 2, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 2, 1) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 3, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 3, 1) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 1, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 1, 1) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 2, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 2, 1) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 3, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 3, 1); + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 2, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 2, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 3, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 3, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 2, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 2, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 3, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 3, 1); } int main() From f13130cb938e39bd3cd27ea826d126f4ed44f31e Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Mon, 21 Nov 2022 20:48:51 +0800 Subject: [PATCH 024/127] optimize code [WIP] --- src/layer/x86/gridsample_x86.cpp | 545 ++++++++++++++++++++++++++----- tests/test_grid_sample.cpp | 0 2 files changed, 471 insertions(+), 74 deletions(-) delete mode 100644 tests/test_grid_sample.cpp diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index eb4065d79a5..52084e9207c 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -36,6 +36,11 @@ GridSample_x86::GridSample_x86() #if __SSE2__ #if __AVX__ const __m256 v1fp8 = *(__m256*)_ps256_1; +const auto vn1fp8 = _mm256_set1_ps(-1.0f); +const auto v1ip8 = _mm256_set1_epi32(1); +const auto vn1ip8 = _mm256_set1_epi32(-1); + +#include "gridsample_bilinear_pack8.h" static __m256 NCNN_FORCEINLINE grid_sample_unormalize_p8(const __m256& w, const __m256& coordx, int align_corner) @@ -250,10 +255,6 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data); - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - //grid tensor has been packed 
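                        //  (y / elempack selects the packed grid channel and y % elempack the lane
                        //   within it, so one scalar (x, y) pair is fetched per output location
                        //   and then broadcast across the vector)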
- const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - - gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); - gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); - - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); + Mat dst = top_blob.channel(q); + const Mat image = bottom_blob.channel(q); - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); - - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - - auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - - auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - // (W*y + x) * elempack + vec(8) - auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); - - auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); - - auto _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(outptr, _v); - - outptr += elempack; - } - } + gridsample_bilinear_image_pack8(image, dst, grid, padding_mode, align_corner); } } else //border reflection @@ -463,8 +404,8 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(grid_p1.data); + + top_blob.create(grid.h, grid.c, channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) + return -1; + + if (sample_type == 1) + { + if (padding_mode == 1) + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + int j = 0; + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); +#if __AVX__ + for (; j + 7 < size; j += 8) + { + auto tmp_x = 
_mm256_loadu_ps(gridptr + j); + auto gy = _mm256_loadu_ps(gridptr + j + 8); + + auto gx = _mm256_shuffle_ps(tmp_x, gy, 0x10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0x11011101); + + gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); + gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + + auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + + auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, v1ip8); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, v1ip8); + + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(outptr, _v); + + outptr += 8; + } +#endif // __AVX__ + for (; j < size; j++) + { + + } + } + } + else + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + int j = 0; + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); +#if __AVX__ + for (; j + 7 < size; j += 8) + { + auto tmp_x = _mm256_loadu_ps(gridptr + j); + auto gy = _mm256_loadu_ps(gridptr + j + 8); + + auto gx = _mm256_shuffle_ps(tmp_x, gy, 0x10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0x11011101); + + gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); + gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = 
_mm256_mul_ps(n, w); + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, v1ip8); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, v1ip8); + + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_nw_offset, vn1fp8, sizeof(float)); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(outptr, _v); + + outptr += 8; + } +#endif // __AVX__ + for (; j < size; j++) + { + + } + } + } + } + else if (sample_type == 2) + { + if (padding_mode == 1) + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + int j = 0; + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); +#if __AVX__ + for (; j + 7 < size; j += 8) + { + auto tmp_x = _mm256_loadu_ps(gridptr + j); + auto gy = _mm256_loadu_ps(gridptr + j + 8); + + auto gx = _mm256_shuffle_ps(tmp_x, gy, 0x10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0x11011101); + + gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); + gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); + + auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); + + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), + i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + + _mm256_storeu_ps(outptr, _v); + + outptr += 8; + } +#endif // __AVX__ + for (; j < size; j++) + { + + } + } + } + else + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + int j = 0; + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); +#if __AVX__ + for (; j + 7 < size; j += 8) + { + auto tmp_x = _mm256_loadu_ps(gridptr + j); + auto gy = _mm256_loadu_ps(gridptr + j + 8); + + auto gx = _mm256_shuffle_ps(tmp_x, gy, 0x10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0x11011101); + + gx = grid_sample_unormalize_p8(vImgWf, gx, align_corner); + gy = grid_sample_unormalize_p8(vImgHf, gy, align_corner); + + gx = 
_mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + gx = compute_coord_p8(gx, vImgWf, padding_mode, align_corner); + gy = compute_coord_p8(gy, vImgHf, padding_mode, align_corner); + + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); + + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); + + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), + i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + + _mm256_storeu_ps(outptr, _v); + + outptr += 8; + } +#endif // __AVX__ + for (; j < size; j++) + { + + } + } + } + } + else if (sample_type == 3) + { + if (padding_mode == 1) + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + int j = 0; + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); +#if __AVX__ + for (; j + 7 < size; j += 8) + { + auto tmp_x = _mm256_loadu_ps(gridptr + j); + auto gy = _mm256_loadu_ps(gridptr + j + 8); + + auto gx = _mm256_shuffle_ps(tmp_x, gy, 0x10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0x11011101); + + gx = grid_sample_unormalize_p8(vImgWf, gx, align_corner); + gy = grid_sample_unormalize_p8(vImgHf, gy, align_corner); + + auto gx_floor = _mm256_floor_ps(gx); + auto gy_floor = _mm256_floor_ps(gy); + + const auto tx = _mm256_sub_ps(gx, gx_floor); + const auto ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + for (int i = 0; i < 4; i++) + { + auto gx0 = compute_coord_p8(_mm256_add_ps(gx_floor, vn1fp8), vImgWf, padding_mode, align_corner); + auto gx1 = compute_coord_p8(gx_floor, vImgWf, padding_mode, align_corner); + auto gx2 = compute_coord_p8(_mm256_add_ps(gx_floor, v1fp8), vImgWf, padding_mode, align_corner); + auto gx3 = compute_coord_p8(_mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)), vImgWf, padding_mode, align_corner); + + gy = compute_coord_p8(_mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)), vImgHf, padding_mode, align_corner); + + auto x0 = _mm256_cvtps_epi32(gx0); + auto x1 = _mm256_cvtps_epi32(gx1); + auto x2 = _mm256_cvtps_epi32(gx2); + auto x3 = _mm256_cvtps_epi32(gx3); + + auto y = _mm256_cvtps_epi32(gy); + + auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x2)); + auto x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); + + auto y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y)); + + auto v0_in_range = _mm256_and_si256(x0_in_range, y_in_range); + auto v1_in_range = _mm256_and_si256(x1_in_range, y_in_range); + auto v2_in_range = _mm256_and_si256(x2_in_range, y_in_range); + auto v3_in_range = _mm256_and_si256(x3_in_range, y_in_range); + + auto x0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + auto x1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + auto x2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + auto x3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + + auto x0_offset = _mm256_cvtps_epi32(x0_offset_f); + auto x1_offset = _mm256_cvtps_epi32(x1_offset_f); + auto x2_offset = _mm256_cvtps_epi32(x2_offset_f); + auto x3_offset = _mm256_cvtps_epi32(x3_offset_f); + + auto x0_val = 
_mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x0_offset, *reinterpret_cast<__m256*>(&v0_in_range), sizeof(float)); + auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x1_offset, *reinterpret_cast<__m256*>(&v1_in_range), sizeof(float)); + auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x2_offset, *reinterpret_cast<__m256*>(&v2_in_range), sizeof(float)); + auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x3_offset, *reinterpret_cast<__m256*>(&v3_in_range), sizeof(float)); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(outptr, _v); + + outptr += 8; + } +#endif // __AVX__ + for (; j < size; j++) + { + + } + } + } + else + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + int j = 0; + float* outptr = top_blob.channel(q); + const float* ptr = static_cast(bottom_blob.channel(q).data); +#if __AVX__ + for (; j + 7 < size; j += 8) + { + auto tmp_x = _mm256_loadu_ps(gridptr + j); + auto gy = _mm256_loadu_ps(gridptr + j + 8); + + auto gx = _mm256_shuffle_ps(tmp_x, gy, 0x10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0x11011101); + + gx = grid_sample_unormalize_p8(vImgWf, gx, align_corner); + gy = grid_sample_unormalize_p8(vImgHf, gy, align_corner); + + auto gx_floor = _mm256_floor_ps(gx); + auto gy_floor = _mm256_floor_ps(gy); + + const auto tx = _mm256_sub_ps(gx, gx_floor); + const auto ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + for (int i = 0; i < 4; i++) + { + auto gx0 = compute_coord_p8(_mm256_add_ps(gx_floor, vn1fp8), vImgWf, padding_mode, align_corner); + auto gx1 = compute_coord_p8(gx_floor, vImgWf, padding_mode, align_corner); + auto gx2 = compute_coord_p8(_mm256_add_ps(gx_floor, v1fp8), vImgWf, padding_mode, align_corner); + auto gx3 = compute_coord_p8(_mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)), vImgWf, padding_mode, align_corner); + + gy = compute_coord_p8(_mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)), vImgHf, padding_mode, align_corner); + + auto x0 = _mm256_cvtps_epi32(gx0); + auto x1 = _mm256_cvtps_epi32(gx1); + auto x2 = _mm256_cvtps_epi32(gx2); + auto x3 = _mm256_cvtps_epi32(gx3); + + auto y = _mm256_cvtps_epi32(gy); + + auto x0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + auto x1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + auto x2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + auto x3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + + auto x0_offset = _mm256_cvtps_epi32(x0_offset_f); + auto x1_offset = _mm256_cvtps_epi32(x1_offset_f); + auto x2_offset = _mm256_cvtps_epi32(x2_offset_f); + auto x3_offset = _mm256_cvtps_epi32(x3_offset_f); + + auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x0_offset, vn1fp8, sizeof(float)); + auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x1_offset, vn1fp8, sizeof(float)); + auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x2_offset, vn1fp8, sizeof(float)); + auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x3_offset, vn1fp8, sizeof(float)); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(outptr, _v); + + outptr += 8; + } +#endif // __AVX__ + for (; j < size; j++) + { + + } + } + } + } + } 
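+
+    // dims == 4 (volumetric sampling over a d x h x w grid) is still a stub in
+    // this work-in-progress commit: the bilinear and nearest branches below do
+    // not produce any output yet, and bicubic drops to the error path, which
+    // matches torch.nn.functional.grid_sample only defining bicubic sampling
+    // for 4-D (N, C, H, W) input.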
+ + if (dims == 4) + { + int size = w * h * d; + if (sample_type == 1) + { + + } + else if (sample_type == 2) + { + + } + else + { + NCNN_LOGE("unsupported bicubic when dims == 4"); + return -1; + } + } + return 0; +#endif // __SSE2__ + return GridSample::forward(bottom_blobs, top_blobs, opt); } diff --git a/tests/test_grid_sample.cpp b/tests/test_grid_sample.cpp deleted file mode 100644 index e69de29bb2d..00000000000 From 109d6370563008dc6677876ea8c01ce6e2e2c6dc Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Tue, 22 Nov 2022 20:53:07 +0800 Subject: [PATCH 025/127] optimize dims = 3 pack8 --- src/layer/x86/gridsample_bicubic_pack8.h | 711 ++++++++++++++++++++++ src/layer/x86/gridsample_bilinear_pack8.h | 560 +++++++++++++++++ src/layer/x86/gridsample_nearest_pack8.h | 376 ++++++++++++ src/layer/x86/gridsample_x86.cpp | 402 +++--------- 4 files changed, 1740 insertions(+), 309 deletions(-) create mode 100644 src/layer/x86/gridsample_bicubic_pack8.h create mode 100644 src/layer/x86/gridsample_bilinear_pack8.h create mode 100644 src/layer/x86/gridsample_nearest_pack8.h diff --git a/src/layer/x86/gridsample_bicubic_pack8.h b/src/layer/x86/gridsample_bicubic_pack8.h new file mode 100644 index 00000000000..3224175def2 --- /dev/null +++ b/src/layer/x86/gridsample_bicubic_pack8.h @@ -0,0 +1,711 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
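+
+// The pack8 kernels in this header evaluate the Keys cubic convolution kernel
+// with A = -0.75, the coefficient bicubic grid_sample conventionally uses.
+// The scalar sketch below mirrors the intrinsics in cubic_interp1d_p8 and is
+// only a readable reference; the vectorised paths never call it.
+static NCNN_FORCEINLINE float cubic_interp1d_ref(float x0, float x1, float x2, float x3, float t)
+{
+    const float A = -0.75f;
+
+    // weights of the four taps around the sampling point, fractional offset t in [0, 1)
+    const float c0 = ((A * (t + 1.f) - 5.f * A) * (t + 1.f) + 8.f * A) * (t + 1.f) - 4.f * A;
+    const float c1 = ((A + 2.f) * t - (A + 3.f)) * t * t + 1.f;
+    const float c2 = ((A + 2.f) * (1.f - t) - (A + 3.f)) * (1.f - t) * (1.f - t) + 1.f;
+    const float c3 = 1.f - c0 - c1 - c2;
+
+    return c0 * x0 + c1 * x1 + c2 * x2 + c3 * x3;
+}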
+ +static NCNN_FORCEINLINE __m256 cubic_interp1d_p8(const __m256& x0_v, const __m256& x1_v, const __m256& x2_v, const __m256& x3_v, const __m256& tx) +{ + const auto A = _mm256_set1_ps(-0.75f); + + const auto x0 = _mm256_add_ps(tx, v1fp8); + const auto& x1 = tx; + const auto x2 = _mm256_sub_ps(v1fp8, tx); + //const auto x3 = _mm256_add_ps(x2, v1fp8); + + const __m256 coeffs0 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x0), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(4), A)); + const __m256 coeffs1 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x1), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x1), x1), v1fp8); + const __m256 coeffs2 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x2), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x2), x2), v1fp8); + const __m256 coeffs3 = _mm256_sub_ps(_mm256_sub_ps(_mm256_sub_ps(v1fp8, coeffs0), coeffs1), coeffs2); + + auto _v = _mm256_mul_ps(coeffs0, x0_v); + _v = _mm256_comp_fmadd_ps(coeffs1, x1_v, _v); + _v = _mm256_comp_fmadd_ps(coeffs2, x2_v, _v); + _v = _mm256_comp_fmadd_ps(coeffs3, x3_v, _v); + + return _v; +} + +static void gridsample_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + + const auto vElempackf = _mm256_set1_ps(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + } + + auto gx_floor = _mm256_floor_ps(gx); + auto gy_floor = _mm256_floor_ps(gy); + + const auto tx = _mm256_sub_ps(gx, gx_floor); + const auto ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + auto gx0 = _mm256_add_ps(gx_floor, vn1fp8); + auto gx1 = gx_floor; + auto gx2 = _mm256_add_ps(gx_floor, v1fp8); + auto gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + + auto x0 = _mm256_cvtps_epi32(gx0); + auto x1 = _mm256_cvtps_epi32(gx1); + auto x2 = _mm256_cvtps_epi32(gx2); + auto x3 = _mm256_cvtps_epi32(gx3); + + auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x2)); + auto x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], + v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + + auto y = _mm256_cvtps_epi32(gy); + + auto y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, 
vn1ip8), _mm256_cmpgt_epi32(vImgHi, y)); + + v0_in_range[i] = _mm256_and_si256(x0_in_range, y_in_range); + v1_in_range[i] = _mm256_and_si256(x1_in_range, y_in_range); + v2_in_range[i] = _mm256_and_si256(x2_in_range, y_in_range); + v3_in_range[i] = _mm256_and_si256(x3_in_range, y_in_range); + + auto v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < dst.c; q++) + { + for (int i = 0; i < 4; i++) + { + auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], *reinterpret_cast<__m256*>(&v0_in_range[i]), sizeof(float)); + auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i]), sizeof(float)); + auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i]), sizeof(float)); + auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i]), sizeof(float)); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + + const auto vElempackf = _mm256_set1_ps(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + } + + auto gx_floor = _mm256_floor_ps(gx); + auto gy_floor = _mm256_floor_ps(gy); + + const auto tx = _mm256_sub_ps(gx, gx_floor); + const auto ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + auto gx0 = _mm256_add_ps(gx_floor, vn1fp8); + auto gx1 = gx_floor; + auto gx2 = _mm256_add_ps(gx_floor, v1fp8); + auto gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + + auto x0 = _mm256_cvtps_epi32(gx0); + 
auto x1 = _mm256_cvtps_epi32(gx1); + auto x2 = _mm256_cvtps_epi32(gx2); + auto x3 = _mm256_cvtps_epi32(gx3); + + auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x2)); + auto x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], + v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + + auto y = _mm256_cvtps_epi32(gy); + + auto y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y)); + + v0_in_range[i] = _mm256_and_si256(x0_in_range, y_in_range); + v1_in_range[i] = _mm256_and_si256(x1_in_range, y_in_range); + v2_in_range[i] = _mm256_and_si256(x2_in_range, y_in_range); + v3_in_range[i] = _mm256_and_si256(x3_in_range, y_in_range); + + auto v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < dst.c; q++) + { + for (int i = 0; i < 4; i++) + { + auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], *reinterpret_cast<__m256*>(&v0_in_range[i]), sizeof(float)); + auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i]), sizeof(float)); + auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i]), sizeof(float)); + auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i]), sizeof(float)); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_bicubic_align0_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + + const auto vElempackf = _mm256_set1_ps(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* 
gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + const auto two = _mm256_set1_ps(2.f); + const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + + auto gx_floor = _mm256_floor_ps(gx); + auto gy_floor = _mm256_floor_ps(gy); + + const auto tx = _mm256_sub_ps(gx, gx_floor); + const auto ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + auto gx0 = _mm256_add_ps(gx_floor, vn1fp8); + auto gx1 = gx_floor; + auto gx2 = _mm256_add_ps(gx_floor, v1fp8); + auto gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + + gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); + gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); + gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); + gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); + + auto x0 = _mm256_cvtps_epi32(gx0); + auto x1 = _mm256_cvtps_epi32(gx1); + auto x2 = _mm256_cvtps_epi32(gx2); + auto x3 = _mm256_cvtps_epi32(gx3); + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + + auto y = _mm256_cvtps_epi32(gy); + + auto v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < dst.c; q++) + { + for (int i = 0; i < 4; i++) + { + auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], vn1fp8, sizeof(float)); + auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], vn1fp8, sizeof(float)); + auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], vn1fp8, sizeof(float)); + auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], vn1fp8, sizeof(float)); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_bicubic_align1_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgWi = 
_mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + + const auto vElempackf = _mm256_set1_ps(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + const auto two = _mm256_set1_ps(2.f); + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + + auto gx_floor = _mm256_floor_ps(gx); + auto gy_floor = _mm256_floor_ps(gy); + + const auto tx = _mm256_sub_ps(gx, gx_floor); + const auto ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + auto gx0 = _mm256_add_ps(gx_floor, vn1fp8); + auto gx1 = gx_floor; + auto gx2 = _mm256_add_ps(gx_floor, v1fp8); + auto gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + + gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); + gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); + gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); + gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); + + auto x0 = _mm256_cvtps_epi32(gx0); + auto x1 = _mm256_cvtps_epi32(gx1); + auto x2 = _mm256_cvtps_epi32(gx2); + auto x3 = _mm256_cvtps_epi32(gx3); + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + + auto y = _mm256_cvtps_epi32(gy); + + auto v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < dst.c; q++) + { + for (int i = 0; i < 4; i++) + { + auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], vn1fp8, sizeof(float)); + auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], vn1fp8, sizeof(float)); + auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], vn1fp8, sizeof(float)); + auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], vn1fp8, sizeof(float)); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + 
+ _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_bicubic_align0_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + + const auto vElempackf = _mm256_set1_ps(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + const auto two = _mm256_set1_ps(2.f); + const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + + auto gx_floor = _mm256_floor_ps(gx); + auto gy_floor = _mm256_floor_ps(gy); + + const auto tx = _mm256_sub_ps(gx, gx_floor); + const auto ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + auto gx0 = _mm256_add_ps(gx_floor, vn1fp8); + auto gx1 = gx_floor; + auto gx2 = _mm256_add_ps(gx_floor, v1fp8); + auto gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + const auto v0p5fp8 = _mm256_set1_ps(0.5f); + { + // x0 + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + + gx0 = _mm256_add_ps(gx0, v0p5fp8); + + gx0 = _mm256_and_ps(gx0, *(__m256*)_ps256_inv_sign_mask); + + auto reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx0 = _mm256_sub_ps(vImgWf, reflectx0_v); + + gx0 = _mm256_sub_ps(gx0, v0p5fp8); + + _mm256_sub_ps(gx0, v0p5fp8); + + gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); + + + // x1 + gx1 = _mm256_add_ps(gx1, v0p5fp8); + + gx1 = _mm256_and_ps(gx1, *(__m256*)_ps256_inv_sign_mask); + + auto reflectx1_v = _mm256_and_ps(_mm256_sub_ps(gx1, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx1 = _mm256_sub_ps(vImgWf, reflectx1_v); + + gx1 = _mm256_sub_ps(gx1, v0p5fp8); + + _mm256_sub_ps(gx1, v0p5fp8); + + gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); + + // x2 + gx2 = _mm256_add_ps(gx2, v0p5fp8); + + gx2 = _mm256_and_ps(gx2, *(__m256*)_ps256_inv_sign_mask); + + auto reflectx2_v = _mm256_and_ps(_mm256_sub_ps(gx2, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx2 = _mm256_sub_ps(vImgWf, reflectx2_v); + + gx2 = _mm256_sub_ps(gx2, v0p5fp8); + + _mm256_sub_ps(gx2, v0p5fp8); + + gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); + + // x3 + gx3 = _mm256_add_ps(gx3, v0p5fp8); + + gx3 = _mm256_and_ps(gx3, *(__m256*)_ps256_inv_sign_mask); + + auto reflectx3_v = _mm256_and_ps(_mm256_sub_ps(gx3, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx3 = _mm256_sub_ps(vImgWf, reflectx3_v); + + gx3 = _mm256_sub_ps(gx3, v0p5fp8); + + _mm256_sub_ps(gx3, v0p5fp8); + + gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); + } + + auto x0 = _mm256_cvtps_epi32(gx0); + auto x1 = _mm256_cvtps_epi32(gx1); + auto x2 = _mm256_cvtps_epi32(gx2); + auto x3 = _mm256_cvtps_epi32(gx3); + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + + { + //y + const auto border_y 
= _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_add_ps(gy, v0p5fp8); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(vImgHf, reflecty_v); + + gy = _mm256_sub_ps(gy, v0p5fp8); + + _mm256_sub_ps(gy, v0p5fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } + + auto y = _mm256_cvtps_epi32(gy); + + auto v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < dst.c; q++) + { + for (int i = 0; i < 4; i++) + { + auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], vn1fp8, sizeof(float)); + auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], vn1fp8, sizeof(float)); + auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], vn1fp8, sizeof(float)); + auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], vn1fp8, sizeof(float)); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_bicubic_align1_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + float* outptr = static_cast(dst.data); + + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + + const auto vElempackf = _mm256_set1_ps(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + const auto two = _mm256_set1_ps(2.f); + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + + auto gx_floor = _mm256_floor_ps(gx); + auto gy_floor = _mm256_floor_ps(gy); + + const auto tx = _mm256_sub_ps(gx, gx_floor); + const auto ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + auto gx0 = _mm256_add_ps(gx_floor, vn1fp8); + auto gx1 = gx_floor; + auto 
gx2 = _mm256_add_ps(gx_floor, v1fp8); + auto gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + const auto v0p5fp8 = _mm256_set1_ps(0.5f); + { + // x0 + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + + gx0 = _mm256_and_ps(gx0, *(__m256*)_ps256_inv_sign_mask); + auto reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, border_x), *(__m256*)_ps256_inv_sign_mask); + gx0 = _mm256_sub_ps(border_x, reflectx0_v); + + + // x1 + gx1 = _mm256_and_ps(gx1, *(__m256*)_ps256_inv_sign_mask); + + auto reflectx1_v = _mm256_and_ps(_mm256_sub_ps(gx1, border_x), *(__m256*)_ps256_inv_sign_mask); + gx1 = _mm256_sub_ps(border_x, reflectx1_v); + + // x2 + gx2 = _mm256_and_ps(gx2, *(__m256*)_ps256_inv_sign_mask); + + auto reflectx2_v = _mm256_and_ps(_mm256_sub_ps(gx2, border_x), *(__m256*)_ps256_inv_sign_mask); + gx2 = _mm256_sub_ps(border_x, reflectx2_v); + + // x3 + gx3 = _mm256_and_ps(gx3, *(__m256*)_ps256_inv_sign_mask); + + auto reflectx3_v = _mm256_and_ps(_mm256_sub_ps(gx3, border_x), *(__m256*)_ps256_inv_sign_mask); + gx3 = _mm256_sub_ps(border_x, reflectx3_v); + } + + auto x0 = _mm256_cvtps_epi32(gx0); + auto x1 = _mm256_cvtps_epi32(gx1); + auto x2 = _mm256_cvtps_epi32(gx2); + auto x3 = _mm256_cvtps_epi32(gx3); + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + + { + //y + const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(border_y, reflecty_v); + } + + auto y = _mm256_cvtps_epi32(gy); + + auto v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + auto v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < dst.c; q++) + { + for (int i = 0; i < 4; i++) + { + auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], vn1fp8, sizeof(float)); + auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], vn1fp8, sizeof(float)); + auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], vn1fp8, sizeof(float)); + auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], vn1fp8, sizeof(float)); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} \ No newline at end of file diff --git a/src/layer/x86/gridsample_bilinear_pack8.h b/src/layer/x86/gridsample_bilinear_pack8.h new file mode 100644 
index 00000000000..24c1d09a216 --- /dev/null +++ b/src/layer/x86/gridsample_bilinear_pack8.h @@ -0,0 +1,560 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void gridsample_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + } + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + + auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + + auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), 
i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + } + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + + auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + + auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + + + for (int q = 0; q < dst.c; q++) + { + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, 
*reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_bilinear_align0_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + + const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + + + for (int q = 0; q < dst.c; q++) + { + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, vn1fp8, sizeof(float)); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + auto sw_val = 
_mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_bilinear_align1_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + + const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, vn1fp8, sizeof(float)); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, 
*reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_bilinear_align0_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + + auto v0p5fp8 = _mm256_set1_ps(0.5f); + gx = _mm256_add_ps(gx, v0p5fp8); + + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); + + auto reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(vImgWf, reflectx_v); + + gx = _mm256_sub_ps(gx, v0p5fp8); + + _mm256_sub_ps(gx, v0p5fp8); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + + const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_add_ps(gy, v0p5fp8); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(vImgHf, reflecty_v); + + gy = _mm256_sub_ps(gy, v0p5fp8); + + _mm256_sub_ps(gy, v0p5fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, 
vn1fp8, sizeof(float)); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_bilinear_align1_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + float* outptr = static_cast(dst.data); + + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); + + auto reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(border_x, reflectx_v); + + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + + const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(border_y, reflecty_v); + } + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + auto nw_val 
= _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, vn1fp8, sizeof(float)); + auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} \ No newline at end of file diff --git a/src/layer/x86/gridsample_nearest_pack8.h b/src/layer/x86/gridsample_nearest_pack8.h new file mode 100644 index 00000000000..ac50d783133 --- /dev/null +++ b/src/layer/x86/gridsample_nearest_pack8.h @@ -0,0 +1,376 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
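For reference, the nearest-neighbour kernels added in this new header all vectorize the same per-pixel recipe visible in the intrinsics: unnormalize the [-1, 1] grid coordinate, round with floor(x + 0.5f), then either mask the gather (zeros padding) or clamp/reflect the coordinate before gathering (border/reflection). A minimal scalar sketch of that recipe follows; the helper name and the single-channel, elempack == 1 layout are illustrative assumptions, not part of the patch.

#include <math.h>

// Scalar sketch of what the pack8 nearest kernels compute per output pixel
// (illustrative only; assumes a plain W x H float image, elempack == 1).
static float nearest_sample_scalar(const float* image, int W, int H,
                                   float gx, float gy, int align_corner)
{
    // unnormalize from [-1, 1] to pixel coordinates, matching the two
    // "compute coord" variants in the vector code
    float x = align_corner ? (gx + 1.f) / 2.f * (W - 1) : ((gx + 1.f) * W - 1.f) / 2.f;
    float y = align_corner ? (gy + 1.f) / 2.f * (H - 1) : ((gy + 1.f) * H - 1.f) / 2.f;

    // round to the nearest pixel, matching _mm256_floor_ps(gx + 0.5f)
    int ix = (int)floorf(x + 0.5f);
    int iy = (int)floorf(y + 0.5f);

    // zeros padding: out-of-range taps contribute 0; the border/reflection
    // kernels clamp or reflect the coordinate before this point instead
    if (ix < 0 || ix >= W || iy < 0 || iy >= H)
        return 0.f;

    return image[iy * W + ix];
}

The vector versions express the same lookup as an integer offset (W*y + x) * elempack plus a per-lane index, then fetch it with _mm256_mask_i32gather_ps using the in-range comparison as the gather mask.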
+ +static void gridsample_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); + + auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + + _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); + + auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, 
*reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + + _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_nearest_align0_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + + const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); + + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + + _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_nearest_align1_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + + const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); + + auto i_offset = 
_mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + + _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_nearest_align0_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + const auto two = _mm256_set1_ps(2.f); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + // compute coord + { + // x + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + + auto v0p5fp8 = _mm256_set1_ps(0.5f); + gx = _mm256_add_ps(gx, v0p5fp8); + + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); + + auto reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(vImgWf, reflectx_v); + + gx = _mm256_sub_ps(gx, v0p5fp8); + + _mm256_sub_ps(gx, v0p5fp8); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + + // y + const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_add_ps(gy, v0p5fp8); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(vImgHf, reflecty_v); + + gy = _mm256_sub_ps(gy, v0p5fp8); + + _mm256_sub_ps(gy, v0p5fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } + + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); + + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + + _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_nearest_align1_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + float* outptr = static_cast(dst.data); + + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y 
% grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + + const auto two = _mm256_set1_ps(2.f); + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + // compute coord + { + // x + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); + + auto reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(border_x, reflectx_v); + + + // y + const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(border_y, reflecty_v); + } + + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); + + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + + _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} \ No newline at end of file diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 52084e9207c..d2e38f96f79 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -41,6 +41,8 @@ const auto v1ip8 = _mm256_set1_epi32(1); const auto vn1ip8 = _mm256_set1_epi32(-1); #include "gridsample_bilinear_pack8.h" +#include "gridsample_nearest_pack8.h" +#include "gridsample_bicubic_pack8.h" static __m256 NCNN_FORCEINLINE grid_sample_unormalize_p8(const __m256& w, const __m256& coordx, int align_corner) @@ -102,28 +104,6 @@ static NCNN_FORCEINLINE __m256 get_coord_p8(const __m256& x, const __m256& w, in return coord; } -static NCNN_FORCEINLINE __m256 cubic_interp1d_p8(const __m256& x0_v, const __m256& x1_v, const __m256& x2_v, const __m256& x3_v, const __m256& tx) -{ - const auto A = _mm256_set1_ps(-0.75f); - - const auto x0 = _mm256_add_ps(tx, v1fp8); - const auto& x1 = tx; - const auto x2 = _mm256_sub_ps(v1fp8, tx); - //const auto x3 = _mm256_add_ps(x2, v1fp8); - - const __m256 coeffs0 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x0), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(4), A)); - const __m256 coeffs1 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x1), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x1), x1), v1fp8); - const __m256 coeffs2 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x2), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x2), x2), v1fp8); - const __m256 coeffs3 = _mm256_sub_ps(_mm256_sub_ps(_mm256_sub_ps(v1fp8, coeffs0), coeffs1), coeffs2); - - auto _v = _mm256_mul_ps(coeffs0, x0_v); - _v = _mm256_comp_fmadd_ps(coeffs1, x1_v, _v); - _v = _mm256_comp_fmadd_ps(coeffs2, x2_v, _v); - _v = _mm256_comp_fmadd_ps(coeffs3, x3_v, _v); - - return _v; -} - #endif // __AVX__ const __m128 v1fp4 = 
_mm_set1_ps(1.0f); @@ -250,6 +230,11 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data); - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - - gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); - gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); - - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); - - auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_nw_offset, vn1fp8, sizeof(float)); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); - - auto _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(outptr, _v); - - outptr += elempack; - } - } + gridsample_bilinear_align0_border_blob_pack8(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_bilinear_align1_border_blob_pack8(bottom_blob, top_blob, grid, opt); } } + else if (padding_mode == 3) + { + if (align_corner == 0) + { + gridsample_bilinear_align0_reflection_blob_pack8(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_bilinear_align1_reflection_blob_pack8(bottom_blob, top_blob, grid, opt); + } + } + else + { + NCNN_LOGE("gridsample sample_type error\n"); + return -100; + } } if (sample_type == 2) { - if (padding_mode == 1) //zeros + if (padding_mode == 1) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (align_corner == 0) { - float* outptr = top_blob.channel(q); - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - - gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); 
- gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); - - auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); - - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), - i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); - - _mm256_storeu_ps(outptr, _v); - - outptr += elempack; - } - } + gridsample_nearest_align0_zeros_blob_pack8(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_nearest_align1_zeros_blob_pack8(bottom_blob, top_blob, grid, opt); } } - else //border reflection + else if(padding_mode == 2) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (align_corner == 0) { - float* outptr = top_blob.channel(q); - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - - gx = grid_sample_unormalize_p8(vImgWf, gx, align_corner); - gy = grid_sample_unormalize_p8(vImgHf, gy, align_corner); - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - - gx = compute_coord_p8(gx, vImgWf, padding_mode, align_corner); - gy = compute_coord_p8(gy, vImgHf, padding_mode, align_corner); - - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); - - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), - i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); - - _mm256_storeu_ps(outptr, _v); - - outptr += elempack; - } - } + gridsample_nearest_align0_border_blob_pack8(bottom_blob, top_blob, grid, opt); } + else + { + gridsample_nearest_align1_border_blob_pack8(bottom_blob, top_blob, grid, opt); + } + } + else if (padding_mode == 3) + { + if (align_corner == 0) + { + gridsample_nearest_align0_reflection_blob_pack8(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_nearest_align1_reflection_blob_pack8(bottom_blob, top_blob, grid, opt); + } + } + else + { + NCNN_LOGE("gridsample sample_type error\n"); + return -100; } } @@ -438,160 +347,35 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data); - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - - gx = grid_sample_unormalize_p8(vImgWf, gx, align_corner); - gy = grid_sample_unormalize_p8(vImgHf, gy, align_corner); - - auto gx_floor = _mm256_floor_ps(gx); - auto gy_floor = _mm256_floor_ps(gy); - - const auto tx = _mm256_sub_ps(gx, gx_floor); - const auto ty = _mm256_sub_ps(gy, 
gy_floor); - - __m256 coefficients[4]; - - for (int i = 0; i < 4; i++) - { - auto gx0 = compute_coord_p8(_mm256_add_ps(gx_floor, vn1fp8), vImgWf, padding_mode, align_corner); - auto gx1 = compute_coord_p8(gx_floor, vImgWf, padding_mode, align_corner); - auto gx2 = compute_coord_p8(_mm256_add_ps(gx_floor, v1fp8), vImgWf, padding_mode, align_corner); - auto gx3 = compute_coord_p8(_mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)), vImgWf, padding_mode, align_corner); - - gy = compute_coord_p8(_mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)), vImgHf, padding_mode, align_corner); - - auto x0 = _mm256_cvtps_epi32(gx0); - auto x1 = _mm256_cvtps_epi32(gx1); - auto x2 = _mm256_cvtps_epi32(gx2); - auto x3 = _mm256_cvtps_epi32(gx3); - - auto y = _mm256_cvtps_epi32(gy); - - auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x2)); - auto x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); - - auto y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y)); - - auto v0_in_range = _mm256_and_si256(x0_in_range, y_in_range); - auto v1_in_range = _mm256_and_si256(x1_in_range, y_in_range); - auto v2_in_range = _mm256_and_si256(x2_in_range, y_in_range); - auto v3_in_range = _mm256_and_si256(x3_in_range, y_in_range); - - auto x0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - auto x0_offset = _mm256_cvtps_epi32(x0_offset_f); - auto x1_offset = _mm256_cvtps_epi32(x1_offset_f); - auto x2_offset = _mm256_cvtps_epi32(x2_offset_f); - auto x3_offset = _mm256_cvtps_epi32(x3_offset_f); - - auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x0_offset, *reinterpret_cast<__m256*>(&v0_in_range), sizeof(float)); - auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x1_offset, *reinterpret_cast<__m256*>(&v1_in_range), sizeof(float)); - auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x2_offset, *reinterpret_cast<__m256*>(&v2_in_range), sizeof(float)); - auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x3_offset, *reinterpret_cast<__m256*>(&v3_in_range), sizeof(float)); - - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } - - auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm256_storeu_ps(outptr, _v); - - outptr += elempack; - } - } + gridsample_bicubic_align0_zeros_blob_pack8(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_bicubic_align1_zeros_blob_pack8(bottom_blob, top_blob, grid, opt); } } - else + else if (padding_mode == 2) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < 
channels; q++) + if (align_corner == 0) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - - gx = grid_sample_unormalize_p8(vImgWf, gx, align_corner); - gy = grid_sample_unormalize_p8(vImgHf, gy, align_corner); - - auto gx_floor = _mm256_floor_ps(gx); - auto gy_floor = _mm256_floor_ps(gy); - - const auto tx = _mm256_sub_ps(gx, gx_floor); - const auto ty = _mm256_sub_ps(gy, gy_floor); - - __m256 coefficients[4]; - - for (int i = 0; i < 4; i++) - { - auto gx0 = compute_coord_p8(_mm256_add_ps(gx_floor, vn1fp8), vImgWf, padding_mode, align_corner); - auto gx1 = compute_coord_p8(gx_floor, vImgWf, padding_mode, align_corner); - auto gx2 = compute_coord_p8(_mm256_add_ps(gx_floor, v1fp8), vImgWf, padding_mode, align_corner); - auto gx3 = compute_coord_p8(_mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)), vImgWf, padding_mode, align_corner); - - gy = compute_coord_p8(_mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)), vImgHf, padding_mode, align_corner); - - auto x0 = _mm256_cvtps_epi32(gx0); - auto x1 = _mm256_cvtps_epi32(gx1); - auto x2 = _mm256_cvtps_epi32(gx2); - auto x3 = _mm256_cvtps_epi32(gx3); - - auto y = _mm256_cvtps_epi32(gy); - - auto x0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto x3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - auto x0_offset = _mm256_cvtps_epi32(x0_offset_f); - auto x1_offset = _mm256_cvtps_epi32(x1_offset_f); - auto x2_offset = _mm256_cvtps_epi32(x2_offset_f); - auto x3_offset = _mm256_cvtps_epi32(x3_offset_f); - - auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x0_offset, vn1fp8, sizeof(float)); - auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x1_offset, vn1fp8, sizeof(float)); - auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x2_offset, vn1fp8, sizeof(float)); - auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x3_offset, vn1fp8, sizeof(float)); - - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } - - auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm256_storeu_ps(outptr, _v); - - outptr += elempack; - } - } + gridsample_bicubic_align0_border_blob_pack8(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_bicubic_align1_border_blob_pack8(bottom_blob, top_blob, grid, opt); + } + } + else if(padding_mode == 3) + { + if (align_corner == 0) + { + gridsample_bicubic_align0_reflection_blob_pack8(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_bicubic_align1_reflection_blob_pack8(bottom_blob, top_blob, grid, opt); } } } @@ -929,7 +713,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector 
Date: Wed, 23 Nov 2022 19:25:01 +0800 Subject: [PATCH 026/127] finish pack4 and pack8 [WIP] --- src/layer/gridsample.cpp | 5 - src/layer/x86/gridsample_bicubic_pack4.h | 711 ++++++++++ src/layer/x86/gridsample_bicubic_pack8.h | 12 +- src/layer/x86/gridsample_bilinear_pack4.h | 1456 +++++++++++++++++++++ src/layer/x86/gridsample_bilinear_pack8.h | 906 ++++++++++++- src/layer/x86/gridsample_nearest_pack4.h | 836 ++++++++++++ src/layer/x86/gridsample_nearest_pack8.h | 476 ++++++- src/layer/x86/gridsample_x86.cpp | 1360 ++++--------------- 8 files changed, 4662 insertions(+), 1100 deletions(-) create mode 100644 src/layer/x86/gridsample_bicubic_pack4.h create mode 100644 src/layer/x86/gridsample_bilinear_pack4.h create mode 100644 src/layer/x86/gridsample_nearest_pack4.h diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index 32dce144c8e..83e73eecb3d 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -418,11 +418,6 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& float sample_y = gridptr[1]; float sample_z = gridptr[2]; - if (padding_mode == 2) - { - int a = 10; - } - sample_x = grid_sample_unormalize(w, sample_x, align_corner); sample_y = grid_sample_unormalize(h, sample_y, align_corner); sample_z = grid_sample_unormalize(d, sample_z, align_corner); diff --git a/src/layer/x86/gridsample_bicubic_pack4.h b/src/layer/x86/gridsample_bicubic_pack4.h new file mode 100644 index 00000000000..b01521b5276 --- /dev/null +++ b/src/layer/x86/gridsample_bicubic_pack4.h @@ -0,0 +1,711 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
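The gridsample_bicubic_pack4.h kernels below are the SSE (pack4) counterparts of the existing pack8 path; both evaluate the cubic convolution weights with A = -0.75. A scalar sketch of the 1-D interpolation that the coefficient chains compute is given here; the function name is illustrative and not part of the patch.

// Scalar form of the coefficient chains in cubic_interp1d_p4 / cubic_interp1d_p8
// (illustrative only). x0_v..x3_v are the four horizontal taps, tx the
// fractional offset of the sample point within [x1, x2).
static float cubic_interp1d_scalar(float x0_v, float x1_v, float x2_v, float x3_v, float tx)
{
    const float A = -0.75f; // cubic convolution parameter, as in the vector code

    const float x0 = tx + 1.f; // distance to the leftmost tap
    const float x1 = tx;
    const float x2 = 1.f - tx;
    // the fourth distance (2 - tx) is implicit: the weights are forced to sum to 1

    const float coeffs0 = ((A * x0 - 5.f * A) * x0 + 8.f * A) * x0 - 4.f * A;
    const float coeffs1 = ((A + 2.f) * x1 - (A + 3.f)) * x1 * x1 + 1.f;
    const float coeffs2 = ((A + 2.f) * x2 - (A + 3.f)) * x2 * x2 + 1.f;
    const float coeffs3 = 1.f - coeffs0 - coeffs1 - coeffs2;

    return coeffs0 * x0_v + coeffs1 * x1_v + coeffs2 * x2_v + coeffs3 * x3_v;
}

The 2-D kernels call this once per row to produce four horizontal results (coefficients[0..3]) and then once more vertically with ty to blend them into the output vector.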
+ +static NCNN_FORCEINLINE __m128 cubic_interp1d_p4(const __m128& x0_v, const __m128& x1_v, const __m128& x2_v, const __m128& x3_v, const __m128& tx) +{ + const auto A = _mm_set1_ps(-0.75f); + + const auto x0 = _mm_add_ps(tx, v1fp4); + const auto& x1 = tx; + const auto x2 = _mm_sub_ps(v1fp4, tx); + //const auto x3 = _mm_add_ps(x2, v1fp4); + + const __m128 coeffs0 = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(A, x0), _mm_mul_ps(_mm_set1_ps(5.0f), A)), x0), _mm_mul_ps(_mm_set1_ps(8.0f), A)), x0), _mm_mul_ps(_mm_set1_ps(4), A)); + const __m128 coeffs1 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set1_ps(2.0f)), x1), _mm_add_ps(A, _mm_set1_ps(3.0f))), x1), x1), v1fp4); + const __m128 coeffs2 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set1_ps(2.0f)), x2), _mm_add_ps(A, _mm_set1_ps(3.0f))), x2), x2), v1fp4); + const __m128 coeffs3 = _mm_sub_ps(_mm_sub_ps(_mm_sub_ps(v1fp4, coeffs0), coeffs1), coeffs2); + + auto _v = _mm_mul_ps(coeffs0, x0_v); + _v = _mm_comp_fmadd_ps(coeffs1, x1_v, _v); + _v = _mm_comp_fmadd_ps(coeffs2, x2_v, _v); + _v = _mm_comp_fmadd_ps(coeffs3, x3_v, _v); + + return _v; +} + +static void gridsample_2d_bicubic_align0_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + + const auto vElempackf = _mm_set1_ps(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + // x + gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); + + // y + gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); + } + + auto gx_floor = _mm_floor_ps(gx); + auto gy_floor = _mm_floor_ps(gy); + + const auto tx = _mm_sub_ps(gx, gx_floor); + const auto ty = _mm_sub_ps(gy, gy_floor); + + __m128 coefficients[4]; + + auto gx0 = _mm_add_ps(gx_floor, vn1fp4); + auto gx1 = gx_floor; + auto gx2 = _mm_add_ps(gx_floor, v1fp4); + auto gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); + + auto x0 = _mm_cvtps_epi32(gx0); + auto x1 = _mm_cvtps_epi32(gx1); + auto x2 = _mm_cvtps_epi32(gx2); + auto x3 = _mm_cvtps_epi32(gx3); + + auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + auto x2_in_range = _mm_and_si128(_mm_cmpgt_epi32(x2, vn1ip4), _mm_cmpgt_epi32(vImgWi, x2)); + auto x3_in_range = _mm_and_si128(_mm_cmpgt_epi32(x3, vn1ip4), _mm_cmpgt_epi32(vImgWi, x3)); + + __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], + v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)); + + auto y = _mm_cvtps_epi32(gy); + + auto y_in_range = _mm_and_si128(_mm_cmpgt_epi32(y, vn1ip4), _mm_cmpgt_epi32(vImgHi, y)); + + v0_in_range[i] = _mm_and_si128(x0_in_range, y_in_range); + v1_in_range[i] = _mm_and_si128(x1_in_range, y_in_range); + v2_in_range[i] = _mm_and_si128(x2_in_range, y_in_range); + v3_in_range[i] = 
_mm_and_si128(x3_in_range, y_in_range); + + auto v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + + v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < dst.c; q++) + { + for (int i = 0; i < 4; i++) + { + auto x0_val = mask_gather_ps(src.channel(q), v0_offset[i], *reinterpret_cast<__m128*>(&v0_in_range[i])); + auto x1_val = mask_gather_ps(src.channel(q), v1_offset[i], *reinterpret_cast<__m128*>(&v1_in_range[i])); + auto x2_val = mask_gather_ps(src.channel(q), v2_offset[i], *reinterpret_cast<__m128*>(&v2_in_range[i])); + auto x3_val = mask_gather_ps(src.channel(q), v3_offset[i], *reinterpret_cast<__m128*>(&v3_in_range[i])); + + coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); + } + + auto _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_bicubic_align1_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + + const auto vElempackf = _mm_set1_ps(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); + gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); + } + + auto gx_floor = _mm_floor_ps(gx); + auto gy_floor = _mm_floor_ps(gy); + + const auto tx = _mm_sub_ps(gx, gx_floor); + const auto ty = _mm_sub_ps(gy, gy_floor); + + __m128 coefficients[4]; + + auto gx0 = _mm_add_ps(gx_floor, vn1fp4); + auto gx1 = gx_floor; + auto gx2 = _mm_add_ps(gx_floor, v1fp4); + auto gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); + + auto x0 = _mm_cvtps_epi32(gx0); + auto x1 = _mm_cvtps_epi32(gx1); + auto x2 = _mm_cvtps_epi32(gx2); + auto x3 = _mm_cvtps_epi32(gx3); + + auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + auto x2_in_range = _mm_and_si128(_mm_cmpgt_epi32(x2, vn1ip4), _mm_cmpgt_epi32(vImgWi, x2)); + auto x3_in_range = _mm_and_si128(_mm_cmpgt_epi32(x3, vn1ip4), _mm_cmpgt_epi32(vImgWi, x3)); + + __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], + v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm_add_ps(gy_floor, 
_mm_set1_ps(-1.0f + i)); + + auto y = _mm_cvtps_epi32(gy); + + auto y_in_range = _mm_and_si128(_mm_cmpgt_epi32(y, vn1ip4), _mm_cmpgt_epi32(vImgHi, y)); + + v0_in_range[i] = _mm_and_si128(x0_in_range, y_in_range); + v1_in_range[i] = _mm_and_si128(x1_in_range, y_in_range); + v2_in_range[i] = _mm_and_si128(x2_in_range, y_in_range); + v3_in_range[i] = _mm_and_si128(x3_in_range, y_in_range); + + auto v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + + v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < dst.c; q++) + { + for (int i = 0; i < 4; i++) + { + auto x0_val = mask_gather_ps(src.channel(q), v0_offset[i], *reinterpret_cast<__m128*>(&v0_in_range[i])); + auto x1_val = mask_gather_ps(src.channel(q), v1_offset[i], *reinterpret_cast<__m128*>(&v1_in_range[i])); + auto x2_val = mask_gather_ps(src.channel(q), v2_offset[i], *reinterpret_cast<__m128*>(&v2_in_range[i])); + auto x3_val = mask_gather_ps(src.channel(q), v3_offset[i], *reinterpret_cast<__m128*>(&v3_in_range[i])); + + coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); + } + + auto _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_bicubic_align0_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + + const auto vElempackf = _mm_set1_ps(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + const auto two = _mm_set1_ps(2.f); + const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); + gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); + + auto gx_floor = _mm_floor_ps(gx); + auto gy_floor = _mm_floor_ps(gy); + + const auto tx = _mm_sub_ps(gx, gx_floor); + const auto ty = _mm_sub_ps(gy, gy_floor); + + __m128 coefficients[4]; + + auto gx0 = _mm_add_ps(gx_floor, vn1fp4); + auto gx1 = gx_floor; + auto gx2 = _mm_add_ps(gx_floor, v1fp4); + auto gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); + + gx0 = _mm_min_ps(border_x, _mm_max_ps(gx0, _mm_setzero_ps())); + gx1 = _mm_min_ps(border_x, _mm_max_ps(gx1, _mm_setzero_ps())); + gx2 = _mm_min_ps(border_x, _mm_max_ps(gx2, _mm_setzero_ps())); + gx3 = _mm_min_ps(border_x, _mm_max_ps(gx3, _mm_setzero_ps())); + + auto x0 = _mm_cvtps_epi32(gx0); + auto 
x1 = _mm_cvtps_epi32(gx1); + auto x2 = _mm_cvtps_epi32(gx2); + auto x3 = _mm_cvtps_epi32(gx3); + + __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)); + gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); + + auto y = _mm_cvtps_epi32(gy); + + auto v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + + v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < dst.c; q++) + { + for (int i = 0; i < 4; i++) + { + auto x0_val = mask_gather_ps(src.channel(q), v0_offset[i], vn1fp4); + auto x1_val = mask_gather_ps(src.channel(q), v1_offset[i], vn1fp4); + auto x2_val = mask_gather_ps(src.channel(q), v2_offset[i], vn1fp4); + auto x3_val = mask_gather_ps(src.channel(q), v3_offset[i], vn1fp4); + + coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); + } + + auto _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_bicubic_align1_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + + const auto vElempackf = _mm_set1_ps(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + const auto two = _mm_set1_ps(2.f); + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + + gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); + gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); + + auto gx_floor = _mm_floor_ps(gx); + auto gy_floor = _mm_floor_ps(gy); + + const auto tx = _mm_sub_ps(gx, gx_floor); + const auto ty = _mm_sub_ps(gy, gy_floor); + + __m128 coefficients[4]; + + auto gx0 = _mm_add_ps(gx_floor, vn1fp4); + auto gx1 = gx_floor; + auto gx2 = _mm_add_ps(gx_floor, v1fp4); + auto gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); + + gx0 = _mm_min_ps(border_x, _mm_max_ps(gx0, _mm_setzero_ps())); + gx1 = _mm_min_ps(border_x, _mm_max_ps(gx1, _mm_setzero_ps())); + gx2 = _mm_min_ps(border_x, _mm_max_ps(gx2, _mm_setzero_ps())); + gx3 = _mm_min_ps(border_x, _mm_max_ps(gx3, _mm_setzero_ps())); + + auto x0 = _mm_cvtps_epi32(gx0); + auto x1 = _mm_cvtps_epi32(gx1); + auto x2 = _mm_cvtps_epi32(gx2); + auto x3 = _mm_cvtps_epi32(gx3); + + __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 
0; i < 4; i++) + { + gy = _mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)); + gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); + + auto y = _mm_cvtps_epi32(gy); + + auto v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + + v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < dst.c; q++) + { + for (int i = 0; i < 4; i++) + { + auto x0_val = mask_gather_ps(src.channel(q), v0_offset[i], vn1fp4); + auto x1_val = mask_gather_ps(src.channel(q), v1_offset[i], vn1fp4); + auto x2_val = mask_gather_ps(src.channel(q), v2_offset[i], vn1fp4); + auto x3_val = mask_gather_ps(src.channel(q), v3_offset[i], vn1fp4); + + coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); + } + + auto _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + + const auto vElempackf = _mm_set1_ps(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + const auto two = _mm_set1_ps(2.f); + const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); + gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); + + auto gx_floor = _mm_floor_ps(gx); + auto gy_floor = _mm_floor_ps(gy); + + const auto tx = _mm_sub_ps(gx, gx_floor); + const auto ty = _mm_sub_ps(gy, gy_floor); + + __m128 coefficients[4]; + + auto gx0 = _mm_add_ps(gx_floor, vn1fp4); + auto gx1 = gx_floor; + auto gx2 = _mm_add_ps(gx_floor, v1fp4); + auto gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); + const auto v0p5fp4 = _mm_set1_ps(0.5f); + { + // x0 + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + + gx0 = _mm_add_ps(gx0, v0p5fp4); + + gx0 = _mm_and_ps(gx0, *(__m128*)_ps256_inv_sign_mask); + + auto reflectx0_v = _mm_and_ps(_mm_sub_ps(gx0, vImgWf), *(__m128*)_ps256_inv_sign_mask); + gx0 = _mm_sub_ps(vImgWf, reflectx0_v); + + gx0 = _mm_sub_ps(gx0, v0p5fp4); + + _mm_sub_ps(gx0, v0p5fp4); + + gx0 = _mm_min_ps(border_x, _mm_max_ps(gx0, _mm_setzero_ps())); + + + // x1 + gx1 = _mm_add_ps(gx1, v0p5fp4); + + gx1 = _mm_and_ps(gx1, *(__m128*)_ps256_inv_sign_mask); + + auto reflectx1_v = _mm_and_ps(_mm_sub_ps(gx1, vImgWf), *(__m128*)_ps256_inv_sign_mask); + gx1 = 
_mm_sub_ps(vImgWf, reflectx1_v); + + gx1 = _mm_sub_ps(gx1, v0p5fp4); + + _mm_sub_ps(gx1, v0p5fp4); + + gx1 = _mm_min_ps(border_x, _mm_max_ps(gx1, _mm_setzero_ps())); + + // x2 + gx2 = _mm_add_ps(gx2, v0p5fp4); + + gx2 = _mm_and_ps(gx2, *(__m128*)_ps256_inv_sign_mask); + + auto reflectx2_v = _mm_and_ps(_mm_sub_ps(gx2, vImgWf), *(__m128*)_ps256_inv_sign_mask); + gx2 = _mm_sub_ps(vImgWf, reflectx2_v); + + gx2 = _mm_sub_ps(gx2, v0p5fp4); + + _mm_sub_ps(gx2, v0p5fp4); + + gx2 = _mm_min_ps(border_x, _mm_max_ps(gx2, _mm_setzero_ps())); + + // x3 + gx3 = _mm_add_ps(gx3, v0p5fp4); + + gx3 = _mm_and_ps(gx3, *(__m128*)_ps256_inv_sign_mask); + + auto reflectx3_v = _mm_and_ps(_mm_sub_ps(gx3, vImgWf), *(__m128*)_ps256_inv_sign_mask); + gx3 = _mm_sub_ps(vImgWf, reflectx3_v); + + gx3 = _mm_sub_ps(gx3, v0p5fp4); + + _mm_sub_ps(gx3, v0p5fp4); + + gx3 = _mm_min_ps(border_x, _mm_max_ps(gx3, _mm_setzero_ps())); + } + + auto x0 = _mm_cvtps_epi32(gx0); + auto x1 = _mm_cvtps_epi32(gx1); + auto x2 = _mm_cvtps_epi32(gx2); + auto x3 = _mm_cvtps_epi32(gx3); + + __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)); + + { + //y + const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + + gy = _mm_add_ps(gy, v0p5fp4); + + gy = _mm_and_ps(gy, *(__m128*)_ps256_inv_sign_mask); + + auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, vImgHf), *(__m128*)_ps256_inv_sign_mask); + gy = _mm_sub_ps(vImgHf, reflecty_v); + + gy = _mm_sub_ps(gy, v0p5fp4); + + _mm_sub_ps(gy, v0p5fp4); + + gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); + } + + auto y = _mm_cvtps_epi32(gy); + + auto v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + + v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < dst.c; q++) + { + for (int i = 0; i < 4; i++) + { + auto x0_val = mask_gather_ps(src.channel(q), v0_offset[i], vn1fp4); + auto x1_val = mask_gather_ps(src.channel(q), v1_offset[i], vn1fp4); + auto x2_val = mask_gather_ps(src.channel(q), v2_offset[i], vn1fp4); + auto x3_val = mask_gather_ps(src.channel(q), v3_offset[i], vn1fp4); + + coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); + } + + auto _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_bicubic_align1_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + float* outptr = static_cast(dst.data); + + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + + const auto vElempackf = _mm_set1_ps(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid 
tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + const auto two = _mm_set1_ps(2.f); + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + + gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); + gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); + + auto gx_floor = _mm_floor_ps(gx); + auto gy_floor = _mm_floor_ps(gy); + + const auto tx = _mm_sub_ps(gx, gx_floor); + const auto ty = _mm_sub_ps(gy, gy_floor); + + __m128 coefficients[4]; + + auto gx0 = _mm_add_ps(gx_floor, vn1fp4); + auto gx1 = gx_floor; + auto gx2 = _mm_add_ps(gx_floor, v1fp4); + auto gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); + const auto v0p5fp4 = _mm_set1_ps(0.5f); + { + // x0 + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + + gx0 = _mm_and_ps(gx0, *(__m128*)_ps256_inv_sign_mask); + auto reflectx0_v = _mm_and_ps(_mm_sub_ps(gx0, border_x), *(__m128*)_ps256_inv_sign_mask); + gx0 = _mm_sub_ps(border_x, reflectx0_v); + + + // x1 + gx1 = _mm_and_ps(gx1, *(__m128*)_ps256_inv_sign_mask); + + auto reflectx1_v = _mm_and_ps(_mm_sub_ps(gx1, border_x), *(__m128*)_ps256_inv_sign_mask); + gx1 = _mm_sub_ps(border_x, reflectx1_v); + + // x2 + gx2 = _mm_and_ps(gx2, *(__m128*)_ps256_inv_sign_mask); + + auto reflectx2_v = _mm_and_ps(_mm_sub_ps(gx2, border_x), *(__m128*)_ps256_inv_sign_mask); + gx2 = _mm_sub_ps(border_x, reflectx2_v); + + // x3 + gx3 = _mm_and_ps(gx3, *(__m128*)_ps256_inv_sign_mask); + + auto reflectx3_v = _mm_and_ps(_mm_sub_ps(gx3, border_x), *(__m128*)_ps256_inv_sign_mask); + gx3 = _mm_sub_ps(border_x, reflectx3_v); + } + + auto x0 = _mm_cvtps_epi32(gx0); + auto x1 = _mm_cvtps_epi32(gx1); + auto x2 = _mm_cvtps_epi32(gx2); + auto x3 = _mm_cvtps_epi32(gx3); + + __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)); + + { + //y + const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + + gy = _mm_and_ps(gy, *(__m128*)_ps256_inv_sign_mask); + + auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps256_inv_sign_mask); + gy = _mm_sub_ps(border_y, reflecty_v); + } + + auto y = _mm_cvtps_epi32(gy); + + auto v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + auto v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + + v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < dst.c; q++) + { + for (int i = 0; i < 4; i++) + { + auto x0_val = mask_gather_ps(src.channel(q), v0_offset[i], vn1fp4); + auto x1_val = mask_gather_ps(src.channel(q), v1_offset[i], vn1fp4); + auto x2_val = mask_gather_ps(src.channel(q), v2_offset[i], vn1fp4); + auto x3_val = mask_gather_ps(src.channel(q), v3_offset[i], vn1fp4); + + coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); + } + + auto _v = 
cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} \ No newline at end of file diff --git a/src/layer/x86/gridsample_bicubic_pack8.h b/src/layer/x86/gridsample_bicubic_pack8.h index 3224175def2..e189297bc01 100644 --- a/src/layer/x86/gridsample_bicubic_pack8.h +++ b/src/layer/x86/gridsample_bicubic_pack8.h @@ -34,7 +34,7 @@ static NCNN_FORCEINLINE __m256 cubic_interp1d_p8(const __m256& x0_v, const __m25 return _v; } -static void gridsample_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { const auto vImgWf = _mm256_set1_ps(src.w); const auto vImgHf = _mm256_set1_ps(src.h); @@ -137,7 +137,7 @@ static void gridsample_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& dst, } } -static void gridsample_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +static void gridsample_2d_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { const auto vImgWf = _mm256_set1_ps(src.w); const auto vImgHf = _mm256_set1_ps(src.h); @@ -237,7 +237,7 @@ static void gridsample_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& dst, } } -static void gridsample_bicubic_align0_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +static void gridsample_2d_bicubic_align0_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { const auto vImgWf = _mm256_set1_ps(src.w); const auto vImgHf = _mm256_set1_ps(src.h); @@ -328,7 +328,7 @@ static void gridsample_bicubic_align0_border_blob_pack8(const Mat& src, Mat& dst } } -static void gridsample_bicubic_align1_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +static void gridsample_2d_bicubic_align1_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { const auto vImgWf = _mm256_set1_ps(src.w); const auto vImgHf = _mm256_set1_ps(src.h); @@ -420,7 +420,7 @@ static void gridsample_bicubic_align1_border_blob_pack8(const Mat& src, Mat& dst } } -static void gridsample_bicubic_align0_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { const auto vImgWf = _mm256_set1_ps(src.w); const auto vImgHf = _mm256_set1_ps(src.h); @@ -584,7 +584,7 @@ static void gridsample_bicubic_align0_reflection_blob_pack8(const Mat& src, Mat& } } -static void gridsample_bicubic_align1_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +static void gridsample_2d_bicubic_align1_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { float* outptr = static_cast(dst.data); diff --git a/src/layer/x86/gridsample_bilinear_pack4.h b/src/layer/x86/gridsample_bilinear_pack4.h new file mode 100644 index 00000000000..7b9005057d8 --- /dev/null +++ b/src/layer/x86/gridsample_bilinear_pack4.h @@ -0,0 +1,1456 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void gridsample_2d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + // x + gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); + + // y + gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); + } + + auto x_w = _mm_floor_ps(gx); + auto y_n = _mm_floor_ps(gy); + + auto w = _mm_sub_ps(gx, x_w); + auto e = _mm_sub_ps(v1fp4, w); + auto n = _mm_sub_ps(gy, y_n); + auto s = _mm_sub_ps(v1fp4, n); + + auto nw = _mm_mul_ps(s, e); + auto ne = _mm_mul_ps(s, w); + auto sw = _mm_mul_ps(n, e); + auto se = _mm_mul_ps(n, w); + + auto x0 = _mm_cvtps_epi32(x_w); + auto x1 = _mm_add_epi32(x0, v1ip4); + auto y0 = _mm_cvtps_epi32(y_n); + auto y1 = _mm_add_epi32(y0, v1ip4); + + auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + auto y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHi, y0)); + auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + + auto v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); + auto v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); + auto v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); + auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + auto nw_val = mask_gather_ps(src.channel(q), i_nw_offset, *reinterpret_cast<__m128*>(&v00_in_range)); + auto ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&v10_in_range)); + auto sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&v01_in_range)); + auto se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); + + auto _v = _mm_mul_ps(nw_val, nw); + _v = _mm_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm_comp_fmadd_ps(se_val, se, _v); + + _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static 
void gridsample_2d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + // x + gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); + + // y + gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); + } + + auto x_w = _mm_floor_ps(gx); + auto y_n = _mm_floor_ps(gy); + + auto w = _mm_sub_ps(gx, x_w); + auto e = _mm_sub_ps(v1fp4, w); + auto n = _mm_sub_ps(gy, y_n); + auto s = _mm_sub_ps(v1fp4, n); + + auto nw = _mm_mul_ps(s, e); + auto ne = _mm_mul_ps(s, w); + auto sw = _mm_mul_ps(n, e); + auto se = _mm_mul_ps(n, w); + + auto x0 = _mm_cvtps_epi32(x_w); + auto x1 = _mm_add_epi32(x0, v1ip4); + auto y0 = _mm_cvtps_epi32(y_n); + auto y1 = _mm_add_epi32(y0, v1ip4); + + auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + auto y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHi, y0)); + auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + + auto v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); + auto v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); + auto v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); + auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); + + + for (int q = 0; q < dst.c; q++) + { + auto nw_val = mask_gather_ps(src.channel(q), i_nw_offset, *reinterpret_cast<__m128*>(&v00_in_range)); + auto ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&v10_in_range)); + auto sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&v01_in_range)); + auto se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); + + auto _v = _mm_mul_ps(nw_val, nw); + _v = _mm_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm_comp_fmadd_ps(se_val, se, _v); + + _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int 
y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + // x + gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); + + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + + gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); + + + // y + gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); + + const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + + gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); + } + + auto x_w = _mm_floor_ps(gx); + auto y_n = _mm_floor_ps(gy); + + auto w = _mm_sub_ps(gx, x_w); + auto e = _mm_sub_ps(v1fp4, w); + auto n = _mm_sub_ps(gy, y_n); + auto s = _mm_sub_ps(v1fp4, n); + + auto nw = _mm_mul_ps(s, e); + auto ne = _mm_mul_ps(s, w); + auto sw = _mm_mul_ps(n, e); + auto se = _mm_mul_ps(n, w); + + auto x0 = _mm_cvtps_epi32(x_w); + auto x1 = _mm_add_epi32(x0, v1ip4); + auto y0 = _mm_cvtps_epi32(y_n); + auto y1 = _mm_add_epi32(y0, v1ip4); + + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + + auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); + + + for (int q = 0; q < dst.c; q++) + { + auto nw_val = mask_gather_ps(src.channel(q), i_nw_offset, vn1fp4); + auto ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); + auto sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); + auto se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); + + auto _v = _mm_mul_ps(nw_val, nw); + _v = _mm_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm_comp_fmadd_ps(se_val, se, _v); + + _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + // x + gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); + + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + + gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); + + + // y + gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, 
v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); + + const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + + gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); + } + + auto x_w = _mm_floor_ps(gx); + auto y_n = _mm_floor_ps(gy); + + auto w = _mm_sub_ps(gx, x_w); + auto e = _mm_sub_ps(v1fp4, w); + auto n = _mm_sub_ps(gy, y_n); + auto s = _mm_sub_ps(v1fp4, n); + + auto nw = _mm_mul_ps(s, e); + auto ne = _mm_mul_ps(s, w); + auto sw = _mm_mul_ps(n, e); + auto se = _mm_mul_ps(n, w); + + auto x0 = _mm_cvtps_epi32(x_w); + auto x1 = _mm_add_epi32(x0, v1ip4); + auto y0 = _mm_cvtps_epi32(y_n); + auto y1 = _mm_add_epi32(y0, v1ip4); + + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + + auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + auto nw_val = mask_gather_ps(src.channel(q), i_nw_offset, vn1fp4); + auto ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); + auto sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); + auto se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); + + auto _v = _mm_mul_ps(nw_val, nw); + _v = _mm_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm_comp_fmadd_ps(se_val, se, _v); + + _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_bilinear_align0_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + // x + gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); + + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + + auto v0p5fp4 = _mm_set1_ps(0.5f); + gx = _mm_add_ps(gx, v0p5fp4); + + gx = _mm_and_ps(gx, *(__m128*)_ps256_inv_sign_mask); + + auto reflectx_v = _mm_and_ps(_mm_sub_ps(gx, vImgWf), *(__m128*)_ps256_inv_sign_mask); + gx = _mm_sub_ps(vImgWf, reflectx_v); + + gx = _mm_sub_ps(gx, v0p5fp4); + + _mm_sub_ps(gx, v0p5fp4); + + gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); + + + // y + gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); + + const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + + gy = _mm_add_ps(gy, v0p5fp4); + + gy = _mm_and_ps(gy, *(__m128*)_ps256_inv_sign_mask); + + auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, vImgHf), *(__m128*)_ps256_inv_sign_mask); + gy = 
_mm_sub_ps(vImgHf, reflecty_v); + + gy = _mm_sub_ps(gy, v0p5fp4); + + _mm_sub_ps(gy, v0p5fp4); + + gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); + } + + auto x_w = _mm_floor_ps(gx); + auto y_n = _mm_floor_ps(gy); + + auto w = _mm_sub_ps(gx, x_w); + auto e = _mm_sub_ps(v1fp4, w); + auto n = _mm_sub_ps(gy, y_n); + auto s = _mm_sub_ps(v1fp4, n); + + auto nw = _mm_mul_ps(s, e); + auto ne = _mm_mul_ps(s, w); + auto sw = _mm_mul_ps(n, e); + auto se = _mm_mul_ps(n, w); + + auto x0 = _mm_cvtps_epi32(x_w); + auto x1 = _mm_add_epi32(x0, v1ip4); + auto y0 = _mm_cvtps_epi32(y_n); + auto y1 = _mm_add_epi32(y0, v1ip4); + + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + + auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + auto nw_val = mask_gather_ps(src.channel(q), i_nw_offset, vn1fp4); + auto ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); + auto sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); + auto se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); + + auto _v = _mm_mul_ps(nw_val, nw); + _v = _mm_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm_comp_fmadd_ps(se_val, se, _v); + + _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_bilinear_align1_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + // x + gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); + + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + + gx = _mm_and_ps(gx, *(__m128*)_ps256_inv_sign_mask); + + auto reflectx_v = _mm_and_ps(_mm_sub_ps(gx, border_x), *(__m128*)_ps256_inv_sign_mask); + gx = _mm_sub_ps(border_x, reflectx_v); + + + // y + gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); + + const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + + gy = _mm_and_ps(gy, *(__m128*)_ps256_inv_sign_mask); + + auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps256_inv_sign_mask); + gy = _mm_sub_ps(border_y, reflecty_v); + } + + auto x_w = _mm_floor_ps(gx); + auto y_n = _mm_floor_ps(gy); + + auto w = _mm_sub_ps(gx, x_w); + auto e = _mm_sub_ps(v1fp4, w); + auto n = _mm_sub_ps(gy, y_n); + auto s = 
_mm_sub_ps(v1fp4, n); + + auto nw = _mm_mul_ps(s, e); + auto ne = _mm_mul_ps(s, w); + auto sw = _mm_mul_ps(n, e); + auto se = _mm_mul_ps(n, w); + + auto x0 = _mm_cvtps_epi32(x_w); + auto x1 = _mm_add_epi32(x0, v1ip4); + auto y0 = _mm_cvtps_epi32(y_n); + auto y1 = _mm_add_epi32(y0, v1ip4); + + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + + auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); + auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + auto nw_val = mask_gather_ps(src.channel(q), i_nw_offset, vn1fp4); + auto ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); + auto sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); + auto se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); + + auto _v = _mm_mul_ps(nw_val, nw); + _v = _mm_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm_comp_fmadd_ps(se_val, se, _v); + + _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + + +static void gridsample_3d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgDf = _mm_set1_ps(src.d); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + const auto vImgDi = _mm_set1_epi32(src.d); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + // x + gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); + + // y + gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); + + // z + gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); + } + + auto x_w = _mm_floor_ps(gx); + auto y_n = _mm_floor_ps(gy); + auto z_t = _mm_floor_ps(gz); + + auto w = _mm_sub_ps(gx, x_w); + auto e = _mm_sub_ps(v1fp4, w); + auto n = _mm_sub_ps(gy, y_n); + auto s = _mm_sub_ps(v1fp4, n); + auto t = _mm_sub_ps(gz, z_t); + auto b = _mm_sub_ps(v1fp4, t); + + __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + auto nw = _mm_mul_ps(s, e); + auto ne = _mm_mul_ps(s, w); + auto sw = _mm_mul_ps(n, e); + auto se = _mm_mul_ps(n, w); + + tnw = _mm_mul_ps(b, nw); + tne = _mm_mul_ps(b, ne); + tsw = _mm_mul_ps(b, sw); + tse = _mm_mul_ps(b, se); + + bnw = _mm_mul_ps(t, nw); + bne = _mm_mul_ps(t, ne); + bsw = _mm_mul_ps(t, sw); + bse = _mm_mul_ps(t, se); + } + + auto x0 = _mm_cvtps_epi32(x_w); + auto x1 = 
_mm_add_epi32(x0, v1ip4); + auto y0 = _mm_cvtps_epi32(y_n); + auto y1 = _mm_add_epi32(y0, v1ip4); + auto z0 = _mm_cvtps_epi32(z_t); + auto z1 = _mm_add_epi32(z0, v1ip4); + + auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + auto y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHi, y0)); + auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + auto z0_in_range = _mm_and_si128(_mm_cmpgt_epi32(z0, vn1ip4), _mm_cmpgt_epi32(vImgDi, z0)); + auto z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); + + __m128i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + auto v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); + auto v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); + auto v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); + auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + v000_in_range = _mm_and_si128(v00_in_range, z0_in_range); + v010_in_range = _mm_and_si128(v01_in_range, z0_in_range); + v100_in_range = _mm_and_si128(v10_in_range, z0_in_range); + v110_in_range = _mm_and_si128(v11_in_range, z0_in_range); + + v001_in_range = _mm_and_si128(v00_in_range, z1_in_range); + v011_in_range = _mm_and_si128(v01_in_range, z1_in_range); + v101_in_range = _mm_and_si128(v10_in_range, z1_in_range); + v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); + } + + // (W*H*z + W*y + x) * elempack + vec(8) + auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0) + , _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); + auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); + + auto i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + auto i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); + auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + auto tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, *reinterpret_cast<__m128*>(&v000_in_range)); + auto tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&v100_in_range)); + auto tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&v010_in_range)); + auto tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); + + auto bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&v001_in_range)); + auto bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); + auto bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); + auto bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); + + auto _v = _mm_mul_ps(tnw_val, tnw); + _v = _mm_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm_comp_fmadd_ps(bnw_val, bnw, _v); + _v = 
_mm_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm_comp_fmadd_ps(bse_val, bse, _v); + + _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgDf = _mm_set1_ps(src.d); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + const auto vImgDi = _mm_set1_epi32(src.d); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + // x + gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); + + // y + gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); + + // z + gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); + } + + auto x_w = _mm_floor_ps(gx); + auto y_n = _mm_floor_ps(gy); + auto z_t = _mm_floor_ps(gz); + + auto w = _mm_sub_ps(gx, x_w); + auto e = _mm_sub_ps(v1fp4, w); + auto n = _mm_sub_ps(gy, y_n); + auto s = _mm_sub_ps(v1fp4, n); + auto t = _mm_sub_ps(gz, z_t); + auto b = _mm_sub_ps(v1fp4, t); + + __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + auto nw = _mm_mul_ps(s, e); + auto ne = _mm_mul_ps(s, w); + auto sw = _mm_mul_ps(n, e); + auto se = _mm_mul_ps(n, w); + + tnw = _mm_mul_ps(b, nw); + tne = _mm_mul_ps(b, ne); + tsw = _mm_mul_ps(b, sw); + tse = _mm_mul_ps(b, se); + + bnw = _mm_mul_ps(t, nw); + bne = _mm_mul_ps(t, ne); + bsw = _mm_mul_ps(t, sw); + bse = _mm_mul_ps(t, se); + } + + auto x0 = _mm_cvtps_epi32(x_w); + auto x1 = _mm_add_epi32(x0, v1ip4); + auto y0 = _mm_cvtps_epi32(y_n); + auto y1 = _mm_add_epi32(y0, v1ip4); + auto z0 = _mm_cvtps_epi32(z_t); + auto z1 = _mm_add_epi32(z0, v1ip4); + + auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + auto y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHi, y0)); + auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + auto z0_in_range = _mm_and_si128(_mm_cmpgt_epi32(z0, vn1ip4), _mm_cmpgt_epi32(vImgDi, z0)); + auto z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); + + __m128i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + auto v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); + auto v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); + auto v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); + auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + v000_in_range = _mm_and_si128(v00_in_range, z0_in_range); + v010_in_range = _mm_and_si128(v01_in_range, z0_in_range); + v100_in_range = _mm_and_si128(v10_in_range, z0_in_range); + v110_in_range = _mm_and_si128(v11_in_range, 
z0_in_range); + + v001_in_range = _mm_and_si128(v00_in_range, z1_in_range); + v011_in_range = _mm_and_si128(v01_in_range, z1_in_range); + v101_in_range = _mm_and_si128(v10_in_range, z1_in_range); + v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); + } + + // (W*H*z + W*y + x) * elempack + vec(8) + auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0) + , _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); + auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); + + auto i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + auto i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); + auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + auto tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, *reinterpret_cast<__m128*>(&v000_in_range)); + auto tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&v100_in_range)); + auto tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&v010_in_range)); + auto tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); + + auto bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&v001_in_range)); + auto bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); + auto bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); + auto bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); + + auto _v = _mm_mul_ps(tnw_val, tnw); + _v = _mm_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm_comp_fmadd_ps(bse_val, bse, _v); + + _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgDf = _mm_set1_ps(src.d); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + const auto vImgDi = _mm_set1_epi32(src.d); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + // x + gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); + + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + + gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); 
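+                    // gx has been unnormalized with the align_corners=false mapping ((x + 1) * W - 1) / 2 and clamped to [0, W - 1] for border padding
+                    // (the align_corners=true variants below map with (x + 1) / 2 * (W - 1) instead)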
+ + + // y + gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); + + const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + + gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); + + + // z + gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); + + const auto border_z = _mm_sub_ps(vImgDf, v1fp4); + + gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); + } + + auto x_w = _mm_floor_ps(gx); + auto y_n = _mm_floor_ps(gy); + auto z_t = _mm_floor_ps(gz); + + auto w = _mm_sub_ps(gx, x_w); + auto e = _mm_sub_ps(v1fp4, w); + auto n = _mm_sub_ps(gy, y_n); + auto s = _mm_sub_ps(v1fp4, n); + auto t = _mm_sub_ps(gz, z_t); + auto b = _mm_sub_ps(v1fp4, t); + + __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + auto nw = _mm_mul_ps(s, e); + auto ne = _mm_mul_ps(s, w); + auto sw = _mm_mul_ps(n, e); + auto se = _mm_mul_ps(n, w); + + tnw = _mm_mul_ps(b, nw); + tne = _mm_mul_ps(b, ne); + tsw = _mm_mul_ps(b, sw); + tse = _mm_mul_ps(b, se); + + bnw = _mm_mul_ps(t, nw); + bne = _mm_mul_ps(t, ne); + bsw = _mm_mul_ps(t, sw); + bse = _mm_mul_ps(t, se); + } + + auto x0 = _mm_cvtps_epi32(x_w); + auto x1 = _mm_add_epi32(x0, v1ip4); + auto y0 = _mm_cvtps_epi32(y_n); + auto y1 = _mm_add_epi32(y0, v1ip4); + auto z0 = _mm_cvtps_epi32(z_t); + auto z1 = _mm_add_epi32(z0, v1ip4); + + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + auto z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); + + __m128i v110_in_range, v011_in_range, v101_in_range, v111_in_range; + { + auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + v110_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + v011_in_range = _mm_and_si128(y1_in_range, z1_in_range); + v101_in_range = _mm_and_si128(x1_in_range, z1_in_range); + v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); + } + + // (W*H*z + W*y + x) * elempack + vec(8) + auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0) + , _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); + auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); + + auto i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + auto i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); + auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); + + + for (int q = 0; q < dst.c; q++) + { + auto tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); + auto tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); + auto tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); + auto tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); + + auto bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range)); + auto bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); + auto bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); + 
auto bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); + + auto _v = _mm_mul_ps(tnw_val, tnw); + _v = _mm_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm_comp_fmadd_ps(bse_val, bse, _v); + + _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgDf = _mm_set1_ps(src.d); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + const auto vImgDi = _mm_set1_epi32(src.d); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + // x + gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); + + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + + gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); + + + // y + gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); + + const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + + gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); + + + // z + gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); + + const auto border_z = _mm_sub_ps(vImgDf, v1fp4); + + gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); + } + + auto x_w = _mm_floor_ps(gx); + auto y_n = _mm_floor_ps(gy); + auto z_t = _mm_floor_ps(gz); + + auto w = _mm_sub_ps(gx, x_w); + auto e = _mm_sub_ps(v1fp4, w); + auto n = _mm_sub_ps(gy, y_n); + auto s = _mm_sub_ps(v1fp4, n); + auto t = _mm_sub_ps(gz, z_t); + auto b = _mm_sub_ps(v1fp4, t); + + __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + auto nw = _mm_mul_ps(s, e); + auto ne = _mm_mul_ps(s, w); + auto sw = _mm_mul_ps(n, e); + auto se = _mm_mul_ps(n, w); + + tnw = _mm_mul_ps(b, nw); + tne = _mm_mul_ps(b, ne); + tsw = _mm_mul_ps(b, sw); + tse = _mm_mul_ps(b, se); + + bnw = _mm_mul_ps(t, nw); + bne = _mm_mul_ps(t, ne); + bsw = _mm_mul_ps(t, sw); + bse = _mm_mul_ps(t, se); + } + + auto x0 = _mm_cvtps_epi32(x_w); + auto x1 = _mm_add_epi32(x0, v1ip4); + auto y0 = _mm_cvtps_epi32(y_n); + auto y1 = _mm_add_epi32(y0, v1ip4); + auto z0 = _mm_cvtps_epi32(z_t); + auto z1 = _mm_add_epi32(z0, v1ip4); + + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + auto z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); + + __m128i v110_in_range, v011_in_range, v101_in_range, v111_in_range; + { + auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + v110_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + v011_in_range = 
_mm_and_si128(y1_in_range, z1_in_range); + v101_in_range = _mm_and_si128(x1_in_range, z1_in_range); + v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); + } + + // (W*H*z + W*y + x) * elempack + vec(8) + auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0) + , _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); + auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); + + auto i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + auto i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); + auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); + + + for (int q = 0; q < dst.c; q++) + { + auto tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); + auto tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); + auto tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); + auto tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); + + auto bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range)); + auto bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); + auto bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); + auto bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); + + auto _v = _mm_mul_ps(tnw_val, tnw); + _v = _mm_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm_comp_fmadd_ps(bse_val, bse, _v); + + _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgDf = _mm_set1_ps(src.d); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + const auto vImgDi = _mm_set1_epi32(src.d); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + // x + gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + + auto v0p5fp4 = _mm_set1_ps(0.5f); + gx = _mm_add_ps(gx, v0p5fp4); + + gx = _mm_and_ps(gx, *(__m128*)_ps256_inv_sign_mask); + + auto reflectx_v = _mm_and_ps(_mm_sub_ps(gx, vImgWf), 
*(__m128*)_ps256_inv_sign_mask); + gx = _mm_sub_ps(vImgWf, reflectx_v); + + gx = _mm_sub_ps(gx, v0p5fp4); + + _mm_sub_ps(gx, v0p5fp4); + + gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); + + + // y + gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); + const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + + gy = _mm_add_ps(gy, v0p5fp4); + + gy = _mm_and_ps(gy, *(__m128*)_ps256_inv_sign_mask); + + auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, vImgHf), *(__m128*)_ps256_inv_sign_mask); + gy = _mm_sub_ps(vImgHf, reflecty_v); + + gy = _mm_sub_ps(gy, v0p5fp4); + + _mm_sub_ps(gy, v0p5fp4); + + gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); + + + // z + gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); + const auto border_z = _mm_sub_ps(vImgDf, v1fp4); + + gz = _mm_add_ps(gz, v0p5fp4); + + gz = _mm_and_ps(gz, *(__m128*)_ps256_inv_sign_mask); + + auto reflectz_v = _mm_and_ps(_mm_sub_ps(gz, vImgDf), *(__m128*)_ps256_inv_sign_mask); + gz = _mm_sub_ps(vImgDf, reflectz_v); + + gz = _mm_sub_ps(gz, v0p5fp4); + + _mm_sub_ps(gz, v0p5fp4); + + gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); + } + + auto x_w = _mm_floor_ps(gx); + auto y_n = _mm_floor_ps(gy); + auto z_t = _mm_floor_ps(gz); + + auto w = _mm_sub_ps(gx, x_w); + auto e = _mm_sub_ps(v1fp4, w); + auto n = _mm_sub_ps(gy, y_n); + auto s = _mm_sub_ps(v1fp4, n); + auto t = _mm_sub_ps(gz, z_t); + auto b = _mm_sub_ps(v1fp4, t); + + __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + auto nw = _mm_mul_ps(s, e); + auto ne = _mm_mul_ps(s, w); + auto sw = _mm_mul_ps(n, e); + auto se = _mm_mul_ps(n, w); + + tnw = _mm_mul_ps(b, nw); + tne = _mm_mul_ps(b, ne); + tsw = _mm_mul_ps(b, sw); + tse = _mm_mul_ps(b, se); + + bnw = _mm_mul_ps(t, nw); + bne = _mm_mul_ps(t, ne); + bsw = _mm_mul_ps(t, sw); + bse = _mm_mul_ps(t, se); + } + + auto x0 = _mm_cvtps_epi32(x_w); + auto x1 = _mm_add_epi32(x0, v1ip4); + auto y0 = _mm_cvtps_epi32(y_n); + auto y1 = _mm_add_epi32(y0, v1ip4); + auto z0 = _mm_cvtps_epi32(z_t); + auto z1 = _mm_add_epi32(z0, v1ip4); + + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + auto z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); + + __m128i v110_in_range, v011_in_range, v101_in_range, v111_in_range; + { + auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + v110_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + v011_in_range = _mm_and_si128(y1_in_range, z1_in_range); + v101_in_range = _mm_and_si128(x1_in_range, z1_in_range); + v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); + } + + // (W*H*z + W*y + x) * elempack + vec(8) + auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0) + , _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); + auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); + + auto i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + auto i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); + auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + auto i_bse_offset = _mm_add_epi32(i_bsw_offset, 
vElempacki); + + + for (int q = 0; q < dst.c; q++) + { + auto tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); + auto tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); + auto tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); + auto tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); + + auto bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range)); + auto bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); + auto bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); + auto bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); + + auto _v = _mm_mul_ps(tnw_val, tnw); + _v = _mm_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm_comp_fmadd_ps(bse_val, bse, _v); + + _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_bilinear_align1_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgDf = _mm_set1_ps(src.d); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + const auto vImgDi = _mm_set1_epi32(src.d); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + // x + gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + + gx = _mm_and_ps(gx, *(__m128*)_ps256_inv_sign_mask); + + auto reflectx_v = _mm_and_ps(_mm_sub_ps(gx, border_x), *(__m128*)_ps256_inv_sign_mask); + gx = _mm_sub_ps(border_x, reflectx_v); + + + // y + gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); + const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + + gy = _mm_and_ps(gy, *(__m128*)_ps256_inv_sign_mask); + + auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps256_inv_sign_mask); + gy = _mm_sub_ps(border_y, reflecty_v); + + + // z + gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); + const auto border_z = _mm_sub_ps(vImgDf, v1fp4); + + gz = _mm_and_ps(gz, *(__m128*)_ps256_inv_sign_mask); + + auto reflectz_v = _mm_and_ps(_mm_sub_ps(gz, border_z), *(__m128*)_ps256_inv_sign_mask); + gz = _mm_sub_ps(border_z, reflectz_v); + } + + auto x_w = _mm_floor_ps(gx); + auto y_n = _mm_floor_ps(gy); + auto z_t = _mm_floor_ps(gz); + + auto w = _mm_sub_ps(gx, x_w); + auto e = _mm_sub_ps(v1fp4, w); + auto n = _mm_sub_ps(gy, y_n); + auto s = _mm_sub_ps(v1fp4, n); + auto t = _mm_sub_ps(gz, z_t); + auto b = 
_mm_sub_ps(v1fp4, t); + + __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + auto nw = _mm_mul_ps(s, e); + auto ne = _mm_mul_ps(s, w); + auto sw = _mm_mul_ps(n, e); + auto se = _mm_mul_ps(n, w); + + tnw = _mm_mul_ps(b, nw); + tne = _mm_mul_ps(b, ne); + tsw = _mm_mul_ps(b, sw); + tse = _mm_mul_ps(b, se); + + bnw = _mm_mul_ps(t, nw); + bne = _mm_mul_ps(t, ne); + bsw = _mm_mul_ps(t, sw); + bse = _mm_mul_ps(t, se); + } + + auto x0 = _mm_cvtps_epi32(x_w); + auto x1 = _mm_add_epi32(x0, v1ip4); + auto y0 = _mm_cvtps_epi32(y_n); + auto y1 = _mm_add_epi32(y0, v1ip4); + auto z0 = _mm_cvtps_epi32(z_t); + auto z1 = _mm_add_epi32(z0, v1ip4); + + auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + auto z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); + + __m128i v110_in_range, v011_in_range, v101_in_range, v111_in_range; + { + auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + v110_in_range = _mm_and_si128(x1_in_range, y1_in_range); + + v011_in_range = _mm_and_si128(y1_in_range, z1_in_range); + v101_in_range = _mm_and_si128(x1_in_range, z1_in_range); + v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); + } + + // (W*H*z + W*y + x) * elempack + vec(8) + auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0) + , _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); + auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); + + auto i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + auto i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); + auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); + + + for (int q = 0; q < dst.c; q++) + { + auto tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); + auto tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); + auto tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); + auto tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); + + auto bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range)); + auto bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); + auto bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); + auto bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); + + auto _v = _mm_mul_ps(tnw_val, tnw); + _v = _mm_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm_comp_fmadd_ps(bse_val, bse, _v); + + _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} \ No newline at end of file diff --git a/src/layer/x86/gridsample_bilinear_pack8.h b/src/layer/x86/gridsample_bilinear_pack8.h index 
24c1d09a216..12574051fb4 100644 --- a/src/layer/x86/gridsample_bilinear_pack8.h +++ b/src/layer/x86/gridsample_bilinear_pack8.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -static void gridsample_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +static void gridsample_2d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { const auto vImgWf = _mm256_set1_ps(src.w); const auto vImgHf = _mm256_set1_ps(src.h); @@ -95,7 +95,7 @@ static void gridsample_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& dst } } -static void gridsample_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +static void gridsample_2d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { const auto vImgWf = _mm256_set1_ps(src.w); const auto vImgHf = _mm256_set1_ps(src.h); @@ -179,7 +179,7 @@ static void gridsample_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& dst } } -static void gridsample_bilinear_align0_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +static void gridsample_2d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { const auto vImgWf = _mm256_set1_ps(src.w); const auto vImgHf = _mm256_set1_ps(src.h); @@ -267,7 +267,7 @@ static void gridsample_bilinear_align0_border_blob_pack8(const Mat& src, Mat& ds } } -static void gridsample_bilinear_align1_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +static void gridsample_2d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { const auto vImgWf = _mm256_set1_ps(src.w); const auto vImgHf = _mm256_set1_ps(src.h); @@ -354,7 +354,7 @@ static void gridsample_bilinear_align1_border_blob_pack8(const Mat& src, Mat& ds } } -static void gridsample_bilinear_align0_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +static void gridsample_2d_bilinear_align0_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { const auto vImgWf = _mm256_set1_ps(src.w); const auto vImgHf = _mm256_set1_ps(src.h); @@ -464,10 +464,8 @@ static void gridsample_bilinear_align0_reflection_blob_pack8(const Mat& src, Mat } } -static void gridsample_bilinear_align1_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +static void gridsample_2d_bilinear_align1_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - float* outptr = static_cast(dst.data); - const auto vImgWf = _mm256_set1_ps(src.w); const auto vImgHf = _mm256_set1_ps(src.h); const auto vImgWi = _mm256_set1_epi32(src.w); @@ -557,4 +555,896 @@ static void gridsample_bilinear_align1_reflection_blob_pack8(const Mat& src, Mat } } } +} + + +static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgDf = _mm256_set1_ps(src.d); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + const auto vImgDi = _mm256_set1_epi32(src.d); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + 
for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + + // z + gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, v1fp8), vImgDf, v1fp8), two); + } + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + auto z_t = _mm256_floor_ps(gz); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + auto t = _mm256_sub_ps(gz, z_t); + auto b = _mm256_sub_ps(v1fp8, t); + + __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + tnw = _mm256_mul_ps(b, nw); + tne = _mm256_mul_ps(b, ne); + tsw = _mm256_mul_ps(b, sw); + tse = _mm256_mul_ps(b, se); + + bnw = _mm256_mul_ps(t, nw); + bne = _mm256_mul_ps(t, ne); + bsw = _mm256_mul_ps(t, sw); + bse = _mm256_mul_ps(t, se); + } + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + auto z0 = _mm256_cvtps_epi32(z_t); + auto z1 = _mm256_add_epi32(z0, v1ip8); + + auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + auto z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z0)); + auto z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); + + __m256i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + v000_in_range = _mm256_and_si256(v00_in_range, z0_in_range); + v010_in_range = _mm256_and_si256(v01_in_range, z0_in_range); + v100_in_range = _mm256_and_si256(v10_in_range, z0_in_range); + v110_in_range = _mm256_and_si256(v11_in_range, z0_in_range); + + v001_in_range = _mm256_and_si256(v00_in_range, z1_in_range); + v011_in_range = _mm256_and_si256(v01_in_range, z1_in_range); + v101_in_range = _mm256_and_si256(v10_in_range, z1_in_range); + v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); + } + + // (W*H*z + W*y + x) * elempack + vec(8) + auto i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 
2, 1, 0)); + auto i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); + auto i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); + + auto i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + auto i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); + auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, *reinterpret_cast<__m256*>(&v000_in_range), sizeof(float)); + auto tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&v100_in_range), sizeof(float)); + auto tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&v010_in_range), sizeof(float)); + auto tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); + + auto bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&v001_in_range), sizeof(float)); + auto bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); + auto bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); + auto bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(tnw_val, tnw); + _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); + + _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgDf = _mm256_set1_ps(src.d); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + const auto vImgDi = _mm256_set1_epi32(src.d); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + + // z + gz = 
_mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, v1fp8), two), _mm256_sub_ps(vImgDf, v1fp8)); + } + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + auto z_t = _mm256_floor_ps(gz); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + auto t = _mm256_sub_ps(gz, z_t); + auto b = _mm256_sub_ps(v1fp8, t); + + __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + tnw = _mm256_mul_ps(b, nw); + tne = _mm256_mul_ps(b, ne); + tsw = _mm256_mul_ps(b, sw); + tse = _mm256_mul_ps(b, se); + + bnw = _mm256_mul_ps(t, nw); + bne = _mm256_mul_ps(t, ne); + bsw = _mm256_mul_ps(t, sw); + bse = _mm256_mul_ps(t, se); + } + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + auto z0 = _mm256_cvtps_epi32(z_t); + auto z1 = _mm256_add_epi32(z0, v1ip8); + + auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + auto z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z0)); + auto z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); + + __m256i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + v000_in_range = _mm256_and_si256(v00_in_range, z0_in_range); + v010_in_range = _mm256_and_si256(v01_in_range, z0_in_range); + v100_in_range = _mm256_and_si256(v10_in_range, z0_in_range); + v110_in_range = _mm256_and_si256(v11_in_range, z0_in_range); + + v001_in_range = _mm256_and_si256(v00_in_range, z1_in_range); + v011_in_range = _mm256_and_si256(v01_in_range, z1_in_range); + v101_in_range = _mm256_and_si256(v10_in_range, z1_in_range); + v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); + } + + // (W*H*z + W*y + x) * elempack + vec(8) + auto i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); + auto i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); + + auto i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + auto i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); + auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + auto tnw_val = 
_mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, *reinterpret_cast<__m256*>(&v000_in_range), sizeof(float)); + auto tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&v100_in_range), sizeof(float)); + auto tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&v010_in_range), sizeof(float)); + auto tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); + + auto bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&v001_in_range), sizeof(float)); + auto bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); + auto bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); + auto bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(tnw_val, tnw); + _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); + + _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgDf = _mm256_set1_ps(src.d); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + const auto vImgDi = _mm256_set1_epi32(src.d); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + + const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + + + // z + gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, v1fp8), vImgDf, v1fp8), two); + + const auto border_z = _mm256_sub_ps(vImgDf, v1fp8); + + gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); + } + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + auto z_t = _mm256_floor_ps(gz); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = 
_mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + auto t = _mm256_sub_ps(gz, z_t); + auto b = _mm256_sub_ps(v1fp8, t); + + __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + tnw = _mm256_mul_ps(b, nw); + tne = _mm256_mul_ps(b, ne); + tsw = _mm256_mul_ps(b, sw); + tse = _mm256_mul_ps(b, se); + + bnw = _mm256_mul_ps(t, nw); + bne = _mm256_mul_ps(t, ne); + bsw = _mm256_mul_ps(t, sw); + bse = _mm256_mul_ps(t, se); + } + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + auto z0 = _mm256_cvtps_epi32(z_t); + auto z1 = _mm256_add_epi32(z0, v1ip8); + + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + auto z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); + + __m256i v110_in_range, v011_in_range, v101_in_range, v111_in_range; + { + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + v110_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + v011_in_range = _mm256_and_si256(y1_in_range, z1_in_range); + v101_in_range = _mm256_and_si256(x1_in_range, z1_in_range); + v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); + } + + // (W*H*z + W*y + x) * elempack + vec(8) + auto i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); + auto i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); + + auto i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + auto i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); + auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + + + for (int q = 0; q < dst.c; q++) + { + auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, vn1fp8, sizeof(float)); + auto tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + auto tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + auto tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); + + auto bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range), sizeof(float)); + auto bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); + auto bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); + auto bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, 
*reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(tnw_val, tnw); + _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); + + _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgDf = _mm256_set1_ps(src.d); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + const auto vImgDi = _mm256_set1_epi32(src.d); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + + const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + + + // z + gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, v1fp8), two), _mm256_sub_ps(vImgDf, v1fp8)); + + const auto border_z = _mm256_sub_ps(vImgDf, v1fp8); + + gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); + } + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + auto z_t = _mm256_floor_ps(gz); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + auto t = _mm256_sub_ps(gz, z_t); + auto b = _mm256_sub_ps(v1fp8, t); + + __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + tnw = _mm256_mul_ps(b, nw); + tne = _mm256_mul_ps(b, ne); + tsw = _mm256_mul_ps(b, sw); + tse = _mm256_mul_ps(b, se); + + bnw = _mm256_mul_ps(t, nw); + bne = _mm256_mul_ps(t, ne); + bsw = _mm256_mul_ps(t, sw); + bse = _mm256_mul_ps(t, se); + } + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + auto z0 = _mm256_cvtps_epi32(z_t); + auto z1 = _mm256_add_epi32(z0, v1ip8); + + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + auto z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); + + __m256i v110_in_range, 
v011_in_range, v101_in_range, v111_in_range; + { + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + v110_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + v011_in_range = _mm256_and_si256(y1_in_range, z1_in_range); + v101_in_range = _mm256_and_si256(x1_in_range, z1_in_range); + v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); + } + + // (W*H*z + W*y + x) * elempack + vec(8) + auto i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); + auto i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); + + auto i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + auto i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); + auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + + + for (int q = 0; q < dst.c; q++) + { + auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, vn1fp8, sizeof(float)); + auto tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + auto tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + auto tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); + + auto bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range), sizeof(float)); + auto bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); + auto bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); + auto bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(tnw_val, tnw); + _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); + + _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgDf = _mm256_set1_ps(src.d); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + const auto vImgDi = _mm256_set1_epi32(src.d); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 
0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + + auto v0p5fp8 = _mm256_set1_ps(0.5f); + gx = _mm256_add_ps(gx, v0p5fp8); + + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); + + auto reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(vImgWf, reflectx_v); + + gx = _mm256_sub_ps(gx, v0p5fp8); + + _mm256_sub_ps(gx, v0p5fp8); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_add_ps(gy, v0p5fp8); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(vImgHf, reflecty_v); + + gy = _mm256_sub_ps(gy, v0p5fp8); + + _mm256_sub_ps(gy, v0p5fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + + + // z + gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, v1fp8), vImgDf, v1fp8), two); + const auto border_z = _mm256_sub_ps(vImgDf, v1fp8); + + gz = _mm256_add_ps(gz, v0p5fp8); + + gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); + + auto reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, vImgDf), *(__m256*)_ps256_inv_sign_mask); + gz = _mm256_sub_ps(vImgDf, reflectz_v); + + gz = _mm256_sub_ps(gz, v0p5fp8); + + _mm256_sub_ps(gz, v0p5fp8); + + gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); + } + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + auto z_t = _mm256_floor_ps(gz); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + auto t = _mm256_sub_ps(gz, z_t); + auto b = _mm256_sub_ps(v1fp8, t); + + __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + tnw = _mm256_mul_ps(b, nw); + tne = _mm256_mul_ps(b, ne); + tsw = _mm256_mul_ps(b, sw); + tse = _mm256_mul_ps(b, se); + + bnw = _mm256_mul_ps(t, nw); + bne = _mm256_mul_ps(t, ne); + bsw = _mm256_mul_ps(t, sw); + bse = _mm256_mul_ps(t, se); + } + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + auto z0 = _mm256_cvtps_epi32(z_t); + auto z1 = _mm256_add_epi32(z0, v1ip8); + + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + auto z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); + + __m256i v110_in_range, v011_in_range, v101_in_range, v111_in_range; + { + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + v110_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + v011_in_range = 
_mm256_and_si256(y1_in_range, z1_in_range); + v101_in_range = _mm256_and_si256(x1_in_range, z1_in_range); + v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); + } + + // (W*H*z + W*y + x) * elempack + vec(8) + auto i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); + auto i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); + + auto i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + auto i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); + auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + + + for (int q = 0; q < dst.c; q++) + { + auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, vn1fp8, sizeof(float)); + auto tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + auto tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + auto tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); + + auto bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range), sizeof(float)); + auto bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); + auto bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); + auto bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(tnw_val, tnw); + _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); + + _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgDf = _mm256_set1_ps(src.d); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + const auto vImgDi = _mm256_set1_epi32(src.d); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy 
= _mm256_set1_ps(gridptr[grid.elempack]); + auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); + + auto reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(border_x, reflectx_v); + + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(border_y, reflecty_v); + + + // z + gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, v1fp8), two), _mm256_sub_ps(vImgDf, v1fp8)); + const auto border_z = _mm256_sub_ps(vImgDf, v1fp8); + + gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); + + auto reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, border_z), *(__m256*)_ps256_inv_sign_mask); + gz = _mm256_sub_ps(border_z, reflectz_v); + } + + auto x_w = _mm256_floor_ps(gx); + auto y_n = _mm256_floor_ps(gy); + auto z_t = _mm256_floor_ps(gz); + + auto w = _mm256_sub_ps(gx, x_w); + auto e = _mm256_sub_ps(v1fp8, w); + auto n = _mm256_sub_ps(gy, y_n); + auto s = _mm256_sub_ps(v1fp8, n); + auto t = _mm256_sub_ps(gz, z_t); + auto b = _mm256_sub_ps(v1fp8, t); + + __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + auto nw = _mm256_mul_ps(s, e); + auto ne = _mm256_mul_ps(s, w); + auto sw = _mm256_mul_ps(n, e); + auto se = _mm256_mul_ps(n, w); + + tnw = _mm256_mul_ps(b, nw); + tne = _mm256_mul_ps(b, ne); + tsw = _mm256_mul_ps(b, sw); + tse = _mm256_mul_ps(b, se); + + bnw = _mm256_mul_ps(t, nw); + bne = _mm256_mul_ps(t, ne); + bsw = _mm256_mul_ps(t, sw); + bse = _mm256_mul_ps(t, se); + } + + auto x0 = _mm256_cvtps_epi32(x_w); + auto x1 = _mm256_add_epi32(x0, v1ip8); + auto y0 = _mm256_cvtps_epi32(y_n); + auto y1 = _mm256_add_epi32(y0, v1ip8); + auto z0 = _mm256_cvtps_epi32(z_t); + auto z1 = _mm256_add_epi32(z0, v1ip8); + + auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + auto z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); + + __m256i v110_in_range, v011_in_range, v101_in_range, v111_in_range; + { + auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + v110_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + v011_in_range = _mm256_and_si256(y1_in_range, z1_in_range); + v101_in_range = _mm256_and_si256(x1_in_range, z1_in_range); + v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); + } + + // (W*H*z + W*y + x) * elempack + vec(8) + auto i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); + auto i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); + + auto i_bnw_offset = 
_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + auto i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); + auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + + + for (int q = 0; q < dst.c; q++) + { + auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, vn1fp8, sizeof(float)); + auto tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + auto tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + auto tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); + + auto bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range), sizeof(float)); + auto bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); + auto bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); + auto bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + + auto _v = _mm256_mul_ps(tnw_val, tnw); + _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); + + _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } } \ No newline at end of file diff --git a/src/layer/x86/gridsample_nearest_pack4.h b/src/layer/x86/gridsample_nearest_pack4.h new file mode 100644 index 00000000000..047ee521c1e --- /dev/null +++ b/src/layer/x86/gridsample_nearest_pack4.h @@ -0,0 +1,836 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
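+
+// All pack4 kernels below evaluate the same scalar recipe on the 4 lanes of one packed
+// element; a plain-C sketch of one output texel (illustrative only, zeros padding shown):
+//
+//   // un-normalize the grid coordinate from [-1, 1] to pixel space
+//   float fx = align_corners ? (gx + 1.f) / 2.f * (W - 1)      // align_corners == 1
+//                            : ((gx + 1.f) * W - 1.f) / 2.f;   // align_corners == 0
+//   int ix = (int)floorf(fx + 0.5f);                           // round to nearest texel
+//   float v = (ix >= 0 && ix < W) ? src[ix * elempack + lane] : 0.f;
+//
+// Border mode clamps fx into [0, W-1] and reflection mode folds it with fabsf() before the
+// rounding step, exactly as the "compute coord" blocks in each function do with intrinsics.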
+ +static void gridsample_2d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + // x + gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); + + // y + gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); + } + + gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + + auto ix = _mm_cvtps_epi32(gx); + auto iy = _mm_cvtps_epi32(gy); + + auto v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), + _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); + + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = mask_gather_ps(src.channel(q), i_offset, *reinterpret_cast<__m128*>(&v_in_range)); + + _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + // x + gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); + + // y + gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); + } + + gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + + auto ix = _mm_cvtps_epi32(gx); + auto iy = _mm_cvtps_epi32(gy); + + auto v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), + _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); + + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = mask_gather_ps(src.channel(q), i_offset, *reinterpret_cast<__m128*>(&v_in_range)); + + _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_nearest_align0_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const 
auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + // x + gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); + + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + + gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); + + + // y + gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); + + const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + + gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); + } + + gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + + auto ix = _mm_cvtps_epi32(gx); + auto iy = _mm_cvtps_epi32(gy); + + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); + + _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_nearest_align1_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + // x + gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); + + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + + gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); + + + // y + gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); + + const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + + gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); + } + + gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + + auto ix = _mm_cvtps_epi32(gx); + auto iy = _mm_cvtps_epi32(gy); + + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); + + _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_nearest_align0_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + 
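+    // Reflection padding with align_corners == 0 reflects around the pixel edges; the
+    // vector code in the loop below is the lane-wise form of this scalar sketch
+    // (illustrative reading, x shown, y handled identically):
+    //   gx = fabsf(gx + 0.5f);
+    //   gx = W - fabsf(gx - W);                  // fold back towards [0, W]
+    //   gx -= 0.5f;
+    //   gx = fminf(W - 1.f, fmaxf(gx, 0.f));     // clamp to a valid texel
+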
+#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + const auto two = _mm_set1_ps(2.f); + gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); + gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); + + gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + + // compute coord + { + // x + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + + auto v0p5fp4 = _mm_set1_ps(0.5f); + gx = _mm_add_ps(gx, v0p5fp4); + + gx = _mm_and_ps(gx, *(__m128*)_ps256_inv_sign_mask); + + auto reflectx_v = _mm_and_ps(_mm_sub_ps(gx, vImgWf), *(__m128*)_ps256_inv_sign_mask); + gx = _mm_sub_ps(vImgWf, reflectx_v); + + gx = _mm_sub_ps(gx, v0p5fp4); + + _mm_sub_ps(gx, v0p5fp4); + + gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); + + + // y + const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + + gy = _mm_add_ps(gy, v0p5fp4); + + gy = _mm_and_ps(gy, *(__m128*)_ps256_inv_sign_mask); + + auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, vImgHf), *(__m128*)_ps256_inv_sign_mask); + gy = _mm_sub_ps(vImgHf, reflecty_v); + + gy = _mm_sub_ps(gy, v0p5fp4); + + _mm_sub_ps(gy, v0p5fp4); + + gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); + } + + auto ix = _mm_cvtps_epi32(gx); + auto iy = _mm_cvtps_epi32(gy); + + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); + + _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_nearest_align1_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + + const auto two = _mm_set1_ps(2.f); + gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); + gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); + + gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + + // compute coord + { + // x + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + + gx = _mm_and_ps(gx, *(__m128*)_ps256_inv_sign_mask); + + auto reflectx_v = _mm_and_ps(_mm_sub_ps(gx, border_x), *(__m128*)_ps256_inv_sign_mask); + gx = _mm_sub_ps(border_x, reflectx_v); + + + // y + const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + + gy = _mm_and_ps(gy, *(__m128*)_ps256_inv_sign_mask); + + auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps256_inv_sign_mask); + gy = _mm_sub_ps(border_y, reflecty_v); + } + + auto ix = _mm_cvtps_epi32(gx); + auto iy = _mm_cvtps_epi32(gy); 
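+            // pack4 offset of texel (ix, iy): (iy * W + ix) * elempack + lane,
+            // where lane = 0..3 comes from _mm_set_epi32(3, 2, 1, 0) below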
+ + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); + + _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + + +static void gridsample_3d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgDf = _mm_set1_ps(src.d); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + const auto vImgDi = _mm_set1_epi32(src.d); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + // x + gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); + + // y + gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); + + // z + gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); + } + + gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + gz = _mm_floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); + + auto ix = _mm_cvtps_epi32(gx); + auto iy = _mm_cvtps_epi32(gy); + auto iz = _mm_cvtps_epi32(gz); + + auto v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), + _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); + v_in_range = _mm_and_si128(v_in_range, _mm_and_si128(_mm_cmpgt_epi32(iz, vn1ip4), _mm_cmpgt_epi32(vImgDi, iz))); + + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz) + , _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = mask_gather_ps(src.channel(q), i_offset, *reinterpret_cast<__m128*>(&v_in_range)); + + _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgDf = _mm_set1_ps(src.d); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + const auto vImgDi = _mm_set1_epi32(src.d); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + // x + gx = 
_mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); + + // y + gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); + + // z + gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); + } + + gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + gz = _mm_floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); + + auto ix = _mm_cvtps_epi32(gx); + auto iy = _mm_cvtps_epi32(gy); + auto iz = _mm_cvtps_epi32(gz); + + auto v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), + _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); + v_in_range = _mm_and_si128(v_in_range, _mm_and_si128(_mm_cmpgt_epi32(iz, vn1ip4), _mm_cmpgt_epi32(vImgDi, iz))); + + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz) + , _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = mask_gather_ps(src.channel(q), i_offset, *reinterpret_cast<__m128*>(&v_in_range)); + + _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_nearest_align0_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgDf = _mm_set1_ps(src.d); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + const auto vImgDi = _mm_set1_epi32(src.d); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + // x + gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); + + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + + gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); + + + // y + gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); + + const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + + gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); + + + // z + gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); + + const auto border_z = _mm_sub_ps(vImgDf, v1fp4); + + gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); + } + + gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + gz = _mm_floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); + + auto ix = _mm_cvtps_epi32(gx); + auto iy = _mm_cvtps_epi32(gy); + auto iz = _mm_cvtps_epi32(gz); + + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz) + , _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); + + _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + 
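+// --------------------------------------------------------------------------
+// Reference sketch (editorial, not used by the kernels in this file): a
+// minimal scalar version of the per-axis coordinate mapping that these
+// nearest kernels vectorize; the same mapping is applied to x, y and z.
+// The function name is an illustrative assumption, not part of the ncnn API,
+// and it assumes floorf()/fabsf() from <math.h> are visible in the including
+// translation unit. Rounding/clamping order differs slightly from some
+// kernels but yields the same sample index.
+static float grid_sample_nearest_coord_ref(float v, int size, int align_corner, int padding_mode)
+{
+    // unnormalize from [-1, 1] to pixel coordinates
+    float x = align_corner ? (v + 1.f) / 2.f * (size - 1)    // -1/+1 -> centers of the corner pixels
+                           : ((v + 1.f) * size - 1.f) / 2.f; // -1/+1 -> edges of the corner pixels
+
+    // round to the nearest sample
+    x = floorf(x + 0.5f);
+
+    if (padding_mode == 3) // reflection: mirror back into the image
+    {
+        if (align_corner)
+        {
+            // reflect across 0 and size-1
+            x = fabsf(x);
+            x = (size - 1) - fabsf(x - (size - 1));
+        }
+        else
+        {
+            // reflect across -0.5 and size-0.5
+            x = fabsf(x + 0.5f);
+            x = size - fabsf(x - size) - 0.5f;
+        }
+    }
+
+    if (padding_mode == 2 || padding_mode == 3) // border / reflection: clamp into the image
+    {
+        x = x < 0.f ? 0.f : (x > size - 1 ? (float)(size - 1) : x);
+    }
+
+    // padding_mode == 1 (zeros): the coordinate is left as-is and the caller
+    // masks out-of-range indices to zero, which is what the masked gathers in
+    // these kernels do.
+    return x;
+}
+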
+static void gridsample_3d_nearest_align1_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgDf = _mm_set1_ps(src.d); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + const auto vImgDi = _mm_set1_epi32(src.d); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const auto two = _mm_set1_ps(2.f); + + // x + gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); + + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + + gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); + + + // y + gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); + + const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + + gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); + + + // z + gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); + + const auto border_z = _mm_sub_ps(vImgDf, v1fp4); + + gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); + } + + gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + gz = _mm_floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); + + auto ix = _mm_cvtps_epi32(gx); + auto iy = _mm_cvtps_epi32(gy); + auto iz = _mm_cvtps_epi32(gz); + + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz) + , _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); + + _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_nearest_align0_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgDf = _mm_set1_ps(src.d); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + const auto vImgDi = _mm_set1_epi32(src.d); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + + const auto two = _mm_set1_ps(2.f); + gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); + gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); + gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); + + gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = _mm_floor_ps(_mm_add_ps(gy, 
_mm_set1_ps(0.5f))); + gz = _mm_floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); + + // compute coord + { + // x + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + + auto v0p5fp4 = _mm_set1_ps(0.5f); + gx = _mm_add_ps(gx, v0p5fp4); + + gx = _mm_and_ps(gx, *(__m128*)_ps256_inv_sign_mask); + + auto reflectx_v = _mm_and_ps(_mm_sub_ps(gx, vImgWf), *(__m128*)_ps256_inv_sign_mask); + gx = _mm_sub_ps(vImgWf, reflectx_v); + + gx = _mm_sub_ps(gx, v0p5fp4); + + _mm_sub_ps(gx, v0p5fp4); + + gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); + + + // y + const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + + gy = _mm_add_ps(gy, v0p5fp4); + + gy = _mm_and_ps(gy, *(__m128*)_ps256_inv_sign_mask); + + auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, vImgHf), *(__m128*)_ps256_inv_sign_mask); + gy = _mm_sub_ps(vImgHf, reflecty_v); + + gy = _mm_sub_ps(gy, v0p5fp4); + + _mm_sub_ps(gy, v0p5fp4); + + gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); + + + // z + const auto border_z = _mm_sub_ps(vImgDf, v1fp4); + + gz = _mm_add_ps(gz, v0p5fp4); + + gz = _mm_and_ps(gz, *(__m128*)_ps256_inv_sign_mask); + + auto reflectz_v = _mm_and_ps(_mm_sub_ps(gz, vImgDf), *(__m128*)_ps256_inv_sign_mask); + gz = _mm_sub_ps(vImgDf, reflectz_v); + + gz = _mm_sub_ps(gz, v0p5fp4); + + _mm_sub_ps(gz, v0p5fp4); + + gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); + } + + auto ix = _mm_cvtps_epi32(gx); + auto iy = _mm_cvtps_epi32(gy); + auto iz = _mm_cvtps_epi32(gz); + + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz) + , _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); + + _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_nearest_align1_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm_set1_ps(src.w); + const auto vImgHf = _mm_set1_ps(src.h); + const auto vImgDf = _mm_set1_ps(src.d); + const auto vImgWi = _mm_set1_epi32(src.w); + const auto vImgHi = _mm_set1_epi32(src.h); + const auto vImgDi = _mm_set1_epi32(src.d); + + const auto vElempacki = _mm_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm_set1_ps(gridptr[0]); + auto gy = _mm_set1_ps(gridptr[grid.elempack]); + auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + + const auto two = _mm_set1_ps(2.f); + gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); + gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); + gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); + + gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + gz = _mm_floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); + + // compute coord + { + // x + const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + + gx = _mm_and_ps(gx, *(__m128*)_ps256_inv_sign_mask); + + auto reflectx_v = _mm_and_ps(_mm_sub_ps(gx, border_x), *(__m128*)_ps256_inv_sign_mask); + gx = _mm_sub_ps(border_x, reflectx_v); + + + // y + const auto border_y = 
_mm_sub_ps(vImgHf, v1fp4); + + gy = _mm_and_ps(gy, *(__m128*)_ps256_inv_sign_mask); + + auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps256_inv_sign_mask); + gy = _mm_sub_ps(border_y, reflecty_v); + + + // z + const auto border_z = _mm_sub_ps(vImgDf, v1fp4); + + gz = _mm_and_ps(gz, *(__m128*)_ps256_inv_sign_mask); + + auto reflectz_v = _mm_and_ps(_mm_sub_ps(gz, border_z), *(__m128*)_ps256_inv_sign_mask); + gz = _mm_sub_ps(border_z, reflectz_v); + } + + auto ix = _mm_cvtps_epi32(gx); + auto iy = _mm_cvtps_epi32(gy); + auto iz = _mm_cvtps_epi32(gz); + + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz) + , _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); + + _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} \ No newline at end of file diff --git a/src/layer/x86/gridsample_nearest_pack8.h b/src/layer/x86/gridsample_nearest_pack8.h index ac50d783133..fd10e64b985 100644 --- a/src/layer/x86/gridsample_nearest_pack8.h +++ b/src/layer/x86/gridsample_nearest_pack8.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -static void gridsample_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +static void gridsample_2d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { const auto vImgWf = _mm256_set1_ps(src.w); const auto vImgHf = _mm256_set1_ps(src.h); @@ -64,7 +64,7 @@ static void gridsample_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& dst, } } -static void gridsample_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +static void gridsample_2d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { const auto vImgWf = _mm256_set1_ps(src.w); const auto vImgHf = _mm256_set1_ps(src.h); @@ -116,7 +116,7 @@ static void gridsample_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& dst, } } -static void gridsample_nearest_align0_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +static void gridsample_2d_nearest_align0_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { const auto vImgWf = _mm256_set1_ps(src.w); const auto vImgHf = _mm256_set1_ps(src.h); @@ -174,7 +174,7 @@ static void gridsample_nearest_align0_border_blob_pack8(const Mat& src, Mat& dst } } -static void gridsample_nearest_align1_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +static void gridsample_2d_nearest_align1_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { const auto vImgWf = _mm256_set1_ps(src.w); const auto vImgHf = _mm256_set1_ps(src.h); @@ -232,7 +232,7 @@ static void gridsample_nearest_align1_border_blob_pack8(const Mat& src, Mat& dst } } -static void gridsample_nearest_align0_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +static void gridsample_2d_nearest_align0_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { const auto vImgWf = _mm256_set1_ps(src.w); const auto vImgHf = _mm256_set1_ps(src.h); @@ -311,10 +311,8 @@ static void 
gridsample_nearest_align0_reflection_blob_pack8(const Mat& src, Mat& } } -static void gridsample_nearest_align1_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +static void gridsample_2d_nearest_align1_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - float* outptr = static_cast(dst.data); - const auto vImgWf = _mm256_set1_ps(src.w); const auto vImgHf = _mm256_set1_ps(src.h); const auto vImgWi = _mm256_set1_epi32(src.w); @@ -373,4 +371,466 @@ static void gridsample_nearest_align1_reflection_blob_pack8(const Mat& src, Mat& } } } +} + + +static void gridsample_3d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgDf = _mm256_set1_ps(src.d); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + const auto vImgDi = _mm256_set1_epi32(src.d); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + + // z + gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, v1fp8), vImgDf, v1fp8), two); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); + + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); + auto iz = _mm256_cvtps_epi32(gz); + + auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, vn1ip8), _mm256_cmpgt_epi32(vImgDi, iz))); + + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz) + , _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + + _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgDf = _mm256_set1_ps(src.d); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + const auto vImgDi = _mm256_set1_epi32(src.d); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for 
num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + + // z + gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, v1fp8), two), _mm256_sub_ps(vImgDf, v1fp8)); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); + + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); + auto iz = _mm256_cvtps_epi32(gz); + + auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, vn1ip8), _mm256_cmpgt_epi32(vImgDi, iz))); + + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz) + , _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + + _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_nearest_align0_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgDf = _mm256_set1_ps(src.d); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + const auto vImgDi = _mm256_set1_epi32(src.d); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + + const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + + + // z + gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, v1fp8), vImgDf, v1fp8), two); + + const auto border_z = 
_mm256_sub_ps(vImgDf, v1fp8); + + gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); + + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); + auto iz = _mm256_cvtps_epi32(gz); + + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz) + , _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + + _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_nearest_align1_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgDf = _mm256_set1_ps(src.d); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + const auto vImgDi = _mm256_set1_epi32(src.d); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const auto two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + + const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + + + // z + gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, v1fp8), two), _mm256_sub_ps(vImgDf, v1fp8)); + + const auto border_z = _mm256_sub_ps(vImgDf, v1fp8); + + gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); + + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); + auto iz = _mm256_cvtps_epi32(gz); + + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz) + , _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + + _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& 
opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgDf = _mm256_set1_ps(src.d); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + const auto vImgDi = _mm256_set1_epi32(src.d); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + + const auto two = _mm256_set1_ps(2.f); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, v1fp8), vImgDf, v1fp8), two); + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); + + // compute coord + { + // x + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + + auto v0p5fp8 = _mm256_set1_ps(0.5f); + gx = _mm256_add_ps(gx, v0p5fp8); + + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); + + auto reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(vImgWf, reflectx_v); + + gx = _mm256_sub_ps(gx, v0p5fp8); + + _mm256_sub_ps(gx, v0p5fp8); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + + // y + const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_add_ps(gy, v0p5fp8); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(vImgHf, reflecty_v); + + gy = _mm256_sub_ps(gy, v0p5fp8); + + _mm256_sub_ps(gy, v0p5fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + + + // z + const auto border_z = _mm256_sub_ps(vImgDf, v1fp8); + + gz = _mm256_add_ps(gz, v0p5fp8); + + gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); + + auto reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, vImgDf), *(__m256*)_ps256_inv_sign_mask); + gz = _mm256_sub_ps(vImgDf, reflectz_v); + + gz = _mm256_sub_ps(gz, v0p5fp8); + + _mm256_sub_ps(gz, v0p5fp8); + + gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); + } + + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); + auto iz = _mm256_cvtps_epi32(gz); + + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz) + , _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + + _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const auto vImgWf = _mm256_set1_ps(src.w); + const auto vImgHf = _mm256_set1_ps(src.h); + const auto vImgDf = 
_mm256_set1_ps(src.d); + const auto vImgWi = _mm256_set1_epi32(src.w); + const auto vImgHi = _mm256_set1_epi32(src.h); + const auto vImgDi = _mm256_set1_epi32(src.d); + + const auto vElempacki = _mm256_set1_epi32(src.elempack); + +#pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + auto gx = _mm256_set1_ps(gridptr[0]); + auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + + const auto two = _mm256_set1_ps(2.f); + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, v1fp8), two), _mm256_sub_ps(vImgDf, v1fp8)); + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); + + // compute coord + { + // x + const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); + + auto reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(border_x, reflectx_v); + + + // y + const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(border_y, reflecty_v); + + + // z + const auto border_z = _mm256_sub_ps(vImgDf, v1fp8); + + gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); + + auto reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, border_z), *(__m256*)_ps256_inv_sign_mask); + gz = _mm256_sub_ps(border_z, reflectz_v); + } + + auto ix = _mm256_cvtps_epi32(gx); + auto iy = _mm256_cvtps_epi32(gy); + auto iz = _mm256_cvtps_epi32(gz); + + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz) + , _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + + _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } } \ No newline at end of file diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index d2e38f96f79..53fffc87b8f 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -107,6 +107,37 @@ static NCNN_FORCEINLINE __m256 get_coord_p8(const __m256& x, const __m256& w, in #endif // __AVX__ const __m128 v1fp4 = _mm_set1_ps(1.0f); +const auto vn1fp4 = _mm_set1_ps(-1.0f); +const auto v1ip4 = _mm_set1_epi32(1); +const auto vn1ip4 = _mm_set1_epi32(-1); + +static NCNN_FORCEINLINE __m128 mask_gather_ps(const float* ptr, __m128i offset, __m128 mask) +{ +#if __AVX__ + __m128 v = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, offset, mask, sizeof(float)); +#else + int offseti[4], maski[4]; + memcpy(offseti, &offset, 4 * sizeof(int)); + memcpy(maski, &mask, 4 * sizeof(int)); + + float data[4]; + for (int i = 0; i < 4; i++) + { + if (maski[i] & 0x01) + { 
+ data[i] = *(ptr + offseti[i]); + } + } + + __m128 v = _mm_loadu_ps(data); +#endif // __AVX__ + + return v; +} + +#include "gridsample_bicubic_pack4.h" +#include "gridsample_bilinear_pack4.h" +#include "gridsample_nearest_pack4.h" static __m128 NCNN_FORCEINLINE grid_sample_unormalize_p4(const __m128& w, const __m128& coordx, int align_corner) @@ -168,52 +199,6 @@ static NCNN_FORCEINLINE __m128 get_coord_p4(const __m128& x, const __m128& w, in return coord; } -static NCNN_FORCEINLINE __m128 cubic_interp1d_p4(const __m128& x0_v, const __m128& x1_v, const __m128& x2_v, const __m128& x3_v, const __m128& tx) -{ - const auto A = _mm_set1_ps(-0.75f); - - const auto x0 = _mm_add_ps(tx, v1fp4); - const auto& x1 = tx; - const auto x2 = _mm_sub_ps(v1fp4, tx); - //const auto x3 = _mm_add_ps(x2, v1fp4); - - const __m128 coeffs0 = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(A, x0), _mm_mul_ps(_mm_set1_ps(5.0f), A)), x0), _mm_mul_ps(_mm_set1_ps(8.0f), A)), x0), _mm_mul_ps(_mm_set1_ps(4), A)); - const __m128 coeffs1 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set1_ps(2.0f)), x1), _mm_add_ps(A, _mm_set1_ps(3.0f))), x1), x1), v1fp4); - const __m128 coeffs2 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set1_ps(2.0f)), x2), _mm_add_ps(A, _mm_set1_ps(3.0f))), x2), x2), v1fp4); - const __m128 coeffs3 = _mm_sub_ps(_mm_sub_ps(_mm_sub_ps(v1fp4, coeffs0), coeffs1), coeffs2); - - auto _v = _mm_mul_ps(coeffs0, x0_v); - _v = _mm_comp_fmadd_ps(coeffs1, x1_v, _v); - _v = _mm_comp_fmadd_ps(coeffs2, x2_v, _v); - _v = _mm_comp_fmadd_ps(coeffs3, x3_v, _v); - - return _v; -} - -static NCNN_FORCEINLINE __m128 mask_gather_ps(const float* ptr, __m128i offset, __m128 mask) -{ -#if __AVX__ - __m128 v = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, offset, mask, sizeof(float)); -#else - int offseti[4], maski[4]; - memcpy(offseti, &offset, 4 * sizeof(int)); - memcpy(maski, &mask, 4 * sizeof(int)); - - float data[4]; - for (int i = 0; i < 4; i++) - { - if (maski[i] & 0x01) - { - data[i] = *(ptr + offseti[i]); - } - } - - __m128 v = _mm_loadu_ps(data); -#endif // __AVX__ - - return v; -} - #endif // __SSE2__ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const @@ -230,31 +215,14 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data); - for (int z = 0; z < outD; z++) - { - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); - - gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); - gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); - gz = get_coord_p8(gz, vImgDf, padding_mode, align_corner); - - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - auto z_t = _mm256_floor_ps(gz); - - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - auto t = _mm256_sub_ps(gz, z_t); - auto b = _mm256_sub_ps(v1fp8, t); - - __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = 
_mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); - - tnw = _mm256_mul_ps(b, nw); - tne = _mm256_mul_ps(b, ne); - tsw = _mm256_mul_ps(b, sw); - tse = _mm256_mul_ps(b, se); - - bnw = _mm256_mul_ps(t, nw); - bne = _mm256_mul_ps(t, ne); - bsw = _mm256_mul_ps(t, sw); - bse = _mm256_mul_ps(t, se); - } - - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - auto z0 = _mm256_cvtps_epi32(z_t); - auto z1 = _mm256_add_epi32(z0, v1ip8); - - auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - auto z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z0)); - auto z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); - - __m256i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - v000_in_range = _mm256_and_si256(v00_in_range, z0_in_range); - v010_in_range = _mm256_and_si256(v01_in_range, z0_in_range); - v100_in_range = _mm256_and_si256(v10_in_range, z0_in_range); - v110_in_range = _mm256_and_si256(v11_in_range, z0_in_range); - - v001_in_range = _mm256_and_si256(v00_in_range, z1_in_range); - v011_in_range = _mm256_and_si256(v01_in_range, z1_in_range); - v101_in_range = _mm256_and_si256(v10_in_range, z1_in_range); - v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); - } - - // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); - auto i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - - auto i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - auto i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); - auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); - - auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tnw_offset, *reinterpret_cast<__m256*>(&v000_in_range), sizeof(float)); - auto tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tne_offset, *reinterpret_cast<__m256*>(&v100_in_range), sizeof(float)); - auto tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tsw_offset, *reinterpret_cast<__m256*>(&v010_in_range), sizeof(float)); - auto tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); - - auto bnw_val = 
_mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bnw_offset, *reinterpret_cast<__m256*>(&v001_in_range), sizeof(float)); - auto bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); - auto bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); - auto bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); - - auto _v = _mm256_mul_ps(tnw_val, tnw); - _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); - - _mm256_storeu_ps(outptr, _v); - - outptr += elempack; - } - } - } + gridsample_3d_bilinear_align0_zeros_blob_pack8(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_3d_bilinear_align1_zeros_blob_pack8(bottom_blob, top_blob, grid, opt); } } - else + else if (padding_mode == 2) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (align_corner == 0) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int z = 0; z < outD; z++) - { - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); - - gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); - gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); - gz = get_coord_p8(gz, vImgDf, padding_mode, align_corner); - - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - auto z_t = _mm256_floor_ps(gz); - - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - auto t = _mm256_sub_ps(gz, z_t); - auto b = _mm256_sub_ps(v1fp8, t); - - __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); - - tnw = _mm256_mul_ps(b, nw); - tne = _mm256_mul_ps(b, ne); - tsw = _mm256_mul_ps(b, sw); - tse = _mm256_mul_ps(b, se); - - bnw = _mm256_mul_ps(t, nw); - bne = _mm256_mul_ps(t, ne); - bsw = _mm256_mul_ps(t, sw); - bse = _mm256_mul_ps(t, se); - } - - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - auto z0 = _mm256_cvtps_epi32(z_t); - auto z1 = _mm256_add_epi32(z0, v1ip8); - - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - auto z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); - - __m256i v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - v110_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - v011_in_range = _mm256_and_si256(y1_in_range, z1_in_range); - 
v101_in_range = _mm256_and_si256(x1_in_range, z1_in_range); - v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); - } - - // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); - auto i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - - auto i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - auto i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); - auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); - - auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tnw_offset, vn1fp8, sizeof(float)); - auto tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - auto tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - auto tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); - - auto bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range), sizeof(float)); - auto bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); - auto bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); - auto bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); - - auto _v = _mm256_mul_ps(tnw_val, tnw); - _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); - - _mm256_storeu_ps(outptr, _v); - - outptr += elempack; - } - } - } + + gridsample_3d_bilinear_align0_border_blob_pack8(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_3d_bilinear_align1_border_blob_pack8(bottom_blob, top_blob, grid, opt); } } + else if (padding_mode == 3) + { + if (align_corner == 0) + { + gridsample_3d_bilinear_align0_reflection_blob_pack8(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_3d_bilinear_align1_reflection_blob_pack8(bottom_blob, top_blob, grid, opt); + } + } + else + { + NCNN_LOGE("gridsample sample_type error\n"); + return -100; + } + } if (sample_type == 2) { if (padding_mode == 1) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (align_corner == 0) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int z = 0; z < outD; z++) - { - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - - 
auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); - - gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); - gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); - gz = get_coord_p8(gz, vImgDf, padding_mode, align_corner); - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); - auto iz = _mm256_cvtps_epi32(gz); - - auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); - v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, vn1ip8), _mm256_cmpgt_epi32(vImgDi, iz))); - - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), - i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); - - _mm256_storeu_ps(outptr, _v); - - outptr += elempack; - } - } - } + gridsample_3d_nearest_align0_zeros_blob_pack8(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_3d_nearest_align1_zeros_blob_pack8(bottom_blob, top_blob, grid, opt); } } - else + else if (padding_mode == 2) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (align_corner == 0) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int z = 0; z < outD; z++) - { - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); - - gx = grid_sample_unormalize_p8(vImgWf, gx, align_corner); - gy = grid_sample_unormalize_p8(vImgHf, gy, align_corner); - gz = grid_sample_unormalize_p8(vImgDf, gz, align_corner); - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - - gx = compute_coord_p8(gx, vImgWf, padding_mode, align_corner); - gy = compute_coord_p8(gy, vImgHf, padding_mode, align_corner); - gz = compute_coord_p8(gz, vImgDf, padding_mode, align_corner); - - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); - auto iz = _mm256_cvtps_epi32(gz); - - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), - i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); - - _mm256_storeu_ps(outptr, _v); - - outptr += elempack; - } - } - } + gridsample_3d_nearest_align0_border_blob_pack8(bottom_blob, top_blob, grid, opt); + } + else + { + 
gridsample_3d_nearest_align1_border_blob_pack8(bottom_blob, top_blob, grid, opt); } } + else if (padding_mode == 3) + { + if (align_corner == 0) + { + gridsample_3d_nearest_align0_reflection_blob_pack8(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_3d_nearest_align1_reflection_blob_pack8(bottom_blob, top_blob, grid, opt); + } + } + else + { + NCNN_LOGE("gridsample sample_type error\n"); + return -100; + } } if (sample_type == 3) @@ -746,9 +462,6 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data); - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - - gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); - gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); - - auto x_w = _mm_floor_ps(gx); - auto y_n = _mm_floor_ps(gy); - - auto w = _mm_sub_ps(gx, x_w); - auto e = _mm_sub_ps(v1fp4, w); - auto n = _mm_sub_ps(gy, y_n); - auto s = _mm_sub_ps(v1fp4, n); - - auto nw = _mm_mul_ps(s, e); - auto ne = _mm_mul_ps(s, w); - auto sw = _mm_mul_ps(n, e); - auto se = _mm_mul_ps(n, w); - - auto x0 = _mm_cvtps_epi32(x_w); - auto x1 = _mm_add_epi32(x0, v1ip4); - auto y0 = _mm_cvtps_epi32(y_n); - auto y1 = _mm_add_epi32(y0, v1ip4); - - auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x0)); - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x1)); - auto y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y0)); - auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y1)); - - auto v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); - auto v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); - auto v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); - auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - // (W*y + x) * elempack + vec(8) - auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWip4), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); - auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); - auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); - - auto nw_val = mask_gather_ps(ptr, i_nw_offset, *reinterpret_cast<__m128*>(&v00_in_range)); - auto ne_val = mask_gather_ps(ptr, i_ne_offset, *reinterpret_cast<__m128*>(&v10_in_range)); - auto sw_val = mask_gather_ps(ptr, i_sw_offset, *reinterpret_cast<__m128*>(&v01_in_range)); - auto se_val = mask_gather_ps(ptr, i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); - - auto _v = _mm_mul_ps(nw_val, nw); - _v = _mm_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm_comp_fmadd_ps(se_val, se, _v); - - _mm_storeu_ps(outptr, _v); - - outptr += elempack; - } - } + gridsample_2d_bilinear_align0_zeros_blob_pack4(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_2d_bilinear_align1_zeros_blob_pack4(bottom_blob, top_blob, grid, opt); } } - else //border reflection + else if (padding_mode == 2) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (align_corner == 0) { - float* outptr = top_blob.channel(q); - const float* ptr = 
static_cast(bottom_blob.channel(q).data); - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - - gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); - gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); - - auto x_w = _mm_floor_ps(gx); - auto y_n = _mm_floor_ps(gy); - - auto w = _mm_sub_ps(gx, x_w); - auto e = _mm_sub_ps(v1fp4, w); - auto n = _mm_sub_ps(gy, y_n); - auto s = _mm_sub_ps(v1fp4, n); - - auto nw = _mm_mul_ps(s, e); - auto ne = _mm_mul_ps(s, w); - auto sw = _mm_mul_ps(n, e); - auto se = _mm_mul_ps(n, w); - - auto x0 = _mm_cvtps_epi32(x_w); - auto x1 = _mm_add_epi32(x0, v1ip4); - auto y0 = _mm_cvtps_epi32(y_n); - auto y1 = _mm_add_epi32(y0, v1ip4); - - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x1)); - auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y1)); - - auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWip4), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); - auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); - auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); - - auto nw_val = mask_gather_ps(ptr, i_nw_offset, vn1fp4); - auto ne_val = mask_gather_ps(ptr, i_ne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); - auto sw_val = mask_gather_ps(ptr, i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); - auto se_val = mask_gather_ps(ptr, i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); - - auto _v = _mm_mul_ps(nw_val, nw); - _v = _mm_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm_comp_fmadd_ps(se_val, se, _v); - _mm_storeu_ps(outptr, _v); - - outptr += elempack; - } - } + gridsample_2d_bilinear_align0_border_blob_pack4(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_2d_bilinear_align1_border_blob_pack4(bottom_blob, top_blob, grid, opt); + } + } + else if (padding_mode == 3) + { + if (align_corner == 0) + { + gridsample_2d_bilinear_align0_reflection_blob_pack4(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_2d_bilinear_align1_reflection_blob_pack4(bottom_blob, top_blob, grid, opt); } } + else + { + NCNN_LOGE("gridsample padding_mode error\n"); + return -100; + } } if (sample_type == 2) { - if (padding_mode == 1) //zeros + if (padding_mode == 1) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (align_corner == 0) { - float* outptr = top_blob.channel(q); - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - - gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); - gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); - - gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - - auto ix = _mm_cvtps_epi32(gx); - auto iy = _mm_cvtps_epi32(gy); - - auto v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWip4, ix)), - _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), 
_mm_cmpgt_epi32(vImgHip4, iy))); - - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWip4), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); - - auto _v = mask_gather_ps(static_cast(bottom_blob.channel(q).data), - i_offset, *reinterpret_cast<__m128*>(&v_in_range)); - - _mm_storeu_ps(outptr, _v); - - outptr += elempack; - } - } + gridsample_2d_nearest_align0_zeros_blob_pack4(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_2d_nearest_align1_zeros_blob_pack4(bottom_blob, top_blob, grid, opt); } } - else //border reflection + else if (padding_mode == 2) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (align_corner == 0) { - float* outptr = top_blob.channel(q); - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - - gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); - gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); - - gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - - auto ix = _mm_cvtps_epi32(gx); - auto iy = _mm_cvtps_epi32(gy); - - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWip4), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); - - auto _v = mask_gather_ps(static_cast(bottom_blob.channel(q).data), - i_offset, _mm_set1_ps(-1.0f)); - - _mm_storeu_ps(outptr, _v); - - outptr += elempack; - } - } + gridsample_2d_nearest_align0_border_blob_pack4(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_2d_nearest_align1_border_blob_pack4(bottom_blob, top_blob, grid, opt); } } + else if (padding_mode == 3) + { + if (align_corner == 0) + { + gridsample_2d_nearest_align0_reflection_blob_pack4(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_2d_nearest_align1_reflection_blob_pack4(bottom_blob, top_blob, grid, opt); + } + } + else + { + NCNN_LOGE("gridsample sample_type error\n"); + return -100; + } } if (sample_type == 3) { if (padding_mode == 1) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (align_corner == 0) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - - gx = grid_sample_unormalize_p4(vImgWfp4, gx, align_corner); - gy = grid_sample_unormalize_p4(vImgHfp4, gy, align_corner); - - auto gx_floor = _mm_floor_ps(gx); - auto gy_floor = _mm_floor_ps(gy); - - const auto tx = _mm_sub_ps(gx, gx_floor); - const auto ty = _mm_sub_ps(gy, gy_floor); - - __m128 coefficients[4]; - - for (int i = 0; i < 4; i++) - { - auto gx0 = compute_coord_p4(_mm_add_ps(gx_floor, vn1fp4), vImgWfp4, padding_mode, align_corner); - auto gx1 = compute_coord_p4(gx_floor, vImgWfp4, padding_mode, align_corner); - auto gx2 = compute_coord_p4(_mm_add_ps(gx_floor, v1fp4), vImgWfp4, padding_mode, align_corner); - auto gx3 = compute_coord_p4(_mm_add_ps(gx_floor, _mm_set1_ps(2.0f)), vImgWfp4, padding_mode, align_corner); - - gy = compute_coord_p4(_mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)), vImgHfp4, padding_mode, align_corner); - - auto x0 = 
_mm_cvtps_epi32(gx0); - auto x1 = _mm_cvtps_epi32(gx1); - auto x2 = _mm_cvtps_epi32(gx2); - auto x3 = _mm_cvtps_epi32(gx3); - - auto y = _mm_cvtps_epi32(gy); - - auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x0)); - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x1)); - auto x2_in_range = _mm_and_si128(_mm_cmpgt_epi32(x2, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x2)); - auto x3_in_range = _mm_and_si128(_mm_cmpgt_epi32(x3, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x3)); - - auto y_in_range = _mm_and_si128(_mm_cmpgt_epi32(y, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y)); - - auto v0_in_range = _mm_and_si128(x0_in_range, y_in_range); - auto v1_in_range = _mm_and_si128(x1_in_range, y_in_range); - auto v2_in_range = _mm_and_si128(x2_in_range, y_in_range); - auto v3_in_range = _mm_and_si128(x3_in_range, y_in_range); - - auto x0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto x1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto x2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto x3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - - auto x0_offset = _mm_cvtps_epi32(x0_offset_f); - auto x1_offset = _mm_cvtps_epi32(x1_offset_f); - auto x2_offset = _mm_cvtps_epi32(x2_offset_f); - auto x3_offset = _mm_cvtps_epi32(x3_offset_f); - - auto x0_val = mask_gather_ps(ptr, x0_offset, *reinterpret_cast<__m128*>(&v0_in_range)); - auto x1_val = mask_gather_ps(ptr, x1_offset, *reinterpret_cast<__m128*>(&v1_in_range)); - auto x2_val = mask_gather_ps(ptr, x2_offset, *reinterpret_cast<__m128*>(&v2_in_range)); - auto x3_val = mask_gather_ps(ptr, x3_offset, *reinterpret_cast<__m128*>(&v3_in_range)); - - coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); - } - - auto _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm_storeu_ps(outptr, _v); - - outptr += elempack; - } - } + gridsample_2d_bicubic_align0_zeros_blob_pack4(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_2d_bicubic_align1_zeros_blob_pack4(bottom_blob, top_blob, grid, opt); } } - else + else if (padding_mode == 2) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (align_corner == 0) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - - gx = grid_sample_unormalize_p4(vImgWfp4, gx, align_corner); - gy = grid_sample_unormalize_p4(vImgHfp4, gy, align_corner); - - auto gx_floor = _mm_floor_ps(gx); - auto gy_floor = _mm_floor_ps(gy); - - const auto tx = _mm_sub_ps(gx, gx_floor); - const auto ty = _mm_sub_ps(gy, gy_floor); - - __m128 coefficients[4]; - - for (int i = 0; i < 4; i++) - { - auto gx0 = compute_coord_p4(_mm_add_ps(gy_floor, vn1fp4), vImgWfp4, padding_mode, align_corner); - auto gx1 = compute_coord_p4(gy_floor, vImgWfp4, padding_mode, align_corner); - auto gx2 = compute_coord_p4(_mm_add_ps(gy_floor, v1fp4), vImgWfp4, padding_mode, 
align_corner); - auto gx3 = compute_coord_p4(_mm_add_ps(gy_floor, _mm_set1_ps(2.0f)), vImgWfp4, padding_mode, align_corner); - - gy = compute_coord_p4(_mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)), vImgHfp4, padding_mode, align_corner); - - auto x0 = _mm_cvtps_epi32(gx0); - auto x1 = _mm_cvtps_epi32(gx1); - auto x2 = _mm_cvtps_epi32(gx2); - auto x3 = _mm_cvtps_epi32(gx3); - - auto y = _mm_cvtps_epi32(gy); - - auto x0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto x1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto x2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto x3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWfp4), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - - auto x0_offset = _mm_cvtps_epi32(x0_offset_f); - auto x1_offset = _mm_cvtps_epi32(x1_offset_f); - auto x2_offset = _mm_cvtps_epi32(x2_offset_f); - auto x3_offset = _mm_cvtps_epi32(x3_offset_f); - - auto x0_val = mask_gather_ps(ptr, x0_offset, vn1fp4); - auto x1_val = mask_gather_ps(ptr, x1_offset, vn1fp4); - auto x2_val = mask_gather_ps(ptr, x2_offset, vn1fp4); - auto x3_val = mask_gather_ps(ptr, x3_offset, vn1fp4); - - coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); - } - - auto _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm_storeu_ps(outptr, _v); - - outptr += elempack; - } - } + gridsample_2d_bicubic_align0_border_blob_pack4(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_2d_bicubic_align1_border_blob_pack4(bottom_blob, top_blob, grid, opt); } } + else if (padding_mode == 3) + { + if (align_corner == 0) + { + gridsample_2d_bicubic_align0_reflection_blob_pack4(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_2d_bicubic_align1_reflection_blob_pack4(bottom_blob, top_blob, grid, opt); + } + } + else + { + NCNN_LOGE("gridsample padding_mode error\n"); + return -100; + } } } @@ -1166,335 +626,84 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data); - for (int z = 0; z < outD; z++) - { - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - - gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); - gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); - gz = get_coord_p4(gz, vImgDfp4, padding_mode, align_corner); - - auto x_w = _mm_floor_ps(gx); - auto y_n = _mm_floor_ps(gy); - auto z_t = _mm_floor_ps(gz); - - auto w = _mm_sub_ps(gx, x_w); - auto e = _mm_sub_ps(v1fp4, w); - auto n = _mm_sub_ps(gy, y_n); - auto s = _mm_sub_ps(v1fp4, n); - auto t = _mm_sub_ps(gz, z_t); - auto b = _mm_sub_ps(v1fp4, t); - - __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - auto nw = _mm_mul_ps(s, e); - auto ne = _mm_mul_ps(s, w); - auto sw = _mm_mul_ps(n, e); - auto se = _mm_mul_ps(n, w); - - tnw = _mm_mul_ps(b, nw); - tne = _mm_mul_ps(b, ne); - tsw = _mm_mul_ps(b, sw); - tse = _mm_mul_ps(b, se); - - bnw = _mm_mul_ps(t, nw); - bne = _mm_mul_ps(t, ne); - bsw = _mm_mul_ps(t, sw); - bse = _mm_mul_ps(t, se); - } - - auto x0 = 
_mm_cvtps_epi32(x_w); - auto x1 = _mm_add_epi32(x0, v1ip4); - auto y0 = _mm_cvtps_epi32(y_n); - auto y1 = _mm_add_epi32(y0, v1ip4); - auto z0 = _mm_cvtps_epi32(z_t); - auto z1 = _mm_add_epi32(z0, v1ip4); - - auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x0)); - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x1)); - auto y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y0)); - auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y1)); - auto z0_in_range = _mm_and_si128(_mm_cmpgt_epi32(z0, vn1ip4), _mm_cmpgt_epi32(vImgDip4, z0)); - auto z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDip4, z1)); - - __m128i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - auto v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); - auto v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); - auto v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); - auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - v000_in_range = _mm_and_si128(v00_in_range, z0_in_range); - v010_in_range = _mm_and_si128(v01_in_range, z0_in_range); - v100_in_range = _mm_and_si128(v10_in_range, z0_in_range); - v110_in_range = _mm_and_si128(v11_in_range, z0_in_range); - - v001_in_range = _mm_and_si128(v00_in_range, z1_in_range); - v011_in_range = _mm_and_si128(v01_in_range, z1_in_range); - v101_in_range = _mm_and_si128(v10_in_range, z1_in_range); - v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); - } - - // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWip4), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); - auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); - auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); - auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); - - auto i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), vElempacki), i_tnw_offset); - auto i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); - auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); - auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); - - auto tnw_val = mask_gather_ps(ptr, i_tnw_offset, *reinterpret_cast<__m128*>(&v000_in_range)); - auto tne_val = mask_gather_ps(ptr, i_tne_offset, *reinterpret_cast<__m128*>(&v100_in_range)); - auto tsw_val = mask_gather_ps(ptr, i_tsw_offset, *reinterpret_cast<__m128*>(&v010_in_range)); - auto tse_val = mask_gather_ps(ptr, i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); - - auto bnw_val = mask_gather_ps(ptr, i_bnw_offset, *reinterpret_cast<__m128*>(&v001_in_range)); - auto bne_val = mask_gather_ps(ptr, i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); - auto bsw_val = mask_gather_ps(ptr, i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); - auto bse_val = mask_gather_ps(ptr, i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); - - auto _v = _mm_mul_ps(tnw_val, tnw); - _v = _mm_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm_comp_fmadd_ps(bsw_val, bsw, _v); - _v = 
_mm_comp_fmadd_ps(bse_val, bse, _v); - - _mm_storeu_ps(outptr, _v); - - outptr += elempack; - } - } - } + gridsample_3d_bilinear_align0_zeros_blob_pack4(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_3d_bilinear_align1_zeros_blob_pack4(bottom_blob, top_blob, grid, opt); } } - else + else if (padding_mode == 2) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (align_corner == 0) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int z = 0; z < outD; z++) - { - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - - gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); - gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); - gz = get_coord_p4(gz, vImgDfp4, padding_mode, align_corner); - - auto x_w = _mm_floor_ps(gx); - auto y_n = _mm_floor_ps(gy); - auto z_t = _mm_floor_ps(gz); - - auto w = _mm_sub_ps(gx, x_w); - auto e = _mm_sub_ps(v1fp4, w); - auto n = _mm_sub_ps(gy, y_n); - auto s = _mm_sub_ps(v1fp4, n); - auto t = _mm_sub_ps(gz, z_t); - auto b = _mm_sub_ps(v1fp4, t); - - __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - auto nw = _mm_mul_ps(s, e); - auto ne = _mm_mul_ps(s, w); - auto sw = _mm_mul_ps(n, e); - auto se = _mm_mul_ps(n, w); - - tnw = _mm_mul_ps(b, nw); - tne = _mm_mul_ps(b, ne); - tsw = _mm_mul_ps(b, sw); - tse = _mm_mul_ps(b, se); - - bnw = _mm_mul_ps(t, nw); - bne = _mm_mul_ps(t, ne); - bsw = _mm_mul_ps(t, sw); - bse = _mm_mul_ps(t, se); - } - - auto x0 = _mm_cvtps_epi32(x_w); - auto x1 = _mm_add_epi32(x0, v1ip4); - auto y0 = _mm_cvtps_epi32(y_n); - auto y1 = _mm_add_epi32(y0, v1ip4); - auto z0 = _mm_cvtps_epi32(z_t); - auto z1 = _mm_add_epi32(z0, v1ip4); - - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWip4, x1)); - auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHip4, y1)); - auto z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDip4, z1)); - - __m128i v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - v110_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - v011_in_range = _mm_and_si128(y1_in_range, z1_in_range); - v101_in_range = _mm_and_si128(x1_in_range, z1_in_range); - v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); - } - - // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWip4), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); - auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); - auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); - auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); - - auto i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), vElempacki), i_tnw_offset); - auto i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); - auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWip4, vElempacki)); - auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); - - auto tnw_val = mask_gather_ps(ptr, i_tnw_offset, vn1fp4); - auto 
tne_val = mask_gather_ps(ptr, i_tne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); - auto tsw_val = mask_gather_ps(ptr, i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); - auto tse_val = mask_gather_ps(ptr, i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); - - auto bnw_val = mask_gather_ps(ptr, i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range)); - auto bne_val = mask_gather_ps(ptr, i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); - auto bsw_val = mask_gather_ps(ptr, i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); - auto bse_val = mask_gather_ps(ptr, i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); - - auto _v = _mm_mul_ps(tnw_val, tnw); - _v = _mm_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm_comp_fmadd_ps(bse_val, bse, _v); - - _mm_storeu_ps(outptr, _v); - - outptr += elempack; - } - } - } + + gridsample_3d_bilinear_align0_border_blob_pack4(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_3d_bilinear_align1_border_blob_pack4(bottom_blob, top_blob, grid, opt); + } + } + else if (padding_mode == 3) + { + if (align_corner == 0) + { + gridsample_3d_bilinear_align0_reflection_blob_pack4(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_3d_bilinear_align1_reflection_blob_pack4(bottom_blob, top_blob, grid, opt); } } + else + { + NCNN_LOGE("gridsample padding_mode error\n"); + return -100; + } } if (sample_type == 2) { if (padding_mode == 1) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (align_corner == 0) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int z = 0; z < outD; z++) - { - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - - gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); - gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); - gz = get_coord_p4(gz, vImgDfp4, padding_mode, align_corner); - - gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - gz = _mm_floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); - - auto ix = _mm_cvtps_epi32(gx); - auto iy = _mm_cvtps_epi32(gy); - auto iz = _mm_cvtps_epi32(gz); - - auto v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWip4, ix)), - _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHip4, iy))); - v_in_range = _mm_and_si128(v_in_range, _mm_and_si128(_mm_cmpgt_epi32(iz, vn1ip4), _mm_cmpgt_epi32(vImgDip4, iz))); - - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWip4), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); - - auto _v = mask_gather_ps(static_cast(bottom_blob.channel(q).data), - i_offset, *reinterpret_cast<__m128*>(&v_in_range)); - - _mm_storeu_ps(outptr, _v); - - outptr += elempack; - } - } - } + gridsample_3d_nearest_align0_zeros_blob_pack4(bottom_blob, top_blob, grid, opt); + } + else + { + 
gridsample_3d_nearest_align1_zeros_blob_pack4(bottom_blob, top_blob, grid, opt); } } - else + else if (padding_mode == 2) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (align_corner == 0) { - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); - for (int z = 0; z < outD; z++) - { - for (int y = 0; y < outH; y++) - { - for (int x = 0; x < outW; x++) - { - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - - gx = get_coord_p4(gx, vImgWfp4, padding_mode, align_corner); - gy = get_coord_p4(gy, vImgHfp4, padding_mode, align_corner); - gz = get_coord_p4(gz, vImgDfp4, padding_mode, align_corner); - - gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - gz = _mm_floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); - - auto ix = _mm_cvtps_epi32(gx); - auto iy = _mm_cvtps_epi32(gy); - auto iz = _mm_cvtps_epi32(gz); - - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWip4, vImgHip4), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWip4), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); - - auto _v = mask_gather_ps(static_cast(bottom_blob.channel(q).data), - i_offset, _mm_set1_ps(-1.0f)); - - _mm_storeu_ps(outptr, _v); - - outptr += elempack; - } - } - } + gridsample_3d_nearest_align0_border_blob_pack4(bottom_blob, top_blob, grid, opt); + } + else + { + gridsample_3d_nearest_align1_border_blob_pack4(bottom_blob, top_blob, grid, opt); + } + } + else if (padding_mode == 3) + { + if (align_corner == 0) + { + gridsample_3d_nearest_align0_reflection_blob_pack4(bottom_blob, top_blob, grid, opt); } + else + { + gridsample_3d_nearest_align1_reflection_blob_pack4(bottom_blob, top_blob, grid, opt); + } + } + else + { + NCNN_LOGE("gridsample sample_type error\n"); + return -100; } } @@ -1541,6 +750,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data); + + + int nn = size >> 3; + int remain = size; #if __AVX__ - for (; j + 7 < size; j += 8) +#pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < nn; j ++) { auto tmp_x = _mm256_loadu_ps(gridptr + j); auto gy = _mm256_loadu_ps(gridptr + j + 8); @@ -1718,20 +927,23 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(&v_in_range), sizeof(float)); - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), - i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); - - _mm256_storeu_ps(outptr, _v); - - outptr += 8; + _mm256_storeu_ps(top_blob.channel(q).row(0) + j * 8, _v); + } } + + remain = remain & 7; #endif // __AVX__ - for (; j < size; j++) +#pragma omp parallel for num_threads(opt.num_threads) + for (int j = size - remain; j < nn; j++) { } - } + } else { @@ -1781,6 +993,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Wed, 23 Nov 2022 11:27:33 +0000 Subject: [PATCH 027/127] apply code-format changes --- src/layer/x86/gridsample_bicubic_pack4.h | 68 ++++++++-------- src/layer/x86/gridsample_bicubic_pack8.h | 68 ++++++++-------- src/layer/x86/gridsample_bilinear_pack4.h | 75 ++++++----------- src/layer/x86/gridsample_bilinear_pack8.h | 57 +++++-------- 
src/layer/x86/gridsample_nearest_pack4.h | 75 +++++++---------- src/layer/x86/gridsample_nearest_pack8.h | 75 +++++++---------- src/layer/x86/gridsample_x86.cpp | 99 ++++++++++------------- 7 files changed, 207 insertions(+), 310 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_pack4.h b/src/layer/x86/gridsample_bicubic_pack4.h index b01521b5276..a75b284f808 100644 --- a/src/layer/x86/gridsample_bicubic_pack4.h +++ b/src/layer/x86/gridsample_bicubic_pack4.h @@ -43,7 +43,7 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack4(const Mat& src, Mat& d const auto vElempackf = _mm_set1_ps(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -88,7 +88,7 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack4(const Mat& src, Mat& d auto x3_in_range = _mm_and_si128(_mm_cmpgt_epi32(x3, vn1ip4), _mm_cmpgt_epi32(vImgWi, x3)); __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], - v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; for (int i = 0; i < 4; i++) { gy = _mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)); @@ -103,13 +103,13 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack4(const Mat& src, Mat& d v3_in_range[i] = _mm_and_si128(x3_in_range, y_in_range); auto v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); auto v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); auto v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); auto v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); @@ -146,7 +146,7 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack4(const Mat& src, Mat& d const auto vElempackf = _mm_set1_ps(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -188,7 +188,7 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack4(const Mat& src, Mat& d auto x3_in_range = _mm_and_si128(_mm_cmpgt_epi32(x3, vn1ip4), _mm_cmpgt_epi32(vImgWi, x3)); __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], - v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; for (int i = 0; i < 4; i++) { gy = _mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)); @@ -203,13 +203,13 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack4(const Mat& src, Mat& d v3_in_range[i] = _mm_and_si128(x3_in_range, y_in_range); auto v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); auto v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); auto v2_offset_f = 
_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); auto v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); @@ -246,7 +246,7 @@ static void gridsample_2d_bicubic_align0_border_blob_pack4(const Mat& src, Mat& const auto vElempackf = _mm_set1_ps(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -294,13 +294,13 @@ static void gridsample_2d_bicubic_align0_border_blob_pack4(const Mat& src, Mat& auto y = _mm_cvtps_epi32(gy); auto v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); auto v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); auto v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); auto v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); @@ -337,7 +337,7 @@ static void gridsample_2d_bicubic_align1_border_blob_pack4(const Mat& src, Mat& const auto vElempackf = _mm_set1_ps(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -386,13 +386,13 @@ static void gridsample_2d_bicubic_align1_border_blob_pack4(const Mat& src, Mat& auto y = _mm_cvtps_epi32(gy); auto v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); auto v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); auto v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); auto v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); @@ -429,7 +429,7 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, M const auto vElempackf = _mm_set1_ps(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -475,7 +475,6 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, M gx0 = _mm_min_ps(border_x, _mm_max_ps(gx0, _mm_setzero_ps())); - // x1 gx1 = _mm_add_ps(gx1, v0p5fp4); @@ -528,7 +527,7 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, M for (int i = 0; i < 
4; i++) { gy = _mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)); - + { //y const auto border_y = _mm_sub_ps(vImgHf, v1fp4); @@ -550,13 +549,13 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, M auto y = _mm_cvtps_epi32(gy); auto v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); auto v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); auto v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); auto v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); @@ -595,7 +594,7 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack4(const Mat& src, M const auto vElempackf = _mm_set1_ps(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -633,7 +632,6 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack4(const Mat& src, M auto reflectx0_v = _mm_and_ps(_mm_sub_ps(gx0, border_x), *(__m128*)_ps256_inv_sign_mask); gx0 = _mm_sub_ps(border_x, reflectx0_v); - // x1 gx1 = _mm_and_ps(gx1, *(__m128*)_ps256_inv_sign_mask); @@ -676,13 +674,13 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack4(const Mat& src, M auto y = _mm_cvtps_epi32(gy); auto v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); auto v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); auto v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); auto v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); diff --git a/src/layer/x86/gridsample_bicubic_pack8.h b/src/layer/x86/gridsample_bicubic_pack8.h index e189297bc01..d38997a8ac4 100644 --- a/src/layer/x86/gridsample_bicubic_pack8.h +++ b/src/layer/x86/gridsample_bicubic_pack8.h @@ -43,7 +43,7 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& d const auto vElempackf = _mm256_set1_ps(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -88,7 +88,7 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& d auto x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], - v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; for (int i = 0; i < 4; 
i++) { gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); @@ -103,13 +103,13 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& d v3_in_range[i] = _mm256_and_si256(x3_in_range, y_in_range); auto v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); auto v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); auto v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); auto v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); @@ -146,7 +146,7 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& d const auto vElempackf = _mm256_set1_ps(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -188,7 +188,7 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& d auto x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], - v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; for (int i = 0; i < 4; i++) { gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); @@ -203,13 +203,13 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& d v3_in_range[i] = _mm256_and_si256(x3_in_range, y_in_range); auto v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); auto v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); auto v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); auto v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); @@ -246,7 +246,7 @@ static void gridsample_2d_bicubic_align0_border_blob_pack8(const Mat& src, Mat& const auto vElempackf = _mm256_set1_ps(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for 
num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -294,13 +294,13 @@ static void gridsample_2d_bicubic_align0_border_blob_pack8(const Mat& src, Mat& auto y = _mm256_cvtps_epi32(gy); auto v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); auto v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); auto v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); auto v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); @@ -337,7 +337,7 @@ static void gridsample_2d_bicubic_align1_border_blob_pack8(const Mat& src, Mat& const auto vElempackf = _mm256_set1_ps(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -386,13 +386,13 @@ static void gridsample_2d_bicubic_align1_border_blob_pack8(const Mat& src, Mat& auto y = _mm256_cvtps_epi32(gy); auto v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); auto v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); auto v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); auto v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); @@ -429,7 +429,7 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M const auto vElempackf = _mm256_set1_ps(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -475,7 +475,6 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); - // x1 gx1 = _mm256_add_ps(gx1, v0p5fp8); @@ -528,7 +527,7 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M for (int i = 0; i < 4; i++) { gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - + { //y const 
auto border_y = _mm256_sub_ps(vImgHf, v1fp8); @@ -550,13 +549,13 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M auto y = _mm256_cvtps_epi32(gy); auto v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); auto v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); auto v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); auto v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); @@ -595,7 +594,7 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack8(const Mat& src, M const auto vElempackf = _mm256_set1_ps(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -633,7 +632,6 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack8(const Mat& src, M auto reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, border_x), *(__m256*)_ps256_inv_sign_mask); gx0 = _mm256_sub_ps(border_x, reflectx0_v); - // x1 gx1 = _mm256_and_ps(gx1, *(__m256*)_ps256_inv_sign_mask); @@ -676,13 +674,13 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack8(const Mat& src, M auto y = _mm256_cvtps_epi32(gy); auto v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); auto v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); auto v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); auto v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); diff --git a/src/layer/x86/gridsample_bilinear_pack4.h b/src/layer/x86/gridsample_bilinear_pack4.h index 7b9005057d8..22328d78cf4 100644 --- a/src/layer/x86/gridsample_bilinear_pack4.h +++ b/src/layer/x86/gridsample_bilinear_pack4.h @@ -21,7 +21,7 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for 
num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -72,7 +72,7 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& // (W*y + x) * elempack + vec(8) auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); @@ -104,7 +104,7 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -155,12 +155,11 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& // (W*y + x) * elempack + vec(8) auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); - for (int q = 0; q < dst.c; q++) { auto nw_val = mask_gather_ps(src.channel(q), i_nw_offset, *reinterpret_cast<__m128*>(&v00_in_range)); @@ -188,7 +187,7 @@ static void gridsample_2d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -209,7 +208,6 @@ static void gridsample_2d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - // y gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); @@ -243,12 +241,11 @@ static void gridsample_2d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& // (W*y + x) * elempack + vec(8) auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); - for (int q = 0; q < dst.c; q++) { auto nw_val = mask_gather_ps(src.channel(q), i_nw_offset, vn1fp4); @@ -276,7 +273,7 @@ static void gridsample_2d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -297,7 +294,6 @@ static void gridsample_2d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - // y gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); @@ -331,7 +327,7 @@ static void gridsample_2d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& // (W*y + x) * elempack + vec(8) auto i_nw_offset = 
_mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); @@ -363,7 +359,7 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack4(const Mat& src, const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -396,7 +392,6 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack4(const Mat& src, gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - // y gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); @@ -441,7 +436,7 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack4(const Mat& src, // (W*y + x) * elempack + vec(8) auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); @@ -473,7 +468,7 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack4(const Mat& src, const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -497,7 +492,6 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack4(const Mat& src, auto reflectx_v = _mm_and_ps(_mm_sub_ps(gx, border_x), *(__m128*)_ps256_inv_sign_mask); gx = _mm_sub_ps(border_x, reflectx_v); - // y gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); @@ -534,7 +528,7 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack4(const Mat& src, // (W*y + x) * elempack + vec(8) auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); @@ -557,7 +551,6 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack4(const Mat& src, } } - static void gridsample_3d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { const auto vImgWf = _mm_set1_ps(src.w); @@ -569,7 +562,7 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -658,8 +651,7 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& } // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0) - , _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto 
i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); @@ -675,7 +667,7 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& auto tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&v100_in_range)); auto tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&v010_in_range)); auto tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); - + auto bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&v001_in_range)); auto bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); auto bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); @@ -709,7 +701,7 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -798,8 +790,7 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& } // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0) - , _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); @@ -849,7 +840,7 @@ static void gridsample_3d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -873,7 +864,6 @@ static void gridsample_3d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - // y gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); @@ -881,7 +871,6 @@ static void gridsample_3d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - // z gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); @@ -942,8 +931,7 @@ static void gridsample_3d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& } // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0) - , _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), 
_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); @@ -953,7 +941,6 @@ static void gridsample_3d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); - for (int q = 0; q < dst.c; q++) { auto tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); @@ -994,7 +981,7 @@ static void gridsample_3d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -1018,7 +1005,6 @@ static void gridsample_3d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - // y gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); @@ -1026,7 +1012,6 @@ static void gridsample_3d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - // z gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); @@ -1087,8 +1072,7 @@ static void gridsample_3d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& } // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0) - , _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); @@ -1098,7 +1082,6 @@ static void gridsample_3d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); - for (int q = 0; q < dst.c; q++) { auto tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); @@ -1139,7 +1122,7 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -1174,7 +1157,6 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - // y gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); const auto border_y = _mm_sub_ps(vImgHf, v1fp4); @@ -1192,7 +1174,6 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - // z gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); const auto border_z = _mm_sub_ps(vImgDf, v1fp4); 
@@ -1263,8 +1244,7 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, } // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0) - , _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); @@ -1274,7 +1254,6 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); - for (int q = 0; q < dst.c; q++) { auto tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); @@ -1315,7 +1294,7 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack4(const Mat& src, const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -1341,7 +1320,6 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack4(const Mat& src, auto reflectx_v = _mm_and_ps(_mm_sub_ps(gx, border_x), *(__m128*)_ps256_inv_sign_mask); gx = _mm_sub_ps(border_x, reflectx_v); - // y gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); const auto border_y = _mm_sub_ps(vImgHf, v1fp4); @@ -1351,7 +1329,6 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack4(const Mat& src, auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps256_inv_sign_mask); gy = _mm_sub_ps(border_y, reflecty_v); - // z gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); const auto border_z = _mm_sub_ps(vImgDf, v1fp4); @@ -1414,8 +1391,7 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack4(const Mat& src, } // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0) - , _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); @@ -1425,7 +1401,6 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack4(const Mat& src, auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); - for (int q = 0; q < dst.c; q++) { auto tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); diff --git a/src/layer/x86/gridsample_bilinear_pack8.h b/src/layer/x86/gridsample_bilinear_pack8.h index 12574051fb4..bf557ed99d6 100644 --- a/src/layer/x86/gridsample_bilinear_pack8.h +++ 
b/src/layer/x86/gridsample_bilinear_pack8.h @@ -21,7 +21,7 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -72,7 +72,7 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& // (W*y + x) * elempack + vec(8) auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); @@ -104,7 +104,7 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -155,12 +155,11 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& // (W*y + x) * elempack + vec(8) auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); - for (int q = 0; q < dst.c; q++) { auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); @@ -188,7 +187,7 @@ static void gridsample_2d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -209,7 +208,6 @@ static void gridsample_2d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - // y gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); @@ -243,12 +241,11 @@ static void gridsample_2d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& // (W*y + x) * elempack + vec(8) auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); - for (int q = 0; q < dst.c; q++) { auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, vn1fp8, sizeof(float)); @@ -276,7 +273,7 @@ static void gridsample_2d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp 
parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -297,7 +294,6 @@ static void gridsample_2d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - // y gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); @@ -331,7 +327,7 @@ static void gridsample_2d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& // (W*y + x) * elempack + vec(8) auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); @@ -363,7 +359,7 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack8(const Mat& src, const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -396,7 +392,6 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack8(const Mat& src, gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - // y gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); @@ -441,7 +436,7 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack8(const Mat& src, // (W*y + x) * elempack + vec(8) auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); @@ -473,7 +468,7 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack8(const Mat& src, const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -497,7 +492,6 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack8(const Mat& src, auto reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); gx = _mm256_sub_ps(border_x, reflectx_v); - // y gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); @@ -534,7 +528,7 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack8(const Mat& src, // (W*y + x) * elempack + vec(8) auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); @@ -557,7 +551,6 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack8(const Mat& src, } } - static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& 
opt) { const auto vImgWf = _mm256_set1_ps(src.w); @@ -569,7 +562,7 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -674,7 +667,7 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& auto tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&v100_in_range), sizeof(float)); auto tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&v010_in_range), sizeof(float)); auto tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); - + auto bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&v001_in_range), sizeof(float)); auto bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); auto bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); @@ -708,7 +701,7 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -847,7 +840,7 @@ static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -871,7 +864,6 @@ static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - // y gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); @@ -879,7 +871,6 @@ static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - // z gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, v1fp8), vImgDf, v1fp8), two); @@ -950,7 +941,6 @@ static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); - for (int q = 0; q < dst.c; q++) { auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, vn1fp8, sizeof(float)); @@ -991,7 +981,7 @@ static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -1015,7 +1005,6 @@ static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - // y gy = 
_mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); @@ -1023,7 +1012,6 @@ static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - // z gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, v1fp8), two), _mm256_sub_ps(vImgDf, v1fp8)); @@ -1094,7 +1082,6 @@ static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); - for (int q = 0; q < dst.c; q++) { auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, vn1fp8, sizeof(float)); @@ -1135,7 +1122,7 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -1170,7 +1157,6 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - // y gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); @@ -1188,7 +1174,6 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - // z gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, v1fp8), vImgDf, v1fp8), two); const auto border_z = _mm256_sub_ps(vImgDf, v1fp8); @@ -1269,7 +1254,6 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); - for (int q = 0; q < dst.c; q++) { auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, vn1fp8, sizeof(float)); @@ -1310,7 +1294,7 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -1336,7 +1320,6 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, auto reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); gx = _mm256_sub_ps(border_x, reflectx_v); - // y gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); @@ -1346,7 +1329,6 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); gy = _mm256_sub_ps(border_y, reflecty_v); - // z gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, v1fp8), two), _mm256_sub_ps(vImgDf, v1fp8)); const auto border_z = _mm256_sub_ps(vImgDf, v1fp8); @@ -1419,7 +1401,6 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); - for (int q = 
0; q < dst.c; q++) { auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, vn1fp8, sizeof(float)); diff --git a/src/layer/x86/gridsample_nearest_pack4.h b/src/layer/x86/gridsample_nearest_pack4.h index 047ee521c1e..3e0bc9ab74c 100644 --- a/src/layer/x86/gridsample_nearest_pack4.h +++ b/src/layer/x86/gridsample_nearest_pack4.h @@ -21,7 +21,7 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -49,10 +49,10 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d auto iy = _mm_cvtps_epi32(gy); auto v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), - _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); + _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -73,7 +73,7 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -101,10 +101,10 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d auto iy = _mm_cvtps_epi32(gy); auto v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), - _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); + _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -125,7 +125,7 @@ static void gridsample_2d_nearest_align0_border_blob_pack4(const Mat& src, Mat& const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -146,7 +146,6 @@ static void gridsample_2d_nearest_align0_border_blob_pack4(const Mat& src, Mat& gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - // y gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); @@ -162,7 +161,7 @@ static void gridsample_2d_nearest_align0_border_blob_pack4(const Mat& src, Mat& auto iy = _mm_cvtps_epi32(gy); auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -183,7 +182,7 @@ static void gridsample_2d_nearest_align1_border_blob_pack4(const Mat& src, Mat& const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -204,7 +203,6 @@ static void gridsample_2d_nearest_align1_border_blob_pack4(const Mat& src, Mat& gx = 
_mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - // y gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); @@ -220,7 +218,7 @@ static void gridsample_2d_nearest_align1_border_blob_pack4(const Mat& src, Mat& auto iy = _mm_cvtps_epi32(gy); auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -241,7 +239,7 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack4(const Mat& src, M const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -277,7 +275,6 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack4(const Mat& src, M gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - // y const auto border_y = _mm_sub_ps(vImgHf, v1fp4); @@ -299,7 +296,7 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack4(const Mat& src, M auto iy = _mm_cvtps_epi32(gy); auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -320,7 +317,7 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack4(const Mat& src, M const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -347,7 +344,6 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack4(const Mat& src, M auto reflectx_v = _mm_and_ps(_mm_sub_ps(gx, border_x), *(__m128*)_ps256_inv_sign_mask); gx = _mm_sub_ps(border_x, reflectx_v); - // y const auto border_y = _mm_sub_ps(vImgHf, v1fp4); @@ -361,7 +357,7 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack4(const Mat& src, M auto iy = _mm_cvtps_epi32(gy); auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -373,7 +369,6 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack4(const Mat& src, M } } - static void gridsample_3d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { const auto vImgWf = _mm_set1_ps(src.w); @@ -385,7 +380,7 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -421,11 +416,10 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d auto iz = _mm_cvtps_epi32(gz); auto v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), - _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); + _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); v_in_range = _mm_and_si128(v_in_range, _mm_and_si128(_mm_cmpgt_epi32(iz, vn1ip4), _mm_cmpgt_epi32(vImgDi, iz))); - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz) - , 
_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -449,7 +443,7 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -485,11 +479,10 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d auto iz = _mm_cvtps_epi32(gz); auto v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), - _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); + _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); v_in_range = _mm_and_si128(v_in_range, _mm_and_si128(_mm_cmpgt_epi32(iz, vn1ip4), _mm_cmpgt_epi32(vImgDi, iz))); - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz) - , _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -513,7 +506,7 @@ static void gridsample_3d_nearest_align0_border_blob_pack4(const Mat& src, Mat& const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -537,7 +530,6 @@ static void gridsample_3d_nearest_align0_border_blob_pack4(const Mat& src, Mat& gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - // y gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); @@ -545,7 +537,6 @@ static void gridsample_3d_nearest_align0_border_blob_pack4(const Mat& src, Mat& gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - // z gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); @@ -562,8 +553,7 @@ static void gridsample_3d_nearest_align0_border_blob_pack4(const Mat& src, Mat& auto iy = _mm_cvtps_epi32(gy); auto iz = _mm_cvtps_epi32(gz); - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz) - , _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -587,7 +577,7 @@ static void gridsample_3d_nearest_align1_border_blob_pack4(const Mat& src, Mat& const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -611,7 +601,6 @@ static void gridsample_3d_nearest_align1_border_blob_pack4(const Mat& src, Mat& gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - // y gy = 
_mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); @@ -619,7 +608,6 @@ static void gridsample_3d_nearest_align1_border_blob_pack4(const Mat& src, Mat& gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - // z gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); @@ -636,8 +624,7 @@ static void gridsample_3d_nearest_align1_border_blob_pack4(const Mat& src, Mat& auto iy = _mm_cvtps_epi32(gy); auto iz = _mm_cvtps_epi32(gz); - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz) - , _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -661,7 +648,7 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack4(const Mat& src, M const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -702,7 +689,6 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack4(const Mat& src, M gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - // y const auto border_y = _mm_sub_ps(vImgHf, v1fp4); @@ -719,7 +705,6 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack4(const Mat& src, M gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - // z const auto border_z = _mm_sub_ps(vImgDf, v1fp4); @@ -741,8 +726,7 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack4(const Mat& src, M auto iy = _mm_cvtps_epi32(gy); auto iz = _mm_cvtps_epi32(gz); - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz) - , _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -766,7 +750,7 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack4(const Mat& src, M const auto vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -798,7 +782,6 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack4(const Mat& src, M auto reflectx_v = _mm_and_ps(_mm_sub_ps(gx, border_x), *(__m128*)_ps256_inv_sign_mask); gx = _mm_sub_ps(border_x, reflectx_v); - // y const auto border_y = _mm_sub_ps(vImgHf, v1fp4); @@ -807,7 +790,6 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack4(const Mat& src, M auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps256_inv_sign_mask); gy = _mm_sub_ps(border_y, reflecty_v); - // z const auto border_z = _mm_sub_ps(vImgDf, v1fp4); @@ -821,8 +803,7 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack4(const Mat& src, M auto iy = _mm_cvtps_epi32(gy); auto iz = _mm_cvtps_epi32(gz); - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz) - , _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), 
vElempacki), _mm_set_epi32(3, 2, 1, 0)); + auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { diff --git a/src/layer/x86/gridsample_nearest_pack8.h b/src/layer/x86/gridsample_nearest_pack8.h index fd10e64b985..21be4c1c0a5 100644 --- a/src/layer/x86/gridsample_nearest_pack8.h +++ b/src/layer/x86/gridsample_nearest_pack8.h @@ -21,7 +21,7 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -49,10 +49,10 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d auto iy = _mm256_cvtps_epi32(gy); auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -73,7 +73,7 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -101,10 +101,10 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d auto iy = _mm256_cvtps_epi32(gy); auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -125,7 +125,7 @@ static void gridsample_2d_nearest_align0_border_blob_pack8(const Mat& src, Mat& const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -146,7 +146,6 @@ static void gridsample_2d_nearest_align0_border_blob_pack8(const Mat& src, Mat& gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - // y gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); @@ -162,7 +161,7 @@ static void gridsample_2d_nearest_align0_border_blob_pack8(const Mat& src, Mat& auto iy = _mm256_cvtps_epi32(gy); auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -183,7 +182,7 @@ static void 
gridsample_2d_nearest_align1_border_blob_pack8(const Mat& src, Mat& const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -204,7 +203,6 @@ static void gridsample_2d_nearest_align1_border_blob_pack8(const Mat& src, Mat& gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - // y gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); @@ -220,7 +218,7 @@ static void gridsample_2d_nearest_align1_border_blob_pack8(const Mat& src, Mat& auto iy = _mm256_cvtps_epi32(gy); auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -241,7 +239,7 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack8(const Mat& src, M const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -277,7 +275,6 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack8(const Mat& src, M gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - // y const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); @@ -299,7 +296,7 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack8(const Mat& src, M auto iy = _mm256_cvtps_epi32(gy); auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -320,7 +317,7 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack8(const Mat& src, M const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -347,7 +344,6 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack8(const Mat& src, M auto reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); gx = _mm256_sub_ps(border_x, reflectx_v); - // y const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); @@ -361,7 +357,7 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack8(const Mat& src, M auto iy = _mm256_cvtps_epi32(gy); auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -373,7 +369,6 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack8(const Mat& src, M } } - static void gridsample_3d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { const auto vImgWf = _mm256_set1_ps(src.w); @@ -385,7 +380,7 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -421,11 +416,10 @@ static void 
gridsample_3d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d auto iz = _mm256_cvtps_epi32(gz); auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, vn1ip8), _mm256_cmpgt_epi32(vImgDi, iz))); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz) - , _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -449,7 +443,7 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -485,11 +479,10 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d auto iz = _mm256_cvtps_epi32(gz); auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, vn1ip8), _mm256_cmpgt_epi32(vImgDi, iz))); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz) - , _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -513,7 +506,7 @@ static void gridsample_3d_nearest_align0_border_blob_pack8(const Mat& src, Mat& const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -537,7 +530,6 @@ static void gridsample_3d_nearest_align0_border_blob_pack8(const Mat& src, Mat& gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - // y gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); @@ -545,7 +537,6 @@ static void gridsample_3d_nearest_align0_border_blob_pack8(const Mat& src, Mat& gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - // z gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, v1fp8), vImgDf, v1fp8), two); @@ -562,8 +553,7 @@ static void gridsample_3d_nearest_align0_border_blob_pack8(const Mat& src, Mat& auto iy = _mm256_cvtps_epi32(gy); auto iz = _mm256_cvtps_epi32(gz); - auto i_offset = 
_mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz) - , _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -587,7 +577,7 @@ static void gridsample_3d_nearest_align1_border_blob_pack8(const Mat& src, Mat& const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -611,7 +601,6 @@ static void gridsample_3d_nearest_align1_border_blob_pack8(const Mat& src, Mat& gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - // y gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); @@ -619,7 +608,6 @@ static void gridsample_3d_nearest_align1_border_blob_pack8(const Mat& src, Mat& gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - // z gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, v1fp8), two), _mm256_sub_ps(vImgDf, v1fp8)); @@ -636,8 +624,7 @@ static void gridsample_3d_nearest_align1_border_blob_pack8(const Mat& src, Mat& auto iy = _mm256_cvtps_epi32(gy); auto iz = _mm256_cvtps_epi32(gz); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz) - , _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -661,7 +648,7 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -702,7 +689,6 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - // y const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); @@ -719,7 +705,6 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - // z const auto border_z = _mm256_sub_ps(vImgDf, v1fp8); @@ -741,8 +726,7 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M auto iy = _mm256_cvtps_epi32(gy); auto iz = _mm256_cvtps_epi32(gz); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz) - , _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) 
{ @@ -766,7 +750,7 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, M const auto vElempacki = _mm256_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -798,7 +782,6 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, M auto reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); gx = _mm256_sub_ps(border_x, reflectx_v); - // y const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); @@ -807,7 +790,6 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, M auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); gy = _mm256_sub_ps(border_y, reflecty_v); - // z const auto border_z = _mm256_sub_ps(vImgDf, v1fp8); @@ -821,8 +803,7 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, M auto iy = _mm256_cvtps_epi32(gy); auto iz = _mm256_cvtps_epi32(gz); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz) - , _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 53fffc87b8f..d03d8cdfe4b 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -243,7 +243,6 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector> 3; - int remain = size; + int nn = size >> 3; + int remain = size; #if __AVX__ -#pragma omp parallel for num_threads(opt.num_threads) - for (int j = 0; j < nn; j ++) - { - auto tmp_x = _mm256_loadu_ps(gridptr + j); - auto gy = _mm256_loadu_ps(gridptr + j + 8); + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < nn; j++) + { + auto tmp_x = _mm256_loadu_ps(gridptr + j); + auto gy = _mm256_loadu_ps(gridptr + j + 8); - auto gx = _mm256_shuffle_ps(tmp_x, gy, 0x10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0x11011101); + auto gx = _mm256_shuffle_ps(tmp_x, gy, 0x10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0x11011101); - gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); - gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); + gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); + gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); + auto ix = _mm256_cvtps_epi32(gx); + 
auto iy = _mm256_cvtps_epi32(gy); - auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); - for (int q = 0; q < channels; q++) - { - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), - i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); + for (int q = 0; q < channels; q++) + { + auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), + i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); - _mm256_storeu_ps(top_blob.channel(q).row(0) + j * 8, _v); - } + _mm256_storeu_ps(top_blob.channel(q).row(0) + j * 8, _v); } + } - remain = remain & 7; + remain = remain & 7; #endif // __AVX__ -#pragma omp parallel for num_threads(opt.num_threads) - for (int j = size - remain; j < nn; j++) - { - - } - + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = size - remain; j < nn; j++) + { + } } else { -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { int j = 0; @@ -977,7 +965,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(bottom_blob.channel(q).data), - i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); _mm256_storeu_ps(outptr, _v); @@ -986,17 +974,16 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Thu, 24 Nov 2022 02:42:25 +0800 Subject: [PATCH 028/127] finish pack16 and support c++03 --- src/layer/x86/avx512_mathfun.h | 1 + src/layer/x86/gridsample_bicubic_pack16.h | 709 ++++++++++ src/layer/x86/gridsample_bicubic_pack4.h | 524 +++---- src/layer/x86/gridsample_bicubic_pack8.h | 444 +++--- src/layer/x86/gridsample_bilinear_pack16.h | 1431 ++++++++++++++++++++ src/layer/x86/gridsample_bilinear_pack4.h | 1184 ++++++++-------- src/layer/x86/gridsample_bilinear_pack8.h | 1202 ++++++++-------- src/layer/x86/gridsample_nearest_pack16.h | 814 +++++++++++ src/layer/x86/gridsample_nearest_pack4.h | 472 +++---- src/layer/x86/gridsample_nearest_pack8.h | 412 +++--- src/layer/x86/gridsample_x86.cpp | 402 ++++-- 11 files changed, 5332 insertions(+), 2263 deletions(-) create mode 100644 src/layer/x86/gridsample_bicubic_pack16.h create mode 100644 src/layer/x86/gridsample_bilinear_pack16.h create mode 100644 src/layer/x86/gridsample_nearest_pack16.h diff --git a/src/layer/x86/avx512_mathfun.h b/src/layer/x86/avx512_mathfun.h index 0513d5e1be1..728ebbe121a 100644 --- a/src/layer/x86/avx512_mathfun.h +++ b/src/layer/x86/avx512_mathfun.h @@ -44,6 +44,7 @@ _PS512_CONST_TYPE(mant_mask, int, 0x7f800000); _PS512_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); _PS512_CONST_TYPE(sign_mask, int, (int)0x80000000); +_PS512_CONST_TYPE(inv_sign_mask, int, ~0x80000000); _PI32_CONST512(0, 0); _PI32_CONST512(1, 1); diff --git a/src/layer/x86/gridsample_bicubic_pack16.h b/src/layer/x86/gridsample_bicubic_pack16.h new file mode 100644 index 00000000000..c2038edf877 --- 
/dev/null +++ b/src/layer/x86/gridsample_bicubic_pack16.h @@ -0,0 +1,709 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static NCNN_FORCEINLINE __m512 cubic_interp1d_p16(const __m512& x0_v, const __m512& x1_v, const __m512& x2_v, const __m512& x3_v, const __m512& tx) +{ + const __m512 A = _mm512_set1_ps(-0.75f); + + const __m512 x0 = _mm512_add_ps(tx, v1fp16); + const __m512& x1 = tx; + const __m512 x2 = _mm512_sub_ps(v1fp16, tx); + //const __m512 x3 = _mm512_add_ps(x2, v1fp16); + + const __m512 coeffs0 = _mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(A, x0), _mm512_mul_ps(_mm512_set1_ps(5.0f), A)), x0), _mm512_mul_ps(_mm512_set1_ps(8.0f), A)), x0), _mm512_mul_ps(_mm512_set1_ps(4), A)); + const __m512 coeffs1 = _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(A, _mm512_set1_ps(2.0f)), x1), _mm512_add_ps(A, _mm512_set1_ps(3.0f))), x1), x1), v1fp16); + const __m512 coeffs2 = _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(A, _mm512_set1_ps(2.0f)), x2), _mm512_add_ps(A, _mm512_set1_ps(3.0f))), x2), x2), v1fp16); + const __m512 coeffs3 = _mm512_sub_ps(_mm512_sub_ps(_mm512_sub_ps(v1fp16, coeffs0), coeffs1), coeffs2); + + __m512 _v = _mm512_mul_ps(coeffs0, x0_v); + _v = _mm512_fmadd_ps(coeffs1, x1_v, _v); + _v = _mm512_fmadd_ps(coeffs2, x2_v, _v); + _v = _mm512_fmadd_ps(coeffs3, x3_v, _v); + + return _v; +} + +static void gridsample_2d_bicubic_align0_zeros_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + + const __m512 vElempackf = _mm512_set1_ps(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + // x + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + + // y + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + } + + __m512 gx_floor = _mm512_floor_ps(gx); + __m512 gy_floor = _mm512_floor_ps(gy); + + const __m512 tx = _mm512_sub_ps(gx, gx_floor); + const __m512 ty = _mm512_sub_ps(gy, gy_floor); + + __m512 coefficients[4]; + + __m512 gx0 = _mm512_add_ps(gx_floor, vn1fp16); + __m512 gx1 = gx_floor; + __m512 gx2 = _mm512_add_ps(gx_floor, v1fp16); + __m512 gx3 = _mm512_add_ps(gx_floor, _mm512_set1_ps(2.0f)); + + __m512i x0 = _mm512_cvtps_epi32(gx0); + 
__m512i x1 = _mm512_cvtps_epi32(gx1); + __m512i x2 = _mm512_cvtps_epi32(gx2); + __m512i x3 = _mm512_cvtps_epi32(gx3); + + __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x0); + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 x2_in_range = _mm512_cmpgt_epi32_mask(x2, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x2); + __mmask16 x3_in_range = _mm512_cmpgt_epi32_mask(x3, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x3); + + __m512i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + __mmask16 v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm512_add_ps(gy_floor, _mm512_set1_ps(-1.0f + i)); + + __m512i y = _mm512_cvtps_epi32(gy); + + __mmask16 y_in_range = _mm512_cmpgt_epi32_mask(y, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y); + + v0_in_range[i] = x0_in_range & y_in_range; + v1_in_range[i] = x1_in_range & y_in_range; + v2_in_range[i] = x2_in_range & y_in_range; + v3_in_range[i] = x3_in_range & y_in_range; + + __m512 v0_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m512 v2_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m512 v3_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + v0_offset[i] = _mm512_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm512_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm512_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm512_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < dst.c; q++) + { + for (int i = 0; i < 4; i++) + { + __m512 x0_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v0_in_range[i], v0_offset[i], src.channel(q), sizeof(float)); + __m512 x1_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v1_in_range[i], v1_offset[i], src.channel(q), sizeof(float)); + __m512 x2_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v2_in_range[i], v2_offset[i], src.channel(q), sizeof(float)); + __m512 x3_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v3_in_range[i], v3_offset[i], src.channel(q), sizeof(float)); + + coefficients[i] = cubic_interp1d_p16(x0_val, x1_val, x2_val, x3_val, tx); + } + + __m512 _v = cubic_interp1d_p16(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_bicubic_align1_zeros_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + + const __m512 vElempackf = _mm512_set1_ps(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + 
for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + } + + __m512 gx_floor = _mm512_floor_ps(gx); + __m512 gy_floor = _mm512_floor_ps(gy); + + const __m512 tx = _mm512_sub_ps(gx, gx_floor); + const __m512 ty = _mm512_sub_ps(gy, gy_floor); + + __m512 coefficients[4]; + + __m512 gx0 = _mm512_add_ps(gx_floor, vn1fp16); + __m512 gx1 = gx_floor; + __m512 gx2 = _mm512_add_ps(gx_floor, v1fp16); + __m512 gx3 = _mm512_add_ps(gx_floor, _mm512_set1_ps(2.0f)); + + __m512i x0 = _mm512_cvtps_epi32(gx0); + __m512i x1 = _mm512_cvtps_epi32(gx1); + __m512i x2 = _mm512_cvtps_epi32(gx2); + __m512i x3 = _mm512_cvtps_epi32(gx3); + + __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x0); + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 x2_in_range = _mm512_cmpgt_epi32_mask(x2, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x2); + __mmask16 x3_in_range = _mm512_cmpgt_epi32_mask(x3, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x3); + + __m512i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + __mmask16 v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm512_add_ps(gy_floor, _mm512_set1_ps(-1.0f + i)); + + __m512i y = _mm512_cvtps_epi32(gy); + + __mmask16 y_in_range = _mm512_cmpgt_epi32_mask(y, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y); + + v0_in_range[i] = x0_in_range & y_in_range; + v1_in_range[i] = x1_in_range & y_in_range; + v2_in_range[i] = x2_in_range & y_in_range; + v3_in_range[i] = x3_in_range & y_in_range; + + __m512 v0_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m512 v2_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m512 v3_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + v0_offset[i] = _mm512_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm512_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm512_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm512_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < dst.c; q++) + { + for (int i = 0; i < 4; i++) + { + __m512 x0_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v0_in_range[i], v0_offset[i], src.channel(q), sizeof(float)); + __m512 x1_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v1_in_range[i], v1_offset[i], src.channel(q), sizeof(float)); + __m512 x2_val = 
_mm512_mask_i32gather_ps(_mm512_setzero_ps(), v2_in_range[i], v2_offset[i], src.channel(q), sizeof(float)); + __m512 x3_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v3_in_range[i], v3_offset[i], src.channel(q), sizeof(float)); + + coefficients[i] = cubic_interp1d_p16(x0_val, x1_val, x2_val, x3_val, tx); + } + + __m512 _v = cubic_interp1d_p16(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_bicubic_align0_border_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + + const __m512 vElempackf = _mm512_set1_ps(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + + const __m512 two = _mm512_set1_ps(2.f); + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + + __m512 gx_floor = _mm512_floor_ps(gx); + __m512 gy_floor = _mm512_floor_ps(gy); + + const __m512 tx = _mm512_sub_ps(gx, gx_floor); + const __m512 ty = _mm512_sub_ps(gy, gy_floor); + + __m512 coefficients[4]; + + __m512 gx0 = _mm512_add_ps(gx_floor, vn1fp16); + __m512 gx1 = gx_floor; + __m512 gx2 = _mm512_add_ps(gx_floor, v1fp16); + __m512 gx3 = _mm512_add_ps(gx_floor, _mm512_set1_ps(2.0f)); + + gx0 = _mm512_min_ps(border_x, _mm512_max_ps(gx0, _mm512_setzero_ps())); + gx1 = _mm512_min_ps(border_x, _mm512_max_ps(gx1, _mm512_setzero_ps())); + gx2 = _mm512_min_ps(border_x, _mm512_max_ps(gx2, _mm512_setzero_ps())); + gx3 = _mm512_min_ps(border_x, _mm512_max_ps(gx3, _mm512_setzero_ps())); + + __m512i x0 = _mm512_cvtps_epi32(gx0); + __m512i x1 = _mm512_cvtps_epi32(gx1); + __m512i x2 = _mm512_cvtps_epi32(gx2); + __m512i x3 = _mm512_cvtps_epi32(gx3); + + __m512i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm512_add_ps(gy_floor, _mm512_set1_ps(-1.0f + i)); + gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); + + __m512i y = _mm512_cvtps_epi32(gy); + + __m512 v0_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m512 v2_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m512 v3_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 
11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + v0_offset[i] = _mm512_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm512_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm512_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm512_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < dst.c; q++) + { + for (int i = 0; i < 4; i++) + { + __m512 x0_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v0_offset[i], src.channel(q), sizeof(float)); + __m512 x1_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v1_offset[i], src.channel(q), sizeof(float)); + __m512 x2_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v2_offset[i], src.channel(q), sizeof(float)); + __m512 x3_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v3_offset[i], src.channel(q), sizeof(float)); + + coefficients[i] = cubic_interp1d_p16(x0_val, x1_val, x2_val, x3_val, tx); + } + + __m512 _v = cubic_interp1d_p16(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_bicubic_align1_border_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + + const __m512 vElempackf = _mm512_set1_ps(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + + const __m512 two = _mm512_set1_ps(2.f); + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + + __m512 gx_floor = _mm512_floor_ps(gx); + __m512 gy_floor = _mm512_floor_ps(gy); + + const __m512 tx = _mm512_sub_ps(gx, gx_floor); + const __m512 ty = _mm512_sub_ps(gy, gy_floor); + + __m512 coefficients[4]; + + __m512 gx0 = _mm512_add_ps(gx_floor, vn1fp16); + __m512 gx1 = gx_floor; + __m512 gx2 = _mm512_add_ps(gx_floor, v1fp16); + __m512 gx3 = _mm512_add_ps(gx_floor, _mm512_set1_ps(2.0f)); + + gx0 = _mm512_min_ps(border_x, _mm512_max_ps(gx0, _mm512_setzero_ps())); + gx1 = _mm512_min_ps(border_x, _mm512_max_ps(gx1, _mm512_setzero_ps())); + gx2 = _mm512_min_ps(border_x, _mm512_max_ps(gx2, _mm512_setzero_ps())); + gx3 = _mm512_min_ps(border_x, _mm512_max_ps(gx3, _mm512_setzero_ps())); + + __m512i x0 = _mm512_cvtps_epi32(gx0); + __m512i x1 = _mm512_cvtps_epi32(gx1); + __m512i x2 = _mm512_cvtps_epi32(gx2); + __m512i x3 = _mm512_cvtps_epi32(gx3); + + __m512i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm512_add_ps(gy_floor, _mm512_set1_ps(-1.0f + i)); + gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); + + __m512i y = _mm512_cvtps_epi32(gy); + + __m512 v0_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + 
__m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m512 v2_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m512 v3_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + v0_offset[i] = _mm512_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm512_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm512_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm512_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < dst.c; q++) + { + for (int i = 0; i < 4; i++) + { + __m512 x0_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v0_offset[i], src.channel(q), sizeof(float)); + __m512 x1_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v1_offset[i], src.channel(q), sizeof(float)); + __m512 x2_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v2_offset[i], src.channel(q), sizeof(float)); + __m512 x3_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v3_offset[i], src.channel(q), sizeof(float)); + + coefficients[i] = cubic_interp1d_p16(x0_val, x1_val, x2_val, x3_val, tx); + } + + __m512 _v = cubic_interp1d_p16(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_bicubic_align0_reflection_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + + const __m512 vElempackf = _mm512_set1_ps(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + + const __m512 two = _mm512_set1_ps(2.f); + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + + __m512 gx_floor = _mm512_floor_ps(gx); + __m512 gy_floor = _mm512_floor_ps(gy); + + const __m512 tx = _mm512_sub_ps(gx, gx_floor); + const __m512 ty = _mm512_sub_ps(gy, gy_floor); + + __m512 coefficients[4]; + + __m512 gx0 = _mm512_add_ps(gx_floor, vn1fp16); + __m512 gx1 = gx_floor; + __m512 gx2 = _mm512_add_ps(gx_floor, v1fp16); + __m512 gx3 = _mm512_add_ps(gx_floor, _mm512_set1_ps(2.0f)); + const __m512 v0p5fp16 = _mm512_set1_ps(0.5f); + { + // x0 + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + + gx0 = _mm512_add_ps(gx0, v0p5fp16); + + gx0 = _mm512_and_ps(gx0, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflectx0_v = _mm512_and_ps(_mm512_sub_ps(gx0, vImgWf), *(__m512*)_ps512_inv_sign_mask); + gx0 = _mm512_sub_ps(vImgWf, reflectx0_v); + + gx0 = 
_mm512_sub_ps(gx0, v0p5fp16); + + _mm512_sub_ps(gx0, v0p5fp16); + + gx0 = _mm512_min_ps(border_x, _mm512_max_ps(gx0, _mm512_setzero_ps())); + + // x1 + gx1 = _mm512_add_ps(gx1, v0p5fp16); + + gx1 = _mm512_and_ps(gx1, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflectx1_v = _mm512_and_ps(_mm512_sub_ps(gx1, vImgWf), *(__m512*)_ps512_inv_sign_mask); + gx1 = _mm512_sub_ps(vImgWf, reflectx1_v); + + gx1 = _mm512_sub_ps(gx1, v0p5fp16); + + _mm512_sub_ps(gx1, v0p5fp16); + + gx1 = _mm512_min_ps(border_x, _mm512_max_ps(gx1, _mm512_setzero_ps())); + + // x2 + gx2 = _mm512_add_ps(gx2, v0p5fp16); + + gx2 = _mm512_and_ps(gx2, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflectx2_v = _mm512_and_ps(_mm512_sub_ps(gx2, vImgWf), *(__m512*)_ps512_inv_sign_mask); + gx2 = _mm512_sub_ps(vImgWf, reflectx2_v); + + gx2 = _mm512_sub_ps(gx2, v0p5fp16); + + _mm512_sub_ps(gx2, v0p5fp16); + + gx2 = _mm512_min_ps(border_x, _mm512_max_ps(gx2, _mm512_setzero_ps())); + + // x3 + gx3 = _mm512_add_ps(gx3, v0p5fp16); + + gx3 = _mm512_and_ps(gx3, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflectx3_v = _mm512_and_ps(_mm512_sub_ps(gx3, vImgWf), *(__m512*)_ps512_inv_sign_mask); + gx3 = _mm512_sub_ps(vImgWf, reflectx3_v); + + gx3 = _mm512_sub_ps(gx3, v0p5fp16); + + _mm512_sub_ps(gx3, v0p5fp16); + + gx3 = _mm512_min_ps(border_x, _mm512_max_ps(gx3, _mm512_setzero_ps())); + } + + __m512i x0 = _mm512_cvtps_epi32(gx0); + __m512i x1 = _mm512_cvtps_epi32(gx1); + __m512i x2 = _mm512_cvtps_epi32(gx2); + __m512i x3 = _mm512_cvtps_epi32(gx3); + + __m512i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm512_add_ps(gy_floor, _mm512_set1_ps(-1.0f + i)); + + { + //y + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + + gy = _mm512_add_ps(gy, v0p5fp16); + + gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflecty_v = _mm512_and_ps(_mm512_sub_ps(gy, vImgHf), *(__m512*)_ps512_inv_sign_mask); + gy = _mm512_sub_ps(vImgHf, reflecty_v); + + gy = _mm512_sub_ps(gy, v0p5fp16); + + _mm512_sub_ps(gy, v0p5fp16); + + gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); + } + + __m512i y = _mm512_cvtps_epi32(gy); + + __m512 v0_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m512 v2_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m512 v3_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + v0_offset[i] = _mm512_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm512_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm512_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm512_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < dst.c; q++) + { + for (int i = 0; i < 4; i++) + { + __m512 x0_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v0_offset[i], src.channel(q), sizeof(float)); + __m512 x1_val = 
_mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v1_offset[i], src.channel(q), sizeof(float)); + __m512 x2_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v2_offset[i], src.channel(q), sizeof(float)); + __m512 x3_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v3_offset[i], src.channel(q), sizeof(float)); + + coefficients[i] = cubic_interp1d_p16(x0_val, x1_val, x2_val, x3_val, tx); + } + + __m512 _v = cubic_interp1d_p16(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_bicubic_align1_reflection_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + float* outptr = static_cast(dst.data); + + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + + const __m512 vElempackf = _mm512_set1_ps(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + + const __m512 two = _mm512_set1_ps(2.f); + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + + __m512 gx_floor = _mm512_floor_ps(gx); + __m512 gy_floor = _mm512_floor_ps(gy); + + const __m512 tx = _mm512_sub_ps(gx, gx_floor); + const __m512 ty = _mm512_sub_ps(gy, gy_floor); + + __m512 coefficients[4]; + + __m512 gx0 = _mm512_add_ps(gx_floor, vn1fp16); + __m512 gx1 = gx_floor; + __m512 gx2 = _mm512_add_ps(gx_floor, v1fp16); + __m512 gx3 = _mm512_add_ps(gx_floor, _mm512_set1_ps(2.0f)); + const __m512 v0p5fp16 = _mm512_set1_ps(0.5f); + { + // x0 + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + + gx0 = _mm512_and_ps(gx0, *(__m512*)_ps512_inv_sign_mask); + __m512 reflectx0_v = _mm512_and_ps(_mm512_sub_ps(gx0, border_x), *(__m512*)_ps512_inv_sign_mask); + gx0 = _mm512_sub_ps(border_x, reflectx0_v); + + // x1 + gx1 = _mm512_and_ps(gx1, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflectx1_v = _mm512_and_ps(_mm512_sub_ps(gx1, border_x), *(__m512*)_ps512_inv_sign_mask); + gx1 = _mm512_sub_ps(border_x, reflectx1_v); + + // x2 + gx2 = _mm512_and_ps(gx2, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflectx2_v = _mm512_and_ps(_mm512_sub_ps(gx2, border_x), *(__m512*)_ps512_inv_sign_mask); + gx2 = _mm512_sub_ps(border_x, reflectx2_v); + + // x3 + gx3 = _mm512_and_ps(gx3, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflectx3_v = _mm512_and_ps(_mm512_sub_ps(gx3, border_x), *(__m512*)_ps512_inv_sign_mask); + gx3 = _mm512_sub_ps(border_x, reflectx3_v); + } + + __m512i x0 = _mm512_cvtps_epi32(gx0); + __m512i x1 = _mm512_cvtps_epi32(gx1); + __m512i x2 = _mm512_cvtps_epi32(gx2); + __m512i x3 = _mm512_cvtps_epi32(gx3); + + __m512i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm512_add_ps(gy_floor, _mm512_set1_ps(-1.0f + i)); + + { + //y + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + + gy = _mm512_and_ps(gy, 
*(__m512*)_ps512_inv_sign_mask); + + __m512 reflecty_v = _mm512_and_ps(_mm512_sub_ps(gy, border_y), *(__m512*)_ps512_inv_sign_mask); + gy = _mm512_sub_ps(border_y, reflecty_v); + } + + __m512i y = _mm512_cvtps_epi32(gy); + + __m512 v0_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m512 v2_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m512 v3_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + v0_offset[i] = _mm512_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm512_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm512_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm512_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < dst.c; q++) + { + for (int i = 0; i < 4; i++) + { + __m512 x0_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v0_offset[i], src.channel(q), sizeof(float)); + __m512 x1_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v1_offset[i], src.channel(q), sizeof(float)); + __m512 x2_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v2_offset[i], src.channel(q), sizeof(float)); + __m512 x3_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v3_offset[i], src.channel(q), sizeof(float)); + + coefficients[i] = cubic_interp1d_p16(x0_val, x1_val, x2_val, x3_val, tx); + } + + __m512 _v = cubic_interp1d_p16(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} \ No newline at end of file diff --git a/src/layer/x86/gridsample_bicubic_pack4.h b/src/layer/x86/gridsample_bicubic_pack4.h index a75b284f808..94b66b30053 100644 --- a/src/layer/x86/gridsample_bicubic_pack4.h +++ b/src/layer/x86/gridsample_bicubic_pack4.h @@ -14,19 +14,19 @@ static NCNN_FORCEINLINE __m128 cubic_interp1d_p4(const __m128& x0_v, const __m128& x1_v, const __m128& x2_v, const __m128& x3_v, const __m128& tx) { - const auto A = _mm_set1_ps(-0.75f); + const __m128 A = _mm_set1_ps(-0.75f); - const auto x0 = _mm_add_ps(tx, v1fp4); - const auto& x1 = tx; - const auto x2 = _mm_sub_ps(v1fp4, tx); - //const auto x3 = _mm_add_ps(x2, v1fp4); + const __m128 x0 = _mm_add_ps(tx, v1fp4); + const __m128& x1 = tx; + const __m128 x2 = _mm_sub_ps(v1fp4, tx); + //const __m128 x3 = _mm_add_ps(x2, v1fp4); const __m128 coeffs0 = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(A, x0), _mm_mul_ps(_mm_set1_ps(5.0f), A)), x0), _mm_mul_ps(_mm_set1_ps(8.0f), A)), x0), _mm_mul_ps(_mm_set1_ps(4), A)); const __m128 coeffs1 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set1_ps(2.0f)), x1), _mm_add_ps(A, _mm_set1_ps(3.0f))), x1), x1), v1fp4); const __m128 coeffs2 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set1_ps(2.0f)), x2), _mm_add_ps(A, _mm_set1_ps(3.0f))), x2), x2), v1fp4); const __m128 coeffs3 = 
_mm_sub_ps(_mm_sub_ps(_mm_sub_ps(v1fp4, coeffs0), coeffs1), coeffs2); - auto _v = _mm_mul_ps(coeffs0, x0_v); + __m128 _v = _mm_mul_ps(coeffs0, x0_v); _v = _mm_comp_fmadd_ps(coeffs1, x1_v, _v); _v = _mm_comp_fmadd_ps(coeffs2, x2_v, _v); _v = _mm_comp_fmadd_ps(coeffs3, x3_v, _v); @@ -36,26 +36,26 @@ static NCNN_FORCEINLINE __m128 cubic_interp1d_p4(const __m128& x0_v, const __m12 static void gridsample_2d_bicubic_align0_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); - const auto vElempackf = _mm_set1_ps(src.elempack); + const __m128 vElempackf = _mm_set1_ps(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); // x gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); @@ -64,52 +64,52 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack4(const Mat& src, Mat& d gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); } - auto gx_floor = _mm_floor_ps(gx); - auto gy_floor = _mm_floor_ps(gy); + __m128 gx_floor = _mm_floor_ps(gx); + __m128 gy_floor = _mm_floor_ps(gy); - const auto tx = _mm_sub_ps(gx, gx_floor); - const auto ty = _mm_sub_ps(gy, gy_floor); + const __m128 tx = _mm_sub_ps(gx, gx_floor); + const __m128 ty = _mm_sub_ps(gy, gy_floor); __m128 coefficients[4]; - auto gx0 = _mm_add_ps(gx_floor, vn1fp4); - auto gx1 = gx_floor; - auto gx2 = _mm_add_ps(gx_floor, v1fp4); - auto gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); + __m128 gx0 = _mm_add_ps(gx_floor, vn1fp4); + __m128 gx1 = gx_floor; + __m128 gx2 = _mm_add_ps(gx_floor, v1fp4); + __m128 gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); - auto x0 = _mm_cvtps_epi32(gx0); - auto x1 = _mm_cvtps_epi32(gx1); - auto x2 = _mm_cvtps_epi32(gx2); - auto x3 = _mm_cvtps_epi32(gx3); + __m128i x0 = _mm_cvtps_epi32(gx0); + __m128i x1 = _mm_cvtps_epi32(gx1); + __m128i x2 = _mm_cvtps_epi32(gx2); + __m128i x3 = _mm_cvtps_epi32(gx3); - auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - auto x2_in_range = _mm_and_si128(_mm_cmpgt_epi32(x2, vn1ip4), _mm_cmpgt_epi32(vImgWi, x2)); - auto x3_in_range = _mm_and_si128(_mm_cmpgt_epi32(x3, vn1ip4), _mm_cmpgt_epi32(vImgWi, x3)); + __m128i x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); + __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + __m128i x2_in_range = _mm_and_si128(_mm_cmpgt_epi32(x2, vn1ip4), _mm_cmpgt_epi32(vImgWi, x2)); + __m128i x3_in_range = _mm_and_si128(_mm_cmpgt_epi32(x3, vn1ip4), _mm_cmpgt_epi32(vImgWi, x3)); __m128i v0_offset[4], 
v1_offset[4], v2_offset[4], v3_offset[4], - v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; for (int i = 0; i < 4; i++) { gy = _mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)); - auto y = _mm_cvtps_epi32(gy); + __m128i y = _mm_cvtps_epi32(gy); - auto y_in_range = _mm_and_si128(_mm_cmpgt_epi32(y, vn1ip4), _mm_cmpgt_epi32(vImgHi, y)); + __m128i y_in_range = _mm_and_si128(_mm_cmpgt_epi32(y, vn1ip4), _mm_cmpgt_epi32(vImgHi, y)); v0_in_range[i] = _mm_and_si128(x0_in_range, y_in_range); v1_in_range[i] = _mm_and_si128(x1_in_range, y_in_range); v2_in_range[i] = _mm_and_si128(x2_in_range, y_in_range); v3_in_range[i] = _mm_and_si128(x3_in_range, y_in_range); - auto v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); @@ -121,15 +121,15 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack4(const Mat& src, Mat& d { for (int i = 0; i < 4; i++) { - auto x0_val = mask_gather_ps(src.channel(q), v0_offset[i], *reinterpret_cast<__m128*>(&v0_in_range[i])); - auto x1_val = mask_gather_ps(src.channel(q), v1_offset[i], *reinterpret_cast<__m128*>(&v1_in_range[i])); - auto x2_val = mask_gather_ps(src.channel(q), v2_offset[i], *reinterpret_cast<__m128*>(&v2_in_range[i])); - auto x3_val = mask_gather_ps(src.channel(q), v3_offset[i], *reinterpret_cast<__m128*>(&v3_in_range[i])); + __m128 x0_val = mask_gather_ps(src.channel(q), v0_offset[i], *reinterpret_cast<__m128*>(&v0_in_range[i])); + __m128 x1_val = mask_gather_ps(src.channel(q), v1_offset[i], *reinterpret_cast<__m128*>(&v1_in_range[i])); + __m128 x2_val = mask_gather_ps(src.channel(q), v2_offset[i], *reinterpret_cast<__m128*>(&v2_in_range[i])); + __m128 x3_val = mask_gather_ps(src.channel(q), v3_offset[i], *reinterpret_cast<__m128*>(&v3_in_range[i])); coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); } - auto _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + __m128 _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -139,77 +139,77 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack4(const Mat& src, Mat& d static void gridsample_2d_bicubic_align1_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = 
_mm_set1_ps(src.h); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); - const auto vElempackf = _mm_set1_ps(src.elempack); + const __m128 vElempackf = _mm_set1_ps(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); } - auto gx_floor = _mm_floor_ps(gx); - auto gy_floor = _mm_floor_ps(gy); + __m128 gx_floor = _mm_floor_ps(gx); + __m128 gy_floor = _mm_floor_ps(gy); - const auto tx = _mm_sub_ps(gx, gx_floor); - const auto ty = _mm_sub_ps(gy, gy_floor); + const __m128 tx = _mm_sub_ps(gx, gx_floor); + const __m128 ty = _mm_sub_ps(gy, gy_floor); __m128 coefficients[4]; - auto gx0 = _mm_add_ps(gx_floor, vn1fp4); - auto gx1 = gx_floor; - auto gx2 = _mm_add_ps(gx_floor, v1fp4); - auto gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); + __m128 gx0 = _mm_add_ps(gx_floor, vn1fp4); + __m128 gx1 = gx_floor; + __m128 gx2 = _mm_add_ps(gx_floor, v1fp4); + __m128 gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); - auto x0 = _mm_cvtps_epi32(gx0); - auto x1 = _mm_cvtps_epi32(gx1); - auto x2 = _mm_cvtps_epi32(gx2); - auto x3 = _mm_cvtps_epi32(gx3); + __m128i x0 = _mm_cvtps_epi32(gx0); + __m128i x1 = _mm_cvtps_epi32(gx1); + __m128i x2 = _mm_cvtps_epi32(gx2); + __m128i x3 = _mm_cvtps_epi32(gx3); - auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - auto x2_in_range = _mm_and_si128(_mm_cmpgt_epi32(x2, vn1ip4), _mm_cmpgt_epi32(vImgWi, x2)); - auto x3_in_range = _mm_and_si128(_mm_cmpgt_epi32(x3, vn1ip4), _mm_cmpgt_epi32(vImgWi, x3)); + __m128i x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); + __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + __m128i x2_in_range = _mm_and_si128(_mm_cmpgt_epi32(x2, vn1ip4), _mm_cmpgt_epi32(vImgWi, x2)); + __m128i x3_in_range = _mm_and_si128(_mm_cmpgt_epi32(x3, vn1ip4), _mm_cmpgt_epi32(vImgWi, x3)); __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], - v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; for (int i = 0; i < 4; i++) { gy = _mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)); - auto y = _mm_cvtps_epi32(gy); + __m128i y = _mm_cvtps_epi32(gy); - auto y_in_range = _mm_and_si128(_mm_cmpgt_epi32(y, vn1ip4), _mm_cmpgt_epi32(vImgHi, y)); + __m128i y_in_range = _mm_and_si128(_mm_cmpgt_epi32(y, vn1ip4), _mm_cmpgt_epi32(vImgHi, y)); v0_in_range[i] = _mm_and_si128(x0_in_range, y_in_range); v1_in_range[i] = _mm_and_si128(x1_in_range, y_in_range); v2_in_range[i] = _mm_and_si128(x2_in_range, 
y_in_range); v3_in_range[i] = _mm_and_si128(x3_in_range, y_in_range); - auto v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); @@ -221,15 +221,15 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack4(const Mat& src, Mat& d { for (int i = 0; i < 4; i++) { - auto x0_val = mask_gather_ps(src.channel(q), v0_offset[i], *reinterpret_cast<__m128*>(&v0_in_range[i])); - auto x1_val = mask_gather_ps(src.channel(q), v1_offset[i], *reinterpret_cast<__m128*>(&v1_in_range[i])); - auto x2_val = mask_gather_ps(src.channel(q), v2_offset[i], *reinterpret_cast<__m128*>(&v2_in_range[i])); - auto x3_val = mask_gather_ps(src.channel(q), v3_offset[i], *reinterpret_cast<__m128*>(&v3_in_range[i])); + __m128 x0_val = mask_gather_ps(src.channel(q), v0_offset[i], *reinterpret_cast<__m128*>(&v0_in_range[i])); + __m128 x1_val = mask_gather_ps(src.channel(q), v1_offset[i], *reinterpret_cast<__m128*>(&v1_in_range[i])); + __m128 x2_val = mask_gather_ps(src.channel(q), v2_offset[i], *reinterpret_cast<__m128*>(&v2_in_range[i])); + __m128 x3_val = mask_gather_ps(src.channel(q), v3_offset[i], *reinterpret_cast<__m128*>(&v3_in_range[i])); coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); } - auto _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + __m128 _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -239,51 +239,51 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack4(const Mat& src, Mat& d static void gridsample_2d_bicubic_align0_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); - const auto vElempackf = _mm_set1_ps(src.elempack); + const __m128 vElempackf = _mm_set1_ps(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) { //grid tensor has been packed const float* gridptr = grid.channel(y / 
grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - const auto two = _mm_set1_ps(2.f); - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 two = _mm_set1_ps(2.f); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - auto gx_floor = _mm_floor_ps(gx); - auto gy_floor = _mm_floor_ps(gy); + __m128 gx_floor = _mm_floor_ps(gx); + __m128 gy_floor = _mm_floor_ps(gy); - const auto tx = _mm_sub_ps(gx, gx_floor); - const auto ty = _mm_sub_ps(gy, gy_floor); + const __m128 tx = _mm_sub_ps(gx, gx_floor); + const __m128 ty = _mm_sub_ps(gy, gy_floor); __m128 coefficients[4]; - auto gx0 = _mm_add_ps(gx_floor, vn1fp4); - auto gx1 = gx_floor; - auto gx2 = _mm_add_ps(gx_floor, v1fp4); - auto gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); + __m128 gx0 = _mm_add_ps(gx_floor, vn1fp4); + __m128 gx1 = gx_floor; + __m128 gx2 = _mm_add_ps(gx_floor, v1fp4); + __m128 gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); gx0 = _mm_min_ps(border_x, _mm_max_ps(gx0, _mm_setzero_ps())); gx1 = _mm_min_ps(border_x, _mm_max_ps(gx1, _mm_setzero_ps())); gx2 = _mm_min_ps(border_x, _mm_max_ps(gx2, _mm_setzero_ps())); gx3 = _mm_min_ps(border_x, _mm_max_ps(gx3, _mm_setzero_ps())); - auto x0 = _mm_cvtps_epi32(gx0); - auto x1 = _mm_cvtps_epi32(gx1); - auto x2 = _mm_cvtps_epi32(gx2); - auto x3 = _mm_cvtps_epi32(gx3); + __m128i x0 = _mm_cvtps_epi32(gx0); + __m128i x1 = _mm_cvtps_epi32(gx1); + __m128i x2 = _mm_cvtps_epi32(gx2); + __m128i x3 = _mm_cvtps_epi32(gx3); __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; for (int i = 0; i < 4; i++) @@ -291,16 +291,16 @@ static void gridsample_2d_bicubic_align0_border_blob_pack4(const Mat& src, Mat& gy = _mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)); gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - auto y = _mm_cvtps_epi32(gy); + __m128i y = _mm_cvtps_epi32(gy); - auto v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); @@ -312,15 +312,15 @@ static void gridsample_2d_bicubic_align0_border_blob_pack4(const Mat& src, Mat& { for (int i 
= 0; i < 4; i++) { - auto x0_val = mask_gather_ps(src.channel(q), v0_offset[i], vn1fp4); - auto x1_val = mask_gather_ps(src.channel(q), v1_offset[i], vn1fp4); - auto x2_val = mask_gather_ps(src.channel(q), v2_offset[i], vn1fp4); - auto x3_val = mask_gather_ps(src.channel(q), v3_offset[i], vn1fp4); + __m128 x0_val = mask_gather_ps(src.channel(q), v0_offset[i], vn1fp4); + __m128 x1_val = mask_gather_ps(src.channel(q), v1_offset[i], vn1fp4); + __m128 x2_val = mask_gather_ps(src.channel(q), v2_offset[i], vn1fp4); + __m128 x3_val = mask_gather_ps(src.channel(q), v3_offset[i], vn1fp4); coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); } - auto _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + __m128 _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -330,52 +330,52 @@ static void gridsample_2d_bicubic_align0_border_blob_pack4(const Mat& src, Mat& static void gridsample_2d_bicubic_align1_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); - const auto vElempackf = _mm_set1_ps(src.elempack); + const __m128 vElempackf = _mm_set1_ps(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - const auto two = _mm_set1_ps(2.f); - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 two = _mm_set1_ps(2.f); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - auto gx_floor = _mm_floor_ps(gx); - auto gy_floor = _mm_floor_ps(gy); + __m128 gx_floor = _mm_floor_ps(gx); + __m128 gy_floor = _mm_floor_ps(gy); - const auto tx = _mm_sub_ps(gx, gx_floor); - const auto ty = _mm_sub_ps(gy, gy_floor); + const __m128 tx = _mm_sub_ps(gx, gx_floor); + const __m128 ty = _mm_sub_ps(gy, gy_floor); __m128 coefficients[4]; - auto gx0 = _mm_add_ps(gx_floor, vn1fp4); - auto gx1 = gx_floor; - auto gx2 = _mm_add_ps(gx_floor, v1fp4); - auto gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); + __m128 gx0 = _mm_add_ps(gx_floor, vn1fp4); + __m128 gx1 = gx_floor; + __m128 gx2 = _mm_add_ps(gx_floor, v1fp4); + __m128 gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); gx0 = _mm_min_ps(border_x, _mm_max_ps(gx0, _mm_setzero_ps())); gx1 = _mm_min_ps(border_x, _mm_max_ps(gx1, _mm_setzero_ps())); gx2 = _mm_min_ps(border_x, _mm_max_ps(gx2, _mm_setzero_ps())); gx3 = _mm_min_ps(border_x, _mm_max_ps(gx3, _mm_setzero_ps())); - auto x0 = _mm_cvtps_epi32(gx0); - auto x1 = _mm_cvtps_epi32(gx1); - auto 
x2 = _mm_cvtps_epi32(gx2); - auto x3 = _mm_cvtps_epi32(gx3); + __m128i x0 = _mm_cvtps_epi32(gx0); + __m128i x1 = _mm_cvtps_epi32(gx1); + __m128i x2 = _mm_cvtps_epi32(gx2); + __m128i x3 = _mm_cvtps_epi32(gx3); __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; for (int i = 0; i < 4; i++) @@ -383,16 +383,16 @@ static void gridsample_2d_bicubic_align1_border_blob_pack4(const Mat& src, Mat& gy = _mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)); gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - auto y = _mm_cvtps_epi32(gy); + __m128i y = _mm_cvtps_epi32(gy); - auto v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); @@ -404,15 +404,15 @@ static void gridsample_2d_bicubic_align1_border_blob_pack4(const Mat& src, Mat& { for (int i = 0; i < 4; i++) { - auto x0_val = mask_gather_ps(src.channel(q), v0_offset[i], vn1fp4); - auto x1_val = mask_gather_ps(src.channel(q), v1_offset[i], vn1fp4); - auto x2_val = mask_gather_ps(src.channel(q), v2_offset[i], vn1fp4); - auto x3_val = mask_gather_ps(src.channel(q), v3_offset[i], vn1fp4); + __m128 x0_val = mask_gather_ps(src.channel(q), v0_offset[i], vn1fp4); + __m128 x1_val = mask_gather_ps(src.channel(q), v1_offset[i], vn1fp4); + __m128 x2_val = mask_gather_ps(src.channel(q), v2_offset[i], vn1fp4); + __m128 x3_val = mask_gather_ps(src.channel(q), v3_offset[i], vn1fp4); coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); } - auto _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + __m128 _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -422,51 +422,51 @@ static void gridsample_2d_bicubic_align1_border_blob_pack4(const Mat& src, Mat& static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); - const auto vElempackf = _mm_set1_ps(src.elempack); + const __m128 vElempackf = _mm_set1_ps(src.elempack); - #pragma omp parallel for 
num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - const auto two = _mm_set1_ps(2.f); - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 two = _mm_set1_ps(2.f); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - auto gx_floor = _mm_floor_ps(gx); - auto gy_floor = _mm_floor_ps(gy); + __m128 gx_floor = _mm_floor_ps(gx); + __m128 gy_floor = _mm_floor_ps(gy); - const auto tx = _mm_sub_ps(gx, gx_floor); - const auto ty = _mm_sub_ps(gy, gy_floor); + const __m128 tx = _mm_sub_ps(gx, gx_floor); + const __m128 ty = _mm_sub_ps(gy, gy_floor); __m128 coefficients[4]; - auto gx0 = _mm_add_ps(gx_floor, vn1fp4); - auto gx1 = gx_floor; - auto gx2 = _mm_add_ps(gx_floor, v1fp4); - auto gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); - const auto v0p5fp4 = _mm_set1_ps(0.5f); + __m128 gx0 = _mm_add_ps(gx_floor, vn1fp4); + __m128 gx1 = gx_floor; + __m128 gx2 = _mm_add_ps(gx_floor, v1fp4); + __m128 gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); + const __m128 v0p5fp4 = _mm_set1_ps(0.5f); { // x0 - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); gx0 = _mm_add_ps(gx0, v0p5fp4); - gx0 = _mm_and_ps(gx0, *(__m128*)_ps256_inv_sign_mask); + gx0 = _mm_and_ps(gx0, *(__m128*)_ps_inv_sign_mask); - auto reflectx0_v = _mm_and_ps(_mm_sub_ps(gx0, vImgWf), *(__m128*)_ps256_inv_sign_mask); + __m128 reflectx0_v = _mm_and_ps(_mm_sub_ps(gx0, vImgWf), *(__m128*)_ps_inv_sign_mask); gx0 = _mm_sub_ps(vImgWf, reflectx0_v); gx0 = _mm_sub_ps(gx0, v0p5fp4); @@ -478,9 +478,9 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, M // x1 gx1 = _mm_add_ps(gx1, v0p5fp4); - gx1 = _mm_and_ps(gx1, *(__m128*)_ps256_inv_sign_mask); + gx1 = _mm_and_ps(gx1, *(__m128*)_ps_inv_sign_mask); - auto reflectx1_v = _mm_and_ps(_mm_sub_ps(gx1, vImgWf), *(__m128*)_ps256_inv_sign_mask); + __m128 reflectx1_v = _mm_and_ps(_mm_sub_ps(gx1, vImgWf), *(__m128*)_ps_inv_sign_mask); gx1 = _mm_sub_ps(vImgWf, reflectx1_v); gx1 = _mm_sub_ps(gx1, v0p5fp4); @@ -492,9 +492,9 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, M // x2 gx2 = _mm_add_ps(gx2, v0p5fp4); - gx2 = _mm_and_ps(gx2, *(__m128*)_ps256_inv_sign_mask); + gx2 = _mm_and_ps(gx2, *(__m128*)_ps_inv_sign_mask); - auto reflectx2_v = _mm_and_ps(_mm_sub_ps(gx2, vImgWf), *(__m128*)_ps256_inv_sign_mask); + __m128 reflectx2_v = _mm_and_ps(_mm_sub_ps(gx2, vImgWf), *(__m128*)_ps_inv_sign_mask); gx2 = _mm_sub_ps(vImgWf, reflectx2_v); gx2 = _mm_sub_ps(gx2, v0p5fp4); @@ -506,9 +506,9 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, M // x3 gx3 = _mm_add_ps(gx3, v0p5fp4); - gx3 = _mm_and_ps(gx3, *(__m128*)_ps256_inv_sign_mask); + gx3 = _mm_and_ps(gx3, *(__m128*)_ps_inv_sign_mask); - auto reflectx3_v = _mm_and_ps(_mm_sub_ps(gx3, vImgWf), *(__m128*)_ps256_inv_sign_mask); + __m128 reflectx3_v = _mm_and_ps(_mm_sub_ps(gx3, 
vImgWf), *(__m128*)_ps_inv_sign_mask); gx3 = _mm_sub_ps(vImgWf, reflectx3_v); gx3 = _mm_sub_ps(gx3, v0p5fp4); @@ -518,10 +518,10 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, M gx3 = _mm_min_ps(border_x, _mm_max_ps(gx3, _mm_setzero_ps())); } - auto x0 = _mm_cvtps_epi32(gx0); - auto x1 = _mm_cvtps_epi32(gx1); - auto x2 = _mm_cvtps_epi32(gx2); - auto x3 = _mm_cvtps_epi32(gx3); + __m128i x0 = _mm_cvtps_epi32(gx0); + __m128i x1 = _mm_cvtps_epi32(gx1); + __m128i x2 = _mm_cvtps_epi32(gx2); + __m128i x3 = _mm_cvtps_epi32(gx3); __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; for (int i = 0; i < 4; i++) @@ -530,13 +530,13 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, M { //y - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); gy = _mm_add_ps(gy, v0p5fp4); - gy = _mm_and_ps(gy, *(__m128*)_ps256_inv_sign_mask); + gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); - auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, vImgHf), *(__m128*)_ps256_inv_sign_mask); + __m128 reflecty_v = _mm_and_ps(_mm_sub_ps(gy, vImgHf), *(__m128*)_ps_inv_sign_mask); gy = _mm_sub_ps(vImgHf, reflecty_v); gy = _mm_sub_ps(gy, v0p5fp4); @@ -546,16 +546,16 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, M gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); } - auto y = _mm_cvtps_epi32(gy); + __m128i y = _mm_cvtps_epi32(gy); - auto v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); @@ -567,15 +567,15 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, M { for (int i = 0; i < 4; i++) { - auto x0_val = mask_gather_ps(src.channel(q), v0_offset[i], vn1fp4); - auto x1_val = mask_gather_ps(src.channel(q), v1_offset[i], vn1fp4); - auto x2_val = mask_gather_ps(src.channel(q), v2_offset[i], vn1fp4); - auto x3_val = mask_gather_ps(src.channel(q), v3_offset[i], vn1fp4); + __m128 x0_val = mask_gather_ps(src.channel(q), v0_offset[i], vn1fp4); + __m128 x1_val = mask_gather_ps(src.channel(q), v1_offset[i], vn1fp4); + __m128 x2_val = mask_gather_ps(src.channel(q), v2_offset[i], vn1fp4); + __m128 x3_val = mask_gather_ps(src.channel(q), v3_offset[i], vn1fp4); coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); } - auto _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + __m128 
_v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -587,74 +587,74 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack4(const Mat& src, M { float* outptr = static_cast(dst.data); - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); - const auto vElempackf = _mm_set1_ps(src.elempack); + const __m128 vElempackf = _mm_set1_ps(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - const auto two = _mm_set1_ps(2.f); - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 two = _mm_set1_ps(2.f); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - auto gx_floor = _mm_floor_ps(gx); - auto gy_floor = _mm_floor_ps(gy); + __m128 gx_floor = _mm_floor_ps(gx); + __m128 gy_floor = _mm_floor_ps(gy); - const auto tx = _mm_sub_ps(gx, gx_floor); - const auto ty = _mm_sub_ps(gy, gy_floor); + const __m128 tx = _mm_sub_ps(gx, gx_floor); + const __m128 ty = _mm_sub_ps(gy, gy_floor); __m128 coefficients[4]; - auto gx0 = _mm_add_ps(gx_floor, vn1fp4); - auto gx1 = gx_floor; - auto gx2 = _mm_add_ps(gx_floor, v1fp4); - auto gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); - const auto v0p5fp4 = _mm_set1_ps(0.5f); + __m128 gx0 = _mm_add_ps(gx_floor, vn1fp4); + __m128 gx1 = gx_floor; + __m128 gx2 = _mm_add_ps(gx_floor, v1fp4); + __m128 gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); + const __m128 v0p5fp4 = _mm_set1_ps(0.5f); { // x0 - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - gx0 = _mm_and_ps(gx0, *(__m128*)_ps256_inv_sign_mask); - auto reflectx0_v = _mm_and_ps(_mm_sub_ps(gx0, border_x), *(__m128*)_ps256_inv_sign_mask); + gx0 = _mm_and_ps(gx0, *(__m128*)_ps_inv_sign_mask); + __m128 reflectx0_v = _mm_and_ps(_mm_sub_ps(gx0, border_x), *(__m128*)_ps_inv_sign_mask); gx0 = _mm_sub_ps(border_x, reflectx0_v); // x1 - gx1 = _mm_and_ps(gx1, *(__m128*)_ps256_inv_sign_mask); + gx1 = _mm_and_ps(gx1, *(__m128*)_ps_inv_sign_mask); - auto reflectx1_v = _mm_and_ps(_mm_sub_ps(gx1, border_x), *(__m128*)_ps256_inv_sign_mask); + __m128 reflectx1_v = _mm_and_ps(_mm_sub_ps(gx1, border_x), *(__m128*)_ps_inv_sign_mask); gx1 = _mm_sub_ps(border_x, reflectx1_v); // x2 - gx2 = _mm_and_ps(gx2, *(__m128*)_ps256_inv_sign_mask); + gx2 = _mm_and_ps(gx2, *(__m128*)_ps_inv_sign_mask); - auto reflectx2_v = _mm_and_ps(_mm_sub_ps(gx2, border_x), *(__m128*)_ps256_inv_sign_mask); + __m128 reflectx2_v = _mm_and_ps(_mm_sub_ps(gx2, border_x), *(__m128*)_ps_inv_sign_mask); gx2 = 
_mm_sub_ps(border_x, reflectx2_v); // x3 - gx3 = _mm_and_ps(gx3, *(__m128*)_ps256_inv_sign_mask); + gx3 = _mm_and_ps(gx3, *(__m128*)_ps_inv_sign_mask); - auto reflectx3_v = _mm_and_ps(_mm_sub_ps(gx3, border_x), *(__m128*)_ps256_inv_sign_mask); + __m128 reflectx3_v = _mm_and_ps(_mm_sub_ps(gx3, border_x), *(__m128*)_ps_inv_sign_mask); gx3 = _mm_sub_ps(border_x, reflectx3_v); } - auto x0 = _mm_cvtps_epi32(gx0); - auto x1 = _mm_cvtps_epi32(gx1); - auto x2 = _mm_cvtps_epi32(gx2); - auto x3 = _mm_cvtps_epi32(gx3); + __m128i x0 = _mm_cvtps_epi32(gx0); + __m128i x1 = _mm_cvtps_epi32(gx1); + __m128i x2 = _mm_cvtps_epi32(gx2); + __m128i x3 = _mm_cvtps_epi32(gx3); __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; for (int i = 0; i < 4; i++) @@ -663,24 +663,24 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack4(const Mat& src, M { //y - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - gy = _mm_and_ps(gy, *(__m128*)_ps256_inv_sign_mask); + gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); - auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps256_inv_sign_mask); + __m128 reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps_inv_sign_mask); gy = _mm_sub_ps(border_y, reflecty_v); } - auto y = _mm_cvtps_epi32(gy); + __m128i y = _mm_cvtps_epi32(gy); - auto v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - auto v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + __m128 v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); @@ -692,15 +692,15 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack4(const Mat& src, M { for (int i = 0; i < 4; i++) { - auto x0_val = mask_gather_ps(src.channel(q), v0_offset[i], vn1fp4); - auto x1_val = mask_gather_ps(src.channel(q), v1_offset[i], vn1fp4); - auto x2_val = mask_gather_ps(src.channel(q), v2_offset[i], vn1fp4); - auto x3_val = mask_gather_ps(src.channel(q), v3_offset[i], vn1fp4); + __m128 x0_val = mask_gather_ps(src.channel(q), v0_offset[i], vn1fp4); + __m128 x1_val = mask_gather_ps(src.channel(q), v1_offset[i], vn1fp4); + __m128 x2_val = mask_gather_ps(src.channel(q), v2_offset[i], vn1fp4); + __m128 x3_val = mask_gather_ps(src.channel(q), v3_offset[i], vn1fp4); coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); } - auto _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + __m128 _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); 
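// The offset vectors built above all encode the same packed-layout rule: for a row-major
// blob of width W stored with elempack interleaved floats per pixel, the float for lane
// `lane` of pixel (x, y) sits at ((y * W + x) * elempack) + lane. A minimal scalar sketch
// of that rule, assuming this layout; packed_offset is an illustrative helper, not an ncnn API:
static inline int packed_offset(int x, int y, int W, int elempack, int lane)
{
    // same arithmetic as _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx), vElempackf), _mm_set_ps(3.f, 2.f, 1.f, 0.f)),
    // evaluated one lane at a time for elempack == 4
    return (y * W + x) * elempack + lane;
}
// With elempack == 4 the constant _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f) supplies lanes 3..0 of one
// source pixel in a single vector, which is what the mask_gather_ps calls then read back.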
_mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } diff --git a/src/layer/x86/gridsample_bicubic_pack8.h b/src/layer/x86/gridsample_bicubic_pack8.h index d38997a8ac4..d39fbe591b1 100644 --- a/src/layer/x86/gridsample_bicubic_pack8.h +++ b/src/layer/x86/gridsample_bicubic_pack8.h @@ -14,19 +14,19 @@ static NCNN_FORCEINLINE __m256 cubic_interp1d_p8(const __m256& x0_v, const __m256& x1_v, const __m256& x2_v, const __m256& x3_v, const __m256& tx) { - const auto A = _mm256_set1_ps(-0.75f); + const __m256 A = _mm256_set1_ps(-0.75f); - const auto x0 = _mm256_add_ps(tx, v1fp8); - const auto& x1 = tx; - const auto x2 = _mm256_sub_ps(v1fp8, tx); - //const auto x3 = _mm256_add_ps(x2, v1fp8); + const __m256 x0 = _mm256_add_ps(tx, v1fp8); + const __m256& x1 = tx; + const __m256 x2 = _mm256_sub_ps(v1fp8, tx); + //const __m256 x3 = _mm256_add_ps(x2, v1fp8); const __m256 coeffs0 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x0), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(4), A)); const __m256 coeffs1 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x1), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x1), x1), v1fp8); const __m256 coeffs2 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x2), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x2), x2), v1fp8); const __m256 coeffs3 = _mm256_sub_ps(_mm256_sub_ps(_mm256_sub_ps(v1fp8, coeffs0), coeffs1), coeffs2); - auto _v = _mm256_mul_ps(coeffs0, x0_v); + __m256 _v = _mm256_mul_ps(coeffs0, x0_v); _v = _mm256_comp_fmadd_ps(coeffs1, x1_v, _v); _v = _mm256_comp_fmadd_ps(coeffs2, x2_v, _v); _v = _mm256_comp_fmadd_ps(coeffs3, x3_v, _v); @@ -36,12 +36,12 @@ static NCNN_FORCEINLINE __m256 cubic_interp1d_p8(const __m256& x0_v, const __m25 static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); - const auto vElempackf = _mm256_set1_ps(src.elempack); + const __m256 vElempackf = _mm256_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -50,12 +50,12 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& d { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); // x gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); @@ -64,28 +64,28 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& d gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); } - auto gx_floor = _mm256_floor_ps(gx); - auto gy_floor = _mm256_floor_ps(gy); + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 
gy_floor = _mm256_floor_ps(gy); - const auto tx = _mm256_sub_ps(gx, gx_floor); - const auto ty = _mm256_sub_ps(gy, gy_floor); + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); __m256 coefficients[4]; - auto gx0 = _mm256_add_ps(gx_floor, vn1fp8); - auto gx1 = gx_floor; - auto gx2 = _mm256_add_ps(gx_floor, v1fp8); - auto gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - - auto x0 = _mm256_cvtps_epi32(gx0); - auto x1 = _mm256_cvtps_epi32(gx1); - auto x2 = _mm256_cvtps_epi32(gx2); - auto x3 = _mm256_cvtps_epi32(gx3); - - auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x2)); - auto x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); + __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + + __m256i x0 = _mm256_cvtps_epi32(gx0); + __m256i x1 = _mm256_cvtps_epi32(gx1); + __m256i x2 = _mm256_cvtps_epi32(gx2); + __m256i x3 = _mm256_cvtps_epi32(gx3); + + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x2)); + __m256i x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; @@ -93,22 +93,22 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& d { gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - auto y = _mm256_cvtps_epi32(gy); + __m256i y = _mm256_cvtps_epi32(gy); - auto y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y)); + __m256i y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y)); v0_in_range[i] = _mm256_and_si256(x0_in_range, y_in_range); v1_in_range[i] = _mm256_and_si256(x1_in_range, y_in_range); v2_in_range[i] = _mm256_and_si256(x2_in_range, y_in_range); v3_in_range[i] = _mm256_and_si256(x3_in_range, y_in_range); - auto v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), + __m256 v3_offset_f = 
_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); @@ -121,15 +121,15 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& d { for (int i = 0; i < 4; i++) { - auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], *reinterpret_cast<__m256*>(&v0_in_range[i]), sizeof(float)); - auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i]), sizeof(float)); - auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i]), sizeof(float)); - auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i]), sizeof(float)); + __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], *reinterpret_cast<__m256*>(&v0_in_range[i]), sizeof(float)); + __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i]), sizeof(float)); + __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i]), sizeof(float)); + __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i]), sizeof(float)); coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); } - auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -139,12 +139,12 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& d static void gridsample_2d_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); - const auto vElempackf = _mm256_set1_ps(src.elempack); + const __m256 vElempackf = _mm256_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -153,39 +153,39 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& d { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); } - auto gx_floor = _mm256_floor_ps(gx); - auto gy_floor = _mm256_floor_ps(gy); + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 
gy_floor = _mm256_floor_ps(gy); - const auto tx = _mm256_sub_ps(gx, gx_floor); - const auto ty = _mm256_sub_ps(gy, gy_floor); + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); __m256 coefficients[4]; - auto gx0 = _mm256_add_ps(gx_floor, vn1fp8); - auto gx1 = gx_floor; - auto gx2 = _mm256_add_ps(gx_floor, v1fp8); - auto gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - auto x0 = _mm256_cvtps_epi32(gx0); - auto x1 = _mm256_cvtps_epi32(gx1); - auto x2 = _mm256_cvtps_epi32(gx2); - auto x3 = _mm256_cvtps_epi32(gx3); + __m256i x0 = _mm256_cvtps_epi32(gx0); + __m256i x1 = _mm256_cvtps_epi32(gx1); + __m256i x2 = _mm256_cvtps_epi32(gx2); + __m256i x3 = _mm256_cvtps_epi32(gx3); - auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x2)); - auto x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x2)); + __m256i x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; @@ -193,22 +193,22 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& d { gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - auto y = _mm256_cvtps_epi32(gy); + __m256i y = _mm256_cvtps_epi32(gy); - auto y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y)); + __m256i y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y)); v0_in_range[i] = _mm256_and_si256(x0_in_range, y_in_range); v1_in_range[i] = _mm256_and_si256(x1_in_range, y_in_range); v2_in_range[i] = _mm256_and_si256(x2_in_range, y_in_range); v3_in_range[i] = _mm256_and_si256(x3_in_range, y_in_range); - auto v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), + __m256 v3_offset_f = 
_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); @@ -221,15 +221,15 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& d { for (int i = 0; i < 4; i++) { - auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], *reinterpret_cast<__m256*>(&v0_in_range[i]), sizeof(float)); - auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i]), sizeof(float)); - auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i]), sizeof(float)); - auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i]), sizeof(float)); + __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], *reinterpret_cast<__m256*>(&v0_in_range[i]), sizeof(float)); + __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i]), sizeof(float)); + __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i]), sizeof(float)); + __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i]), sizeof(float)); coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); } - auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -239,12 +239,12 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& d static void gridsample_2d_bicubic_align0_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); - const auto vElempackf = _mm256_set1_ps(src.elempack); + const __m256 vElempackf = _mm256_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -253,37 +253,37 @@ static void gridsample_2d_bicubic_align0_border_blob_pack8(const Mat& src, Mat& { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - const auto two = _mm256_set1_ps(2.f); - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 two = _mm256_set1_ps(2.f); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); gy = 
_mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); - auto gx_floor = _mm256_floor_ps(gx); - auto gy_floor = _mm256_floor_ps(gy); + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); - const auto tx = _mm256_sub_ps(gx, gx_floor); - const auto ty = _mm256_sub_ps(gy, gy_floor); + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); __m256 coefficients[4]; - auto gx0 = _mm256_add_ps(gx_floor, vn1fp8); - auto gx1 = gx_floor; - auto gx2 = _mm256_add_ps(gx_floor, v1fp8); - auto gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); - auto x0 = _mm256_cvtps_epi32(gx0); - auto x1 = _mm256_cvtps_epi32(gx1); - auto x2 = _mm256_cvtps_epi32(gx2); - auto x3 = _mm256_cvtps_epi32(gx3); + __m256i x0 = _mm256_cvtps_epi32(gx0); + __m256i x1 = _mm256_cvtps_epi32(gx1); + __m256i x2 = _mm256_cvtps_epi32(gx2); + __m256i x3 = _mm256_cvtps_epi32(gx3); __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; for (int i = 0; i < 4; i++) @@ -291,15 +291,15 @@ static void gridsample_2d_bicubic_align0_border_blob_pack8(const Mat& src, Mat& gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - auto y = _mm256_cvtps_epi32(gy); + __m256i y = _mm256_cvtps_epi32(gy); - auto v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); @@ -312,15 +312,15 @@ static void gridsample_2d_bicubic_align0_border_blob_pack8(const Mat& src, Mat& { for (int i = 0; i < 4; i++) { - auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], vn1fp8, sizeof(float)); - auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], vn1fp8, sizeof(float)); - auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], vn1fp8, sizeof(float)); - auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), 
v3_offset[i], vn1fp8, sizeof(float)); + __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], vn1fp8, sizeof(float)); + __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], vn1fp8, sizeof(float)); + __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], vn1fp8, sizeof(float)); + __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], vn1fp8, sizeof(float)); coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); } - auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -330,12 +330,12 @@ static void gridsample_2d_bicubic_align0_border_blob_pack8(const Mat& src, Mat& static void gridsample_2d_bicubic_align1_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); - const auto vElempackf = _mm256_set1_ps(src.elempack); + const __m256 vElempackf = _mm256_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -344,38 +344,38 @@ static void gridsample_2d_bicubic_align1_border_blob_pack8(const Mat& src, Mat& { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - const auto two = _mm256_set1_ps(2.f); - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 two = _mm256_set1_ps(2.f); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); - auto gx_floor = _mm256_floor_ps(gx); - auto gy_floor = _mm256_floor_ps(gy); + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); - const auto tx = _mm256_sub_ps(gx, gx_floor); - const auto ty = _mm256_sub_ps(gy, gy_floor); + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); __m256 coefficients[4]; - auto gx0 = _mm256_add_ps(gx_floor, vn1fp8); - auto gx1 = gx_floor; - auto gx2 = _mm256_add_ps(gx_floor, v1fp8); - auto gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); gx3 = _mm256_min_ps(border_x, 
_mm256_max_ps(gx3, _mm256_setzero_ps())); - auto x0 = _mm256_cvtps_epi32(gx0); - auto x1 = _mm256_cvtps_epi32(gx1); - auto x2 = _mm256_cvtps_epi32(gx2); - auto x3 = _mm256_cvtps_epi32(gx3); + __m256i x0 = _mm256_cvtps_epi32(gx0); + __m256i x1 = _mm256_cvtps_epi32(gx1); + __m256i x2 = _mm256_cvtps_epi32(gx2); + __m256i x3 = _mm256_cvtps_epi32(gx3); __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; for (int i = 0; i < 4; i++) @@ -383,15 +383,15 @@ static void gridsample_2d_bicubic_align1_border_blob_pack8(const Mat& src, Mat& gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - auto y = _mm256_cvtps_epi32(gy); + __m256i y = _mm256_cvtps_epi32(gy); - auto v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); @@ -404,15 +404,15 @@ static void gridsample_2d_bicubic_align1_border_blob_pack8(const Mat& src, Mat& { for (int i = 0; i < 4; i++) { - auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], vn1fp8, sizeof(float)); - auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], vn1fp8, sizeof(float)); - auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], vn1fp8, sizeof(float)); - auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], vn1fp8, sizeof(float)); + __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], vn1fp8, sizeof(float)); + __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], vn1fp8, sizeof(float)); + __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], vn1fp8, sizeof(float)); + __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], vn1fp8, sizeof(float)); coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); } - auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -422,12 +422,12 @@ static void gridsample_2d_bicubic_align1_border_blob_pack8(const Mat& src, Mat& static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, 
const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); - const auto vElempackf = _mm256_set1_ps(src.elempack); + const __m256 vElempackf = _mm256_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -436,37 +436,37 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - const auto two = _mm256_set1_ps(2.f); - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 two = _mm256_set1_ps(2.f); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); - auto gx_floor = _mm256_floor_ps(gx); - auto gy_floor = _mm256_floor_ps(gy); + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); - const auto tx = _mm256_sub_ps(gx, gx_floor); - const auto ty = _mm256_sub_ps(gy, gy_floor); + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); __m256 coefficients[4]; - auto gx0 = _mm256_add_ps(gx_floor, vn1fp8); - auto gx1 = gx_floor; - auto gx2 = _mm256_add_ps(gx_floor, v1fp8); - auto gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - const auto v0p5fp8 = _mm256_set1_ps(0.5f); + __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + const __m256 v0p5fp8 = _mm256_set1_ps(0.5f); { // x0 - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); gx0 = _mm256_add_ps(gx0, v0p5fp8); gx0 = _mm256_and_ps(gx0, *(__m256*)_ps256_inv_sign_mask); - auto reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, vImgWf), *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, vImgWf), *(__m256*)_ps256_inv_sign_mask); gx0 = _mm256_sub_ps(vImgWf, reflectx0_v); gx0 = _mm256_sub_ps(gx0, v0p5fp8); @@ -480,7 +480,7 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M gx1 = _mm256_and_ps(gx1, *(__m256*)_ps256_inv_sign_mask); - auto reflectx1_v = _mm256_and_ps(_mm256_sub_ps(gx1, vImgWf), *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx1_v = _mm256_and_ps(_mm256_sub_ps(gx1, vImgWf), *(__m256*)_ps256_inv_sign_mask); gx1 = _mm256_sub_ps(vImgWf, reflectx1_v); gx1 = _mm256_sub_ps(gx1, v0p5fp8); @@ -494,7 +494,7 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M gx2 = _mm256_and_ps(gx2, *(__m256*)_ps256_inv_sign_mask); - auto reflectx2_v = _mm256_and_ps(_mm256_sub_ps(gx2, vImgWf), *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx2_v = _mm256_and_ps(_mm256_sub_ps(gx2, vImgWf), 
*(__m256*)_ps256_inv_sign_mask); gx2 = _mm256_sub_ps(vImgWf, reflectx2_v); gx2 = _mm256_sub_ps(gx2, v0p5fp8); @@ -508,7 +508,7 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M gx3 = _mm256_and_ps(gx3, *(__m256*)_ps256_inv_sign_mask); - auto reflectx3_v = _mm256_and_ps(_mm256_sub_ps(gx3, vImgWf), *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx3_v = _mm256_and_ps(_mm256_sub_ps(gx3, vImgWf), *(__m256*)_ps256_inv_sign_mask); gx3 = _mm256_sub_ps(vImgWf, reflectx3_v); gx3 = _mm256_sub_ps(gx3, v0p5fp8); @@ -518,10 +518,10 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); } - auto x0 = _mm256_cvtps_epi32(gx0); - auto x1 = _mm256_cvtps_epi32(gx1); - auto x2 = _mm256_cvtps_epi32(gx2); - auto x3 = _mm256_cvtps_epi32(gx3); + __m256i x0 = _mm256_cvtps_epi32(gx0); + __m256i x1 = _mm256_cvtps_epi32(gx1); + __m256i x2 = _mm256_cvtps_epi32(gx2); + __m256i x3 = _mm256_cvtps_epi32(gx3); __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; for (int i = 0; i < 4; i++) @@ -530,13 +530,13 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M { //y - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); gy = _mm256_add_ps(gy, v0p5fp8); gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); gy = _mm256_sub_ps(vImgHf, reflecty_v); gy = _mm256_sub_ps(gy, v0p5fp8); @@ -546,15 +546,15 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); } - auto y = _mm256_cvtps_epi32(gy); + __m256i y = _mm256_cvtps_epi32(gy); - auto v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); @@ -567,15 +567,15 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M { for (int i = 0; i < 4; i++) { - auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], vn1fp8, sizeof(float)); - auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], vn1fp8, sizeof(float)); - auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), 
src.channel(q), v2_offset[i], vn1fp8, sizeof(float)); - auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], vn1fp8, sizeof(float)); + __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], vn1fp8, sizeof(float)); + __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], vn1fp8, sizeof(float)); + __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], vn1fp8, sizeof(float)); + __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], vn1fp8, sizeof(float)); coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); } - auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -587,12 +587,12 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack8(const Mat& src, M { float* outptr = static_cast(dst.data); - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); - const auto vElempackf = _mm256_set1_ps(src.elempack); + const __m256 vElempackf = _mm256_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -601,60 +601,60 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack8(const Mat& src, M { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - const auto two = _mm256_set1_ps(2.f); - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 two = _mm256_set1_ps(2.f); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); - auto gx_floor = _mm256_floor_ps(gx); - auto gy_floor = _mm256_floor_ps(gy); + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); - const auto tx = _mm256_sub_ps(gx, gx_floor); - const auto ty = _mm256_sub_ps(gy, gy_floor); + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); __m256 coefficients[4]; - auto gx0 = _mm256_add_ps(gx_floor, vn1fp8); - auto gx1 = gx_floor; - auto gx2 = _mm256_add_ps(gx_floor, v1fp8); - auto gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - const auto v0p5fp8 = _mm256_set1_ps(0.5f); + __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + const __m256 v0p5fp8 = _mm256_set1_ps(0.5f); { // x0 - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, 
v1fp8); gx0 = _mm256_and_ps(gx0, *(__m256*)_ps256_inv_sign_mask); - auto reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, border_x), *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, border_x), *(__m256*)_ps256_inv_sign_mask); gx0 = _mm256_sub_ps(border_x, reflectx0_v); // x1 gx1 = _mm256_and_ps(gx1, *(__m256*)_ps256_inv_sign_mask); - auto reflectx1_v = _mm256_and_ps(_mm256_sub_ps(gx1, border_x), *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx1_v = _mm256_and_ps(_mm256_sub_ps(gx1, border_x), *(__m256*)_ps256_inv_sign_mask); gx1 = _mm256_sub_ps(border_x, reflectx1_v); // x2 gx2 = _mm256_and_ps(gx2, *(__m256*)_ps256_inv_sign_mask); - auto reflectx2_v = _mm256_and_ps(_mm256_sub_ps(gx2, border_x), *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx2_v = _mm256_and_ps(_mm256_sub_ps(gx2, border_x), *(__m256*)_ps256_inv_sign_mask); gx2 = _mm256_sub_ps(border_x, reflectx2_v); // x3 gx3 = _mm256_and_ps(gx3, *(__m256*)_ps256_inv_sign_mask); - auto reflectx3_v = _mm256_and_ps(_mm256_sub_ps(gx3, border_x), *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx3_v = _mm256_and_ps(_mm256_sub_ps(gx3, border_x), *(__m256*)_ps256_inv_sign_mask); gx3 = _mm256_sub_ps(border_x, reflectx3_v); } - auto x0 = _mm256_cvtps_epi32(gx0); - auto x1 = _mm256_cvtps_epi32(gx1); - auto x2 = _mm256_cvtps_epi32(gx2); - auto x3 = _mm256_cvtps_epi32(gx3); + __m256i x0 = _mm256_cvtps_epi32(gx0); + __m256i x1 = _mm256_cvtps_epi32(gx1); + __m256i x2 = _mm256_cvtps_epi32(gx2); + __m256i x3 = _mm256_cvtps_epi32(gx3); __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; for (int i = 0; i < 4; i++) @@ -663,23 +663,23 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack8(const Mat& src, M { //y - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); gy = _mm256_sub_ps(border_y, reflecty_v); } - auto y = _mm256_cvtps_epi32(gy); + __m256i y = _mm256_cvtps_epi32(gy); - auto v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - auto v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); @@ -692,15 +692,15 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack8(const Mat& src, M { for (int i = 0; i < 4; i++) { - auto x0_val = 
_mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], vn1fp8, sizeof(float)); - auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], vn1fp8, sizeof(float)); - auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], vn1fp8, sizeof(float)); - auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], vn1fp8, sizeof(float)); + __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], vn1fp8, sizeof(float)); + __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], vn1fp8, sizeof(float)); + __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], vn1fp8, sizeof(float)); + __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], vn1fp8, sizeof(float)); coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); } - auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } diff --git a/src/layer/x86/gridsample_bilinear_pack16.h b/src/layer/x86/gridsample_bilinear_pack16.h new file mode 100644 index 00000000000..d15ef5df3b5 --- /dev/null +++ b/src/layer/x86/gridsample_bilinear_pack16.h @@ -0,0 +1,1431 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
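// The pack16 kernels below vectorize the same bilinear rule as the pack4/pack8 versions, with the
// in-range tests folded into AVX-512 __mmask16 values (from _mm512_cmpgt_epi32_mask) that feed
// _mm512_mask_i32gather_ps, so masked-off taps read back as zero. A minimal scalar sketch of that
// rule for a single-lane, zeros-padded blob; sample_or_zero and bilinear_sample_zeros are
// illustrative helpers only, not part of this file, and floorf is assumed to come from <math.h>
// in the including translation unit:
static inline float sample_or_zero(const float* data, int W, int H, int x, int y)
{
    // zeros padding: an out-of-range tap contributes 0, like a masked-off gather lane
    return (x >= 0 && x < W && y >= 0 && y < H) ? data[y * W + x] : 0.f;
}

static inline float bilinear_sample_zeros(const float* data, int W, int H, float gx, float gy)
{
    // gx/gy are already unnormalized pixel coordinates
    int x0 = (int)floorf(gx);
    int y0 = (int)floorf(gy);
    float w = gx - (float)x0; // east fraction
    float n = gy - (float)y0; // south fraction

    float nw = sample_or_zero(data, W, H, x0, y0);
    float ne = sample_or_zero(data, W, H, x0 + 1, y0);
    float sw = sample_or_zero(data, W, H, x0, y0 + 1);
    float se = sample_or_zero(data, W, H, x0 + 1, y0 + 1);

    return nw * (1.f - w) * (1.f - n) + ne * w * (1.f - n) + sw * (1.f - w) * n + se * w * n;
}
// The vector kernels compute all 16 elempack lanes of one output pixel with these four weighted
// taps in a single pass, then store them with _mm512_storeu_ps.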
+ +static void gridsample_2d_bilinear_align0_zeros_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + // x + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + + // y + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + } + + __m512 x_w = _mm512_floor_ps(gx); + __m512 y_n = _mm512_floor_ps(gy); + + __m512 w = _mm512_sub_ps(gx, x_w); + __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 n = _mm512_sub_ps(gy, y_n); + __m512 s = _mm512_sub_ps(v1fp16, n); + + __m512 nw = _mm512_mul_ps(s, e); + __m512 ne = _mm512_mul_ps(s, w); + __m512 sw = _mm512_mul_ps(n, e); + __m512 se = _mm512_mul_ps(n, w); + + __m512i x0 = _mm512_cvtps_epi32(x_w); + __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i y0 = _mm512_cvtps_epi32(y_n); + __m512i y1 = _mm512_add_epi32(y0, v1ip16); + + __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x0); + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y0_in_range = _mm512_cmpgt_epi32_mask(y0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y0); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + + __mmask16 v00_in_range = x0_in_range & y0_in_range; + __mmask16 v01_in_range = x0_in_range & y1_in_range; + __mmask16 v10_in_range = x1_in_range & y0_in_range; + __mmask16 v11_in_range = x1_in_range & y1_in_range; + + // (W*y + x) * elempack + vec(8) + __m512i i_nw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i i_ne_offset = _mm512_add_epi32(i_nw_offset, vElempacki); + __m512i i_sw_offset = _mm512_add_epi32(i_nw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); + __m512i i_se_offset = _mm512_add_epi32(i_sw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + __m512 nw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v00_in_range, i_nw_offset, src.channel(q), sizeof(float)); + __m512 ne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v10_in_range, i_ne_offset, src.channel(q), sizeof(float)); + __m512 sw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v01_in_range, i_sw_offset, src.channel(q), sizeof(float)); + __m512 se_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v11_in_range, i_se_offset, src.channel(q), sizeof(float)); + + __m512 _v = _mm512_mul_ps(nw_val, nw); + _v = _mm512_fmadd_ps(ne_val, ne, _v); + _v = _mm512_fmadd_ps(sw_val, sw, _v); + _v = _mm512_fmadd_ps(se_val, se, _v); + + _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_bilinear_align1_zeros_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const 
__m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + // x + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + + // y + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + } + + __m512 x_w = _mm512_floor_ps(gx); + __m512 y_n = _mm512_floor_ps(gy); + + __m512 w = _mm512_sub_ps(gx, x_w); + __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 n = _mm512_sub_ps(gy, y_n); + __m512 s = _mm512_sub_ps(v1fp16, n); + + __m512 nw = _mm512_mul_ps(s, e); + __m512 ne = _mm512_mul_ps(s, w); + __m512 sw = _mm512_mul_ps(n, e); + __m512 se = _mm512_mul_ps(n, w); + + __m512i x0 = _mm512_cvtps_epi32(x_w); + __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i y0 = _mm512_cvtps_epi32(y_n); + __m512i y1 = _mm512_add_epi32(y0, v1ip16); + + __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x0); + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y0_in_range = _mm512_cmpgt_epi32_mask(y0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y0); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + + __mmask16 v00_in_range = x0_in_range & y0_in_range; + __mmask16 v01_in_range = x0_in_range & y1_in_range; + __mmask16 v10_in_range = x1_in_range & y0_in_range; + __mmask16 v11_in_range = x1_in_range & y1_in_range; + + // (W*y + x) * elempack + vec(8) + __m512i i_nw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i i_ne_offset = _mm512_add_epi32(i_nw_offset, vElempacki); + __m512i i_sw_offset = _mm512_add_epi32(i_nw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); + __m512i i_se_offset = _mm512_add_epi32(i_sw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + __m512 nw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v00_in_range, i_nw_offset, src.channel(q), sizeof(float)); + __m512 ne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v10_in_range, i_ne_offset, src.channel(q), sizeof(float)); + __m512 sw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v01_in_range, i_sw_offset, src.channel(q), sizeof(float)); + __m512 se_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v11_in_range, i_se_offset, src.channel(q), sizeof(float)); + + __m512 _v = _mm512_mul_ps(nw_val, nw); + _v = _mm512_fmadd_ps(ne_val, ne, _v); + _v = _mm512_fmadd_ps(sw_val, sw, _v); + _v = _mm512_fmadd_ps(se_val, se, _v); + + _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_bilinear_align0_border_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512i vImgWi = 
_mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + // x + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + + gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); + + // y + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + + gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); + } + + __m512 x_w = _mm512_floor_ps(gx); + __m512 y_n = _mm512_floor_ps(gy); + + __m512 w = _mm512_sub_ps(gx, x_w); + __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 n = _mm512_sub_ps(gy, y_n); + __m512 s = _mm512_sub_ps(v1fp16, n); + + __m512 nw = _mm512_mul_ps(s, e); + __m512 ne = _mm512_mul_ps(s, w); + __m512 sw = _mm512_mul_ps(n, e); + __m512 se = _mm512_mul_ps(n, w); + + __m512i x0 = _mm512_cvtps_epi32(x_w); + __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i y0 = _mm512_cvtps_epi32(y_n); + __m512i y1 = _mm512_add_epi32(y0, v1ip16); + + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + + __mmask16 v11_in_range = x1_in_range & y1_in_range; + + // (W*y + x) * elempack + vec(8) + __m512i i_nw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i i_ne_offset = _mm512_add_epi32(i_nw_offset, vElempacki); + __m512i i_sw_offset = _mm512_add_epi32(i_nw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); + __m512i i_se_offset = _mm512_add_epi32(i_sw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + __m512 nw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 0b1111111111111111, i_nw_offset, src.channel(q), sizeof(float)); + __m512 ne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), x1_in_range, i_ne_offset, src.channel(q), sizeof(float)); + __m512 sw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), y1_in_range, i_sw_offset, src.channel(q), sizeof(float)); + __m512 se_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v11_in_range, i_se_offset, src.channel(q), sizeof(float)); + + __m512 _v = _mm512_mul_ps(nw_val, nw); + _v = _mm512_fmadd_ps(ne_val, ne, _v); + _v = _mm512_fmadd_ps(sw_val, sw, _v); + _v = _mm512_fmadd_ps(se_val, se, _v); + + _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_bilinear_align1_border_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + 
{ + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + // x + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + + gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); + + // y + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + + gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); + } + + __m512 x_w = _mm512_floor_ps(gx); + __m512 y_n = _mm512_floor_ps(gy); + + __m512 w = _mm512_sub_ps(gx, x_w); + __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 n = _mm512_sub_ps(gy, y_n); + __m512 s = _mm512_sub_ps(v1fp16, n); + + __m512 nw = _mm512_mul_ps(s, e); + __m512 ne = _mm512_mul_ps(s, w); + __m512 sw = _mm512_mul_ps(n, e); + __m512 se = _mm512_mul_ps(n, w); + + __m512i x0 = _mm512_cvtps_epi32(x_w); + __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i y0 = _mm512_cvtps_epi32(y_n); + __m512i y1 = _mm512_add_epi32(y0, v1ip16); + + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + + __mmask16 v11_in_range = x1_in_range & y1_in_range; + + // (W*y + x) * elempack + vec(8) + __m512i i_nw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i i_ne_offset = _mm512_add_epi32(i_nw_offset, vElempacki); + __m512i i_sw_offset = _mm512_add_epi32(i_nw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); + __m512i i_se_offset = _mm512_add_epi32(i_sw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + __m512 nw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 0b1111111111111111, i_nw_offset, src.channel(q), sizeof(float)); + __m512 ne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), x1_in_range, i_ne_offset, src.channel(q), sizeof(float)); + __m512 sw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), y1_in_range, i_sw_offset, src.channel(q), sizeof(float)); + __m512 se_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v11_in_range, i_se_offset, src.channel(q), sizeof(float)); + + __m512 _v = _mm512_mul_ps(nw_val, nw); + _v = _mm512_fmadd_ps(ne_val, ne, _v); + _v = _mm512_fmadd_ps(sw_val, sw, _v); + _v = _mm512_fmadd_ps(se_val, se, _v); + + _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_bilinear_align0_reflection_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + 
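// note: the grid blob is packed, so this sample's y coordinate lives one elempack stride after its x, hence gridptr[grid.elempack] below +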
__m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + // x + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + + __m512 v0p5fp16 = _mm512_set1_ps(0.5f); + gx = _mm512_add_ps(gx, v0p5fp16); + + gx = _mm512_and_ps(gx, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflectx_v = _mm512_and_ps(_mm512_sub_ps(gx, vImgWf), *(__m512*)_ps512_inv_sign_mask); + gx = _mm512_sub_ps(vImgWf, reflectx_v); + + gx = _mm512_sub_ps(gx, v0p5fp16); + + _mm512_sub_ps(gx, v0p5fp16); + + gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); + + // y + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + + gy = _mm512_add_ps(gy, v0p5fp16); + + gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflecty_v = _mm512_and_ps(_mm512_sub_ps(gy, vImgHf), *(__m512*)_ps512_inv_sign_mask); + gy = _mm512_sub_ps(vImgHf, reflecty_v); + + gy = _mm512_sub_ps(gy, v0p5fp16); + + _mm512_sub_ps(gy, v0p5fp16); + + gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); + } + + __m512 x_w = _mm512_floor_ps(gx); + __m512 y_n = _mm512_floor_ps(gy); + + __m512 w = _mm512_sub_ps(gx, x_w); + __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 n = _mm512_sub_ps(gy, y_n); + __m512 s = _mm512_sub_ps(v1fp16, n); + + __m512 nw = _mm512_mul_ps(s, e); + __m512 ne = _mm512_mul_ps(s, w); + __m512 sw = _mm512_mul_ps(n, e); + __m512 se = _mm512_mul_ps(n, w); + + __m512i x0 = _mm512_cvtps_epi32(x_w); + __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i y0 = _mm512_cvtps_epi32(y_n); + __m512i y1 = _mm512_add_epi32(y0, v1ip16); + + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + + __mmask16 v11_in_range = x1_in_range & y1_in_range; + + // (W*y + x) * elempack + vec(8) + __m512i i_nw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i i_ne_offset = _mm512_add_epi32(i_nw_offset, vElempacki); + __m512i i_sw_offset = _mm512_add_epi32(i_nw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); + __m512i i_se_offset = _mm512_add_epi32(i_sw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + __m512 nw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 0b1111111111111111, i_nw_offset, src.channel(q), sizeof(float)); + __m512 ne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), x1_in_range, i_ne_offset, src.channel(q), sizeof(float)); + __m512 sw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), y1_in_range, i_sw_offset, src.channel(q), sizeof(float)); + __m512 se_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v11_in_range, i_se_offset, src.channel(q), sizeof(float)); + + __m512 _v = _mm512_mul_ps(nw_val, nw); + _v = _mm512_fmadd_ps(ne_val, ne, _v); + _v = _mm512_fmadd_ps(sw_val, sw, _v); + _v = _mm512_fmadd_ps(se_val, se, _v); + + _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_bilinear_align1_reflection_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512i vImgWi = 
_mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + // x + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + + gx = _mm512_and_ps(gx, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflectx_v = _mm512_and_ps(_mm512_sub_ps(gx, border_x), *(__m512*)_ps512_inv_sign_mask); + gx = _mm512_sub_ps(border_x, reflectx_v); + + // y + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + + gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflecty_v = _mm512_and_ps(_mm512_sub_ps(gy, border_y), *(__m512*)_ps512_inv_sign_mask); + gy = _mm512_sub_ps(border_y, reflecty_v); + } + + __m512 x_w = _mm512_floor_ps(gx); + __m512 y_n = _mm512_floor_ps(gy); + + __m512 w = _mm512_sub_ps(gx, x_w); + __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 n = _mm512_sub_ps(gy, y_n); + __m512 s = _mm512_sub_ps(v1fp16, n); + + __m512 nw = _mm512_mul_ps(s, e); + __m512 ne = _mm512_mul_ps(s, w); + __m512 sw = _mm512_mul_ps(n, e); + __m512 se = _mm512_mul_ps(n, w); + + __m512i x0 = _mm512_cvtps_epi32(x_w); + __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i y0 = _mm512_cvtps_epi32(y_n); + __m512i y1 = _mm512_add_epi32(y0, v1ip16); + + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + + __mmask16 v11_in_range = x1_in_range & y1_in_range; + + // (W*y + x) * elempack + vec(8) + __m512i i_nw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i i_ne_offset = _mm512_add_epi32(i_nw_offset, vElempacki); + __m512i i_sw_offset = _mm512_add_epi32(i_nw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); + __m512i i_se_offset = _mm512_add_epi32(i_sw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + __m512 nw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 0b1111111111111111, i_nw_offset, src.channel(q), sizeof(float)); + __m512 ne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), x1_in_range, i_ne_offset, src.channel(q), sizeof(float)); + __m512 sw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), y1_in_range, i_sw_offset, src.channel(q), sizeof(float)); + __m512 se_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v11_in_range, i_se_offset, src.channel(q), sizeof(float)); + + __m512 _v = _mm512_mul_ps(nw_val, nw); + _v = _mm512_fmadd_ps(ne_val, ne, _v); + _v = _mm512_fmadd_ps(sw_val, sw, _v); + _v = _mm512_fmadd_ps(se_val, se, _v); + + _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_3d_bilinear_align0_zeros_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf 
= _mm512_set1_ps(src.h); + const __m512 vImgDf = _mm512_set1_ps(src.d); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + const __m512i vImgDi = _mm512_set1_epi32(src.d); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + // x + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + + // y + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + + // z + gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, v1fp16), vImgDf, v1fp16), two); + } + + __m512 x_w = _mm512_floor_ps(gx); + __m512 y_n = _mm512_floor_ps(gy); + __m512 z_t = _mm512_floor_ps(gz); + + __m512 w = _mm512_sub_ps(gx, x_w); + __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 n = _mm512_sub_ps(gy, y_n); + __m512 s = _mm512_sub_ps(v1fp16, n); + __m512 t = _mm512_sub_ps(gz, z_t); + __m512 b = _mm512_sub_ps(v1fp16, t); + + __m512 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + __m512 nw = _mm512_mul_ps(s, e); + __m512 ne = _mm512_mul_ps(s, w); + __m512 sw = _mm512_mul_ps(n, e); + __m512 se = _mm512_mul_ps(n, w); + + tnw = _mm512_mul_ps(b, nw); + tne = _mm512_mul_ps(b, ne); + tsw = _mm512_mul_ps(b, sw); + tse = _mm512_mul_ps(b, se); + + bnw = _mm512_mul_ps(t, nw); + bne = _mm512_mul_ps(t, ne); + bsw = _mm512_mul_ps(t, sw); + bse = _mm512_mul_ps(t, se); + } + + __m512i x0 = _mm512_cvtps_epi32(x_w); + __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i y0 = _mm512_cvtps_epi32(y_n); + __m512i y1 = _mm512_add_epi32(y0, v1ip16); + __m512i z0 = _mm512_cvtps_epi32(z_t); + __m512i z1 = _mm512_add_epi32(z0, v1ip16); + + __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x0); + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y0_in_range = _mm512_cmpgt_epi32_mask(y0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y0); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + __mmask16 z0_in_range = _mm512_cmpgt_epi32_mask(z0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, z0); + __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, z1); + + __mmask16 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __mmask16 v00_in_range = x0_in_range & y0_in_range; + __mmask16 v01_in_range = x0_in_range & y1_in_range; + __mmask16 v10_in_range = x1_in_range & y0_in_range; + __mmask16 v11_in_range = x1_in_range & y1_in_range; + + v000_in_range = v00_in_range & z0_in_range; + v010_in_range = v01_in_range & z0_in_range; + v100_in_range = v10_in_range & z0_in_range; + v110_in_range = v11_in_range & z0_in_range; + + v001_in_range = v00_in_range & z1_in_range; + v011_in_range = v01_in_range & z1_in_range; + v101_in_range = v10_in_range & z1_in_range; + v111_in_range = v11_in_range & z1_in_range; + } + + // (W*H*z + W*y + x) * elempack + vec(8) + __m512i i_tnw_offset = 
_mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), z0), _mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i i_tne_offset = _mm512_add_epi32(i_tnw_offset, vElempacki); + __m512i i_tsw_offset = _mm512_add_epi32(i_tnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); + __m512i i_tse_offset = _mm512_add_epi32(i_tsw_offset, vElempacki); + + __m512i i_bnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m512i i_bne_offset = _mm512_add_epi32(i_bnw_offset, vElempacki); + __m512i i_bsw_offset = _mm512_add_epi32(i_bnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); + __m512i i_bse_offset = _mm512_add_epi32(i_bsw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + __m512 tnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v000_in_range, i_tnw_offset, src.channel(q), sizeof(float)); + __m512 tne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v100_in_range, i_tne_offset, src.channel(q), sizeof(float)); + __m512 tsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v010_in_range, i_tsw_offset, src.channel(q), sizeof(float)); + __m512 tse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v110_in_range, i_tse_offset, src.channel(q), sizeof(float)); + + __m512 bnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v001_in_range, i_bnw_offset, src.channel(q), sizeof(float)); + __m512 bne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v101_in_range, i_bne_offset, src.channel(q), sizeof(float)); + __m512 bsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v011_in_range, i_bsw_offset, src.channel(q), sizeof(float)); + __m512 bse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v111_in_range, i_bse_offset, src.channel(q), sizeof(float)); + + __m512 _v = _mm512_mul_ps(tnw_val, tnw); + _v = _mm512_fmadd_ps(tne_val, tne, _v); + _v = _mm512_fmadd_ps(tsw_val, tsw, _v); + _v = _mm512_fmadd_ps(tse_val, tse, _v); + + _v = _mm512_fmadd_ps(bnw_val, bnw, _v); + _v = _mm512_fmadd_ps(bne_val, bne, _v); + _v = _mm512_fmadd_ps(bsw_val, bsw, _v); + _v = _mm512_fmadd_ps(bse_val, bse, _v); + + _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_bilinear_align1_zeros_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512 vImgDf = _mm512_set1_ps(src.d); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + const __m512i vImgDi = _mm512_set1_epi32(src.d); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + // x + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + + // y + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, 
v1fp16)); + + // z + gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, v1fp16), two), _mm512_sub_ps(vImgDf, v1fp16)); + } + + __m512 x_w = _mm512_floor_ps(gx); + __m512 y_n = _mm512_floor_ps(gy); + __m512 z_t = _mm512_floor_ps(gz); + + __m512 w = _mm512_sub_ps(gx, x_w); + __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 n = _mm512_sub_ps(gy, y_n); + __m512 s = _mm512_sub_ps(v1fp16, n); + __m512 t = _mm512_sub_ps(gz, z_t); + __m512 b = _mm512_sub_ps(v1fp16, t); + + __m512 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + __m512 nw = _mm512_mul_ps(s, e); + __m512 ne = _mm512_mul_ps(s, w); + __m512 sw = _mm512_mul_ps(n, e); + __m512 se = _mm512_mul_ps(n, w); + + tnw = _mm512_mul_ps(b, nw); + tne = _mm512_mul_ps(b, ne); + tsw = _mm512_mul_ps(b, sw); + tse = _mm512_mul_ps(b, se); + + bnw = _mm512_mul_ps(t, nw); + bne = _mm512_mul_ps(t, ne); + bsw = _mm512_mul_ps(t, sw); + bse = _mm512_mul_ps(t, se); + } + + __m512i x0 = _mm512_cvtps_epi32(x_w); + __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i y0 = _mm512_cvtps_epi32(y_n); + __m512i y1 = _mm512_add_epi32(y0, v1ip16); + __m512i z0 = _mm512_cvtps_epi32(z_t); + __m512i z1 = _mm512_add_epi32(z0, v1ip16); + + __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x0); + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y0_in_range = _mm512_cmpgt_epi32_mask(y0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y0); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + __mmask16 z0_in_range = _mm512_cmpgt_epi32_mask(z0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, z0); + __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, z1); + + __mmask16 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __mmask16 v00_in_range = x0_in_range & y0_in_range; + __mmask16 v01_in_range = x0_in_range & y1_in_range; + __mmask16 v10_in_range = x1_in_range & y0_in_range; + __mmask16 v11_in_range = x1_in_range & y1_in_range; + + v000_in_range = v00_in_range & z0_in_range; + v010_in_range = v01_in_range & z0_in_range; + v100_in_range = v10_in_range & z0_in_range; + v110_in_range = v11_in_range & z0_in_range; + + v001_in_range = v00_in_range & z1_in_range; + v011_in_range = v01_in_range & z1_in_range; + v101_in_range = v10_in_range & z1_in_range; + v111_in_range = v11_in_range & z1_in_range; + } + + // (W*H*z + W*y + x) * elempack + vec(8) + __m512i i_tnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), z0), _mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i i_tne_offset = _mm512_add_epi32(i_tnw_offset, vElempacki); + __m512i i_tsw_offset = _mm512_add_epi32(i_tnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); + __m512i i_tse_offset = _mm512_add_epi32(i_tsw_offset, vElempacki); + + __m512i i_bnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m512i i_bne_offset = _mm512_add_epi32(i_bnw_offset, vElempacki); + __m512i i_bsw_offset = _mm512_add_epi32(i_bnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); + __m512i i_bse_offset = _mm512_add_epi32(i_bsw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + __m512 tnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v000_in_range, 
i_tnw_offset, src.channel(q), sizeof(float)); + __m512 tne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v100_in_range, i_tne_offset, src.channel(q), sizeof(float)); + __m512 tsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v010_in_range, i_tsw_offset, src.channel(q), sizeof(float)); + __m512 tse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v110_in_range, i_tse_offset, src.channel(q), sizeof(float)); + + __m512 bnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v001_in_range, i_bnw_offset, src.channel(q), sizeof(float)); + __m512 bne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v101_in_range, i_bne_offset, src.channel(q), sizeof(float)); + __m512 bsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v011_in_range, i_bsw_offset, src.channel(q), sizeof(float)); + __m512 bse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v111_in_range, i_bse_offset, src.channel(q), sizeof(float)); + + __m512 _v = _mm512_mul_ps(tnw_val, tnw); + _v = _mm512_fmadd_ps(tne_val, tne, _v); + _v = _mm512_fmadd_ps(tsw_val, tsw, _v); + _v = _mm512_fmadd_ps(tse_val, tse, _v); + + _v = _mm512_fmadd_ps(bnw_val, bnw, _v); + _v = _mm512_fmadd_ps(bne_val, bne, _v); + _v = _mm512_fmadd_ps(bsw_val, bsw, _v); + _v = _mm512_fmadd_ps(bse_val, bse, _v); + + _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_bilinear_align0_border_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512 vImgDf = _mm512_set1_ps(src.d); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + const __m512i vImgDi = _mm512_set1_epi32(src.d); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + // x + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + + gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); + + // y + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + + gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); + + // z + gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, v1fp16), vImgDf, v1fp16), two); + + const __m512 border_z = _mm512_sub_ps(vImgDf, v1fp16); + + gz = _mm512_min_ps(border_z, _mm512_max_ps(gz, _mm512_setzero_ps())); + } + + __m512 x_w = _mm512_floor_ps(gx); + __m512 y_n = _mm512_floor_ps(gy); + __m512 z_t = _mm512_floor_ps(gz); + + __m512 w = _mm512_sub_ps(gx, x_w); + __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 n = _mm512_sub_ps(gy, y_n); + __m512 s = _mm512_sub_ps(v1fp16, n); + __m512 t = _mm512_sub_ps(gz, z_t); + __m512 b = _mm512_sub_ps(v1fp16, t); + + __m512 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + __m512 nw = _mm512_mul_ps(s, e); + __m512 ne = _mm512_mul_ps(s, w); + __m512 sw = 
_mm512_mul_ps(n, e); + __m512 se = _mm512_mul_ps(n, w); + + tnw = _mm512_mul_ps(b, nw); + tne = _mm512_mul_ps(b, ne); + tsw = _mm512_mul_ps(b, sw); + tse = _mm512_mul_ps(b, se); + + bnw = _mm512_mul_ps(t, nw); + bne = _mm512_mul_ps(t, ne); + bsw = _mm512_mul_ps(t, sw); + bse = _mm512_mul_ps(t, se); + } + + __m512i x0 = _mm512_cvtps_epi32(x_w); + __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i y0 = _mm512_cvtps_epi32(y_n); + __m512i y1 = _mm512_add_epi32(y0, v1ip16); + __m512i z0 = _mm512_cvtps_epi32(z_t); + __m512i z1 = _mm512_add_epi32(z0, v1ip16); + + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, z1); + + __mmask16 v110_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __mmask16 v11_in_range = x1_in_range & y1_in_range; + + v110_in_range = x1_in_range & y1_in_range; + + v011_in_range = y1_in_range & z1_in_range; + v101_in_range = x1_in_range & z1_in_range; + v111_in_range = v11_in_range & z1_in_range; + } + + // (W*H*z + W*y + x) * elempack + vec(8) + __m512i i_tnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), z0), _mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i i_tne_offset = _mm512_add_epi32(i_tnw_offset, vElempacki); + __m512i i_tsw_offset = _mm512_add_epi32(i_tnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); + __m512i i_tse_offset = _mm512_add_epi32(i_tsw_offset, vElempacki); + + __m512i i_bnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m512i i_bne_offset = _mm512_add_epi32(i_bnw_offset, vElempacki); + __m512i i_bsw_offset = _mm512_add_epi32(i_bnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); + __m512i i_bse_offset = _mm512_add_epi32(i_bsw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + __m512 tnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 0b1111111111111111, i_tnw_offset, src.channel(q), sizeof(float)); + __m512 tne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), x1_in_range, i_tne_offset, src.channel(q), sizeof(float)); + __m512 tsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), y1_in_range, i_tsw_offset, src.channel(q), sizeof(float)); + __m512 tse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v110_in_range, i_tse_offset, src.channel(q), sizeof(float)); + + __m512 bnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), z1_in_range, i_bnw_offset, src.channel(q), sizeof(float)); + __m512 bne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v101_in_range, i_bne_offset, src.channel(q), sizeof(float)); + __m512 bsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v011_in_range, i_bsw_offset, src.channel(q), sizeof(float)); + __m512 bse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v111_in_range, i_bse_offset, src.channel(q), sizeof(float)); + + __m512 _v = _mm512_mul_ps(tnw_val, tnw); + _v = _mm512_fmadd_ps(tne_val, tne, _v); + _v = _mm512_fmadd_ps(tsw_val, tsw, _v); + _v = _mm512_fmadd_ps(tse_val, tse, _v); + + _v = _mm512_fmadd_ps(bnw_val, bnw, _v); + _v = _mm512_fmadd_ps(bne_val, bne, _v); + _v = _mm512_fmadd_ps(bsw_val, bsw, _v); + _v = _mm512_fmadd_ps(bse_val, bse, _v); + + _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + 
x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_bilinear_align1_border_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512 vImgDf = _mm512_set1_ps(src.d); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + const __m512i vImgDi = _mm512_set1_epi32(src.d); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + // x + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + + gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); + + // y + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + + gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); + + // z + gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, v1fp16), two), _mm512_sub_ps(vImgDf, v1fp16)); + + const __m512 border_z = _mm512_sub_ps(vImgDf, v1fp16); + + gz = _mm512_min_ps(border_z, _mm512_max_ps(gz, _mm512_setzero_ps())); + } + + __m512 x_w = _mm512_floor_ps(gx); + __m512 y_n = _mm512_floor_ps(gy); + __m512 z_t = _mm512_floor_ps(gz); + + __m512 w = _mm512_sub_ps(gx, x_w); + __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 n = _mm512_sub_ps(gy, y_n); + __m512 s = _mm512_sub_ps(v1fp16, n); + __m512 t = _mm512_sub_ps(gz, z_t); + __m512 b = _mm512_sub_ps(v1fp16, t); + + __m512 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + __m512 nw = _mm512_mul_ps(s, e); + __m512 ne = _mm512_mul_ps(s, w); + __m512 sw = _mm512_mul_ps(n, e); + __m512 se = _mm512_mul_ps(n, w); + + tnw = _mm512_mul_ps(b, nw); + tne = _mm512_mul_ps(b, ne); + tsw = _mm512_mul_ps(b, sw); + tse = _mm512_mul_ps(b, se); + + bnw = _mm512_mul_ps(t, nw); + bne = _mm512_mul_ps(t, ne); + bsw = _mm512_mul_ps(t, sw); + bse = _mm512_mul_ps(t, se); + } + + __m512i x0 = _mm512_cvtps_epi32(x_w); + __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i y0 = _mm512_cvtps_epi32(y_n); + __m512i y1 = _mm512_add_epi32(y0, v1ip16); + __m512i z0 = _mm512_cvtps_epi32(z_t); + __m512i z1 = _mm512_add_epi32(z0, v1ip16); + + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, z1); + + __mmask16 v110_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __mmask16 v11_in_range = x1_in_range & y1_in_range; + + v110_in_range = x1_in_range& y1_in_range; + + v011_in_range = y1_in_range& z1_in_range; + v101_in_range = x1_in_range& z1_in_range; + v111_in_range = v11_in_range & z1_in_range; + } + + // (W*H*z + W*y + x) * elempack + vec(8) + __m512i i_tnw_offset = 
_mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), z0), _mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i i_tne_offset = _mm512_add_epi32(i_tnw_offset, vElempacki); + __m512i i_tsw_offset = _mm512_add_epi32(i_tnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); + __m512i i_tse_offset = _mm512_add_epi32(i_tsw_offset, vElempacki); + + __m512i i_bnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m512i i_bne_offset = _mm512_add_epi32(i_bnw_offset, vElempacki); + __m512i i_bsw_offset = _mm512_add_epi32(i_bnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); + __m512i i_bse_offset = _mm512_add_epi32(i_bsw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + __m512 tnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 0b1111111111111111, i_tnw_offset, src.channel(q), sizeof(float)); + __m512 tne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), x1_in_range, i_tne_offset, src.channel(q), sizeof(float)); + __m512 tsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), y1_in_range, i_tsw_offset, src.channel(q), sizeof(float)); + __m512 tse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v110_in_range, i_tse_offset, src.channel(q), sizeof(float)); + + __m512 bnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), z1_in_range, i_bnw_offset, src.channel(q), sizeof(float)); + __m512 bne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v101_in_range, i_bne_offset, src.channel(q), sizeof(float)); + __m512 bsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v011_in_range, i_bsw_offset, src.channel(q), sizeof(float)); + __m512 bse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v111_in_range, i_bse_offset, src.channel(q), sizeof(float)); + + __m512 _v = _mm512_mul_ps(tnw_val, tnw); + _v = _mm512_fmadd_ps(tne_val, tne, _v); + _v = _mm512_fmadd_ps(tsw_val, tsw, _v); + _v = _mm512_fmadd_ps(tse_val, tse, _v); + + _v = _mm512_fmadd_ps(bnw_val, bnw, _v); + _v = _mm512_fmadd_ps(bne_val, bne, _v); + _v = _mm512_fmadd_ps(bsw_val, bsw, _v); + _v = _mm512_fmadd_ps(bse_val, bse, _v); + + _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_bilinear_align0_reflection_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512 vImgDf = _mm512_set1_ps(src.d); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + const __m512i vImgDi = _mm512_set1_epi32(src.d); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + // x + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + + __m512 v0p5fp16 = _mm512_set1_ps(0.5f); + gx = 
_mm512_add_ps(gx, v0p5fp16); + + gx = _mm512_and_ps(gx, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflectx_v = _mm512_and_ps(_mm512_sub_ps(gx, vImgWf), *(__m512*)_ps512_inv_sign_mask); + gx = _mm512_sub_ps(vImgWf, reflectx_v); + + gx = _mm512_sub_ps(gx, v0p5fp16); + + _mm512_sub_ps(gx, v0p5fp16); + + gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); + + // y + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + + gy = _mm512_add_ps(gy, v0p5fp16); + + gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflecty_v = _mm512_and_ps(_mm512_sub_ps(gy, vImgHf), *(__m512*)_ps512_inv_sign_mask); + gy = _mm512_sub_ps(vImgHf, reflecty_v); + + gy = _mm512_sub_ps(gy, v0p5fp16); + + _mm512_sub_ps(gy, v0p5fp16); + + gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); + + // z + gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, v1fp16), vImgDf, v1fp16), two); + const __m512 border_z = _mm512_sub_ps(vImgDf, v1fp16); + + gz = _mm512_add_ps(gz, v0p5fp16); + + gz = _mm512_and_ps(gz, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflectz_v = _mm512_and_ps(_mm512_sub_ps(gz, vImgDf), *(__m512*)_ps512_inv_sign_mask); + gz = _mm512_sub_ps(vImgDf, reflectz_v); + + gz = _mm512_sub_ps(gz, v0p5fp16); + + _mm512_sub_ps(gz, v0p5fp16); + + gz = _mm512_min_ps(border_z, _mm512_max_ps(gz, _mm512_setzero_ps())); + } + + __m512 x_w = _mm512_floor_ps(gx); + __m512 y_n = _mm512_floor_ps(gy); + __m512 z_t = _mm512_floor_ps(gz); + + __m512 w = _mm512_sub_ps(gx, x_w); + __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 n = _mm512_sub_ps(gy, y_n); + __m512 s = _mm512_sub_ps(v1fp16, n); + __m512 t = _mm512_sub_ps(gz, z_t); + __m512 b = _mm512_sub_ps(v1fp16, t); + + __m512 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + __m512 nw = _mm512_mul_ps(s, e); + __m512 ne = _mm512_mul_ps(s, w); + __m512 sw = _mm512_mul_ps(n, e); + __m512 se = _mm512_mul_ps(n, w); + + tnw = _mm512_mul_ps(b, nw); + tne = _mm512_mul_ps(b, ne); + tsw = _mm512_mul_ps(b, sw); + tse = _mm512_mul_ps(b, se); + + bnw = _mm512_mul_ps(t, nw); + bne = _mm512_mul_ps(t, ne); + bsw = _mm512_mul_ps(t, sw); + bse = _mm512_mul_ps(t, se); + } + + __m512i x0 = _mm512_cvtps_epi32(x_w); + __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i y0 = _mm512_cvtps_epi32(y_n); + __m512i y1 = _mm512_add_epi32(y0, v1ip16); + __m512i z0 = _mm512_cvtps_epi32(z_t); + __m512i z1 = _mm512_add_epi32(z0, v1ip16); + + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, z1); + + __mmask16 v110_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __mmask16 v11_in_range = x1_in_range & y1_in_range; + + v110_in_range = x1_in_range & y1_in_range; + + v011_in_range = y1_in_range & z1_in_range; + v101_in_range = x1_in_range & z1_in_range; + v111_in_range = v11_in_range & z1_in_range; + } + + // (W*H*z + W*y + x) * elempack + vec(8) + __m512i i_tnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), z0), _mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i i_tne_offset = _mm512_add_epi32(i_tnw_offset, vElempacki); + __m512i i_tsw_offset = 
_mm512_add_epi32(i_tnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); + __m512i i_tse_offset = _mm512_add_epi32(i_tsw_offset, vElempacki); + + __m512i i_bnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m512i i_bne_offset = _mm512_add_epi32(i_bnw_offset, vElempacki); + __m512i i_bsw_offset = _mm512_add_epi32(i_bnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); + __m512i i_bse_offset = _mm512_add_epi32(i_bsw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + __m512 tnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 0b1111111111111111, i_tnw_offset, src.channel(q), sizeof(float)); + __m512 tne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), x1_in_range, i_tne_offset, src.channel(q), sizeof(float)); + __m512 tsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), y1_in_range, i_tsw_offset, src.channel(q), sizeof(float)); + __m512 tse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v110_in_range, i_tse_offset, src.channel(q), sizeof(float)); + + __m512 bnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), z1_in_range, i_bnw_offset, src.channel(q), sizeof(float)); + __m512 bne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v101_in_range, i_bne_offset, src.channel(q), sizeof(float)); + __m512 bsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v011_in_range, i_bsw_offset, src.channel(q), sizeof(float)); + __m512 bse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v111_in_range, i_bse_offset, src.channel(q), sizeof(float)); + + __m512 _v = _mm512_mul_ps(tnw_val, tnw); + _v = _mm512_fmadd_ps(tne_val, tne, _v); + _v = _mm512_fmadd_ps(tsw_val, tsw, _v); + _v = _mm512_fmadd_ps(tse_val, tse, _v); + + _v = _mm512_fmadd_ps(bnw_val, bnw, _v); + _v = _mm512_fmadd_ps(bne_val, bne, _v); + _v = _mm512_fmadd_ps(bsw_val, bsw, _v); + _v = _mm512_fmadd_ps(bse_val, bse, _v); + + _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_bilinear_align1_reflection_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512 vImgDf = _mm512_set1_ps(src.d); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + const __m512i vImgDi = _mm512_set1_epi32(src.d); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + // x + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + + gx = _mm512_and_ps(gx, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflectx_v = _mm512_and_ps(_mm512_sub_ps(gx, border_x), *(__m512*)_ps512_inv_sign_mask); + gx = _mm512_sub_ps(border_x, reflectx_v); + + // y + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + + gy = 
_mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflecty_v = _mm512_and_ps(_mm512_sub_ps(gy, border_y), *(__m512*)_ps512_inv_sign_mask); + gy = _mm512_sub_ps(border_y, reflecty_v); + + // z + gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, v1fp16), two), _mm512_sub_ps(vImgDf, v1fp16)); + const __m512 border_z = _mm512_sub_ps(vImgDf, v1fp16); + + gz = _mm512_and_ps(gz, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflectz_v = _mm512_and_ps(_mm512_sub_ps(gz, border_z), *(__m512*)_ps512_inv_sign_mask); + gz = _mm512_sub_ps(border_z, reflectz_v); + } + + __m512 x_w = _mm512_floor_ps(gx); + __m512 y_n = _mm512_floor_ps(gy); + __m512 z_t = _mm512_floor_ps(gz); + + __m512 w = _mm512_sub_ps(gx, x_w); + __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 n = _mm512_sub_ps(gy, y_n); + __m512 s = _mm512_sub_ps(v1fp16, n); + __m512 t = _mm512_sub_ps(gz, z_t); + __m512 b = _mm512_sub_ps(v1fp16, t); + + __m512 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + __m512 nw = _mm512_mul_ps(s, e); + __m512 ne = _mm512_mul_ps(s, w); + __m512 sw = _mm512_mul_ps(n, e); + __m512 se = _mm512_mul_ps(n, w); + + tnw = _mm512_mul_ps(b, nw); + tne = _mm512_mul_ps(b, ne); + tsw = _mm512_mul_ps(b, sw); + tse = _mm512_mul_ps(b, se); + + bnw = _mm512_mul_ps(t, nw); + bne = _mm512_mul_ps(t, ne); + bsw = _mm512_mul_ps(t, sw); + bse = _mm512_mul_ps(t, se); + } + + __m512i x0 = _mm512_cvtps_epi32(x_w); + __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i y0 = _mm512_cvtps_epi32(y_n); + __m512i y1 = _mm512_add_epi32(y0, v1ip16); + __m512i z0 = _mm512_cvtps_epi32(z_t); + __m512i z1 = _mm512_add_epi32(z0, v1ip16); + + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, z1); + + __mmask16 v110_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __mmask16 v11_in_range = x1_in_range & y1_in_range; + + v110_in_range = x1_in_range & y1_in_range; + + v011_in_range = y1_in_range & z1_in_range; + v101_in_range = x1_in_range & z1_in_range; + v111_in_range = v11_in_range & z1_in_range; + } + + // (W*H*z + W*y + x) * elempack + vec(8) + __m512i i_tnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), z0), _mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i i_tne_offset = _mm512_add_epi32(i_tnw_offset, vElempacki); + __m512i i_tsw_offset = _mm512_add_epi32(i_tnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); + __m512i i_tse_offset = _mm512_add_epi32(i_tsw_offset, vElempacki); + + __m512i i_bnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m512i i_bne_offset = _mm512_add_epi32(i_bnw_offset, vElempacki); + __m512i i_bsw_offset = _mm512_add_epi32(i_bnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); + __m512i i_bse_offset = _mm512_add_epi32(i_bsw_offset, vElempacki); + + for (int q = 0; q < dst.c; q++) + { + __m512 tnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 0b1111111111111111, i_tnw_offset, src.channel(q), sizeof(float)); + __m512 tne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), x1_in_range, i_tne_offset, src.channel(q), sizeof(float)); + __m512 tsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), y1_in_range, i_tsw_offset, 
src.channel(q), sizeof(float)); + __m512 tse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v110_in_range, i_tse_offset, src.channel(q), sizeof(float)); + + __m512 bnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), z1_in_range, i_bnw_offset, src.channel(q), sizeof(float)); + __m512 bne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v101_in_range, i_bne_offset, src.channel(q), sizeof(float)); + __m512 bsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v011_in_range, i_bsw_offset, src.channel(q), sizeof(float)); + __m512 bse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v111_in_range, i_bse_offset, src.channel(q), sizeof(float)); + + __m512 _v = _mm512_mul_ps(tnw_val, tnw); + _v = _mm512_fmadd_ps(tne_val, tne, _v); + _v = _mm512_fmadd_ps(tsw_val, tsw, _v); + _v = _mm512_fmadd_ps(tse_val, tse, _v); + + _v = _mm512_fmadd_ps(bnw_val, bnw, _v); + _v = _mm512_fmadd_ps(bne_val, bne, _v); + _v = _mm512_fmadd_ps(bsw_val, bsw, _v); + _v = _mm512_fmadd_ps(bse_val, bse, _v); + + _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} \ No newline at end of file diff --git a/src/layer/x86/gridsample_bilinear_pack4.h b/src/layer/x86/gridsample_bilinear_pack4.h index 22328d78cf4..b9e8067a750 100644 --- a/src/layer/x86/gridsample_bilinear_pack4.h +++ b/src/layer/x86/gridsample_bilinear_pack4.h @@ -14,26 +14,26 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); // x gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); @@ -42,49 +42,49 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); } - auto x_w = _mm_floor_ps(gx); - auto y_n = _mm_floor_ps(gy); + __m128 x_w = _mm_floor_ps(gx); + __m128 y_n = _mm_floor_ps(gy); - auto w = _mm_sub_ps(gx, x_w); - auto e = _mm_sub_ps(v1fp4, w); - auto n = _mm_sub_ps(gy, y_n); - auto s = _mm_sub_ps(v1fp4, n); + __m128 w = _mm_sub_ps(gx, x_w); + __m128 e = _mm_sub_ps(v1fp4, w); + __m128 n = _mm_sub_ps(gy, y_n); + __m128 s = _mm_sub_ps(v1fp4, n); - auto nw = _mm_mul_ps(s, e); - auto ne = _mm_mul_ps(s, w); - auto sw = _mm_mul_ps(n, e); - auto se = _mm_mul_ps(n, w); + __m128 nw = _mm_mul_ps(s, e); + __m128 ne = _mm_mul_ps(s, w); + __m128 sw = _mm_mul_ps(n, e); + __m128 se = _mm_mul_ps(n, w); - auto x0 = _mm_cvtps_epi32(x_w); - auto x1 = _mm_add_epi32(x0, v1ip4); - auto y0 = _mm_cvtps_epi32(y_n); 
- auto y1 = _mm_add_epi32(y0, v1ip4); + __m128i x0 = _mm_cvtps_epi32(x_w); + __m128i x1 = _mm_add_epi32(x0, v1ip4); + __m128i y0 = _mm_cvtps_epi32(y_n); + __m128i y1 = _mm_add_epi32(y0, v1ip4); - auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - auto y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHi, y0)); - auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + __m128i x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); + __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + __m128i y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHi, y0)); + __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - auto v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); - auto v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); - auto v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); - auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + __m128i v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); + __m128i v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); + __m128i v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); + __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); // (W*y + x) * elempack + vec(8) - auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); - auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); + __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); + __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto nw_val = mask_gather_ps(src.channel(q), i_nw_offset, *reinterpret_cast<__m128*>(&v00_in_range)); - auto ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&v10_in_range)); - auto sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&v01_in_range)); - auto se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); + __m128 nw_val = mask_gather_ps(src.channel(q), i_nw_offset, *reinterpret_cast<__m128*>(&v00_in_range)); + __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&v10_in_range)); + __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&v01_in_range)); + __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); - auto _v = _mm_mul_ps(nw_val, nw); + __m128 _v = _mm_mul_ps(nw_val, nw); _v = _mm_comp_fmadd_ps(ne_val, ne, _v); _v = _mm_comp_fmadd_ps(sw_val, sw, _v); _v = _mm_comp_fmadd_ps(se_val, se, _v); @@ -97,26 +97,26 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& static void gridsample_2d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - 
const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); // x gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); @@ -125,49 +125,49 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); } - auto x_w = _mm_floor_ps(gx); - auto y_n = _mm_floor_ps(gy); + __m128 x_w = _mm_floor_ps(gx); + __m128 y_n = _mm_floor_ps(gy); - auto w = _mm_sub_ps(gx, x_w); - auto e = _mm_sub_ps(v1fp4, w); - auto n = _mm_sub_ps(gy, y_n); - auto s = _mm_sub_ps(v1fp4, n); + __m128 w = _mm_sub_ps(gx, x_w); + __m128 e = _mm_sub_ps(v1fp4, w); + __m128 n = _mm_sub_ps(gy, y_n); + __m128 s = _mm_sub_ps(v1fp4, n); - auto nw = _mm_mul_ps(s, e); - auto ne = _mm_mul_ps(s, w); - auto sw = _mm_mul_ps(n, e); - auto se = _mm_mul_ps(n, w); + __m128 nw = _mm_mul_ps(s, e); + __m128 ne = _mm_mul_ps(s, w); + __m128 sw = _mm_mul_ps(n, e); + __m128 se = _mm_mul_ps(n, w); - auto x0 = _mm_cvtps_epi32(x_w); - auto x1 = _mm_add_epi32(x0, v1ip4); - auto y0 = _mm_cvtps_epi32(y_n); - auto y1 = _mm_add_epi32(y0, v1ip4); + __m128i x0 = _mm_cvtps_epi32(x_w); + __m128i x1 = _mm_add_epi32(x0, v1ip4); + __m128i y0 = _mm_cvtps_epi32(y_n); + __m128i y1 = _mm_add_epi32(y0, v1ip4); - auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - auto y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHi, y0)); - auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + __m128i x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); + __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + __m128i y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHi, y0)); + __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - auto v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); - auto v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); - auto v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); - auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + __m128i v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); + __m128i v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); + __m128i v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); + __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); // (W*y + x) * elempack + vec(8) - auto i_nw_offset = 
_mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); - auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); + __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); + __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto nw_val = mask_gather_ps(src.channel(q), i_nw_offset, *reinterpret_cast<__m128*>(&v00_in_range)); - auto ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&v10_in_range)); - auto sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&v01_in_range)); - auto se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); + __m128 nw_val = mask_gather_ps(src.channel(q), i_nw_offset, *reinterpret_cast<__m128*>(&v00_in_range)); + __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&v10_in_range)); + __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&v01_in_range)); + __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); - auto _v = _mm_mul_ps(nw_val, nw); + __m128 _v = _mm_mul_ps(nw_val, nw); _v = _mm_comp_fmadd_ps(ne_val, ne, _v); _v = _mm_comp_fmadd_ps(sw_val, sw, _v); _v = _mm_comp_fmadd_ps(se_val, se, _v); @@ -180,80 +180,80 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& static void gridsample_2d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); // x gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); // y gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); } - auto x_w = _mm_floor_ps(gx); - 
auto y_n = _mm_floor_ps(gy); + __m128 x_w = _mm_floor_ps(gx); + __m128 y_n = _mm_floor_ps(gy); - auto w = _mm_sub_ps(gx, x_w); - auto e = _mm_sub_ps(v1fp4, w); - auto n = _mm_sub_ps(gy, y_n); - auto s = _mm_sub_ps(v1fp4, n); + __m128 w = _mm_sub_ps(gx, x_w); + __m128 e = _mm_sub_ps(v1fp4, w); + __m128 n = _mm_sub_ps(gy, y_n); + __m128 s = _mm_sub_ps(v1fp4, n); - auto nw = _mm_mul_ps(s, e); - auto ne = _mm_mul_ps(s, w); - auto sw = _mm_mul_ps(n, e); - auto se = _mm_mul_ps(n, w); + __m128 nw = _mm_mul_ps(s, e); + __m128 ne = _mm_mul_ps(s, w); + __m128 sw = _mm_mul_ps(n, e); + __m128 se = _mm_mul_ps(n, w); - auto x0 = _mm_cvtps_epi32(x_w); - auto x1 = _mm_add_epi32(x0, v1ip4); - auto y0 = _mm_cvtps_epi32(y_n); - auto y1 = _mm_add_epi32(y0, v1ip4); + __m128i x0 = _mm_cvtps_epi32(x_w); + __m128i x1 = _mm_add_epi32(x0, v1ip4); + __m128i y0 = _mm_cvtps_epi32(y_n); + __m128i y1 = _mm_add_epi32(y0, v1ip4); - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); // (W*y + x) * elempack + vec(8) - auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); - auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); + __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); + __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto nw_val = mask_gather_ps(src.channel(q), i_nw_offset, vn1fp4); - auto ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); - auto sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); - auto se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); + __m128 nw_val = mask_gather_ps(src.channel(q), i_nw_offset, vn1fp4); + __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); + __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); + __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); - auto _v = _mm_mul_ps(nw_val, nw); + __m128 _v = _mm_mul_ps(nw_val, nw); _v = _mm_comp_fmadd_ps(ne_val, ne, _v); _v = _mm_comp_fmadd_ps(sw_val, sw, _v); _v = _mm_comp_fmadd_ps(se_val, se, _v); @@ -266,80 +266,80 @@ static void gridsample_2d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& static void gridsample_2d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = 
_mm_set1_epi32(src.h); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); // x gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); // y gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); } - auto x_w = _mm_floor_ps(gx); - auto y_n = _mm_floor_ps(gy); + __m128 x_w = _mm_floor_ps(gx); + __m128 y_n = _mm_floor_ps(gy); - auto w = _mm_sub_ps(gx, x_w); - auto e = _mm_sub_ps(v1fp4, w); - auto n = _mm_sub_ps(gy, y_n); - auto s = _mm_sub_ps(v1fp4, n); + __m128 w = _mm_sub_ps(gx, x_w); + __m128 e = _mm_sub_ps(v1fp4, w); + __m128 n = _mm_sub_ps(gy, y_n); + __m128 s = _mm_sub_ps(v1fp4, n); - auto nw = _mm_mul_ps(s, e); - auto ne = _mm_mul_ps(s, w); - auto sw = _mm_mul_ps(n, e); - auto se = _mm_mul_ps(n, w); + __m128 nw = _mm_mul_ps(s, e); + __m128 ne = _mm_mul_ps(s, w); + __m128 sw = _mm_mul_ps(n, e); + __m128 se = _mm_mul_ps(n, w); - auto x0 = _mm_cvtps_epi32(x_w); - auto x1 = _mm_add_epi32(x0, v1ip4); - auto y0 = _mm_cvtps_epi32(y_n); - auto y1 = _mm_add_epi32(y0, v1ip4); + __m128i x0 = _mm_cvtps_epi32(x_w); + __m128i x1 = _mm_add_epi32(x0, v1ip4); + __m128i y0 = _mm_cvtps_epi32(y_n); + __m128i y1 = _mm_add_epi32(y0, v1ip4); - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); // (W*y + x) * elempack + vec(8) - auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); - auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); + __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); + __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + __m128i i_se_offset = 
_mm_add_epi32(i_sw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto nw_val = mask_gather_ps(src.channel(q), i_nw_offset, vn1fp4); - auto ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); - auto sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); - auto se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); + __m128 nw_val = mask_gather_ps(src.channel(q), i_nw_offset, vn1fp4); + __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); + __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); + __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); - auto _v = _mm_mul_ps(nw_val, nw); + __m128 _v = _mm_mul_ps(nw_val, nw); _v = _mm_comp_fmadd_ps(ne_val, ne, _v); _v = _mm_comp_fmadd_ps(sw_val, sw, _v); _v = _mm_comp_fmadd_ps(se_val, se, _v); @@ -352,38 +352,38 @@ static void gridsample_2d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& static void gridsample_2d_bilinear_align0_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); // x gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - auto v0p5fp4 = _mm_set1_ps(0.5f); + __m128 v0p5fp4 = _mm_set1_ps(0.5f); gx = _mm_add_ps(gx, v0p5fp4); - gx = _mm_and_ps(gx, *(__m128*)_ps256_inv_sign_mask); + gx = _mm_and_ps(gx, *(__m128*)_ps_inv_sign_mask); - auto reflectx_v = _mm_and_ps(_mm_sub_ps(gx, vImgWf), *(__m128*)_ps256_inv_sign_mask); + __m128 reflectx_v = _mm_and_ps(_mm_sub_ps(gx, vImgWf), *(__m128*)_ps_inv_sign_mask); gx = _mm_sub_ps(vImgWf, reflectx_v); gx = _mm_sub_ps(gx, v0p5fp4); @@ -395,13 +395,13 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack4(const Mat& src, // y gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); gy = _mm_add_ps(gy, v0p5fp4); - gy = _mm_and_ps(gy, *(__m128*)_ps256_inv_sign_mask); + gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); - auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, vImgHf), *(__m128*)_ps256_inv_sign_mask); + __m128 reflecty_v = _mm_and_ps(_mm_sub_ps(gy, vImgHf), *(__m128*)_ps_inv_sign_mask); gy = 
_mm_sub_ps(vImgHf, reflecty_v); gy = _mm_sub_ps(gy, v0p5fp4); @@ -411,44 +411,44 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack4(const Mat& src, gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); } - auto x_w = _mm_floor_ps(gx); - auto y_n = _mm_floor_ps(gy); + __m128 x_w = _mm_floor_ps(gx); + __m128 y_n = _mm_floor_ps(gy); - auto w = _mm_sub_ps(gx, x_w); - auto e = _mm_sub_ps(v1fp4, w); - auto n = _mm_sub_ps(gy, y_n); - auto s = _mm_sub_ps(v1fp4, n); + __m128 w = _mm_sub_ps(gx, x_w); + __m128 e = _mm_sub_ps(v1fp4, w); + __m128 n = _mm_sub_ps(gy, y_n); + __m128 s = _mm_sub_ps(v1fp4, n); - auto nw = _mm_mul_ps(s, e); - auto ne = _mm_mul_ps(s, w); - auto sw = _mm_mul_ps(n, e); - auto se = _mm_mul_ps(n, w); + __m128 nw = _mm_mul_ps(s, e); + __m128 ne = _mm_mul_ps(s, w); + __m128 sw = _mm_mul_ps(n, e); + __m128 se = _mm_mul_ps(n, w); - auto x0 = _mm_cvtps_epi32(x_w); - auto x1 = _mm_add_epi32(x0, v1ip4); - auto y0 = _mm_cvtps_epi32(y_n); - auto y1 = _mm_add_epi32(y0, v1ip4); + __m128i x0 = _mm_cvtps_epi32(x_w); + __m128i x1 = _mm_add_epi32(x0, v1ip4); + __m128i y0 = _mm_cvtps_epi32(y_n); + __m128i y1 = _mm_add_epi32(y0, v1ip4); - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); // (W*y + x) * elempack + vec(8) - auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); - auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); + __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); + __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto nw_val = mask_gather_ps(src.channel(q), i_nw_offset, vn1fp4); - auto ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); - auto sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); - auto se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); + __m128 nw_val = mask_gather_ps(src.channel(q), i_nw_offset, vn1fp4); + __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); + __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); + __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); - auto _v = _mm_mul_ps(nw_val, nw); + __m128 _v = _mm_mul_ps(nw_val, nw); _v = _mm_comp_fmadd_ps(ne_val, ne, _v); _v = _mm_comp_fmadd_ps(sw_val, sw, _v); _v = _mm_comp_fmadd_ps(se_val, se, _v); @@ -461,86 +461,86 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack4(const Mat& src, static void 
gridsample_2d_bilinear_align1_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); // x gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - gx = _mm_and_ps(gx, *(__m128*)_ps256_inv_sign_mask); + gx = _mm_and_ps(gx, *(__m128*)_ps_inv_sign_mask); - auto reflectx_v = _mm_and_ps(_mm_sub_ps(gx, border_x), *(__m128*)_ps256_inv_sign_mask); + __m128 reflectx_v = _mm_and_ps(_mm_sub_ps(gx, border_x), *(__m128*)_ps_inv_sign_mask); gx = _mm_sub_ps(border_x, reflectx_v); // y gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - gy = _mm_and_ps(gy, *(__m128*)_ps256_inv_sign_mask); + gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); - auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps256_inv_sign_mask); + __m128 reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps_inv_sign_mask); gy = _mm_sub_ps(border_y, reflecty_v); } - auto x_w = _mm_floor_ps(gx); - auto y_n = _mm_floor_ps(gy); + __m128 x_w = _mm_floor_ps(gx); + __m128 y_n = _mm_floor_ps(gy); - auto w = _mm_sub_ps(gx, x_w); - auto e = _mm_sub_ps(v1fp4, w); - auto n = _mm_sub_ps(gy, y_n); - auto s = _mm_sub_ps(v1fp4, n); + __m128 w = _mm_sub_ps(gx, x_w); + __m128 e = _mm_sub_ps(v1fp4, w); + __m128 n = _mm_sub_ps(gy, y_n); + __m128 s = _mm_sub_ps(v1fp4, n); - auto nw = _mm_mul_ps(s, e); - auto ne = _mm_mul_ps(s, w); - auto sw = _mm_mul_ps(n, e); - auto se = _mm_mul_ps(n, w); + __m128 nw = _mm_mul_ps(s, e); + __m128 ne = _mm_mul_ps(s, w); + __m128 sw = _mm_mul_ps(n, e); + __m128 se = _mm_mul_ps(n, w); - auto x0 = _mm_cvtps_epi32(x_w); - auto x1 = _mm_add_epi32(x0, v1ip4); - auto y0 = _mm_cvtps_epi32(y_n); - auto y1 = _mm_add_epi32(y0, v1ip4); + __m128i x0 = _mm_cvtps_epi32(x_w); + __m128i x1 = _mm_add_epi32(x0, v1ip4); + __m128i y0 = _mm_cvtps_epi32(y_n); + __m128i y1 = _mm_add_epi32(y0, v1ip4); - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - auto v11_in_range = 
_mm_and_si128(x1_in_range, y1_in_range); + __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); // (W*y + x) * elempack + vec(8) - auto i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); - auto i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - auto i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); + __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); + __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); + __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto nw_val = mask_gather_ps(src.channel(q), i_nw_offset, vn1fp4); - auto ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); - auto sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); - auto se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); + __m128 nw_val = mask_gather_ps(src.channel(q), i_nw_offset, vn1fp4); + __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); + __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); + __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); - auto _v = _mm_mul_ps(nw_val, nw); + __m128 _v = _mm_mul_ps(nw_val, nw); _v = _mm_comp_fmadd_ps(ne_val, ne, _v); _v = _mm_comp_fmadd_ps(sw_val, sw, _v); _v = _mm_comp_fmadd_ps(se_val, se, _v); @@ -553,16 +553,16 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack4(const Mat& src, static void gridsample_3d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgDf = _mm_set1_ps(src.d); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); - const auto vImgDi = _mm_set1_epi32(src.d); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128 vImgDf = _mm_set1_ps(src.d); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); + const __m128i vImgDi = _mm_set1_epi32(src.d); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -571,13 +571,13 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); // x gx = 
_mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); @@ -589,23 +589,23 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); } - auto x_w = _mm_floor_ps(gx); - auto y_n = _mm_floor_ps(gy); - auto z_t = _mm_floor_ps(gz); + __m128 x_w = _mm_floor_ps(gx); + __m128 y_n = _mm_floor_ps(gy); + __m128 z_t = _mm_floor_ps(gz); - auto w = _mm_sub_ps(gx, x_w); - auto e = _mm_sub_ps(v1fp4, w); - auto n = _mm_sub_ps(gy, y_n); - auto s = _mm_sub_ps(v1fp4, n); - auto t = _mm_sub_ps(gz, z_t); - auto b = _mm_sub_ps(v1fp4, t); + __m128 w = _mm_sub_ps(gx, x_w); + __m128 e = _mm_sub_ps(v1fp4, w); + __m128 n = _mm_sub_ps(gy, y_n); + __m128 s = _mm_sub_ps(v1fp4, n); + __m128 t = _mm_sub_ps(gz, z_t); + __m128 b = _mm_sub_ps(v1fp4, t); __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { - auto nw = _mm_mul_ps(s, e); - auto ne = _mm_mul_ps(s, w); - auto sw = _mm_mul_ps(n, e); - auto se = _mm_mul_ps(n, w); + __m128 nw = _mm_mul_ps(s, e); + __m128 ne = _mm_mul_ps(s, w); + __m128 sw = _mm_mul_ps(n, e); + __m128 se = _mm_mul_ps(n, w); tnw = _mm_mul_ps(b, nw); tne = _mm_mul_ps(b, ne); @@ -618,26 +618,26 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& bse = _mm_mul_ps(t, se); } - auto x0 = _mm_cvtps_epi32(x_w); - auto x1 = _mm_add_epi32(x0, v1ip4); - auto y0 = _mm_cvtps_epi32(y_n); - auto y1 = _mm_add_epi32(y0, v1ip4); - auto z0 = _mm_cvtps_epi32(z_t); - auto z1 = _mm_add_epi32(z0, v1ip4); + __m128i x0 = _mm_cvtps_epi32(x_w); + __m128i x1 = _mm_add_epi32(x0, v1ip4); + __m128i y0 = _mm_cvtps_epi32(y_n); + __m128i y1 = _mm_add_epi32(y0, v1ip4); + __m128i z0 = _mm_cvtps_epi32(z_t); + __m128i z1 = _mm_add_epi32(z0, v1ip4); - auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - auto y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHi, y0)); - auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - auto z0_in_range = _mm_and_si128(_mm_cmpgt_epi32(z0, vn1ip4), _mm_cmpgt_epi32(vImgDi, z0)); - auto z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); + __m128i x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); + __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + __m128i y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHi, y0)); + __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + __m128i z0_in_range = _mm_and_si128(_mm_cmpgt_epi32(z0, vn1ip4), _mm_cmpgt_epi32(vImgDi, z0)); + __m128i z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); __m128i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; { - auto v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); - auto v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); - auto v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); - auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + __m128i v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); + __m128i v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); + __m128i v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); + __m128i v11_in_range = _mm_and_si128(x1_in_range, 
y1_in_range); v000_in_range = _mm_and_si128(v00_in_range, z0_in_range); v010_in_range = _mm_and_si128(v01_in_range, z0_in_range); @@ -651,29 +651,29 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& } // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); - auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); - auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); + __m128i i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + __m128i i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); + __m128i i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + __m128i i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); - auto i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - auto i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); - auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); + __m128i i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m128i i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); + __m128i i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + __m128i i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, *reinterpret_cast<__m128*>(&v000_in_range)); - auto tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&v100_in_range)); - auto tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&v010_in_range)); - auto tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); + __m128 tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, *reinterpret_cast<__m128*>(&v000_in_range)); + __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&v100_in_range)); + __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&v010_in_range)); + __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); - auto bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&v001_in_range)); - auto bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); - auto bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); - auto bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); + __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&v001_in_range)); + __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); + __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); + __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, 
*reinterpret_cast<__m128*>(&v111_in_range)); - auto _v = _mm_mul_ps(tnw_val, tnw); + __m128 _v = _mm_mul_ps(tnw_val, tnw); _v = _mm_comp_fmadd_ps(tne_val, tne, _v); _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); _v = _mm_comp_fmadd_ps(tse_val, tse, _v); @@ -692,16 +692,16 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& static void gridsample_3d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgDf = _mm_set1_ps(src.d); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); - const auto vImgDi = _mm_set1_epi32(src.d); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128 vImgDf = _mm_set1_ps(src.d); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); + const __m128i vImgDi = _mm_set1_epi32(src.d); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -710,13 +710,13 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); // x gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); @@ -728,23 +728,23 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); } - auto x_w = _mm_floor_ps(gx); - auto y_n = _mm_floor_ps(gy); - auto z_t = _mm_floor_ps(gz); + __m128 x_w = _mm_floor_ps(gx); + __m128 y_n = _mm_floor_ps(gy); + __m128 z_t = _mm_floor_ps(gz); - auto w = _mm_sub_ps(gx, x_w); - auto e = _mm_sub_ps(v1fp4, w); - auto n = _mm_sub_ps(gy, y_n); - auto s = _mm_sub_ps(v1fp4, n); - auto t = _mm_sub_ps(gz, z_t); - auto b = _mm_sub_ps(v1fp4, t); + __m128 w = _mm_sub_ps(gx, x_w); + __m128 e = _mm_sub_ps(v1fp4, w); + __m128 n = _mm_sub_ps(gy, y_n); + __m128 s = _mm_sub_ps(v1fp4, n); + __m128 t = _mm_sub_ps(gz, z_t); + __m128 b = _mm_sub_ps(v1fp4, t); __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { - auto nw = _mm_mul_ps(s, e); - auto ne = _mm_mul_ps(s, w); - auto sw = _mm_mul_ps(n, e); - auto se = _mm_mul_ps(n, w); + __m128 nw = _mm_mul_ps(s, e); + __m128 ne = _mm_mul_ps(s, w); + __m128 sw = _mm_mul_ps(n, e); + __m128 se = _mm_mul_ps(n, w); tnw = _mm_mul_ps(b, nw); tne = _mm_mul_ps(b, ne); @@ -757,26 +757,26 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& bse = _mm_mul_ps(t, se); } - auto x0 = _mm_cvtps_epi32(x_w); - auto x1 = _mm_add_epi32(x0, v1ip4); - auto y0 = _mm_cvtps_epi32(y_n); - auto y1 = _mm_add_epi32(y0, v1ip4); - auto z0 = _mm_cvtps_epi32(z_t); - auto z1 = _mm_add_epi32(z0, v1ip4); + __m128i x0 = _mm_cvtps_epi32(x_w); + __m128i x1 = _mm_add_epi32(x0, v1ip4); + 
__m128i y0 = _mm_cvtps_epi32(y_n); + __m128i y1 = _mm_add_epi32(y0, v1ip4); + __m128i z0 = _mm_cvtps_epi32(z_t); + __m128i z1 = _mm_add_epi32(z0, v1ip4); - auto x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - auto y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHi, y0)); - auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - auto z0_in_range = _mm_and_si128(_mm_cmpgt_epi32(z0, vn1ip4), _mm_cmpgt_epi32(vImgDi, z0)); - auto z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); + __m128i x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); + __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + __m128i y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHi, y0)); + __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + __m128i z0_in_range = _mm_and_si128(_mm_cmpgt_epi32(z0, vn1ip4), _mm_cmpgt_epi32(vImgDi, z0)); + __m128i z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); __m128i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; { - auto v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); - auto v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); - auto v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); - auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + __m128i v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); + __m128i v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); + __m128i v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); + __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); v000_in_range = _mm_and_si128(v00_in_range, z0_in_range); v010_in_range = _mm_and_si128(v01_in_range, z0_in_range); @@ -790,29 +790,29 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& } // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); - auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); - auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); + __m128i i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + __m128i i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); + __m128i i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + __m128i i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); - auto i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - auto i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); - auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); + __m128i i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m128i i_bne_offset = 
_mm_add_epi32(i_bnw_offset, vElempacki); + __m128i i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + __m128i i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, *reinterpret_cast<__m128*>(&v000_in_range)); - auto tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&v100_in_range)); - auto tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&v010_in_range)); - auto tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); + __m128 tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, *reinterpret_cast<__m128*>(&v000_in_range)); + __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&v100_in_range)); + __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&v010_in_range)); + __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); - auto bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&v001_in_range)); - auto bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); - auto bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); - auto bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); + __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&v001_in_range)); + __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); + __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); + __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); - auto _v = _mm_mul_ps(tnw_val, tnw); + __m128 _v = _mm_mul_ps(tnw_val, tnw); _v = _mm_comp_fmadd_ps(tne_val, tne, _v); _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); _v = _mm_comp_fmadd_ps(tse_val, tse, _v); @@ -831,16 +831,16 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& static void gridsample_3d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgDf = _mm_set1_ps(src.d); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); - const auto vImgDi = _mm_set1_epi32(src.d); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128 vImgDf = _mm_set1_ps(src.d); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); + const __m128i vImgDi = _mm_set1_epi32(src.d); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -849,53 +849,53 @@ static void gridsample_3d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gz = 
_mm_set1_ps(gridptr[grid.elempack * 2]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); // x gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); // y gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); // z gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); - const auto border_z = _mm_sub_ps(vImgDf, v1fp4); + const __m128 border_z = _mm_sub_ps(vImgDf, v1fp4); gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); } - auto x_w = _mm_floor_ps(gx); - auto y_n = _mm_floor_ps(gy); - auto z_t = _mm_floor_ps(gz); + __m128 x_w = _mm_floor_ps(gx); + __m128 y_n = _mm_floor_ps(gy); + __m128 z_t = _mm_floor_ps(gz); - auto w = _mm_sub_ps(gx, x_w); - auto e = _mm_sub_ps(v1fp4, w); - auto n = _mm_sub_ps(gy, y_n); - auto s = _mm_sub_ps(v1fp4, n); - auto t = _mm_sub_ps(gz, z_t); - auto b = _mm_sub_ps(v1fp4, t); + __m128 w = _mm_sub_ps(gx, x_w); + __m128 e = _mm_sub_ps(v1fp4, w); + __m128 n = _mm_sub_ps(gy, y_n); + __m128 s = _mm_sub_ps(v1fp4, n); + __m128 t = _mm_sub_ps(gz, z_t); + __m128 b = _mm_sub_ps(v1fp4, t); __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { - auto nw = _mm_mul_ps(s, e); - auto ne = _mm_mul_ps(s, w); - auto sw = _mm_mul_ps(n, e); - auto se = _mm_mul_ps(n, w); + __m128 nw = _mm_mul_ps(s, e); + __m128 ne = _mm_mul_ps(s, w); + __m128 sw = _mm_mul_ps(n, e); + __m128 se = _mm_mul_ps(n, w); tnw = _mm_mul_ps(b, nw); tne = _mm_mul_ps(b, ne); @@ -908,20 +908,20 @@ static void gridsample_3d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& bse = _mm_mul_ps(t, se); } - auto x0 = _mm_cvtps_epi32(x_w); - auto x1 = _mm_add_epi32(x0, v1ip4); - auto y0 = _mm_cvtps_epi32(y_n); - auto y1 = _mm_add_epi32(y0, v1ip4); - auto z0 = _mm_cvtps_epi32(z_t); - auto z1 = _mm_add_epi32(z0, v1ip4); + __m128i x0 = _mm_cvtps_epi32(x_w); + __m128i x1 = _mm_add_epi32(x0, v1ip4); + __m128i y0 = _mm_cvtps_epi32(y_n); + __m128i y1 = _mm_add_epi32(y0, v1ip4); + __m128i z0 = _mm_cvtps_epi32(z_t); + __m128i z1 = _mm_add_epi32(z0, v1ip4); - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - auto z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); + __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + __m128i z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); __m128i v110_in_range, v011_in_range, v101_in_range, v111_in_range; { - auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); v110_in_range = _mm_and_si128(x1_in_range, y1_in_range); @@ -931,29 +931,29 @@ static void gridsample_3d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& } // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = 
_mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); - auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); - auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); + __m128i i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + __m128i i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); + __m128i i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + __m128i i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); - auto i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - auto i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); - auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); + __m128i i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m128i i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); + __m128i i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + __m128i i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); - auto tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); - auto tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); - auto tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); + __m128 tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); + __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); + __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); + __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); - auto bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range)); - auto bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); - auto bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); - auto bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); + __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range)); + __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); + __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); + __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); - auto _v = _mm_mul_ps(tnw_val, tnw); + __m128 _v = _mm_mul_ps(tnw_val, tnw); _v = _mm_comp_fmadd_ps(tne_val, tne, _v); _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); _v = _mm_comp_fmadd_ps(tse_val, tse, _v); @@ -972,16 +972,16 @@ static void gridsample_3d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& static void gridsample_3d_bilinear_align1_border_blob_pack4(const 
Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgDf = _mm_set1_ps(src.d); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); - const auto vImgDi = _mm_set1_epi32(src.d); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128 vImgDf = _mm_set1_ps(src.d); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); + const __m128i vImgDi = _mm_set1_epi32(src.d); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -990,53 +990,53 @@ static void gridsample_3d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); // x gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); // y gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); // z gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); - const auto border_z = _mm_sub_ps(vImgDf, v1fp4); + const __m128 border_z = _mm_sub_ps(vImgDf, v1fp4); gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); } - auto x_w = _mm_floor_ps(gx); - auto y_n = _mm_floor_ps(gy); - auto z_t = _mm_floor_ps(gz); + __m128 x_w = _mm_floor_ps(gx); + __m128 y_n = _mm_floor_ps(gy); + __m128 z_t = _mm_floor_ps(gz); - auto w = _mm_sub_ps(gx, x_w); - auto e = _mm_sub_ps(v1fp4, w); - auto n = _mm_sub_ps(gy, y_n); - auto s = _mm_sub_ps(v1fp4, n); - auto t = _mm_sub_ps(gz, z_t); - auto b = _mm_sub_ps(v1fp4, t); + __m128 w = _mm_sub_ps(gx, x_w); + __m128 e = _mm_sub_ps(v1fp4, w); + __m128 n = _mm_sub_ps(gy, y_n); + __m128 s = _mm_sub_ps(v1fp4, n); + __m128 t = _mm_sub_ps(gz, z_t); + __m128 b = _mm_sub_ps(v1fp4, t); __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { - auto nw = _mm_mul_ps(s, e); - auto ne = _mm_mul_ps(s, w); - auto sw = _mm_mul_ps(n, e); - auto se = _mm_mul_ps(n, w); + __m128 nw = _mm_mul_ps(s, e); + __m128 ne = _mm_mul_ps(s, w); + __m128 sw = _mm_mul_ps(n, e); + __m128 se = _mm_mul_ps(n, w); tnw = _mm_mul_ps(b, nw); tne = _mm_mul_ps(b, ne); @@ -1049,20 +1049,20 @@ static void gridsample_3d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& bse = _mm_mul_ps(t, se); } - auto x0 = _mm_cvtps_epi32(x_w); - auto x1 = _mm_add_epi32(x0, v1ip4); - auto y0 = _mm_cvtps_epi32(y_n); - auto y1 = _mm_add_epi32(y0, v1ip4); - auto z0 = _mm_cvtps_epi32(z_t); - auto z1 = _mm_add_epi32(z0, v1ip4); + 
__m128i x0 = _mm_cvtps_epi32(x_w); + __m128i x1 = _mm_add_epi32(x0, v1ip4); + __m128i y0 = _mm_cvtps_epi32(y_n); + __m128i y1 = _mm_add_epi32(y0, v1ip4); + __m128i z0 = _mm_cvtps_epi32(z_t); + __m128i z1 = _mm_add_epi32(z0, v1ip4); - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - auto z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); + __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + __m128i z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); __m128i v110_in_range, v011_in_range, v101_in_range, v111_in_range; { - auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); v110_in_range = _mm_and_si128(x1_in_range, y1_in_range); @@ -1072,29 +1072,29 @@ static void gridsample_3d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& } // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); - auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); - auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); + __m128i i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + __m128i i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); + __m128i i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + __m128i i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); - auto i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - auto i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); - auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); + __m128i i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m128i i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); + __m128i i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + __m128i i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); - auto tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); - auto tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); - auto tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); + __m128 tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); + __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); + __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); + __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); - 
auto bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range)); - auto bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); - auto bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); - auto bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); + __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range)); + __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); + __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); + __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); - auto _v = _mm_mul_ps(tnw_val, tnw); + __m128 _v = _mm_mul_ps(tnw_val, tnw); _v = _mm_comp_fmadd_ps(tne_val, tne, _v); _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); _v = _mm_comp_fmadd_ps(tse_val, tse, _v); @@ -1113,16 +1113,16 @@ static void gridsample_3d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgDf = _mm_set1_ps(src.d); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); - const auto vImgDi = _mm_set1_epi32(src.d); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128 vImgDf = _mm_set1_ps(src.d); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); + const __m128i vImgDi = _mm_set1_epi32(src.d); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -1131,24 +1131,24 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); // x gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - auto v0p5fp4 = _mm_set1_ps(0.5f); + __m128 v0p5fp4 = _mm_set1_ps(0.5f); gx = _mm_add_ps(gx, v0p5fp4); - gx = _mm_and_ps(gx, *(__m128*)_ps256_inv_sign_mask); + gx = _mm_and_ps(gx, *(__m128*)_ps_inv_sign_mask); - auto reflectx_v = _mm_and_ps(_mm_sub_ps(gx, vImgWf), *(__m128*)_ps256_inv_sign_mask); + __m128 reflectx_v = _mm_and_ps(_mm_sub_ps(gx, vImgWf), *(__m128*)_ps_inv_sign_mask); gx = _mm_sub_ps(vImgWf, reflectx_v); gx = _mm_sub_ps(gx, v0p5fp4); @@ -1159,13 +1159,13 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, // y gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), 
two); - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); gy = _mm_add_ps(gy, v0p5fp4); - gy = _mm_and_ps(gy, *(__m128*)_ps256_inv_sign_mask); + gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); - auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, vImgHf), *(__m128*)_ps256_inv_sign_mask); + __m128 reflecty_v = _mm_and_ps(_mm_sub_ps(gy, vImgHf), *(__m128*)_ps_inv_sign_mask); gy = _mm_sub_ps(vImgHf, reflecty_v); gy = _mm_sub_ps(gy, v0p5fp4); @@ -1176,13 +1176,13 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, // z gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); - const auto border_z = _mm_sub_ps(vImgDf, v1fp4); + const __m128 border_z = _mm_sub_ps(vImgDf, v1fp4); gz = _mm_add_ps(gz, v0p5fp4); - gz = _mm_and_ps(gz, *(__m128*)_ps256_inv_sign_mask); + gz = _mm_and_ps(gz, *(__m128*)_ps_inv_sign_mask); - auto reflectz_v = _mm_and_ps(_mm_sub_ps(gz, vImgDf), *(__m128*)_ps256_inv_sign_mask); + __m128 reflectz_v = _mm_and_ps(_mm_sub_ps(gz, vImgDf), *(__m128*)_ps_inv_sign_mask); gz = _mm_sub_ps(vImgDf, reflectz_v); gz = _mm_sub_ps(gz, v0p5fp4); @@ -1192,23 +1192,23 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); } - auto x_w = _mm_floor_ps(gx); - auto y_n = _mm_floor_ps(gy); - auto z_t = _mm_floor_ps(gz); + __m128 x_w = _mm_floor_ps(gx); + __m128 y_n = _mm_floor_ps(gy); + __m128 z_t = _mm_floor_ps(gz); - auto w = _mm_sub_ps(gx, x_w); - auto e = _mm_sub_ps(v1fp4, w); - auto n = _mm_sub_ps(gy, y_n); - auto s = _mm_sub_ps(v1fp4, n); - auto t = _mm_sub_ps(gz, z_t); - auto b = _mm_sub_ps(v1fp4, t); + __m128 w = _mm_sub_ps(gx, x_w); + __m128 e = _mm_sub_ps(v1fp4, w); + __m128 n = _mm_sub_ps(gy, y_n); + __m128 s = _mm_sub_ps(v1fp4, n); + __m128 t = _mm_sub_ps(gz, z_t); + __m128 b = _mm_sub_ps(v1fp4, t); __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { - auto nw = _mm_mul_ps(s, e); - auto ne = _mm_mul_ps(s, w); - auto sw = _mm_mul_ps(n, e); - auto se = _mm_mul_ps(n, w); + __m128 nw = _mm_mul_ps(s, e); + __m128 ne = _mm_mul_ps(s, w); + __m128 sw = _mm_mul_ps(n, e); + __m128 se = _mm_mul_ps(n, w); tnw = _mm_mul_ps(b, nw); tne = _mm_mul_ps(b, ne); @@ -1221,20 +1221,20 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, bse = _mm_mul_ps(t, se); } - auto x0 = _mm_cvtps_epi32(x_w); - auto x1 = _mm_add_epi32(x0, v1ip4); - auto y0 = _mm_cvtps_epi32(y_n); - auto y1 = _mm_add_epi32(y0, v1ip4); - auto z0 = _mm_cvtps_epi32(z_t); - auto z1 = _mm_add_epi32(z0, v1ip4); + __m128i x0 = _mm_cvtps_epi32(x_w); + __m128i x1 = _mm_add_epi32(x0, v1ip4); + __m128i y0 = _mm_cvtps_epi32(y_n); + __m128i y1 = _mm_add_epi32(y0, v1ip4); + __m128i z0 = _mm_cvtps_epi32(z_t); + __m128i z1 = _mm_add_epi32(z0, v1ip4); - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - auto z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); + __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + __m128i z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); __m128i v110_in_range, v011_in_range, v101_in_range, v111_in_range; { - auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); 
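// A rough scalar sketch of what the masks and offsets in this hunk compute (hypothetical
// variable names, single lane k in [0, 4) of a pack4 blob), assuming src.w/h/d are the blob
// extents:
//
//   int x1 = x0 + 1, y1 = y0 + 1, z1 = z0 + 1;
//   bool x1_in = (x1 > -1) && (x1 < src.w);   // the _mm_cmpgt_epi32 pair on x1
//   bool y1_in = (y1 > -1) && (y1 < src.h);
//   bool z1_in = (z1 > -1) && (z1 < src.d);
//   bool v11_in = x1_in && y1_in;             // corner (x1, y1) valid within the plane
//   int i_tnw = (src.w * src.h * z0 + src.w * y0 + x0) * src.elempack + k;  // (W*H*z + W*y + x) * elempack + lane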
+ __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); v110_in_range = _mm_and_si128(x1_in_range, y1_in_range); @@ -1244,29 +1244,29 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, } // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); - auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); - auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); + __m128i i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + __m128i i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); + __m128i i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + __m128i i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); - auto i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - auto i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); - auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); + __m128i i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m128i i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); + __m128i i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + __m128i i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); - auto tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); - auto tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); - auto tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); + __m128 tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); + __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); + __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); + __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); - auto bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range)); - auto bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); - auto bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); - auto bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); + __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range)); + __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); + __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); + __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); - auto _v = _mm_mul_ps(tnw_val, tnw); + __m128 _v = _mm_mul_ps(tnw_val, tnw); _v 
= _mm_comp_fmadd_ps(tne_val, tne, _v); _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); _v = _mm_comp_fmadd_ps(tse_val, tse, _v); @@ -1285,16 +1285,16 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, static void gridsample_3d_bilinear_align1_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgDf = _mm_set1_ps(src.d); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); - const auto vImgDi = _mm_set1_epi32(src.d); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128 vImgDf = _mm_set1_ps(src.d); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); + const __m128i vImgDi = _mm_set1_epi32(src.d); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -1303,59 +1303,59 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack4(const Mat& src, { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); // x gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - gx = _mm_and_ps(gx, *(__m128*)_ps256_inv_sign_mask); + gx = _mm_and_ps(gx, *(__m128*)_ps_inv_sign_mask); - auto reflectx_v = _mm_and_ps(_mm_sub_ps(gx, border_x), *(__m128*)_ps256_inv_sign_mask); + __m128 reflectx_v = _mm_and_ps(_mm_sub_ps(gx, border_x), *(__m128*)_ps_inv_sign_mask); gx = _mm_sub_ps(border_x, reflectx_v); // y gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - gy = _mm_and_ps(gy, *(__m128*)_ps256_inv_sign_mask); + gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); - auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps256_inv_sign_mask); + __m128 reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps_inv_sign_mask); gy = _mm_sub_ps(border_y, reflecty_v); // z gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); - const auto border_z = _mm_sub_ps(vImgDf, v1fp4); + const __m128 border_z = _mm_sub_ps(vImgDf, v1fp4); - gz = _mm_and_ps(gz, *(__m128*)_ps256_inv_sign_mask); + gz = _mm_and_ps(gz, *(__m128*)_ps_inv_sign_mask); - auto reflectz_v = _mm_and_ps(_mm_sub_ps(gz, border_z), *(__m128*)_ps256_inv_sign_mask); + __m128 reflectz_v = _mm_and_ps(_mm_sub_ps(gz, border_z), *(__m128*)_ps_inv_sign_mask); gz = _mm_sub_ps(border_z, reflectz_v); } - auto x_w = _mm_floor_ps(gx); - auto y_n = _mm_floor_ps(gy); - auto z_t = _mm_floor_ps(gz); + __m128 x_w = _mm_floor_ps(gx); + __m128 y_n = _mm_floor_ps(gy); + __m128 z_t = _mm_floor_ps(gz); - auto w = 
_mm_sub_ps(gx, x_w); - auto e = _mm_sub_ps(v1fp4, w); - auto n = _mm_sub_ps(gy, y_n); - auto s = _mm_sub_ps(v1fp4, n); - auto t = _mm_sub_ps(gz, z_t); - auto b = _mm_sub_ps(v1fp4, t); + __m128 w = _mm_sub_ps(gx, x_w); + __m128 e = _mm_sub_ps(v1fp4, w); + __m128 n = _mm_sub_ps(gy, y_n); + __m128 s = _mm_sub_ps(v1fp4, n); + __m128 t = _mm_sub_ps(gz, z_t); + __m128 b = _mm_sub_ps(v1fp4, t); __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { - auto nw = _mm_mul_ps(s, e); - auto ne = _mm_mul_ps(s, w); - auto sw = _mm_mul_ps(n, e); - auto se = _mm_mul_ps(n, w); + __m128 nw = _mm_mul_ps(s, e); + __m128 ne = _mm_mul_ps(s, w); + __m128 sw = _mm_mul_ps(n, e); + __m128 se = _mm_mul_ps(n, w); tnw = _mm_mul_ps(b, nw); tne = _mm_mul_ps(b, ne); @@ -1368,20 +1368,20 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack4(const Mat& src, bse = _mm_mul_ps(t, se); } - auto x0 = _mm_cvtps_epi32(x_w); - auto x1 = _mm_add_epi32(x0, v1ip4); - auto y0 = _mm_cvtps_epi32(y_n); - auto y1 = _mm_add_epi32(y0, v1ip4); - auto z0 = _mm_cvtps_epi32(z_t); - auto z1 = _mm_add_epi32(z0, v1ip4); + __m128i x0 = _mm_cvtps_epi32(x_w); + __m128i x1 = _mm_add_epi32(x0, v1ip4); + __m128i y0 = _mm_cvtps_epi32(y_n); + __m128i y1 = _mm_add_epi32(y0, v1ip4); + __m128i z0 = _mm_cvtps_epi32(z_t); + __m128i z1 = _mm_add_epi32(z0, v1ip4); - auto x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - auto y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - auto z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); + __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); + __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); + __m128i z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); __m128i v110_in_range, v011_in_range, v101_in_range, v111_in_range; { - auto v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); + __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); v110_in_range = _mm_and_si128(x1_in_range, y1_in_range); @@ -1391,29 +1391,29 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack4(const Mat& src, } // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); - auto i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); - auto i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - auto i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); + __m128i i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + __m128i i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); + __m128i i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + __m128i i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); - auto i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - auto i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); - auto i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - auto i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); + __m128i i_bnw_offset = 
_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m128i i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); + __m128i i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); + __m128i i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); - auto tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); - auto tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); - auto tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); + __m128 tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); + __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); + __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); + __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); - auto bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range)); - auto bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); - auto bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); - auto bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); + __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range)); + __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); + __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); + __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); - auto _v = _mm_mul_ps(tnw_val, tnw); + __m128 _v = _mm_mul_ps(tnw_val, tnw); _v = _mm_comp_fmadd_ps(tne_val, tne, _v); _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); _v = _mm_comp_fmadd_ps(tse_val, tse, _v); diff --git a/src/layer/x86/gridsample_bilinear_pack8.h b/src/layer/x86/gridsample_bilinear_pack8.h index bf557ed99d6..593b3343d0b 100644 --- a/src/layer/x86/gridsample_bilinear_pack8.h +++ b/src/layer/x86/gridsample_bilinear_pack8.h @@ -14,12 +14,12 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -28,12 +28,12 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = 
_mm256_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); // x gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); @@ -42,49 +42,49 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); } - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(v1fp8, n); - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i y1 = _mm256_add_epi32(y0, v1ip8); - auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); // (W*y + x) * elempack + vec(8) - auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i 
i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + __m256 nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); + __m256 ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); + __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); + __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); - auto _v = _mm256_mul_ps(nw_val, nw); + __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); _v = _mm256_comp_fmadd_ps(se_val, se, _v); @@ -97,12 +97,12 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& static void gridsample_2d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -111,12 +111,12 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); // x gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); @@ -125,49 +125,49 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); } - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); - - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - 
auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - - auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - - auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(v1fp8, n); + + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); + + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i y1 = _mm256_add_epi32(y0, v1ip8); + + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + + __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); // (W*y + x) * elempack + vec(8) - auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + __m256 nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), 
sizeof(float)); + __m256 ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); + __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); + __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); - auto _v = _mm256_mul_ps(nw_val, nw); + __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); _v = _mm256_comp_fmadd_ps(se_val, se, _v); @@ -180,12 +180,12 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& static void gridsample_2d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -194,66 +194,66 @@ static void gridsample_2d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); // x gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); } - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); - - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(v1fp8, n); 
+ + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); + + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i y1 = _mm256_add_epi32(y0, v1ip8); + + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); // (W*y + x) * elempack + vec(8) - auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, vn1fp8, sizeof(float)); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + __m256 nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, vn1fp8, sizeof(float)); + __m256 ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); - auto _v = _mm256_mul_ps(nw_val, nw); + __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); _v = _mm256_comp_fmadd_ps(se_val, se, _v); @@ -266,12 +266,12 @@ static void gridsample_2d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& static void gridsample_2d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < 
dst.h; y++) @@ -280,66 +280,66 @@ static void gridsample_2d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); // x gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); } - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); - - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(v1fp8, n); + + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); + + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i y1 = _mm256_add_epi32(y0, v1ip8); + + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); // (W*y + x) * elempack + vec(8) - auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto nw_val 
= _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, vn1fp8, sizeof(float)); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + __m256 nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, vn1fp8, sizeof(float)); + __m256 ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); - auto _v = _mm256_mul_ps(nw_val, nw); + __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); _v = _mm256_comp_fmadd_ps(se_val, se, _v); @@ -352,12 +352,12 @@ static void gridsample_2d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& static void gridsample_2d_bilinear_align0_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -366,24 +366,24 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack8(const Mat& src, { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); // x gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - auto v0p5fp8 = _mm256_set1_ps(0.5f); + __m256 v0p5fp8 = _mm256_set1_ps(0.5f); gx = _mm256_add_ps(gx, v0p5fp8); gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - auto reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); gx = _mm256_sub_ps(vImgWf, reflectx_v); gx = _mm256_sub_ps(gx, v0p5fp8); @@ -395,13 +395,13 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack8(const Mat& src, // y gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 
border_y = _mm256_sub_ps(vImgHf, v1fp8); gy = _mm256_add_ps(gy, v0p5fp8); gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); gy = _mm256_sub_ps(vImgHf, reflecty_v); gy = _mm256_sub_ps(gy, v0p5fp8); @@ -411,44 +411,44 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack8(const Mat& src, gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); } - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); - - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(v1fp8, n); + + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); + + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i y1 = _mm256_add_epi32(y0, v1ip8); + + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); // (W*y + x) * elempack + vec(8) - auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, vn1fp8, sizeof(float)); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + __m256 
nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, vn1fp8, sizeof(float)); + __m256 ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); - auto _v = _mm256_mul_ps(nw_val, nw); + __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); _v = _mm256_comp_fmadd_ps(se_val, se, _v); @@ -461,12 +461,12 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack8(const Mat& src, static void gridsample_2d_bilinear_align1_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -475,72 +475,72 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack8(const Mat& src, { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); // x gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - auto reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); gx = _mm256_sub_ps(border_x, reflectx_v); // y gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); gy = _mm256_sub_ps(border_y, reflecty_v); } - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(v1fp8, n); - auto nw = 
_mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i y1 = _mm256_add_epi32(y0, v1ip8); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); // (W*y + x) * elempack + vec(8) - auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, vn1fp8, sizeof(float)); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + __m256 nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, vn1fp8, sizeof(float)); + __m256 ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); - auto _v = _mm256_mul_ps(nw_val, nw); + __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); _v = _mm256_comp_fmadd_ps(se_val, se, _v); @@ -553,14 +553,14 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack8(const Mat& src, static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = 
_mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgDf = _mm256_set1_ps(src.d); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); - const auto vImgDi = _mm256_set1_epi32(src.d); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); + const __m256i vImgDi = _mm256_set1_epi32(src.d); - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -571,13 +571,13 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); // x gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); @@ -589,23 +589,23 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, v1fp8), vImgDf, v1fp8), two); } - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - auto z_t = _mm256_floor_ps(gz); + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + __m256 z_t = _mm256_floor_ps(gz); - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - auto t = _mm256_sub_ps(gz, z_t); - auto b = _mm256_sub_ps(v1fp8, t); + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(v1fp8, n); + __m256 t = _mm256_sub_ps(gz, z_t); + __m256 b = _mm256_sub_ps(v1fp8, t); __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); tnw = _mm256_mul_ps(b, nw); tne = _mm256_mul_ps(b, ne); @@ -618,26 +618,26 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& bse = _mm256_mul_ps(t, se); } - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - auto z0 = _mm256_cvtps_epi32(z_t); - auto z1 = _mm256_add_epi32(z0, v1ip8); - - auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - auto z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, vn1ip8), 
_mm256_cmpgt_epi32(vImgDi, z0)); - auto z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i y1 = _mm256_add_epi32(y0, v1ip8); + __m256i z0 = _mm256_cvtps_epi32(z_t); + __m256i z1 = _mm256_add_epi32(z0, v1ip8); + + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z0)); + __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); __m256i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; { - auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); v000_in_range = _mm256_and_si256(v00_in_range, z0_in_range); v010_in_range = _mm256_and_si256(v01_in_range, z0_in_range); @@ -651,29 +651,29 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& } // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); - auto i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - - auto i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - auto i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); - auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + __m256i i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); + __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); + + __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); + __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, 
vElempacki)); + __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, *reinterpret_cast<__m256*>(&v000_in_range), sizeof(float)); - auto tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&v100_in_range), sizeof(float)); - auto tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&v010_in_range), sizeof(float)); - auto tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); + __m256 tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, *reinterpret_cast<__m256*>(&v000_in_range), sizeof(float)); + __m256 tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&v100_in_range), sizeof(float)); + __m256 tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&v010_in_range), sizeof(float)); + __m256 tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); - auto bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&v001_in_range), sizeof(float)); - auto bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); - auto bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); - auto bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + __m256 bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&v001_in_range), sizeof(float)); + __m256 bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); + __m256 bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); + __m256 bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); - auto _v = _mm256_mul_ps(tnw_val, tnw); + __m256 _v = _mm256_mul_ps(tnw_val, tnw); _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); @@ -692,14 +692,14 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgDf = _mm256_set1_ps(src.d); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); - const auto vImgDi = _mm256_set1_epi32(src.d); - - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); + const __m256i vImgWi = 
_mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); + const __m256i vImgDi = _mm256_set1_epi32(src.d); + + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -710,13 +710,13 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); // x gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); @@ -728,23 +728,23 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, v1fp8), two), _mm256_sub_ps(vImgDf, v1fp8)); } - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - auto z_t = _mm256_floor_ps(gz); + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + __m256 z_t = _mm256_floor_ps(gz); - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - auto t = _mm256_sub_ps(gz, z_t); - auto b = _mm256_sub_ps(v1fp8, t); + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(v1fp8, n); + __m256 t = _mm256_sub_ps(gz, z_t); + __m256 b = _mm256_sub_ps(v1fp8, t); __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); tnw = _mm256_mul_ps(b, nw); tne = _mm256_mul_ps(b, ne); @@ -757,26 +757,26 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& bse = _mm256_mul_ps(t, se); } - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - auto z0 = _mm256_cvtps_epi32(z_t); - auto z1 = _mm256_add_epi32(z0, v1ip8); - - auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - auto z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z0)); - auto z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i y1 = _mm256_add_epi32(y0, v1ip8); + __m256i z0 = _mm256_cvtps_epi32(z_t); + __m256i z1 = _mm256_add_epi32(z0, v1ip8); + + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), 
_mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z0)); + __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); __m256i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; { - auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); v000_in_range = _mm256_and_si256(v00_in_range, z0_in_range); v010_in_range = _mm256_and_si256(v01_in_range, z0_in_range); @@ -790,29 +790,29 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& } // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); - auto i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - - auto i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - auto i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); - auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + __m256i i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); + __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); + + __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); + __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, *reinterpret_cast<__m256*>(&v000_in_range), sizeof(float)); - auto tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&v100_in_range), sizeof(float)); - auto tsw_val = 
_mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&v010_in_range), sizeof(float)); - auto tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); + __m256 tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, *reinterpret_cast<__m256*>(&v000_in_range), sizeof(float)); + __m256 tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&v100_in_range), sizeof(float)); + __m256 tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&v010_in_range), sizeof(float)); + __m256 tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); - auto bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&v001_in_range), sizeof(float)); - auto bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); - auto bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); - auto bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + __m256 bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&v001_in_range), sizeof(float)); + __m256 bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); + __m256 bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); + __m256 bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); - auto _v = _mm256_mul_ps(tnw_val, tnw); + __m256 _v = _mm256_mul_ps(tnw_val, tnw); _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); @@ -831,14 +831,14 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgDf = _mm256_set1_ps(src.d); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); - const auto vImgDi = _mm256_set1_epi32(src.d); - - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); + const __m256i vImgDi = _mm256_set1_epi32(src.d); + + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -849,53 +849,53 @@ static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& { //grid tensor has been packed const float* gridptr = grid.channel(z / 
grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); // x gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); // z gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, v1fp8), vImgDf, v1fp8), two); - const auto border_z = _mm256_sub_ps(vImgDf, v1fp8); + const __m256 border_z = _mm256_sub_ps(vImgDf, v1fp8); gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); } - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - auto z_t = _mm256_floor_ps(gz); + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + __m256 z_t = _mm256_floor_ps(gz); - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - auto t = _mm256_sub_ps(gz, z_t); - auto b = _mm256_sub_ps(v1fp8, t); + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(v1fp8, n); + __m256 t = _mm256_sub_ps(gz, z_t); + __m256 b = _mm256_sub_ps(v1fp8, t); __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); tnw = _mm256_mul_ps(b, nw); tne = _mm256_mul_ps(b, ne); @@ -908,20 +908,20 @@ static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& bse = _mm256_mul_ps(t, se); } - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - auto z0 = _mm256_cvtps_epi32(z_t); - auto z1 = _mm256_add_epi32(z0, v1ip8); + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i y1 = _mm256_add_epi32(y0, v1ip8); + __m256i z0 = _mm256_cvtps_epi32(z_t); + __m256i z1 = _mm256_add_epi32(z0, v1ip8); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - auto z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); __m256i 
v110_in_range, v011_in_range, v101_in_range, v111_in_range; { - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); v110_in_range = _mm256_and_si256(x1_in_range, y1_in_range); @@ -931,29 +931,29 @@ static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& } // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); - auto i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - - auto i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - auto i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); - auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + __m256i i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); + __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); + + __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); + __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, vn1fp8, sizeof(float)); - auto tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - auto tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - auto tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); + __m256 tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, vn1fp8, sizeof(float)); + __m256 tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + __m256 tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + __m256 tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); - auto bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range), sizeof(float)); - auto bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), 
sizeof(float)); - auto bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); - auto bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + __m256 bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range), sizeof(float)); + __m256 bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); + __m256 bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); + __m256 bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); - auto _v = _mm256_mul_ps(tnw_val, tnw); + __m256 _v = _mm256_mul_ps(tnw_val, tnw); _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); @@ -972,14 +972,14 @@ static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgDf = _mm256_set1_ps(src.d); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); - const auto vImgDi = _mm256_set1_epi32(src.d); - - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); + const __m256i vImgDi = _mm256_set1_epi32(src.d); + + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -990,53 +990,53 @@ static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); // x gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); // z gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, v1fp8), two), _mm256_sub_ps(vImgDf, v1fp8)); - const auto border_z = _mm256_sub_ps(vImgDf, v1fp8); + const __m256 border_z = 
_mm256_sub_ps(vImgDf, v1fp8); gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); } - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - auto z_t = _mm256_floor_ps(gz); + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + __m256 z_t = _mm256_floor_ps(gz); - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - auto t = _mm256_sub_ps(gz, z_t); - auto b = _mm256_sub_ps(v1fp8, t); + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(v1fp8, n); + __m256 t = _mm256_sub_ps(gz, z_t); + __m256 b = _mm256_sub_ps(v1fp8, t); __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); tnw = _mm256_mul_ps(b, nw); tne = _mm256_mul_ps(b, ne); @@ -1049,20 +1049,20 @@ static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& bse = _mm256_mul_ps(t, se); } - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - auto z0 = _mm256_cvtps_epi32(z_t); - auto z1 = _mm256_add_epi32(z0, v1ip8); - - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - auto z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i y1 = _mm256_add_epi32(y0, v1ip8); + __m256i z0 = _mm256_cvtps_epi32(z_t); + __m256i z1 = _mm256_add_epi32(z0, v1ip8); + + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); __m256i v110_in_range, v011_in_range, v101_in_range, v111_in_range; { - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); v110_in_range = _mm256_and_si256(x1_in_range, y1_in_range); @@ -1072,29 +1072,29 @@ static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& } // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); - auto i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - - auto i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - auto i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); - auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto 
i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + __m256i i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); + __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); + + __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); + __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, vn1fp8, sizeof(float)); - auto tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - auto tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - auto tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); + __m256 tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, vn1fp8, sizeof(float)); + __m256 tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + __m256 tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + __m256 tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); - auto bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range), sizeof(float)); - auto bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); - auto bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); - auto bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + __m256 bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range), sizeof(float)); + __m256 bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); + __m256 bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); + __m256 bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); - auto _v = _mm256_mul_ps(tnw_val, tnw); + __m256 _v = _mm256_mul_ps(tnw_val, tnw); _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); @@ -1113,14 
+1113,14 @@ static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgDf = _mm256_set1_ps(src.d); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); - const auto vImgDi = _mm256_set1_epi32(src.d); - - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); + const __m256i vImgDi = _mm256_set1_epi32(src.d); + + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -1131,24 +1131,24 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); // x gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - auto v0p5fp8 = _mm256_set1_ps(0.5f); + __m256 v0p5fp8 = _mm256_set1_ps(0.5f); gx = _mm256_add_ps(gx, v0p5fp8); gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - auto reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); gx = _mm256_sub_ps(vImgWf, reflectx_v); gx = _mm256_sub_ps(gx, v0p5fp8); @@ -1159,13 +1159,13 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, // y gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); gy = _mm256_add_ps(gy, v0p5fp8); gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); gy = _mm256_sub_ps(vImgHf, reflecty_v); gy = _mm256_sub_ps(gy, v0p5fp8); @@ -1176,13 +1176,13 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, // z gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, v1fp8), vImgDf, v1fp8), two); - const auto border_z = _mm256_sub_ps(vImgDf, v1fp8); + const __m256 border_z = _mm256_sub_ps(vImgDf, v1fp8); gz = _mm256_add_ps(gz, v0p5fp8); gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); - auto reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, vImgDf), *(__m256*)_ps256_inv_sign_mask); + __m256 reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, vImgDf), *(__m256*)_ps256_inv_sign_mask); gz = _mm256_sub_ps(vImgDf, 
reflectz_v); gz = _mm256_sub_ps(gz, v0p5fp8); @@ -1192,23 +1192,23 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); } - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - auto z_t = _mm256_floor_ps(gz); + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + __m256 z_t = _mm256_floor_ps(gz); - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - auto t = _mm256_sub_ps(gz, z_t); - auto b = _mm256_sub_ps(v1fp8, t); + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(v1fp8, n); + __m256 t = _mm256_sub_ps(gz, z_t); + __m256 b = _mm256_sub_ps(v1fp8, t); __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); tnw = _mm256_mul_ps(b, nw); tne = _mm256_mul_ps(b, ne); @@ -1221,20 +1221,20 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, bse = _mm256_mul_ps(t, se); } - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - auto z0 = _mm256_cvtps_epi32(z_t); - auto z1 = _mm256_add_epi32(z0, v1ip8); - - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - auto z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i y1 = _mm256_add_epi32(y0, v1ip8); + __m256i z0 = _mm256_cvtps_epi32(z_t); + __m256i z1 = _mm256_add_epi32(z0, v1ip8); + + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); __m256i v110_in_range, v011_in_range, v101_in_range, v111_in_range; { - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); v110_in_range = _mm256_and_si256(x1_in_range, y1_in_range); @@ -1244,29 +1244,29 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, } // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); - auto i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - - auto i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - auto i_bne_offset = 
_mm256_add_epi32(i_bnw_offset, vElempacki); - auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + __m256i i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); + __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); + + __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); + __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, vn1fp8, sizeof(float)); - auto tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - auto tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - auto tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); + __m256 tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, vn1fp8, sizeof(float)); + __m256 tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + __m256 tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + __m256 tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); - auto bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range), sizeof(float)); - auto bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); - auto bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); - auto bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + __m256 bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range), sizeof(float)); + __m256 bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); + __m256 bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); + __m256 bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); - auto _v = _mm256_mul_ps(tnw_val, tnw); + __m256 _v = _mm256_mul_ps(tnw_val, tnw); _v = 
_mm256_comp_fmadd_ps(tne_val, tne, _v); _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); @@ -1285,14 +1285,14 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgDf = _mm256_set1_ps(src.d); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); - const auto vImgDi = _mm256_set1_epi32(src.d); - - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); + const __m256i vImgDi = _mm256_set1_epi32(src.d); + + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -1303,59 +1303,59 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); // x gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - auto reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); gx = _mm256_sub_ps(border_x, reflectx_v); // y gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); gy = _mm256_sub_ps(border_y, reflecty_v); // z gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, v1fp8), two), _mm256_sub_ps(vImgDf, v1fp8)); - const auto border_z = _mm256_sub_ps(vImgDf, v1fp8); + const __m256 border_z = _mm256_sub_ps(vImgDf, v1fp8); gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); - auto reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, border_z), *(__m256*)_ps256_inv_sign_mask); + __m256 reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, border_z), *(__m256*)_ps256_inv_sign_mask); gz = _mm256_sub_ps(border_z, reflectz_v); } - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - auto z_t = _mm256_floor_ps(gz); + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + __m256 z_t = _mm256_floor_ps(gz); - auto w = _mm256_sub_ps(gx, x_w); - auto e = 
_mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - auto t = _mm256_sub_ps(gz, z_t); - auto b = _mm256_sub_ps(v1fp8, t); + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(v1fp8, n); + __m256 t = _mm256_sub_ps(gz, z_t); + __m256 b = _mm256_sub_ps(v1fp8, t); __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); tnw = _mm256_mul_ps(b, nw); tne = _mm256_mul_ps(b, ne); @@ -1368,20 +1368,20 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, bse = _mm256_mul_ps(t, se); } - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - auto z0 = _mm256_cvtps_epi32(z_t); - auto z1 = _mm256_add_epi32(z0, v1ip8); - - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - auto z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i y1 = _mm256_add_epi32(y0, v1ip8); + __m256i z0 = _mm256_cvtps_epi32(z_t); + __m256i z1 = _mm256_add_epi32(z0, v1ip8); + + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); __m256i v110_in_range, v011_in_range, v101_in_range, v111_in_range; { - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); v110_in_range = _mm256_and_si256(x1_in_range, y1_in_range); @@ -1391,29 +1391,29 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, } // (W*H*z + W*y + x) * elempack + vec(8) - auto i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - auto i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); - auto i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - - auto i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - auto i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); - auto i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - auto i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + __m256i i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i i_tne_offset = 
_mm256_add_epi32(i_tnw_offset, vElempacki); + __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); + + __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); + __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); for (int q = 0; q < dst.c; q++) { - auto tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, vn1fp8, sizeof(float)); - auto tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - auto tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - auto tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); + __m256 tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, vn1fp8, sizeof(float)); + __m256 tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + __m256 tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + __m256 tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); - auto bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range), sizeof(float)); - auto bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); - auto bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); - auto bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + __m256 bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range), sizeof(float)); + __m256 bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); + __m256 bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); + __m256 bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); - auto _v = _mm256_mul_ps(tnw_val, tnw); + __m256 _v = _mm256_mul_ps(tnw_val, tnw); _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); diff --git a/src/layer/x86/gridsample_nearest_pack16.h b/src/layer/x86/gridsample_nearest_pack16.h new file mode 100644 index 00000000000..c72c50abbf6 --- /dev/null +++ b/src/layer/x86/gridsample_nearest_pack16.h @@ -0,0 +1,814 @@ +// Tencent is pleased to support the open source community by making ncnn available. 
+// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void gridsample_2d_nearest_align0_zeros_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + // x + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + + // y + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + } + + gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); + gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); + + __m512i ix = _mm512_cvtps_epi32(gx); + __m512i iy = _mm512_cvtps_epi32(gy); + + __mmask16 v_in_range = (_mm512_cmpgt_epi32_mask(ix, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, ix)) & (_mm512_cmpgt_epi32_mask(iy, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, iy)); + + __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v_in_range, i_offset, src.channel(q), sizeof(float)); + + _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_nearest_align1_zeros_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + // x + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + + // y + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + } + + gx = _mm512_floor_ps(_mm512_add_ps(gx, 
_mm512_set1_ps(0.5f))); + gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); + + __m512i ix = _mm512_cvtps_epi32(gx); + __m512i iy = _mm512_cvtps_epi32(gy); + + __mmask16 v_in_range = (_mm512_cmpgt_epi32_mask(ix, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, ix)) & (_mm512_cmpgt_epi32_mask(iy, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, iy)); + + + __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v_in_range, i_offset, src.channel(q), sizeof(float)); + + _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_nearest_align0_border_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + // x + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + + gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); + + // y + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + + gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); + } + + gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); + gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); + + __m512i ix = _mm512_cvtps_epi32(gx); + __m512i iy = _mm512_cvtps_epi32(gy); + + __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, i_offset, src.channel(q), sizeof(float)); + + _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_nearest_align1_border_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + // x + gx = 
_mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + + gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); + + // y + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + + gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); + } + + gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); + gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); + + __m512i ix = _mm512_cvtps_epi32(gx); + __m512i iy = _mm512_cvtps_epi32(gy); + + __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, i_offset, src.channel(q), sizeof(float)); + + _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_nearest_align0_reflection_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + + const __m512 two = _mm512_set1_ps(2.f); + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + + gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); + gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); + + // compute coord + { + // x + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + + __m512 v0p5fp16 = _mm512_set1_ps(0.5f); + gx = _mm512_add_ps(gx, v0p5fp16); + + gx = _mm512_and_ps(gx, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflectx_v = _mm512_and_ps(_mm512_sub_ps(gx, vImgWf), *(__m512*)_ps512_inv_sign_mask); + gx = _mm512_sub_ps(vImgWf, reflectx_v); + + gx = _mm512_sub_ps(gx, v0p5fp16); + + _mm512_sub_ps(gx, v0p5fp16); + + gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); + + // y + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + + gy = _mm512_add_ps(gy, v0p5fp16); + + gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflecty_v = _mm512_and_ps(_mm512_sub_ps(gy, vImgHf), *(__m512*)_ps512_inv_sign_mask); + gy = _mm512_sub_ps(vImgHf, reflecty_v); + + gy = _mm512_sub_ps(gy, v0p5fp16); + + _mm512_sub_ps(gy, v0p5fp16); + + gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); + } + + __m512i ix = _mm512_cvtps_epi32(gx); + __m512i iy = _mm512_cvtps_epi32(gy); + + __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 
65535, i_offset, src.channel(q), sizeof(float)); + + _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_2d_nearest_align1_reflection_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + + const __m512 two = _mm512_set1_ps(2.f); + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + + gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); + gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); + + // compute coord + { + // x + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + + gx = _mm512_and_ps(gx, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflectx_v = _mm512_and_ps(_mm512_sub_ps(gx, border_x), *(__m512*)_ps512_inv_sign_mask); + gx = _mm512_sub_ps(border_x, reflectx_v); + + // y + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + + gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflecty_v = _mm512_and_ps(_mm512_sub_ps(gy, border_y), *(__m512*)_ps512_inv_sign_mask); + gy = _mm512_sub_ps(border_y, reflecty_v); + } + + __m512i ix = _mm512_cvtps_epi32(gx); + __m512i iy = _mm512_cvtps_epi32(gy); + + __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, i_offset, src.channel(q), sizeof(float)); + + _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); + } + } + } +} + +static void gridsample_3d_nearest_align0_zeros_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512 vImgDf = _mm512_set1_ps(src.d); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + const __m512i vImgDi = _mm512_set1_epi32(src.d); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + // x + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + + // y + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + + // z + gz = 
_mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, v1fp16), vImgDf, v1fp16), two); + } + + gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); + gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); + gz = _mm512_floor_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f))); + + __m512i ix = _mm512_cvtps_epi32(gx); + __m512i iy = _mm512_cvtps_epi32(gy); + __m512i iz = _mm512_cvtps_epi32(gz); + + __mmask16 v_in_range = (_mm512_cmpgt_epi32_mask(ix, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, ix)) & (_mm512_cmpgt_epi32_mask(iy, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, iy)); + v_in_range = v_in_range & (_mm512_cmpgt_epi32_mask(iz, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, iz)); + + __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), iz), _mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v_in_range, i_offset, src.channel(q), sizeof(float)); + + _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_nearest_align1_zeros_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512 vImgDf = _mm512_set1_ps(src.d); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + const __m512i vImgDi = _mm512_set1_epi32(src.d); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + // x + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + + // y + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + + // z + gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, v1fp16), two), _mm512_sub_ps(vImgDf, v1fp16)); + } + + gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); + gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); + gz = _mm512_floor_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f))); + + __m512i ix = _mm512_cvtps_epi32(gx); + __m512i iy = _mm512_cvtps_epi32(gy); + __m512i iz = _mm512_cvtps_epi32(gz); + + __mmask16 v_in_range = (_mm512_cmpgt_epi32_mask(ix, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, ix)) & (_mm512_cmpgt_epi32_mask(iy, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, iy)); + v_in_range = v_in_range & (_mm512_cmpgt_epi32_mask(iz, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, iz)); + + __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), iz), _mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v_in_range, 
i_offset, src.channel(q), sizeof(float)); + + _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_nearest_align0_border_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512 vImgDf = _mm512_set1_ps(src.d); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + const __m512i vImgDi = _mm512_set1_epi32(src.d); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + // x + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + + gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); + + // y + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + + gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); + + // z + gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, v1fp16), vImgDf, v1fp16), two); + + const __m512 border_z = _mm512_sub_ps(vImgDf, v1fp16); + + gz = _mm512_min_ps(border_z, _mm512_max_ps(gz, _mm512_setzero_ps())); + } + + gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); + gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); + gz = _mm512_floor_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f))); + + __m512i ix = _mm512_cvtps_epi32(gx); + __m512i iy = _mm512_cvtps_epi32(gy); + __m512i iz = _mm512_cvtps_epi32(gz); + + __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), iz), _mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, i_offset, src.channel(q), sizeof(float)); + + _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_nearest_align1_border_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512 vImgDf = _mm512_set1_ps(src.d); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + const __m512i vImgDi = _mm512_set1_epi32(src.d); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + 
__m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); + + // compute coord + { + const __m512 two = _mm512_set1_ps(2.f); + + // x + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + + gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); + + // y + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + + gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); + + // z + gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, v1fp16), two), _mm512_sub_ps(vImgDf, v1fp16)); + + const __m512 border_z = _mm512_sub_ps(vImgDf, v1fp16); + + gz = _mm512_min_ps(border_z, _mm512_max_ps(gz, _mm512_setzero_ps())); + } + + gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); + gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); + gz = _mm512_floor_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f))); + + __m512i ix = _mm512_cvtps_epi32(gx); + __m512i iy = _mm512_cvtps_epi32(gy); + __m512i iz = _mm512_cvtps_epi32(gz); + + __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), iz), _mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, i_offset, src.channel(q), sizeof(float)); + + _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_nearest_align0_reflection_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512 vImgDf = _mm512_set1_ps(src.d); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + const __m512i vImgDi = _mm512_set1_epi32(src.d); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); + + const __m512 two = _mm512_set1_ps(2.f); + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, v1fp16), vImgDf, v1fp16), two); + + gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); + gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); + gz = _mm512_floor_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f))); + + // compute coord + { + // x + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + + __m512 v0p5fp16 = _mm512_set1_ps(0.5f); + gx = _mm512_add_ps(gx, v0p5fp16); + + gx = _mm512_and_ps(gx, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflectx_v = _mm512_and_ps(_mm512_sub_ps(gx, vImgWf), *(__m512*)_ps512_inv_sign_mask); + gx = _mm512_sub_ps(vImgWf, reflectx_v); + + gx = _mm512_sub_ps(gx, v0p5fp16); + + 
_mm512_sub_ps(gx, v0p5fp16); + + gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); + + // y + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + + gy = _mm512_add_ps(gy, v0p5fp16); + + gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflecty_v = _mm512_and_ps(_mm512_sub_ps(gy, vImgHf), *(__m512*)_ps512_inv_sign_mask); + gy = _mm512_sub_ps(vImgHf, reflecty_v); + + gy = _mm512_sub_ps(gy, v0p5fp16); + + _mm512_sub_ps(gy, v0p5fp16); + + gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); + + // z + const __m512 border_z = _mm512_sub_ps(vImgDf, v1fp16); + + gz = _mm512_add_ps(gz, v0p5fp16); + + gz = _mm512_and_ps(gz, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflectz_v = _mm512_and_ps(_mm512_sub_ps(gz, vImgDf), *(__m512*)_ps512_inv_sign_mask); + gz = _mm512_sub_ps(vImgDf, reflectz_v); + + gz = _mm512_sub_ps(gz, v0p5fp16); + + _mm512_sub_ps(gz, v0p5fp16); + + gz = _mm512_min_ps(border_z, _mm512_max_ps(gz, _mm512_setzero_ps())); + } + + __m512i ix = _mm512_cvtps_epi32(gx); + __m512i iy = _mm512_cvtps_epi32(gy); + __m512i iz = _mm512_cvtps_epi32(gz); + + __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), iz), _mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, i_offset, src.channel(q), sizeof(float)); + + _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} + +static void gridsample_3d_nearest_align1_reflection_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const __m512 vImgWf = _mm512_set1_ps(src.w); + const __m512 vImgHf = _mm512_set1_ps(src.h); + const __m512 vImgDf = _mm512_set1_ps(src.d); + const __m512i vImgWi = _mm512_set1_epi32(src.w); + const __m512i vImgHi = _mm512_set1_epi32(src.h); + const __m512i vImgDi = _mm512_set1_epi32(src.d); + + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int z = 0; z < dst.d; z++) + { + for (int y = 0; y < dst.h; y++) + { + for (int x = 0; x < dst.w; x++) + { + //grid tensor has been packed + const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; + __m512 gx = _mm512_set1_ps(gridptr[0]); + __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); + __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); + + const __m512 two = _mm512_set1_ps(2.f); + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, v1fp16), two), _mm512_sub_ps(vImgDf, v1fp16)); + + gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); + gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); + gz = _mm512_floor_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f))); + + // compute coord + { + // x + const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + + gx = _mm512_and_ps(gx, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflectx_v = _mm512_and_ps(_mm512_sub_ps(gx, border_x), *(__m512*)_ps512_inv_sign_mask); + gx = _mm512_sub_ps(border_x, reflectx_v); + + // y + const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + + gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); + + 
__m512 reflecty_v = _mm512_and_ps(_mm512_sub_ps(gy, border_y), *(__m512*)_ps512_inv_sign_mask); + gy = _mm512_sub_ps(border_y, reflecty_v); + + // z + const __m512 border_z = _mm512_sub_ps(vImgDf, v1fp16); + + gz = _mm512_and_ps(gz, *(__m512*)_ps512_inv_sign_mask); + + __m512 reflectz_v = _mm512_and_ps(_mm512_sub_ps(gz, border_z), *(__m512*)_ps512_inv_sign_mask); + gz = _mm512_sub_ps(border_z, reflectz_v); + } + + __m512i ix = _mm512_cvtps_epi32(gx); + __m512i iy = _mm512_cvtps_epi32(gy); + __m512i iz = _mm512_cvtps_epi32(gz); + + __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), iz), _mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + + for (int q = 0; q < dst.c; q++) + { + __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, i_offset, src.channel(q), sizeof(float)); + + _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); + } + } + } + } +} \ No newline at end of file diff --git a/src/layer/x86/gridsample_nearest_pack4.h b/src/layer/x86/gridsample_nearest_pack4.h index 3e0bc9ab74c..9192ba2361e 100644 --- a/src/layer/x86/gridsample_nearest_pack4.h +++ b/src/layer/x86/gridsample_nearest_pack4.h @@ -14,26 +14,26 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); // x gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); @@ -45,18 +45,18 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - auto ix = _mm_cvtps_epi32(gx); - auto iy = _mm_cvtps_epi32(gy); + __m128i ix = _mm_cvtps_epi32(gx); + __m128i iy = _mm_cvtps_epi32(gy); - auto v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), - _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); + __m128i v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), + _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), + 
_mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = mask_gather_ps(src.channel(q), i_offset, *reinterpret_cast<__m128*>(&v_in_range)); + __m128 _v = mask_gather_ps(src.channel(q), i_offset, *reinterpret_cast<__m128*>(&v_in_range)); _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -66,26 +66,26 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d static void gridsample_2d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); // x gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); @@ -97,18 +97,18 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - auto ix = _mm_cvtps_epi32(gx); - auto iy = _mm_cvtps_epi32(gy); + __m128i ix = _mm_cvtps_epi32(gx); + __m128i iy = _mm_cvtps_epi32(gy); - auto v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), - _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); + __m128i v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), + _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = mask_gather_ps(src.channel(q), i_offset, *reinterpret_cast<__m128*>(&v_in_range)); + __m128 _v = mask_gather_ps(src.channel(q), i_offset, *reinterpret_cast<__m128*>(&v_in_range)); _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -118,38 +118,38 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d static void gridsample_2d_nearest_align0_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); - const auto 
vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); // x gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); // y gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); } @@ -157,15 +157,15 @@ static void gridsample_2d_nearest_align0_border_blob_pack4(const Mat& src, Mat& gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - auto ix = _mm_cvtps_epi32(gx); - auto iy = _mm_cvtps_epi32(gy); + __m128i ix = _mm_cvtps_epi32(gx); + __m128i iy = _mm_cvtps_epi32(gy); - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); + __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -175,38 +175,38 @@ static void gridsample_2d_nearest_align0_border_blob_pack4(const Mat& src, Mat& static void gridsample_2d_nearest_align1_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); // x gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); gx = _mm_min_ps(border_x, _mm_max_ps(gx, 
_mm_setzero_ps())); // y gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); } @@ -214,15 +214,15 @@ static void gridsample_2d_nearest_align1_border_blob_pack4(const Mat& src, Mat& gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - auto ix = _mm_cvtps_epi32(gx); - auto iy = _mm_cvtps_epi32(gy); + __m128i ix = _mm_cvtps_epi32(gx); + __m128i iy = _mm_cvtps_epi32(gy); - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); + __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -232,24 +232,24 @@ static void gridsample_2d_nearest_align1_border_blob_pack4(const Mat& src, Mat& static void gridsample_2d_nearest_align0_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); @@ -259,14 +259,14 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack4(const Mat& src, M // compute coord { // x - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - auto v0p5fp4 = _mm_set1_ps(0.5f); + __m128 v0p5fp4 = _mm_set1_ps(0.5f); gx = _mm_add_ps(gx, v0p5fp4); - gx = _mm_and_ps(gx, *(__m128*)_ps256_inv_sign_mask); + gx = _mm_and_ps(gx, *(__m128*)_ps_inv_sign_mask); - auto reflectx_v = _mm_and_ps(_mm_sub_ps(gx, vImgWf), *(__m128*)_ps256_inv_sign_mask); + __m128 reflectx_v = _mm_and_ps(_mm_sub_ps(gx, vImgWf), *(__m128*)_ps_inv_sign_mask); gx = _mm_sub_ps(vImgWf, reflectx_v); gx = _mm_sub_ps(gx, v0p5fp4); @@ -276,13 +276,13 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack4(const Mat& src, M gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); // y - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); gy = _mm_add_ps(gy, v0p5fp4); - gy = 
_mm_and_ps(gy, *(__m128*)_ps256_inv_sign_mask); + gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); - auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, vImgHf), *(__m128*)_ps256_inv_sign_mask); + __m128 reflecty_v = _mm_and_ps(_mm_sub_ps(gy, vImgHf), *(__m128*)_ps_inv_sign_mask); gy = _mm_sub_ps(vImgHf, reflecty_v); gy = _mm_sub_ps(gy, v0p5fp4); @@ -292,15 +292,15 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack4(const Mat& src, M gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); } - auto ix = _mm_cvtps_epi32(gx); - auto iy = _mm_cvtps_epi32(gy); + __m128i ix = _mm_cvtps_epi32(gx); + __m128i iy = _mm_cvtps_epi32(gy); - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); + __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -310,24 +310,24 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack4(const Mat& src, M static void gridsample_2d_nearest_align1_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); @@ -337,31 +337,31 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack4(const Mat& src, M // compute coord { // x - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - gx = _mm_and_ps(gx, *(__m128*)_ps256_inv_sign_mask); + gx = _mm_and_ps(gx, *(__m128*)_ps_inv_sign_mask); - auto reflectx_v = _mm_and_ps(_mm_sub_ps(gx, border_x), *(__m128*)_ps256_inv_sign_mask); + __m128 reflectx_v = _mm_and_ps(_mm_sub_ps(gx, border_x), *(__m128*)_ps_inv_sign_mask); gx = _mm_sub_ps(border_x, reflectx_v); // y - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - gy = _mm_and_ps(gy, *(__m128*)_ps256_inv_sign_mask); + gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); - auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps256_inv_sign_mask); + __m128 reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps_inv_sign_mask); gy = 
_mm_sub_ps(border_y, reflecty_v); } - auto ix = _mm_cvtps_epi32(gx); - auto iy = _mm_cvtps_epi32(gy); + __m128i ix = _mm_cvtps_epi32(gx); + __m128i iy = _mm_cvtps_epi32(gy); - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), + _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); + __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -371,16 +371,16 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack4(const Mat& src, M static void gridsample_3d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgDf = _mm_set1_ps(src.d); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); - const auto vImgDi = _mm_set1_epi32(src.d); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128 vImgDf = _mm_set1_ps(src.d); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); + const __m128i vImgDi = _mm_set1_epi32(src.d); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -389,13 +389,13 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); // x gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); @@ -411,19 +411,19 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); gz = _mm_floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); - auto ix = _mm_cvtps_epi32(gx); - auto iy = _mm_cvtps_epi32(gy); - auto iz = _mm_cvtps_epi32(gz); + __m128i ix = _mm_cvtps_epi32(gx); + __m128i iy = _mm_cvtps_epi32(gy); + __m128i iz = _mm_cvtps_epi32(gz); - auto v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), - _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); + __m128i v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), + _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); v_in_range = _mm_and_si128(v_in_range, _mm_and_si128(_mm_cmpgt_epi32(iz, vn1ip4), _mm_cmpgt_epi32(vImgDi, iz))); - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + 
__m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = mask_gather_ps(src.channel(q), i_offset, *reinterpret_cast<__m128*>(&v_in_range)); + __m128 _v = mask_gather_ps(src.channel(q), i_offset, *reinterpret_cast<__m128*>(&v_in_range)); _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } @@ -434,16 +434,16 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d static void gridsample_3d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgDf = _mm_set1_ps(src.d); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); - const auto vImgDi = _mm_set1_epi32(src.d); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128 vImgDf = _mm_set1_ps(src.d); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); + const __m128i vImgDi = _mm_set1_epi32(src.d); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -452,13 +452,13 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); // x gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); @@ -474,19 +474,19 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); gz = _mm_floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); - auto ix = _mm_cvtps_epi32(gx); - auto iy = _mm_cvtps_epi32(gy); - auto iz = _mm_cvtps_epi32(gz); + __m128i ix = _mm_cvtps_epi32(gx); + __m128i iy = _mm_cvtps_epi32(gy); + __m128i iz = _mm_cvtps_epi32(gz); - auto v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), - _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); + __m128i v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), + _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); v_in_range = _mm_and_si128(v_in_range, _mm_and_si128(_mm_cmpgt_epi32(iz, vn1ip4), _mm_cmpgt_epi32(vImgDi, iz))); - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), 
_mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = mask_gather_ps(src.channel(q), i_offset, *reinterpret_cast<__m128*>(&v_in_range)); + __m128 _v = mask_gather_ps(src.channel(q), i_offset, *reinterpret_cast<__m128*>(&v_in_range)); _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } @@ -497,16 +497,16 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d static void gridsample_3d_nearest_align0_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgDf = _mm_set1_ps(src.d); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); - const auto vImgDi = _mm_set1_epi32(src.d); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128 vImgDf = _mm_set1_ps(src.d); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); + const __m128i vImgDi = _mm_set1_epi32(src.d); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -515,32 +515,32 @@ static void gridsample_3d_nearest_align0_border_blob_pack4(const Mat& src, Mat& { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); // x gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); // y gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); // z gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); - const auto border_z = _mm_sub_ps(vImgDf, v1fp4); + const __m128 border_z = _mm_sub_ps(vImgDf, v1fp4); gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); } @@ -549,15 +549,15 @@ static void gridsample_3d_nearest_align0_border_blob_pack4(const Mat& src, Mat& gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); gz = _mm_floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); - auto ix = _mm_cvtps_epi32(gx); - auto iy = _mm_cvtps_epi32(gy); - auto iz = _mm_cvtps_epi32(gz); + __m128i ix = _mm_cvtps_epi32(gx); + __m128i iy = _mm_cvtps_epi32(gy); + __m128i iz = _mm_cvtps_epi32(gz); - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), 
_mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); + __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } @@ -568,16 +568,16 @@ static void gridsample_3d_nearest_align0_border_blob_pack4(const Mat& src, Mat& static void gridsample_3d_nearest_align1_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgDf = _mm_set1_ps(src.d); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); - const auto vImgDi = _mm_set1_epi32(src.d); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128 vImgDf = _mm_set1_ps(src.d); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); + const __m128i vImgDi = _mm_set1_epi32(src.d); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -586,32 +586,32 @@ static void gridsample_3d_nearest_align1_border_blob_pack4(const Mat& src, Mat& { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); // compute coord { - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); // x gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); // y gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); // z gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); - const auto border_z = _mm_sub_ps(vImgDf, v1fp4); + const __m128 border_z = _mm_sub_ps(vImgDf, v1fp4); gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); } @@ -620,15 +620,15 @@ static void gridsample_3d_nearest_align1_border_blob_pack4(const Mat& src, Mat& gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); gz = _mm_floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); - auto ix = _mm_cvtps_epi32(gx); - auto iy = _mm_cvtps_epi32(gy); - auto iz = _mm_cvtps_epi32(gz); + __m128i ix = _mm_cvtps_epi32(gx); + __m128i iy = _mm_cvtps_epi32(gy); + __m128i iz = _mm_cvtps_epi32(gz); - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; 
q < dst.c; q++) { - auto _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); + __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } @@ -639,16 +639,16 @@ static void gridsample_3d_nearest_align1_border_blob_pack4(const Mat& src, Mat& static void gridsample_3d_nearest_align0_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgDf = _mm_set1_ps(src.d); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); - const auto vImgDi = _mm_set1_epi32(src.d); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128 vImgDf = _mm_set1_ps(src.d); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); + const __m128i vImgDi = _mm_set1_epi32(src.d); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -657,11 +657,11 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack4(const Mat& src, M { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); @@ -673,14 +673,14 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack4(const Mat& src, M // compute coord { // x - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - auto v0p5fp4 = _mm_set1_ps(0.5f); + __m128 v0p5fp4 = _mm_set1_ps(0.5f); gx = _mm_add_ps(gx, v0p5fp4); - gx = _mm_and_ps(gx, *(__m128*)_ps256_inv_sign_mask); + gx = _mm_and_ps(gx, *(__m128*)_ps_inv_sign_mask); - auto reflectx_v = _mm_and_ps(_mm_sub_ps(gx, vImgWf), *(__m128*)_ps256_inv_sign_mask); + __m128 reflectx_v = _mm_and_ps(_mm_sub_ps(gx, vImgWf), *(__m128*)_ps_inv_sign_mask); gx = _mm_sub_ps(vImgWf, reflectx_v); gx = _mm_sub_ps(gx, v0p5fp4); @@ -690,13 +690,13 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack4(const Mat& src, M gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); // y - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); gy = _mm_add_ps(gy, v0p5fp4); - gy = _mm_and_ps(gy, *(__m128*)_ps256_inv_sign_mask); + gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); - auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, vImgHf), *(__m128*)_ps256_inv_sign_mask); + __m128 reflecty_v = _mm_and_ps(_mm_sub_ps(gy, vImgHf), *(__m128*)_ps_inv_sign_mask); gy = _mm_sub_ps(vImgHf, reflecty_v); gy = _mm_sub_ps(gy, v0p5fp4); @@ -706,13 +706,13 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack4(const Mat& 
src, M gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); // z - const auto border_z = _mm_sub_ps(vImgDf, v1fp4); + const __m128 border_z = _mm_sub_ps(vImgDf, v1fp4); gz = _mm_add_ps(gz, v0p5fp4); - gz = _mm_and_ps(gz, *(__m128*)_ps256_inv_sign_mask); + gz = _mm_and_ps(gz, *(__m128*)_ps_inv_sign_mask); - auto reflectz_v = _mm_and_ps(_mm_sub_ps(gz, vImgDf), *(__m128*)_ps256_inv_sign_mask); + __m128 reflectz_v = _mm_and_ps(_mm_sub_ps(gz, vImgDf), *(__m128*)_ps_inv_sign_mask); gz = _mm_sub_ps(vImgDf, reflectz_v); gz = _mm_sub_ps(gz, v0p5fp4); @@ -722,15 +722,15 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack4(const Mat& src, M gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); } - auto ix = _mm_cvtps_epi32(gx); - auto iy = _mm_cvtps_epi32(gy); - auto iz = _mm_cvtps_epi32(gz); + __m128i ix = _mm_cvtps_epi32(gx); + __m128i iy = _mm_cvtps_epi32(gy); + __m128i iz = _mm_cvtps_epi32(gz); - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); + __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } @@ -741,16 +741,16 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack4(const Mat& src, M static void gridsample_3d_nearest_align1_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm_set1_ps(src.w); - const auto vImgHf = _mm_set1_ps(src.h); - const auto vImgDf = _mm_set1_ps(src.d); - const auto vImgWi = _mm_set1_epi32(src.w); - const auto vImgHi = _mm_set1_epi32(src.h); - const auto vImgDi = _mm_set1_epi32(src.d); + const __m128 vImgWf = _mm_set1_ps(src.w); + const __m128 vImgHf = _mm_set1_ps(src.h); + const __m128 vImgDf = _mm_set1_ps(src.d); + const __m128i vImgWi = _mm_set1_epi32(src.w); + const __m128i vImgHi = _mm_set1_epi32(src.h); + const __m128i vImgDi = _mm_set1_epi32(src.d); - const auto vElempacki = _mm_set1_epi32(src.elempack); + const __m128i vElempacki = _mm_set1_epi32(src.elempack); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -759,11 +759,11 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack4(const Mat& src, M { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm_set1_ps(gridptr[0]); - auto gy = _mm_set1_ps(gridptr[grid.elempack]); - auto gz = _mm_set1_ps(gridptr[grid.elempack * 2]); + __m128 gx = _mm_set1_ps(gridptr[0]); + __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); + __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - const auto two = _mm_set1_ps(2.f); + const __m128 two = _mm_set1_ps(2.f); gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); @@ -775,39 +775,39 @@ static void 
gridsample_3d_nearest_align1_reflection_blob_pack4(const Mat& src, M // compute coord { // x - const auto border_x = _mm_sub_ps(vImgWf, v1fp4); + const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - gx = _mm_and_ps(gx, *(__m128*)_ps256_inv_sign_mask); + gx = _mm_and_ps(gx, *(__m128*)_ps_inv_sign_mask); - auto reflectx_v = _mm_and_ps(_mm_sub_ps(gx, border_x), *(__m128*)_ps256_inv_sign_mask); + __m128 reflectx_v = _mm_and_ps(_mm_sub_ps(gx, border_x), *(__m128*)_ps_inv_sign_mask); gx = _mm_sub_ps(border_x, reflectx_v); // y - const auto border_y = _mm_sub_ps(vImgHf, v1fp4); + const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - gy = _mm_and_ps(gy, *(__m128*)_ps256_inv_sign_mask); + gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); - auto reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps256_inv_sign_mask); + __m128 reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps_inv_sign_mask); gy = _mm_sub_ps(border_y, reflecty_v); // z - const auto border_z = _mm_sub_ps(vImgDf, v1fp4); + const __m128 border_z = _mm_sub_ps(vImgDf, v1fp4); - gz = _mm_and_ps(gz, *(__m128*)_ps256_inv_sign_mask); + gz = _mm_and_ps(gz, *(__m128*)_ps_inv_sign_mask); - auto reflectz_v = _mm_and_ps(_mm_sub_ps(gz, border_z), *(__m128*)_ps256_inv_sign_mask); + __m128 reflectz_v = _mm_and_ps(_mm_sub_ps(gz, border_z), *(__m128*)_ps_inv_sign_mask); gz = _mm_sub_ps(border_z, reflectz_v); } - auto ix = _mm_cvtps_epi32(gx); - auto iy = _mm_cvtps_epi32(gy); - auto iz = _mm_cvtps_epi32(gz); + __m128i ix = _mm_cvtps_epi32(gx); + __m128i iy = _mm_cvtps_epi32(gy); + __m128i iz = _mm_cvtps_epi32(gz); - auto i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); + __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } diff --git a/src/layer/x86/gridsample_nearest_pack8.h b/src/layer/x86/gridsample_nearest_pack8.h index 21be4c1c0a5..a1329a2e228 100644 --- a/src/layer/x86/gridsample_nearest_pack8.h +++ b/src/layer/x86/gridsample_nearest_pack8.h @@ -14,12 +14,12 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); - - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); + + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -28,12 +28,12 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gx 
= _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); // x gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); @@ -45,18 +45,18 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); - auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), + __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -66,12 +66,12 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d static void gridsample_2d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -80,12 +80,12 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); // x gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); @@ -97,18 +97,18 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); - auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), 
_mm256_cmpgt_epi32(vImgWi, ix)), + __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -118,12 +118,12 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d static void gridsample_2d_nearest_align0_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -132,24 +132,24 @@ static void gridsample_2d_nearest_align0_border_blob_pack8(const Mat& src, Mat& { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); // x gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); } @@ -157,15 +157,15 @@ static void gridsample_2d_nearest_align0_border_blob_pack8(const Mat& src, Mat& gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), 
i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -175,12 +175,12 @@ static void gridsample_2d_nearest_align0_border_blob_pack8(const Mat& src, Mat& static void gridsample_2d_nearest_align1_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -189,24 +189,24 @@ static void gridsample_2d_nearest_align1_border_blob_pack8(const Mat& src, Mat& { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); // x gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); } @@ -214,15 +214,15 @@ static void gridsample_2d_nearest_align1_border_blob_pack8(const Mat& src, Mat& gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -232,12 +232,12 @@ static void gridsample_2d_nearest_align1_border_blob_pack8(const Mat& src, Mat& static void gridsample_2d_nearest_align0_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); + 
const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -246,10 +246,10 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack8(const Mat& src, M { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); @@ -259,14 +259,14 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack8(const Mat& src, M // compute coord { // x - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - auto v0p5fp8 = _mm256_set1_ps(0.5f); + __m256 v0p5fp8 = _mm256_set1_ps(0.5f); gx = _mm256_add_ps(gx, v0p5fp8); gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - auto reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); gx = _mm256_sub_ps(vImgWf, reflectx_v); gx = _mm256_sub_ps(gx, v0p5fp8); @@ -276,13 +276,13 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack8(const Mat& src, M gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); gy = _mm256_add_ps(gy, v0p5fp8); gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); gy = _mm256_sub_ps(vImgHf, reflecty_v); gy = _mm256_sub_ps(gy, v0p5fp8); @@ -292,15 +292,15 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack8(const Mat& src, M gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); } - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -310,12 +310,12 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack8(const Mat& src, M static void gridsample_2d_nearest_align1_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const 
Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -324,10 +324,10 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack8(const Mat& src, M { //grid tensor has been packed const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); @@ -337,31 +337,31 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack8(const Mat& src, M // compute coord { // x - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - auto reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); gx = _mm256_sub_ps(border_x, reflectx_v); // y - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); gy = _mm256_sub_ps(border_y, reflecty_v); } - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -371,14 +371,14 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack8(const Mat& src, M static void gridsample_3d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgDf = _mm256_set1_ps(src.d); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); - const auto vImgDi = _mm256_set1_epi32(src.d); + const __m256 vImgWf = 
_mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); + const __m256i vImgDi = _mm256_set1_epi32(src.d); - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -389,13 +389,13 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); // x gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); @@ -411,19 +411,19 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); - auto iz = _mm256_cvtps_epi32(gz); + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); + __m256i iz = _mm256_cvtps_epi32(gz); - auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), + __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, vn1ip8), _mm256_cmpgt_epi32(vImgDi, iz))); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } @@ -434,14 +434,14 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d static void gridsample_3d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgDf = _mm256_set1_ps(src.d); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); - const auto vImgDi = _mm256_set1_epi32(src.d); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 
vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); + const __m256i vImgDi = _mm256_set1_epi32(src.d); - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -452,13 +452,13 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); // x gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); @@ -474,19 +474,19 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); - auto iz = _mm256_cvtps_epi32(gz); + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); + __m256i iz = _mm256_cvtps_epi32(gz); - auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), + __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, vn1ip8), _mm256_cmpgt_epi32(vImgDi, iz))); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } @@ -497,14 +497,14 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d static void gridsample_3d_nearest_align0_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgDf = _mm256_set1_ps(src.d); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); - const auto vImgDi = _mm256_set1_epi32(src.d); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + 
const __m256 vImgDf = _mm256_set1_ps(src.d); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); + const __m256i vImgDi = _mm256_set1_epi32(src.d); - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -515,32 +515,32 @@ static void gridsample_3d_nearest_align0_border_blob_pack8(const Mat& src, Mat& { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); // x gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); // z gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, v1fp8), vImgDf, v1fp8), two); - const auto border_z = _mm256_sub_ps(vImgDf, v1fp8); + const __m256 border_z = _mm256_sub_ps(vImgDf, v1fp8); gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); } @@ -549,15 +549,15 @@ static void gridsample_3d_nearest_align0_border_blob_pack8(const Mat& src, Mat& gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); - auto iz = _mm256_cvtps_epi32(gz); + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); + __m256i iz = _mm256_cvtps_epi32(gz); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } @@ -568,14 +568,14 @@ static void gridsample_3d_nearest_align0_border_blob_pack8(const Mat& src, Mat& static void gridsample_3d_nearest_align1_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgDf = _mm256_set1_ps(src.d); - const auto vImgWi = 
_mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); - const auto vImgDi = _mm256_set1_epi32(src.d); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); + const __m256i vImgDi = _mm256_set1_epi32(src.d); - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -586,32 +586,32 @@ static void gridsample_3d_nearest_align1_border_blob_pack8(const Mat& src, Mat& { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); // compute coord { - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); // x gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); // z gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, v1fp8), two), _mm256_sub_ps(vImgDf, v1fp8)); - const auto border_z = _mm256_sub_ps(vImgDf, v1fp8); + const __m256 border_z = _mm256_sub_ps(vImgDf, v1fp8); gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); } @@ -620,15 +620,15 @@ static void gridsample_3d_nearest_align1_border_blob_pack8(const Mat& src, Mat& gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); - auto iz = _mm256_cvtps_epi32(gz); + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); + __m256i iz = _mm256_cvtps_epi32(gz); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } @@ -639,14 +639,14 @@ static void gridsample_3d_nearest_align1_border_blob_pack8(const Mat& src, Mat& static void 
gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgDf = _mm256_set1_ps(src.d); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); - const auto vImgDi = _mm256_set1_epi32(src.d); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); + const __m256i vImgDi = _mm256_set1_epi32(src.d); - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -657,11 +657,11 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, v1fp8), vImgDf, v1fp8), two); @@ -673,14 +673,14 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M // compute coord { // x - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - auto v0p5fp8 = _mm256_set1_ps(0.5f); + __m256 v0p5fp8 = _mm256_set1_ps(0.5f); gx = _mm256_add_ps(gx, v0p5fp8); gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - auto reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); gx = _mm256_sub_ps(vImgWf, reflectx_v); gx = _mm256_sub_ps(gx, v0p5fp8); @@ -690,13 +690,13 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); gy = _mm256_add_ps(gy, v0p5fp8); gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); gy = _mm256_sub_ps(vImgHf, reflecty_v); gy = _mm256_sub_ps(gy, v0p5fp8); @@ -706,13 +706,13 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); // z - const auto border_z = _mm256_sub_ps(vImgDf, v1fp8); + const __m256 border_z = _mm256_sub_ps(vImgDf, v1fp8); gz = _mm256_add_ps(gz, v0p5fp8); gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); - auto reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, vImgDf), *(__m256*)_ps256_inv_sign_mask); + 
__m256 reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, vImgDf), *(__m256*)_ps256_inv_sign_mask); gz = _mm256_sub_ps(vImgDf, reflectz_v); gz = _mm256_sub_ps(gz, v0p5fp8); @@ -722,15 +722,15 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); } - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); - auto iz = _mm256_cvtps_epi32(gz); + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); + __m256i iz = _mm256_cvtps_epi32(gz); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } @@ -741,14 +741,14 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - const auto vImgWf = _mm256_set1_ps(src.w); - const auto vImgHf = _mm256_set1_ps(src.h); - const auto vImgDf = _mm256_set1_ps(src.d); - const auto vImgWi = _mm256_set1_epi32(src.w); - const auto vImgHi = _mm256_set1_epi32(src.h); - const auto vImgDi = _mm256_set1_epi32(src.d); + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); + const __m256i vImgDi = _mm256_set1_epi32(src.d); - const auto vElempacki = _mm256_set1_epi32(src.elempack); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -759,11 +759,11 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, M { //grid tensor has been packed const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - auto gx = _mm256_set1_ps(gridptr[0]); - auto gy = _mm256_set1_ps(gridptr[grid.elempack]); - auto gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); + __m256 gx = _mm256_set1_ps(gridptr[0]); + __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); + __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); - const auto two = _mm256_set1_ps(2.f); + const __m256 two = _mm256_set1_ps(2.f); gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, v1fp8), two), _mm256_sub_ps(vImgDf, v1fp8)); @@ -775,39 +775,39 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, M // compute coord { // x - const auto border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, 
v1fp8); gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - auto reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); gx = _mm256_sub_ps(border_x, reflectx_v); // y - const auto border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - auto reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); gy = _mm256_sub_ps(border_y, reflecty_v); // z - const auto border_z = _mm256_sub_ps(vImgDf, v1fp8); + const __m256 border_z = _mm256_sub_ps(vImgDf, v1fp8); gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); - auto reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, border_z), *(__m256*)_ps256_inv_sign_mask); + __m256 reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, border_z), *(__m256*)_ps256_inv_sign_mask); gz = _mm256_sub_ps(border_z, reflectz_v); } - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); - auto iz = _mm256_cvtps_epi32(gz); - - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); + __m256i iz = _mm256_cvtps_epi32(gz); + + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index d03d8cdfe4b..4d600c75ac5 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -20,6 +20,9 @@ #if __AVX__ #include #include "avx_mathfun.h" +#if __AVX512F__ +#include "avx512_mathfun.h" +#endif // __AVX512F__ #endif // __AVX__ #endif // __SSE2__ #include "x86_usability.h" @@ -35,75 +38,26 @@ GridSample_x86::GridSample_x86() #if __SSE2__ #if __AVX__ +#if __AVX512F__ +const __m512 v1fp16 = _mm512_set1_ps(1.0f); +const __m512 vn1fp16 = _mm512_set1_ps(-1.0f); +const __m512i v1ip16 = _mm512_set1_epi32(1); +const __m512i vn1ip16 = _mm512_set1_epi32(-1); + +#include "gridsample_bilinear_pack16.h" +#include "gridsample_nearest_pack16.h" +#include "gridsample_bicubic_pack16.h" + +#endif // __AVX512F__ const __m256 v1fp8 = *(__m256*)_ps256_1; -const auto vn1fp8 = _mm256_set1_ps(-1.0f); -const auto v1ip8 = _mm256_set1_epi32(1); -const auto vn1ip8 = _mm256_set1_epi32(-1); +const __m256 vn1fp8 = _mm256_set1_ps(-1.0f); +const __m256i v1ip8 = _mm256_set1_epi32(1); +const __m256i vn1ip8 = _mm256_set1_epi32(-1); #include "gridsample_bilinear_pack8.h" #include "gridsample_nearest_pack8.h" #include "gridsample_bicubic_pack8.h" -static __m256 NCNN_FORCEINLINE -grid_sample_unormalize_p8(const __m256& w, const __m256& coordx, int align_corner) -{ 
- __m256 two = _mm256_set1_ps(2.f); - - if (align_corner) - return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coordx, v1fp8), two), _mm256_sub_ps(w, v1fp8)); - else - return _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(coordx, v1fp8), w, v1fp8), two); -} - -static NCNN_FORCEINLINE __m256 border_coord_p8(const __m256& coord, const __m256& border) -{ - return _mm256_min_ps(border, _mm256_max_ps(coord, _mm256_setzero_ps())); -} - -static NCNN_FORCEINLINE __m256 reflect_coord_p8(__m256 x, const __m256& high) -{ - /* take the absolute value */ - x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflect_v = _mm256_and_ps(_mm256_sub_ps(x, high), *(__m256*)_ps256_inv_sign_mask); - x = _mm256_sub_ps(high, reflect_v); - return x; -} - -static NCNN_FORCEINLINE __m256 compute_coord_p8(__m256 sx, const __m256& w, int padding_mode, int align_corner) -{ - if (padding_mode == 2) // border - { - sx = border_coord_p8(sx, _mm256_sub_ps(w, v1fp8)); - } - else if (padding_mode == 3) // reflection - { - if (align_corner) - { - sx = reflect_coord_p8(sx, _mm256_sub_ps(w, v1fp8)); - } - else - { - __m256 v0p5f = _mm256_set1_ps(0.5f); - sx = _mm256_sub_ps(reflect_coord_p8(_mm256_add_ps(sx, v0p5f), w), v0p5f); - sx = border_coord_p8(sx, _mm256_sub_ps(w, v1fp8)); - } - } - - return sx; -} - -static NCNN_FORCEINLINE __m256 get_coord_p8(const __m256& x, const __m256& w, int padding_mode, int align_corner) -{ - // compute the origin coordinates - __m256 sx = grid_sample_unormalize_p8(w, x, align_corner); - - // correct the coordinates according to the padding_mode - __m256 coord = compute_coord_p8(sx, w, padding_mode, align_corner); - - return coord; -} - #endif // __AVX__ const __m128 v1fp4 = _mm_set1_ps(1.0f); @@ -139,66 +93,6 @@ static NCNN_FORCEINLINE __m128 mask_gather_ps(const float* ptr, __m128i offset, #include "gridsample_bilinear_pack4.h" #include "gridsample_nearest_pack4.h" -static __m128 NCNN_FORCEINLINE -grid_sample_unormalize_p4(const __m128& w, const __m128& coordx, int align_corner) -{ - __m128 two = _mm_set1_ps(2.f); - - if (align_corner) - return _mm_mul_ps(_mm_div_ps(_mm_add_ps(coordx, v1fp4), two), _mm_sub_ps(w, v1fp4)); - else - return _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(coordx, v1fp4), w, v1fp4), two); -} - -static NCNN_FORCEINLINE __m128 border_coord_p4(const __m128& coord, const __m128& border) -{ - return _mm_min_ps(border, _mm_max_ps(coord, _mm_setzero_ps())); -} - -static NCNN_FORCEINLINE __m128 reflect_coord_p4(__m128 x, const __m128& high) -{ - /* take the absolute value */ - x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask); - - __m128 reflect_v = _mm_and_ps(_mm_sub_ps(x, high), *(__m128*)_ps_inv_sign_mask); - x = _mm_sub_ps(high, reflect_v); - return x; -} - -static NCNN_FORCEINLINE __m128 compute_coord_p4(__m128 sx, const __m128& w, int padding_mode, int align_corner) -{ - if (padding_mode == 2) // border - { - sx = border_coord_p4(sx, _mm_sub_ps(w, v1fp4)); - } - else if (padding_mode == 3) // reflection - { - if (align_corner) - { - sx = reflect_coord_p4(sx, _mm_sub_ps(w, v1fp4)); - } - else - { - __m128 v0p5f = *(__m128*)_ps_0p5; - sx = _mm_sub_ps(reflect_coord_p4(_mm_add_ps(sx, v0p5f), w), v0p5f); - sx = border_coord_p4(sx, _mm_sub_ps(w, v1fp4)); - } - } - - return sx; -} - -static NCNN_FORCEINLINE __m128 get_coord_p4(const __m128& x, const __m128& w, int padding_mode, int align_corner) -{ - // compute the origin coordinates - __m128 sx = grid_sample_unormalize_p4(w, x, align_corner); - - // correct the coordinates according to the padding_mode - __m128 coord = 
compute_coord_p4(sx, w, padding_mode, align_corner); - - return coord; -} - #endif // __SSE2__ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const @@ -216,6 +110,245 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Wed, 23 Nov 2022 18:44:59 +0000 Subject: [PATCH 029/127] apply code-format changes --- src/layer/x86/gridsample_bicubic_pack16.h | 50 ++++++------- src/layer/x86/gridsample_bicubic_pack4.h | 64 ++++++++--------- src/layer/x86/gridsample_bicubic_pack8.h | 50 ++++++------- src/layer/x86/gridsample_bilinear_pack16.h | 84 +++++++++++----------- src/layer/x86/gridsample_bilinear_pack4.h | 36 +++++----- src/layer/x86/gridsample_bilinear_pack8.h | 66 ++++++++--------- src/layer/x86/gridsample_nearest_pack16.h | 17 +++-- src/layer/x86/gridsample_nearest_pack4.h | 44 ++++++------ src/layer/x86/gridsample_nearest_pack8.h | 24 +++---- src/layer/x86/gridsample_x86.cpp | 4 +- 10 files changed, 219 insertions(+), 220 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_pack16.h b/src/layer/x86/gridsample_bicubic_pack16.h index c2038edf877..2cc9ddd2a0c 100644 --- a/src/layer/x86/gridsample_bicubic_pack16.h +++ b/src/layer/x86/gridsample_bicubic_pack16.h @@ -81,7 +81,7 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack16(const Mat& src, Mat& __m512i x1 = _mm512_cvtps_epi32(gx1); __m512i x2 = _mm512_cvtps_epi32(gx2); __m512i x3 = _mm512_cvtps_epi32(gx3); - + __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x0); __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); __mmask16 x2_in_range = _mm512_cmpgt_epi32_mask(x2, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x2); @@ -103,13 +103,13 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack16(const Mat& src, Mat& v3_in_range[i] = x3_in_range & y_in_range; __m512 v0_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v2_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v3_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm512_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm512_cvtps_epi32(v1_offset_f); @@ -203,13 +203,13 @@ static void 
gridsample_2d_bicubic_align1_zeros_blob_pack16(const Mat& src, Mat& v3_in_range[i] = x3_in_range & y_in_range; __m512 v0_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v2_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v3_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm512_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm512_cvtps_epi32(v1_offset_f); @@ -294,13 +294,13 @@ static void gridsample_2d_bicubic_align0_border_blob_pack16(const Mat& src, Mat& __m512i y = _mm512_cvtps_epi32(gy); __m512 v0_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v2_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v3_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm512_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm512_cvtps_epi32(v1_offset_f); @@ -386,13 +386,13 @@ static void gridsample_2d_bicubic_align1_border_blob_pack16(const Mat& src, Mat& __m512i y = _mm512_cvtps_epi32(gy); __m512 v0_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 
7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v2_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v3_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm512_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm512_cvtps_epi32(v1_offset_f); @@ -549,13 +549,13 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack16(const Mat& src, __m512i y = _mm512_cvtps_epi32(gy); __m512 v0_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v2_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v3_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm512_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm512_cvtps_epi32(v1_offset_f); @@ -674,13 +674,13 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack16(const Mat& src, __m512i y = _mm512_cvtps_epi32(gy); __m512 v0_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx1), vElempackf), - 
_mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v2_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v3_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm512_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm512_cvtps_epi32(v1_offset_f); diff --git a/src/layer/x86/gridsample_bicubic_pack4.h b/src/layer/x86/gridsample_bicubic_pack4.h index 94b66b30053..6e1efbe6b54 100644 --- a/src/layer/x86/gridsample_bicubic_pack4.h +++ b/src/layer/x86/gridsample_bicubic_pack4.h @@ -43,7 +43,7 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack4(const Mat& src, Mat& d const __m128 vElempackf = _mm_set1_ps(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -88,7 +88,7 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack4(const Mat& src, Mat& d __m128i x3_in_range = _mm_and_si128(_mm_cmpgt_epi32(x3, vn1ip4), _mm_cmpgt_epi32(vImgWi, x3)); __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], - v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; for (int i = 0; i < 4; i++) { gy = _mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)); @@ -103,13 +103,13 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack4(const Mat& src, Mat& d v3_in_range[i] = _mm_and_si128(x3_in_range, y_in_range); __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); @@ -146,7 +146,7 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack4(const Mat& src, Mat& d const __m128 vElempackf = _mm_set1_ps(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -188,7 +188,7 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack4(const Mat& src, Mat& d __m128i x3_in_range = _mm_and_si128(_mm_cmpgt_epi32(x3, vn1ip4), _mm_cmpgt_epi32(vImgWi, x3)); __m128i 
v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], - v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; for (int i = 0; i < 4; i++) { gy = _mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)); @@ -203,13 +203,13 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack4(const Mat& src, Mat& d v3_in_range[i] = _mm_and_si128(x3_in_range, y_in_range); __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); @@ -246,7 +246,7 @@ static void gridsample_2d_bicubic_align0_border_blob_pack4(const Mat& src, Mat& const __m128 vElempackf = _mm_set1_ps(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -294,13 +294,13 @@ static void gridsample_2d_bicubic_align0_border_blob_pack4(const Mat& src, Mat& __m128i y = _mm_cvtps_epi32(gy); __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); @@ -337,7 +337,7 @@ static void gridsample_2d_bicubic_align1_border_blob_pack4(const Mat& src, Mat& const __m128 vElempackf = _mm_set1_ps(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -386,13 +386,13 @@ static void gridsample_2d_bicubic_align1_border_blob_pack4(const Mat& src, Mat& __m128i y = _mm_cvtps_epi32(gy); __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), 
vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); @@ -429,7 +429,7 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, M const __m128 vElempackf = _mm_set1_ps(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -549,13 +549,13 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, M __m128i y = _mm_cvtps_epi32(gy); __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); @@ -594,7 +594,7 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack4(const Mat& src, M const __m128 vElempackf = _mm_set1_ps(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -674,13 +674,13 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack4(const Mat& src, M __m128i y = _mm_cvtps_epi32(gy); __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); + _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); diff --git a/src/layer/x86/gridsample_bicubic_pack8.h b/src/layer/x86/gridsample_bicubic_pack8.h index d39fbe591b1..b2c25041896 100644 --- a/src/layer/x86/gridsample_bicubic_pack8.h +++ b/src/layer/x86/gridsample_bicubic_pack8.h @@ -81,7 +81,7 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& d __m256i x1 = _mm256_cvtps_epi32(gx1); __m256i x2 = _mm256_cvtps_epi32(gx2); __m256i x3 = _mm256_cvtps_epi32(gx3); - + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); __m256i x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x2)); @@ -103,13 +103,13 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& 
d v3_in_range[i] = _mm256_and_si256(x3_in_range, y_in_range); __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); @@ -203,13 +203,13 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& d v3_in_range[i] = _mm256_and_si256(x3_in_range, y_in_range); __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); @@ -294,13 +294,13 @@ static void gridsample_2d_bicubic_align0_border_blob_pack8(const Mat& src, Mat& __m256i y = _mm256_cvtps_epi32(gy); __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); @@ -386,13 
+386,13 @@ static void gridsample_2d_bicubic_align1_border_blob_pack8(const Mat& src, Mat& __m256i y = _mm256_cvtps_epi32(gy); __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); @@ -549,13 +549,13 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M __m256i y = _mm256_cvtps_epi32(gy); __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); @@ -674,13 +674,13 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack8(const Mat& src, M __m256i y = _mm256_cvtps_epi32(gy); __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); v1_offset[i] = 
_mm256_cvtps_epi32(v1_offset_f); diff --git a/src/layer/x86/gridsample_bilinear_pack16.h b/src/layer/x86/gridsample_bilinear_pack16.h index d15ef5df3b5..005590a3581 100644 --- a/src/layer/x86/gridsample_bilinear_pack16.h +++ b/src/layer/x86/gridsample_bilinear_pack16.h @@ -64,7 +64,7 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack16(const Mat& src, Mat& __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); __mmask16 y0_in_range = _mm512_cmpgt_epi32_mask(y0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y0); __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - + __mmask16 v00_in_range = x0_in_range & y0_in_range; __mmask16 v01_in_range = x0_in_range & y1_in_range; __mmask16 v10_in_range = x1_in_range & y0_in_range; @@ -72,7 +72,7 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack16(const Mat& src, Mat& // (W*y + x) * elempack + vec(8) __m512i i_nw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); __m512i i_ne_offset = _mm512_add_epi32(i_nw_offset, vElempacki); __m512i i_sw_offset = _mm512_add_epi32(i_nw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); __m512i i_se_offset = _mm512_add_epi32(i_sw_offset, vElempacki); @@ -132,7 +132,7 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack16(const Mat& src, Mat& __m512 e = _mm512_sub_ps(v1fp16, w); __m512 n = _mm512_sub_ps(gy, y_n); __m512 s = _mm512_sub_ps(v1fp16, n); - + __m512 nw = _mm512_mul_ps(s, e); __m512 ne = _mm512_mul_ps(s, w); __m512 sw = _mm512_mul_ps(n, e); @@ -142,12 +142,12 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack16(const Mat& src, Mat& __m512i x1 = _mm512_add_epi32(x0, v1ip16); __m512i y0 = _mm512_cvtps_epi32(y_n); __m512i y1 = _mm512_add_epi32(y0, v1ip16); - + __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x0); __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); __mmask16 y0_in_range = _mm512_cmpgt_epi32_mask(y0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y0); __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - + __mmask16 v00_in_range = x0_in_range & y0_in_range; __mmask16 v01_in_range = x0_in_range & y1_in_range; __mmask16 v10_in_range = x1_in_range & y0_in_range; @@ -155,7 +155,7 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack16(const Mat& src, Mat& // (W*y + x) * elempack + vec(8) __m512i i_nw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); __m512i i_ne_offset = _mm512_add_epi32(i_nw_offset, vElempacki); __m512i i_sw_offset = _mm512_add_epi32(i_nw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); __m512i i_se_offset = _mm512_add_epi32(i_sw_offset, vElempacki); @@ -218,7 +218,7 @@ static void gridsample_2d_bilinear_align0_border_blob_pack16(const Mat& src, Mat __m512 x_w = _mm512_floor_ps(gx); __m512 y_n = _mm512_floor_ps(gy); - + __m512 w = _mm512_sub_ps(gx, x_w); __m512 e = _mm512_sub_ps(v1fp16, w); __m512 n = _mm512_sub_ps(gy, y_n); @@ -233,7 +233,7 @@ static void 
gridsample_2d_bilinear_align0_border_blob_pack16(const Mat& src, Mat __m512i x1 = _mm512_add_epi32(x0, v1ip16); __m512i y0 = _mm512_cvtps_epi32(y_n); __m512i y1 = _mm512_add_epi32(y0, v1ip16); - + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); @@ -241,7 +241,7 @@ static void gridsample_2d_bilinear_align0_border_blob_pack16(const Mat& src, Mat // (W*y + x) * elempack + vec(8) __m512i i_nw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); __m512i i_ne_offset = _mm512_add_epi32(i_nw_offset, vElempacki); __m512i i_sw_offset = _mm512_add_epi32(i_nw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); __m512i i_se_offset = _mm512_add_epi32(i_sw_offset, vElempacki); @@ -309,25 +309,25 @@ static void gridsample_2d_bilinear_align1_border_blob_pack16(const Mat& src, Mat __m512 e = _mm512_sub_ps(v1fp16, w); __m512 n = _mm512_sub_ps(gy, y_n); __m512 s = _mm512_sub_ps(v1fp16, n); - + __m512 nw = _mm512_mul_ps(s, e); __m512 ne = _mm512_mul_ps(s, w); __m512 sw = _mm512_mul_ps(n, e); __m512 se = _mm512_mul_ps(n, w); - + __m512i x0 = _mm512_cvtps_epi32(x_w); __m512i x1 = _mm512_add_epi32(x0, v1ip16); __m512i y0 = _mm512_cvtps_epi32(y_n); __m512i y1 = _mm512_add_epi32(y0, v1ip16); - + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - + __mmask16 v11_in_range = x1_in_range & y1_in_range; // (W*y + x) * elempack + vec(8) __m512i i_nw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); __m512i i_ne_offset = _mm512_add_epi32(i_nw_offset, vElempacki); __m512i i_sw_offset = _mm512_add_epi32(i_nw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); __m512i i_se_offset = _mm512_add_epi32(i_sw_offset, vElempacki); @@ -428,15 +428,15 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack16(const Mat& src, __m512i x1 = _mm512_add_epi32(x0, v1ip16); __m512i y0 = _mm512_cvtps_epi32(y_n); __m512i y1 = _mm512_add_epi32(y0, v1ip16); - + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - + __mmask16 v11_in_range = x1_in_range & y1_in_range; // (W*y + x) * elempack + vec(8) __m512i i_nw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); __m512i i_ne_offset = _mm512_add_epi32(i_nw_offset, vElempacki); __m512i i_sw_offset = _mm512_add_epi32(i_nw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); __m512i i_se_offset = _mm512_add_epi32(i_sw_offset, vElempacki); @@ -528,7 +528,7 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack16(const Mat& src, // (W*y + x) * elempack + vec(8) __m512i i_nw_offset = 
_mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); __m512i i_ne_offset = _mm512_add_epi32(i_nw_offset, vElempacki); __m512i i_sw_offset = _mm512_add_epi32(i_nw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); __m512i i_se_offset = _mm512_add_epi32(i_sw_offset, vElempacki); @@ -624,7 +624,7 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack16(const Mat& src, Mat& __m512i y1 = _mm512_add_epi32(y0, v1ip16); __m512i z0 = _mm512_cvtps_epi32(z_t); __m512i z1 = _mm512_add_epi32(z0, v1ip16); - + __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x0); __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); __mmask16 y0_in_range = _mm512_cmpgt_epi32_mask(y0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y0); @@ -643,7 +643,7 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack16(const Mat& src, Mat& v010_in_range = v01_in_range & z0_in_range; v100_in_range = v10_in_range & z0_in_range; v110_in_range = v11_in_range & z0_in_range; - + v001_in_range = v00_in_range & z1_in_range; v011_in_range = v01_in_range & z1_in_range; v101_in_range = v10_in_range & z1_in_range; @@ -655,7 +655,7 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack16(const Mat& src, Mat& __m512i i_tne_offset = _mm512_add_epi32(i_tnw_offset, vElempacki); __m512i i_tsw_offset = _mm512_add_epi32(i_tnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); __m512i i_tse_offset = _mm512_add_epi32(i_tsw_offset, vElempacki); - + __m512i i_bnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); __m512i i_bne_offset = _mm512_add_epi32(i_bnw_offset, vElempacki); __m512i i_bsw_offset = _mm512_add_epi32(i_bnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); @@ -698,7 +698,7 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack16(const Mat& src, Mat& const __m512i vImgWi = _mm512_set1_epi32(src.w); const __m512i vImgHi = _mm512_set1_epi32(src.h); const __m512i vImgDi = _mm512_set1_epi32(src.d); - + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) @@ -763,7 +763,7 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack16(const Mat& src, Mat& __m512i y1 = _mm512_add_epi32(y0, v1ip16); __m512i z0 = _mm512_cvtps_epi32(z_t); __m512i z1 = _mm512_add_epi32(z0, v1ip16); - + __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x0); __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); __mmask16 y0_in_range = _mm512_cmpgt_epi32_mask(y0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y0); @@ -782,7 +782,7 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack16(const Mat& src, Mat& v010_in_range = v01_in_range & z0_in_range; v100_in_range = v10_in_range & z0_in_range; v110_in_range = v11_in_range & z0_in_range; - + v001_in_range = v00_in_range & z1_in_range; v011_in_range = v01_in_range & z1_in_range; v101_in_range = v10_in_range & z1_in_range; @@ -794,7 +794,7 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack16(const Mat& src, Mat& __m512i i_tne_offset = _mm512_add_epi32(i_tnw_offset, vElempacki); __m512i i_tsw_offset = _mm512_add_epi32(i_tnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); __m512i i_tse_offset = 
_mm512_add_epi32(i_tsw_offset, vElempacki); - + __m512i i_bnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); __m512i i_bne_offset = _mm512_add_epi32(i_bnw_offset, vElempacki); __m512i i_bsw_offset = _mm512_add_epi32(i_bnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); @@ -837,7 +837,7 @@ static void gridsample_3d_bilinear_align0_border_blob_pack16(const Mat& src, Mat const __m512i vImgWi = _mm512_set1_epi32(src.w); const __m512i vImgHi = _mm512_set1_epi32(src.h); const __m512i vImgDi = _mm512_set1_epi32(src.d); - + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) @@ -935,7 +935,7 @@ static void gridsample_3d_bilinear_align0_border_blob_pack16(const Mat& src, Mat __m512i i_tne_offset = _mm512_add_epi32(i_tnw_offset, vElempacki); __m512i i_tsw_offset = _mm512_add_epi32(i_tnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); __m512i i_tse_offset = _mm512_add_epi32(i_tsw_offset, vElempacki); - + __m512i i_bnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); __m512i i_bne_offset = _mm512_add_epi32(i_bnw_offset, vElempacki); __m512i i_bsw_offset = _mm512_add_epi32(i_bnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); @@ -978,7 +978,7 @@ static void gridsample_3d_bilinear_align1_border_blob_pack16(const Mat& src, Mat const __m512i vImgWi = _mm512_set1_epi32(src.w); const __m512i vImgHi = _mm512_set1_epi32(src.h); const __m512i vImgDi = _mm512_set1_epi32(src.d); - + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) @@ -1055,7 +1055,7 @@ static void gridsample_3d_bilinear_align1_border_blob_pack16(const Mat& src, Mat __m512i y1 = _mm512_add_epi32(y0, v1ip16); __m512i z0 = _mm512_cvtps_epi32(z_t); __m512i z1 = _mm512_add_epi32(z0, v1ip16); - + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, z1); @@ -1064,10 +1064,10 @@ static void gridsample_3d_bilinear_align1_border_blob_pack16(const Mat& src, Mat { __mmask16 v11_in_range = x1_in_range & y1_in_range; - v110_in_range = x1_in_range& y1_in_range; - - v011_in_range = y1_in_range& z1_in_range; - v101_in_range = x1_in_range& z1_in_range; + v110_in_range = x1_in_range & y1_in_range; + + v011_in_range = y1_in_range & z1_in_range; + v101_in_range = x1_in_range & z1_in_range; v111_in_range = v11_in_range & z1_in_range; } @@ -1076,7 +1076,7 @@ static void gridsample_3d_bilinear_align1_border_blob_pack16(const Mat& src, Mat __m512i i_tne_offset = _mm512_add_epi32(i_tnw_offset, vElempacki); __m512i i_tsw_offset = _mm512_add_epi32(i_tnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); __m512i i_tse_offset = _mm512_add_epi32(i_tsw_offset, vElempacki); - + __m512i i_bnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); __m512i i_bne_offset = _mm512_add_epi32(i_bnw_offset, vElempacki); __m512i i_bsw_offset = _mm512_add_epi32(i_bnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); @@ -1119,7 +1119,7 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack16(const Mat& src, const __m512i vImgWi = _mm512_set1_epi32(src.w); const __m512i vImgHi = _mm512_set1_epi32(src.h); const __m512i vImgDi = 
_mm512_set1_epi32(src.d); - + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) @@ -1227,7 +1227,7 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack16(const Mat& src, __m512i y1 = _mm512_add_epi32(y0, v1ip16); __m512i z0 = _mm512_cvtps_epi32(z_t); __m512i z1 = _mm512_add_epi32(z0, v1ip16); - + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, z1); @@ -1237,7 +1237,7 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack16(const Mat& src, __mmask16 v11_in_range = x1_in_range & y1_in_range; v110_in_range = x1_in_range & y1_in_range; - + v011_in_range = y1_in_range & z1_in_range; v101_in_range = x1_in_range & z1_in_range; v111_in_range = v11_in_range & z1_in_range; @@ -1248,7 +1248,7 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack16(const Mat& src, __m512i i_tne_offset = _mm512_add_epi32(i_tnw_offset, vElempacki); __m512i i_tsw_offset = _mm512_add_epi32(i_tnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); __m512i i_tse_offset = _mm512_add_epi32(i_tsw_offset, vElempacki); - + __m512i i_bnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); __m512i i_bne_offset = _mm512_add_epi32(i_bnw_offset, vElempacki); __m512i i_bsw_offset = _mm512_add_epi32(i_bnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); @@ -1291,7 +1291,7 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack16(const Mat& src, const __m512i vImgWi = _mm512_set1_epi32(src.w); const __m512i vImgHi = _mm512_set1_epi32(src.h); const __m512i vImgDi = _mm512_set1_epi32(src.d); - + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) @@ -1374,7 +1374,7 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack16(const Mat& src, __m512i y1 = _mm512_add_epi32(y0, v1ip16); __m512i z0 = _mm512_cvtps_epi32(z_t); __m512i z1 = _mm512_add_epi32(z0, v1ip16); - + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, z1); @@ -1384,7 +1384,7 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack16(const Mat& src, __mmask16 v11_in_range = x1_in_range & y1_in_range; v110_in_range = x1_in_range & y1_in_range; - + v011_in_range = y1_in_range & z1_in_range; v101_in_range = x1_in_range & z1_in_range; v111_in_range = v11_in_range & z1_in_range; @@ -1395,7 +1395,7 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack16(const Mat& src, __m512i i_tne_offset = _mm512_add_epi32(i_tnw_offset, vElempacki); __m512i i_tsw_offset = _mm512_add_epi32(i_tnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); __m512i i_tse_offset = _mm512_add_epi32(i_tsw_offset, vElempacki); - + __m512i i_bnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); __m512i i_bne_offset = _mm512_add_epi32(i_bnw_offset, vElempacki); __m512i i_bsw_offset = _mm512_add_epi32(i_bnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); diff --git a/src/layer/x86/gridsample_bilinear_pack4.h 
b/src/layer/x86/gridsample_bilinear_pack4.h index b9e8067a750..482a2091678 100644 --- a/src/layer/x86/gridsample_bilinear_pack4.h +++ b/src/layer/x86/gridsample_bilinear_pack4.h @@ -21,7 +21,7 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -72,7 +72,7 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& // (W*y + x) * elempack + vec(8) __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); @@ -104,7 +104,7 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -155,7 +155,7 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& // (W*y + x) * elempack + vec(8) __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); @@ -187,7 +187,7 @@ static void gridsample_2d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -241,7 +241,7 @@ static void gridsample_2d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& // (W*y + x) * elempack + vec(8) __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); @@ -273,7 +273,7 @@ static void gridsample_2d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -327,7 +327,7 @@ static void gridsample_2d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& // (W*y + x) * elempack + vec(8) __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); __m128i i_se_offset = 
_mm_add_epi32(i_sw_offset, vElempacki); @@ -359,7 +359,7 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack4(const Mat& src, const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -436,7 +436,7 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack4(const Mat& src, // (W*y + x) * elempack + vec(8) __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); @@ -468,7 +468,7 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack4(const Mat& src, const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -528,7 +528,7 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack4(const Mat& src, // (W*y + x) * elempack + vec(8) __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); @@ -562,7 +562,7 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -701,7 +701,7 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -840,7 +840,7 @@ static void gridsample_3d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -981,7 +981,7 @@ static void gridsample_3d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -1122,7 +1122,7 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -1294,7 +1294,7 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack4(const Mat& src, const __m128i vElempacki = 
_mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) diff --git a/src/layer/x86/gridsample_bilinear_pack8.h b/src/layer/x86/gridsample_bilinear_pack8.h index 593b3343d0b..e173dfcbe12 100644 --- a/src/layer/x86/gridsample_bilinear_pack8.h +++ b/src/layer/x86/gridsample_bilinear_pack8.h @@ -72,7 +72,7 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& // (W*y + x) * elempack + vec(8) __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); @@ -132,7 +132,7 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& __m256 e = _mm256_sub_ps(v1fp8, w); __m256 n = _mm256_sub_ps(gy, y_n); __m256 s = _mm256_sub_ps(v1fp8, n); - + __m256 nw = _mm256_mul_ps(s, e); __m256 ne = _mm256_mul_ps(s, w); __m256 sw = _mm256_mul_ps(n, e); @@ -142,12 +142,12 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& __m256i x1 = _mm256_add_epi32(x0, v1ip8); __m256i y0 = _mm256_cvtps_epi32(y_n); __m256i y1 = _mm256_add_epi32(y0, v1ip8); - + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - + __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); @@ -155,7 +155,7 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& // (W*y + x) * elempack + vec(8) __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); @@ -218,7 +218,7 @@ static void gridsample_2d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& __m256 x_w = _mm256_floor_ps(gx); __m256 y_n = _mm256_floor_ps(gy); - + __m256 w = _mm256_sub_ps(gx, x_w); __m256 e = _mm256_sub_ps(v1fp8, w); __m256 n = _mm256_sub_ps(gy, y_n); @@ -233,7 +233,7 @@ static void gridsample_2d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& __m256i x1 = _mm256_add_epi32(x0, v1ip8); __m256i y0 = _mm256_cvtps_epi32(y_n); __m256i y1 = _mm256_add_epi32(y0, v1ip8); - + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); @@ -241,7 +241,7 @@ static void gridsample_2d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& 
// (W*y + x) * elempack + vec(8) __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); @@ -309,25 +309,25 @@ static void gridsample_2d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& __m256 e = _mm256_sub_ps(v1fp8, w); __m256 n = _mm256_sub_ps(gy, y_n); __m256 s = _mm256_sub_ps(v1fp8, n); - + __m256 nw = _mm256_mul_ps(s, e); __m256 ne = _mm256_mul_ps(s, w); __m256 sw = _mm256_mul_ps(n, e); __m256 se = _mm256_mul_ps(n, w); - + __m256i x0 = _mm256_cvtps_epi32(x_w); __m256i x1 = _mm256_add_epi32(x0, v1ip8); __m256i y0 = _mm256_cvtps_epi32(y_n); __m256i y1 = _mm256_add_epi32(y0, v1ip8); - + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); // (W*y + x) * elempack + vec(8) __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); @@ -428,15 +428,15 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack8(const Mat& src, __m256i x1 = _mm256_add_epi32(x0, v1ip8); __m256i y0 = _mm256_cvtps_epi32(y_n); __m256i y1 = _mm256_add_epi32(y0, v1ip8); - + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); // (W*y + x) * elempack + vec(8) __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); @@ -528,7 +528,7 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack8(const Mat& src, // (W*y + x) * elempack + vec(8) __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); @@ -624,7 +624,7 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& __m256i y1 = _mm256_add_epi32(y0, v1ip8); __m256i z0 = _mm256_cvtps_epi32(z_t); __m256i z1 = _mm256_add_epi32(z0, v1ip8); - + __m256i x0_in_range = 
_mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); @@ -655,7 +655,7 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - + __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); @@ -698,7 +698,7 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& const __m256i vImgWi = _mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vImgDi = _mm256_set1_epi32(src.d); - + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) @@ -763,7 +763,7 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& __m256i y1 = _mm256_add_epi32(y0, v1ip8); __m256i z0 = _mm256_cvtps_epi32(z_t); __m256i z1 = _mm256_add_epi32(z0, v1ip8); - + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); @@ -794,7 +794,7 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - + __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); @@ -837,7 +837,7 @@ static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& const __m256i vImgWi = _mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vImgDi = _mm256_set1_epi32(src.d); - + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) @@ -935,7 +935,7 @@ static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - + __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); @@ -978,7 +978,7 @@ static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& const __m256i vImgWi = 
_mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vImgDi = _mm256_set1_epi32(src.d); - + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) @@ -1055,7 +1055,7 @@ static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& __m256i y1 = _mm256_add_epi32(y0, v1ip8); __m256i z0 = _mm256_cvtps_epi32(z_t); __m256i z1 = _mm256_add_epi32(z0, v1ip8); - + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); @@ -1076,7 +1076,7 @@ static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - + __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); @@ -1119,7 +1119,7 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, const __m256i vImgWi = _mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vImgDi = _mm256_set1_epi32(src.d); - + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) @@ -1227,7 +1227,7 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, __m256i y1 = _mm256_add_epi32(y0, v1ip8); __m256i z0 = _mm256_cvtps_epi32(z_t); __m256i z1 = _mm256_add_epi32(z0, v1ip8); - + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); @@ -1248,7 +1248,7 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - + __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); @@ -1291,7 +1291,7 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, const __m256i vImgWi = _mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vImgDi = _mm256_set1_epi32(src.d); - + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) @@ -1374,7 +1374,7 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, __m256i y1 = _mm256_add_epi32(y0, v1ip8); __m256i z0 = _mm256_cvtps_epi32(z_t); __m256i z1 = _mm256_add_epi32(z0, v1ip8); - + __m256i x1_in_range = 
_mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); @@ -1395,7 +1395,7 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - + __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); diff --git a/src/layer/x86/gridsample_nearest_pack16.h b/src/layer/x86/gridsample_nearest_pack16.h index c72c50abbf6..1caa217e1ce 100644 --- a/src/layer/x86/gridsample_nearest_pack16.h +++ b/src/layer/x86/gridsample_nearest_pack16.h @@ -18,7 +18,7 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack16(const Mat& src, Mat& const __m512 vImgHf = _mm512_set1_ps(src.h); const __m512i vImgWi = _mm512_set1_epi32(src.w); const __m512i vImgHi = _mm512_set1_epi32(src.h); - + const __m512i vElempacki = _mm512_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) @@ -51,7 +51,7 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack16(const Mat& src, Mat& __mmask16 v_in_range = (_mm512_cmpgt_epi32_mask(ix, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, ix)) & (_mm512_cmpgt_epi32_mask(iy, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, iy)); __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -101,9 +101,8 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack16(const Mat& src, Mat& __mmask16 v_in_range = (_mm512_cmpgt_epi32_mask(ix, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, ix)) & (_mm512_cmpgt_epi32_mask(iy, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, iy)); - __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -160,7 +159,7 @@ static void gridsample_2d_nearest_align0_border_blob_pack16(const Mat& src, Mat& __m512i iy = _mm512_cvtps_epi32(gy); __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -217,7 +216,7 @@ static void gridsample_2d_nearest_align1_border_blob_pack16(const Mat& src, Mat& __m512i iy = _mm512_cvtps_epi32(gy); __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; 
q++) { @@ -295,7 +294,7 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack16(const Mat& src, __m512i iy = _mm512_cvtps_epi32(gy); __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -356,7 +355,7 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack16(const Mat& src, __m512i iy = _mm512_cvtps_epi32(gy); __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -799,7 +798,7 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack16(const Mat& src, __m512i ix = _mm512_cvtps_epi32(gx); __m512i iy = _mm512_cvtps_epi32(gy); __m512i iz = _mm512_cvtps_epi32(gz); - + __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), iz), _mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) diff --git a/src/layer/x86/gridsample_nearest_pack4.h b/src/layer/x86/gridsample_nearest_pack4.h index 9192ba2361e..51cdd13a63e 100644 --- a/src/layer/x86/gridsample_nearest_pack4.h +++ b/src/layer/x86/gridsample_nearest_pack4.h @@ -21,7 +21,7 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -49,10 +49,10 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d __m128i iy = _mm_cvtps_epi32(gy); __m128i v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), - _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); + _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -73,7 +73,7 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -101,10 +101,10 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d __m128i iy = _mm_cvtps_epi32(gy); __m128i v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), - _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); + _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -125,7 +125,7 @@ static void 
gridsample_2d_nearest_align0_border_blob_pack4(const Mat& src, Mat& const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -161,7 +161,7 @@ static void gridsample_2d_nearest_align0_border_blob_pack4(const Mat& src, Mat& __m128i iy = _mm_cvtps_epi32(gy); __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -182,7 +182,7 @@ static void gridsample_2d_nearest_align1_border_blob_pack4(const Mat& src, Mat& const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -218,7 +218,7 @@ static void gridsample_2d_nearest_align1_border_blob_pack4(const Mat& src, Mat& __m128i iy = _mm_cvtps_epi32(gy); __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -239,7 +239,7 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack4(const Mat& src, M const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -296,7 +296,7 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack4(const Mat& src, M __m128i iy = _mm_cvtps_epi32(gy); __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -317,7 +317,7 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack4(const Mat& src, M const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { for (int x = 0; x < dst.w; x++) @@ -357,7 +357,7 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack4(const Mat& src, M __m128i iy = _mm_cvtps_epi32(gy); __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); + _mm_set_epi32(3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -380,7 +380,7 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -416,7 +416,7 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d __m128i iz = _mm_cvtps_epi32(gz); __m128i v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), - _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); + _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); v_in_range = _mm_and_si128(v_in_range, _mm_and_si128(_mm_cmpgt_epi32(iz, vn1ip4), _mm_cmpgt_epi32(vImgDi, iz))); __m128i i_offset = 
_mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); @@ -443,7 +443,7 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -479,7 +479,7 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d __m128i iz = _mm_cvtps_epi32(gz); __m128i v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), - _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); + _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); v_in_range = _mm_and_si128(v_in_range, _mm_and_si128(_mm_cmpgt_epi32(iz, vn1ip4), _mm_cmpgt_epi32(vImgDi, iz))); __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); @@ -506,7 +506,7 @@ static void gridsample_3d_nearest_align0_border_blob_pack4(const Mat& src, Mat& const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -577,7 +577,7 @@ static void gridsample_3d_nearest_align1_border_blob_pack4(const Mat& src, Mat& const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -648,7 +648,7 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack4(const Mat& src, M const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) @@ -750,7 +750,7 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack4(const Mat& src, M const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) { for (int y = 0; y < dst.h; y++) diff --git a/src/layer/x86/gridsample_nearest_pack8.h b/src/layer/x86/gridsample_nearest_pack8.h index a1329a2e228..7d5b0e2c300 100644 --- a/src/layer/x86/gridsample_nearest_pack8.h +++ b/src/layer/x86/gridsample_nearest_pack8.h @@ -18,7 +18,7 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d const __m256 vImgHf = _mm256_set1_ps(src.h); const __m256i vImgWi = _mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); - + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) @@ -49,10 +49,10 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d __m256i iy = _mm256_cvtps_epi32(gy); __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + _mm256_and_si256(_mm256_cmpgt_epi32(iy, 
vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -101,10 +101,10 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d __m256i iy = _mm256_cvtps_epi32(gy); __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -161,7 +161,7 @@ static void gridsample_2d_nearest_align0_border_blob_pack8(const Mat& src, Mat& __m256i iy = _mm256_cvtps_epi32(gy); __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -218,7 +218,7 @@ static void gridsample_2d_nearest_align1_border_blob_pack8(const Mat& src, Mat& __m256i iy = _mm256_cvtps_epi32(gy); __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -296,7 +296,7 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack8(const Mat& src, M __m256i iy = _mm256_cvtps_epi32(gy); __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -357,7 +357,7 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack8(const Mat& src, M __m256i iy = _mm256_cvtps_epi32(gy); __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { @@ -416,7 +416,7 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d __m256i iz = _mm256_cvtps_epi32(gz); __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, vn1ip8), _mm256_cmpgt_epi32(vImgDi, iz))); __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); @@ -479,7 +479,7 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d __m256i iz = _mm256_cvtps_epi32(gz); __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), - 
_mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, vn1ip8), _mm256_cmpgt_epi32(vImgDi, iz))); __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); @@ -802,7 +802,7 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, M __m256i ix = _mm256_cvtps_epi32(gx); __m256i iy = _mm256_cvtps_epi32(gy); __m256i iz = _mm256_cvtps_epi32(gz); - + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 4d600c75ac5..29de9878d99 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -594,7 +594,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Fri, 25 Nov 2022 00:37:58 +0800 Subject: [PATCH 030/127] finish pack1 dims=3 [WIP] --- src/layer/x86/gridsample_bicubic_pack8.h | 30 - src/layer/x86/gridsample_x86.cpp | 2717 +++++++++++++++++++--- 2 files changed, 2407 insertions(+), 340 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_pack8.h b/src/layer/x86/gridsample_bicubic_pack8.h index b2c25041896..095196e6875 100644 --- a/src/layer/x86/gridsample_bicubic_pack8.h +++ b/src/layer/x86/gridsample_bicubic_pack8.h @@ -12,28 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
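The cubic_interp1d_p8 helper deleted from this header below is re-added near the top of gridsample_x86.cpp later in the same patch, alongside a new scalar interpolate_cubic; both evaluate cubic-convolution weights with A = -0.75. A minimal standalone sketch of that weight computation follows (the cubic_coeffs name, the demo main and the printed values are illustrative only, not part of the patch):

#include <cstdio>

// Cubic-convolution weights for a fractional offset tx in [0, 1),
// using the same A = -0.75 kernel as cubic_interp1d_p8 / interpolate_cubic.
static void cubic_coeffs(float tx, float coeffs[4])
{
    const float A = -0.75f;
    const float x0 = tx + 1.f; // distance to the leftmost of the 4 taps
    const float x1 = tx;       // distance to the left neighbour
    const float x2 = 1.f - tx; // distance to the right neighbour

    coeffs[0] = ((A * x0 - 5.f * A) * x0 + 8.f * A) * x0 - 4.f * A;
    coeffs[1] = ((A + 2.f) * x1 - (A + 3.f)) * x1 * x1 + 1.f;
    coeffs[2] = ((A + 2.f) * x2 - (A + 3.f)) * x2 * x2 + 1.f;
    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; // the four weights sum to 1
}

int main()
{
    float c[4];
    cubic_coeffs(0.5f, c); // halfway between the two middle samples
    printf("%f %f %f %f\n", c[0], c[1], c[2], c[3]); // -0.09375 0.59375 0.59375 -0.09375
    return 0;
}
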
-static NCNN_FORCEINLINE __m256 cubic_interp1d_p8(const __m256& x0_v, const __m256& x1_v, const __m256& x2_v, const __m256& x3_v, const __m256& tx) -{ - const __m256 A = _mm256_set1_ps(-0.75f); - - const __m256 x0 = _mm256_add_ps(tx, v1fp8); - const __m256& x1 = tx; - const __m256 x2 = _mm256_sub_ps(v1fp8, tx); - //const __m256 x3 = _mm256_add_ps(x2, v1fp8); - - const __m256 coeffs0 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x0), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(4), A)); - const __m256 coeffs1 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x1), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x1), x1), v1fp8); - const __m256 coeffs2 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x2), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x2), x2), v1fp8); - const __m256 coeffs3 = _mm256_sub_ps(_mm256_sub_ps(_mm256_sub_ps(v1fp8, coeffs0), coeffs1), coeffs2); - - __m256 _v = _mm256_mul_ps(coeffs0, x0_v); - _v = _mm256_comp_fmadd_ps(coeffs1, x1_v, _v); - _v = _mm256_comp_fmadd_ps(coeffs2, x2_v, _v); - _v = _mm256_comp_fmadd_ps(coeffs3, x3_v, _v); - - return _v; -} - static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { const __m256 vImgWf = _mm256_set1_ps(src.w); @@ -291,8 +269,6 @@ static void gridsample_2d_bicubic_align0_border_blob_pack8(const Mat& src, Mat& gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - __m256i y = _mm256_cvtps_epi32(gy); - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), @@ -383,8 +359,6 @@ static void gridsample_2d_bicubic_align1_border_blob_pack8(const Mat& src, Mat& gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - __m256i y = _mm256_cvtps_epi32(gy); - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), @@ -546,8 +520,6 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); } - __m256i y = _mm256_cvtps_epi32(gy); - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), @@ -671,8 +643,6 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack8(const Mat& src, M gy = _mm256_sub_ps(border_y, reflecty_v); } - __m256i y = _mm256_cvtps_epi32(gy); - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), diff 
--git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 29de9878d99..52456145cf2 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -54,6 +54,28 @@ const __m256 vn1fp8 = _mm256_set1_ps(-1.0f); const __m256i v1ip8 = _mm256_set1_epi32(1); const __m256i vn1ip8 = _mm256_set1_epi32(-1); +static NCNN_FORCEINLINE __m256 cubic_interp1d_p8(const __m256& x0_v, const __m256& x1_v, const __m256& x2_v, const __m256& x3_v, const __m256& tx) +{ + const __m256 A = _mm256_set1_ps(-0.75f); + + const __m256 x0 = _mm256_add_ps(tx, v1fp8); + const __m256& x1 = tx; + const __m256 x2 = _mm256_sub_ps(v1fp8, tx); + //const __m256 x3 = _mm256_add_ps(x2, v1fp8); + + const __m256 coeffs0 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x0), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(4), A)); + const __m256 coeffs1 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x1), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x1), x1), v1fp8); + const __m256 coeffs2 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x2), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x2), x2), v1fp8); + const __m256 coeffs3 = _mm256_sub_ps(_mm256_sub_ps(_mm256_sub_ps(v1fp8, coeffs0), coeffs1), coeffs2); + + __m256 _v = _mm256_mul_ps(coeffs0, x0_v); + _v = _mm256_comp_fmadd_ps(coeffs1, x1_v, _v); + _v = _mm256_comp_fmadd_ps(coeffs2, x2_v, _v); + _v = _mm256_comp_fmadd_ps(coeffs3, x3_v, _v); + + return _v; +} + #include "gridsample_bilinear_pack8.h" #include "gridsample_nearest_pack8.h" #include "gridsample_bicubic_pack8.h" @@ -61,9 +83,9 @@ const __m256i vn1ip8 = _mm256_set1_epi32(-1); #endif // __AVX__ const __m128 v1fp4 = _mm_set1_ps(1.0f); -const auto vn1fp4 = _mm_set1_ps(-1.0f); -const auto v1ip4 = _mm_set1_epi32(1); -const auto vn1ip4 = _mm_set1_epi32(-1); +const __m128 vn1fp4 = _mm_set1_ps(-1.0f); +const __m128i v1ip4 = _mm_set1_epi32(1); +const __m128i vn1ip4 = _mm_set1_epi32(-1); static NCNN_FORCEINLINE __m128 mask_gather_ps(const float* ptr, __m128i offset, __m128 mask) { @@ -93,6 +115,28 @@ static NCNN_FORCEINLINE __m128 mask_gather_ps(const float* ptr, __m128i offset, #include "gridsample_bilinear_pack4.h" #include "gridsample_nearest_pack4.h" +static inline void interpolate_cubic(float fx, float* coeffs) +{ + const float A = -0.75f; + + float fx0 = fx + 1; + float fx1 = fx; + float fx2 = 1 - fx; + // float fx3 = 2 - fx; + + coeffs[0] = A * fx0 * fx0 * fx0 - 5 * A * fx0 * fx0 + 8 * A * fx0 - 4 * A; + coeffs[1] = (A + 2) * fx1 * fx1 * fx1 - (A + 3) * fx1 * fx1 + 1; + coeffs[2] = (A + 2) * fx2 * fx2 * fx2 - (A + 3) * fx2 * fx2 + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} + +static inline float reflect_coord(float x, int high) +{ + x = abs(x); + x = high - abs(x - high); + return x; +} + #endif // __SSE2__ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const @@ -827,7 +871,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(grid_p1.data); + const int size = w * h; + const int grid_size = grid_p1.w * grid_p1.h; - top_blob.create(grid.h, grid.c, channels, elemsize, opt.blob_allocator); + top_blob.create(grid_p1.h, grid_p1.c, channels, elemsize, opt.blob_allocator); if (top_blob.empty()) return -1; if (sample_type == 1) { - 
return GridSample::forward(bottom_blobs, top_blobs, opt); if (padding_mode == 1) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (align_corner == 0) { - int j = 0; - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); -#if __AVX__ - for (; j + 7 < size; j += 8) +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) { - auto tmp_x = _mm256_loadu_ps(gridptr + j); - auto gy = _mm256_loadu_ps(gridptr + j + 8); - - auto gx = _mm256_shuffle_ps(tmp_x, gy, 0x10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0x11011101); - - gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); - gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); - - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); - - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); - - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); - - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); - - auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); - auto y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - - auto v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - auto v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - auto v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - // (W*y + x) * elempack + vec(8) - auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, v1ip8); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, v1ip8); - - auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); - - auto _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(outptr, _v); - - outptr += 8; - } + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + const __m256 two = _mm256_set1_ps(2.f); + + 
// x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(v1fp8, n); + + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); + + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i y1 = _mm256_add_epi32(y0, v1ip8); + + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + + __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, v1ip8); + __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); + __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, v1ip8); + + for (int q = 0; q < channels; q++) + { + __m256 nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); + __m256 ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); + __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); + __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + __m256 _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; #endif // __AVX__ - for (; j < size; j++) + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = ((sample_x + 1) * w - 1) / 2.f; + sample_y = ((sample_y + 1) * h - 1) / 2.f; + + // bilinear interpolate + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int x1 = x0 + 1; + int y1 = y0 + 1; + + int v00_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); + int v01_in_range = (x1 > -1) & (x1 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); + int v10_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y1 > -1) & (y1 < bottom_blob.h); + int v11_in_range = (x1 > -1) & (x1 < bottom_blob.w) & (y1 > -1) & (y1 < bottom_blob.h); + + float alpha = sample_x - x0; + float beta = sample_y - y0; + + for (int q = 0; q < channels; q++) 
+ { + const Mat& image = bottom_blob.channel(q); + float v00 = image.row(y0)[x0] * v00_in_range; + float v01 = image.row(y0)[x1] * v01_in_range; + float v10 = image.row(y1)[x0] * v10_in_range; + float v11 = image.row(y1)[x1] * v11_in_range; + + float v0 = v00 * (1 - alpha) + v01 * alpha; + float v1 = v10 * (1 - alpha) + v11 * alpha; + + top_blob.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; + } + } + } + } + else + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) { + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + const __m256 two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(v1fp8, n); + + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); + + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i y1 = _mm256_add_epi32(y0, v1ip8); + + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + + __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, v1ip8); + __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); + __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, v1ip8); + + for (int q = 0; q < channels; q++) + { + __m256 nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); + __m256 ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); + __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); + __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + __m256 _v = _mm256_mul_ps(nw_val, nw); + _v = 
_mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = (sample_x + 1) / 2.f * (w - 1); + sample_y = (sample_y + 1) / 2.f * (h - 1); + + // bilinear interpolate + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int x1 = x0 + 1; + int y1 = y0 + 1; + + int v00_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); + int v01_in_range = (x1 > -1) & (x1 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); + int v10_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y1 > -1) & (y1 < bottom_blob.h); + int v11_in_range = (x1 > -1) & (x1 < bottom_blob.w) & (y1 > -1) & (y1 < bottom_blob.h); + + float alpha = sample_x - x0; + float beta = sample_y - y0; + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + float v00 = image.row(y0)[x0] * v00_in_range; + float v01 = image.row(y0)[x1] * v01_in_range; + float v10 = image.row(y1)[x0] * v10_in_range; + float v11 = image.row(y1)[x1] * v11_in_range; + + float v0 = v00 * (1 - alpha) + v01 * alpha; + float v1 = v10 * (1 - alpha) + v11 * alpha; + + top_blob.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; + } + } } } } - else + else if (padding_mode == 2) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (align_corner == 0) { - int j = 0; - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); -#if __AVX__ - for (; j + 7 < size; j += 8) +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) { - auto tmp_x = _mm256_loadu_ps(gridptr + j); - auto gy = _mm256_loadu_ps(gridptr + j + 8); + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - auto gx = _mm256_shuffle_ps(tmp_x, gy, 0x10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0x11011101); + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; - gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); - gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - auto x_w = _mm256_floor_ps(gx); - auto y_n = _mm256_floor_ps(gy); + // compute coord + { + const __m256 two = _mm256_set1_ps(2.f); - auto w = _mm256_sub_ps(gx, x_w); - auto e = _mm256_sub_ps(v1fp8, w); - auto n = _mm256_sub_ps(gy, y_n); - auto s = _mm256_sub_ps(v1fp8, n); + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); - auto nw = _mm256_mul_ps(s, e); - auto ne = _mm256_mul_ps(s, w); - auto sw = _mm256_mul_ps(n, e); - auto se = _mm256_mul_ps(n, w); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - auto x0 = _mm256_cvtps_epi32(x_w); - auto x1 = _mm256_add_epi32(x0, v1ip8); - auto y0 = _mm256_cvtps_epi32(y_n); - auto y1 = _mm256_add_epi32(y0, v1ip8); + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto y1_in_range = 
_mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); - auto v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); - auto i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); - auto i_ne_offset = _mm256_add_epi32(i_nw_offset, v1ip8); - auto i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); - auto i_se_offset = _mm256_add_epi32(i_sw_offset, v1ip8); + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } - auto nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_nw_offset, vn1fp8, sizeof(float)); - auto ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - auto sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - auto se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); - auto _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(v1fp8, n); - _mm256_storeu_ps(outptr, _v); + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); - outptr += 8; - } + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i y1 = _mm256_add_epi32(y0, v1ip8); + + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, v1ip8); + __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); + __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, v1ip8); + + for (int q = 0; q < channels; q++) + { + __m256 nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_nw_offset, vn1fp8, sizeof(float)); + __m256 ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + __m256 _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + + } + } + + nn = grid_size & 15; #endif // __AVX__ - for (; j < size; j++) - { + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + 
float sample_y = gridptr[x + 1]; + + sample_x = ((sample_x + 1) * w - 1) / 2.f; + sample_y = ((sample_y + 1) * h - 1) / 2.f; + + sample_x = std::min(w - 1.0f, std::max(sample_x, 0.0f)); + sample_y = std::min(h - 1.0f, std::max(sample_y, 0.0f)); + + // bilinear interpolate + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int x1 = x0 + 1; + int y1 = y0 + 1; + + int x1_in_range = (x1 > -1) & (x1 < bottom_blob.w); + int y1_in_range = (y1 > -1) & (y1 < bottom_blob.h); + int v11_in_range = x1_in_range & y1_in_range; + + float alpha = sample_x - x0; + float beta = sample_y - y0; + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + float v00 = image.row(y0)[x0]; + float v01 = image.row(y0)[x1] * x1_in_range; + float v10 = image.row(y1)[x0] * y1_in_range; + float v11 = image.row(y1)[x1] * v11_in_range; + + float v0 = v00 * (1 - alpha) + v01 * alpha; + float v1 = v10 * (1 - alpha) + v11 * alpha; + + top_blob.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; + } + } } } - } - } - else if (sample_type == 2) - { - if (padding_mode == 1) - { - int nn = size >> 3; - int remain = size; -#if __AVX__ - #pragma omp parallel for num_threads(opt.num_threads) - for (int j = 0; j < nn; j++) + else { - auto tmp_x = _mm256_loadu_ps(gridptr + j); - auto gy = _mm256_loadu_ps(gridptr + j + 8); +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) + { + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - auto gx = _mm256_shuffle_ps(tmp_x, gy, 0x10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0x11011101); + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; - gx = get_coord_p8(gx, vImgWf, padding_mode, align_corner); - gy = get_coord_p8(gy, vImgHf, padding_mode, align_corner); + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + // compute coord + { + const __m256 two = _mm256_set1_ps(2.f); - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); - auto v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); - for (int q = 0; q < channels; q++) - { - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), - i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - _mm256_storeu_ps(top_blob.channel(q).row(0) + j * 8, _v); - } - } + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); - remain = remain & 7; + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = 
_mm256_sub_ps(v1fp8, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(v1fp8, n); + + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); + + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i y1 = _mm256_add_epi32(y0, v1ip8); + + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, v1ip8); + __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); + __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, v1ip8); + + for (int q = 0; q < channels; q++) + { + __m256 nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_nw_offset, vn1fp8, sizeof(float)); + __m256 ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + __m256 _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; #endif // __AVX__ - #pragma omp parallel for num_threads(opt.num_threads) - for (int j = size - remain; j < nn; j++) - { + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = (sample_x + 1) / 2.f * (w - 1); + sample_y = (sample_y + 1) / 2.f * (h - 1); + + sample_x = std::min(w - 1.0f, std::max(sample_x, 0.0f)); + sample_y = std::min(h - 1.0f, std::max(sample_y, 0.0f)); + + // bilinear interpolate + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int x1 = x0 + 1; + int y1 = y0 + 1; + + int x1_in_range = (x1 > -1) & (x1 < bottom_blob.w); + int y1_in_range = (y1 > -1) & (y1 < bottom_blob.h); + int v11_in_range = x1_in_range & y1_in_range; + + float alpha = sample_x - x0; + float beta = sample_y - y0; + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + float v00 = image.row(y0)[x0]; + float v01 = image.row(y0)[x1] * x1_in_range; + float v10 = image.row(y1)[x0] * y1_in_range; + float v11 = image.row(y1)[x1] * v11_in_range; + + float v0 = v00 * (1 - alpha) + v01 * alpha; + float v1 = v10 * (1 - alpha) + v11 * alpha; + + top_blob.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; + } + } + } } + } - else + else if (padding_mode == 3) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (align_corner == 0) { - int j = 0; - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); -#if __AVX__ - for (; j + 7 < size; j += 8) +#pragma omp parallel for num_threads(opt.num_threads) + for 
(int y = 0; y < grid_p1.c; y++) { - auto tmp_x = _mm256_loadu_ps(gridptr + j); - auto gy = _mm256_loadu_ps(gridptr + j + 8); + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - auto gx = _mm256_shuffle_ps(tmp_x, gy, 0x10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0x11011101); + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; - gx = grid_sample_unormalize_p8(vImgWf, gx, align_corner); - gy = grid_sample_unormalize_p8(vImgHf, gy, align_corner); + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + // compute coord + { + const __m256 two = _mm256_set1_ps(2.f); - gx = compute_coord_p8(gx, vImgWf, padding_mode, align_corner); - gy = compute_coord_p8(gy, vImgHf, padding_mode, align_corner); + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); - auto ix = _mm256_cvtps_epi32(gx); - auto iy = _mm256_cvtps_epi32(gy); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - auto i_offset = _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); + __m256 v0p5fp8 = _mm256_set1_ps(0.5f); + gx = _mm256_add_ps(gx, v0p5fp8); - auto _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), static_cast(bottom_blob.channel(q).data), - i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - _mm256_storeu_ps(outptr, _v); + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(vImgWf, reflectx_v); - outptr += 8; - } + gx = _mm256_sub_ps(gx, v0p5fp8); + + _mm256_sub_ps(gx, v0p5fp8); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_add_ps(gy, v0p5fp8); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(vImgHf, reflecty_v); + + gy = _mm256_sub_ps(gy, v0p5fp8); + + _mm256_sub_ps(gy, v0p5fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(v1fp8, n); + + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); + + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i y1 = _mm256_add_epi32(y0, v1ip8); + + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + // (W*y + x) * elempack + vec(8) + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, 
v1ip8); + __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); + __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, v1ip8); + + for (int q = 0; q < channels; q++) + { + __m256 nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_nw_offset, vn1fp8, sizeof(float)); + __m256 ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + __m256 _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + + } + } + + nn = grid_size & 15; #endif // __AVX__ - for (; j < size; j++) - { + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = ((sample_x + 1) * w - 1) / 2.f; + sample_y = ((sample_y + 1) * h - 1) / 2.f; + + sample_x = abs(sample_x + 0.5f); + sample_x = w - abs(sample_x - w) - 0.5; + + sample_y = abs(sample_y + 0.5f); + sample_y = h - abs(sample_y - h) - 0.5; + + sample_x = std::min(w - 1.0f, std::max(sample_x, 0.0f)); + sample_y = std::min(h - 1.0f, std::max(sample_y, 0.0f)); + + // bilinear interpolate + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int x1 = x0 + 1; + int y1 = y0 + 1; + + int x1_in_range = (x1 > -1) & (x1 < bottom_blob.w); + int y1_in_range = (y1 > -1) & (y1 < bottom_blob.h); + int v11_in_range = x1_in_range & y1_in_range; + + float alpha = sample_x - x0; + float beta = sample_y - y0; + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + float v00 = image.row(y0)[x0]; + float v01 = image.row(y0)[x1] * x1_in_range; + float v10 = image.row(y1)[x0] * y1_in_range; + float v11 = image.row(y1)[x1] * v11_in_range; + + float v0 = v00 * (1 - alpha) + v01 * alpha; + float v1 = v10 * (1 - alpha) + v11 * alpha; + + top_blob.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; + } + } } } - } - } - else if (sample_type == 3) - { - return GridSample::forward(bottom_blobs, top_blobs, opt); - if (padding_mode == 1) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + else { - int j = 0; - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); -#if __AVX__ - for (; j + 7 < size; j += 8) +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) { - auto tmp_x = _mm256_loadu_ps(gridptr + j); - auto gy = _mm256_loadu_ps(gridptr + j + 8); + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - auto gx = _mm256_shuffle_ps(tmp_x, gy, 0x10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0x11011101); + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; - gx = grid_sample_unormalize_p8(vImgWf, gx, align_corner); - gy = grid_sample_unormalize_p8(vImgHf, gy, align_corner); + gx = 
_mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - auto gx_floor = _mm256_floor_ps(gx); - auto gy_floor = _mm256_floor_ps(gy); + // compute coord + { + const __m256 two = _mm256_set1_ps(2.f); - const auto tx = _mm256_sub_ps(gx, gx_floor); - const auto ty = _mm256_sub_ps(gy, gy_floor); + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); - __m256 coefficients[4]; + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - for (int i = 0; i < 4; i++) - { - auto gx0 = compute_coord_p8(_mm256_add_ps(gx_floor, vn1fp8), vImgWf, padding_mode, align_corner); - auto gx1 = compute_coord_p8(gx_floor, vImgWf, padding_mode, align_corner); - auto gx2 = compute_coord_p8(_mm256_add_ps(gx_floor, v1fp8), vImgWf, padding_mode, align_corner); - auto gx3 = compute_coord_p8(_mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)), vImgWf, padding_mode, align_corner); + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - gy = compute_coord_p8(_mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)), vImgHf, padding_mode, align_corner); + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(border_x, reflectx_v); - auto x0 = _mm256_cvtps_epi32(gx0); - auto x1 = _mm256_cvtps_epi32(gx1); - auto x2 = _mm256_cvtps_epi32(gx2); - auto x3 = _mm256_cvtps_epi32(gx3); + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); - auto y = _mm256_cvtps_epi32(gy); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); - auto x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); - auto x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - auto x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x2)); - auto x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - auto y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y)); + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(border_y, reflecty_v); + } - auto v0_in_range = _mm256_and_si256(x0_in_range, y_in_range); - auto v1_in_range = _mm256_and_si256(x1_in_range, y_in_range); - auto v2_in_range = _mm256_and_si256(x2_in_range, y_in_range); - auto v3_in_range = _mm256_and_si256(x3_in_range, y_in_range); + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); - auto x0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - auto x1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - auto x2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - auto x3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(v1fp8, n); - auto x0_offset = _mm256_cvtps_epi32(x0_offset_f); - auto x1_offset = _mm256_cvtps_epi32(x1_offset_f); - auto x2_offset = _mm256_cvtps_epi32(x2_offset_f); - auto x3_offset = _mm256_cvtps_epi32(x3_offset_f); + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); - auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x0_offset, *reinterpret_cast<__m256*>(&v0_in_range), sizeof(float)); - auto 
x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x1_offset, *reinterpret_cast<__m256*>(&v1_in_range), sizeof(float)); - auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x2_offset, *reinterpret_cast<__m256*>(&v2_in_range), sizeof(float)); - auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x3_offset, *reinterpret_cast<__m256*>(&v3_in_range), sizeof(float)); + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i y1 = _mm256_add_epi32(y0, v1ip8); - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - _mm256_storeu_ps(outptr, _v); + // (W*y + x) * elempack + vec(8) + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, v1ip8); + __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); + __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, v1ip8); - outptr += 8; - } + for (int q = 0; q < channels; q++) + { + __m256 nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_nw_offset, vn1fp8, sizeof(float)); + __m256 ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); + __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); + __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + + __m256 _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; #endif // __AVX__ - for (; j < size; j++) - { + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = (sample_x + 1) / 2.f * (w - 1); + sample_y = (sample_y + 1) / 2.f * (h - 1); + + sample_x = abs(sample_x); + sample_x = (w - 1) - abs(sample_x - (w - 1)); + + sample_y = abs(sample_y); + sample_y = (h - 1) - abs(sample_y - (h - 1)); + + sample_x = std::min(w - 1.0f, std::max(sample_x, 0.0f)); + sample_y = std::min(h - 1.0f, std::max(sample_y, 0.0f)); + + // bilinear interpolate + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int x1 = x0 + 1; + int y1 = y0 + 1; + + int x1_in_range = (x1 > -1) & (x1 < bottom_blob.w); + int y1_in_range = (y1 > -1) & (y1 < bottom_blob.h); + int v11_in_range = x1_in_range & y1_in_range; + + float alpha = sample_x - x0; + float beta = sample_y - y0; + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + float v00 = image.row(y0)[x0]; + float v01 = image.row(y0)[x1] * x1_in_range; + float v10 = image.row(y1)[x0] * y1_in_range; + float v11 = image.row(y1)[x1] * v11_in_range; + + float v0 = v00 * (1 - alpha) + v01 * alpha; + float v1 = v10 * (1 - alpha) + v11 * alpha; + + 
top_blob.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; + } + } } } + } else { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + NCNN_LOGE("gridsample padding_mode error\n"); + return -100; + } + } + else if (sample_type == 2) + { + if (padding_mode == 1) + { + if (align_corner == 0) + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) { - int j = 0; - float* outptr = top_blob.channel(q); - const float* ptr = static_cast(bottom_blob.channel(q).data); + float* gridptr = grid_p1.channel(y); + int nn = grid_size; #if __AVX__ - for (; j + 7 < size; j += 8) + for (int x = 0; x + 15 < nn; x += 16) { - auto tmp_x = _mm256_loadu_ps(gridptr + j); - auto gy = _mm256_loadu_ps(gridptr + j + 8); + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - auto gx = _mm256_shuffle_ps(tmp_x, gy, 0x10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0x11011101); + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; - gx = grid_sample_unormalize_p8(vImgWf, gx, align_corner); - gy = grid_sample_unormalize_p8(vImgHf, gy, align_corner); + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - auto gx_floor = _mm256_floor_ps(gx); - auto gy_floor = _mm256_floor_ps(gy); + // compute coord + { + const __m256 two = _mm256_set1_ps(2.f); - const auto tx = _mm256_sub_ps(gx, gx_floor); - const auto ty = _mm256_sub_ps(gy, gy_floor); + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); - __m256 coefficients[4]; + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + } - for (int i = 0; i < 4; i++) + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); + + __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); + + for (int q = 0; q < bottom_blob.c; q++) { - auto gx0 = compute_coord_p8(_mm256_add_ps(gx_floor, vn1fp8), vImgWf, padding_mode, align_corner); - auto gx1 = compute_coord_p8(gx_floor, vImgWf, padding_mode, align_corner); - auto gx2 = compute_coord_p8(_mm256_add_ps(gx_floor, v1fp8), vImgWf, padding_mode, align_corner); - auto gx3 = compute_coord_p8(_mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)), vImgWf, padding_mode, align_corner); + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = ((sample_x + 1) * w - 1) / 2.f; + sample_y = ((sample_y + 1) * h - 1) / 2.f; + + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); + + int v00_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); - gy = compute_coord_p8(_mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)), vImgHf, padding_mode, align_corner); + for (int q 
= 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); - auto x0 = _mm256_cvtps_epi32(gx0); - auto x1 = _mm256_cvtps_epi32(gx1); - auto x2 = _mm256_cvtps_epi32(gx2); - auto x3 = _mm256_cvtps_epi32(gx3); + top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0] * v00_in_range; + } + } + } + } + else + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) + { + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - auto y = _mm256_cvtps_epi32(gy); + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; - auto x0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - auto x1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - auto x2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - auto x3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - auto x0_offset = _mm256_cvtps_epi32(x0_offset_f); - auto x1_offset = _mm256_cvtps_epi32(x1_offset_f); - auto x2_offset = _mm256_cvtps_epi32(x2_offset_f); - auto x3_offset = _mm256_cvtps_epi32(x3_offset_f); + // compute coord + { + const __m256 two = _mm256_set1_ps(2.f); - auto x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x0_offset, vn1fp8, sizeof(float)); - auto x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x1_offset, vn1fp8, sizeof(float)); - auto x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x2_offset, vn1fp8, sizeof(float)); - auto x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, x3_offset, vn1fp8, sizeof(float)); + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); } - auto _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); + + __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); - _mm256_storeu_ps(outptr, _v); + for (int q = 0; q < bottom_blob.c; q++) + { + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); - outptr += 8; + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } } + + nn = grid_size & 15; #endif // __AVX__ - for (; j < size; j++) + for (int x = grid_size - nn; x < grid_size; x += 2) { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = (sample_x + 1) / 2.f * (w - 1); + sample_y = (sample_y + 1) / 2.f * (h - 1); + + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); + + int v00_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); + + for (int q = 0; q < channels; q++) 
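                        // Nearest + zeros padding (align_corner != 0): the grid value is unnormalized
                        // with x = (gx + 1) / 2 * (w - 1), rounded to the nearest texel via
                        // floor(x + 0.5), and samples outside the input are zeroed by the 0/1 flag
                        // v00_in_range. Note that the multiply-by-flag form below still dereferences
                        // image.row(y0)[x0] for out-of-range coordinates; a guarded variant
                        // (illustrative sketch, not part of this patch) would be:
                        //   float v = v00_in_range ? image.row(y0)[x0] : 0.f;
                        //   top_blob.channel(q).row(y)[x / 2] = v;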
+ { + const Mat& image = bottom_blob.channel(q); + + top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0] * v00_in_range; + } } } } } - } - - if (dims == 4) - { - return GridSample::forward(bottom_blobs, top_blobs, opt); - int size = w * h * d; - if (sample_type == 1) + else if (padding_mode == 2) { + if (align_corner == 0) + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) + { + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + const __m256 two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); + + + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); + + for (int q = 0; q < bottom_blob.c; q++) + { + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = ((sample_x + 1) * w - 1) / 2.f; + sample_y = ((sample_y + 1) * h - 1) / 2.f; + + sample_x = std::min(w - 1.0f, std::max(sample_x, 0.0f)); + sample_y = std::min(h - 1.0f, std::max(sample_y, 0.0f)); + + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + + top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0]; + } + } + } + } + else + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) + { + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + const __m256 two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, 
v1fp8)); + + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); + + + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); + + for (int q = 0; q < bottom_blob.c; q++) + { + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = (sample_x + 1) / 2.f * (w - 1); + sample_y = (sample_y + 1) / 2.f * (h - 1); + + sample_x = std::min(w - 1.0f, std::max(sample_x, 0.0f)); + sample_y = std::min(h - 1.0f, std::max(sample_y, 0.0f)); + + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + + top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0]; + } + } + } + } + + } + else if (padding_mode == 3) + { + if (align_corner == 0) + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) + { + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + const __m256 two = _mm256_set1_ps(2.f); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + // compute coord + { + // x + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + + __m256 v0p5fp8 = _mm256_set1_ps(0.5f); + gx = _mm256_add_ps(gx, v0p5fp8); + + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(vImgWf, reflectx_v); + + gx = _mm256_sub_ps(gx, v0p5fp8); + + _mm256_sub_ps(gx, v0p5fp8); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + // y + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_add_ps(gy, v0p5fp8); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(vImgHf, reflecty_v); + + gy = _mm256_sub_ps(gy, v0p5fp8); + + _mm256_sub_ps(gy, v0p5fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } + + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); + + + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); + + for (int q = 0; q < bottom_blob.c; q++) + { + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_offset, 
_mm256_set1_ps(-1.0f), sizeof(float)); + + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = ((sample_x + 1) * w - 1) / 2.f; + sample_y = ((sample_y + 1) * h - 1) / 2.f; + + sample_x = floor(sample_x + 0.5f); + sample_y = floor(sample_y + 0.5f); + + sample_x = abs(sample_x + 0.5f); + sample_x = w - abs(sample_x - w) - 0.5; + + sample_y = abs(sample_y + 0.5f); + sample_y = h - abs(sample_y - h) - 0.5; + + int x0 = std::min(w - 1.0f, std::max(sample_x, 0.0f)); + int y0 = std::min(h - 1.0f, std::max(sample_y, 0.0f)); + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + + top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0]; + } + } + } + } + else + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) + { + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + const __m256 two = _mm256_set1_ps(2.f); + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + // compute coord + { + // x + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(border_x, reflectx_v); + + // y + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(border_y, reflecty_v); + } + + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); + + + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); + + for (int q = 0; q < bottom_blob.c; q++) + { + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = (sample_x + 1) / 2.f * (w - 1); + sample_y = (sample_y + 1) / 2.f * (h - 1); + + sample_x = floor(sample_x + 0.5f); + sample_y = floor(sample_y + 0.5f); + + sample_x = abs(sample_x); + int x0 = (w - 1) - abs(sample_x - (w - 1)); + + sample_y = abs(sample_y); + int y0 = (h - 1) - abs(sample_y - (h - 1)); + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + + top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0]; + } + } + } + } + + } + else + { + NCNN_LOGE("gridsample padding_mode error\n"); + return -100; + } + } + else if (sample_type == 3) 
+ { + if (padding_mode == 1) + { + if (align_corner == 0) + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) + { + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + const __m256 two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + } + + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); + + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + + __m256i x0 = _mm256_cvtps_epi32(gx0); + __m256i x1 = _mm256_cvtps_epi32(gx1); + __m256i x2 = _mm256_cvtps_epi32(gx2); + __m256i x3 = _mm256_cvtps_epi32(gx3); + + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x2)); + __m256i x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], + v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + + __m256i y = _mm256_cvtps_epi32(gy); + + __m256i y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y)); + + v0_in_range[i] = _mm256_and_si256(x0_in_range, y_in_range); + v1_in_range[i] = _mm256_and_si256(x1_in_range, y_in_range); + v2_in_range[i] = _mm256_and_si256(x2_in_range, y_in_range); + v3_in_range[i] = _mm256_and_si256(x3_in_range, y_in_range); + + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < bottom_blob.c; q++) + { + for (int i = 0; i < 4; i++) + { + __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v0_offset[i], *reinterpret_cast<__m256*>(&v0_in_range[i]), sizeof(float)); + __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i]), sizeof(float)); + __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i]), sizeof(float)); + __m256 x3_val = 
_mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i]), sizeof(float)); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = ((sample_x + 1) * w - 1) / 2.f; + sample_y = ((sample_y + 1) * h - 1) / 2.f; + + int x1 = floor(sample_x); + int y1 = floor(sample_y); + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + bool x1_in_range = (x1 > -1) & (x1 < w); + bool y1_in_range = (y1 > -1) & (y1 < h); + bool x0_in_range = (x0 > -1) & (x0 < w); + bool y0_in_range = (y0 > -1) & (y0 < h); + bool x2_in_range = (x2 > -1) & (x2 < w); + bool y2_in_range = (y2 > -1) & (y2 < h); + bool x3_in_range = (x3 > -1) & (x3 < w); + bool y3_in_range = (y3 > -1) & (y3 < h); + + bool v00_in_range = x0_in_range * y0_in_range; + bool v01_in_range = x1_in_range * y0_in_range; + bool v02_in_range = x2_in_range * y0_in_range; + bool v03_in_range = x3_in_range * y0_in_range; + bool v10_in_range = x0_in_range * y1_in_range; + bool v11_in_range = x1_in_range * y1_in_range; + bool v12_in_range = x2_in_range * y1_in_range; + bool v13_in_range = x3_in_range * y1_in_range; + bool v20_in_range = x0_in_range * y2_in_range; + bool v21_in_range = x1_in_range * y2_in_range; + bool v22_in_range = x2_in_range * y2_in_range; + bool v23_in_range = x3_in_range * y2_in_range; + bool v30_in_range = x0_in_range * y3_in_range; + bool v31_in_range = x1_in_range * y3_in_range; + bool v32_in_range = x2_in_range * y3_in_range; + bool v33_in_range = x3_in_range * y3_in_range; + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + + float v00 = v00_in_range ? image.row(y0)[x0] : 0; + float v01 = v01_in_range ? image.row(y0)[x1] : 0; + float v02 = v02_in_range ? image.row(y0)[x2] : 0; + float v03 = v03_in_range ? image.row(y0)[x3] : 0; + float v10 = v10_in_range ? image.row(y1)[x0] : 0; + float v11 = v11_in_range ? image.row(y1)[x1] : 0; + float v12 = v12_in_range ? image.row(y1)[x2] : 0; + float v13 = v13_in_range ? image.row(y1)[x3] : 0; + float v20 = v20_in_range ? image.row(y2)[x0] : 0; + float v21 = v21_in_range ? image.row(y2)[x1] : 0; + float v22 = v22_in_range ? image.row(y2)[x2] : 0; + float v23 = v23_in_range ? image.row(y2)[x3] : 0; + float v30 = v30_in_range ? image.row(y3)[x0] : 0; + float v31 = v31_in_range ? image.row(y3)[x1] : 0; + float v32 = v32_in_range ? image.row(y3)[x2] : 0; + float v33 = v33_in_range ? 
image.row(y3)[x3] : 0; + + float x_coeffs[4]; + float y_coeffs[4]; + interpolate_cubic(sample_x - x1, x_coeffs); + interpolate_cubic(sample_y - y1, y_coeffs); + + float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; + float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; + float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; + float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; + + top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + } + } + } + } + else + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) + { + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + const __m256 two = _mm256_set1_ps(2.f); + + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + } + + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); + + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + + __m256i x0 = _mm256_cvtps_epi32(gx0); + __m256i x1 = _mm256_cvtps_epi32(gx1); + __m256i x2 = _mm256_cvtps_epi32(gx2); + __m256i x3 = _mm256_cvtps_epi32(gx3); + + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x2)); + __m256i x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], + v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + + __m256i y = _mm256_cvtps_epi32(gy); + + __m256i y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y)); + + v0_in_range[i] = _mm256_and_si256(x0_in_range, y_in_range); + v1_in_range[i] = _mm256_and_si256(x1_in_range, y_in_range); + v2_in_range[i] = _mm256_and_si256(x2_in_range, y_in_range); + v3_in_range[i] = _mm256_and_si256(x3_in_range, y_in_range); + + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + 
v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < bottom_blob.c; q++) + { + for (int i = 0; i < 4; i++) + { + __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v0_offset[i], *reinterpret_cast<__m256*>(&v0_in_range[i]), sizeof(float)); + __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i]), sizeof(float)); + __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i]), sizeof(float)); + __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i]), sizeof(float)); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = (sample_x + 1) / 2.f * (w - 1); + sample_y = (sample_y + 1) / 2.f * (h - 1); + + int x1 = floor(sample_x); + int y1 = floor(sample_y); + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + bool x1_in_range = (x1 > -1) & (x1 < w); + bool y1_in_range = (y1 > -1) & (y1 < h); + bool x0_in_range = (x0 > -1) & (x0 < w); + bool y0_in_range = (y0 > -1) & (y0 < h); + bool x2_in_range = (x2 > -1) & (x2 < w); + bool y2_in_range = (y2 > -1) & (y2 < h); + bool x3_in_range = (x3 > -1) & (x3 < w); + bool y3_in_range = (y3 > -1) & (y3 < h); + + bool v00_in_range = x0_in_range * y0_in_range; + bool v01_in_range = x1_in_range * y0_in_range; + bool v02_in_range = x2_in_range * y0_in_range; + bool v03_in_range = x3_in_range * y0_in_range; + bool v10_in_range = x0_in_range * y1_in_range; + bool v11_in_range = x1_in_range * y1_in_range; + bool v12_in_range = x2_in_range * y1_in_range; + bool v13_in_range = x3_in_range * y1_in_range; + bool v20_in_range = x0_in_range * y2_in_range; + bool v21_in_range = x1_in_range * y2_in_range; + bool v22_in_range = x2_in_range * y2_in_range; + bool v23_in_range = x3_in_range * y2_in_range; + bool v30_in_range = x0_in_range * y3_in_range; + bool v31_in_range = x1_in_range * y3_in_range; + bool v32_in_range = x2_in_range * y3_in_range; + bool v33_in_range = x3_in_range * y3_in_range; + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + + float v00 = v00_in_range ? image.row(y0)[x0] : 0; + float v01 = v01_in_range ? image.row(y0)[x1] : 0; + float v02 = v02_in_range ? image.row(y0)[x2] : 0; + float v03 = v03_in_range ? image.row(y0)[x3] : 0; + float v10 = v10_in_range ? image.row(y1)[x0] : 0; + float v11 = v11_in_range ? image.row(y1)[x1] : 0; + float v12 = v12_in_range ? image.row(y1)[x2] : 0; + float v13 = v13_in_range ? image.row(y1)[x3] : 0; + float v20 = v20_in_range ? image.row(y2)[x0] : 0; + float v21 = v21_in_range ? image.row(y2)[x1] : 0; + float v22 = v22_in_range ? image.row(y2)[x2] : 0; + float v23 = v23_in_range ? image.row(y2)[x3] : 0; + float v30 = v30_in_range ? image.row(y3)[x0] : 0; + float v31 = v31_in_range ? image.row(y3)[x1] : 0; + float v32 = v32_in_range ? image.row(y3)[x2] : 0; + float v33 = v33_in_range ? 
image.row(y3)[x3] : 0; + + float x_coeffs[4]; + float y_coeffs[4]; + interpolate_cubic(sample_x - x1, x_coeffs); + interpolate_cubic(sample_y - y1, y_coeffs); + + float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; + float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; + float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; + float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; + + top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + } + } + } + } + } + else if (padding_mode == 2) + { + if (align_corner == 0) + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) + { + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + const __m256 two = _mm256_set1_ps(2.f); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); + + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + + gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); + gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); + gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); + gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); + + __m256i x0 = _mm256_cvtps_epi32(gx0); + __m256i x1 = _mm256_cvtps_epi32(gx1); + __m256i x2 = _mm256_cvtps_epi32(gx2); + __m256i x3 = _mm256_cvtps_epi32(gx3); + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < bottom_blob.c; q++) + { + for (int i = 0; i < 4; i++) + { + __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v0_offset[i], vn1fp8, sizeof(float)); + __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v1_offset[i], vn1fp8, sizeof(float)); + __m256 x2_val = 
_mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v2_offset[i], vn1fp8, sizeof(float)); + __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v3_offset[i], vn1fp8, sizeof(float)); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + if (y == 1 && x == 24) + { + int a = 10; + } + + sample_x = ((sample_x + 1) * w - 1) / 2.f; + sample_y = ((sample_y + 1) * h - 1) / 2.f; + + int x_floor = floor(sample_x); + int y_floor = floor(sample_y); + + int x1 = x_floor; + int y1 = y_floor; + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + x1 = std::min(w - 1, std::max(x1, 0)); + y1 = std::min(h - 1, std::max(y1, 0)); + x0 = std::min(w - 1, std::max(x0, 0)); + y0 = std::min(h - 1, std::max(y0, 0)); + x2 = std::min(w - 1, std::max(x2, 0)); + y2 = std::min(h - 1, std::max(y2, 0)); + x3 = std::min(w - 1, std::max(x3, 0)); + y3 = std::min(h - 1, std::max(y3, 0)); + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + + float v00 = image.row(y0)[x0]; + float v01 = image.row(y0)[x1]; + float v02 = image.row(y0)[x2]; + float v03 = image.row(y0)[x3]; + float v10 = image.row(y1)[x0]; + float v11 = image.row(y1)[x1]; + float v12 = image.row(y1)[x2]; + float v13 = image.row(y1)[x3]; + float v20 = image.row(y2)[x0]; + float v21 = image.row(y2)[x1]; + float v22 = image.row(y2)[x2]; + float v23 = image.row(y2)[x3]; + float v30 = image.row(y3)[x0]; + float v31 = image.row(y3)[x1]; + float v32 = image.row(y3)[x2]; + float v33 = image.row(y3)[x3]; + + float x_coeffs[4]; + float y_coeffs[4]; + interpolate_cubic(sample_x - x_floor, x_coeffs); + interpolate_cubic(sample_y - y_floor, y_coeffs); + + float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; + float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; + float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; + float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; + + top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + } + } + } + } + else + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) + { + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + const __m256 two = _mm256_set1_ps(2.f); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + + __m256 gx_floor = 
_mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); + + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + + gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); + gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); + gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); + gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); + + __m256i x0 = _mm256_cvtps_epi32(gx0); + __m256i x1 = _mm256_cvtps_epi32(gx1); + __m256i x2 = _mm256_cvtps_epi32(gx2); + __m256i x3 = _mm256_cvtps_epi32(gx3); + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < bottom_blob.c; q++) + { + for (int i = 0; i < 4; i++) + { + __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v0_offset[i], vn1fp8, sizeof(float)); + __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v1_offset[i], vn1fp8, sizeof(float)); + __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v2_offset[i], vn1fp8, sizeof(float)); + __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v3_offset[i], vn1fp8, sizeof(float)); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = (sample_x + 1) / 2.f * (w - 1); + sample_y = (sample_y + 1) / 2.f * (h - 1); + + int x_floor = floor(sample_x); + int y_floor = floor(sample_y); + + int x1 = x_floor; + int y1 = y_floor; + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + x1 = std::min(w - 1, std::max(x1, 0)); + y1 = std::min(h - 1, std::max(y1, 0)); + x0 = std::min(w - 1, std::max(x0, 0)); + y0 = std::min(h - 1, std::max(y0, 0)); + x2 = std::min(w - 1, std::max(x2, 0)); + y2 = std::min(h - 1, std::max(y2, 0)); + x3 = std::min(w - 1, std::max(x3, 0)); + y3 = std::min(h - 1, std::max(y3, 0)); + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + + float v00 = image.row(y0)[x0]; + float v01 = image.row(y0)[x1]; + float v02 = image.row(y0)[x2]; + float v03 = image.row(y0)[x3]; + float v10 = image.row(y1)[x0]; + float v11 = image.row(y1)[x1]; + float v12 = image.row(y1)[x2]; + float v13 = 
image.row(y1)[x3]; + float v20 = image.row(y2)[x0]; + float v21 = image.row(y2)[x1]; + float v22 = image.row(y2)[x2]; + float v23 = image.row(y2)[x3]; + float v30 = image.row(y3)[x0]; + float v31 = image.row(y3)[x1]; + float v32 = image.row(y3)[x2]; + float v33 = image.row(y3)[x3]; + + float x_coeffs[4]; + float y_coeffs[4]; + interpolate_cubic(sample_x - x_floor, x_coeffs); + interpolate_cubic(sample_y - y_floor, y_coeffs); + + float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; + float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; + float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; + float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; + + top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + } + } + } + } + + } + else if (padding_mode == 3) + { + if (align_corner == 0) + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) + { + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + const __m256 two = _mm256_set1_ps(2.f); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); + + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + const __m256 v0p5fp8 = _mm256_set1_ps(0.5f); + { + // x0 + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + + gx0 = _mm256_add_ps(gx0, v0p5fp8); + + gx0 = _mm256_and_ps(gx0, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx0 = _mm256_sub_ps(vImgWf, reflectx0_v); + + gx0 = _mm256_sub_ps(gx0, v0p5fp8); + + _mm256_sub_ps(gx0, v0p5fp8); + + gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); + + // x1 + gx1 = _mm256_add_ps(gx1, v0p5fp8); + + gx1 = _mm256_and_ps(gx1, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx1_v = _mm256_and_ps(_mm256_sub_ps(gx1, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx1 = _mm256_sub_ps(vImgWf, reflectx1_v); + + gx1 = _mm256_sub_ps(gx1, v0p5fp8); + + _mm256_sub_ps(gx1, v0p5fp8); + + gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); + + // x2 + gx2 = _mm256_add_ps(gx2, v0p5fp8); + + gx2 = _mm256_and_ps(gx2, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx2_v = _mm256_and_ps(_mm256_sub_ps(gx2, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx2 = _mm256_sub_ps(vImgWf, reflectx2_v); + + gx2 = _mm256_sub_ps(gx2, v0p5fp8); + + _mm256_sub_ps(gx2, v0p5fp8); + + gx2 = _mm256_min_ps(border_x, 
_mm256_max_ps(gx2, _mm256_setzero_ps())); + + // x3 + gx3 = _mm256_add_ps(gx3, v0p5fp8); + + gx3 = _mm256_and_ps(gx3, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx3_v = _mm256_and_ps(_mm256_sub_ps(gx3, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx3 = _mm256_sub_ps(vImgWf, reflectx3_v); + + gx3 = _mm256_sub_ps(gx3, v0p5fp8); + + _mm256_sub_ps(gx3, v0p5fp8); + + gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); + } + + __m256i x0 = _mm256_cvtps_epi32(gx0); + __m256i x1 = _mm256_cvtps_epi32(gx1); + __m256i x2 = _mm256_cvtps_epi32(gx2); + __m256i x3 = _mm256_cvtps_epi32(gx3); + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + + { + //y + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_add_ps(gy, v0p5fp8); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(vImgHf, reflecty_v); + + gy = _mm256_sub_ps(gy, v0p5fp8); + + _mm256_sub_ps(gy, v0p5fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } + + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < bottom_blob.c; q++) + { + for (int i = 0; i < 4; i++) + { + __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v0_offset[i], vn1fp8, sizeof(float)); + __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v1_offset[i], vn1fp8, sizeof(float)); + __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v2_offset[i], vn1fp8, sizeof(float)); + __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v3_offset[i], vn1fp8, sizeof(float)); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = ((sample_x + 1) * w - 1) / 2.f; + sample_y = ((sample_y + 1) * h - 1) / 2.f; + + int x_floor = floor(sample_x); + int y_floor = floor(sample_y); + + int x1 = x_floor; + int y1 = y_floor; + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + x0 = static_cast(reflect_coord(x0 + 0.5, w) - 0.5); + + y0 = static_cast(reflect_coord(y0 + 0.5, h) - 0.5); + + x0 = std::min(w - 1, std::max(x0, 0)); + y0 = std::min(h - 1, std::max(y0, 0)); + + x1 = static_cast(reflect_coord(x1 + 0.5, w) - 0.5); + + y1 = static_cast(reflect_coord(y1 + 0.5, h) - 0.5); + + x1 = std::min(w - 1, std::max(x1, 0)); + y1 = std::min(h - 1, std::max(y1, 0)); + + x2 = static_cast(reflect_coord(x2 + 0.5, w) - 0.5); + + y2 = static_cast(reflect_coord(y2 + 0.5, h) - 0.5); + + x2 = 
std::min(w - 1, std::max(x2, 0)); + y2 = std::min(h - 1, std::max(y2, 0)); + + x3 = static_cast(reflect_coord(x3 + 0.5, w) - 0.5); + + y3 = static_cast(reflect_coord(y3 + 0.5, h) - 0.5); + + x3 = std::min(w - 1, std::max(x3, 0)); + y3 = std::min(h - 1, std::max(y3, 0)); + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + + float v00 = image.row(y0)[x0]; + float v01 = image.row(y0)[x1]; + float v02 = image.row(y0)[x2]; + float v03 = image.row(y0)[x3]; + float v10 = image.row(y1)[x0]; + float v11 = image.row(y1)[x1]; + float v12 = image.row(y1)[x2]; + float v13 = image.row(y1)[x3]; + float v20 = image.row(y2)[x0]; + float v21 = image.row(y2)[x1]; + float v22 = image.row(y2)[x2]; + float v23 = image.row(y2)[x3]; + float v30 = image.row(y3)[x0]; + float v31 = image.row(y3)[x1]; + float v32 = image.row(y3)[x2]; + float v33 = image.row(y3)[x3]; + + float x_coeffs[4]; + float y_coeffs[4]; + interpolate_cubic(sample_x - x_floor, x_coeffs); + interpolate_cubic(sample_y - y_floor, y_coeffs); + + float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; + float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; + float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; + float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; + + top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + } + } + } + } + else + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) + { + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + const __m256 two = _mm256_set1_ps(2.f); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); + + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + const __m256 v0p5fp8 = _mm256_set1_ps(0.5f); + { + // x0 + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + + gx0 = _mm256_and_ps(gx0, *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, border_x), *(__m256*)_ps256_inv_sign_mask); + gx0 = _mm256_sub_ps(border_x, reflectx0_v); + + // x1 + gx1 = _mm256_and_ps(gx1, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx1_v = _mm256_and_ps(_mm256_sub_ps(gx1, border_x), *(__m256*)_ps256_inv_sign_mask); + gx1 = _mm256_sub_ps(border_x, reflectx1_v); + + // x2 + gx2 = _mm256_and_ps(gx2, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx2_v = _mm256_and_ps(_mm256_sub_ps(gx2, border_x), *(__m256*)_ps256_inv_sign_mask); 
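                            // Branch-free reflection (align_corner != 0): each tap is reflected about the
                            // valid range via x = border - |(|x| - border)|, with border = size - 1.
                            // _ps256_inv_sign_mask clears the IEEE-754 sign bit, so
                            // _mm256_and_ps(v, inv_sign_mask) acts as a vector fabs(). A scalar
                            // equivalent, valid for coordinates within one reflection period
                            // (illustration only):
                            //   float reflect(float x, float border) { return border - fabsf(fabsf(x) - border); }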
+ gx2 = _mm256_sub_ps(border_x, reflectx2_v); + + // x3 + gx3 = _mm256_and_ps(gx3, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx3_v = _mm256_and_ps(_mm256_sub_ps(gx3, border_x), *(__m256*)_ps256_inv_sign_mask); + gx3 = _mm256_sub_ps(border_x, reflectx3_v); + } + + __m256i x0 = _mm256_cvtps_epi32(gx0); + __m256i x1 = _mm256_cvtps_epi32(gx1); + __m256i x2 = _mm256_cvtps_epi32(gx2); + __m256i x3 = _mm256_cvtps_epi32(gx3); + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + + { + //y + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(border_y, reflecty_v); + } + + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < bottom_blob.c; q++) + { + for (int i = 0; i < 4; i++) + { + __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v0_offset[i], vn1fp8, sizeof(float)); + __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v1_offset[i], vn1fp8, sizeof(float)); + __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v2_offset[i], vn1fp8, sizeof(float)); + __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v3_offset[i], vn1fp8, sizeof(float)); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = (sample_x + 1) / 2.f * (w - 1); + sample_y = (sample_y + 1) / 2.f * (h - 1); + + int x_floor = floor(sample_x); + int y_floor = floor(sample_y); + + int x1 = x_floor; + int y1 = y_floor; + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + x0 = static_cast(reflect_coord(x0, w - 1)); + y0 = static_cast(reflect_coord(y0, h - 1)); + x1 = static_cast(reflect_coord(x1, w - 1)); + y1 = static_cast(reflect_coord(y1, h - 1)); + x2 = static_cast(reflect_coord(x2, w - 1)); + y2 = static_cast(reflect_coord(y2, h - 1)); + x3 = static_cast(reflect_coord(x3, w - 1)); + y3 = static_cast(reflect_coord(y3, h - 1)); + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + + float v00 = image.row(y0)[x0]; + float v01 = image.row(y0)[x1]; + float v02 = image.row(y0)[x2]; + float v03 = image.row(y0)[x3]; + float v10 = image.row(y1)[x0]; + float v11 = image.row(y1)[x1]; + float v12 = image.row(y1)[x2]; + float v13 = image.row(y1)[x3]; + float v20 = image.row(y2)[x0]; + float v21 = image.row(y2)[x1]; + float v22 = image.row(y2)[x2]; + float v23 = image.row(y2)[x3]; + float v30 = 
image.row(y3)[x0]; + float v31 = image.row(y3)[x1]; + float v32 = image.row(y3)[x2]; + float v33 = image.row(y3)[x3]; + + float x_coeffs[4]; + float y_coeffs[4]; + interpolate_cubic(sample_x - x_floor, x_coeffs); + interpolate_cubic(sample_y - y_floor, y_coeffs); + + float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; + float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; + float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; + float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; + + top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + } + } + } + } + + } + else + { + NCNN_LOGE("gridsample padding_mode error\n"); + return -100; + } + } + } + + if (dims == 4) + { + return GridSample::forward(bottom_blobs, top_blobs, opt); + int size = w * h * d; + + top_blob.create(grid.h, grid.d, grid.c* grid.elempack, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + if (sample_type == 1) + { + } else if (sample_type == 2) { From 36ae964a081f2f480b8ef54fcf30610b6685eb25 Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Thu, 24 Nov 2022 16:40:09 +0000 Subject: [PATCH 031/127] apply code-format changes --- src/layer/x86/gridsample_x86.cpp | 2469 +++++++++++++++--------------- 1 file changed, 1228 insertions(+), 1241 deletions(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 52456145cf2..e07bd1b0f13 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -902,7 +902,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&v_in_range), sizeof(float)); + for (int q = 0; q < bottom_blob.c; q++) + { + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } } - } - nn = grid_size & 15; + nn = grid_size & 15; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; - sample_x = ((sample_x + 1) * w - 1) / 2.f; - sample_y = ((sample_y + 1) * h - 1) / 2.f; + sample_x = ((sample_x + 1) * w - 1) / 2.f; + sample_y = ((sample_y + 1) * h - 1) / 2.f; - int x0 = static_cast(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); - int v00_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); + int v00_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); - top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0] * v00_in_range; + top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0] * v00_in_range; + } } } } 
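Throughout this x86 kernel the grid channel stores interleaved (x, y) pairs, so one AVX iteration consumes 16 floats (8 sample points), the scalar tail picks up the remaining `grid_size & 15` floats, and the destination column is x / 2. A minimal scalar model of the path above (nearest sampling, zeros padding, align_corner == 0) might look like the following; the function and parameter names are illustrative, and element packing and the per-channel loop are omitted:

#include <math.h>

// Scalar model of one output row: nearest + zeros padding, align_corner == 0.
static void sample_row_nearest_zeros(const float* gridptr, int grid_size,
                                     const float* image, int w, int h,
                                     float* outrow)
{
    for (int x = 0; x + 1 < grid_size; x += 2)
    {
        // unnormalize: align_corner == 0 maps [-1, 1] to [-0.5, size - 0.5]
        float sx = ((gridptr[x] + 1.f) * w - 1.f) / 2.f;
        float sy = ((gridptr[x + 1] + 1.f) * h - 1.f) / 2.f;

        int x0 = (int)floorf(sx + 0.5f); // round to the nearest texel
        int y0 = (int)floorf(sy + 0.5f);

        int in_range = (x0 >= 0) & (x0 < w) & (y0 >= 0) & (y0 < h);
        outrow[x / 2] = in_range ? image[y0 * w + x0] : 0.f; // zeros padding
    }
}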
- } - else - { -#pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) + else { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = gx; + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); + // compute coord + { + const __m256 two = _mm256_set1_ps(2.f); - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); - } + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + } - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - __m256i ix = _mm256_cvtps_epi32(gx); - __m256i iy = _mm256_cvtps_epi32(gy); + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); - __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), + _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); - for (int q = 0; q < bottom_blob.c; q++) - { - __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + for (int q = 0; q < bottom_blob.c; q++) + { + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } } - } - nn = grid_size & 15; + nn = grid_size & 15; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; - sample_x = (sample_x + 1) / 2.f * (w - 1); - sample_y = (sample_y + 1) / 2.f * (h 
- 1); + sample_x = (sample_x + 1) / 2.f * (w - 1); + sample_y = (sample_y + 1) / 2.f * (h - 1); - int x0 = static_cast(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); - int v00_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); + int v00_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); - top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0] * v00_in_range; + top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0] * v00_in_range; + } } } } } - } - else if (padding_mode == 2) - { - if (align_corner == 0) + else if (padding_mode == 2) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) + if (align_corner == 0) { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = gx; + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + // compute coord + { + const __m256 two = _mm256_set1_ps(2.f); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } - __m256i ix = _mm256_cvtps_epi32(gx); - __m256i iy = _mm256_cvtps_epi32(gy); + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); + __m256i i_offset = 
_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); - for (int q = 0; q < bottom_blob.c; q++) - { - __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + for (int q = 0; q < bottom_blob.c; q++) + { + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } } - } - nn = grid_size & 15; + nn = grid_size & 15; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; - sample_x = ((sample_x + 1) * w - 1) / 2.f; - sample_y = ((sample_y + 1) * h - 1) / 2.f; + sample_x = ((sample_x + 1) * w - 1) / 2.f; + sample_y = ((sample_y + 1) * h - 1) / 2.f; - sample_x = std::min(w - 1.0f, std::max(sample_x, 0.0f)); - sample_y = std::min(h - 1.0f, std::max(sample_y, 0.0f)); + sample_x = std::min(w - 1.0f, std::max(sample_x, 0.0f)); + sample_y = std::min(h - 1.0f, std::max(sample_y, 0.0f)); - int x0 = static_cast(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); - top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0]; + top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0]; + } } } } - } - else - { -#pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) + else { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = gx; + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + // compute coord + { + const __m256 two = _mm256_set1_ps(2.f); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, 
_mm256_setzero_ps())); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } - __m256i ix = _mm256_cvtps_epi32(gx); - __m256i iy = _mm256_cvtps_epi32(gy); + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); - for (int q = 0; q < bottom_blob.c; q++) - { - __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + for (int q = 0; q < bottom_blob.c; q++) + { + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } } - } - nn = grid_size & 15; + nn = grid_size & 15; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; - sample_x = (sample_x + 1) / 2.f * (w - 1); - sample_y = (sample_y + 1) / 2.f * (h - 1); + sample_x = (sample_x + 1) / 2.f * (w - 1); + sample_y = (sample_y + 1) / 2.f * (h - 1); - sample_x = std::min(w - 1.0f, std::max(sample_x, 0.0f)); - sample_y = std::min(h - 1.0f, std::max(sample_y, 0.0f)); + sample_x = std::min(w - 1.0f, std::max(sample_x, 0.0f)); + sample_y = std::min(h - 1.0f, std::max(sample_y, 0.0f)); - int x0 = static_cast(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); - top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0]; + top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0]; + } } } } } - - } - else if (padding_mode == 3) - { - if (align_corner == 0) + else if (padding_mode == 3) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) + if (align_corner == 0) { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = gx; + __m256 gx = 
_mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - const __m256 two = _mm256_set1_ps(2.f); - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + const __m256 two = _mm256_set1_ps(2.f); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - // compute coord - { - // x - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + // compute coord + { + // x + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - __m256 v0p5fp8 = _mm256_set1_ps(0.5f); - gx = _mm256_add_ps(gx, v0p5fp8); + __m256 v0p5fp8 = _mm256_set1_ps(0.5f); + gx = _mm256_add_ps(gx, v0p5fp8); - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(vImgWf, reflectx_v); + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(vImgWf, reflectx_v); - gx = _mm256_sub_ps(gx, v0p5fp8); + gx = _mm256_sub_ps(gx, v0p5fp8); - _mm256_sub_ps(gx, v0p5fp8); + _mm256_sub_ps(gx, v0p5fp8); - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - // y - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); - - gy = _mm256_add_ps(gy, v0p5fp8); + // y + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_add_ps(gy, v0p5fp8); - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(vImgHf, reflecty_v); + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(gy, v0p5fp8); + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(vImgHf, reflecty_v); - _mm256_sub_ps(gy, v0p5fp8); + gy = _mm256_sub_ps(gy, v0p5fp8); - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } + _mm256_sub_ps(gy, v0p5fp8); - __m256i ix = _mm256_cvtps_epi32(gx); - __m256i iy = _mm256_cvtps_epi32(gy); + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); - for (int q = 0; q < bottom_blob.c; q++) - { - __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + for (int q = 0; q < bottom_blob.c; q++) + { + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); - 
_mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } } - } - nn = grid_size & 15; + nn = grid_size & 15; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; - sample_x = ((sample_x + 1) * w - 1) / 2.f; - sample_y = ((sample_y + 1) * h - 1) / 2.f; + sample_x = ((sample_x + 1) * w - 1) / 2.f; + sample_y = ((sample_y + 1) * h - 1) / 2.f; - sample_x = floor(sample_x + 0.5f); - sample_y = floor(sample_y + 0.5f); + sample_x = floor(sample_x + 0.5f); + sample_y = floor(sample_y + 0.5f); - sample_x = abs(sample_x + 0.5f); - sample_x = w - abs(sample_x - w) - 0.5; + sample_x = abs(sample_x + 0.5f); + sample_x = w - abs(sample_x - w) - 0.5; - sample_y = abs(sample_y + 0.5f); - sample_y = h - abs(sample_y - h) - 0.5; + sample_y = abs(sample_y + 0.5f); + sample_y = h - abs(sample_y - h) - 0.5; - int x0 = std::min(w - 1.0f, std::max(sample_x, 0.0f)); - int y0 = std::min(h - 1.0f, std::max(sample_y, 0.0f)); + int x0 = std::min(w - 1.0f, std::max(sample_x, 0.0f)); + int y0 = std::min(h - 1.0f, std::max(sample_y, 0.0f)); - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); - top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0]; + top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0]; + } } } } - } - else - { -#pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) + else { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = gx; + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; - const __m256 two = _mm256_set1_ps(2.f); - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + const __m256 two = _mm256_set1_ps(2.f); + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); - // compute coord - { - // x - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gx = _mm256_and_ps(gx, 
*(__m256*)_ps256_inv_sign_mask); + // compute coord + { + // x + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(border_x, reflectx_v); + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - // y - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(border_x, reflectx_v); - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + // y + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(border_y, reflecty_v); - } + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - __m256i ix = _mm256_cvtps_epi32(gx); - __m256i iy = _mm256_cvtps_epi32(gy); + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(border_y, reflecty_v); + } + __m256i ix = _mm256_cvtps_epi32(gx); + __m256i iy = _mm256_cvtps_epi32(gy); - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix); - for (int q = 0; q < bottom_blob.c; q++) - { - __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + for (int q = 0; q < bottom_blob.c; q++) + { + __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } } - } - nn = grid_size & 15; + nn = grid_size & 15; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; - sample_x = (sample_x + 1) / 2.f * (w - 1); - sample_y = (sample_y + 1) / 2.f * (h - 1); + sample_x = (sample_x + 1) / 2.f * (w - 1); + sample_y = (sample_y + 1) / 2.f * (h - 1); - sample_x = floor(sample_x + 0.5f); - sample_y = floor(sample_y + 0.5f); + sample_x = floor(sample_x + 0.5f); + sample_y = floor(sample_y + 0.5f); - sample_x = abs(sample_x); - int x0 = (w - 1) - abs(sample_x - (w - 1)); + sample_x = abs(sample_x); + int x0 = (w - 1) - abs(sample_x - (w - 1)); - sample_y = abs(sample_y); - int y0 = (h - 1) - abs(sample_y - (h - 1)); + sample_y = abs(sample_y); + int y0 = (h - 1) - abs(sample_y - (h - 1)); - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); - top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0]; + top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0]; + } } } } } - - } - else - { - NCNN_LOGE("gridsample padding_mode error\n"); - return -100; - } + else + { + NCNN_LOGE("gridsample padding_mode error\n"); + return -100; + } } else if (sample_type == 3) { - if (padding_mode == 1) - { - if (align_corner == 0) + if (padding_mode == 1) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) + if (align_corner == 0) { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) + #pragma omp parallel 
for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = gx; + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); + // compute coord + { + const __m256 two = _mm256_set1_ps(2.f); - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); - } + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + } - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); - __m256 coefficients[4]; + __m256 coefficients[4]; - __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - __m256i x0 = _mm256_cvtps_epi32(gx0); - __m256i x1 = _mm256_cvtps_epi32(gx1); - __m256i x2 = _mm256_cvtps_epi32(gx2); - __m256i x3 = _mm256_cvtps_epi32(gx3); + __m256i x0 = _mm256_cvtps_epi32(gx0); + __m256i x1 = _mm256_cvtps_epi32(gx1); + __m256i x2 = _mm256_cvtps_epi32(gx2); + __m256i x3 = _mm256_cvtps_epi32(gx3); - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x2)); - __m256i x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x2)); + __m256i x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], - v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + __m256i v0_offset[4], 
v1_offset[4], v2_offset[4], v3_offset[4], + v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - __m256i y = _mm256_cvtps_epi32(gy); + __m256i y = _mm256_cvtps_epi32(gy); - __m256i y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y)); + __m256i y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y)); - v0_in_range[i] = _mm256_and_si256(x0_in_range, y_in_range); - v1_in_range[i] = _mm256_and_si256(x1_in_range, y_in_range); - v2_in_range[i] = _mm256_and_si256(x2_in_range, y_in_range); - v3_in_range[i] = _mm256_and_si256(x3_in_range, y_in_range); + v0_in_range[i] = _mm256_and_si256(x0_in_range, y_in_range); + v1_in_range[i] = _mm256_and_si256(x1_in_range, y_in_range); + v2_in_range[i] = _mm256_and_si256(x2_in_range, y_in_range); + v3_in_range[i] = _mm256_and_si256(x3_in_range, y_in_range); - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } - for (int q = 0; q < bottom_blob.c; q++) - { - for (int i = 0; i < 4; i++) + for (int q = 0; q < bottom_blob.c; q++) { - __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v0_offset[i], *reinterpret_cast<__m256*>(&v0_in_range[i]), sizeof(float)); - __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i]), sizeof(float)); - __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i]), sizeof(float)); - __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i]), sizeof(float)); + for (int i = 0; i < 4; i++) + { + __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v0_offset[i], *reinterpret_cast<__m256*>(&v0_in_range[i]), sizeof(float)); + __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i]), sizeof(float)); + __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i]), sizeof(float)); + __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i]), sizeof(float)); - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, 
tx); + } - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } } - } - nn = grid_size & 15; + nn = grid_size & 15; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = ((sample_x + 1) * w - 1) / 2.f; - sample_y = ((sample_y + 1) * h - 1) / 2.f; - - int x1 = floor(sample_x); - int y1 = floor(sample_y); - int x0 = x1 - 1; - int y0 = y1 - 1; - int x2 = x1 + 1; - int y2 = y1 + 1; - int x3 = x1 + 2; - int y3 = y1 + 2; - - bool x1_in_range = (x1 > -1) & (x1 < w); - bool y1_in_range = (y1 > -1) & (y1 < h); - bool x0_in_range = (x0 > -1) & (x0 < w); - bool y0_in_range = (y0 > -1) & (y0 < h); - bool x2_in_range = (x2 > -1) & (x2 < w); - bool y2_in_range = (y2 > -1) & (y2 < h); - bool x3_in_range = (x3 > -1) & (x3 < w); - bool y3_in_range = (y3 > -1) & (y3 < h); - - bool v00_in_range = x0_in_range * y0_in_range; - bool v01_in_range = x1_in_range * y0_in_range; - bool v02_in_range = x2_in_range * y0_in_range; - bool v03_in_range = x3_in_range * y0_in_range; - bool v10_in_range = x0_in_range * y1_in_range; - bool v11_in_range = x1_in_range * y1_in_range; - bool v12_in_range = x2_in_range * y1_in_range; - bool v13_in_range = x3_in_range * y1_in_range; - bool v20_in_range = x0_in_range * y2_in_range; - bool v21_in_range = x1_in_range * y2_in_range; - bool v22_in_range = x2_in_range * y2_in_range; - bool v23_in_range = x3_in_range * y2_in_range; - bool v30_in_range = x0_in_range * y3_in_range; - bool v31_in_range = x1_in_range * y3_in_range; - bool v32_in_range = x2_in_range * y3_in_range; - bool v33_in_range = x3_in_range * y3_in_range; - - for (int q = 0; q < channels; q++) + for (int x = grid_size - nn; x < grid_size; x += 2) { - const Mat& image = bottom_blob.channel(q); - - float v00 = v00_in_range ? image.row(y0)[x0] : 0; - float v01 = v01_in_range ? image.row(y0)[x1] : 0; - float v02 = v02_in_range ? image.row(y0)[x2] : 0; - float v03 = v03_in_range ? image.row(y0)[x3] : 0; - float v10 = v10_in_range ? image.row(y1)[x0] : 0; - float v11 = v11_in_range ? image.row(y1)[x1] : 0; - float v12 = v12_in_range ? image.row(y1)[x2] : 0; - float v13 = v13_in_range ? image.row(y1)[x3] : 0; - float v20 = v20_in_range ? image.row(y2)[x0] : 0; - float v21 = v21_in_range ? image.row(y2)[x1] : 0; - float v22 = v22_in_range ? image.row(y2)[x2] : 0; - float v23 = v23_in_range ? image.row(y2)[x3] : 0; - float v30 = v30_in_range ? image.row(y3)[x0] : 0; - float v31 = v31_in_range ? image.row(y3)[x1] : 0; - float v32 = v32_in_range ? image.row(y3)[x2] : 0; - float v33 = v33_in_range ? 
image.row(y3)[x3] : 0; - - float x_coeffs[4]; - float y_coeffs[4]; - interpolate_cubic(sample_x - x1, x_coeffs); - interpolate_cubic(sample_y - y1, y_coeffs); - - float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; - float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; - float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; - float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; - - top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = ((sample_x + 1) * w - 1) / 2.f; + sample_y = ((sample_y + 1) * h - 1) / 2.f; + + int x1 = floor(sample_x); + int y1 = floor(sample_y); + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + bool x1_in_range = (x1 > -1) & (x1 < w); + bool y1_in_range = (y1 > -1) & (y1 < h); + bool x0_in_range = (x0 > -1) & (x0 < w); + bool y0_in_range = (y0 > -1) & (y0 < h); + bool x2_in_range = (x2 > -1) & (x2 < w); + bool y2_in_range = (y2 > -1) & (y2 < h); + bool x3_in_range = (x3 > -1) & (x3 < w); + bool y3_in_range = (y3 > -1) & (y3 < h); + + bool v00_in_range = x0_in_range * y0_in_range; + bool v01_in_range = x1_in_range * y0_in_range; + bool v02_in_range = x2_in_range * y0_in_range; + bool v03_in_range = x3_in_range * y0_in_range; + bool v10_in_range = x0_in_range * y1_in_range; + bool v11_in_range = x1_in_range * y1_in_range; + bool v12_in_range = x2_in_range * y1_in_range; + bool v13_in_range = x3_in_range * y1_in_range; + bool v20_in_range = x0_in_range * y2_in_range; + bool v21_in_range = x1_in_range * y2_in_range; + bool v22_in_range = x2_in_range * y2_in_range; + bool v23_in_range = x3_in_range * y2_in_range; + bool v30_in_range = x0_in_range * y3_in_range; + bool v31_in_range = x1_in_range * y3_in_range; + bool v32_in_range = x2_in_range * y3_in_range; + bool v33_in_range = x3_in_range * y3_in_range; + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + + float v00 = v00_in_range ? image.row(y0)[x0] : 0; + float v01 = v01_in_range ? image.row(y0)[x1] : 0; + float v02 = v02_in_range ? image.row(y0)[x2] : 0; + float v03 = v03_in_range ? image.row(y0)[x3] : 0; + float v10 = v10_in_range ? image.row(y1)[x0] : 0; + float v11 = v11_in_range ? image.row(y1)[x1] : 0; + float v12 = v12_in_range ? image.row(y1)[x2] : 0; + float v13 = v13_in_range ? image.row(y1)[x3] : 0; + float v20 = v20_in_range ? image.row(y2)[x0] : 0; + float v21 = v21_in_range ? image.row(y2)[x1] : 0; + float v22 = v22_in_range ? image.row(y2)[x2] : 0; + float v23 = v23_in_range ? image.row(y2)[x3] : 0; + float v30 = v30_in_range ? image.row(y3)[x0] : 0; + float v31 = v31_in_range ? image.row(y3)[x1] : 0; + float v32 = v32_in_range ? image.row(y3)[x2] : 0; + float v33 = v33_in_range ? 
image.row(y3)[x3] : 0; + + float x_coeffs[4]; + float y_coeffs[4]; + interpolate_cubic(sample_x - x1, x_coeffs); + interpolate_cubic(sample_y - y1, y_coeffs); + + float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; + float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; + float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; + float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; + + top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + } } } } - } - else - { -#pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) + else { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = gx; + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); + // compute coord + { + const __m256 two = _mm256_set1_ps(2.f); - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); - } + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + } - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); - __m256 coefficients[4]; + __m256 coefficients[4]; - __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - __m256i x0 = _mm256_cvtps_epi32(gx0); - __m256i x1 = _mm256_cvtps_epi32(gx1); - __m256i x2 = _mm256_cvtps_epi32(gx2); - __m256i x3 = _mm256_cvtps_epi32(gx3); + __m256i x0 = _mm256_cvtps_epi32(gx0); + __m256i x1 = _mm256_cvtps_epi32(gx1); + __m256i x2 = _mm256_cvtps_epi32(gx2); + __m256i x3 = _mm256_cvtps_epi32(gx3); - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = 
_mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x2)); - __m256i x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x2)); + __m256i x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], - v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], + v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - __m256i y = _mm256_cvtps_epi32(gy); + __m256i y = _mm256_cvtps_epi32(gy); - __m256i y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y)); + __m256i y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y)); - v0_in_range[i] = _mm256_and_si256(x0_in_range, y_in_range); - v1_in_range[i] = _mm256_and_si256(x1_in_range, y_in_range); - v2_in_range[i] = _mm256_and_si256(x2_in_range, y_in_range); - v3_in_range[i] = _mm256_and_si256(x3_in_range, y_in_range); + v0_in_range[i] = _mm256_and_si256(x0_in_range, y_in_range); + v1_in_range[i] = _mm256_and_si256(x1_in_range, y_in_range); + v2_in_range[i] = _mm256_and_si256(x2_in_range, y_in_range); + v3_in_range[i] = _mm256_and_si256(x3_in_range, y_in_range); - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } - for (int q = 0; q < bottom_blob.c; q++) - { - for (int i = 0; i < 4; i++) + for (int q = 0; q < bottom_blob.c; q++) { - __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v0_offset[i], *reinterpret_cast<__m256*>(&v0_in_range[i]), sizeof(float)); - __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i]), sizeof(float)); - __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i]), sizeof(float)); - __m256 x3_val = 
_mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i]), sizeof(float)); + for (int i = 0; i < 4; i++) + { + __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v0_offset[i], *reinterpret_cast<__m256*>(&v0_in_range[i]), sizeof(float)); + __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i]), sizeof(float)); + __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i]), sizeof(float)); + __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i]), sizeof(float)); - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } } - } - nn = grid_size & 15; + nn = grid_size & 15; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = (sample_x + 1) / 2.f * (w - 1); - sample_y = (sample_y + 1) / 2.f * (h - 1); - - int x1 = floor(sample_x); - int y1 = floor(sample_y); - int x0 = x1 - 1; - int y0 = y1 - 1; - int x2 = x1 + 1; - int y2 = y1 + 1; - int x3 = x1 + 2; - int y3 = y1 + 2; - - bool x1_in_range = (x1 > -1) & (x1 < w); - bool y1_in_range = (y1 > -1) & (y1 < h); - bool x0_in_range = (x0 > -1) & (x0 < w); - bool y0_in_range = (y0 > -1) & (y0 < h); - bool x2_in_range = (x2 > -1) & (x2 < w); - bool y2_in_range = (y2 > -1) & (y2 < h); - bool x3_in_range = (x3 > -1) & (x3 < w); - bool y3_in_range = (y3 > -1) & (y3 < h); - - bool v00_in_range = x0_in_range * y0_in_range; - bool v01_in_range = x1_in_range * y0_in_range; - bool v02_in_range = x2_in_range * y0_in_range; - bool v03_in_range = x3_in_range * y0_in_range; - bool v10_in_range = x0_in_range * y1_in_range; - bool v11_in_range = x1_in_range * y1_in_range; - bool v12_in_range = x2_in_range * y1_in_range; - bool v13_in_range = x3_in_range * y1_in_range; - bool v20_in_range = x0_in_range * y2_in_range; - bool v21_in_range = x1_in_range * y2_in_range; - bool v22_in_range = x2_in_range * y2_in_range; - bool v23_in_range = x3_in_range * y2_in_range; - bool v30_in_range = x0_in_range * y3_in_range; - bool v31_in_range = x1_in_range * y3_in_range; - bool v32_in_range = x2_in_range * y3_in_range; - bool v33_in_range = x3_in_range * y3_in_range; - - for (int q = 0; q < channels; q++) + for (int x = grid_size - nn; x < grid_size; x += 2) { - const Mat& image = bottom_blob.channel(q); - - float v00 = v00_in_range ? image.row(y0)[x0] : 0; - float v01 = v01_in_range ? image.row(y0)[x1] : 0; - float v02 = v02_in_range ? image.row(y0)[x2] : 0; - float v03 = v03_in_range ? image.row(y0)[x3] : 0; - float v10 = v10_in_range ? image.row(y1)[x0] : 0; - float v11 = v11_in_range ? image.row(y1)[x1] : 0; - float v12 = v12_in_range ? image.row(y1)[x2] : 0; - float v13 = v13_in_range ? image.row(y1)[x3] : 0; - float v20 = v20_in_range ? image.row(y2)[x0] : 0; - float v21 = v21_in_range ? 
image.row(y2)[x1] : 0; - float v22 = v22_in_range ? image.row(y2)[x2] : 0; - float v23 = v23_in_range ? image.row(y2)[x3] : 0; - float v30 = v30_in_range ? image.row(y3)[x0] : 0; - float v31 = v31_in_range ? image.row(y3)[x1] : 0; - float v32 = v32_in_range ? image.row(y3)[x2] : 0; - float v33 = v33_in_range ? image.row(y3)[x3] : 0; - - float x_coeffs[4]; - float y_coeffs[4]; - interpolate_cubic(sample_x - x1, x_coeffs); - interpolate_cubic(sample_y - y1, y_coeffs); - - float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; - float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; - float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; - float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; - - top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = (sample_x + 1) / 2.f * (w - 1); + sample_y = (sample_y + 1) / 2.f * (h - 1); + + int x1 = floor(sample_x); + int y1 = floor(sample_y); + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + bool x1_in_range = (x1 > -1) & (x1 < w); + bool y1_in_range = (y1 > -1) & (y1 < h); + bool x0_in_range = (x0 > -1) & (x0 < w); + bool y0_in_range = (y0 > -1) & (y0 < h); + bool x2_in_range = (x2 > -1) & (x2 < w); + bool y2_in_range = (y2 > -1) & (y2 < h); + bool x3_in_range = (x3 > -1) & (x3 < w); + bool y3_in_range = (y3 > -1) & (y3 < h); + + bool v00_in_range = x0_in_range * y0_in_range; + bool v01_in_range = x1_in_range * y0_in_range; + bool v02_in_range = x2_in_range * y0_in_range; + bool v03_in_range = x3_in_range * y0_in_range; + bool v10_in_range = x0_in_range * y1_in_range; + bool v11_in_range = x1_in_range * y1_in_range; + bool v12_in_range = x2_in_range * y1_in_range; + bool v13_in_range = x3_in_range * y1_in_range; + bool v20_in_range = x0_in_range * y2_in_range; + bool v21_in_range = x1_in_range * y2_in_range; + bool v22_in_range = x2_in_range * y2_in_range; + bool v23_in_range = x3_in_range * y2_in_range; + bool v30_in_range = x0_in_range * y3_in_range; + bool v31_in_range = x1_in_range * y3_in_range; + bool v32_in_range = x2_in_range * y3_in_range; + bool v33_in_range = x3_in_range * y3_in_range; + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + + float v00 = v00_in_range ? image.row(y0)[x0] : 0; + float v01 = v01_in_range ? image.row(y0)[x1] : 0; + float v02 = v02_in_range ? image.row(y0)[x2] : 0; + float v03 = v03_in_range ? image.row(y0)[x3] : 0; + float v10 = v10_in_range ? image.row(y1)[x0] : 0; + float v11 = v11_in_range ? image.row(y1)[x1] : 0; + float v12 = v12_in_range ? image.row(y1)[x2] : 0; + float v13 = v13_in_range ? image.row(y1)[x3] : 0; + float v20 = v20_in_range ? image.row(y2)[x0] : 0; + float v21 = v21_in_range ? image.row(y2)[x1] : 0; + float v22 = v22_in_range ? image.row(y2)[x2] : 0; + float v23 = v23_in_range ? image.row(y2)[x3] : 0; + float v30 = v30_in_range ? image.row(y3)[x0] : 0; + float v31 = v31_in_range ? image.row(y3)[x1] : 0; + float v32 = v32_in_range ? image.row(y3)[x2] : 0; + float v33 = v33_in_range ? 
image.row(y3)[x3] : 0; + + float x_coeffs[4]; + float y_coeffs[4]; + interpolate_cubic(sample_x - x1, x_coeffs); + interpolate_cubic(sample_y - y1, y_coeffs); + + float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; + float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; + float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; + float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; + + top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + } } } } } - } - else if (padding_mode == 2) - { - if (align_corner == 0) + else if (padding_mode == 2) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) + if (align_corner == 0) { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = gx; + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; - const __m256 two = _mm256_set1_ps(2.f); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); + const __m256 two = _mm256_set1_ps(2.f); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); - __m256 coefficients[4]; + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); - __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + __m256 coefficients[4]; - gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); - gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); - gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); - gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); + __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx3 = 
_mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - __m256i x0 = _mm256_cvtps_epi32(gx0); - __m256i x1 = _mm256_cvtps_epi32(gx1); - __m256i x2 = _mm256_cvtps_epi32(gx2); - __m256i x3 = _mm256_cvtps_epi32(gx3); + gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); + gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); + gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); + gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); - - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } + __m256i x0 = _mm256_cvtps_epi32(gx0); + __m256i x1 = _mm256_cvtps_epi32(gx1); + __m256i x2 = _mm256_cvtps_epi32(gx2); + __m256i x3 = _mm256_cvtps_epi32(gx3); - for (int q = 0; q < bottom_blob.c; q++) - { + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; for (int i = 0; i < 4; i++) { - __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v0_offset[i], vn1fp8, sizeof(float)); - __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v1_offset[i], vn1fp8, sizeof(float)); - __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v2_offset[i], vn1fp8, sizeof(float)); - __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v3_offset[i], vn1fp8, sizeof(float)); + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); } - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + for (int q = 0; q < bottom_blob.c; q++) + { + for (int i = 0; i < 4; i++) + { + __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v0_offset[i], vn1fp8, sizeof(float)); + __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v1_offset[i], vn1fp8, sizeof(float)); + __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v2_offset[i], vn1fp8, sizeof(float)); + __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v3_offset[i], vn1fp8, sizeof(float)); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + 
__m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } } - } - nn = grid_size & 15; + nn = grid_size & 15; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - if (y == 1 && x == 24) + for (int x = grid_size - nn; x < grid_size; x += 2) { - int a = 10; - } + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; - sample_x = ((sample_x + 1) * w - 1) / 2.f; - sample_y = ((sample_y + 1) * h - 1) / 2.f; - - int x_floor = floor(sample_x); - int y_floor = floor(sample_y); - - int x1 = x_floor; - int y1 = y_floor; - int x0 = x1 - 1; - int y0 = y1 - 1; - int x2 = x1 + 1; - int y2 = y1 + 1; - int x3 = x1 + 2; - int y3 = y1 + 2; - - x1 = std::min(w - 1, std::max(x1, 0)); - y1 = std::min(h - 1, std::max(y1, 0)); - x0 = std::min(w - 1, std::max(x0, 0)); - y0 = std::min(h - 1, std::max(y0, 0)); - x2 = std::min(w - 1, std::max(x2, 0)); - y2 = std::min(h - 1, std::max(y2, 0)); - x3 = std::min(w - 1, std::max(x3, 0)); - y3 = std::min(h - 1, std::max(y3, 0)); - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - - float v00 = image.row(y0)[x0]; - float v01 = image.row(y0)[x1]; - float v02 = image.row(y0)[x2]; - float v03 = image.row(y0)[x3]; - float v10 = image.row(y1)[x0]; - float v11 = image.row(y1)[x1]; - float v12 = image.row(y1)[x2]; - float v13 = image.row(y1)[x3]; - float v20 = image.row(y2)[x0]; - float v21 = image.row(y2)[x1]; - float v22 = image.row(y2)[x2]; - float v23 = image.row(y2)[x3]; - float v30 = image.row(y3)[x0]; - float v31 = image.row(y3)[x1]; - float v32 = image.row(y3)[x2]; - float v33 = image.row(y3)[x3]; - - float x_coeffs[4]; - float y_coeffs[4]; - interpolate_cubic(sample_x - x_floor, x_coeffs); - interpolate_cubic(sample_y - y_floor, y_coeffs); - - float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; - float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; - float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; - float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; - - top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + if (y == 1 && x == 24) + { + int a = 10; + } + + sample_x = ((sample_x + 1) * w - 1) / 2.f; + sample_y = ((sample_y + 1) * h - 1) / 2.f; + + int x_floor = floor(sample_x); + int y_floor = floor(sample_y); + + int x1 = x_floor; + int y1 = y_floor; + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + x1 = std::min(w - 1, std::max(x1, 0)); + y1 = std::min(h - 1, std::max(y1, 0)); + x0 = std::min(w - 1, std::max(x0, 0)); + y0 = std::min(h - 1, std::max(y0, 0)); + x2 = std::min(w - 1, std::max(x2, 0)); + y2 = std::min(h - 1, std::max(y2, 0)); + x3 = std::min(w - 1, std::max(x3, 0)); + y3 = std::min(h - 1, std::max(y3, 0)); + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + + float v00 = image.row(y0)[x0]; + float v01 = image.row(y0)[x1]; + float v02 = image.row(y0)[x2]; + float v03 = image.row(y0)[x3]; + float v10 = image.row(y1)[x0]; + float v11 = image.row(y1)[x1]; + float v12 = image.row(y1)[x2]; + float v13 = image.row(y1)[x3]; + float v20 = image.row(y2)[x0]; + float v21 = image.row(y2)[x1]; + float v22 = 
image.row(y2)[x2]; + float v23 = image.row(y2)[x3]; + float v30 = image.row(y3)[x0]; + float v31 = image.row(y3)[x1]; + float v32 = image.row(y3)[x2]; + float v33 = image.row(y3)[x3]; + + float x_coeffs[4]; + float y_coeffs[4]; + interpolate_cubic(sample_x - x_floor, x_coeffs); + interpolate_cubic(sample_y - y_floor, y_coeffs); + + float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; + float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; + float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; + float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; + + top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + } } } } - } - else - { -#pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) + else { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = gx; + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; - const __m256 two = _mm256_set1_ps(2.f); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + const __m256 two = _mm256_set1_ps(2.f); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); - __m256 coefficients[4]; + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); - __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + __m256 coefficients[4]; - gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); - gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); - gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); - gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); + __m256 gx0 = _mm256_add_ps(gx_floor, 
vn1fp8); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - __m256i x0 = _mm256_cvtps_epi32(gx0); - __m256i x1 = _mm256_cvtps_epi32(gx1); - __m256i x2 = _mm256_cvtps_epi32(gx2); - __m256i x3 = _mm256_cvtps_epi32(gx3); + gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); + gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); + gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); + gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); - - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } + __m256i x0 = _mm256_cvtps_epi32(gx0); + __m256i x1 = _mm256_cvtps_epi32(gx1); + __m256i x2 = _mm256_cvtps_epi32(gx2); + __m256i x3 = _mm256_cvtps_epi32(gx3); - for (int q = 0; q < bottom_blob.c; q++) - { + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; for (int i = 0; i < 4; i++) { - __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v0_offset[i], vn1fp8, sizeof(float)); - __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v1_offset[i], vn1fp8, sizeof(float)); - __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v2_offset[i], vn1fp8, sizeof(float)); - __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v3_offset[i], vn1fp8, sizeof(float)); + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); } - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + for (int q = 0; q < bottom_blob.c; q++) + { + for (int i = 0; i < 4; i++) + { + __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v0_offset[i], vn1fp8, sizeof(float)); + __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v1_offset[i], vn1fp8, sizeof(float)); + __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v2_offset[i], vn1fp8, sizeof(float)); + __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v3_offset[i], vn1fp8, sizeof(float)); + + coefficients[i] = cubic_interp1d_p8(x0_val, 
x1_val, x2_val, x3_val, tx); + } - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } } - } - nn = grid_size & 15; + nn = grid_size & 15; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = (sample_x + 1) / 2.f * (w - 1); - sample_y = (sample_y + 1) / 2.f * (h - 1); - - int x_floor = floor(sample_x); - int y_floor = floor(sample_y); - - int x1 = x_floor; - int y1 = y_floor; - int x0 = x1 - 1; - int y0 = y1 - 1; - int x2 = x1 + 1; - int y2 = y1 + 1; - int x3 = x1 + 2; - int y3 = y1 + 2; - - x1 = std::min(w - 1, std::max(x1, 0)); - y1 = std::min(h - 1, std::max(y1, 0)); - x0 = std::min(w - 1, std::max(x0, 0)); - y0 = std::min(h - 1, std::max(y0, 0)); - x2 = std::min(w - 1, std::max(x2, 0)); - y2 = std::min(h - 1, std::max(y2, 0)); - x3 = std::min(w - 1, std::max(x3, 0)); - y3 = std::min(h - 1, std::max(y3, 0)); - - for (int q = 0; q < channels; q++) + for (int x = grid_size - nn; x < grid_size; x += 2) { - const Mat& image = bottom_blob.channel(q); - - float v00 = image.row(y0)[x0]; - float v01 = image.row(y0)[x1]; - float v02 = image.row(y0)[x2]; - float v03 = image.row(y0)[x3]; - float v10 = image.row(y1)[x0]; - float v11 = image.row(y1)[x1]; - float v12 = image.row(y1)[x2]; - float v13 = image.row(y1)[x3]; - float v20 = image.row(y2)[x0]; - float v21 = image.row(y2)[x1]; - float v22 = image.row(y2)[x2]; - float v23 = image.row(y2)[x3]; - float v30 = image.row(y3)[x0]; - float v31 = image.row(y3)[x1]; - float v32 = image.row(y3)[x2]; - float v33 = image.row(y3)[x3]; - - float x_coeffs[4]; - float y_coeffs[4]; - interpolate_cubic(sample_x - x_floor, x_coeffs); - interpolate_cubic(sample_y - y_floor, y_coeffs); - - float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; - float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; - float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; - float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; - - top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = (sample_x + 1) / 2.f * (w - 1); + sample_y = (sample_y + 1) / 2.f * (h - 1); + + int x_floor = floor(sample_x); + int y_floor = floor(sample_y); + + int x1 = x_floor; + int y1 = y_floor; + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + x1 = std::min(w - 1, std::max(x1, 0)); + y1 = std::min(h - 1, std::max(y1, 0)); + x0 = std::min(w - 1, std::max(x0, 0)); + y0 = std::min(h - 1, std::max(y0, 0)); + x2 = std::min(w - 1, std::max(x2, 0)); + y2 = std::min(h - 1, std::max(y2, 0)); + x3 = std::min(w - 1, std::max(x3, 0)); + y3 = std::min(h - 1, std::max(y3, 0)); + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + + float v00 = image.row(y0)[x0]; + float v01 = image.row(y0)[x1]; + float v02 = image.row(y0)[x2]; + float v03 = image.row(y0)[x3]; + float v10 = image.row(y1)[x0]; + float v11 = image.row(y1)[x1]; + float v12 = image.row(y1)[x2]; + float v13 = image.row(y1)[x3]; + float v20 = image.row(y2)[x0]; + float v21 = image.row(y2)[x1]; + float v22 = 
image.row(y2)[x2]; + float v23 = image.row(y2)[x3]; + float v30 = image.row(y3)[x0]; + float v31 = image.row(y3)[x1]; + float v32 = image.row(y3)[x2]; + float v33 = image.row(y3)[x3]; + + float x_coeffs[4]; + float y_coeffs[4]; + interpolate_cubic(sample_x - x_floor, x_coeffs); + interpolate_cubic(sample_y - y_floor, y_coeffs); + + float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; + float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; + float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; + float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; + + top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + } } } } } - - } - else if (padding_mode == 3) - { - if (align_corner == 0) + else if (padding_mode == 3) { -#pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) + if (align_corner == 0) { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = gx; + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; - const __m256 two = _mm256_set1_ps(2.f); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); + const __m256 two = _mm256_set1_ps(2.f); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); - __m256 coefficients[4]; + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); - __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - const __m256 v0p5fp8 = _mm256_set1_ps(0.5f); - { - // x0 - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + __m256 coefficients[4]; - gx0 = _mm256_add_ps(gx0, v0p5fp8); + __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 
gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + const __m256 v0p5fp8 = _mm256_set1_ps(0.5f); + { + // x0 + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - gx0 = _mm256_and_ps(gx0, *(__m256*)_ps256_inv_sign_mask); + gx0 = _mm256_add_ps(gx0, v0p5fp8); - __m256 reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx0 = _mm256_sub_ps(vImgWf, reflectx0_v); + gx0 = _mm256_and_ps(gx0, *(__m256*)_ps256_inv_sign_mask); - gx0 = _mm256_sub_ps(gx0, v0p5fp8); + __m256 reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx0 = _mm256_sub_ps(vImgWf, reflectx0_v); - _mm256_sub_ps(gx0, v0p5fp8); + gx0 = _mm256_sub_ps(gx0, v0p5fp8); - gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); + _mm256_sub_ps(gx0, v0p5fp8); - // x1 - gx1 = _mm256_add_ps(gx1, v0p5fp8); + gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); - gx1 = _mm256_and_ps(gx1, *(__m256*)_ps256_inv_sign_mask); + // x1 + gx1 = _mm256_add_ps(gx1, v0p5fp8); - __m256 reflectx1_v = _mm256_and_ps(_mm256_sub_ps(gx1, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx1 = _mm256_sub_ps(vImgWf, reflectx1_v); + gx1 = _mm256_and_ps(gx1, *(__m256*)_ps256_inv_sign_mask); - gx1 = _mm256_sub_ps(gx1, v0p5fp8); + __m256 reflectx1_v = _mm256_and_ps(_mm256_sub_ps(gx1, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx1 = _mm256_sub_ps(vImgWf, reflectx1_v); - _mm256_sub_ps(gx1, v0p5fp8); + gx1 = _mm256_sub_ps(gx1, v0p5fp8); - gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); + _mm256_sub_ps(gx1, v0p5fp8); - // x2 - gx2 = _mm256_add_ps(gx2, v0p5fp8); + gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); - gx2 = _mm256_and_ps(gx2, *(__m256*)_ps256_inv_sign_mask); + // x2 + gx2 = _mm256_add_ps(gx2, v0p5fp8); - __m256 reflectx2_v = _mm256_and_ps(_mm256_sub_ps(gx2, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx2 = _mm256_sub_ps(vImgWf, reflectx2_v); + gx2 = _mm256_and_ps(gx2, *(__m256*)_ps256_inv_sign_mask); - gx2 = _mm256_sub_ps(gx2, v0p5fp8); + __m256 reflectx2_v = _mm256_and_ps(_mm256_sub_ps(gx2, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx2 = _mm256_sub_ps(vImgWf, reflectx2_v); - _mm256_sub_ps(gx2, v0p5fp8); + gx2 = _mm256_sub_ps(gx2, v0p5fp8); - gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); + _mm256_sub_ps(gx2, v0p5fp8); - // x3 - gx3 = _mm256_add_ps(gx3, v0p5fp8); + gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); - gx3 = _mm256_and_ps(gx3, *(__m256*)_ps256_inv_sign_mask); + // x3 + gx3 = _mm256_add_ps(gx3, v0p5fp8); - __m256 reflectx3_v = _mm256_and_ps(_mm256_sub_ps(gx3, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx3 = _mm256_sub_ps(vImgWf, reflectx3_v); + gx3 = _mm256_and_ps(gx3, *(__m256*)_ps256_inv_sign_mask); - gx3 = _mm256_sub_ps(gx3, v0p5fp8); + __m256 reflectx3_v = _mm256_and_ps(_mm256_sub_ps(gx3, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx3 = _mm256_sub_ps(vImgWf, reflectx3_v); - _mm256_sub_ps(gx3, v0p5fp8); + gx3 = _mm256_sub_ps(gx3, v0p5fp8); - gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); - } + _mm256_sub_ps(gx3, v0p5fp8); - __m256i x0 = _mm256_cvtps_epi32(gx0); - __m256i x1 = _mm256_cvtps_epi32(gx1); - __m256i x2 = _mm256_cvtps_epi32(gx2); - __m256i x3 = _mm256_cvtps_epi32(gx3); + gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); + } - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); 
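// The block above reflects and clamps the four bicubic sample columns, using a
// single image-width mirror per coordinate.  As a reading aid only, this is a
// minimal scalar sketch of the same rule for align_corners == false
// coordinates (the function name is illustrative and not part of the patch;
// a fully general version would also fold coordinates that overshoot by more
// than one image width):
static float reflect_unnormalized_coord(float x, float size)
{
    x = x + 0.5f;                  // shift so the reflection axes sit on pixel edges
    if (x < 0.f) x = -x;           // reflect negative coordinates at -0.5
    float d = x - size;
    if (d < 0.f) d = -d;
    x = size - d;                  // mirror once off the right edge
    x -= 0.5f;                     // undo the +0.5 shift
    if (x < 0.f) x = 0.f;          // clamp to the valid sample range [0, size - 1]
    if (x > size - 1.f) x = size - 1.f;
    return x;
}
// The scalar tail of this same branch applies the equivalent
// reflect_coord(x + 0.5, w) - 0.5 followed by the same clamp.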
+ __m256i x0 = _mm256_cvtps_epi32(gx0); + __m256i x1 = _mm256_cvtps_epi32(gx1); + __m256i x2 = _mm256_cvtps_epi32(gx2); + __m256i x3 = _mm256_cvtps_epi32(gx3); + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) { - //y - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - gy = _mm256_add_ps(gy, v0p5fp8); + { + //y + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_add_ps(gy, v0p5fp8); - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(vImgHf, reflecty_v); + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(gy, v0p5fp8); + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(vImgHf, reflecty_v); - _mm256_sub_ps(gy, v0p5fp8); + gy = _mm256_sub_ps(gy, v0p5fp8); - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } + _mm256_sub_ps(gy, v0p5fp8); - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); - for (int q = 0; q < bottom_blob.c; q++) - { - for (int i = 0; i < 4; i++) + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < bottom_blob.c; q++) { - __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v0_offset[i], vn1fp8, sizeof(float)); - __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v1_offset[i], vn1fp8, sizeof(float)); - __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v2_offset[i], vn1fp8, sizeof(float)); - __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v3_offset[i], vn1fp8, sizeof(float)); + for (int i = 0; i < 4; i++) + { + __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v0_offset[i], vn1fp8, sizeof(float)); + __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v1_offset[i], vn1fp8, sizeof(float)); + __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v2_offset[i], vn1fp8, sizeof(float)); + __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v3_offset[i], vn1fp8, sizeof(float)); - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], 
coefficients[2], coefficients[3], ty); + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } } - } - nn = grid_size & 15; + nn = grid_size & 15; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; - sample_x = ((sample_x + 1) * w - 1) / 2.f; - sample_y = ((sample_y + 1) * h - 1) / 2.f; + sample_x = ((sample_x + 1) * w - 1) / 2.f; + sample_y = ((sample_y + 1) * h - 1) / 2.f; - int x_floor = floor(sample_x); - int y_floor = floor(sample_y); + int x_floor = floor(sample_x); + int y_floor = floor(sample_y); - int x1 = x_floor; - int y1 = y_floor; - int x0 = x1 - 1; - int y0 = y1 - 1; - int x2 = x1 + 1; - int y2 = y1 + 1; - int x3 = x1 + 2; - int y3 = y1 + 2; + int x1 = x_floor; + int y1 = y_floor; + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; - x0 = static_cast(reflect_coord(x0 + 0.5, w) - 0.5); + x0 = static_cast(reflect_coord(x0 + 0.5, w) - 0.5); - y0 = static_cast(reflect_coord(y0 + 0.5, h) - 0.5); + y0 = static_cast(reflect_coord(y0 + 0.5, h) - 0.5); - x0 = std::min(w - 1, std::max(x0, 0)); - y0 = std::min(h - 1, std::max(y0, 0)); + x0 = std::min(w - 1, std::max(x0, 0)); + y0 = std::min(h - 1, std::max(y0, 0)); - x1 = static_cast(reflect_coord(x1 + 0.5, w) - 0.5); + x1 = static_cast(reflect_coord(x1 + 0.5, w) - 0.5); - y1 = static_cast(reflect_coord(y1 + 0.5, h) - 0.5); + y1 = static_cast(reflect_coord(y1 + 0.5, h) - 0.5); - x1 = std::min(w - 1, std::max(x1, 0)); - y1 = std::min(h - 1, std::max(y1, 0)); + x1 = std::min(w - 1, std::max(x1, 0)); + y1 = std::min(h - 1, std::max(y1, 0)); - x2 = static_cast(reflect_coord(x2 + 0.5, w) - 0.5); + x2 = static_cast(reflect_coord(x2 + 0.5, w) - 0.5); - y2 = static_cast(reflect_coord(y2 + 0.5, h) - 0.5); + y2 = static_cast(reflect_coord(y2 + 0.5, h) - 0.5); - x2 = std::min(w - 1, std::max(x2, 0)); - y2 = std::min(h - 1, std::max(y2, 0)); + x2 = std::min(w - 1, std::max(x2, 0)); + y2 = std::min(h - 1, std::max(y2, 0)); - x3 = static_cast(reflect_coord(x3 + 0.5, w) - 0.5); + x3 = static_cast(reflect_coord(x3 + 0.5, w) - 0.5); - y3 = static_cast(reflect_coord(y3 + 0.5, h) - 0.5); + y3 = static_cast(reflect_coord(y3 + 0.5, h) - 0.5); - x3 = std::min(w - 1, std::max(x3, 0)); - y3 = std::min(h - 1, std::max(y3, 0)); + x3 = std::min(w - 1, std::max(x3, 0)); + y3 = std::min(h - 1, std::max(y3, 0)); - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - - float v00 = image.row(y0)[x0]; - float v01 = image.row(y0)[x1]; - float v02 = image.row(y0)[x2]; - float v03 = image.row(y0)[x3]; - float v10 = image.row(y1)[x0]; - float v11 = image.row(y1)[x1]; - float v12 = image.row(y1)[x2]; - float v13 = image.row(y1)[x3]; - float v20 = image.row(y2)[x0]; - float v21 = image.row(y2)[x1]; - float v22 = image.row(y2)[x2]; - float v23 = image.row(y2)[x3]; - float v30 = image.row(y3)[x0]; - float v31 = image.row(y3)[x1]; - float v32 = image.row(y3)[x2]; - float v33 = image.row(y3)[x3]; - - float x_coeffs[4]; - float y_coeffs[4]; - interpolate_cubic(sample_x - x_floor, x_coeffs); - interpolate_cubic(sample_y - y_floor, y_coeffs); - - float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * 
x_coeffs[2] + v03 * x_coeffs[3]; - float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; - float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; - float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; - - top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + + float v00 = image.row(y0)[x0]; + float v01 = image.row(y0)[x1]; + float v02 = image.row(y0)[x2]; + float v03 = image.row(y0)[x3]; + float v10 = image.row(y1)[x0]; + float v11 = image.row(y1)[x1]; + float v12 = image.row(y1)[x2]; + float v13 = image.row(y1)[x3]; + float v20 = image.row(y2)[x0]; + float v21 = image.row(y2)[x1]; + float v22 = image.row(y2)[x2]; + float v23 = image.row(y2)[x3]; + float v30 = image.row(y3)[x0]; + float v31 = image.row(y3)[x1]; + float v32 = image.row(y3)[x2]; + float v33 = image.row(y3)[x3]; + + float x_coeffs[4]; + float y_coeffs[4]; + interpolate_cubic(sample_x - x_floor, x_coeffs); + interpolate_cubic(sample_y - y_floor, y_coeffs); + + float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; + float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; + float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; + float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; + + top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + } } } } - } - else - { -#pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) + else { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = gx; + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = gx; - const __m256 two = _mm256_set1_ps(2.f); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + const __m256 two = _mm256_set1_ps(2.f); + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, 
v1fp8)); - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); - __m256 coefficients[4]; + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); - __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - const __m256 v0p5fp8 = _mm256_set1_ps(0.5f); - { - // x0 - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + __m256 coefficients[4]; - gx0 = _mm256_and_ps(gx0, *(__m256*)_ps256_inv_sign_mask); - __m256 reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, border_x), *(__m256*)_ps256_inv_sign_mask); - gx0 = _mm256_sub_ps(border_x, reflectx0_v); + __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + const __m256 v0p5fp8 = _mm256_set1_ps(0.5f); + { + // x0 + const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - // x1 - gx1 = _mm256_and_ps(gx1, *(__m256*)_ps256_inv_sign_mask); + gx0 = _mm256_and_ps(gx0, *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, border_x), *(__m256*)_ps256_inv_sign_mask); + gx0 = _mm256_sub_ps(border_x, reflectx0_v); - __m256 reflectx1_v = _mm256_and_ps(_mm256_sub_ps(gx1, border_x), *(__m256*)_ps256_inv_sign_mask); - gx1 = _mm256_sub_ps(border_x, reflectx1_v); + // x1 + gx1 = _mm256_and_ps(gx1, *(__m256*)_ps256_inv_sign_mask); - // x2 - gx2 = _mm256_and_ps(gx2, *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx1_v = _mm256_and_ps(_mm256_sub_ps(gx1, border_x), *(__m256*)_ps256_inv_sign_mask); + gx1 = _mm256_sub_ps(border_x, reflectx1_v); - __m256 reflectx2_v = _mm256_and_ps(_mm256_sub_ps(gx2, border_x), *(__m256*)_ps256_inv_sign_mask); - gx2 = _mm256_sub_ps(border_x, reflectx2_v); + // x2 + gx2 = _mm256_and_ps(gx2, *(__m256*)_ps256_inv_sign_mask); - // x3 - gx3 = _mm256_and_ps(gx3, *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx2_v = _mm256_and_ps(_mm256_sub_ps(gx2, border_x), *(__m256*)_ps256_inv_sign_mask); + gx2 = _mm256_sub_ps(border_x, reflectx2_v); - __m256 reflectx3_v = _mm256_and_ps(_mm256_sub_ps(gx3, border_x), *(__m256*)_ps256_inv_sign_mask); - gx3 = _mm256_sub_ps(border_x, reflectx3_v); - } + // x3 + gx3 = _mm256_and_ps(gx3, *(__m256*)_ps256_inv_sign_mask); - __m256i x0 = _mm256_cvtps_epi32(gx0); - __m256i x1 = _mm256_cvtps_epi32(gx1); - __m256i x2 = _mm256_cvtps_epi32(gx2); - __m256i x3 = _mm256_cvtps_epi32(gx3); + __m256 reflectx3_v = _mm256_and_ps(_mm256_sub_ps(gx3, border_x), *(__m256*)_ps256_inv_sign_mask); + gx3 = _mm256_sub_ps(border_x, reflectx3_v); + } - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + __m256i x0 = _mm256_cvtps_epi32(gx0); + __m256i x1 = _mm256_cvtps_epi32(gx1); + __m256i x2 = _mm256_cvtps_epi32(gx2); + __m256i x3 = _mm256_cvtps_epi32(gx3); + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) { - //y - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + { + //y + const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), 
*(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(border_y, reflecty_v); - } + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(border_y, reflecty_v); + } - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); - for (int q = 0; q < bottom_blob.c; q++) - { - for (int i = 0; i < 4; i++) + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < bottom_blob.c; q++) { - __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v0_offset[i], vn1fp8, sizeof(float)); - __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v1_offset[i], vn1fp8, sizeof(float)); - __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v2_offset[i], vn1fp8, sizeof(float)); - __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v3_offset[i], vn1fp8, sizeof(float)); + for (int i = 0; i < 4; i++) + { + __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v0_offset[i], vn1fp8, sizeof(float)); + __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v1_offset[i], vn1fp8, sizeof(float)); + __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v2_offset[i], vn1fp8, sizeof(float)); + __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v3_offset[i], vn1fp8, sizeof(float)); - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); + } } - } - nn = grid_size & 15; + nn = grid_size & 15; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = (sample_x + 1) / 2.f * (w - 1); - sample_y = (sample_y + 1) / 2.f * (h - 1); - - int x_floor = floor(sample_x); - int y_floor = floor(sample_y); - - int x1 = x_floor; - int y1 = y_floor; - int x0 = x1 - 1; - int y0 = y1 - 1; - int x2 = x1 + 1; - int y2 = y1 + 1; - int x3 = x1 + 2; - int y3 = y1 + 2; - - x0 = static_cast(reflect_coord(x0, w - 1)); - y0 = static_cast(reflect_coord(y0, h - 1)); - x1 = static_cast(reflect_coord(x1, w - 
1)); - y1 = static_cast(reflect_coord(y1, h - 1)); - x2 = static_cast(reflect_coord(x2, w - 1)); - y2 = static_cast(reflect_coord(y2, h - 1)); - x3 = static_cast(reflect_coord(x3, w - 1)); - y3 = static_cast(reflect_coord(y3, h - 1)); - - for (int q = 0; q < channels; q++) + for (int x = grid_size - nn; x < grid_size; x += 2) { - const Mat& image = bottom_blob.channel(q); - - float v00 = image.row(y0)[x0]; - float v01 = image.row(y0)[x1]; - float v02 = image.row(y0)[x2]; - float v03 = image.row(y0)[x3]; - float v10 = image.row(y1)[x0]; - float v11 = image.row(y1)[x1]; - float v12 = image.row(y1)[x2]; - float v13 = image.row(y1)[x3]; - float v20 = image.row(y2)[x0]; - float v21 = image.row(y2)[x1]; - float v22 = image.row(y2)[x2]; - float v23 = image.row(y2)[x3]; - float v30 = image.row(y3)[x0]; - float v31 = image.row(y3)[x1]; - float v32 = image.row(y3)[x2]; - float v33 = image.row(y3)[x3]; - - float x_coeffs[4]; - float y_coeffs[4]; - interpolate_cubic(sample_x - x_floor, x_coeffs); - interpolate_cubic(sample_y - y_floor, y_coeffs); - - float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; - float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; - float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; - float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; - - top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = (sample_x + 1) / 2.f * (w - 1); + sample_y = (sample_y + 1) / 2.f * (h - 1); + + int x_floor = floor(sample_x); + int y_floor = floor(sample_y); + + int x1 = x_floor; + int y1 = y_floor; + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + x0 = static_cast(reflect_coord(x0, w - 1)); + y0 = static_cast(reflect_coord(y0, h - 1)); + x1 = static_cast(reflect_coord(x1, w - 1)); + y1 = static_cast(reflect_coord(y1, h - 1)); + x2 = static_cast(reflect_coord(x2, w - 1)); + y2 = static_cast(reflect_coord(y2, h - 1)); + x3 = static_cast(reflect_coord(x3, w - 1)); + y3 = static_cast(reflect_coord(y3, h - 1)); + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + + float v00 = image.row(y0)[x0]; + float v01 = image.row(y0)[x1]; + float v02 = image.row(y0)[x2]; + float v03 = image.row(y0)[x3]; + float v10 = image.row(y1)[x0]; + float v11 = image.row(y1)[x1]; + float v12 = image.row(y1)[x2]; + float v13 = image.row(y1)[x3]; + float v20 = image.row(y2)[x0]; + float v21 = image.row(y2)[x1]; + float v22 = image.row(y2)[x2]; + float v23 = image.row(y2)[x3]; + float v30 = image.row(y3)[x0]; + float v31 = image.row(y3)[x1]; + float v32 = image.row(y3)[x2]; + float v33 = image.row(y3)[x3]; + + float x_coeffs[4]; + float y_coeffs[4]; + interpolate_cubic(sample_x - x_floor, x_coeffs); + interpolate_cubic(sample_y - y_floor, y_coeffs); + + float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; + float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; + float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; + float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; + + top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + } } } } } - - } 
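// The bicubic branches above differ only in how a normalized grid value in
// [-1, 1] is mapped onto pixel coordinates and in how out-of-range samples are
// handled afterwards.  A minimal scalar sketch of the two mappings used
// throughout this hunk (the function name is illustrative and not part of the
// patch):
static float grid_sample_unnormalize(float coord, int size, bool align_corners)
{
    // align_corners == true : -1 and +1 refer to the centers of the corner pixels
    // align_corners == false: -1 and +1 refer to the outer edges of the corner pixels
    return align_corners ? (coord + 1.f) / 2.f * (size - 1)
                         : ((coord + 1.f) * size - 1.f) / 2.f;
}
// e.g. grid_sample_unnormalize(-1.f, w, true) == 0.f and
//      grid_sample_unnormalize(+1.f, w, true) == (float)(w - 1).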
- else - { - NCNN_LOGE("gridsample padding_mode error\n"); - return -100; - } + else + { + NCNN_LOGE("gridsample padding_mode error\n"); + return -100; + } } } @@ -3355,12 +3343,11 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector Date: Fri, 25 Nov 2022 23:56:05 +0800 Subject: [PATCH 032/127] solve conflict --- tools/pnnx/src/pass_ncnn/F_grid_sample.cpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp index 4e672e21dbd..5dcae56832c 100644 --- a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp +++ b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp @@ -1,10 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // -<<<<<<< HEAD -// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. -======= // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. - >>>>>>> master // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at @@ -17,12 +13,8 @@ // specific language governing permissions and limitations under the License. #include "pass_ncnn.h" - <<<<<<< HEAD -#include - ======= - >>>>>>> master - namespace pnnx +namespace pnnx { namespace ncnn { From 43690adfb05e7a3b71228af6510878722b51d9dd Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Fri, 25 Nov 2022 15:57:47 +0000 Subject: [PATCH 033/127] apply code-format changes --- tools/pnnx/src/pass_ncnn/F_grid_sample.cpp | 63 +++++++++++----------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp index 5dcae56832c..7c681eb08b7 100644 --- a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp +++ b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp @@ -14,8 +14,7 @@ #include "pass_ncnn.h" -namespace pnnx -{ +namespace pnnx { namespace ncnn { class F_grid_sample : public GraphRewriterPass @@ -30,42 +29,42 @@ pnnx.Input input_1 0 1 input1 F.grid_sample op_0 2 1 input0 input1 out mode=%mode padding_mode=%padding_mode align_corners=%align_corners pnnx.Output output 1 0 out )PNNXIR"; - } + } - const char* type_str() const - { - return "GridSample"; - } + const char* type_str() const + { + return "GridSample"; + } - const char* name_str() const - { - return "gridsample"; - } + const char* name_str() const + { + return "gridsample"; + } - void write(Operator* op, const std::map& captured_params) const - { - const std::string& mode = captured_params.at("mode").s; - if (mode == "bilinear") - op->params["0"] = 1; - if (mode == "nearest") - op->params["0"] = 2; - if (mode == "bicubic") - op->params["0"] = 3; + void write(Operator* op, const std::map& captured_params) const + { + const std::string& mode = captured_params.at("mode").s; + if (mode == "bilinear") + op->params["0"] = 1; + if (mode == "nearest") + op->params["0"] = 2; + if (mode == "bicubic") + op->params["0"] = 3; - const std::string& padding_mode = captured_params.at("padding_mode").s; - if (padding_mode == "zeros") - op->params["1"] = 1; - if (padding_mode == "border") - op->params["1"] = 2; - if (padding_mode == "reflection") - op->params["1"] = 3; + const std::string& padding_mode = captured_params.at("padding_mode").s; + if (padding_mode == "zeros") + op->params["1"] = 1; + if (padding_mode == "border") + op->params["1"] = 2; + if (padding_mode == "reflection") + op->params["1"] = 3; - 
op->params["2"] = captured_params.at("align_corners").b ? 1 : 0; - } - }; + op->params["2"] = captured_params.at("align_corners").b ? 1 : 0; + } +}; - REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_grid_sample, 20) +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_grid_sample, 20) - } // namespace ncnn +} // namespace ncnn } // namespace pnnx From b4ff40e82a669a5b5fa443b729bc64ab772e266d Mon Sep 17 00:00:00 2001 From: Yoh Date: Sat, 26 Nov 2022 01:30:47 +0800 Subject: [PATCH 034/127] fix sse pack4 bug and pack1 naive bug --- src/layer/x86/gridsample_x86.cpp | 94 ++++++++++++++++---------------- tests/test_gridsample.cpp | 2 - 2 files changed, 47 insertions(+), 49 deletions(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index e07bd1b0f13..87f003f0632 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -96,10 +96,10 @@ static NCNN_FORCEINLINE __m128 mask_gather_ps(const float* ptr, __m128i offset, memcpy(offseti, &offset, 4 * sizeof(int)); memcpy(maski, &mask, 4 * sizeof(int)); - float data[4]; + float data[4] = {0.0f, 0.0f, 0.0f, 0.0f}; for (int i = 0; i < 4; i++) { - if (maski[i] & 0x01) + if (maski[i] & 0xF0000000) { data[i] = *(ptr + offseti[i]); } @@ -998,10 +998,10 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); - int v01_in_range = (x1 > -1) & (x1 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); - int v10_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y1 > -1) & (y1 < bottom_blob.h); - int v11_in_range = (x1 > -1) & (x1 < bottom_blob.w) & (y1 > -1) & (y1 < bottom_blob.h); + bool v00_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); + bool v01_in_range = (x1 > -1) & (x1 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); + bool v10_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y1 > -1) & (y1 < bottom_blob.h); + bool v11_in_range = (x1 > -1) & (x1 < bottom_blob.w) & (y1 > -1) & (y1 < bottom_blob.h); float alpha = sample_x - x0; float beta = sample_y - y0; @@ -1009,10 +1009,10 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); - int v01_in_range = (x1 > -1) & (x1 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); - int v10_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y1 > -1) & (y1 < bottom_blob.h); - int v11_in_range = (x1 > -1) & (x1 < bottom_blob.w) & (y1 > -1) & (y1 < bottom_blob.h); + bool v00_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); + bool v01_in_range = (x1 > -1) & (x1 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); + bool v10_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y1 > -1) & (y1 < bottom_blob.h); + bool v11_in_range = (x1 > -1) & (x1 < bottom_blob.w) & (y1 > -1) & (y1 < bottom_blob.h); float alpha = sample_x - x0; float beta = sample_y - y0; @@ -1130,10 +1130,10 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector -1) & (x1 < bottom_blob.w); - int y1_in_range = (y1 > -1) & (y1 < bottom_blob.h); - int v11_in_range = x1_in_range & y1_in_range; - + bool x1_in_range = (x1 > -1) & (x1 < bottom_blob.w); + bool y1_in_range = (y1 > -1) & (y1 < bottom_blob.h); + bool v11_in_range = x1_in_range & y1_in_range; + float alpha = sample_x - x0; float beta = sample_y - y0; @@ -1261,9 +1261,9 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, 
std::vector -1) & (x1 < bottom_blob.w); - int y1_in_range = (y1 > -1) & (y1 < bottom_blob.h); - int v11_in_range = x1_in_range & y1_in_range; + bool x1_in_range = (x1 > -1) & (x1 < bottom_blob.w); + bool y1_in_range = (y1 > -1) & (y1 < bottom_blob.h); + bool v11_in_range = x1_in_range & y1_in_range; float alpha = sample_x - x0; float beta = sample_y - y0; @@ -1387,9 +1387,9 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector -1) & (x1 < bottom_blob.w); - int y1_in_range = (y1 > -1) & (y1 < bottom_blob.h); - int v11_in_range = x1_in_range & y1_in_range; + bool x1_in_range = (x1 > -1) & (x1 < bottom_blob.w); + bool y1_in_range = (y1 > -1) & (y1 < bottom_blob.h); + bool v11_in_range = x1_in_range & y1_in_range; float alpha = sample_x - x0; float beta = sample_y - y0; @@ -1546,9 +1546,9 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector -1) & (x1 < bottom_blob.w); - int y1_in_range = (y1 > -1) & (y1 < bottom_blob.h); - int v11_in_range = x1_in_range & y1_in_range; + bool x1_in_range = (x1 > -1) & (x1 < bottom_blob.w); + bool y1_in_range = (y1 > -1) & (y1 < bottom_blob.h); + bool v11_in_range = x1_in_range & y1_in_range; float alpha = sample_x - x0; float beta = sample_y - y0; @@ -1684,9 +1684,9 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(floor(sample_x + 0.5f)); int y0 = static_cast(floor(sample_y + 0.5f)); - int v00_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); + bool v00_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); for (int q = 0; q < channels; q++) { const Mat& image = bottom_blob.channel(q); - top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0] * v00_in_range; + top_blob.channel(q).row(y)[x / 2] = v00_in_range ? image.row(y0)[x0] : 0; } } } @@ -1845,13 +1845,13 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(floor(sample_x + 0.5f)); int y0 = static_cast(floor(sample_y + 0.5f)); - int v00_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); + bool v00_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); for (int q = 0; q < channels; q++) { const Mat& image = bottom_blob.channel(q); - top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0] * v00_in_range; + top_blob.channel(q).row(y)[x / 2] = v00_in_range ? 
image.row(y0)[x0] : 0; } } } diff --git a/tests/test_gridsample.cpp b/tests/test_gridsample.cpp index 7763d3c6441..3d019d98419 100644 --- a/tests/test_gridsample.cpp +++ b/tests/test_gridsample.cpp @@ -15,8 +15,6 @@ #include "layer/gridsample.h" #include "testutil.h" -#include - static int test_gridsample(const ncnn::Mat& a, const ncnn::Mat& grid, int sample_type, int padding_mode, int align_corner) { ncnn::ParamDict pd; From c37f9b956bf8031dc83cc863640a47e2f4f30171 Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Fri, 25 Nov 2022 17:32:41 +0000 Subject: [PATCH 035/127] apply code-format changes --- src/layer/x86/gridsample_x86.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 87f003f0632..903365f2ec2 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -1253,7 +1253,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector -1) & (x1 < bottom_blob.w); bool y1_in_range = (y1 > -1) & (y1 < bottom_blob.h); bool v11_in_range = x1_in_range & y1_in_range; - + float alpha = sample_x - x0; float beta = sample_y - y0; From cc8f046238dc8cf5e3c58458835527f0dac7b7d0 Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Thu, 1 Dec 2022 21:16:42 +0800 Subject: [PATCH 036/127] fix sample_type=2 padding_mode=1 align_corner=1 and compile bug --- src/layer/x86/gridsample_bicubic_pack16.h | 104 ++--- src/layer/x86/gridsample_bicubic_pack8.h | 140 +++---- src/layer/x86/gridsample_bilinear_pack16.h | 300 +++++++------- src/layer/x86/gridsample_bilinear_pack8.h | 446 ++++++++++----------- src/layer/x86/gridsample_nearest_pack16.h | 112 +++--- src/layer/x86/gridsample_nearest_pack8.h | 144 +++---- src/layer/x86/gridsample_x86.cpp | 378 +++++++++-------- 7 files changed, 823 insertions(+), 801 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_pack16.h b/src/layer/x86/gridsample_bicubic_pack16.h index 2cc9ddd2a0c..4f4e9aa1e4c 100644 --- a/src/layer/x86/gridsample_bicubic_pack16.h +++ b/src/layer/x86/gridsample_bicubic_pack16.h @@ -16,15 +16,15 @@ static NCNN_FORCEINLINE __m512 cubic_interp1d_p16(const __m512& x0_v, const __m5 { const __m512 A = _mm512_set1_ps(-0.75f); - const __m512 x0 = _mm512_add_ps(tx, v1fp16); + const __m512 x0 = _mm512_add_ps(tx, *(__m512*)_ps512_1); const __m512& x1 = tx; - const __m512 x2 = _mm512_sub_ps(v1fp16, tx); - //const __m512 x3 = _mm512_add_ps(x2, v1fp16); + const __m512 x2 = _mm512_sub_ps(*(__m512*)_ps512_1, tx); + //const __m512 x3 = _mm512_add_ps(x2, *(__m512*)_ps512_1); const __m512 coeffs0 = _mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(A, x0), _mm512_mul_ps(_mm512_set1_ps(5.0f), A)), x0), _mm512_mul_ps(_mm512_set1_ps(8.0f), A)), x0), _mm512_mul_ps(_mm512_set1_ps(4), A)); - const __m512 coeffs1 = _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(A, _mm512_set1_ps(2.0f)), x1), _mm512_add_ps(A, _mm512_set1_ps(3.0f))), x1), x1), v1fp16); - const __m512 coeffs2 = _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(A, _mm512_set1_ps(2.0f)), x2), _mm512_add_ps(A, _mm512_set1_ps(3.0f))), x2), x2), v1fp16); - const __m512 coeffs3 = _mm512_sub_ps(_mm512_sub_ps(_mm512_sub_ps(v1fp16, coeffs0), coeffs1), coeffs2); + const __m512 coeffs1 = _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(A, _mm512_set1_ps(2.0f)), x1), _mm512_add_ps(A, _mm512_set1_ps(3.0f))), x1), x1), *(__m512*)_ps512_1); + const 
__m512 coeffs2 = _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(A, _mm512_set1_ps(2.0f)), x2), _mm512_add_ps(A, _mm512_set1_ps(3.0f))), x2), x2), *(__m512*)_ps512_1); + const __m512 coeffs3 = _mm512_sub_ps(_mm512_sub_ps(_mm512_sub_ps(*(__m512*)_ps512_1, coeffs0), coeffs1), coeffs2); __m512 _v = _mm512_mul_ps(coeffs0, x0_v); _v = _mm512_fmadd_ps(coeffs1, x1_v, _v); @@ -58,10 +58,10 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack16(const Mat& src, Mat& const __m512 two = _mm512_set1_ps(2.f); // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); // y - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); } __m512 gx_floor = _mm512_floor_ps(gx); @@ -72,9 +72,9 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack16(const Mat& src, Mat& __m512 coefficients[4]; - __m512 gx0 = _mm512_add_ps(gx_floor, vn1fp16); + __m512 gx0 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_n1); __m512 gx1 = gx_floor; - __m512 gx2 = _mm512_add_ps(gx_floor, v1fp16); + __m512 gx2 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_1); __m512 gx3 = _mm512_add_ps(gx_floor, _mm512_set1_ps(2.0f)); __m512i x0 = _mm512_cvtps_epi32(gx0); @@ -82,10 +82,10 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack16(const Mat& src, Mat& __m512i x2 = _mm512_cvtps_epi32(gx2); __m512i x3 = _mm512_cvtps_epi32(gx3); - __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x0); - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 x2_in_range = _mm512_cmpgt_epi32_mask(x2, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x2); - __mmask16 x3_in_range = _mm512_cmpgt_epi32_mask(x3, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x3); + __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x0); + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 x2_in_range = _mm512_cmpgt_epi32_mask(x2, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x2); + __mmask16 x3_in_range = _mm512_cmpgt_epi32_mask(x3, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x3); __m512i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; __mmask16 v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; @@ -95,7 +95,7 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack16(const Mat& src, Mat& __m512i y = _mm512_cvtps_epi32(gy); - __mmask16 y_in_range = _mm512_cmpgt_epi32_mask(y, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y); + __mmask16 y_in_range = _mm512_cmpgt_epi32_mask(y, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y); v0_in_range[i] = x0_in_range & y_in_range; v1_in_range[i] = x1_in_range & y_in_range; @@ -160,8 +160,8 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack16(const Mat& src, Mat& { const __m512 two = _mm512_set1_ps(2.f); - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); + gy = 
_mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); } __m512 gx_floor = _mm512_floor_ps(gx); @@ -172,9 +172,9 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack16(const Mat& src, Mat& __m512 coefficients[4]; - __m512 gx0 = _mm512_add_ps(gx_floor, vn1fp16); + __m512 gx0 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_n1); __m512 gx1 = gx_floor; - __m512 gx2 = _mm512_add_ps(gx_floor, v1fp16); + __m512 gx2 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_1); __m512 gx3 = _mm512_add_ps(gx_floor, _mm512_set1_ps(2.0f)); __m512i x0 = _mm512_cvtps_epi32(gx0); @@ -182,10 +182,10 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack16(const Mat& src, Mat& __m512i x2 = _mm512_cvtps_epi32(gx2); __m512i x3 = _mm512_cvtps_epi32(gx3); - __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x0); - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 x2_in_range = _mm512_cmpgt_epi32_mask(x2, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x2); - __mmask16 x3_in_range = _mm512_cmpgt_epi32_mask(x3, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x3); + __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x0); + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 x2_in_range = _mm512_cmpgt_epi32_mask(x2, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x2); + __mmask16 x3_in_range = _mm512_cmpgt_epi32_mask(x3, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x3); __m512i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; __mmask16 v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; @@ -195,7 +195,7 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack16(const Mat& src, Mat& __m512i y = _mm512_cvtps_epi32(gy); - __mmask16 y_in_range = _mm512_cmpgt_epi32_mask(y, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y); + __mmask16 y_in_range = _mm512_cmpgt_epi32_mask(y, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y); v0_in_range[i] = x0_in_range & y_in_range; v1_in_range[i] = x1_in_range & y_in_range; @@ -257,10 +257,10 @@ static void gridsample_2d_bicubic_align0_border_blob_pack16(const Mat& src, Mat& __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); const __m512 two = _mm512_set1_ps(2.f); - const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); + const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); __m512 gx_floor = _mm512_floor_ps(gx); __m512 gy_floor = _mm512_floor_ps(gy); @@ -270,9 +270,9 @@ static void gridsample_2d_bicubic_align0_border_blob_pack16(const Mat& src, Mat& __m512 coefficients[4]; - __m512 gx0 = _mm512_add_ps(gx_floor, vn1fp16); + __m512 gx0 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_n1); __m512 gx1 = gx_floor; - __m512 gx2 = _mm512_add_ps(gx_floor, v1fp16); + __m512 gx2 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_1); __m512 gx3 = 
_mm512_add_ps(gx_floor, _mm512_set1_ps(2.0f)); gx0 = _mm512_min_ps(border_x, _mm512_max_ps(gx0, _mm512_setzero_ps())); @@ -348,11 +348,11 @@ static void gridsample_2d_bicubic_align1_border_blob_pack16(const Mat& src, Mat& __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); const __m512 two = _mm512_set1_ps(2.f); - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); - const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); __m512 gx_floor = _mm512_floor_ps(gx); __m512 gy_floor = _mm512_floor_ps(gy); @@ -362,9 +362,9 @@ static void gridsample_2d_bicubic_align1_border_blob_pack16(const Mat& src, Mat& __m512 coefficients[4]; - __m512 gx0 = _mm512_add_ps(gx_floor, vn1fp16); + __m512 gx0 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_n1); __m512 gx1 = gx_floor; - __m512 gx2 = _mm512_add_ps(gx_floor, v1fp16); + __m512 gx2 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_1); __m512 gx3 = _mm512_add_ps(gx_floor, _mm512_set1_ps(2.0f)); gx0 = _mm512_min_ps(border_x, _mm512_max_ps(gx0, _mm512_setzero_ps())); @@ -440,10 +440,10 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack16(const Mat& src, __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); const __m512 two = _mm512_set1_ps(2.f); - const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); + const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); __m512 gx_floor = _mm512_floor_ps(gx); __m512 gy_floor = _mm512_floor_ps(gy); @@ -453,14 +453,14 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack16(const Mat& src, __m512 coefficients[4]; - __m512 gx0 = _mm512_add_ps(gx_floor, vn1fp16); + __m512 gx0 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_n1); __m512 gx1 = gx_floor; - __m512 gx2 = _mm512_add_ps(gx_floor, v1fp16); + __m512 gx2 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_1); __m512 gx3 = _mm512_add_ps(gx_floor, _mm512_set1_ps(2.0f)); const __m512 v0p5fp16 = _mm512_set1_ps(0.5f); { // x0 - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); gx0 = _mm512_add_ps(gx0, v0p5fp16); @@ -530,7 +530,7 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack16(const Mat& src, { //y - const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); gy = _mm512_add_ps(gy, v0p5fp16); @@ -605,11 +605,11 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack16(const Mat& src, __m512 gy = 
_mm512_set1_ps(gridptr[grid.elempack]); const __m512 two = _mm512_set1_ps(2.f); - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); - const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); __m512 gx_floor = _mm512_floor_ps(gx); __m512 gy_floor = _mm512_floor_ps(gy); @@ -619,14 +619,14 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack16(const Mat& src, __m512 coefficients[4]; - __m512 gx0 = _mm512_add_ps(gx_floor, vn1fp16); + __m512 gx0 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_n1); __m512 gx1 = gx_floor; - __m512 gx2 = _mm512_add_ps(gx_floor, v1fp16); + __m512 gx2 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_1); __m512 gx3 = _mm512_add_ps(gx_floor, _mm512_set1_ps(2.0f)); const __m512 v0p5fp16 = _mm512_set1_ps(0.5f); { // x0 - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); gx0 = _mm512_and_ps(gx0, *(__m512*)_ps512_inv_sign_mask); __m512 reflectx0_v = _mm512_and_ps(_mm512_sub_ps(gx0, border_x), *(__m512*)_ps512_inv_sign_mask); @@ -663,7 +663,7 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack16(const Mat& src, { //y - const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); diff --git a/src/layer/x86/gridsample_bicubic_pack8.h b/src/layer/x86/gridsample_bicubic_pack8.h index 095196e6875..13bbe768104 100644 --- a/src/layer/x86/gridsample_bicubic_pack8.h +++ b/src/layer/x86/gridsample_bicubic_pack8.h @@ -36,10 +36,10 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& d const __m256 two = _mm256_set1_ps(2.f); // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); } __m256 gx_floor = _mm256_floor_ps(gx); @@ -50,9 +50,9 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& d __m256 coefficients[4]; - __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); __m256i x0 = _mm256_cvtps_epi32(gx0); @@ -60,10 +60,10 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& d __m256i x2 = _mm256_cvtps_epi32(gx2); __m256i x3 = _mm256_cvtps_epi32(gx3); - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, 
vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x2)); - __m256i x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x2)); + __m256i x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x3)); __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; @@ -73,7 +73,7 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& d __m256i y = _mm256_cvtps_epi32(gy); - __m256i y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y)); + __m256i y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y)); v0_in_range[i] = _mm256_and_si256(x0_in_range, y_in_range); v1_in_range[i] = _mm256_and_si256(x1_in_range, y_in_range); @@ -99,10 +99,10 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& d { for (int i = 0; i < 4; i++) { - __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], *reinterpret_cast<__m256*>(&v0_in_range[i]), sizeof(float)); - __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i]), sizeof(float)); - __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i]), sizeof(float)); - __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i]), sizeof(float)); + __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], *reinterpret_cast<__m256*>(&v0_in_range[i])); + __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i])); + __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i])); + __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i])); coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); } @@ -138,8 +138,8 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& d { const __m256 two = _mm256_set1_ps(2.f); - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); } __m256 gx_floor = _mm256_floor_ps(gx); @@ -150,9 +150,9 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& d __m256 coefficients[4]; - __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, 
v1fp8); + __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); __m256i x0 = _mm256_cvtps_epi32(gx0); @@ -160,10 +160,10 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& d __m256i x2 = _mm256_cvtps_epi32(gx2); __m256i x3 = _mm256_cvtps_epi32(gx3); - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x2)); - __m256i x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x3)); + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x2)); + __m256i x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x3)); __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; @@ -173,7 +173,7 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& d __m256i y = _mm256_cvtps_epi32(gy); - __m256i y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y)); + __m256i y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y)); v0_in_range[i] = _mm256_and_si256(x0_in_range, y_in_range); v1_in_range[i] = _mm256_and_si256(x1_in_range, y_in_range); @@ -199,10 +199,10 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& d { for (int i = 0; i < 4; i++) { - __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], *reinterpret_cast<__m256*>(&v0_in_range[i]), sizeof(float)); - __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i]), sizeof(float)); - __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i]), sizeof(float)); - __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i]), sizeof(float)); + __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], *reinterpret_cast<__m256*>(&v0_in_range[i])); + __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i])); + __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i])); + __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i])); coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); } @@ -235,10 +235,10 @@ static void gridsample_2d_bicubic_align0_border_blob_pack8(const Mat& src, Mat& __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); const __m256 two = _mm256_set1_ps(2.f); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); - gy = 
_mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); __m256 gx_floor = _mm256_floor_ps(gx); __m256 gy_floor = _mm256_floor_ps(gy); @@ -248,9 +248,9 @@ static void gridsample_2d_bicubic_align0_border_blob_pack8(const Mat& src, Mat& __m256 coefficients[4]; - __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); @@ -288,10 +288,10 @@ static void gridsample_2d_bicubic_align0_border_blob_pack8(const Mat& src, Mat& { for (int i = 0; i < 4; i++) { - __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], vn1fp8, sizeof(float)); - __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], vn1fp8, sizeof(float)); - __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], vn1fp8, sizeof(float)); - __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], vn1fp8, sizeof(float)); + __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], *(__m256*)_ps256_n1); + __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], *(__m256*)_ps256_n1); + __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], *(__m256*)_ps256_n1); + __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], *(__m256*)_ps256_n1); coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); } @@ -324,11 +324,11 @@ static void gridsample_2d_bicubic_align1_border_blob_pack8(const Mat& src, Mat& __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); const __m256 two = _mm256_set1_ps(2.f); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); __m256 gx_floor = _mm256_floor_ps(gx); __m256 gy_floor = _mm256_floor_ps(gy); @@ -338,9 +338,9 @@ static void gridsample_2d_bicubic_align1_border_blob_pack8(const Mat& src, Mat& __m256 coefficients[4]; - __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); @@ -378,10 +378,10 @@ static void 
gridsample_2d_bicubic_align1_border_blob_pack8(const Mat& src, Mat& { for (int i = 0; i < 4; i++) { - __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], vn1fp8, sizeof(float)); - __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], vn1fp8, sizeof(float)); - __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], vn1fp8, sizeof(float)); - __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], vn1fp8, sizeof(float)); + __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], *(__m256*)_ps256_n1); + __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], *(__m256*)_ps256_n1); + __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], *(__m256*)_ps256_n1); + __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], *(__m256*)_ps256_n1); coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); } @@ -414,10 +414,10 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); const __m256 two = _mm256_set1_ps(2.f); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); __m256 gx_floor = _mm256_floor_ps(gx); __m256 gy_floor = _mm256_floor_ps(gy); @@ -427,14 +427,14 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M __m256 coefficients[4]; - __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); const __m256 v0p5fp8 = _mm256_set1_ps(0.5f); { // x0 - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); gx0 = _mm256_add_ps(gx0, v0p5fp8); @@ -504,7 +504,7 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M { //y - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); gy = _mm256_add_ps(gy, v0p5fp8); @@ -539,10 +539,10 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M { for (int i = 0; i < 4; i++) { - __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], vn1fp8, sizeof(float)); - __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], vn1fp8, sizeof(float)); - __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], vn1fp8, sizeof(float)); - __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], vn1fp8, sizeof(float)); + __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], *(__m256*)_ps256_n1); + __m256 x1_val = 
mask_gather_ps256(src.channel(q), v1_offset[i], *(__m256*)_ps256_n1); + __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], *(__m256*)_ps256_n1); + __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], *(__m256*)_ps256_n1); coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); } @@ -577,11 +577,11 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack8(const Mat& src, M __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); const __m256 two = _mm256_set1_ps(2.f); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); __m256 gx_floor = _mm256_floor_ps(gx); __m256 gy_floor = _mm256_floor_ps(gy); @@ -591,14 +591,14 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack8(const Mat& src, M __m256 coefficients[4]; - __m256 gx0 = _mm256_add_ps(gx_floor, vn1fp8); + __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, v1fp8); + __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); const __m256 v0p5fp8 = _mm256_set1_ps(0.5f); { // x0 - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); gx0 = _mm256_and_ps(gx0, *(__m256*)_ps256_inv_sign_mask); __m256 reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, border_x), *(__m256*)_ps256_inv_sign_mask); @@ -635,7 +635,7 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack8(const Mat& src, M { //y - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); @@ -662,10 +662,10 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack8(const Mat& src, M { for (int i = 0; i < 4; i++) { - __m256 x0_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v0_offset[i], vn1fp8, sizeof(float)); - __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v1_offset[i], vn1fp8, sizeof(float)); - __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v2_offset[i], vn1fp8, sizeof(float)); - __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), v3_offset[i], vn1fp8, sizeof(float)); + __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], *(__m256*)_ps256_n1); + __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], *(__m256*)_ps256_n1); + __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], *(__m256*)_ps256_n1); + __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], *(__m256*)_ps256_n1); coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); } diff --git a/src/layer/x86/gridsample_bilinear_pack16.h b/src/layer/x86/gridsample_bilinear_pack16.h index 005590a3581..1b0f6389f8a 100644 --- 
a/src/layer/x86/gridsample_bilinear_pack16.h +++ b/src/layer/x86/gridsample_bilinear_pack16.h @@ -36,19 +36,19 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack16(const Mat& src, Mat& const __m512 two = _mm512_set1_ps(2.f); // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); // y - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); } __m512 x_w = _mm512_floor_ps(gx); __m512 y_n = _mm512_floor_ps(gy); __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(v1fp16, n); + __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); __m512 nw = _mm512_mul_ps(s, e); __m512 ne = _mm512_mul_ps(s, w); @@ -56,14 +56,14 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack16(const Mat& src, Mat& __m512 se = _mm512_mul_ps(n, w); __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, v1ip16); + __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); - __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x0); - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y0_in_range = _mm512_cmpgt_epi32_mask(y0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y0); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x0); + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y0_in_range = _mm512_cmpgt_epi32_mask(y0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y0); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); __mmask16 v00_in_range = x0_in_range & y0_in_range; __mmask16 v01_in_range = x0_in_range & y1_in_range; @@ -119,19 +119,19 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack16(const Mat& src, Mat& const __m512 two = _mm512_set1_ps(2.f); // x - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); // y - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); } __m512 x_w = _mm512_floor_ps(gx); __m512 y_n = _mm512_floor_ps(gy); __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(v1fp16, n); + __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); __m512 nw = _mm512_mul_ps(s, e); __m512 ne = _mm512_mul_ps(s, w); @@ -139,14 +139,14 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack16(const Mat& src, Mat& __m512 se = _mm512_mul_ps(n, 
w); __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, v1ip16); + __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); - __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x0); - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y0_in_range = _mm512_cmpgt_epi32_mask(y0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y0); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x0); + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y0_in_range = _mm512_cmpgt_epi32_mask(y0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y0); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); __mmask16 v00_in_range = x0_in_range & y0_in_range; __mmask16 v01_in_range = x0_in_range & y1_in_range; @@ -202,16 +202,16 @@ static void gridsample_2d_bilinear_align0_border_blob_pack16(const Mat& src, Mat const __m512 two = _mm512_set1_ps(2.f); // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); // y - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); } @@ -220,9 +220,9 @@ static void gridsample_2d_bilinear_align0_border_blob_pack16(const Mat& src, Mat __m512 y_n = _mm512_floor_ps(gy); __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(v1fp16, n); + __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); __m512 nw = _mm512_mul_ps(s, e); __m512 ne = _mm512_mul_ps(s, w); @@ -230,12 +230,12 @@ static void gridsample_2d_bilinear_align0_border_blob_pack16(const Mat& src, Mat __m512 se = _mm512_mul_ps(n, w); __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, v1ip16); + __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); __mmask16 
v11_in_range = x1_in_range & y1_in_range; @@ -288,16 +288,16 @@ static void gridsample_2d_bilinear_align1_border_blob_pack16(const Mat& src, Mat const __m512 two = _mm512_set1_ps(2.f); // x - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); // y - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); } @@ -306,9 +306,9 @@ static void gridsample_2d_bilinear_align1_border_blob_pack16(const Mat& src, Mat __m512 y_n = _mm512_floor_ps(gy); __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(v1fp16, n); + __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); __m512 nw = _mm512_mul_ps(s, e); __m512 ne = _mm512_mul_ps(s, w); @@ -316,12 +316,12 @@ static void gridsample_2d_bilinear_align1_border_blob_pack16(const Mat& src, Mat __m512 se = _mm512_mul_ps(n, w); __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, v1ip16); + __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); __mmask16 v11_in_range = x1_in_range & y1_in_range; @@ -374,9 +374,9 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack16(const Mat& src, const __m512 two = _mm512_set1_ps(2.f); // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); __m512 v0p5fp16 = _mm512_set1_ps(0.5f); gx = _mm512_add_ps(gx, v0p5fp16); @@ -393,9 +393,9 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack16(const Mat& src, gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); // y - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); gy = _mm512_add_ps(gy, v0p5fp16); @@ -415,9 +415,9 @@ static void 
gridsample_2d_bilinear_align0_reflection_blob_pack16(const Mat& src, __m512 y_n = _mm512_floor_ps(gy); __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(v1fp16, n); + __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); __m512 nw = _mm512_mul_ps(s, e); __m512 ne = _mm512_mul_ps(s, w); @@ -425,12 +425,12 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack16(const Mat& src, __m512 se = _mm512_mul_ps(n, w); __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, v1ip16); + __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); __mmask16 v11_in_range = x1_in_range & y1_in_range; @@ -483,9 +483,9 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack16(const Mat& src, const __m512 two = _mm512_set1_ps(2.f); // x - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); gx = _mm512_and_ps(gx, *(__m512*)_ps512_inv_sign_mask); @@ -493,9 +493,9 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack16(const Mat& src, gx = _mm512_sub_ps(border_x, reflectx_v); // y - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); @@ -507,9 +507,9 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack16(const Mat& src, __m512 y_n = _mm512_floor_ps(gy); __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(v1fp16, n); + __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); __m512 nw = _mm512_mul_ps(s, e); __m512 ne = _mm512_mul_ps(s, w); @@ -517,12 +517,12 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack16(const Mat& src, __m512 se = _mm512_mul_ps(n, w); __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, v1ip16); + __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + __mmask16 x1_in_range = 
_mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); __mmask16 v11_in_range = x1_in_range & y1_in_range; @@ -580,13 +580,13 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack16(const Mat& src, Mat& const __m512 two = _mm512_set1_ps(2.f); // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); // y - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); // z - gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, v1fp16), vImgDf, v1fp16), two); + gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), vImgDf, *(__m512*)_ps512_1), two); } __m512 x_w = _mm512_floor_ps(gx); @@ -594,11 +594,11 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack16(const Mat& src, Mat& __m512 z_t = _mm512_floor_ps(gz); __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(v1fp16, n); + __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); __m512 t = _mm512_sub_ps(gz, z_t); - __m512 b = _mm512_sub_ps(v1fp16, t); + __m512 b = _mm512_sub_ps(*(__m512*)_ps512_1, t); __m512 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { @@ -619,18 +619,18 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack16(const Mat& src, Mat& } __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, v1ip16); + __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); __m512i z0 = _mm512_cvtps_epi32(z_t); - __m512i z1 = _mm512_add_epi32(z0, v1ip16); + __m512i z1 = _mm512_add_epi32(z0, *(__m512i*)_pi32_512_1); - __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x0); - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y0_in_range = _mm512_cmpgt_epi32_mask(y0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y0); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - __mmask16 z0_in_range = _mm512_cmpgt_epi32_mask(z0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, z0); - __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, z1); + __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x0); + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y0_in_range = _mm512_cmpgt_epi32_mask(y0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y0); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + __mmask16 z0_in_range = _mm512_cmpgt_epi32_mask(z0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgDi, z0); + __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgDi, z1); __mmask16 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, 
v011_in_range, v101_in_range, v111_in_range; { @@ -719,13 +719,13 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack16(const Mat& src, Mat& const __m512 two = _mm512_set1_ps(2.f); // x - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); // y - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); // z - gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, v1fp16), two), _mm512_sub_ps(vImgDf, v1fp16)); + gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1)); } __m512 x_w = _mm512_floor_ps(gx); @@ -733,11 +733,11 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack16(const Mat& src, Mat& __m512 z_t = _mm512_floor_ps(gz); __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(v1fp16, n); + __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); __m512 t = _mm512_sub_ps(gz, z_t); - __m512 b = _mm512_sub_ps(v1fp16, t); + __m512 b = _mm512_sub_ps(*(__m512*)_ps512_1, t); __m512 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { @@ -758,18 +758,18 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack16(const Mat& src, Mat& } __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, v1ip16); + __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); __m512i z0 = _mm512_cvtps_epi32(z_t); - __m512i z1 = _mm512_add_epi32(z0, v1ip16); + __m512i z1 = _mm512_add_epi32(z0, *(__m512i*)_pi32_512_1); - __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x0); - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y0_in_range = _mm512_cmpgt_epi32_mask(y0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y0); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - __mmask16 z0_in_range = _mm512_cmpgt_epi32_mask(z0, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, z0); - __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, z1); + __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x0); + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y0_in_range = _mm512_cmpgt_epi32_mask(y0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y0); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + __mmask16 z0_in_range = _mm512_cmpgt_epi32_mask(z0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgDi, z0); + __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgDi, z1); __mmask16 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; { @@ -858,23 +858,23 @@ static void gridsample_3d_bilinear_align0_border_blob_pack16(const 
Mat& src, Mat const __m512 two = _mm512_set1_ps(2.f); // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); // y - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); // z - gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, v1fp16), vImgDf, v1fp16), two); + gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), vImgDf, *(__m512*)_ps512_1), two); - const __m512 border_z = _mm512_sub_ps(vImgDf, v1fp16); + const __m512 border_z = _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1); gz = _mm512_min_ps(border_z, _mm512_max_ps(gz, _mm512_setzero_ps())); } @@ -884,11 +884,11 @@ static void gridsample_3d_bilinear_align0_border_blob_pack16(const Mat& src, Mat __m512 z_t = _mm512_floor_ps(gz); __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(v1fp16, n); + __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); __m512 t = _mm512_sub_ps(gz, z_t); - __m512 b = _mm512_sub_ps(v1fp16, t); + __m512 b = _mm512_sub_ps(*(__m512*)_ps512_1, t); __m512 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { @@ -909,15 +909,15 @@ static void gridsample_3d_bilinear_align0_border_blob_pack16(const Mat& src, Mat } __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, v1ip16); + __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); __m512i z0 = _mm512_cvtps_epi32(z_t); - __m512i z1 = _mm512_add_epi32(z0, v1ip16); + __m512i z1 = _mm512_add_epi32(z0, *(__m512i*)_pi32_512_1); - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, z1); + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgDi, z1); __mmask16 v110_in_range, v011_in_range, v101_in_range, v111_in_range; { @@ -999,23 +999,23 @@ static void gridsample_3d_bilinear_align1_border_blob_pack16(const Mat& src, Mat const __m512 two = _mm512_set1_ps(2.f); // x - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + const __m512 border_x = 
_mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); // y - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); // z - gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, v1fp16), two), _mm512_sub_ps(vImgDf, v1fp16)); + gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1)); - const __m512 border_z = _mm512_sub_ps(vImgDf, v1fp16); + const __m512 border_z = _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1); gz = _mm512_min_ps(border_z, _mm512_max_ps(gz, _mm512_setzero_ps())); } @@ -1025,11 +1025,11 @@ static void gridsample_3d_bilinear_align1_border_blob_pack16(const Mat& src, Mat __m512 z_t = _mm512_floor_ps(gz); __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(v1fp16, n); + __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); __m512 t = _mm512_sub_ps(gz, z_t); - __m512 b = _mm512_sub_ps(v1fp16, t); + __m512 b = _mm512_sub_ps(*(__m512*)_ps512_1, t); __m512 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { @@ -1050,15 +1050,15 @@ static void gridsample_3d_bilinear_align1_border_blob_pack16(const Mat& src, Mat } __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, v1ip16); + __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); __m512i z0 = _mm512_cvtps_epi32(z_t); - __m512i z1 = _mm512_add_epi32(z0, v1ip16); + __m512i z1 = _mm512_add_epi32(z0, *(__m512i*)_pi32_512_1); - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, z1); + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgDi, z1); __mmask16 v110_in_range, v011_in_range, v101_in_range, v111_in_range; { @@ -1140,8 +1140,8 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack16(const Mat& src, const __m512 two = _mm512_set1_ps(2.f); // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); + const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); __m512 v0p5fp16 = _mm512_set1_ps(0.5f); gx = _mm512_add_ps(gx, v0p5fp16); @@ -1158,8 +1158,8 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack16(const Mat& src, gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); // y - gy = 
_mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); - const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); gy = _mm512_add_ps(gy, v0p5fp16); @@ -1175,8 +1175,8 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack16(const Mat& src, gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); // z - gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, v1fp16), vImgDf, v1fp16), two); - const __m512 border_z = _mm512_sub_ps(vImgDf, v1fp16); + gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), vImgDf, *(__m512*)_ps512_1), two); + const __m512 border_z = _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1); gz = _mm512_add_ps(gz, v0p5fp16); @@ -1197,11 +1197,11 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack16(const Mat& src, __m512 z_t = _mm512_floor_ps(gz); __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(v1fp16, n); + __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); __m512 t = _mm512_sub_ps(gz, z_t); - __m512 b = _mm512_sub_ps(v1fp16, t); + __m512 b = _mm512_sub_ps(*(__m512*)_ps512_1, t); __m512 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { @@ -1222,15 +1222,15 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack16(const Mat& src, } __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, v1ip16); + __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); __m512i z0 = _mm512_cvtps_epi32(z_t); - __m512i z1 = _mm512_add_epi32(z0, v1ip16); + __m512i z1 = _mm512_add_epi32(z0, *(__m512i*)_pi32_512_1); - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, z1); + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgDi, z1); __mmask16 v110_in_range, v011_in_range, v101_in_range, v111_in_range; { @@ -1312,8 +1312,8 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack16(const Mat& src, const __m512 two = _mm512_set1_ps(2.f); // x - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); + const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); gx = _mm512_and_ps(gx, *(__m512*)_ps512_inv_sign_mask); @@ -1321,8 +1321,8 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack16(const Mat& src, gx = _mm512_sub_ps(border_x, reflectx_v); // y - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); - const __m512 border_y = 
_mm512_sub_ps(vImgHf, v1fp16); + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); @@ -1330,8 +1330,8 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack16(const Mat& src, gy = _mm512_sub_ps(border_y, reflecty_v); // z - gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, v1fp16), two), _mm512_sub_ps(vImgDf, v1fp16)); - const __m512 border_z = _mm512_sub_ps(vImgDf, v1fp16); + gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1)); + const __m512 border_z = _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1); gz = _mm512_and_ps(gz, *(__m512*)_ps512_inv_sign_mask); @@ -1344,11 +1344,11 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack16(const Mat& src, __m512 z_t = _mm512_floor_ps(gz); __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(v1fp16, w); + __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(v1fp16, n); + __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); __m512 t = _mm512_sub_ps(gz, z_t); - __m512 b = _mm512_sub_ps(v1fp16, t); + __m512 b = _mm512_sub_ps(*(__m512*)_ps512_1, t); __m512 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { @@ -1369,15 +1369,15 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack16(const Mat& src, } __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, v1ip16); + __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, v1ip16); + __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); __m512i z0 = _mm512_cvtps_epi32(z_t); - __m512i z1 = _mm512_add_epi32(z0, v1ip16); + __m512i z1 = _mm512_add_epi32(z0, *(__m512i*)_pi32_512_1); - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, z1); + __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); + __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); + __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgDi, z1); __mmask16 v110_in_range, v011_in_range, v101_in_range, v111_in_range; { diff --git a/src/layer/x86/gridsample_bilinear_pack8.h b/src/layer/x86/gridsample_bilinear_pack8.h index e173dfcbe12..093e14351c8 100644 --- a/src/layer/x86/gridsample_bilinear_pack8.h +++ b/src/layer/x86/gridsample_bilinear_pack8.h @@ -36,19 +36,19 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& const __m256 two = _mm256_set1_ps(2.f); // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); } __m256 x_w = _mm256_floor_ps(gx); __m256 y_n = _mm256_floor_ps(gy); __m256 w = 
_mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(v1fp8, n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); __m256 nw = _mm256_mul_ps(s, e); __m256 ne = _mm256_mul_ps(s, w); @@ -56,14 +56,14 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& __m256 se = _mm256_mul_ps(n, w); __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, v1ip8); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); @@ -79,10 +79,10 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& for (int q = 0; q < dst.c; q++) { - __m256 nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); - __m256 ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); - __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); - __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range)); + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range)); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range)); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -119,19 +119,19 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& const __m256 two = _mm256_set1_ps(2.f); // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + gy = 
_mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); } __m256 x_w = _mm256_floor_ps(gx); __m256 y_n = _mm256_floor_ps(gy); __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(v1fp8, n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); __m256 nw = _mm256_mul_ps(s, e); __m256 ne = _mm256_mul_ps(s, w); @@ -139,14 +139,14 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& __m256 se = _mm256_mul_ps(n, w); __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, v1ip8); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); @@ -162,10 +162,10 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& for (int q = 0; q < dst.c; q++) { - __m256 nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range), sizeof(float)); - __m256 ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); - __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); - __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range)); + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range)); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range)); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -202,16 +202,16 @@ static void gridsample_2d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& const __m256 two = _mm256_set1_ps(2.f); // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), 
vImgWf, *(__m256*)_ps256_1), two); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); } @@ -220,9 +220,9 @@ static void gridsample_2d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& __m256 y_n = _mm256_floor_ps(gy); __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(v1fp8, n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); __m256 nw = _mm256_mul_ps(s, e); __m256 ne = _mm256_mul_ps(s, w); @@ -230,12 +230,12 @@ static void gridsample_2d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& __m256 se = _mm256_mul_ps(n, w); __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, v1ip8); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); @@ -248,10 +248,10 @@ static void gridsample_2d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& for (int q = 0; q < dst.c; q++) { - __m256 nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, vn1fp8, sizeof(float)); - __m256 ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *(__m256*)_ps256_n1); + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range)); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -288,16 +288,16 @@ static void gridsample_2d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& const __m256 two = _mm256_set1_ps(2.f); // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, 
*(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); } @@ -306,9 +306,9 @@ static void gridsample_2d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& __m256 y_n = _mm256_floor_ps(gy); __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(v1fp8, n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); __m256 nw = _mm256_mul_ps(s, e); __m256 ne = _mm256_mul_ps(s, w); @@ -316,12 +316,12 @@ static void gridsample_2d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& __m256 se = _mm256_mul_ps(n, w); __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, v1ip8); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); @@ -334,10 +334,10 @@ static void gridsample_2d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& for (int q = 0; q < dst.c; q++) { - __m256 nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, vn1fp8, sizeof(float)); - __m256 ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *(__m256*)_ps256_n1); + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range)); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -374,9 +374,9 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack8(const Mat& src, const __m256 two = _mm256_set1_ps(2.f); // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + gx = 
_mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); __m256 v0p5fp8 = _mm256_set1_ps(0.5f); gx = _mm256_add_ps(gx, v0p5fp8); @@ -393,9 +393,9 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack8(const Mat& src, gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); gy = _mm256_add_ps(gy, v0p5fp8); @@ -415,9 +415,9 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack8(const Mat& src, __m256 y_n = _mm256_floor_ps(gy); __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(v1fp8, n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); __m256 nw = _mm256_mul_ps(s, e); __m256 ne = _mm256_mul_ps(s, w); @@ -425,12 +425,12 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack8(const Mat& src, __m256 se = _mm256_mul_ps(n, w); __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, v1ip8); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); @@ -443,10 +443,10 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack8(const Mat& src, for (int q = 0; q < dst.c; q++) { - __m256 nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, vn1fp8, sizeof(float)); - __m256 ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *(__m256*)_ps256_n1); + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range)); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -483,9 +483,9 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack8(const Mat& src, const __m256 
two = _mm256_set1_ps(2.f); // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); @@ -493,9 +493,9 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack8(const Mat& src, gx = _mm256_sub_ps(border_x, reflectx_v); // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); @@ -507,9 +507,9 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack8(const Mat& src, __m256 y_n = _mm256_floor_ps(gy); __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(v1fp8, n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); __m256 nw = _mm256_mul_ps(s, e); __m256 ne = _mm256_mul_ps(s, w); @@ -517,12 +517,12 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack8(const Mat& src, __m256 se = _mm256_mul_ps(n, w); __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, v1ip8); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); @@ -535,10 +535,10 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack8(const Mat& src, for (int q = 0; q < dst.c; q++) { - __m256 nw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_nw_offset, vn1fp8, sizeof(float)); - __m256 ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *(__m256*)_ps256_n1); + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range)); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); __m256 _v = _mm256_mul_ps(nw_val, nw); _v = 
_mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -580,13 +580,13 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& const __m256 two = _mm256_set1_ps(2.f); // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); // z - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, v1fp8), vImgDf, v1fp8), two); + gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), two); } __m256 x_w = _mm256_floor_ps(gx); @@ -594,11 +594,11 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& __m256 z_t = _mm256_floor_ps(gz); __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(v1fp8, n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(v1fp8, t); + __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { @@ -619,18 +619,18 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& } __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, v1ip8); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); __m256i z0 = _mm256_cvtps_epi32(z_t); - __m256i z1 = _mm256_add_epi32(z0, v1ip8); + __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - __m256i z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z0)); - __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z0)); + __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); __m256i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; { @@ -663,15 +663,15 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& for (int 
q = 0; q < dst.c; q++) { - __m256 tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, *reinterpret_cast<__m256*>(&v000_in_range), sizeof(float)); - __m256 tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&v100_in_range), sizeof(float)); - __m256 tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&v010_in_range), sizeof(float)); - __m256 tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); + __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, *reinterpret_cast<__m256*>(&v000_in_range)); + __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&v100_in_range)); + __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&v010_in_range)); + __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range)); - __m256 bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&v001_in_range), sizeof(float)); - __m256 bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); - __m256 bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); - __m256 bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&v001_in_range)); + __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range)); + __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range)); + __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range)); __m256 _v = _mm256_mul_ps(tnw_val, tnw); _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); @@ -719,13 +719,13 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& const __m256 two = _mm256_set1_ps(2.f); // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); // z - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, v1fp8), two), _mm256_sub_ps(vImgDf, v1fp8)); + gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); } __m256 x_w = _mm256_floor_ps(gx); @@ -733,11 +733,11 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& __m256 z_t = _mm256_floor_ps(gz); __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(v1fp8, n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = 
_mm256_sub_ps(v1fp8, t); + __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { @@ -758,18 +758,18 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& } __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, v1ip8); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); __m256i z0 = _mm256_cvtps_epi32(z_t); - __m256i z1 = _mm256_add_epi32(z0, v1ip8); + __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y0)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - __m256i z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z0)); - __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z0)); + __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); __m256i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; { @@ -802,15 +802,15 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& for (int q = 0; q < dst.c; q++) { - __m256 tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, *reinterpret_cast<__m256*>(&v000_in_range), sizeof(float)); - __m256 tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&v100_in_range), sizeof(float)); - __m256 tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&v010_in_range), sizeof(float)); - __m256 tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); + __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, *reinterpret_cast<__m256*>(&v000_in_range)); + __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&v100_in_range)); + __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&v010_in_range)); + __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range)); - __m256 bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&v001_in_range), sizeof(float)); - __m256 bne_val = 
_mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); - __m256 bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); - __m256 bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&v001_in_range)); + __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range)); + __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range)); + __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range)); __m256 _v = _mm256_mul_ps(tnw_val, tnw); _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); @@ -858,23 +858,23 @@ static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& const __m256 two = _mm256_set1_ps(2.f); // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); // z - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, v1fp8), vImgDf, v1fp8), two); + gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), two); - const __m256 border_z = _mm256_sub_ps(vImgDf, v1fp8); + const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); } @@ -884,11 +884,11 @@ static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& __m256 z_t = _mm256_floor_ps(gz); __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(v1fp8, n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(v1fp8, t); + __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { @@ -909,15 +909,15 @@ static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& } __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, v1ip8); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); __m256i z0 = _mm256_cvtps_epi32(z_t); - __m256i z1 = _mm256_add_epi32(z0, v1ip8); + __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i 
y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); __m256i v110_in_range, v011_in_range, v101_in_range, v111_in_range; { @@ -943,15 +943,15 @@ static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& for (int q = 0; q < dst.c; q++) { - __m256 tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, vn1fp8, sizeof(float)); - __m256 tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - __m256 tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - __m256 tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); + __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, *(__m256*)_ps256_n1); + __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range)); + __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); + __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range)); - __m256 bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range), sizeof(float)); - __m256 bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); - __m256 bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); - __m256 bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range)); + __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range)); + __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range)); + __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range)); __m256 _v = _mm256_mul_ps(tnw_val, tnw); _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); @@ -999,23 +999,23 @@ static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& const __m256 two = _mm256_set1_ps(2.f); // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, 
v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); // z - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, v1fp8), two), _mm256_sub_ps(vImgDf, v1fp8)); + gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); - const __m256 border_z = _mm256_sub_ps(vImgDf, v1fp8); + const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); } @@ -1025,11 +1025,11 @@ static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& __m256 z_t = _mm256_floor_ps(gz); __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(v1fp8, n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(v1fp8, t); + __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { @@ -1050,15 +1050,15 @@ static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& } __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, v1ip8); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); __m256i z0 = _mm256_cvtps_epi32(z_t); - __m256i z1 = _mm256_add_epi32(z0, v1ip8); + __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); __m256i v110_in_range, v011_in_range, v101_in_range, v111_in_range; { @@ -1084,15 +1084,15 @@ static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& for (int q = 0; q < dst.c; q++) { - __m256 tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, vn1fp8, sizeof(float)); - __m256 tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - __m256 tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - __m256 tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); + __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, *(__m256*)_ps256_n1); + __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, 
*reinterpret_cast<__m256*>(&x1_in_range)); + __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); + __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range)); - __m256 bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range), sizeof(float)); - __m256 bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); - __m256 bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); - __m256 bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range)); + __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range)); + __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range)); + __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range)); __m256 _v = _mm256_mul_ps(tnw_val, tnw); _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); @@ -1140,8 +1140,8 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, const __m256 two = _mm256_set1_ps(2.f); // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); __m256 v0p5fp8 = _mm256_set1_ps(0.5f); gx = _mm256_add_ps(gx, v0p5fp8); @@ -1158,8 +1158,8 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); gy = _mm256_add_ps(gy, v0p5fp8); @@ -1175,8 +1175,8 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); // z - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, v1fp8), vImgDf, v1fp8), two); - const __m256 border_z = _mm256_sub_ps(vImgDf, v1fp8); + gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), two); + const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); gz = _mm256_add_ps(gz, v0p5fp8); @@ -1197,11 +1197,11 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, __m256 z_t = _mm256_floor_ps(gz); __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(v1fp8, n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(v1fp8, t); + __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, 
t); __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { @@ -1222,15 +1222,15 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, } __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, v1ip8); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); __m256i z0 = _mm256_cvtps_epi32(z_t); - __m256i z1 = _mm256_add_epi32(z0, v1ip8); + __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); __m256i v110_in_range, v011_in_range, v101_in_range, v111_in_range; { @@ -1256,15 +1256,15 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, for (int q = 0; q < dst.c; q++) { - __m256 tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, vn1fp8, sizeof(float)); - __m256 tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - __m256 tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - __m256 tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); + __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, *(__m256*)_ps256_n1); + __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range)); + __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); + __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range)); - __m256 bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range), sizeof(float)); - __m256 bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); - __m256 bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); - __m256 bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range)); + __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range)); + __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range)); + __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range)); __m256 
_v = _mm256_mul_ps(tnw_val, tnw); _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); @@ -1312,8 +1312,8 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, const __m256 two = _mm256_set1_ps(2.f); // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); @@ -1321,8 +1321,8 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, gx = _mm256_sub_ps(border_x, reflectx_v); // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); @@ -1330,8 +1330,8 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, gy = _mm256_sub_ps(border_y, reflecty_v); // z - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, v1fp8), two), _mm256_sub_ps(vImgDf, v1fp8)); - const __m256 border_z = _mm256_sub_ps(vImgDf, v1fp8); + gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); + const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); @@ -1344,11 +1344,11 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, __m256 z_t = _mm256_floor_ps(gz); __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(v1fp8, w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(v1fp8, n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(v1fp8, t); + __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; { @@ -1369,15 +1369,15 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, } __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, v1ip8); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, v1ip8); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); __m256i z0 = _mm256_cvtps_epi32(z_t); - __m256i z1 = _mm256_add_epi32(z0, v1ip8); + __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, vn1ip8), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, vn1ip8), _mm256_cmpgt_epi32(vImgHi, y1)); - __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, vn1ip8), _mm256_cmpgt_epi32(vImgDi, z1)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); __m256i v110_in_range, v011_in_range, 
v101_in_range, v111_in_range; { @@ -1403,15 +1403,15 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, for (int q = 0; q < dst.c; q++) { - __m256 tnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tnw_offset, vn1fp8, sizeof(float)); - __m256 tne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range), sizeof(float)); - __m256 tsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - __m256 tse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range), sizeof(float)); - - __m256 bnw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range), sizeof(float)); - __m256 bne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range), sizeof(float)); - __m256 bsw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range), sizeof(float)); - __m256 bse_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range), sizeof(float)); + __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, *(__m256*)_ps256_n1); + __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range)); + __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); + __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range)); + + __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range)); + __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range)); + __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range)); + __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range)); __m256 _v = _mm256_mul_ps(tnw_val, tnw); _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); diff --git a/src/layer/x86/gridsample_nearest_pack16.h b/src/layer/x86/gridsample_nearest_pack16.h index 1caa217e1ce..0d0dc7a01ba 100644 --- a/src/layer/x86/gridsample_nearest_pack16.h +++ b/src/layer/x86/gridsample_nearest_pack16.h @@ -36,10 +36,10 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack16(const Mat& src, Mat& const __m512 two = _mm512_set1_ps(2.f); // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); // y - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); } gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); @@ -48,7 +48,7 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack16(const Mat& src, Mat& __m512i ix = _mm512_cvtps_epi32(gx); __m512i iy = _mm512_cvtps_epi32(gy); - __mmask16 v_in_range = (_mm512_cmpgt_epi32_mask(ix, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, ix)) & (_mm512_cmpgt_epi32_mask(iy, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, iy)); + __mmask16 v_in_range = 
(_mm512_cmpgt_epi32_mask(ix, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, ix)) & (_mm512_cmpgt_epi32_mask(iy, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, iy)); __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); @@ -87,10 +87,10 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack16(const Mat& src, Mat& const __m512 two = _mm512_set1_ps(2.f); // x - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); // y - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); } gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); @@ -99,7 +99,7 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack16(const Mat& src, Mat& __m512i ix = _mm512_cvtps_epi32(gx); __m512i iy = _mm512_cvtps_epi32(gy); - __mmask16 v_in_range = (_mm512_cmpgt_epi32_mask(ix, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, ix)) & (_mm512_cmpgt_epi32_mask(iy, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, iy)); + __mmask16 v_in_range = (_mm512_cmpgt_epi32_mask(ix, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, ix)) & (_mm512_cmpgt_epi32_mask(iy, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, iy)); __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); @@ -138,16 +138,16 @@ static void gridsample_2d_nearest_align0_border_blob_pack16(const Mat& src, Mat& const __m512 two = _mm512_set1_ps(2.f); // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); // y - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); } @@ -195,16 +195,16 @@ static void gridsample_2d_nearest_align1_border_blob_pack16(const Mat& src, Mat& const __m512 two = _mm512_set1_ps(2.f); // x - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); // y - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - 
const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); } @@ -248,8 +248,8 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack16(const Mat& src, __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); const __m512 two = _mm512_set1_ps(2.f); - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); @@ -257,7 +257,7 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack16(const Mat& src, // compute coord { // x - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); __m512 v0p5fp16 = _mm512_set1_ps(0.5f); gx = _mm512_add_ps(gx, v0p5fp16); @@ -274,7 +274,7 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack16(const Mat& src, gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); // y - const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); gy = _mm512_add_ps(gy, v0p5fp16); @@ -326,8 +326,8 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack16(const Mat& src, __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); const __m512 two = _mm512_set1_ps(2.f); - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); @@ -335,7 +335,7 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack16(const Mat& src, // compute coord { // x - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); gx = _mm512_and_ps(gx, *(__m512*)_ps512_inv_sign_mask); @@ -343,7 +343,7 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack16(const Mat& src, gx = _mm512_sub_ps(border_x, reflectx_v); // y - const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); @@ -396,13 +396,13 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack16(const Mat& src, Mat& const __m512 two = _mm512_set1_ps(2.f); // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); // y - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); // z - gz = 
_mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, v1fp16), vImgDf, v1fp16), two); + gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), vImgDf, *(__m512*)_ps512_1), two); } gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); @@ -413,8 +413,8 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack16(const Mat& src, Mat& __m512i iy = _mm512_cvtps_epi32(gy); __m512i iz = _mm512_cvtps_epi32(gz); - __mmask16 v_in_range = (_mm512_cmpgt_epi32_mask(ix, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, ix)) & (_mm512_cmpgt_epi32_mask(iy, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, iy)); - v_in_range = v_in_range & (_mm512_cmpgt_epi32_mask(iz, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, iz)); + __mmask16 v_in_range = (_mm512_cmpgt_epi32_mask(ix, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, ix)) & (_mm512_cmpgt_epi32_mask(iy, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, iy)); + v_in_range = v_in_range & (_mm512_cmpgt_epi32_mask(iz, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgDi, iz)); __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), iz), _mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); @@ -458,13 +458,13 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack16(const Mat& src, Mat& const __m512 two = _mm512_set1_ps(2.f); // x - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); // y - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); // z - gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, v1fp16), two), _mm512_sub_ps(vImgDf, v1fp16)); + gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1)); } gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); @@ -475,8 +475,8 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack16(const Mat& src, Mat& __m512i iy = _mm512_cvtps_epi32(gy); __m512i iz = _mm512_cvtps_epi32(gz); - __mmask16 v_in_range = (_mm512_cmpgt_epi32_mask(ix, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgWi, ix)) & (_mm512_cmpgt_epi32_mask(iy, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgHi, iy)); - v_in_range = v_in_range & (_mm512_cmpgt_epi32_mask(iz, vn1ip16) & _mm512_cmpgt_epi32_mask(vImgDi, iz)); + __mmask16 v_in_range = (_mm512_cmpgt_epi32_mask(ix, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, ix)) & (_mm512_cmpgt_epi32_mask(iy, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, iy)); + v_in_range = v_in_range & (_mm512_cmpgt_epi32_mask(iz, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgDi, iz)); __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), iz), _mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); @@ -520,23 +520,23 @@ static void gridsample_3d_nearest_align0_border_blob_pack16(const Mat& src, Mat& const __m512 two = _mm512_set1_ps(2.f); // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); + gx = 
_mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); // y - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); // z - gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, v1fp16), vImgDf, v1fp16), two); + gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), vImgDf, *(__m512*)_ps512_1), two); - const __m512 border_z = _mm512_sub_ps(vImgDf, v1fp16); + const __m512 border_z = _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1); gz = _mm512_min_ps(border_z, _mm512_max_ps(gz, _mm512_setzero_ps())); } @@ -591,23 +591,23 @@ static void gridsample_3d_nearest_align1_border_blob_pack16(const Mat& src, Mat& const __m512 two = _mm512_set1_ps(2.f); // x - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); // y - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); // z - gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, v1fp16), two), _mm512_sub_ps(vImgDf, v1fp16)); + gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1)); - const __m512 border_z = _mm512_sub_ps(vImgDf, v1fp16); + const __m512 border_z = _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1); gz = _mm512_min_ps(border_z, _mm512_max_ps(gz, _mm512_setzero_ps())); } @@ -658,9 +658,9 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack16(const Mat& src, __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); const __m512 two = _mm512_set1_ps(2.f); - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, v1fp16), vImgWf, v1fp16), two); - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, v1fp16), vImgHf, v1fp16), two); - gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, v1fp16), vImgDf, v1fp16), two); + gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); + gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); + gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), vImgDf, *(__m512*)_ps512_1), two); gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); @@ -669,7 +669,7 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack16(const Mat& src, // compute coord { 
// x - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); __m512 v0p5fp16 = _mm512_set1_ps(0.5f); gx = _mm512_add_ps(gx, v0p5fp16); @@ -686,7 +686,7 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack16(const Mat& src, gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); // y - const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); gy = _mm512_add_ps(gy, v0p5fp16); @@ -702,7 +702,7 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack16(const Mat& src, gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); // z - const __m512 border_z = _mm512_sub_ps(vImgDf, v1fp16); + const __m512 border_z = _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1); gz = _mm512_add_ps(gz, v0p5fp16); @@ -760,9 +760,9 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack16(const Mat& src, __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); const __m512 two = _mm512_set1_ps(2.f); - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, v1fp16), two), _mm512_sub_ps(vImgWf, v1fp16)); - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, v1fp16), two), _mm512_sub_ps(vImgHf, v1fp16)); - gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, v1fp16), two), _mm512_sub_ps(vImgDf, v1fp16)); + gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); + gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); + gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1)); gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); @@ -771,7 +771,7 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack16(const Mat& src, // compute coord { // x - const __m512 border_x = _mm512_sub_ps(vImgWf, v1fp16); + const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); gx = _mm512_and_ps(gx, *(__m512*)_ps512_inv_sign_mask); @@ -779,7 +779,7 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack16(const Mat& src, gx = _mm512_sub_ps(border_x, reflectx_v); // y - const __m512 border_y = _mm512_sub_ps(vImgHf, v1fp16); + const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); @@ -787,7 +787,7 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack16(const Mat& src, gy = _mm512_sub_ps(border_y, reflecty_v); // z - const __m512 border_z = _mm512_sub_ps(vImgDf, v1fp16); + const __m512 border_z = _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1); gz = _mm512_and_ps(gz, *(__m512*)_ps512_inv_sign_mask); diff --git a/src/layer/x86/gridsample_nearest_pack8.h b/src/layer/x86/gridsample_nearest_pack8.h index 7d5b0e2c300..2b49ab9a049 100644 --- a/src/layer/x86/gridsample_nearest_pack8.h +++ b/src/layer/x86/gridsample_nearest_pack8.h @@ -36,10 +36,10 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d const __m256 two = _mm256_set1_ps(2.f); // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + gy = 
_mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); } gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); @@ -48,15 +48,15 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d __m256i ix = _mm256_cvtps_epi32(gx); __m256i iy = _mm256_cvtps_epi32(gy); - __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, ix)), + _mm256_and_si256(_mm256_cmpgt_epi32(iy, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, iy))); __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range)); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -88,10 +88,10 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d const __m256 two = _mm256_set1_ps(2.f); // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); } gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); @@ -100,15 +100,15 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d __m256i ix = _mm256_cvtps_epi32(gx); __m256i iy = _mm256_cvtps_epi32(gy); - __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); + __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, ix)), + _mm256_and_si256(_mm256_cmpgt_epi32(iy, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, iy))); __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range)); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -140,16 +140,16 @@ static void gridsample_2d_nearest_align0_border_blob_pack8(const Mat& src, Mat& const __m256 two = _mm256_set1_ps(2.f); // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, 
*(__m256*)_ps256_1); gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); } @@ -165,7 +165,7 @@ static void gridsample_2d_nearest_align0_border_blob_pack8(const Mat& src, Mat& for (int q = 0; q < dst.c; q++) { - __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_set1_ps(-1.0f)); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -197,16 +197,16 @@ static void gridsample_2d_nearest_align1_border_blob_pack8(const Mat& src, Mat& const __m256 two = _mm256_set1_ps(2.f); // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); } @@ -222,7 +222,7 @@ static void gridsample_2d_nearest_align1_border_blob_pack8(const Mat& src, Mat& for (int q = 0; q < dst.c; q++) { - __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_set1_ps(-1.0f)); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -250,8 +250,8 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack8(const Mat& src, M __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); const __m256 two = _mm256_set1_ps(2.f); - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); @@ -259,7 +259,7 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack8(const Mat& src, M // compute coord { // x - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); __m256 v0p5fp8 = _mm256_set1_ps(0.5f); gx = _mm256_add_ps(gx, v0p5fp8); @@ -276,7 +276,7 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack8(const Mat& src, M gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y - const __m256 border_y 
= _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); gy = _mm256_add_ps(gy, v0p5fp8); @@ -300,7 +300,7 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack8(const Mat& src, M for (int q = 0; q < dst.c; q++) { - __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_set1_ps(-1.0f)); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -328,8 +328,8 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack8(const Mat& src, M __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); const __m256 two = _mm256_set1_ps(2.f); - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); @@ -337,7 +337,7 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack8(const Mat& src, M // compute coord { // x - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); @@ -345,7 +345,7 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack8(const Mat& src, M gx = _mm256_sub_ps(border_x, reflectx_v); // y - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); @@ -361,7 +361,7 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack8(const Mat& src, M for (int q = 0; q < dst.c; q++) { - __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_set1_ps(-1.0f)); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -398,13 +398,13 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d const __m256 two = _mm256_set1_ps(2.f); // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); // z - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, v1fp8), vImgDf, v1fp8), two); + gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), two); } gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); @@ -415,15 +415,15 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d __m256i iy = _mm256_cvtps_epi32(gy); __m256i iz = _mm256_cvtps_epi32(gz); - __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), 
_mm256_cmpgt_epi32(vImgHi, iy))); - v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, vn1ip8), _mm256_cmpgt_epi32(vImgDi, iz))); + __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, ix)), + _mm256_and_si256(_mm256_cmpgt_epi32(iy, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, iy))); + v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, iz))); __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range)); _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } @@ -461,13 +461,13 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d const __m256 two = _mm256_set1_ps(2.f); // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); // z - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, v1fp8), two), _mm256_sub_ps(vImgDf, v1fp8)); + gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); } gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); @@ -478,15 +478,15 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d __m256i iy = _mm256_cvtps_epi32(gy); __m256i iz = _mm256_cvtps_epi32(gz); - __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, vn1ip8), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, vn1ip8), _mm256_cmpgt_epi32(vImgHi, iy))); - v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, vn1ip8), _mm256_cmpgt_epi32(vImgDi, iz))); + __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, ix)), + _mm256_and_si256(_mm256_cmpgt_epi32(iy, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, iy))); + v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, iz))); __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); for (int q = 0; q < dst.c; q++) { - __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range), sizeof(float)); + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range)); _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } @@ -524,23 +524,23 @@ static void 
gridsample_3d_nearest_align0_border_blob_pack8(const Mat& src, Mat& const __m256 two = _mm256_set1_ps(2.f); // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); // z - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, v1fp8), vImgDf, v1fp8), two); + gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), two); - const __m256 border_z = _mm256_sub_ps(vImgDf, v1fp8); + const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); } @@ -557,7 +557,7 @@ static void gridsample_3d_nearest_align0_border_blob_pack8(const Mat& src, Mat& for (int q = 0; q < dst.c; q++) { - __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_set1_ps(-1.0f)); _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } @@ -595,23 +595,23 @@ static void gridsample_3d_nearest_align1_border_blob_pack8(const Mat& src, Mat& const __m256 two = _mm256_set1_ps(2.f); // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); // z - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, v1fp8), two), _mm256_sub_ps(vImgDf, v1fp8)); + gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); - const __m256 border_z = _mm256_sub_ps(vImgDf, v1fp8); + const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); } @@ -628,7 +628,7 @@ static void gridsample_3d_nearest_align1_border_blob_pack8(const Mat& src, Mat& for (int q = 0; q < dst.c; q++) { - __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_set1_ps(-1.0f)); _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * 
dst.elempack, _v); } @@ -662,9 +662,9 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); const __m256 two = _mm256_set1_ps(2.f); - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, v1fp8), vImgWf, v1fp8), two); - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, v1fp8), vImgHf, v1fp8), two); - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, v1fp8), vImgDf, v1fp8), two); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); + gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), two); gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); @@ -673,7 +673,7 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M // compute coord { // x - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); __m256 v0p5fp8 = _mm256_set1_ps(0.5f); gx = _mm256_add_ps(gx, v0p5fp8); @@ -690,7 +690,7 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); // y - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); gy = _mm256_add_ps(gy, v0p5fp8); @@ -706,7 +706,7 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); // z - const __m256 border_z = _mm256_sub_ps(vImgDf, v1fp8); + const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); gz = _mm256_add_ps(gz, v0p5fp8); @@ -730,7 +730,7 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M for (int q = 0; q < dst.c; q++) { - __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_set1_ps(-1.0f)); _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } @@ -764,9 +764,9 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, M __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); const __m256 two = _mm256_set1_ps(2.f); - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, v1fp8), two), _mm256_sub_ps(vImgWf, v1fp8)); - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, v1fp8), two), _mm256_sub_ps(vImgHf, v1fp8)); - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, v1fp8), two), _mm256_sub_ps(vImgDf, v1fp8)); + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); + gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); @@ -775,7 +775,7 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, M // compute coord { // x - const __m256 border_x = _mm256_sub_ps(vImgWf, v1fp8); + const __m256 
border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); @@ -783,7 +783,7 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, M gx = _mm256_sub_ps(border_x, reflectx_v); // y - const __m256 border_y = _mm256_sub_ps(vImgHf, v1fp8); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); @@ -791,7 +791,7 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, M gy = _mm256_sub_ps(border_y, reflecty_v); // z - const __m256 border_z = _mm256_sub_ps(vImgDf, v1fp8); + const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); @@ -807,7 +807,7 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, M for (int q = 0; q < dst.c; q++) { - __m256 _v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src.channel(q), i_offset, _mm256_set1_ps(-1.0f), sizeof(float)); + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_set1_ps(-1.0f)); _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 903365f2ec2..98ab9362083 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -39,34 +39,56 @@ GridSample_x86::GridSample_x86() #if __SSE2__ #if __AVX__ #if __AVX512F__ -const __m512 v1fp16 = _mm512_set1_ps(1.0f); -const __m512 vn1fp16 = _mm512_set1_ps(-1.0f); -const __m512i v1ip16 = _mm512_set1_epi32(1); -const __m512i vn1ip16 = _mm512_set1_epi32(-1); + +_PS512_CONST(n1, -1.0f); +_PI32_CONST512(n1, -1); #include "gridsample_bilinear_pack16.h" #include "gridsample_nearest_pack16.h" #include "gridsample_bicubic_pack16.h" #endif // __AVX512F__ -const __m256 v1fp8 = *(__m256*)_ps256_1; -const __m256 vn1fp8 = _mm256_set1_ps(-1.0f); -const __m256i v1ip8 = _mm256_set1_epi32(1); -const __m256i vn1ip8 = _mm256_set1_epi32(-1); + +_PS256_CONST(n1, -1.0f); +_PI32_CONST256(n1, -1); + +static NCNN_FORCEINLINE __m256 mask_gather_ps256(const float* ptr, __m256i offset, __m256 mask) +{ +#if __AVX2__ + __m256 v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, offset, mask, sizeof(float)); +#else + int offseti[8], maski[8]; + memcpy(offseti, &offset, 8 * sizeof(int)); + memcpy(maski, &mask, 8 * sizeof(int)); + + float data[8] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + for (int i = 0; i < 8; i++) + { + if (maski[i] & 0xF0000000) + { + data[i] = *(ptr + offseti[i]); + } + } + + __m256 v = _mm256_loadu_ps(data); +#endif // __AVX__ + + return v; +} static NCNN_FORCEINLINE __m256 cubic_interp1d_p8(const __m256& x0_v, const __m256& x1_v, const __m256& x2_v, const __m256& x3_v, const __m256& tx) { const __m256 A = _mm256_set1_ps(-0.75f); - const __m256 x0 = _mm256_add_ps(tx, v1fp8); + const __m256 x0 = _mm256_add_ps(tx, *(__m256*)_ps256_1); const __m256& x1 = tx; - const __m256 x2 = _mm256_sub_ps(v1fp8, tx); - //const __m256 x3 = _mm256_add_ps(x2, v1fp8); + const __m256 x2 = _mm256_sub_ps(*(__m256*)_ps256_1, tx); + //const __m256 x3 = _mm256_add_ps(x2, *(__m256*)_ps256_1); const __m256 coeffs0 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x0), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(4), A)); - const __m256 coeffs1 = 
_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x1), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x1), x1), v1fp8); - const __m256 coeffs2 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x2), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x2), x2), v1fp8); - const __m256 coeffs3 = _mm256_sub_ps(_mm256_sub_ps(_mm256_sub_ps(v1fp8, coeffs0), coeffs1), coeffs2); + const __m256 coeffs1 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x1), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x1), x1), *(__m256*)_ps256_1); + const __m256 coeffs2 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x2), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x2), x2), *(__m256*)_ps256_1); + const __m256 coeffs3 = _mm256_sub_ps(_mm256_sub_ps(_mm256_sub_ps(*(__m256*)_ps256_1, coeffs0), coeffs1), coeffs2); __m256 _v = _mm256_mul_ps(coeffs0, x0_v); _v = _mm256_comp_fmadd_ps(coeffs1, x1_v, _v); @@ -89,7 +111,7 @@ const __m128i vn1ip4 = _mm_set1_epi32(-1); static NCNN_FORCEINLINE __m128 mask_gather_ps(const float* ptr, __m128i offset, __m128 mask) { -#if __AVX__ +#if __AVX2__ __m128 v = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, offset, mask, sizeof(float)); #else int offseti[4], maski[4]; @@ -925,19 +947,19 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&x1_in_range), sizeof(float)); __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); @@ -1298,16 +1320,16 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&x1_in_range), sizeof(float)); __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); @@ -1427,9 +1449,9 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&x1_in_range), sizeof(float)); __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); @@ -1583,9 +1605,9 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&x1_in_range), sizeof(float)); __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_se_offset, 
*reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); @@ -1732,10 +1754,10 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Mon, 5 Dec 2022 18:11:36 +0800 Subject: [PATCH 037/127] slove the sse2 complier problem --- src/layer/x86/gridsample_bicubic_pack4.h | 24 +- src/layer/x86/gridsample_bilinear_pack4.h | 337 ++++++++++++++++++++-- src/layer/x86/gridsample_nearest_pack4.h | 144 ++++++--- src/layer/x86/gridsample_x86.cpp | 142 ++++----- src/layer/x86/sse_mathfun.h | 84 ++++++ src/layer/x86/unaryop_x86.cpp | 78 +---- tests/test_gridsample.cpp | 36 +-- 7 files changed, 608 insertions(+), 237 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_pack4.h b/src/layer/x86/gridsample_bicubic_pack4.h index 6e1efbe6b54..43f95ad111f 100644 --- a/src/layer/x86/gridsample_bicubic_pack4.h +++ b/src/layer/x86/gridsample_bicubic_pack4.h @@ -64,8 +64,8 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack4(const Mat& src, Mat& d gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); } - __m128 gx_floor = _mm_floor_ps(gx); - __m128 gy_floor = _mm_floor_ps(gy); + __m128 gx_floor = floor_ps(gx); + __m128 gy_floor = floor_ps(gy); const __m128 tx = _mm_sub_ps(gx, gx_floor); const __m128 ty = _mm_sub_ps(gy, gy_floor); @@ -164,8 +164,8 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack4(const Mat& src, Mat& d gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); } - __m128 gx_floor = _mm_floor_ps(gx); - __m128 gy_floor = _mm_floor_ps(gy); + __m128 gx_floor = floor_ps(gx); + __m128 gy_floor = floor_ps(gy); const __m128 tx = _mm_sub_ps(gx, gx_floor); const __m128 ty = _mm_sub_ps(gy, gy_floor); @@ -262,8 +262,8 @@ static void gridsample_2d_bicubic_align0_border_blob_pack4(const Mat& src, Mat& gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - __m128 gx_floor = _mm_floor_ps(gx); - __m128 gy_floor = _mm_floor_ps(gy); + __m128 gx_floor = floor_ps(gx); + __m128 gy_floor = floor_ps(gy); const __m128 tx = _mm_sub_ps(gx, gx_floor); const __m128 ty = _mm_sub_ps(gy, gy_floor); @@ -354,8 +354,8 @@ static void gridsample_2d_bicubic_align1_border_blob_pack4(const Mat& src, Mat& gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - __m128 gx_floor = _mm_floor_ps(gx); - __m128 gy_floor = _mm_floor_ps(gy); + __m128 gx_floor = floor_ps(gx); + __m128 gy_floor = floor_ps(gy); const __m128 tx = 
_mm_sub_ps(gx, gx_floor); const __m128 ty = _mm_sub_ps(gy, gy_floor); @@ -445,8 +445,8 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, M gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - __m128 gx_floor = _mm_floor_ps(gx); - __m128 gy_floor = _mm_floor_ps(gy); + __m128 gx_floor = floor_ps(gx); + __m128 gy_floor = floor_ps(gy); const __m128 tx = _mm_sub_ps(gx, gx_floor); const __m128 ty = _mm_sub_ps(gy, gy_floor); @@ -611,8 +611,8 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack4(const Mat& src, M gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - __m128 gx_floor = _mm_floor_ps(gx); - __m128 gy_floor = _mm_floor_ps(gy); + __m128 gx_floor = floor_ps(gx); + __m128 gy_floor = floor_ps(gy); const __m128 tx = _mm_sub_ps(gx, gx_floor); const __m128 ty = _mm_sub_ps(gy, gy_floor); diff --git a/src/layer/x86/gridsample_bilinear_pack4.h b/src/layer/x86/gridsample_bilinear_pack4.h index 482a2091678..7a47b90623a 100644 --- a/src/layer/x86/gridsample_bilinear_pack4.h +++ b/src/layer/x86/gridsample_bilinear_pack4.h @@ -20,6 +20,9 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128i vElempacki = _mm_set1_epi32(src.elempack); +#if !((_MSC_VER && __AVX__) || __SSE4_1__) + const __m128 vElempackf = _mm_set1_ps(src.elempack); +#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -42,8 +45,13 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); } +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128 x_w = _mm_floor_ps(gx); __m128 y_n = _mm_floor_ps(gy); +#else + __m128 x_w = floor_ps(gx); + __m128 y_n = floor_ps(gy); +#endif // __SSE4_1__ __m128 w = _mm_sub_ps(gx, x_w); __m128 e = _mm_sub_ps(v1fp4, w); @@ -70,12 +78,25 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& __m128i v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - // (W*y + x) * elempack + vec(8) + // (W*y + x) * elempack + vec(4) +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), _mm_set_epi32(3, 2, 1, 0)); __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); +#else + __m128 nw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w), vElempackf), + _mm_set_ps(3, 2, 1, 0)); + __m128 ne_offset = _mm_add_ps(nw_offset, vElempackf); + __m128 sw_offset = _mm_add_ps(nw_offset, _mm_mul_ps(vImgWf, vElempackf)); + __m128 se_offset = _mm_add_ps(sw_offset, vElempackf); + + __m128i i_nw_offset = _mm_cvtps_epi32(nw_offset); + __m128i i_ne_offset = _mm_cvtps_epi32(ne_offset); + __m128i i_sw_offset = _mm_cvtps_epi32(sw_offset); + __m128i i_se_offset = _mm_cvtps_epi32(se_offset); +#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -103,6 +124,9 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& const __m128i vImgHi = 
_mm_set1_epi32(src.h); const __m128i vElempacki = _mm_set1_epi32(src.elempack); +#if !((_MSC_VER && __AVX__) || __SSE4_1__) + const __m128 vElempackf = _mm_set1_ps(src.elempack); +#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -125,8 +149,13 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); } +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128 x_w = _mm_floor_ps(gx); __m128 y_n = _mm_floor_ps(gy); +#else + __m128 x_w = floor_ps(gx); + __m128 y_n = floor_ps(gy); +#endif // __SSE4_1__ __m128 w = _mm_sub_ps(gx, x_w); __m128 e = _mm_sub_ps(v1fp4, w); @@ -153,12 +182,25 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& __m128i v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - // (W*y + x) * elempack + vec(8) + // (W*y + x) * elempack + vec(4) +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), _mm_set_epi32(3, 2, 1, 0)); __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); +#else + __m128 nw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w), vElempackf), + _mm_set_ps(3, 2, 1, 0)); + __m128 ne_offset = _mm_add_ps(nw_offset, vElempackf); + __m128 sw_offset = _mm_add_ps(nw_offset, _mm_mul_ps(vImgWf, vElempackf)); + __m128 se_offset = _mm_add_ps(sw_offset, vElempackf); + + __m128i i_nw_offset = _mm_cvtps_epi32(nw_offset); + __m128i i_ne_offset = _mm_cvtps_epi32(ne_offset); + __m128i i_sw_offset = _mm_cvtps_epi32(sw_offset); + __m128i i_se_offset = _mm_cvtps_epi32(se_offset); +#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -186,6 +228,9 @@ static void gridsample_2d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128i vElempacki = _mm_set1_epi32(src.elempack); +#if !((_MSC_VER && __AVX__) || __SSE4_1__) + const __m128 vElempackf = _mm_set1_ps(src.elempack); +#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -216,8 +261,13 @@ static void gridsample_2d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); } +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128 x_w = _mm_floor_ps(gx); __m128 y_n = _mm_floor_ps(gy); +#else + __m128 x_w = floor_ps(gx); + __m128 y_n = floor_ps(gy); +#endif // __SSE4_1__ __m128 w = _mm_sub_ps(gx, x_w); __m128 e = _mm_sub_ps(v1fp4, w); @@ -239,12 +289,25 @@ static void gridsample_2d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - // (W*y + x) * elempack + vec(8) + // (W*y + x) * elempack + vec(4) +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), _mm_set_epi32(3, 2, 1, 0)); __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); +#else + __m128 nw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y_n, 
vImgWf), x_w), vElempackf), + _mm_set_ps(3, 2, 1, 0)); + __m128 ne_offset = _mm_add_ps(nw_offset, vElempackf); + __m128 sw_offset = _mm_add_ps(nw_offset, _mm_mul_ps(vImgWf, vElempackf)); + __m128 se_offset = _mm_add_ps(sw_offset, vElempackf); + + __m128i i_nw_offset = _mm_cvtps_epi32(nw_offset); + __m128i i_ne_offset = _mm_cvtps_epi32(ne_offset); + __m128i i_sw_offset = _mm_cvtps_epi32(sw_offset); + __m128i i_se_offset = _mm_cvtps_epi32(se_offset); +#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -272,6 +335,9 @@ static void gridsample_2d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128i vElempacki = _mm_set1_epi32(src.elempack); +#if !((_MSC_VER && __AVX__) || __SSE4_1__) + const __m128 vElempackf = _mm_set1_ps(src.elempack); +#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -302,8 +368,13 @@ static void gridsample_2d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); } +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128 x_w = _mm_floor_ps(gx); __m128 y_n = _mm_floor_ps(gy); +#else + __m128 x_w = floor_ps(gx); + __m128 y_n = floor_ps(gy); +#endif // __SSE4_1__ __m128 w = _mm_sub_ps(gx, x_w); __m128 e = _mm_sub_ps(v1fp4, w); @@ -325,12 +396,25 @@ static void gridsample_2d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - // (W*y + x) * elempack + vec(8) + // (W*y + x) * elempack + vec(4) +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), _mm_set_epi32(3, 2, 1, 0)); __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); +#else + __m128 nw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w), vElempackf), + _mm_set_ps(3, 2, 1, 0)); + __m128 ne_offset = _mm_add_ps(nw_offset, vElempackf); + __m128 sw_offset = _mm_add_ps(nw_offset, _mm_mul_ps(vImgWf, vElempackf)); + __m128 se_offset = _mm_add_ps(sw_offset, vElempackf); + + __m128i i_nw_offset = _mm_cvtps_epi32(nw_offset); + __m128i i_ne_offset = _mm_cvtps_epi32(ne_offset); + __m128i i_sw_offset = _mm_cvtps_epi32(sw_offset); + __m128i i_se_offset = _mm_cvtps_epi32(se_offset); +#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -358,6 +442,9 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack4(const Mat& src, const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128i vElempacki = _mm_set1_epi32(src.elempack); +#if !((_MSC_VER && __AVX__) || __SSE4_1__) + const __m128 vElempackf = _mm_set1_ps(src.elempack); +#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -411,8 +498,13 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack4(const Mat& src, gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); } +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128 x_w = _mm_floor_ps(gx); __m128 y_n = _mm_floor_ps(gy); +#else + __m128 x_w = floor_ps(gx); + __m128 y_n = floor_ps(gy); +#endif // __SSE4_1__ __m128 w = _mm_sub_ps(gx, x_w); __m128 e = _mm_sub_ps(v1fp4, w); @@ -434,12 +526,25 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack4(const Mat& src, __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - 
// (W*y + x) * elempack + vec(8) + // (W*y + x) * elempack + vec(4) +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), _mm_set_epi32(3, 2, 1, 0)); __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); +#else + __m128 nw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w), vElempackf), + _mm_set_ps(3, 2, 1, 0)); + __m128 ne_offset = _mm_add_ps(nw_offset, vElempackf); + __m128 sw_offset = _mm_add_ps(nw_offset, _mm_mul_ps(vImgWf, vElempackf)); + __m128 se_offset = _mm_add_ps(sw_offset, vElempackf); + + __m128i i_nw_offset = _mm_cvtps_epi32(nw_offset); + __m128i i_ne_offset = _mm_cvtps_epi32(ne_offset); + __m128i i_sw_offset = _mm_cvtps_epi32(sw_offset); + __m128i i_se_offset = _mm_cvtps_epi32(se_offset); +#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -467,6 +572,9 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack4(const Mat& src, const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128i vElempacki = _mm_set1_epi32(src.elempack); +#if !((_MSC_VER && __AVX__) || __SSE4_1__) + const __m128 vElempackf = _mm_set1_ps(src.elempack); +#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -503,8 +611,13 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack4(const Mat& src, gy = _mm_sub_ps(border_y, reflecty_v); } +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128 x_w = _mm_floor_ps(gx); __m128 y_n = _mm_floor_ps(gy); +#else + __m128 x_w = floor_ps(gx); + __m128 y_n = floor_ps(gy); +#endif // __SSE4_1__ __m128 w = _mm_sub_ps(gx, x_w); __m128 e = _mm_sub_ps(v1fp4, w); @@ -526,12 +639,25 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack4(const Mat& src, __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - // (W*y + x) * elempack + vec(8) + // (W*y + x) * elempack + vec(4) +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), _mm_set_epi32(3, 2, 1, 0)); __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); +#else + __m128 nw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w), vElempackf), + _mm_set_ps(3, 2, 1, 0)); + __m128 ne_offset = _mm_add_ps(nw_offset, vElempackf); + __m128 sw_offset = _mm_add_ps(nw_offset, _mm_mul_ps(vImgWf, vElempackf)); + __m128 se_offset = _mm_add_ps(sw_offset, vElempackf); + + __m128i i_nw_offset = _mm_cvtps_epi32(nw_offset); + __m128i i_ne_offset = _mm_cvtps_epi32(ne_offset); + __m128i i_sw_offset = _mm_cvtps_epi32(sw_offset); + __m128i i_se_offset = _mm_cvtps_epi32(se_offset); +#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -561,6 +687,9 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& const __m128i vImgDi = _mm_set1_epi32(src.d); const __m128i vElempacki = _mm_set1_epi32(src.elempack); +#if !((_MSC_VER && __AVX__) || __SSE4_1__) + const __m128 vElempackf = _mm_set1_ps(src.elempack); +#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -589,9 +718,16 @@ static void 
gridsample_3d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); } +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128 x_w = _mm_floor_ps(gx); __m128 y_n = _mm_floor_ps(gy); __m128 z_t = _mm_floor_ps(gz); +#else + __m128 x_w = floor_ps(gx); + __m128 y_n = floor_ps(gy); + __m128 z_t = floor_ps(gz); +#endif // __SSE4_1__ + __m128 w = _mm_sub_ps(gx, x_w); __m128 e = _mm_sub_ps(v1fp4, w); @@ -650,7 +786,8 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); } - // (W*H*z + W*y + x) * elempack + vec(8) + // (W*H*z + W*y + x) * elempack + vec(4) +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128i i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); __m128i i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); __m128i i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); @@ -660,6 +797,27 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& __m128i i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); __m128i i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); __m128i i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); +#else + __m128 tnw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), z_t), _mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w)), vElempackf), _mm_set_ps(3, 2, 1, 0)); + __m128 tne_offset = _mm_add_ps(tnw_offset, vElempackf); + __m128 tsw_offset = _mm_add_ps(tnw_offset, _mm_mul_ps(vImgWf, vElempackf)); + __m128 tse_offset = _mm_add_ps(tsw_offset, vElempackf); + + __m128 bnw_offset = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); + __m128 bne_offset = _mm_add_ps(bnw_offset, vElempackf); + __m128 bsw_offset = _mm_add_ps(bnw_offset, _mm_mul_ps(vImgWf, vElempackf)); + __m128 bse_offset = _mm_add_ps(bsw_offset, vElempackf); + + __m128i i_tnw_offset = _mm_cvtps_epi32(tnw_offset); + __m128i i_tne_offset = _mm_cvtps_epi32(tne_offset); + __m128i i_tsw_offset = _mm_cvtps_epi32(tsw_offset); + __m128i i_tse_offset = _mm_cvtps_epi32(tse_offset); + + __m128i i_bnw_offset = _mm_cvtps_epi32(bnw_offset); + __m128i i_bne_offset = _mm_cvtps_epi32(bne_offset); + __m128i i_bsw_offset = _mm_cvtps_epi32(bsw_offset); + __m128i i_bse_offset = _mm_cvtps_epi32(bse_offset); +#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -700,6 +858,9 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& const __m128i vImgDi = _mm_set1_epi32(src.d); const __m128i vElempacki = _mm_set1_epi32(src.elempack); +#if !((_MSC_VER && __AVX__) || __SSE4_1__) + const __m128 vElempackf = _mm_set1_ps(src.elempack); +#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -728,9 +889,15 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); } +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128 x_w = _mm_floor_ps(gx); __m128 y_n = _mm_floor_ps(gy); __m128 z_t = _mm_floor_ps(gz); +#else + __m128 x_w = floor_ps(gx); + __m128 y_n = floor_ps(gy); + __m128 z_t = floor_ps(gz); +#endif // __SSE4_1__ __m128 w = _mm_sub_ps(gx, x_w); __m128 e = _mm_sub_ps(v1fp4, w); @@ -789,7 +956,8 @@ static void 
gridsample_3d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); } - // (W*H*z + W*y + x) * elempack + vec(8) + // (W*H*z + W*y + x) * elempack + vec(4) +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128i i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); __m128i i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); __m128i i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); @@ -799,6 +967,27 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& __m128i i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); __m128i i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); __m128i i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); +#else + __m128 tnw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), z_t), _mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w)), vElempackf), _mm_set_ps(3, 2, 1, 0)); + __m128 tne_offset = _mm_add_ps(tnw_offset, vElempackf); + __m128 tsw_offset = _mm_add_ps(tnw_offset, _mm_mul_ps(vImgWf, vElempackf)); + __m128 tse_offset = _mm_add_ps(tsw_offset, vElempackf); + + __m128 bnw_offset = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); + __m128 bne_offset = _mm_add_ps(bnw_offset, vElempackf); + __m128 bsw_offset = _mm_add_ps(bnw_offset, _mm_mul_ps(vImgWf, vElempackf)); + __m128 bse_offset = _mm_add_ps(bsw_offset, vElempackf); + + __m128i i_tnw_offset = _mm_cvtps_epi32(tnw_offset); + __m128i i_tne_offset = _mm_cvtps_epi32(tne_offset); + __m128i i_tsw_offset = _mm_cvtps_epi32(tsw_offset); + __m128i i_tse_offset = _mm_cvtps_epi32(tse_offset); + + __m128i i_bnw_offset = _mm_cvtps_epi32(bnw_offset); + __m128i i_bne_offset = _mm_cvtps_epi32(bne_offset); + __m128i i_bsw_offset = _mm_cvtps_epi32(bsw_offset); + __m128i i_bse_offset = _mm_cvtps_epi32(bse_offset); +#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -839,6 +1028,9 @@ static void gridsample_3d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& const __m128i vImgDi = _mm_set1_epi32(src.d); const __m128i vElempacki = _mm_set1_epi32(src.elempack); +#if !((_MSC_VER && __AVX__) || __SSE4_1__) + const __m128 vElempackf = _mm_set1_ps(src.elempack); +#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -879,9 +1071,9 @@ static void gridsample_3d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); } - __m128 x_w = _mm_floor_ps(gx); - __m128 y_n = _mm_floor_ps(gy); - __m128 z_t = _mm_floor_ps(gz); + __m128 x_w = floor_ps(gx); + __m128 y_n = floor_ps(gy); + __m128 z_t = floor_ps(gz); __m128 w = _mm_sub_ps(gx, x_w); __m128 e = _mm_sub_ps(v1fp4, w); @@ -930,7 +1122,8 @@ static void gridsample_3d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); } - // (W*H*z + W*y + x) * elempack + vec(8) + // (W*H*z + W*y + x) * elempack + vec(4) +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128i i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); __m128i i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); __m128i i_tsw_offset = _mm_add_epi32(i_tnw_offset, 
_mm_mullo_epi32(vImgWi, vElempacki)); @@ -940,6 +1133,27 @@ static void gridsample_3d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& __m128i i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); __m128i i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); __m128i i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); +#else + __m128 tnw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), z_t), _mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w)), vElempackf), _mm_set_ps(3, 2, 1, 0)); + __m128 tne_offset = _mm_add_ps(tnw_offset, vElempackf); + __m128 tsw_offset = _mm_add_ps(tnw_offset, _mm_mul_ps(vImgWf, vElempackf)); + __m128 tse_offset = _mm_add_ps(tsw_offset, vElempackf); + + __m128 bnw_offset = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); + __m128 bne_offset = _mm_add_ps(bnw_offset, vElempackf); + __m128 bsw_offset = _mm_add_ps(bnw_offset, _mm_mul_ps(vImgWf, vElempackf)); + __m128 bse_offset = _mm_add_ps(bsw_offset, vElempackf); + + __m128i i_tnw_offset = _mm_cvtps_epi32(tnw_offset); + __m128i i_tne_offset = _mm_cvtps_epi32(tne_offset); + __m128i i_tsw_offset = _mm_cvtps_epi32(tsw_offset); + __m128i i_tse_offset = _mm_cvtps_epi32(tse_offset); + + __m128i i_bnw_offset = _mm_cvtps_epi32(bnw_offset); + __m128i i_bne_offset = _mm_cvtps_epi32(bne_offset); + __m128i i_bsw_offset = _mm_cvtps_epi32(bsw_offset); + __m128i i_bse_offset = _mm_cvtps_epi32(bse_offset); +#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -980,6 +1194,9 @@ static void gridsample_3d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& const __m128i vImgDi = _mm_set1_epi32(src.d); const __m128i vElempacki = _mm_set1_epi32(src.elempack); +#if !((_MSC_VER && __AVX__) || __SSE4_1__) + const __m128 vElempackf = _mm_set1_ps(src.elempack); +#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -1020,9 +1237,9 @@ static void gridsample_3d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); } - __m128 x_w = _mm_floor_ps(gx); - __m128 y_n = _mm_floor_ps(gy); - __m128 z_t = _mm_floor_ps(gz); + __m128 x_w = floor_ps(gx); + __m128 y_n = floor_ps(gy); + __m128 z_t = floor_ps(gz); __m128 w = _mm_sub_ps(gx, x_w); __m128 e = _mm_sub_ps(v1fp4, w); @@ -1071,7 +1288,8 @@ static void gridsample_3d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); } - // (W*H*z + W*y + x) * elempack + vec(8) + // (W*H*z + W*y + x) * elempack + vec(4) +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128i i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); __m128i i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); __m128i i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); @@ -1081,6 +1299,27 @@ static void gridsample_3d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& __m128i i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); __m128i i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); __m128i i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); +#else + __m128 tnw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), z_t), _mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w)), vElempackf), _mm_set_ps(3, 2, 1, 0)); + __m128 tne_offset = 
_mm_add_ps(tnw_offset, vElempackf); + __m128 tsw_offset = _mm_add_ps(tnw_offset, _mm_mul_ps(vImgWf, vElempackf)); + __m128 tse_offset = _mm_add_ps(tsw_offset, vElempackf); + + __m128 bnw_offset = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); + __m128 bne_offset = _mm_add_ps(bnw_offset, vElempackf); + __m128 bsw_offset = _mm_add_ps(bnw_offset, _mm_mul_ps(vImgWf, vElempackf)); + __m128 bse_offset = _mm_add_ps(bsw_offset, vElempackf); + + __m128i i_tnw_offset = _mm_cvtps_epi32(tnw_offset); + __m128i i_tne_offset = _mm_cvtps_epi32(tne_offset); + __m128i i_tsw_offset = _mm_cvtps_epi32(tsw_offset); + __m128i i_tse_offset = _mm_cvtps_epi32(tse_offset); + + __m128i i_bnw_offset = _mm_cvtps_epi32(bnw_offset); + __m128i i_bne_offset = _mm_cvtps_epi32(bne_offset); + __m128i i_bsw_offset = _mm_cvtps_epi32(bsw_offset); + __m128i i_bse_offset = _mm_cvtps_epi32(bse_offset); +#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -1121,6 +1360,9 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, const __m128i vImgDi = _mm_set1_epi32(src.d); const __m128i vElempacki = _mm_set1_epi32(src.elempack); +#if !((_MSC_VER && __AVX__) || __SSE4_1__) + const __m128 vElempackf = _mm_set1_ps(src.elempack); +#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -1192,9 +1434,9 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); } - __m128 x_w = _mm_floor_ps(gx); - __m128 y_n = _mm_floor_ps(gy); - __m128 z_t = _mm_floor_ps(gz); + __m128 x_w = floor_ps(gx); + __m128 y_n = floor_ps(gy); + __m128 z_t = floor_ps(gz); __m128 w = _mm_sub_ps(gx, x_w); __m128 e = _mm_sub_ps(v1fp4, w); @@ -1243,7 +1485,8 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); } - // (W*H*z + W*y + x) * elempack + vec(8) + // (W*H*z + W*y + x) * elempack + vec(4) +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128i i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); __m128i i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); __m128i i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); @@ -1253,6 +1496,27 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, __m128i i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); __m128i i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); __m128i i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); +#else + __m128 tnw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), z_t), _mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w)), vElempackf), _mm_set_ps(3, 2, 1, 0)); + __m128 tne_offset = _mm_add_ps(tnw_offset, vElempackf); + __m128 tsw_offset = _mm_add_ps(tnw_offset, _mm_mul_ps(vImgWf, vElempackf)); + __m128 tse_offset = _mm_add_ps(tsw_offset, vElempackf); + + __m128 bnw_offset = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); + __m128 bne_offset = _mm_add_ps(bnw_offset, vElempackf); + __m128 bsw_offset = _mm_add_ps(bnw_offset, _mm_mul_ps(vImgWf, vElempackf)); + __m128 bse_offset = _mm_add_ps(bsw_offset, vElempackf); + + __m128i i_tnw_offset = _mm_cvtps_epi32(tnw_offset); + __m128i i_tne_offset = _mm_cvtps_epi32(tne_offset); + 
__m128i i_tsw_offset = _mm_cvtps_epi32(tsw_offset); + __m128i i_tse_offset = _mm_cvtps_epi32(tse_offset); + + __m128i i_bnw_offset = _mm_cvtps_epi32(bnw_offset); + __m128i i_bne_offset = _mm_cvtps_epi32(bne_offset); + __m128i i_bsw_offset = _mm_cvtps_epi32(bsw_offset); + __m128i i_bse_offset = _mm_cvtps_epi32(bse_offset); +#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -1293,6 +1557,9 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack4(const Mat& src, const __m128i vImgDi = _mm_set1_epi32(src.d); const __m128i vElempacki = _mm_set1_epi32(src.elempack); +#if !((_MSC_VER && __AVX__) || __SSE4_1__) + const __m128 vElempackf = _mm_set1_ps(src.elempack); +#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -1339,9 +1606,9 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack4(const Mat& src, gz = _mm_sub_ps(border_z, reflectz_v); } - __m128 x_w = _mm_floor_ps(gx); - __m128 y_n = _mm_floor_ps(gy); - __m128 z_t = _mm_floor_ps(gz); + __m128 x_w = floor_ps(gx); + __m128 y_n = floor_ps(gy); + __m128 z_t = floor_ps(gz); __m128 w = _mm_sub_ps(gx, x_w); __m128 e = _mm_sub_ps(v1fp4, w); @@ -1390,7 +1657,8 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack4(const Mat& src, v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); } - // (W*H*z + W*y + x) * elempack + vec(8) + // (W*H*z + W*y + x) * elempack + vec(4) +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128i i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); __m128i i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); __m128i i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); @@ -1400,6 +1668,27 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack4(const Mat& src, __m128i i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); __m128i i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); __m128i i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); +#else + __m128 tnw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), z_t), _mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w)), vElempackf), _mm_set_ps(3, 2, 1, 0)); + __m128 tne_offset = _mm_add_ps(tnw_offset, vElempackf); + __m128 tsw_offset = _mm_add_ps(tnw_offset, _mm_mul_ps(vImgWf, vElempackf)); + __m128 tse_offset = _mm_add_ps(tsw_offset, vElempackf); + + __m128 bnw_offset = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); + __m128 bne_offset = _mm_add_ps(bnw_offset, vElempackf); + __m128 bsw_offset = _mm_add_ps(bnw_offset, _mm_mul_ps(vImgWf, vElempackf)); + __m128 bse_offset = _mm_add_ps(bsw_offset, vElempackf); + + __m128i i_tnw_offset = _mm_cvtps_epi32(tnw_offset); + __m128i i_tne_offset = _mm_cvtps_epi32(tne_offset); + __m128i i_tsw_offset = _mm_cvtps_epi32(tsw_offset); + __m128i i_tse_offset = _mm_cvtps_epi32(tse_offset); + + __m128i i_bnw_offset = _mm_cvtps_epi32(bnw_offset); + __m128i i_bne_offset = _mm_cvtps_epi32(bne_offset); + __m128i i_bsw_offset = _mm_cvtps_epi32(bsw_offset); + __m128i i_bse_offset = _mm_cvtps_epi32(bse_offset); +#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { diff --git a/src/layer/x86/gridsample_nearest_pack4.h b/src/layer/x86/gridsample_nearest_pack4.h index 51cdd13a63e..e8fa5d2c7e7 100644 --- a/src/layer/x86/gridsample_nearest_pack4.h +++ 
b/src/layer/x86/gridsample_nearest_pack4.h @@ -20,6 +20,9 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128i vElempacki = _mm_set1_epi32(src.elempack); +#if !((_MSC_VER && __AVX__) || __SSE4_1__) + const __m128 vElempackf = _mm_set1_ps(src.elempack); +#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -42,8 +45,8 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); } - gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); __m128i ix = _mm_cvtps_epi32(gx); __m128i iy = _mm_cvtps_epi32(gy); @@ -51,8 +54,14 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d __m128i v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm_set_epi32(3, 2, 1, 0)); +#else + __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx), vElempackf), + _mm_set_ps(3, 2, 1, 0)); + __m128i i_offset = _mm_cvtps_epi32(offset); +#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -72,6 +81,9 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128i vElempacki = _mm_set1_epi32(src.elempack); +#if !((_MSC_VER && __AVX__) || __SSE4_1__) + const __m128 vElempackf = _mm_set1_ps(src.elempack); +#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -94,8 +106,8 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); } - gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); __m128i ix = _mm_cvtps_epi32(gx); __m128i iy = _mm_cvtps_epi32(gy); @@ -103,8 +115,14 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d __m128i v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm_set_epi32(3, 2, 1, 0)); +#else + __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx), vElempackf), + _mm_set_ps(3, 2, 1, 0)); + __m128i i_offset = _mm_cvtps_epi32(offset); +#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -124,6 +142,9 @@ static void gridsample_2d_nearest_align0_border_blob_pack4(const Mat& src, Mat& const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128i vElempacki = _mm_set1_epi32(src.elempack); +#if !((_MSC_VER && __AVX__) || __SSE4_1__) + const __m128 vElempackf = _mm_set1_ps(src.elempack); +#endif // !__SSE4_1__ #pragma omp parallel for 
num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -154,14 +175,20 @@ static void gridsample_2d_nearest_align0_border_blob_pack4(const Mat& src, Mat& gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); } - gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); __m128i ix = _mm_cvtps_epi32(gx); __m128i iy = _mm_cvtps_epi32(gy); +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm_set_epi32(3, 2, 1, 0)); +#else + __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx), vElempackf), + _mm_set_ps(3, 2, 1, 0)); + __m128i i_offset = _mm_cvtps_epi32(offset); +#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -181,6 +208,9 @@ static void gridsample_2d_nearest_align1_border_blob_pack4(const Mat& src, Mat& const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128i vElempacki = _mm_set1_epi32(src.elempack); +#if !((_MSC_VER && __AVX__) || __SSE4_1__) + const __m128 vElempackf = _mm_set1_ps(src.elempack); +#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -211,14 +241,20 @@ static void gridsample_2d_nearest_align1_border_blob_pack4(const Mat& src, Mat& gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); } - gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); __m128i ix = _mm_cvtps_epi32(gx); __m128i iy = _mm_cvtps_epi32(gy); +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm_set_epi32(3, 2, 1, 0)); +#else + __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx), vElempackf), + _mm_set_ps(3, 2, 1, 0)); + __m128i i_offset = _mm_cvtps_epi32(offset); +#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -238,6 +274,9 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack4(const Mat& src, M const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128i vElempacki = _mm_set1_epi32(src.elempack); +#if !((_MSC_VER && __AVX__) || __SSE4_1__) + const __m128 vElempackf = _mm_set1_ps(src.elempack); +#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -253,8 +292,8 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack4(const Mat& src, M gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); // compute coord { @@ -295,8 +334,14 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack4(const Mat& src, M __m128i ix = _mm_cvtps_epi32(gx); __m128i iy = _mm_cvtps_epi32(gy); +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm_set_epi32(3, 2, 1, 0)); +#else + __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx), vElempackf), + 
_mm_set_ps(3, 2, 1, 0)); + __m128i i_offset = _mm_cvtps_epi32(offset); +#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -316,6 +361,9 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack4(const Mat& src, M const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128i vElempacki = _mm_set1_epi32(src.elempack); +#if !((_MSC_VER && __AVX__) || __SSE4_1__) + const __m128 vElempackf = _mm_set1_ps(src.elempack); +#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -331,8 +379,8 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack4(const Mat& src, M gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); // compute coord { @@ -356,8 +404,14 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack4(const Mat& src, M __m128i ix = _mm_cvtps_epi32(gx); __m128i iy = _mm_cvtps_epi32(gy); +#if (_MSC_VER && __AVX__) || __SSE4_1__ __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm_set_epi32(3, 2, 1, 0)); +#else + __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx), vElempackf), + _mm_set_ps(3, 2, 1, 0)); + __m128i i_offset = _mm_cvtps_epi32(offset); +#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -379,6 +433,7 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d const __m128i vImgDi = _mm_set1_epi32(src.d); const __m128i vElempacki = _mm_set1_epi32(src.elempack); + const __m128 vElempackf = _mm_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -407,9 +462,9 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); } - gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - gz = _mm_floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); + gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + gz = floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); __m128i ix = _mm_cvtps_epi32(gx); __m128i iy = _mm_cvtps_epi32(gy); @@ -419,7 +474,9 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); v_in_range = _mm_and_si128(v_in_range, _mm_and_si128(_mm_cmpgt_epi32(iz, vn1ip4), _mm_cmpgt_epi32(vImgDi, iz))); - __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), + _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)),vElempackf), _mm_set_ps(3, 2, 1, 0)); + __m128i i_offset = _mm_cvtps_epi32(offset); for (int q = 0; q < dst.c; q++) { @@ -442,6 +499,7 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d const __m128i vImgDi = _mm_set1_epi32(src.d); const __m128i vElempacki = _mm_set1_epi32(src.elempack); + const __m128 vElempackf = _mm_set1_ps(src.elempack); 
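// Illustrative note, not part of this patch: the float fallback added above reproduces the
// same flat index the SSE4.1 integer path computes. In scalar form, and assuming w, h and
// elempack describe the source blob while (x, y, z) is the rounded sampling coordinate
// (the helper name below is only for illustration):
static inline int grid_sample_flat_offset(int x, int y, int z, int w, int h, int elempack, int lane)
{
    // (W*H*z + W*y + x) * elempack + lane, with lane in [0, elempack).
    // This stays exact in single-precision float as long as the index is below 2^24,
    // which is why the SSE2 path can evaluate it with _mm_mul_ps/_mm_add_ps and
    // convert back with _mm_cvtps_epi32.
    return (w * h * z + w * y + x) * elempack + lane;
}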
#pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -470,9 +528,9 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); } - gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - gz = _mm_floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); + gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + gz = floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); __m128i ix = _mm_cvtps_epi32(gx); __m128i iy = _mm_cvtps_epi32(gy); @@ -482,7 +540,9 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); v_in_range = _mm_and_si128(v_in_range, _mm_and_si128(_mm_cmpgt_epi32(iz, vn1ip4), _mm_cmpgt_epi32(vImgDi, iz))); - __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), + _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)),vElempackf), _mm_set_ps(3, 2, 1, 0)); + __m128i i_offset = _mm_cvtps_epi32(offset); for (int q = 0; q < dst.c; q++) { @@ -505,6 +565,7 @@ static void gridsample_3d_nearest_align0_border_blob_pack4(const Mat& src, Mat& const __m128i vImgDi = _mm_set1_epi32(src.d); const __m128i vElempacki = _mm_set1_epi32(src.elempack); + const __m128 vElempackf = _mm_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -545,15 +606,17 @@ static void gridsample_3d_nearest_align0_border_blob_pack4(const Mat& src, Mat& gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); } - gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - gz = _mm_floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); + gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + gz = floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); __m128i ix = _mm_cvtps_epi32(gx); __m128i iy = _mm_cvtps_epi32(gy); __m128i iz = _mm_cvtps_epi32(gz); - __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), + _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)),vElempackf), _mm_set_ps(3, 2, 1, 0)); + __m128i i_offset = _mm_cvtps_epi32(offset); for (int q = 0; q < dst.c; q++) { @@ -576,6 +639,7 @@ static void gridsample_3d_nearest_align1_border_blob_pack4(const Mat& src, Mat& const __m128i vImgDi = _mm_set1_epi32(src.d); const __m128i vElempacki = _mm_set1_epi32(src.elempack); + const __m128 vElempackf = _mm_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -616,15 +680,17 @@ static void gridsample_3d_nearest_align1_border_blob_pack4(const Mat& src, Mat& gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); } - gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - gz = _mm_floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); + gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = 
floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + gz = floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); __m128i ix = _mm_cvtps_epi32(gx); __m128i iy = _mm_cvtps_epi32(gy); __m128i iz = _mm_cvtps_epi32(gz); - __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), + _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)),vElempackf), _mm_set_ps(3, 2, 1, 0)); + __m128i i_offset = _mm_cvtps_epi32(offset); for (int q = 0; q < dst.c; q++) { @@ -647,6 +713,7 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack4(const Mat& src, M const __m128i vImgDi = _mm_set1_epi32(src.d); const __m128i vElempacki = _mm_set1_epi32(src.elempack); + const __m128 vElempackf = _mm_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -666,9 +733,9 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack4(const Mat& src, M gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); - gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - gz = _mm_floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); + gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + gz = floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); // compute coord { @@ -726,7 +793,9 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack4(const Mat& src, M __m128i iy = _mm_cvtps_epi32(gy); __m128i iz = _mm_cvtps_epi32(gz); - __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); + __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), + _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)),vElempackf), _mm_set_ps(3, 2, 1, 0)); + __m128i i_offset = _mm_cvtps_epi32(offset); for (int q = 0; q < dst.c; q++) { @@ -749,6 +818,7 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack4(const Mat& src, M const __m128i vImgDi = _mm_set1_epi32(src.d); const __m128i vElempacki = _mm_set1_epi32(src.elempack); + const __m128 vElempackf = _mm_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -768,9 +838,9 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack4(const Mat& src, M gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); - gx = _mm_floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = _mm_floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - gz = _mm_floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); + gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); + gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); + gz = floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); // compute coord { @@ -803,7 +873,9 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack4(const Mat& src, M __m128i iy = _mm_cvtps_epi32(gy); __m128i iz = _mm_cvtps_epi32(gz); - __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), iz), _mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix)), vElempacki), 
_mm_set_epi32(3, 2, 1, 0)); + __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), + _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)),vElempackf), _mm_set_ps(3, 2, 1, 0)); + __m128i i_offset = _mm_cvtps_epi32(offset); for (int q = 0; q < dst.c; q++) { diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 98ab9362083..3a4a852d841 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -937,7 +937,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&v00_in_range), sizeof(float)); - __m256 ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); - __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); - __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + __m256 nw_val = mask_gather_ps256(bottom_blob.channel(q), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range)); + __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range)); + __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range)); + __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -1059,7 +1059,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&v00_in_range), sizeof(float)); - __m256 ne_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range), sizeof(float)); - __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range), sizeof(float)); - __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + __m256 nw_val = mask_gather_ps256(bottom_blob.channel(q), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range)); + __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range)); + __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range)); + __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -1183,7 +1183,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&x1_in_range), sizeof(float)); - __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + __m256 nw_val = mask_gather_ps256(bottom_blob.channel(q), i_nw_offset, *(__m256*)_ps256_n1); + __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range)); + __m256 sw_val = 
mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); + __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -1310,7 +1310,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&x1_in_range), sizeof(float)); - __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + __m256 nw_val = mask_gather_ps256(bottom_blob.channel(q), i_nw_offset, *(__m256*)_ps256_n1); + __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range)); + __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); + __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -1439,7 +1439,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&x1_in_range), sizeof(float)); - __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + __m256 nw_val = mask_gather_ps256(bottom_blob.channel(q), i_nw_offset, *(__m256*)_ps256_n1); + __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range)); + __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); + __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -1595,7 +1595,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&x1_in_range), sizeof(float)); - __m256 sw_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range), sizeof(float)); - __m256 se_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range), sizeof(float)); + __m256 nw_val = mask_gather_ps256(bottom_blob.channel(q), i_nw_offset, *(__m256*)_ps256_n1); + __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range)); + __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); + __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -1744,7 +1744,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&v_in_range), sizeof(float)); + __m256 _v = mask_gather_ps256(bottom_blob.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range)); 
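// Illustrative note, not part of this patch: the call sites above only rely on the semantics of
// _mm256_mask_i32gather_ps with a zero source and a 4-byte scale — lanes whose mask sign bit is
// set load src[offset], the others produce 0. A minimal scalar reference with an assumed name
// (the real mask_gather_ps256 helper is defined elsewhere in this patch series and may differ,
// e.g. it presumably avoids the AVX2-only gather so AVX-only builds still compile):
#include <immintrin.h>
static inline __m256 mask_gather_ps256_ref(const float* src, __m256i offset, __m256 mask)
{
    int idx[8];
    int msk[8];
    float out[8];
    _mm256_storeu_si256((__m256i*)idx, offset);
    _mm256_storeu_si256((__m256i*)msk, _mm256_castps_si256(mask));
    for (int i = 0; i < 8; i++)
    {
        // only the sign bit of each mask lane matters, matching the intrinsic this stands in for
        out[i] = (msk[i] < 0) ? src[idx[i]] : 0.f;
    }
    return _mm256_loadu_ps(out);
}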
_mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); } @@ -1848,7 +1848,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(&v_in_range), sizeof(float)); + __m256 _v = mask_gather_ps256(bottom_blob.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range)); _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); } @@ -1896,7 +1896,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&v0_in_range[i]), sizeof(float)); - __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i]), sizeof(float)); - __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i]), sizeof(float)); - __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i]), sizeof(float)); + __m256 x0_val = mask_gather_ps256(bottom_blob.channel(q), v0_offset[i], *reinterpret_cast<__m256*>(&v0_in_range[i])); + __m256 x1_val = mask_gather_ps256(bottom_blob.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i])); + __m256 x2_val = mask_gather_ps256(bottom_blob.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i])); + __m256 x3_val = mask_gather_ps256(bottom_blob.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i])); coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); } @@ -2446,7 +2446,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&v0_in_range[i]), sizeof(float)); - __m256 x1_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i]), sizeof(float)); - __m256 x2_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i]), sizeof(float)); - __m256 x3_val = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), bottom_blob.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i]), sizeof(float)); + __m256 x0_val = mask_gather_ps256(bottom_blob.channel(q), v0_offset[i], *reinterpret_cast<__m256*>(&v0_in_range[i])); + __m256 x1_val = mask_gather_ps256(bottom_blob.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i])); + __m256 x2_val = mask_gather_ps256(bottom_blob.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i])); + __m256 x3_val = mask_gather_ps256(bottom_blob.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i])); coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); } @@ -2628,7 +2628,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(static_cast(absolute)); + __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(absolute)); + + // truncated_with_sign = (truncated || negative_mask); + __m128 truncated_with_sign = _mm_or_ps(truncated, negative_mask); + + // negative_fix = ((x < truncated_with_sign) ? 
1.0f : 0.0f); + __m128 negative_fix = _mm_and_ps( + _mm_cmplt_ps(x, truncated_with_sign), + _mm_set_ps1(1.0f)); + + // fixed_result = truncated_with_sign - negative_fix; + __m128 fixed_result = _mm_sub_ps(truncated_with_sign, negative_fix); + + // return ((x && no_fraction) || (!no_fraction && fixed_result)); + return _mm_or_ps( + _mm_and_ps(x, no_fraction), + _mm_andnot_ps(no_fraction, fixed_result)); +} + +static NCNN_FORCEINLINE __m128 ceil_ps(const __m128 x) +{ +#if (_MSC_VER && __AVX__) || __SSE4_1__ + return _mm_ceil_ps(x); +#endif // __SSE4_1__ + + // Use negative zero as the sign bit mask. + const __m128 magic_negative_zero = _mm_set_ps1(-0.0f); + + // The smallest float number that have no fractional part. (2^23) + const __m128 magic_smallest_no_fraction = _mm_set_ps1(8388608.0f); + + // absolute = abs(x); + __m128 absolute = _mm_andnot_ps(magic_negative_zero, x); + + // negative_mask = magic_negative_zero && x; + __m128 negative_mask = _mm_and_ps(magic_negative_zero, x); + + // no_fraction = (magic_smallest_no_fraction < absolute); + __m128 no_fraction = _mm_cmplt_ps(magic_smallest_no_fraction, absolute); + + // truncated = static_cast(static_cast(absolute)); + __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(absolute)); + + // truncated_with_sign = (truncated || negative_mask); + __m128 truncated_with_sign = _mm_or_ps(truncated, negative_mask); + + // positive_fix = ((x > -0.0f) && (x > truncated_with_sign) ? -1.0f : 0.0f); + __m128 positive_fix = _mm_and_ps( + _mm_and_ps( + _mm_cmpgt_ps(x, magic_negative_zero), + _mm_cmpgt_ps(x, truncated_with_sign)), + _mm_set_ps1(-1.0f)); + + // fixed_result = truncated_with_sign - positive_fix; + __m128 fixed_result = _mm_sub_ps(truncated_with_sign, positive_fix); + + // return ((x && no_fraction) || (!no_fraction && fixed_result)); + return _mm_or_ps( + _mm_and_ps(x, no_fraction), + _mm_andnot_ps(no_fraction, fixed_result)); +} + #endif // SSE_MATHFUN_H diff --git a/src/layer/x86/unaryop_x86.cpp b/src/layer/x86/unaryop_x86.cpp index d7e052c2994..9674804d322 100644 --- a/src/layer/x86/unaryop_x86.cpp +++ b/src/layer/x86/unaryop_x86.cpp @@ -158,43 +158,7 @@ struct unary_op_floor #if __SSE2__ __m128 func_pack4(const __m128& x) const { -#if __SSE4_1__ - return _mm_floor_ps(x); -#endif // __SSE4_1__ - - // Use negative zero as the sign bit mask. - const __m128 magic_negative_zero = _mm_set_ps1(-0.0f); - - // The smallest float number that have no fractional part. (2^23) - const __m128 magic_smallest_no_fraction = _mm_set_ps1(8388608.0f); - - // absolute = abs(x); - __m128 absolute = _mm_andnot_ps(magic_negative_zero, x); - - // negative_mask = magic_negative_zero && x; - __m128 negative_mask = _mm_and_ps(magic_negative_zero, x); - - // no_fraction = (magic_smallest_no_fraction < absolute); - __m128 no_fraction = _mm_cmplt_ps(magic_smallest_no_fraction, absolute); - - // truncated = static_cast(static_cast(absolute)); - __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(absolute)); - - // truncated_with_sign = (truncated || negative_mask); - __m128 truncated_with_sign = _mm_or_ps(truncated, negative_mask); - - // negative_fix = ((x < truncated_with_sign) ? 
1.0f : 0.0f); - __m128 negative_fix = _mm_and_ps( - _mm_cmplt_ps(x, truncated_with_sign), - _mm_set_ps1(1.0f)); - - // fixed_result = truncated_with_sign - negative_fix; - __m128 fixed_result = _mm_sub_ps(truncated_with_sign, negative_fix); - - // return ((x && no_fraction) || (!no_fraction && fixed_result)); - return _mm_or_ps( - _mm_and_ps(x, no_fraction), - _mm_andnot_ps(no_fraction, fixed_result)); + return floor_ps(x); } #if __AVX__ __m256 func_pack8(const __m256& x) const @@ -220,45 +184,7 @@ struct unary_op_ceil #if __SSE2__ __m128 func_pack4(const __m128& x) const { -#if __SSE4_1__ - return _mm_ceil_ps(x); -#endif // __SSE4_1__ - - // Use negative zero as the sign bit mask. - const __m128 magic_negative_zero = _mm_set_ps1(-0.0f); - - // The smallest float number that have no fractional part. (2^23) - const __m128 magic_smallest_no_fraction = _mm_set_ps1(8388608.0f); - - // absolute = abs(x); - __m128 absolute = _mm_andnot_ps(magic_negative_zero, x); - - // negative_mask = magic_negative_zero && x; - __m128 negative_mask = _mm_and_ps(magic_negative_zero, x); - - // no_fraction = (magic_smallest_no_fraction < absolute); - __m128 no_fraction = _mm_cmplt_ps(magic_smallest_no_fraction, absolute); - - // truncated = static_cast(static_cast(absolute)); - __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(absolute)); - - // truncated_with_sign = (truncated || negative_mask); - __m128 truncated_with_sign = _mm_or_ps(truncated, negative_mask); - - // positive_fix = ((x > -0.0f) && (x > truncated_with_sign) ? -1.0f : 0.0f); - __m128 positive_fix = _mm_and_ps( - _mm_and_ps( - _mm_cmpgt_ps(x, magic_negative_zero), - _mm_cmpgt_ps(x, truncated_with_sign)), - _mm_set_ps1(-1.0f)); - - // fixed_result = truncated_with_sign - positive_fix; - __m128 fixed_result = _mm_sub_ps(truncated_with_sign, positive_fix); - - // return ((x && no_fraction) || (!no_fraction && fixed_result)); - return _mm_or_ps( - _mm_and_ps(x, no_fraction), - _mm_andnot_ps(no_fraction, fixed_result)); + return ceil_ps(x); } #if __AVX__ __m256 func_pack8(const __m256& x) const diff --git a/tests/test_gridsample.cpp b/tests/test_gridsample.cpp index 3d019d98419..81043959cee 100644 --- a/tests/test_gridsample.cpp +++ b/tests/test_gridsample.cpp @@ -42,24 +42,24 @@ static int test_gridsample(const ncnn::Mat& a, const ncnn::Mat& grid, int sample static int test_gridsample_0() { return 0 - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 1, 1, 0) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 1, 1, 1) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 1, 2, 0) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 1, 2, 1) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 1, 3, 0) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 1, 3, 1) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 2, 1, 0) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 2, 1, 1) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 2, 2, 0) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 2, 2, 1) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 2, 3, 0) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 2, 3, 1) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 3, 1, 0) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 3, 1, 1) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 3, 2, 0) - || test_gridsample(RandomMat(16, 12, 16), 
RandomMat(2, 16, 12), 3, 2, 1) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 3, 3, 0) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 16, 12), 3, 3, 1); + || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 1, 1, 0) + || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 1, 1, 1) + || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 1, 2, 0) + || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 1, 2, 1) + || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 1, 3, 0) + || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 1, 3, 1) + || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 2, 1, 0) + || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 2, 1, 1) + || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 2, 2, 0) + || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 2, 2, 1) + || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 2, 3, 0) + || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 2, 3, 1) + || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 3, 1, 0) + || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 3, 1, 1) + || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 3, 2, 0) + || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 3, 2, 1) + || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 3, 3, 0) + || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 3, 3, 1); } static int test_gridsample_1() From cd0878b0aa17be9e9347b5cd8e0bd978f7398692 Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Mon, 5 Dec 2022 10:13:28 +0000 Subject: [PATCH 038/127] apply code-format changes --- src/layer/x86/gridsample_bilinear_pack4.h | 5 ++--- src/layer/x86/gridsample_nearest_pack4.h | 26 +++++++++++++++++------ src/layer/x86/sse_mathfun.h | 20 ++++++++--------- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/src/layer/x86/gridsample_bilinear_pack4.h b/src/layer/x86/gridsample_bilinear_pack4.h index 7a47b90623a..e8b3c207d82 100644 --- a/src/layer/x86/gridsample_bilinear_pack4.h +++ b/src/layer/x86/gridsample_bilinear_pack4.h @@ -87,7 +87,7 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); #else __m128 nw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w), vElempackf), - _mm_set_ps(3, 2, 1, 0)); + _mm_set_ps(3, 2, 1, 0)); __m128 ne_offset = _mm_add_ps(nw_offset, vElempackf); __m128 sw_offset = _mm_add_ps(nw_offset, _mm_mul_ps(vImgWf, vElempackf)); __m128 se_offset = _mm_add_ps(sw_offset, vElempackf); @@ -728,7 +728,6 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& __m128 z_t = floor_ps(gz); #endif // __SSE4_1__ - __m128 w = _mm_sub_ps(gx, x_w); __m128 e = _mm_sub_ps(v1fp4, w); __m128 n = _mm_sub_ps(gy, y_n); @@ -802,7 +801,7 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& __m128 tne_offset = _mm_add_ps(tnw_offset, vElempackf); __m128 tsw_offset = _mm_add_ps(tnw_offset, _mm_mul_ps(vImgWf, vElempackf)); __m128 tse_offset = _mm_add_ps(tsw_offset, vElempackf); - + __m128 bnw_offset = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); __m128 bne_offset = _mm_add_ps(bnw_offset, vElempackf); __m128 bsw_offset = _mm_add_ps(bnw_offset, _mm_mul_ps(vImgWf, vElempackf)); diff --git a/src/layer/x86/gridsample_nearest_pack4.h 
b/src/layer/x86/gridsample_nearest_pack4.h index e8fa5d2c7e7..dab9b431ee7 100644 --- a/src/layer/x86/gridsample_nearest_pack4.h +++ b/src/layer/x86/gridsample_nearest_pack4.h @@ -59,7 +59,7 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d _mm_set_epi32(3, 2, 1, 0)); #else __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx), vElempackf), - _mm_set_ps(3, 2, 1, 0)); + _mm_set_ps(3, 2, 1, 0)); __m128i i_offset = _mm_cvtps_epi32(offset); #endif // __SSE4_1__ @@ -475,7 +475,9 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d v_in_range = _mm_and_si128(v_in_range, _mm_and_si128(_mm_cmpgt_epi32(iz, vn1ip4), _mm_cmpgt_epi32(vImgDi, iz))); __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), - _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)),vElempackf), _mm_set_ps(3, 2, 1, 0)); + _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)), + vElempackf), + _mm_set_ps(3, 2, 1, 0)); __m128i i_offset = _mm_cvtps_epi32(offset); for (int q = 0; q < dst.c; q++) @@ -541,7 +543,9 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d v_in_range = _mm_and_si128(v_in_range, _mm_and_si128(_mm_cmpgt_epi32(iz, vn1ip4), _mm_cmpgt_epi32(vImgDi, iz))); __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), - _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)),vElempackf), _mm_set_ps(3, 2, 1, 0)); + _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)), + vElempackf), + _mm_set_ps(3, 2, 1, 0)); __m128i i_offset = _mm_cvtps_epi32(offset); for (int q = 0; q < dst.c; q++) @@ -615,7 +619,9 @@ static void gridsample_3d_nearest_align0_border_blob_pack4(const Mat& src, Mat& __m128i iz = _mm_cvtps_epi32(gz); __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), - _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)),vElempackf), _mm_set_ps(3, 2, 1, 0)); + _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)), + vElempackf), + _mm_set_ps(3, 2, 1, 0)); __m128i i_offset = _mm_cvtps_epi32(offset); for (int q = 0; q < dst.c; q++) @@ -689,7 +695,9 @@ static void gridsample_3d_nearest_align1_border_blob_pack4(const Mat& src, Mat& __m128i iz = _mm_cvtps_epi32(gz); __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), - _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)),vElempackf), _mm_set_ps(3, 2, 1, 0)); + _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)), + vElempackf), + _mm_set_ps(3, 2, 1, 0)); __m128i i_offset = _mm_cvtps_epi32(offset); for (int q = 0; q < dst.c; q++) @@ -794,7 +802,9 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack4(const Mat& src, M __m128i iz = _mm_cvtps_epi32(gz); __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), - _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)),vElempackf), _mm_set_ps(3, 2, 1, 0)); + _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)), + vElempackf), + _mm_set_ps(3, 2, 1, 0)); __m128i i_offset = _mm_cvtps_epi32(offset); for (int q = 0; q < dst.c; q++) @@ -874,7 +884,9 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack4(const Mat& src, M __m128i iz = _mm_cvtps_epi32(gz); __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), - _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)),vElempackf), _mm_set_ps(3, 2, 1, 0)); + _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)), + vElempackf), + _mm_set_ps(3, 2, 1, 0)); __m128i i_offset = _mm_cvtps_epi32(offset); for (int q = 0; q < dst.c; q++) diff --git a/src/layer/x86/sse_mathfun.h 
b/src/layer/x86/sse_mathfun.h index 64f0d1a7fcd..d7fa1ae92f6 100644 --- a/src/layer/x86/sse_mathfun.h +++ b/src/layer/x86/sse_mathfun.h @@ -767,16 +767,16 @@ static NCNN_FORCEINLINE __m128 floor_ps(const __m128 x) // negative_fix = ((x < truncated_with_sign) ? 1.0f : 0.0f); __m128 negative_fix = _mm_and_ps( - _mm_cmplt_ps(x, truncated_with_sign), - _mm_set_ps1(1.0f)); + _mm_cmplt_ps(x, truncated_with_sign), + _mm_set_ps1(1.0f)); // fixed_result = truncated_with_sign - negative_fix; __m128 fixed_result = _mm_sub_ps(truncated_with_sign, negative_fix); // return ((x && no_fraction) || (!no_fraction && fixed_result)); return _mm_or_ps( - _mm_and_ps(x, no_fraction), - _mm_andnot_ps(no_fraction, fixed_result)); + _mm_and_ps(x, no_fraction), + _mm_andnot_ps(no_fraction, fixed_result)); } static NCNN_FORCEINLINE __m128 ceil_ps(const __m128 x) @@ -808,18 +808,18 @@ static NCNN_FORCEINLINE __m128 ceil_ps(const __m128 x) // positive_fix = ((x > -0.0f) && (x > truncated_with_sign) ? -1.0f : 0.0f); __m128 positive_fix = _mm_and_ps( - _mm_and_ps( - _mm_cmpgt_ps(x, magic_negative_zero), - _mm_cmpgt_ps(x, truncated_with_sign)), - _mm_set_ps1(-1.0f)); + _mm_and_ps( + _mm_cmpgt_ps(x, magic_negative_zero), + _mm_cmpgt_ps(x, truncated_with_sign)), + _mm_set_ps1(-1.0f)); // fixed_result = truncated_with_sign - positive_fix; __m128 fixed_result = _mm_sub_ps(truncated_with_sign, positive_fix); // return ((x && no_fraction) || (!no_fraction && fixed_result)); return _mm_or_ps( - _mm_and_ps(x, no_fraction), - _mm_andnot_ps(no_fraction, fixed_result)); + _mm_and_ps(x, no_fraction), + _mm_andnot_ps(no_fraction, fixed_result)); } #endif // SSE_MATHFUN_H From 24f0351c44118ef5b09e39f1815ab355459ea5e0 Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Tue, 6 Dec 2022 15:44:19 +0800 Subject: [PATCH 039/127] fix compile bug and optimize avx sse [WIP] --- src/layer/x86/gridsample_bicubic_pack16.h | 24 +- src/layer/x86/gridsample_bicubic_pack4.h | 16 +- src/layer/x86/gridsample_bicubic_pack8.h | 96 +-- src/layer/x86/gridsample_bilinear_pack16.h | 60 +- src/layer/x86/gridsample_bilinear_pack4.h | 130 ++-- src/layer/x86/gridsample_bilinear_pack8.h | 731 ++++++++++++++------- src/layer/x86/gridsample_nearest_pack16.h | 60 +- src/layer/x86/gridsample_nearest_pack4.h | 8 +- src/layer/x86/gridsample_nearest_pack8.h | 146 +++- src/layer/x86/gridsample_x86.cpp | 428 +++++++----- 10 files changed, 1083 insertions(+), 616 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_pack16.h b/src/layer/x86/gridsample_bicubic_pack16.h index 4f4e9aa1e4c..5076252bb30 100644 --- a/src/layer/x86/gridsample_bicubic_pack16.h +++ b/src/layer/x86/gridsample_bicubic_pack16.h @@ -64,8 +64,8 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack16(const Mat& src, Mat& gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); } - __m512 gx_floor = _mm512_floor_ps(gx); - __m512 gy_floor = _mm512_floor_ps(gy); + __m512 gx_floor = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); + __m512 gy_floor = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); const __m512 tx = _mm512_sub_ps(gx, gx_floor); const __m512 ty = _mm512_sub_ps(gy, gy_floor); @@ -164,8 +164,8 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack16(const Mat& src, Mat& gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); } - __m512 gx_floor = _mm512_floor_ps(gx); - __m512 gy_floor = _mm512_floor_ps(gy); + __m512 gx_floor = _mm512_roundscale_ps(gx, 
_MM_FROUND_TO_NEG_INF); + __m512 gy_floor = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); const __m512 tx = _mm512_sub_ps(gx, gx_floor); const __m512 ty = _mm512_sub_ps(gy, gy_floor); @@ -262,8 +262,8 @@ static void gridsample_2d_bicubic_align0_border_blob_pack16(const Mat& src, Mat& gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - __m512 gx_floor = _mm512_floor_ps(gx); - __m512 gy_floor = _mm512_floor_ps(gy); + __m512 gx_floor = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); + __m512 gy_floor = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); const __m512 tx = _mm512_sub_ps(gx, gx_floor); const __m512 ty = _mm512_sub_ps(gy, gy_floor); @@ -354,8 +354,8 @@ static void gridsample_2d_bicubic_align1_border_blob_pack16(const Mat& src, Mat& gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - __m512 gx_floor = _mm512_floor_ps(gx); - __m512 gy_floor = _mm512_floor_ps(gy); + __m512 gx_floor = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); + __m512 gy_floor = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); const __m512 tx = _mm512_sub_ps(gx, gx_floor); const __m512 ty = _mm512_sub_ps(gy, gy_floor); @@ -445,8 +445,8 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack16(const Mat& src, gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - __m512 gx_floor = _mm512_floor_ps(gx); - __m512 gy_floor = _mm512_floor_ps(gy); + __m512 gx_floor = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); + __m512 gy_floor = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); const __m512 tx = _mm512_sub_ps(gx, gx_floor); const __m512 ty = _mm512_sub_ps(gy, gy_floor); @@ -611,8 +611,8 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack16(const Mat& src, gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - __m512 gx_floor = _mm512_floor_ps(gx); - __m512 gy_floor = _mm512_floor_ps(gy); + __m512 gx_floor = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); + __m512 gy_floor = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); const __m512 tx = _mm512_sub_ps(gx, gx_floor); const __m512 ty = _mm512_sub_ps(gy, gy_floor); diff --git a/src/layer/x86/gridsample_bicubic_pack4.h b/src/layer/x86/gridsample_bicubic_pack4.h index 43f95ad111f..11fb8517899 100644 --- a/src/layer/x86/gridsample_bicubic_pack4.h +++ b/src/layer/x86/gridsample_bicubic_pack4.h @@ -121,10 +121,10 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack4(const Mat& src, Mat& d { for (int i = 0; i < 4; i++) { - __m128 x0_val = mask_gather_ps(src.channel(q), v0_offset[i], *reinterpret_cast<__m128*>(&v0_in_range[i])); - __m128 x1_val = mask_gather_ps(src.channel(q), v1_offset[i], *reinterpret_cast<__m128*>(&v1_in_range[i])); - __m128 x2_val = mask_gather_ps(src.channel(q), v2_offset[i], *reinterpret_cast<__m128*>(&v2_in_range[i])); - __m128 x3_val = mask_gather_ps(src.channel(q), v3_offset[i], 
*reinterpret_cast<__m128*>(&v3_in_range[i])); + __m128 x0_val = mask_gather_ps(src.channel(q), v0_offset[i], _mm_castsi128_ps(v0_in_range[i])); + __m128 x1_val = mask_gather_ps(src.channel(q), v1_offset[i], _mm_castsi128_ps(v1_in_range[i])); + __m128 x2_val = mask_gather_ps(src.channel(q), v2_offset[i], _mm_castsi128_ps(v2_in_range[i])); + __m128 x3_val = mask_gather_ps(src.channel(q), v3_offset[i], _mm_castsi128_ps(v3_in_range[i])); coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); } @@ -221,10 +221,10 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack4(const Mat& src, Mat& d { for (int i = 0; i < 4; i++) { - __m128 x0_val = mask_gather_ps(src.channel(q), v0_offset[i], *reinterpret_cast<__m128*>(&v0_in_range[i])); - __m128 x1_val = mask_gather_ps(src.channel(q), v1_offset[i], *reinterpret_cast<__m128*>(&v1_in_range[i])); - __m128 x2_val = mask_gather_ps(src.channel(q), v2_offset[i], *reinterpret_cast<__m128*>(&v2_in_range[i])); - __m128 x3_val = mask_gather_ps(src.channel(q), v3_offset[i], *reinterpret_cast<__m128*>(&v3_in_range[i])); + __m128 x0_val = mask_gather_ps(src.channel(q), v0_offset[i], _mm_castsi128_ps(v0_in_range[i])); + __m128 x1_val = mask_gather_ps(src.channel(q), v1_offset[i], _mm_castsi128_ps(v1_in_range[i])); + __m128 x2_val = mask_gather_ps(src.channel(q), v2_offset[i], _mm_castsi128_ps(v2_in_range[i])); + __m128 x3_val = mask_gather_ps(src.channel(q), v3_offset[i], _mm_castsi128_ps(v3_in_range[i])); coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); } diff --git a/src/layer/x86/gridsample_bicubic_pack8.h b/src/layer/x86/gridsample_bicubic_pack8.h index 13bbe768104..5623dd3ee96 100644 --- a/src/layer/x86/gridsample_bicubic_pack8.h +++ b/src/layer/x86/gridsample_bicubic_pack8.h @@ -55,30 +55,23 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& d __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - __m256i x0 = _mm256_cvtps_epi32(gx0); - __m256i x1 = _mm256_cvtps_epi32(gx1); - __m256i x2 = _mm256_cvtps_epi32(gx2); - __m256i x3 = _mm256_cvtps_epi32(gx3); - - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x2)); - __m256i x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x3)); - - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], - v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(gx0, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx0, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(gx1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx1, _CMP_GT_OS)); + __m256 x2_in_range = _mm256_and_ps(_mm256_cmp_ps(gx2, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx2, _CMP_GT_OS)); + __m256 x3_in_range = _mm256_and_ps(_mm256_cmp_ps(gx3, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx3, _CMP_GT_OS)); + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + __m256 v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; for (int i = 0; i < 4; i++) { gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - __m256i y = 
_mm256_cvtps_epi32(gy); - - __m256i y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y)); + __m256 y_in_range = _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS)); - v0_in_range[i] = _mm256_and_si256(x0_in_range, y_in_range); - v1_in_range[i] = _mm256_and_si256(x1_in_range, y_in_range); - v2_in_range[i] = _mm256_and_si256(x2_in_range, y_in_range); - v3_in_range[i] = _mm256_and_si256(x3_in_range, y_in_range); + v0_in_range[i] = _mm256_and_ps(x0_in_range, y_in_range); + v1_in_range[i] = _mm256_and_ps(x1_in_range, y_in_range); + v2_in_range[i] = _mm256_and_ps(x2_in_range, y_in_range); + v3_in_range[i] = _mm256_and_ps(x3_in_range, y_in_range); __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); @@ -99,10 +92,10 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& d { for (int i = 0; i < 4; i++) { - __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], *reinterpret_cast<__m256*>(&v0_in_range[i])); - __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i])); - __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i])); - __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i])); + __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], v0_in_range[i]); + __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], v1_in_range[i]); + __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], v2_in_range[i]); + __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], v3_in_range[i]); coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); } @@ -155,30 +148,23 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& d __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - __m256i x0 = _mm256_cvtps_epi32(gx0); - __m256i x1 = _mm256_cvtps_epi32(gx1); - __m256i x2 = _mm256_cvtps_epi32(gx2); - __m256i x3 = _mm256_cvtps_epi32(gx3); + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(gx0, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx0, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(gx1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx1, _CMP_GT_OS)); + __m256 x2_in_range = _mm256_and_ps(_mm256_cmp_ps(gx2, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx2, _CMP_GT_OS)); + __m256 x3_in_range = _mm256_and_ps(_mm256_cmp_ps(gx3, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx3, _CMP_GT_OS)); - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i x2_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x2, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x2)); - __m256i x3_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x3, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x3)); - - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], - v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + __m256 v0_in_range[4], 
v1_in_range[4], v2_in_range[4], v3_in_range[4]; for (int i = 0; i < 4; i++) { gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - __m256i y = _mm256_cvtps_epi32(gy); + __m256 y_in_range = _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS)); - __m256i y_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y)); - - v0_in_range[i] = _mm256_and_si256(x0_in_range, y_in_range); - v1_in_range[i] = _mm256_and_si256(x1_in_range, y_in_range); - v2_in_range[i] = _mm256_and_si256(x2_in_range, y_in_range); - v3_in_range[i] = _mm256_and_si256(x3_in_range, y_in_range); + v0_in_range[i] = _mm256_and_ps(x0_in_range, y_in_range); + v1_in_range[i] = _mm256_and_ps(x1_in_range, y_in_range); + v2_in_range[i] = _mm256_and_ps(x2_in_range, y_in_range); + v3_in_range[i] = _mm256_and_ps(x3_in_range, y_in_range); __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); @@ -199,10 +185,10 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& d { for (int i = 0; i < 4; i++) { - __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], *reinterpret_cast<__m256*>(&v0_in_range[i])); - __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i])); - __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i])); - __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i])); + __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], v0_in_range[i]); + __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], v1_in_range[i]); + __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], v2_in_range[i]); + __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], v3_in_range[i]); coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); } @@ -258,11 +244,6 @@ static void gridsample_2d_bicubic_align0_border_blob_pack8(const Mat& src, Mat& gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); - __m256i x0 = _mm256_cvtps_epi32(gx0); - __m256i x1 = _mm256_cvtps_epi32(gx1); - __m256i x2 = _mm256_cvtps_epi32(gx2); - __m256i x3 = _mm256_cvtps_epi32(gx3); - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; for (int i = 0; i < 4; i++) { @@ -348,11 +329,6 @@ static void gridsample_2d_bicubic_align1_border_blob_pack8(const Mat& src, Mat& gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); - __m256i x0 = _mm256_cvtps_epi32(gx0); - __m256i x1 = _mm256_cvtps_epi32(gx1); - __m256i x2 = _mm256_cvtps_epi32(gx2); - __m256i x3 = _mm256_cvtps_epi32(gx3); - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; for (int i = 0; i < 4; i++) { @@ -492,11 +468,6 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); } - __m256i x0 = _mm256_cvtps_epi32(gx0); - __m256i x1 = _mm256_cvtps_epi32(gx1); - __m256i x2 = _mm256_cvtps_epi32(gx2); - __m256i x3 = _mm256_cvtps_epi32(gx3); - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; for (int i = 0; i < 4; i++) { @@ -623,11 +594,6 @@ static void 
gridsample_2d_bicubic_align1_reflection_blob_pack8(const Mat& src, M gx3 = _mm256_sub_ps(border_x, reflectx3_v); } - __m256i x0 = _mm256_cvtps_epi32(gx0); - __m256i x1 = _mm256_cvtps_epi32(gx1); - __m256i x2 = _mm256_cvtps_epi32(gx2); - __m256i x3 = _mm256_cvtps_epi32(gx3); - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; for (int i = 0; i < 4; i++) { diff --git a/src/layer/x86/gridsample_bilinear_pack16.h b/src/layer/x86/gridsample_bilinear_pack16.h index 1b0f6389f8a..66d7c87ea84 100644 --- a/src/layer/x86/gridsample_bilinear_pack16.h +++ b/src/layer/x86/gridsample_bilinear_pack16.h @@ -42,8 +42,8 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack16(const Mat& src, Mat& gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); } - __m512 x_w = _mm512_floor_ps(gx); - __m512 y_n = _mm512_floor_ps(gy); + __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); + __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); __m512 w = _mm512_sub_ps(gx, x_w); __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); @@ -125,8 +125,8 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack16(const Mat& src, Mat& gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); } - __m512 x_w = _mm512_floor_ps(gx); - __m512 y_n = _mm512_floor_ps(gy); + __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); + __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); __m512 w = _mm512_sub_ps(gx, x_w); __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); @@ -216,8 +216,8 @@ static void gridsample_2d_bilinear_align0_border_blob_pack16(const Mat& src, Mat gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); } - __m512 x_w = _mm512_floor_ps(gx); - __m512 y_n = _mm512_floor_ps(gy); + __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); + __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); __m512 w = _mm512_sub_ps(gx, x_w); __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); @@ -302,8 +302,8 @@ static void gridsample_2d_bilinear_align1_border_blob_pack16(const Mat& src, Mat gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); } - __m512 x_w = _mm512_floor_ps(gx); - __m512 y_n = _mm512_floor_ps(gy); + __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); + __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); __m512 w = _mm512_sub_ps(gx, x_w); __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); @@ -411,8 +411,8 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack16(const Mat& src, gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); } - __m512 x_w = _mm512_floor_ps(gx); - __m512 y_n = _mm512_floor_ps(gy); + __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); + __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); __m512 w = _mm512_sub_ps(gx, x_w); __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); @@ -503,8 +503,8 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack16(const Mat& src, gy = _mm512_sub_ps(border_y, reflecty_v); } - __m512 x_w = _mm512_floor_ps(gx); - __m512 y_n = _mm512_floor_ps(gy); + __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); + __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); __m512 w = _mm512_sub_ps(gx, x_w); __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); @@ -589,9 +589,9 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack16(const Mat& src, Mat& gz = 
_mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), vImgDf, *(__m512*)_ps512_1), two); } - __m512 x_w = _mm512_floor_ps(gx); - __m512 y_n = _mm512_floor_ps(gy); - __m512 z_t = _mm512_floor_ps(gz); + __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); + __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); + __m512 z_t = _mm512_roundscale_ps(gz, _MM_FROUND_TO_NEG_INF); __m512 w = _mm512_sub_ps(gx, x_w); __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); @@ -728,9 +728,9 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack16(const Mat& src, Mat& gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1)); } - __m512 x_w = _mm512_floor_ps(gx); - __m512 y_n = _mm512_floor_ps(gy); - __m512 z_t = _mm512_floor_ps(gz); + __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); + __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); + __m512 z_t = _mm512_roundscale_ps(gz, _MM_FROUND_TO_NEG_INF); __m512 w = _mm512_sub_ps(gx, x_w); __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); @@ -879,9 +879,9 @@ static void gridsample_3d_bilinear_align0_border_blob_pack16(const Mat& src, Mat gz = _mm512_min_ps(border_z, _mm512_max_ps(gz, _mm512_setzero_ps())); } - __m512 x_w = _mm512_floor_ps(gx); - __m512 y_n = _mm512_floor_ps(gy); - __m512 z_t = _mm512_floor_ps(gz); + __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); + __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); + __m512 z_t = _mm512_roundscale_ps(gz, _MM_FROUND_TO_NEG_INF); __m512 w = _mm512_sub_ps(gx, x_w); __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); @@ -1020,9 +1020,9 @@ static void gridsample_3d_bilinear_align1_border_blob_pack16(const Mat& src, Mat gz = _mm512_min_ps(border_z, _mm512_max_ps(gz, _mm512_setzero_ps())); } - __m512 x_w = _mm512_floor_ps(gx); - __m512 y_n = _mm512_floor_ps(gy); - __m512 z_t = _mm512_floor_ps(gz); + __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); + __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); + __m512 z_t = _mm512_roundscale_ps(gz, _MM_FROUND_TO_NEG_INF); __m512 w = _mm512_sub_ps(gx, x_w); __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); @@ -1192,9 +1192,9 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack16(const Mat& src, gz = _mm512_min_ps(border_z, _mm512_max_ps(gz, _mm512_setzero_ps())); } - __m512 x_w = _mm512_floor_ps(gx); - __m512 y_n = _mm512_floor_ps(gy); - __m512 z_t = _mm512_floor_ps(gz); + __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); + __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); + __m512 z_t = _mm512_roundscale_ps(gz, _MM_FROUND_TO_NEG_INF); __m512 w = _mm512_sub_ps(gx, x_w); __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); @@ -1339,9 +1339,9 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack16(const Mat& src, gz = _mm512_sub_ps(border_z, reflectz_v); } - __m512 x_w = _mm512_floor_ps(gx); - __m512 y_n = _mm512_floor_ps(gy); - __m512 z_t = _mm512_floor_ps(gz); + __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); + __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); + __m512 z_t = _mm512_roundscale_ps(gz, _MM_FROUND_TO_NEG_INF); __m512 w = _mm512_sub_ps(gx, x_w); __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); diff --git a/src/layer/x86/gridsample_bilinear_pack4.h b/src/layer/x86/gridsample_bilinear_pack4.h index 7a47b90623a..600ef60da0d 100644 --- a/src/layer/x86/gridsample_bilinear_pack4.h +++ b/src/layer/x86/gridsample_bilinear_pack4.h @@ -100,10 
+100,10 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& for (int q = 0; q < dst.c; q++) { - __m128 nw_val = mask_gather_ps(src.channel(q), i_nw_offset, *reinterpret_cast<__m128*>(&v00_in_range)); - __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&v10_in_range)); - __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&v01_in_range)); - __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); + __m128 nw_val = mask_gather_ps(src.channel(q), i_nw_offset, _mm_castsi128_ps(v00_in_range)); + __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, _mm_castsi128_ps(v10_in_range)); + __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, _mm_castsi128_ps(v01_in_range)); + __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, _mm_castsi128_ps(v11_in_range)); __m128 _v = _mm_mul_ps(nw_val, nw); _v = _mm_comp_fmadd_ps(ne_val, ne, _v); @@ -204,10 +204,10 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& for (int q = 0; q < dst.c; q++) { - __m128 nw_val = mask_gather_ps(src.channel(q), i_nw_offset, *reinterpret_cast<__m128*>(&v00_in_range)); - __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&v10_in_range)); - __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&v01_in_range)); - __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); + __m128 nw_val = mask_gather_ps(src.channel(q), i_nw_offset, _mm_castsi128_ps(v00_in_range)); + __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, _mm_castsi128_ps(v10_in_range)); + __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, _mm_castsi128_ps(v01_in_range)); + __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, _mm_castsi128_ps(v11_in_range)); __m128 _v = _mm_mul_ps(nw_val, nw); _v = _mm_comp_fmadd_ps(ne_val, ne, _v); @@ -312,9 +312,9 @@ static void gridsample_2d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& for (int q = 0; q < dst.c; q++) { __m128 nw_val = mask_gather_ps(src.channel(q), i_nw_offset, vn1fp4); - __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); - __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); - __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); + __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, _mm_castsi128_ps(x1_in_range)); + __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, _mm_castsi128_ps(y1_in_range)); + __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, _mm_castsi128_ps(v11_in_range)); __m128 _v = _mm_mul_ps(nw_val, nw); _v = _mm_comp_fmadd_ps(ne_val, ne, _v); @@ -419,9 +419,9 @@ static void gridsample_2d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& for (int q = 0; q < dst.c; q++) { __m128 nw_val = mask_gather_ps(src.channel(q), i_nw_offset, vn1fp4); - __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); - __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); - __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); + __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, _mm_castsi128_ps(x1_in_range)); + __m128 sw_val = 
mask_gather_ps(src.channel(q), i_sw_offset, _mm_castsi128_ps(y1_in_range)); + __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, _mm_castsi128_ps(v11_in_range)); __m128 _v = _mm_mul_ps(nw_val, nw); _v = _mm_comp_fmadd_ps(ne_val, ne, _v); @@ -549,9 +549,9 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack4(const Mat& src, for (int q = 0; q < dst.c; q++) { __m128 nw_val = mask_gather_ps(src.channel(q), i_nw_offset, vn1fp4); - __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); - __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); - __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); + __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, _mm_castsi128_ps(x1_in_range)); + __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, _mm_castsi128_ps(y1_in_range)); + __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, _mm_castsi128_ps(v11_in_range)); __m128 _v = _mm_mul_ps(nw_val, nw); _v = _mm_comp_fmadd_ps(ne_val, ne, _v); @@ -662,9 +662,9 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack4(const Mat& src, for (int q = 0; q < dst.c; q++) { __m128 nw_val = mask_gather_ps(src.channel(q), i_nw_offset, vn1fp4); - __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); - __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); - __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, *reinterpret_cast<__m128*>(&v11_in_range)); + __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, _mm_castsi128_ps(x1_in_range)); + __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, _mm_castsi128_ps(y1_in_range)); + __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, _mm_castsi128_ps(v11_in_range)); __m128 _v = _mm_mul_ps(nw_val, nw); _v = _mm_comp_fmadd_ps(ne_val, ne, _v); @@ -821,15 +821,15 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& for (int q = 0; q < dst.c; q++) { - __m128 tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, *reinterpret_cast<__m128*>(&v000_in_range)); - __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&v100_in_range)); - __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&v010_in_range)); - __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); + __m128 tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, _mm_castsi128_ps(v000_in_range)); + __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, _mm_castsi128_ps(v100_in_range)); + __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, _mm_castsi128_ps(v010_in_range)); + __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, _mm_castsi128_ps(v110_in_range)); - __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&v001_in_range)); - __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); - __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); - __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); + __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, _mm_castsi128_ps(v001_in_range)); + __m128 bne_val = mask_gather_ps(src.channel(q), 
i_bne_offset, _mm_castsi128_ps(v101_in_range)); + __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, _mm_castsi128_ps(v011_in_range)); + __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, _mm_castsi128_ps(v111_in_range)); __m128 _v = _mm_mul_ps(tnw_val, tnw); _v = _mm_comp_fmadd_ps(tne_val, tne, _v); @@ -991,15 +991,15 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& for (int q = 0; q < dst.c; q++) { - __m128 tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, *reinterpret_cast<__m128*>(&v000_in_range)); - __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&v100_in_range)); - __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&v010_in_range)); - __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); + __m128 tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, _mm_castsi128_ps(v000_in_range)); + __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, _mm_castsi128_ps(v100_in_range)); + __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, _mm_castsi128_ps(v010_in_range)); + __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, _mm_castsi128_ps(v110_in_range)); - __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&v001_in_range)); - __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); - __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); - __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); + __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, _mm_castsi128_ps(v001_in_range)); + __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, _mm_castsi128_ps(v101_in_range)); + __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, _mm_castsi128_ps(v011_in_range)); + __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, _mm_castsi128_ps(v111_in_range)); __m128 _v = _mm_mul_ps(tnw_val, tnw); _v = _mm_comp_fmadd_ps(tne_val, tne, _v); @@ -1158,14 +1158,14 @@ static void gridsample_3d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& for (int q = 0; q < dst.c; q++) { __m128 tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); - __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); - __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); - __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); + __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, _mm_castsi128_ps(x1_in_range)); + __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, _mm_castsi128_ps(y1_in_range)); + __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, _mm_castsi128_ps(v110_in_range)); - __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range)); - __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); - __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); - __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); + __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, _mm_castsi128_ps(z1_in_range)); + 
__m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, _mm_castsi128_ps(v101_in_range)); + __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, _mm_castsi128_ps(v011_in_range)); + __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, _mm_castsi128_ps(v111_in_range)); __m128 _v = _mm_mul_ps(tnw_val, tnw); _v = _mm_comp_fmadd_ps(tne_val, tne, _v); @@ -1324,14 +1324,14 @@ static void gridsample_3d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& for (int q = 0; q < dst.c; q++) { __m128 tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); - __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); - __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); - __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); + __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, _mm_castsi128_ps(x1_in_range)); + __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, _mm_castsi128_ps(y1_in_range)); + __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, _mm_castsi128_ps(v110_in_range)); - __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range)); - __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); - __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); - __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); + __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, _mm_castsi128_ps(z1_in_range)); + __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, _mm_castsi128_ps(v101_in_range)); + __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, _mm_castsi128_ps(v011_in_range)); + __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, _mm_castsi128_ps(v111_in_range)); __m128 _v = _mm_mul_ps(tnw_val, tnw); _v = _mm_comp_fmadd_ps(tne_val, tne, _v); @@ -1521,14 +1521,14 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, for (int q = 0; q < dst.c; q++) { __m128 tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); - __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); - __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); - __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); + __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, _mm_castsi128_ps(x1_in_range)); + __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, _mm_castsi128_ps(y1_in_range)); + __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, _mm_castsi128_ps(v110_in_range)); - __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range)); - __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); - __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); - __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); + __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, _mm_castsi128_ps(z1_in_range)); + __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, _mm_castsi128_ps(v101_in_range)); + 
__m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, _mm_castsi128_ps(v011_in_range)); + __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, _mm_castsi128_ps(v111_in_range)); __m128 _v = _mm_mul_ps(tnw_val, tnw); _v = _mm_comp_fmadd_ps(tne_val, tne, _v); @@ -1693,14 +1693,14 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack4(const Mat& src, for (int q = 0; q < dst.c; q++) { __m128 tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); - __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, *reinterpret_cast<__m128*>(&x1_in_range)); - __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, *reinterpret_cast<__m128*>(&y1_in_range)); - __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, *reinterpret_cast<__m128*>(&v110_in_range)); - - __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, *reinterpret_cast<__m128*>(&z1_in_range)); - __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, *reinterpret_cast<__m128*>(&v101_in_range)); - __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, *reinterpret_cast<__m128*>(&v011_in_range)); - __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, *reinterpret_cast<__m128*>(&v111_in_range)); + __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, _mm_castsi128_ps(x1_in_range)); + __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, _mm_castsi128_ps(y1_in_range)); + __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, _mm_castsi128_ps(v110_in_range)); + + __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, _mm_castsi128_ps(z1_in_range)); + __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, _mm_castsi128_ps(v101_in_range)); + __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, _mm_castsi128_ps(v011_in_range)); + __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, _mm_castsi128_ps(v111_in_range)); __m128 _v = _mm_mul_ps(tnw_val, tnw); _v = _mm_comp_fmadd_ps(tne_val, tne, _v); diff --git a/src/layer/x86/gridsample_bilinear_pack8.h b/src/layer/x86/gridsample_bilinear_pack8.h index 093e14351c8..fcae5afc6e7 100644 --- a/src/layer/x86/gridsample_bilinear_pack8.h +++ b/src/layer/x86/gridsample_bilinear_pack8.h @@ -20,6 +20,10 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vElempacki = _mm256_set1_epi32(src.elempack); +#if !__AVX2__ + const __m256 vElempackf = _mm256_set1_ps(src.elempack); +#endif // !!__AVX2__ + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -55,9 +59,10 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& __m256 sw = _mm256_mul_ps(n, e); __m256 se = _mm256_mul_ps(n, w); +#if __AVX2__ __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); @@ -70,19 +75,50 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - // (W*y + x) * elempack + vec(8) __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, 
vImgWi), x0), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); +#else + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + + __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); + __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 ne_offset = _mm256_add_ps(nw_offset, vElempackf); + __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); + + __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); + __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); + __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); + __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); +#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { - __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range)); - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range)); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range)); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); +#if __AVX2__ + __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, _mm256_castsi256_ps(v00_in_range)); + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, _mm256_castsi256_ps(v10_in_range)); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, _mm256_castsi256_ps(v01_in_range)); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); +#else + __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, v00_in_range); + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, v10_in_range); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, v01_in_range); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); +#endif // __AVX2__ __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -103,6 +139,9 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vElempacki = _mm256_set1_epi32(src.elempack); +#if !__AVX2__ + const __m256 vElempackf = _mm256_set1_ps(src.elempack); +#endif // !!__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -138,9 +177,10 @@ static void 
gridsample_2d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& __m256 sw = _mm256_mul_ps(n, e); __m256 se = _mm256_mul_ps(n, w); + #if __AVX2__ __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); @@ -153,19 +193,50 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - // (W*y + x) * elempack + vec(8) __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); +#else + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + + __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); + __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 ne_offset = _mm256_add_ps(nw_offset, vElempackf); + __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); + + __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); + __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); + __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); + __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); +#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { - __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *reinterpret_cast<__m256*>(&v00_in_range)); - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range)); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range)); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); +#if __AVX2__ + __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, _mm256_castsi256_ps(v00_in_range)); + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, _mm256_castsi256_ps(v10_in_range)); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, _mm256_castsi256_ps(v01_in_range)); + __m256 se_val = mask_gather_ps256(src.channel(q), 
i_se_offset, _mm256_castsi256_ps(v11_in_range)); +#else + __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, v00_in_range); + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, v10_in_range); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, v01_in_range); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); +#endif // __AVX2__ __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -185,7 +256,7 @@ static void gridsample_2d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& const __m256i vImgWi = _mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); + const __m256 vElempackf = _mm256_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -229,29 +300,32 @@ static void gridsample_2d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& __m256 sw = _mm256_mul_ps(n, e); __m256 se = _mm256_mul_ps(n, w); - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - // (W*y + x) * elempack + vec(8) - __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 ne_offset = _mm256_add_ps(nw_offset, vElempackf); + __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); + + __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); + __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); + __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); + __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); for (int q = 0; q < dst.c; q++) { + __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *(__m256*)_ps256_n1); - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range)); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, 
x1_in_range); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, y1_in_range); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -271,7 +345,7 @@ static void gridsample_2d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& const __m256i vImgWi = _mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); + const __m256 vElempackf = _mm256_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -315,29 +389,31 @@ static void gridsample_2d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& __m256 sw = _mm256_mul_ps(n, e); __m256 se = _mm256_mul_ps(n, w); - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - // (W*y + x) * elempack + vec(8) - __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 ne_offset = _mm256_add_ps(nw_offset, vElempackf); + __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); + + __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); + __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); + __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); + __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); for (int q = 0; q < dst.c; q++) { __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *(__m256*)_ps256_n1); - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range)); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, x1_in_range); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, y1_in_range); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); __m256 _v = _mm256_mul_ps(nw_val, nw); _v = 
_mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -357,7 +433,7 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack8(const Mat& src, const __m256i vImgWi = _mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); + const __m256 vElempackf = _mm256_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -424,29 +500,31 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack8(const Mat& src, __m256 sw = _mm256_mul_ps(n, e); __m256 se = _mm256_mul_ps(n, w); - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - // (W*y + x) * elempack + vec(8) - __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 ne_offset = _mm256_add_ps(nw_offset, vElempackf); + __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); + + __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); + __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); + __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); + __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); for (int q = 0; q < dst.c; q++) { __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *(__m256*)_ps256_n1); - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range)); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, x1_in_range); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, y1_in_range); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -466,7 +544,7 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack8(const Mat& src, const __m256i vImgWi = _mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); 
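For readers tracing the pack8 bilinear changes, each vector offset built above corresponds lane by lane to the scalar addressing below. This is only an illustrative sketch (the function name and raw-pointer signature are invented for the example and are not part of the patch); elempack is 8 for these pack8 kernels.

    // One lane k (0..7) of the 2D bilinear gather written out in scalar form.
    // The in-range test mirrors the -1 < coord < size checks built with _mm256_cmp_ps.
    static inline float sample_2d_lane(const float* channel, int W, int H, int elempack,
                                       int x, int y, int k)
    {
        const bool in_range = x > -1 && x < W && y > -1 && y < H;
        const int offset = (y * W + x) * elempack + k; // same formula as the nw/ne/sw/se offsets
        return in_range ? channel[offset] : 0.0f;      // masked-out taps contribute zero
    }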
- const __m256i vElempacki = _mm256_set1_epi32(src.elempack); + const __m256 vElempackf = _mm256_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -516,29 +594,31 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack8(const Mat& src, __m256 sw = _mm256_mul_ps(n, e); __m256 se = _mm256_mul_ps(n, w); - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - // (W*y + x) * elempack + vec(8) - __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); + __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 ne_offset = _mm256_add_ps(nw_offset, vElempackf); + __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); + + __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); + __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); + __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); + __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); for (int q = 0; q < dst.c; q++) { __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *(__m256*)_ps256_n1); - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&x1_in_range)); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, x1_in_range); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, y1_in_range); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -561,6 +641,9 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& const __m256i vImgDi = _mm256_set1_epi32(src.d); const __m256i vElempacki = _mm256_set1_epi32(src.elempack); +#if !__AVX2__ + const __m256 vElempackf = _mm256_set1_ps(src.elempack); +#endif // !!__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -618,11 +701,13 @@ static void 
gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& bse = _mm256_mul_ps(t, se); } +#if __AVX2__ __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); __m256i z0 = _mm256_cvtps_epi32(z_t); + + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); @@ -649,8 +734,7 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& v101_in_range = _mm256_and_si256(v10_in_range, z1_in_range); v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); } - - // (W*H*z + W*y + x) * elempack + vec(8) + __m256i i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); @@ -660,18 +744,82 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); +#else + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + __m256 z0_in_range = _mm256_and_ps(_mm256_cmp_ps(z_t, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z_t, _CMP_GT_OS)); + __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); + + __m256 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); + __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + v000_in_range = _mm256_and_ps(v00_in_range, z0_in_range); + v010_in_range = _mm256_and_ps(v01_in_range, z0_in_range); + v100_in_range = _mm256_and_ps(v10_in_range, z0_in_range); + v110_in_range = _mm256_and_ps(v11_in_range, z0_in_range); + + v001_in_range = _mm256_and_ps(v00_in_range, z1_in_range); + v011_in_range = _mm256_and_ps(v01_in_range, z1_in_range); + v101_in_range = _mm256_and_ps(v10_in_range, z1_in_range); + v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); + } + + __m256 tnw_offset = 
_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), + _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); + __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 tse_offset = _mm256_add_ps(tsw_offset, vElempackf); + + __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); + __m256 bne_offset = _mm256_add_ps(bnw_offset, vElempackf); + __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 bse_offset = _mm256_add_ps(bsw_offset, vElempackf); + + __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); + __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); + __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); + __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); + + __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); + __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); + __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); + __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); +#endif // __AVX2__ + for (int q = 0; q < dst.c; q++) { - __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, *reinterpret_cast<__m256*>(&v000_in_range)); - __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&v100_in_range)); - __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&v010_in_range)); - __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range)); - - __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&v001_in_range)); - __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range)); - __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range)); - __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range)); +#if __AVX2__ + __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, _mm256_castsi256_ps(v000_in_range)); + __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, _mm256_castsi256_ps(v100_in_range)); + __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, _mm256_castsi256_ps(v010_in_range)); + __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, _mm256_castsi256_ps(v110_in_range)); + + __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, _mm256_castsi256_ps(v001_in_range)); + __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, _mm256_castsi256_ps(v101_in_range)); + __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, _mm256_castsi256_ps(v011_in_range)); + __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, _mm256_castsi256_ps(v111_in_range)); +#else + __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, v000_in_range); + __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, v100_in_range); + __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, v010_in_range); + __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, v110_in_range); + + __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, v001_in_range); + __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, v101_in_range); + __m256 bsw_val = 
mask_gather_ps256(src.channel(q), i_bsw_offset, v011_in_range); + __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, v111_in_range); +#endif __m256 _v = _mm256_mul_ps(tnw_val, tnw); _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); @@ -700,6 +848,9 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& const __m256i vImgDi = _mm256_set1_epi32(src.d); const __m256i vElempacki = _mm256_set1_epi32(src.elempack); +#if !__AVX2__ + const __m256 vElempackf = _mm256_set1_ps(src.elempack); +#endif // !!__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -757,11 +908,13 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& bse = _mm256_mul_ps(t, se); } +#if __AVX2__ __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); __m256i z0 = _mm256_cvtps_epi32(z_t); + + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); @@ -789,7 +942,6 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); } - // (W*H*z + W*y + x) * elempack + vec(8) __m256i i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); @@ -799,18 +951,83 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); +#else + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + __m256 z0_in_range = _mm256_and_ps(_mm256_cmp_ps(z_t, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z_t, _CMP_GT_OS)); + __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); + + __m256 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); + __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 
v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + v000_in_range = _mm256_and_ps(v00_in_range, z0_in_range); + v010_in_range = _mm256_and_ps(v01_in_range, z0_in_range); + v100_in_range = _mm256_and_ps(v10_in_range, z0_in_range); + v110_in_range = _mm256_and_ps(v11_in_range, z0_in_range); + + v001_in_range = _mm256_and_ps(v00_in_range, z1_in_range); + v011_in_range = _mm256_and_ps(v01_in_range, z1_in_range); + v101_in_range = _mm256_and_ps(v10_in_range, z1_in_range); + v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); + } + + __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), + _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)), + vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); + __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 tse_offset = _mm256_add_ps(tsw_offset, vElempackf); + + __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); + __m256 bne_offset = _mm256_add_ps(bnw_offset, vElempackf); + __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 bse_offset = _mm256_add_ps(bsw_offset, vElempackf); + + __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); + __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); + __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); + __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); + + __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); + __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); + __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); + __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); +#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { - __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, *reinterpret_cast<__m256*>(&v000_in_range)); - __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&v100_in_range)); - __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&v010_in_range)); - __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range)); - - __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&v001_in_range)); - __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range)); - __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range)); - __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range)); +#if __AVX2__ + __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, _mm256_castsi256_ps(v000_in_range)); + __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, _mm256_castsi256_ps(v100_in_range)); + __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, _mm256_castsi256_ps(v010_in_range)); + __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, _mm256_castsi256_ps(v110_in_range)); + + __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, _mm256_castsi256_ps(v001_in_range)); + __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, _mm256_castsi256_ps(v101_in_range)); + __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, _mm256_castsi256_ps(v011_in_range)); + __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, 
_mm256_castsi256_ps(v111_in_range)); +#else + __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, v000_in_range); + __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, v100_in_range); + __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, v010_in_range); + __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, v110_in_range); + + __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, v001_in_range); + __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, v101_in_range); + __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, v011_in_range); + __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, v111_in_range); +#endif __m256 _v = _mm256_mul_ps(tnw_val, tnw); _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); @@ -838,7 +1055,7 @@ static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vImgDi = _mm256_set1_epi32(src.d); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); + const __m256 vElempackf = _mm256_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -908,50 +1125,56 @@ static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& bse = _mm256_mul_ps(t, se); } - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - __m256i z0 = _mm256_cvtps_epi32(z_t); - __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); - __m256i v110_in_range, v011_in_range, v101_in_range, v111_in_range; + __m256 v110_in_range, v011_in_range, v101_in_range, v111_in_range; { - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - v110_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + v110_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - v011_in_range = _mm256_and_si256(y1_in_range, z1_in_range); - v101_in_range = _mm256_and_si256(x1_in_range, z1_in_range); - v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); + v011_in_range = _mm256_and_ps(y1_in_range, z1_in_range); + v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); + v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); } - - // (W*H*z + W*y + x) * elempack + vec(8) - __m256i i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, 
vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); - __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - - __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); - __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), + _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)),vElempackf),_mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); + __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 tse_offset = _mm256_add_ps(tsw_offset, vElempackf); + + __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); + __m256 bne_offset = _mm256_add_ps(bnw_offset, vElempackf); + __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 bse_offset = _mm256_add_ps(bsw_offset, vElempackf); + + __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); + __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); + __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); + __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); + + __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); + __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); + __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); + __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); for (int q = 0; q < dst.c; q++) { __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, *(__m256*)_ps256_n1); - __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range)); - __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); - __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range)); + __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, x1_in_range); + __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, y1_in_range); + __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, v110_in_range); - __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range)); - __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range)); - __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range)); - __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range)); + __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, z1_in_range); + __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, v101_in_range); + __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, v011_in_range); + __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, v111_in_range); __m256 _v = _mm256_mul_ps(tnw_val, tnw); _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); @@ -979,7 +1202,7 @@ static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& 
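The 3D taps use the same addressing with an extra depth term; a scalar sketch of the offset (helper name invented for illustration, k is the lane index 0..7 for pack8):

    // Scalar form of the (W*H*z + W*y + x) * elempack + lane offset shared by the
    // eight 3D taps above.
    static inline int grid_offset_3d(int W, int H, int elempack, int x, int y, int z, int k)
    {
        return ((z * H + y) * W + x) * elempack + k;
    }

Because the border variants clamp the sample coordinates into the volume before flooring, only the +1 neighbours can fall outside it, which is why the tnw tap is gathered with the always-on _ps256_n1 mask while the remaining taps reuse the x1/y1/z1 range masks.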
const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vImgDi = _mm256_set1_epi32(src.d); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); + const __m256 vElempackf = _mm256_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -1049,50 +1272,58 @@ static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& bse = _mm256_mul_ps(t, se); } - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - __m256i z0 = _mm256_cvtps_epi32(z_t); - __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); - __m256i v110_in_range, v011_in_range, v101_in_range, v111_in_range; + __m256 v110_in_range, v011_in_range, v101_in_range, v111_in_range; { - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - v110_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + v110_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - v011_in_range = _mm256_and_si256(y1_in_range, z1_in_range); - v101_in_range = _mm256_and_si256(x1_in_range, z1_in_range); - v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); + v011_in_range = _mm256_and_ps(y1_in_range, z1_in_range); + v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); + v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); } - - // (W*H*z + W*y + x) * elempack + vec(8) - __m256i i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); - __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - - __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); - __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), + _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)), + vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 
0.0f)); + __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); + __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 tse_offset = _mm256_add_ps(tsw_offset, vElempackf); + + __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); + __m256 bne_offset = _mm256_add_ps(bnw_offset, vElempackf); + __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 bse_offset = _mm256_add_ps(bsw_offset, vElempackf); + + __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); + __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); + __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); + __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); + + __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); + __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); + __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); + __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); for (int q = 0; q < dst.c; q++) { __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, *(__m256*)_ps256_n1); - __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range)); - __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); - __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range)); + __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, x1_in_range); + __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, y1_in_range); + __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, v110_in_range); - __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range)); - __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range)); - __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range)); - __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range)); + __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, z1_in_range); + __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, v101_in_range); + __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, v011_in_range); + __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, v111_in_range); __m256 _v = _mm256_mul_ps(tnw_val, tnw); _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); @@ -1120,7 +1351,7 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vImgDi = _mm256_set1_epi32(src.d); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); + const __m256 vElempackf = _mm256_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -1221,50 +1452,58 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, bse = _mm256_mul_ps(t, se); } - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - __m256i z0 = _mm256_cvtps_epi32(z_t); - __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); 
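A note on the mask handling this hunk and the ones above rely on: _mm256_cmp_ps already produces an all-ones or all-zeros bit pattern per lane, which is exactly the form the masked gather consumes, so the plain-AVX path needs neither the AVX2-only _mm256_cmpgt_epi32/_mm256_and_si256 pair nor the *reinterpret_cast<__m256*>(&mask) round-trip that the AVX2 branches replace with _mm256_castsi256_ps. A minimal self-contained sketch of that building block (the helper name is illustrative; assumes <immintrin.h> and an AVX-enabled build):

    #include <immintrin.h>

    // All-ones lanes where -1.0f < v < limit, all-zero lanes otherwise; the result can
    // be handed straight to a masked gather or blend without any integer reinterpret.
    static inline __m256 in_range_mask_avx(__m256 v, __m256 limit)
    {
        const __m256 minus_one = _mm256_set1_ps(-1.0f);
        return _mm256_and_ps(_mm256_cmp_ps(v, minus_one, _CMP_GT_OS),
                             _mm256_cmp_ps(limit, v, _CMP_GT_OS));
    }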
- __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); - __m256i v110_in_range, v011_in_range, v101_in_range, v111_in_range; + __m256 v110_in_range, v011_in_range, v101_in_range, v111_in_range; { - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - v110_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + v110_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - v011_in_range = _mm256_and_si256(y1_in_range, z1_in_range); - v101_in_range = _mm256_and_si256(x1_in_range, z1_in_range); - v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); + v011_in_range = _mm256_and_ps(y1_in_range, z1_in_range); + v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); + v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); } - - // (W*H*z + W*y + x) * elempack + vec(8) - __m256i i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); - __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - - __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); - __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), + _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)), + vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); + __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 tse_offset = _mm256_add_ps(tsw_offset, vElempackf); + + __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); + __m256 bne_offset = _mm256_add_ps(bnw_offset, vElempackf); + __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 bse_offset = _mm256_add_ps(bsw_offset, vElempackf); + + __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); + __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); + __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); + __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); + + __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); + __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); + __m256i i_bsw_offset = 
_mm256_cvtps_epi32(bsw_offset); + __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); for (int q = 0; q < dst.c; q++) { __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, *(__m256*)_ps256_n1); - __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range)); - __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); - __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range)); + __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, x1_in_range); + __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, y1_in_range); + __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, v110_in_range); - __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range)); - __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range)); - __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range)); - __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range)); + __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, z1_in_range); + __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, v101_in_range); + __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, v011_in_range); + __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, v111_in_range); __m256 _v = _mm256_mul_ps(tnw_val, tnw); _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); @@ -1292,7 +1531,7 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vImgDi = _mm256_set1_epi32(src.d); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); + const __m256 vElempackf = _mm256_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -1368,50 +1607,58 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, bse = _mm256_mul_ps(t, se); } - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - __m256i z0 = _mm256_cvtps_epi32(z_t); - __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); - __m256i v110_in_range, v011_in_range, v101_in_range, v111_in_range; + __m256 v110_in_range, v011_in_range, v101_in_range, v111_in_range; { - 
__m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - v110_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + v110_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - v011_in_range = _mm256_and_si256(y1_in_range, z1_in_range); - v101_in_range = _mm256_and_si256(x1_in_range, z1_in_range); - v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); + v011_in_range = _mm256_and_ps(y1_in_range, z1_in_range); + v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); + v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); } - - // (W*H*z + W*y + x) * elempack + vec(8) - __m256i i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); - __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - - __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); - __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), + _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)), + vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); + __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 tse_offset = _mm256_add_ps(tsw_offset, vElempackf); + + __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); + __m256 bne_offset = _mm256_add_ps(bnw_offset, vElempackf); + __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 bse_offset = _mm256_add_ps(bsw_offset, vElempackf); + + __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); + __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); + __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); + __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); + + __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); + __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); + __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); + __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); for (int q = 0; q < dst.c; q++) { __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, *(__m256*)_ps256_n1); - __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, *reinterpret_cast<__m256*>(&x1_in_range)); - __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); - __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, *reinterpret_cast<__m256*>(&v110_in_range)); - - __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, *reinterpret_cast<__m256*>(&z1_in_range)); - __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, *reinterpret_cast<__m256*>(&v101_in_range)); - __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, *reinterpret_cast<__m256*>(&v011_in_range)); - __m256 bse_val = 
mask_gather_ps256(src.channel(q), i_bse_offset, *reinterpret_cast<__m256*>(&v111_in_range)); + __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, x1_in_range); + __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, y1_in_range); + __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, v110_in_range); + + __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, z1_in_range); + __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, v101_in_range); + __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, v011_in_range); + __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, v111_in_range); __m256 _v = _mm256_mul_ps(tnw_val, tnw); _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); diff --git a/src/layer/x86/gridsample_nearest_pack16.h b/src/layer/x86/gridsample_nearest_pack16.h index 0d0dc7a01ba..8c8ea888263 100644 --- a/src/layer/x86/gridsample_nearest_pack16.h +++ b/src/layer/x86/gridsample_nearest_pack16.h @@ -42,8 +42,8 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack16(const Mat& src, Mat& gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); } - gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); - gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); + gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); + gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); __m512i ix = _mm512_cvtps_epi32(gx); __m512i iy = _mm512_cvtps_epi32(gy); @@ -93,8 +93,8 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack16(const Mat& src, Mat& gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); } - gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); - gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); + gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); + gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); __m512i ix = _mm512_cvtps_epi32(gx); __m512i iy = _mm512_cvtps_epi32(gy); @@ -152,8 +152,8 @@ static void gridsample_2d_nearest_align0_border_blob_pack16(const Mat& src, Mat& gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); } - gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); - gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); + gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); + gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); __m512i ix = _mm512_cvtps_epi32(gx); __m512i iy = _mm512_cvtps_epi32(gy); @@ -209,8 +209,8 @@ static void gridsample_2d_nearest_align1_border_blob_pack16(const Mat& src, Mat& gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); } - gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); - gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); + gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); + gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); __m512i ix = _mm512_cvtps_epi32(gx); __m512i iy = _mm512_cvtps_epi32(gy); @@ -251,8 +251,8 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack16(const Mat& src, gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); gy = 
_mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); - gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); + gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); + gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); // compute coord { @@ -329,8 +329,8 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack16(const Mat& src, gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); - gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); + gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); + gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); // compute coord { @@ -405,9 +405,9 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack16(const Mat& src, Mat& gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), vImgDf, *(__m512*)_ps512_1), two); } - gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); - gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); - gz = _mm512_floor_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f))); + gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); + gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); + gz = _mm512_roundscale_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); __m512i ix = _mm512_cvtps_epi32(gx); __m512i iy = _mm512_cvtps_epi32(gy); @@ -467,9 +467,9 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack16(const Mat& src, Mat& gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1)); } - gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); - gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); - gz = _mm512_floor_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f))); + gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); + gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); + gz = _mm512_roundscale_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); __m512i ix = _mm512_cvtps_epi32(gx); __m512i iy = _mm512_cvtps_epi32(gy); @@ -541,9 +541,9 @@ static void gridsample_3d_nearest_align0_border_blob_pack16(const Mat& src, Mat& gz = _mm512_min_ps(border_z, _mm512_max_ps(gz, _mm512_setzero_ps())); } - gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); - gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); - gz = _mm512_floor_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f))); + gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); + gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); + gz = _mm512_roundscale_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); __m512i ix = _mm512_cvtps_epi32(gx); __m512i iy = _mm512_cvtps_epi32(gy); @@ -612,9 +612,9 @@ static void gridsample_3d_nearest_align1_border_blob_pack16(const Mat& src, Mat& gz = _mm512_min_ps(border_z, _mm512_max_ps(gz, _mm512_setzero_ps())); } - gx = 
_mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); - gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); - gz = _mm512_floor_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f))); + gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); + gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); + gz = _mm512_roundscale_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); __m512i ix = _mm512_cvtps_epi32(gx); __m512i iy = _mm512_cvtps_epi32(gy); @@ -662,9 +662,9 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack16(const Mat& src, gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), vImgDf, *(__m512*)_ps512_1), two); - gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); - gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); - gz = _mm512_floor_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f))); + gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); + gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); + gz = _mm512_roundscale_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); // compute coord { @@ -764,9 +764,9 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack16(const Mat& src, gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1)); - gx = _mm512_floor_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f))); - gy = _mm512_floor_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f))); - gz = _mm512_floor_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f))); + gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); + gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); + gz = _mm512_roundscale_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); // compute coord { diff --git a/src/layer/x86/gridsample_nearest_pack4.h b/src/layer/x86/gridsample_nearest_pack4.h index e8fa5d2c7e7..094d1f218c7 100644 --- a/src/layer/x86/gridsample_nearest_pack4.h +++ b/src/layer/x86/gridsample_nearest_pack4.h @@ -65,7 +65,7 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d for (int q = 0; q < dst.c; q++) { - __m128 _v = mask_gather_ps(src.channel(q), i_offset, *reinterpret_cast<__m128*>(&v_in_range)); + __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_castsi128_ps(v_in_range)); _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -126,7 +126,7 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d for (int q = 0; q < dst.c; q++) { - __m128 _v = mask_gather_ps(src.channel(q), i_offset, *reinterpret_cast<__m128*>(&v_in_range)); + __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_castsi128_ps(v_in_range)); _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -480,7 +480,7 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d for (int q = 0; q < dst.c; q++) { - __m128 _v = mask_gather_ps(src.channel(q), i_offset, *reinterpret_cast<__m128*>(&v_in_range)); + __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_castsi128_ps(v_in_range)); _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * 
dst.elempack, _v); } @@ -546,7 +546,7 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d for (int q = 0; q < dst.c; q++) { - __m128 _v = mask_gather_ps(src.channel(q), i_offset, *reinterpret_cast<__m128*>(&v_in_range)); + __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_castsi128_ps(v_in_range)); _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } diff --git a/src/layer/x86/gridsample_nearest_pack8.h b/src/layer/x86/gridsample_nearest_pack8.h index 2b49ab9a049..9974a0f0441 100644 --- a/src/layer/x86/gridsample_nearest_pack8.h +++ b/src/layer/x86/gridsample_nearest_pack8.h @@ -20,6 +20,9 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vElempacki = _mm256_set1_epi32(src.elempack); +#if !__AVX2__ + const __m256 vElempackf = _mm256_set1_ps(src.elempack); +#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -51,12 +54,19 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, ix)), _mm256_and_si256(_mm256_cmpgt_epi32(iy, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, iy))); +#if __AVX2__ __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#else + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + __m256i i_offset = _mm256_cvtps_epi32(offset); +#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range)); + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_castsi256_ps(v_in_range)); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -72,6 +82,9 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vElempacki = _mm256_set1_epi32(src.elempack); +#if !__AVX2__ + const __m256 vElempackf = _mm256_set1_ps(src.elempack); +#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -103,12 +116,19 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, ix)), _mm256_and_si256(_mm256_cmpgt_epi32(iy, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, iy))); +#if __AVX2__ __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#else + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + __m256i i_offset = _mm256_cvtps_epi32(offset); +#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range)); + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_castsi256_ps(v_in_range)); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -124,6 +144,9 @@ 
static void gridsample_2d_nearest_align0_border_blob_pack8(const Mat& src, Mat& const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vElempacki = _mm256_set1_epi32(src.elempack); +#if !__AVX2__ + const __m256 vElempackf = _mm256_set1_ps(src.elempack); +#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -160,8 +183,15 @@ static void gridsample_2d_nearest_align0_border_blob_pack8(const Mat& src, Mat& __m256i ix = _mm256_cvtps_epi32(gx); __m256i iy = _mm256_cvtps_epi32(gy); +#if __AVX2__ __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#else + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + __m256i i_offset = _mm256_cvtps_epi32(offset); +#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { @@ -181,6 +211,9 @@ static void gridsample_2d_nearest_align1_border_blob_pack8(const Mat& src, Mat& const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vElempacki = _mm256_set1_epi32(src.elempack); +#if !__AVX2__ + const __m256 vElempackf = _mm256_set1_ps(src.elempack); +#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -217,8 +250,15 @@ static void gridsample_2d_nearest_align1_border_blob_pack8(const Mat& src, Mat& __m256i ix = _mm256_cvtps_epi32(gx); __m256i iy = _mm256_cvtps_epi32(gy); +#if __AVX2__ __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#else + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + __m256i i_offset = _mm256_cvtps_epi32(offset); +#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { @@ -238,6 +278,9 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack8(const Mat& src, M const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vElempacki = _mm256_set1_epi32(src.elempack); +#if !__AVX2__ + const __m256 vElempackf = _mm256_set1_ps(src.elempack); +#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -295,8 +338,15 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack8(const Mat& src, M __m256i ix = _mm256_cvtps_epi32(gx); __m256i iy = _mm256_cvtps_epi32(gy); +#if __AVX2__ __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#else + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + __m256i i_offset = _mm256_cvtps_epi32(offset); +#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { @@ -316,6 +366,9 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack8(const Mat& src, M const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vElempacki = _mm256_set1_epi32(src.elempack); +#if !__AVX2__ + const __m256 vElempackf = _mm256_set1_ps(src.elempack); +#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -356,8 +409,15 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack8(const Mat& src, M 
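// Note: a minimal standalone sketch, not part of the patch, of the rewrite these pack8 hunks
// keep repeating, assuming only <immintrin.h>. _mm256_mullo_epi32/_mm256_add_epi32 on 256-bit
// integers need AVX2, so the guarded fallback recomputes the per-lane offset
// (iy * w + ix) * elempack + lane in float and converts once with _mm256_cvtps_epi32 (plain AVX);
// float holds integers exactly up to 2^24, which is ample for these blob offsets. The in-range
// mask is likewise handed to the gather helper via _mm256_castsi256_ps, a register-level
// bit reinterpretation, instead of *reinterpret_cast<__m256*>(&mask) through memory.
// The helper name, signature and parameters below are illustrative, not taken from the patch.
#include <immintrin.h>

static inline __m256i nearest_offset_pack8(__m256 gx, __m256 gy, int w, int elempack)
{
    // gx/gy are assumed to be the already rounded and clamped sample coordinates
#if __AVX2__
    __m256i ix = _mm256_cvtps_epi32(gx);
    __m256i iy = _mm256_cvtps_epi32(gy);
    __m256i offset = _mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, _mm256_set1_epi32(w)), ix),
                                        _mm256_set1_epi32(elempack));
    return _mm256_add_epi32(offset, _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
#else
    // AVX-only fallback: same arithmetic carried out in float, then a single convert
    __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, _mm256_set1_ps((float)w)), gx),
                                                _mm256_set1_ps((float)elempack)),
                                  _mm256_set_ps(7.f, 6.f, 5.f, 4.f, 3.f, 2.f, 1.f, 0.f));
    return _mm256_cvtps_epi32(offset);
#endif
}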
__m256i ix = _mm256_cvtps_epi32(gx); __m256i iy = _mm256_cvtps_epi32(gy); +#if __AVX2__ __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#else + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + __m256i i_offset = _mm256_cvtps_epi32(offset); +#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { @@ -379,6 +439,9 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d const __m256i vImgDi = _mm256_set1_epi32(src.d); const __m256i vElempacki = _mm256_set1_epi32(src.elempack); +#if !__AVX2__ + const __m256 vElempackf = _mm256_set1_ps(src.elempack); +#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -419,11 +482,19 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d _mm256_and_si256(_mm256_cmpgt_epi32(iy, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, iy))); v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, iz))); - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#if __AVX2__ + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), + _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#else + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + __m256i i_offset = _mm256_cvtps_epi32(offset); +#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range)); + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_castsi256_ps(v_in_range)); _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } @@ -442,6 +513,9 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d const __m256i vImgDi = _mm256_set1_epi32(src.d); const __m256i vElempacki = _mm256_set1_epi32(src.elempack); +#if !__AVX2__ + const __m256 vElempackf = _mm256_set1_ps(src.elempack); +#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -482,11 +556,19 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d _mm256_and_si256(_mm256_cmpgt_epi32(iy, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, iy))); v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, iz))); - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#if __AVX2__ + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), + 
_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#else + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + __m256i i_offset = _mm256_cvtps_epi32(offset); +#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *reinterpret_cast<__m256*>(&v_in_range)); + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_castsi256_ps(v_in_range)); _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } @@ -505,6 +587,9 @@ static void gridsample_3d_nearest_align0_border_blob_pack8(const Mat& src, Mat& const __m256i vImgDi = _mm256_set1_epi32(src.d); const __m256i vElempacki = _mm256_set1_epi32(src.elempack); +#if !__AVX2__ + const __m256 vElempackf = _mm256_set1_ps(src.elempack); +#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -553,7 +638,15 @@ static void gridsample_3d_nearest_align0_border_blob_pack8(const Mat& src, Mat& __m256i iy = _mm256_cvtps_epi32(gy); __m256i iz = _mm256_cvtps_epi32(gz); - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#if __AVX2__ + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), + _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#else + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + __m256i i_offset = _mm256_cvtps_epi32(offset); +#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { @@ -576,6 +669,9 @@ static void gridsample_3d_nearest_align1_border_blob_pack8(const Mat& src, Mat& const __m256i vImgDi = _mm256_set1_epi32(src.d); const __m256i vElempacki = _mm256_set1_epi32(src.elempack); +#if !__AVX2__ + const __m256 vElempackf = _mm256_set1_ps(src.elempack); +#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -624,7 +720,15 @@ static void gridsample_3d_nearest_align1_border_blob_pack8(const Mat& src, Mat& __m256i iy = _mm256_cvtps_epi32(gy); __m256i iz = _mm256_cvtps_epi32(gz); - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#if __AVX2__ + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), + _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#else + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + __m256i i_offset = _mm256_cvtps_epi32(offset); +#endif // __AVX2__ for (int q = 0; q < dst.c; 
q++) { @@ -647,6 +751,9 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M const __m256i vImgDi = _mm256_set1_epi32(src.d); const __m256i vElempacki = _mm256_set1_epi32(src.elempack); +#if !__AVX2__ + const __m256 vElempackf = _mm256_set1_ps(src.elempack); +#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -726,7 +833,15 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M __m256i iy = _mm256_cvtps_epi32(gy); __m256i iz = _mm256_cvtps_epi32(gz); - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#if __AVX2__ + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), + _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#else + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + __m256i i_offset = _mm256_cvtps_epi32(offset); +#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { @@ -749,6 +864,9 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, M const __m256i vImgDi = _mm256_set1_epi32(src.d); const __m256i vElempacki = _mm256_set1_epi32(src.elempack); +#if !__AVX2__ + const __m256 vElempackf = _mm256_set1_ps(src.elempack); +#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -803,7 +921,15 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, M __m256i iy = _mm256_cvtps_epi32(gy); __m256i iz = _mm256_cvtps_epi32(gz); - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#if __AVX2__ + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), + _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#else + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + + __m256i i_offset = _mm256_cvtps_epi32(offset); +#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 3a4a852d841..c0464d9639b 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -966,9 +966,10 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&v00_in_range)); - __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range)); - __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range)); - __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); +#if __AVX2__ + 
__m256 nw_val = mask_gather_ps256(bottom_blob.channel(q), i_nw_offset, _mm256_castsi256_ps(v00_in_range)); + __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, _mm256_castsi256_ps(v10_in_range)); + __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, _mm256_castsi256_ps(v01_in_range)); + __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); +#else + __m256 nw_val = mask_gather_ps256(bottom_blob.channel(q), i_nw_offset, v00_in_range); + __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, v10_in_range); + __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, v01_in_range); + __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, v11_in_range); +#endif // __AVX2__ __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -1088,9 +1119,10 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&v00_in_range)); - __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, *reinterpret_cast<__m256*>(&v10_in_range)); - __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&v01_in_range)); - __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); +#if __AVX2__ + __m256 nw_val = mask_gather_ps256(bottom_blob.channel(q), i_nw_offset, _mm256_castsi256_ps(v00_in_range)); + __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, _mm256_castsi256_ps(v10_in_range)); + __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, _mm256_castsi256_ps(v01_in_range)); + __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); +#else + __m256 nw_val = mask_gather_ps256(bottom_blob.channel(q), i_nw_offset, v00_in_range); + __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, v10_in_range); + __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, v01_in_range); + __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, v11_in_range); +#endif // __AVX2__ __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -1221,8 +1283,9 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&x1_in_range)); - __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); - __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); +#if __AVX2__ + __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, _mm256_castsi256_ps(x1_in_range)); + __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, _mm256_castsi256_ps(y1_in_range)); + __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); +#else + __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, x1_in_range); + __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, y1_in_range); + __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, v11_in_range); +#endif __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -1347,9 +1435,10 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&x1_in_range)); - __m256 sw_val = 
mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); - __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); +#if __AVX2__ + __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, _mm256_castsi256_ps(x1_in_range)); + __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, _mm256_castsi256_ps(y1_in_range)); + __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); +#else + __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, x1_in_range); + __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, y1_in_range); + __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, v11_in_range); +#endif __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -1499,9 +1612,10 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&x1_in_range)); - __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); - __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); +#if __AVX2__ + __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, _mm256_castsi256_ps(x1_in_range)); + __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, _mm256_castsi256_ps(y1_in_range)); + __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); +#else + __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, x1_in_range); + __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, y1_in_range); + __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, v11_in_range); +#endif __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -1638,9 +1776,10 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&x1_in_range)); - __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, *reinterpret_cast<__m256*>(&y1_in_range)); - __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, *reinterpret_cast<__m256*>(&v11_in_range)); +#if __AVX2__ + __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, _mm256_castsi256_ps(x1_in_range)); + __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, _mm256_castsi256_ps(y1_in_range)); + __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); +#else + __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, x1_in_range); + __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, y1_in_range); + __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, v11_in_range); +#endif __m256 _v = _mm256_mul_ps(nw_val, nw); _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); @@ -1763,17 +1926,15 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(&v_in_range)); + __m256 _v = mask_gather_ps256(bottom_blob.channel(q), i_offset, v_in_range); _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); } @@ -1841,14 +2002,15 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(&v_in_range)); + __m256 _v = mask_gather_ps256(bottom_blob.channel(q), i_offset, v_in_range); _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); } 
@@ -1926,8 +2088,8 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&v0_in_range[i])); - __m256 x1_val = mask_gather_ps256(bottom_blob.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i])); - __m256 x2_val = mask_gather_ps256(bottom_blob.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i])); - __m256 x3_val = mask_gather_ps256(bottom_blob.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i])); + __m256 x0_val = mask_gather_ps256(bottom_blob.channel(q), v0_offset[i], v0_in_range[i]); + __m256 x1_val = mask_gather_ps256(bottom_blob.channel(q), v1_offset[i], v1_in_range[i]); + __m256 x2_val = mask_gather_ps256(bottom_blob.channel(q), v2_offset[i], v2_in_range[i]); + __m256 x3_val = mask_gather_ps256(bottom_blob.channel(q), v3_offset[i], v3_in_range[i]); coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); } @@ -2378,22 +2533,22 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector -1) & (x3 < w); bool y3_in_range = (y3 > -1) & (y3 < h); - bool v00_in_range = x0_in_range * y0_in_range; - bool v01_in_range = x1_in_range * y0_in_range; - bool v02_in_range = x2_in_range * y0_in_range; - bool v03_in_range = x3_in_range * y0_in_range; - bool v10_in_range = x0_in_range * y1_in_range; - bool v11_in_range = x1_in_range * y1_in_range; - bool v12_in_range = x2_in_range * y1_in_range; - bool v13_in_range = x3_in_range * y1_in_range; - bool v20_in_range = x0_in_range * y2_in_range; - bool v21_in_range = x1_in_range * y2_in_range; - bool v22_in_range = x2_in_range * y2_in_range; - bool v23_in_range = x3_in_range * y2_in_range; - bool v30_in_range = x0_in_range * y3_in_range; - bool v31_in_range = x1_in_range * y3_in_range; - bool v32_in_range = x2_in_range * y3_in_range; - bool v33_in_range = x3_in_range * y3_in_range; + bool v00_in_range = x0_in_range & y0_in_range; + bool v01_in_range = x1_in_range & y0_in_range; + bool v02_in_range = x2_in_range & y0_in_range; + bool v03_in_range = x3_in_range & y0_in_range; + bool v10_in_range = x0_in_range & y1_in_range; + bool v11_in_range = x1_in_range & y1_in_range; + bool v12_in_range = x2_in_range & y1_in_range; + bool v13_in_range = x3_in_range & y1_in_range; + bool v20_in_range = x0_in_range & y2_in_range; + bool v21_in_range = x1_in_range & y2_in_range; + bool v22_in_range = x2_in_range & y2_in_range; + bool v23_in_range = x3_in_range & y2_in_range; + bool v30_in_range = x0_in_range & y3_in_range; + bool v31_in_range = x1_in_range & y3_in_range; + bool v32_in_range = x2_in_range & y3_in_range; + bool v33_in_range = x3_in_range & y3_in_range; for (int q = 0; q < channels; q++) { @@ -2475,30 +2630,23 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(&v0_in_range[i])); - __m256 x1_val = mask_gather_ps256(bottom_blob.channel(q), v1_offset[i], *reinterpret_cast<__m256*>(&v1_in_range[i])); - __m256 x2_val = mask_gather_ps256(bottom_blob.channel(q), v2_offset[i], *reinterpret_cast<__m256*>(&v2_in_range[i])); - __m256 x3_val = mask_gather_ps256(bottom_blob.channel(q), v3_offset[i], *reinterpret_cast<__m256*>(&v3_in_range[i])); + __m256 x0_val = mask_gather_ps256(bottom_blob.channel(q), v0_offset[i], v0_in_range[i]); + __m256 x1_val = mask_gather_ps256(bottom_blob.channel(q), v1_offset[i], v1_in_range[i]); + __m256 x2_val = mask_gather_ps256(bottom_blob.channel(q), 
v2_offset[i], v2_in_range[i]); + __m256 x3_val = mask_gather_ps256(bottom_blob.channel(q), v3_offset[i], v3_in_range[i]); coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); } @@ -2557,22 +2705,22 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector -1) & (x3 < w); bool y3_in_range = (y3 > -1) & (y3 < h); - bool v00_in_range = x0_in_range * y0_in_range; - bool v01_in_range = x1_in_range * y0_in_range; - bool v02_in_range = x2_in_range * y0_in_range; - bool v03_in_range = x3_in_range * y0_in_range; - bool v10_in_range = x0_in_range * y1_in_range; - bool v11_in_range = x1_in_range * y1_in_range; - bool v12_in_range = x2_in_range * y1_in_range; - bool v13_in_range = x3_in_range * y1_in_range; - bool v20_in_range = x0_in_range * y2_in_range; - bool v21_in_range = x1_in_range * y2_in_range; - bool v22_in_range = x2_in_range * y2_in_range; - bool v23_in_range = x3_in_range * y2_in_range; - bool v30_in_range = x0_in_range * y3_in_range; - bool v31_in_range = x1_in_range * y3_in_range; - bool v32_in_range = x2_in_range * y3_in_range; - bool v33_in_range = x3_in_range * y3_in_range; + bool v00_in_range = x0_in_range & y0_in_range; + bool v01_in_range = x1_in_range & y0_in_range; + bool v02_in_range = x2_in_range & y0_in_range; + bool v03_in_range = x3_in_range & y0_in_range; + bool v10_in_range = x0_in_range & y1_in_range; + bool v11_in_range = x1_in_range & y1_in_range; + bool v12_in_range = x2_in_range & y1_in_range; + bool v13_in_range = x3_in_range & y1_in_range; + bool v20_in_range = x0_in_range & y2_in_range; + bool v21_in_range = x1_in_range & y2_in_range; + bool v22_in_range = x2_in_range & y2_in_range; + bool v23_in_range = x3_in_range & y2_in_range; + bool v30_in_range = x0_in_range & y3_in_range; + bool v31_in_range = x1_in_range & y3_in_range; + bool v32_in_range = x2_in_range & y3_in_range; + bool v33_in_range = x3_in_range & y3_in_range; for (int q = 0; q < channels; q++) { @@ -2657,11 +2805,6 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Tue, 6 Dec 2022 07:48:43 +0000 Subject: [PATCH 040/127] apply code-format changes --- src/layer/x86/gridsample_bilinear_pack8.h | 45 +++++++------- src/layer/x86/gridsample_nearest_pack8.h | 74 +++++++++++++++-------- src/layer/x86/gridsample_x86.cpp | 9 ++- 3 files changed, 76 insertions(+), 52 deletions(-) diff --git a/src/layer/x86/gridsample_bilinear_pack8.h b/src/layer/x86/gridsample_bilinear_pack8.h index fcae5afc6e7..abd6b441a5f 100644 --- a/src/layer/x86/gridsample_bilinear_pack8.h +++ b/src/layer/x86/gridsample_bilinear_pack8.h @@ -24,7 +24,6 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& const __m256 vElempackf = _mm256_set1_ps(src.elempack); #endif // !!__AVX2__ - #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) { @@ -95,7 +94,7 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 ne_offset = _mm256_add_ps(nw_offset, vElempackf); __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(vImgWf, vElempackf)); __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); @@ -113,7 
+112,7 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, _mm256_castsi256_ps(v10_in_range)); __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, _mm256_castsi256_ps(v01_in_range)); __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); -#else +#else __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, v00_in_range); __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, v10_in_range); __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, v01_in_range); @@ -177,7 +176,7 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& __m256 sw = _mm256_mul_ps(n, e); __m256 se = _mm256_mul_ps(n, w); - #if __AVX2__ +#if __AVX2__ __m256i x0 = _mm256_cvtps_epi32(x_w); __m256i y0 = _mm256_cvtps_epi32(y_n); __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); @@ -321,7 +320,6 @@ static void gridsample_2d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& for (int q = 0; q < dst.c; q++) { - __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *(__m256*)_ps256_n1); __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, x1_in_range); __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, y1_in_range); @@ -734,7 +732,7 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& v101_in_range = _mm256_and_si256(v10_in_range, z1_in_range); v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); } - + __m256i i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); @@ -774,12 +772,14 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); } - __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), + _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)), + vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); __m256 tse_offset = _mm256_add_ps(tsw_offset, vElempackf); - + __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); __m256 bne_offset = _mm256_add_ps(bnw_offset, vElempackf); __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); @@ -796,7 +796,6 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); #endif // __AVX2__ - for (int q = 0; q < dst.c; q++) { #if __AVX2__ @@ -982,8 +981,8 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& } __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, 
vImgWf), x_w)), - vElempackf), + _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)), + vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); @@ -1144,7 +1143,9 @@ static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); } __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)),vElempackf),_mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)), + vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); __m256 tse_offset = _mm256_add_ps(tsw_offset, vElempackf); @@ -1154,7 +1155,7 @@ static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); __m256 bse_offset = _mm256_add_ps(bsw_offset, vElempackf); - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); + __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); @@ -1272,7 +1273,7 @@ static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& bse = _mm256_mul_ps(t, se); } - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); @@ -1291,8 +1292,8 @@ static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); } __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)), - vElempackf), + _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)), + vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); @@ -1452,7 +1453,7 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, bse = _mm256_mul_ps(t, se); } - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); @@ -1471,8 +1472,8 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); } __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)), - vElempackf), + _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)), + vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); @@ -1626,8 +1627,8 @@ static void 
gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); } __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)), - vElempackf), + _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)), + vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); diff --git a/src/layer/x86/gridsample_nearest_pack8.h b/src/layer/x86/gridsample_nearest_pack8.h index 9974a0f0441..012591cc951 100644 --- a/src/layer/x86/gridsample_nearest_pack8.h +++ b/src/layer/x86/gridsample_nearest_pack8.h @@ -59,7 +59,7 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); #else __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); #endif // __AVX2__ @@ -483,11 +483,15 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, iz))); #if __AVX2__ - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), - _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), + _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), + vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); #else - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), + vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); #endif // __AVX2__ @@ -557,11 +561,15 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, iz))); #if __AVX2__ - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), - _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), + _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), + vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); #else - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 
0.0f)); + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), + vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); #endif // __AVX2__ @@ -639,11 +647,15 @@ static void gridsample_3d_nearest_align0_border_blob_pack8(const Mat& src, Mat& __m256i iz = _mm256_cvtps_epi32(gz); #if __AVX2__ - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), - _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), + _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), + vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); #else - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), + vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); #endif // __AVX2__ @@ -721,11 +733,15 @@ static void gridsample_3d_nearest_align1_border_blob_pack8(const Mat& src, Mat& __m256i iz = _mm256_cvtps_epi32(gz); #if __AVX2__ - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), - _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), + _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), + vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); #else - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), + vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); #endif // __AVX2__ @@ -834,11 +850,15 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M __m256i iz = _mm256_cvtps_epi32(gz); #if __AVX2__ - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), - _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), + _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), + vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); #else - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), 
_mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), + vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); #endif // __AVX2__ @@ -922,11 +942,15 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, M __m256i iz = _mm256_cvtps_epi32(gz); #if __AVX2__ - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), - _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), + _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), + vElempacki), + _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); #else - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), + vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); #endif // __AVX2__ diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index c0464d9639b..622e055f682 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -1319,7 +1319,6 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Tue, 6 Dec 2022 16:20:52 +0800 Subject: [PATCH 041/127] fix compile bug and optimize [WIP] --- src/layer/x86/gridsample_nearest_pack8.h | 215 +++-------------------- 1 file changed, 28 insertions(+), 187 deletions(-) diff --git a/src/layer/x86/gridsample_nearest_pack8.h b/src/layer/x86/gridsample_nearest_pack8.h index 012591cc951..1448653c987 100644 --- a/src/layer/x86/gridsample_nearest_pack8.h +++ b/src/layer/x86/gridsample_nearest_pack8.h @@ -19,10 +19,7 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d const __m256i vImgWi = _mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#if !__AVX2__ const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -48,25 +45,17 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - __m256i ix = _mm256_cvtps_epi32(gx); - __m256i iy = _mm256_cvtps_epi32(gy); - - __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, iy))); + __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), + 
_mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); -#if __AVX2__ - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); -#else __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); -#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_castsi256_ps(v_in_range)); + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, v_in_range); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -81,10 +70,7 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d const __m256i vImgWi = _mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#if !__AVX2__ const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -110,25 +96,17 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - __m256i ix = _mm256_cvtps_epi32(gx); - __m256i iy = _mm256_cvtps_epi32(gy); + __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), + _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); - __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, iy))); - -#if __AVX2__ - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); -#else __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); -#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_castsi256_ps(v_in_range)); + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, v_in_range); _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); } @@ -143,10 +121,7 @@ static void gridsample_2d_nearest_align0_border_blob_pack8(const Mat& src, Mat& const __m256i vImgWi = _mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#if !__AVX2__ const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -180,18 +155,10 @@ static void gridsample_2d_nearest_align0_border_blob_pack8(const Mat& src, Mat& gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - __m256i ix = _mm256_cvtps_epi32(gx); - __m256i iy = _mm256_cvtps_epi32(gy); - -#if __AVX2__ - __m256i i_offset = 
_mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); -#else __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); -#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { @@ -210,10 +177,7 @@ static void gridsample_2d_nearest_align1_border_blob_pack8(const Mat& src, Mat& const __m256i vImgWi = _mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#if !__AVX2__ const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -247,18 +211,10 @@ static void gridsample_2d_nearest_align1_border_blob_pack8(const Mat& src, Mat& gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - __m256i ix = _mm256_cvtps_epi32(gx); - __m256i iy = _mm256_cvtps_epi32(gy); - -#if __AVX2__ - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); -#else __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); -#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { @@ -277,10 +233,7 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack8(const Mat& src, M const __m256i vImgWi = _mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#if !__AVX2__ const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -335,18 +288,10 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack8(const Mat& src, M gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); } - __m256i ix = _mm256_cvtps_epi32(gx); - __m256i iy = _mm256_cvtps_epi32(gy); - -#if __AVX2__ - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); -#else __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); -#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { @@ -365,10 +310,7 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack8(const Mat& src, M const __m256i vImgWi = _mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#if !__AVX2__ const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -409,15 +351,10 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack8(const Mat& src, M __m256i ix = _mm256_cvtps_epi32(gx); __m256i iy = _mm256_cvtps_epi32(gy); -#if __AVX2__ - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix), vElempacki), - 
_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); -#else __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); -#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { @@ -438,10 +375,7 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vImgDi = _mm256_set1_epi32(src.d); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#if !__AVX2__ const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -474,31 +408,18 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - __m256i ix = _mm256_cvtps_epi32(gx); - __m256i iy = _mm256_cvtps_epi32(gy); - __m256i iz = _mm256_cvtps_epi32(gz); - - __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, iy))); - v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, iz))); - -#if __AVX2__ - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), - _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), - vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); -#else - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), - vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), + _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); + v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, gz, _CMP_GT_OS))); + + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); -#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_castsi256_ps(v_in_range)); + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, v_in_range); _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } @@ -516,10 +437,7 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vImgDi = _mm256_set1_epi32(src.d); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#if !__AVX2__ const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -552,31 +470,18 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d gy = 
_mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - __m256i ix = _mm256_cvtps_epi32(gx); - __m256i iy = _mm256_cvtps_epi32(gy); - __m256i iz = _mm256_cvtps_epi32(gz); - - __m256i v_in_range = _mm256_and_si256(_mm256_and_si256(_mm256_cmpgt_epi32(ix, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, ix)), - _mm256_and_si256(_mm256_cmpgt_epi32(iy, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, iy))); - v_in_range = _mm256_and_si256(v_in_range, _mm256_and_si256(_mm256_cmpgt_epi32(iz, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, iz))); - -#if __AVX2__ - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), - _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), - vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); -#else - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), - vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), + _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); + v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, gz, _CMP_GT_OS))); + + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); -#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_castsi256_ps(v_in_range)); + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, v_in_range); _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); } @@ -594,10 +499,7 @@ static void gridsample_3d_nearest_align0_border_blob_pack8(const Mat& src, Mat& const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vImgDi = _mm256_set1_epi32(src.d); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#if !__AVX2__ const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -642,23 +544,10 @@ static void gridsample_3d_nearest_align0_border_blob_pack8(const Mat& src, Mat& gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - __m256i ix = _mm256_cvtps_epi32(gx); - __m256i iy = _mm256_cvtps_epi32(gy); - __m256i iz = _mm256_cvtps_epi32(gz); - -#if __AVX2__ - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), - _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), - vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); -#else - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), - vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + 
_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); -#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { @@ -680,10 +569,7 @@ static void gridsample_3d_nearest_align1_border_blob_pack8(const Mat& src, Mat& const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vImgDi = _mm256_set1_epi32(src.d); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#if !__AVX2__ const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -728,23 +614,10 @@ static void gridsample_3d_nearest_align1_border_blob_pack8(const Mat& src, Mat& gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - __m256i ix = _mm256_cvtps_epi32(gx); - __m256i iy = _mm256_cvtps_epi32(gy); - __m256i iz = _mm256_cvtps_epi32(gz); - -#if __AVX2__ - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), - _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), - vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); -#else - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), - vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); -#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { @@ -766,10 +639,7 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vImgDi = _mm256_set1_epi32(src.d); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#if !__AVX2__ const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -845,23 +715,10 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); } - __m256i ix = _mm256_cvtps_epi32(gx); - __m256i iy = _mm256_cvtps_epi32(gy); - __m256i iz = _mm256_cvtps_epi32(gz); - -#if __AVX2__ - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), - _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), - vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); -#else - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), - vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); -#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { @@ -883,10 +740,7 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, M const 
__m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vImgDi = _mm256_set1_epi32(src.d); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#if !__AVX2__ const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#endif // !__AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -937,23 +791,10 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, M gz = _mm256_sub_ps(border_z, reflectz_v); } - __m256i ix = _mm256_cvtps_epi32(gx); - __m256i iy = _mm256_cvtps_epi32(gy); - __m256i iz = _mm256_cvtps_epi32(gz); - -#if __AVX2__ - __m256i i_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), iz), - _mm256_add_epi32(_mm256_mullo_epi32(iy, vImgWi), ix)), - vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); -#else - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), - vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); -#endif // __AVX2__ for (int q = 0; q < dst.c; q++) { From b5f9dd4b23c056466e7a2a9ef1b051ef3fd82bfd Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Tue, 6 Dec 2022 08:23:03 +0000 Subject: [PATCH 042/127] apply code-format changes --- src/layer/x86/gridsample_nearest_pack8.h | 44 +++++++++++++++--------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/src/layer/x86/gridsample_nearest_pack8.h b/src/layer/x86/gridsample_nearest_pack8.h index 1448653c987..e7fb13fa1e9 100644 --- a/src/layer/x86/gridsample_nearest_pack8.h +++ b/src/layer/x86/gridsample_nearest_pack8.h @@ -408,12 +408,14 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), - _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); + __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), + _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, gz, _CMP_GT_OS))); - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), + vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); @@ -470,12 +472,14 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); gz = 
_mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), - _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); + __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), + _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, gz, _CMP_GT_OS))); - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), + vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); @@ -544,8 +548,10 @@ static void gridsample_3d_nearest_align0_border_blob_pack8(const Mat& src, Mat& gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), + vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); @@ -614,8 +620,10 @@ static void gridsample_3d_nearest_align1_border_blob_pack8(const Mat& src, Mat& gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), + vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); @@ -715,8 +723,10 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); } - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), + vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); @@ -791,8 +801,10 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, M gz = 
_mm256_sub_ps(border_z, reflectz_v); } - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), + vElempackf), + _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m256i i_offset = _mm256_cvtps_epi32(offset); From 0367ccaa9ecfa69637e67f67ccf242a06155527f Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Tue, 6 Dec 2022 21:13:42 +0800 Subject: [PATCH 043/127] fix compile bug and optimize [WIP] --- src/layer/x86/gridsample_bicubic_pack4.h | 44 +-- src/layer/x86/gridsample_x86.cpp | 453 ++++++++++++++++++++++- src/layer/x86/sse_mathfun.h | 4 +- 3 files changed, 444 insertions(+), 57 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_pack4.h b/src/layer/x86/gridsample_bicubic_pack4.h index 11fb8517899..bc7c4c3635e 100644 --- a/src/layer/x86/gridsample_bicubic_pack4.h +++ b/src/layer/x86/gridsample_bicubic_pack4.h @@ -176,7 +176,7 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack4(const Mat& src, Mat& d __m128 gx1 = gx_floor; __m128 gx2 = _mm_add_ps(gx_floor, v1fp4); __m128 gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); - + __m128i x0 = _mm_cvtps_epi32(gx0); __m128i x1 = _mm_cvtps_epi32(gx1); __m128i x2 = _mm_cvtps_epi32(gx2); @@ -241,8 +241,6 @@ static void gridsample_2d_bicubic_align0_border_blob_pack4(const Mat& src, Mat& { const __m128 vImgWf = _mm_set1_ps(src.w); const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128 vElempackf = _mm_set1_ps(src.elempack); @@ -280,19 +278,12 @@ static void gridsample_2d_bicubic_align0_border_blob_pack4(const Mat& src, Mat& gx2 = _mm_min_ps(border_x, _mm_max_ps(gx2, _mm_setzero_ps())); gx3 = _mm_min_ps(border_x, _mm_max_ps(gx3, _mm_setzero_ps())); - __m128i x0 = _mm_cvtps_epi32(gx0); - __m128i x1 = _mm_cvtps_epi32(gx1); - __m128i x2 = _mm_cvtps_epi32(gx2); - __m128i x3 = _mm_cvtps_epi32(gx3); - __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; for (int i = 0; i < 4; i++) { gy = _mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)); gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - __m128i y = _mm_cvtps_epi32(gy); - __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), @@ -332,8 +323,6 @@ static void gridsample_2d_bicubic_align1_border_blob_pack4(const Mat& src, Mat& { const __m128 vImgWf = _mm_set1_ps(src.w); const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128 vElempackf = _mm_set1_ps(src.elempack); @@ -372,19 +361,12 @@ static void gridsample_2d_bicubic_align1_border_blob_pack4(const Mat& src, Mat& gx2 = _mm_min_ps(border_x, _mm_max_ps(gx2, _mm_setzero_ps())); gx3 = _mm_min_ps(border_x, _mm_max_ps(gx3, _mm_setzero_ps())); - __m128i x0 = _mm_cvtps_epi32(gx0); - __m128i x1 = _mm_cvtps_epi32(gx1); - __m128i x2 = _mm_cvtps_epi32(gx2); - __m128i x3 = _mm_cvtps_epi32(gx3); - __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; for (int i = 0; i < 4; i++) { gy = _mm_add_ps(gy_floor, 
_mm_set1_ps(-1.0f + i)); gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - __m128i y = _mm_cvtps_epi32(gy); - __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), @@ -424,8 +406,6 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, M { const __m128 vImgWf = _mm_set1_ps(src.w); const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128 vElempackf = _mm_set1_ps(src.elempack); @@ -460,8 +440,6 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, M const __m128 v0p5fp4 = _mm_set1_ps(0.5f); { // x0 - const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - gx0 = _mm_add_ps(gx0, v0p5fp4); gx0 = _mm_and_ps(gx0, *(__m128*)_ps_inv_sign_mask); @@ -518,11 +496,6 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, M gx3 = _mm_min_ps(border_x, _mm_max_ps(gx3, _mm_setzero_ps())); } - __m128i x0 = _mm_cvtps_epi32(gx0); - __m128i x1 = _mm_cvtps_epi32(gx1); - __m128i x2 = _mm_cvtps_epi32(gx2); - __m128i x3 = _mm_cvtps_epi32(gx3); - __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; for (int i = 0; i < 4; i++) { @@ -530,8 +503,6 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, M { //y - const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - gy = _mm_add_ps(gy, v0p5fp4); gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); @@ -546,8 +517,6 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, M gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); } - __m128i y = _mm_cvtps_epi32(gy); - __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), @@ -589,8 +558,6 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack4(const Mat& src, M const __m128 vImgWf = _mm_set1_ps(src.w); const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128 vElempackf = _mm_set1_ps(src.elempack); @@ -626,8 +593,6 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack4(const Mat& src, M const __m128 v0p5fp4 = _mm_set1_ps(0.5f); { // x0 - const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - gx0 = _mm_and_ps(gx0, *(__m128*)_ps_inv_sign_mask); __m128 reflectx0_v = _mm_and_ps(_mm_sub_ps(gx0, border_x), *(__m128*)_ps_inv_sign_mask); gx0 = _mm_sub_ps(border_x, reflectx0_v); @@ -651,11 +616,6 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack4(const Mat& src, M gx3 = _mm_sub_ps(border_x, reflectx3_v); } - __m128i x0 = _mm_cvtps_epi32(gx0); - __m128i x1 = _mm_cvtps_epi32(gx1); - __m128i x2 = _mm_cvtps_epi32(gx2); - __m128i x3 = _mm_cvtps_epi32(gx3); - __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; for (int i = 0; i < 4; i++) { @@ -671,8 +631,6 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack4(const Mat& src, M gy = _mm_sub_ps(border_y, reflecty_v); } - __m128i y = _mm_cvtps_epi32(gy); - __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); __m128 v1_offset_f = 
_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 622e055f682..ad4298374d7 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -409,7 +409,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Tue, 6 Dec 2022 13:17:34 +0000 Subject: [PATCH 044/127] apply code-format changes --- src/layer/x86/gridsample_bicubic_pack4.h | 2 +- src/layer/x86/gridsample_x86.cpp | 31 +++++++++--------------- src/layer/x86/sse_mathfun.h | 4 +-- 3 files changed, 15 insertions(+), 22 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_pack4.h b/src/layer/x86/gridsample_bicubic_pack4.h index bc7c4c3635e..0e815ffbb35 100644 --- a/src/layer/x86/gridsample_bicubic_pack4.h +++ b/src/layer/x86/gridsample_bicubic_pack4.h @@ -176,7 +176,7 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack4(const Mat& src, Mat& d __m128 gx1 = gx_floor; __m128 gx2 = _mm_add_ps(gx_floor, v1fp4); __m128 gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); - + __m128i x0 = _mm_cvtps_epi32(gx0); __m128i x1 = _mm_cvtps_epi32(gx1); __m128i x2 = _mm_cvtps_epi32(gx2); diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index ad4298374d7..0a27b602041 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -3495,7 +3495,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Wed, 7 Dec 2022 23:05:30 +0800 Subject: [PATCH 045/127] finish pack1 dims=4 and optimize --- src/layer/x86/gridsample_nearest_pack4.h | 101 -- src/layer/x86/gridsample_x86.cpp | 1801 +++++++++++++++++++++- 2 files changed, 1730 insertions(+), 172 deletions(-) diff --git a/src/layer/x86/gridsample_nearest_pack4.h b/src/layer/x86/gridsample_nearest_pack4.h index fef18e57a04..4b9561d368c 100644 --- a/src/layer/x86/gridsample_nearest_pack4.h +++ b/src/layer/x86/gridsample_nearest_pack4.h @@ -19,10 +19,7 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d const __m128i vImgWi = _mm_set1_epi32(src.w); const __m128i vImgHi = _mm_set1_epi32(src.h); - const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#if !((_MSC_VER && __AVX__) || __SSE4_1__) const __m128 vElempackf = _mm_set1_ps(src.elempack); -#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -54,14 +51,9 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d __m128i v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); -#else __m128 offset = 
_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx), vElempackf), _mm_set_ps(3, 2, 1, 0)); __m128i i_offset = _mm_cvtps_epi32(offset); -#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -80,10 +72,7 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d const __m128i vImgWi = _mm_set1_epi32(src.w); const __m128i vImgHi = _mm_set1_epi32(src.h); - const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#if !((_MSC_VER && __AVX__) || __SSE4_1__) const __m128 vElempackf = _mm_set1_ps(src.elempack); -#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -115,14 +104,9 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d __m128i v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); -#else __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx), vElempackf), _mm_set_ps(3, 2, 1, 0)); __m128i i_offset = _mm_cvtps_epi32(offset); -#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -138,13 +122,8 @@ static void gridsample_2d_nearest_align0_border_blob_pack4(const Mat& src, Mat& { const __m128 vImgWf = _mm_set1_ps(src.w); const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#if !((_MSC_VER && __AVX__) || __SSE4_1__) const __m128 vElempackf = _mm_set1_ps(src.elempack); -#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -178,17 +157,9 @@ static void gridsample_2d_nearest_align0_border_blob_pack4(const Mat& src, Mat& gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - __m128i ix = _mm_cvtps_epi32(gx); - __m128i iy = _mm_cvtps_epi32(gy); - -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); -#else __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx), vElempackf), _mm_set_ps(3, 2, 1, 0)); __m128i i_offset = _mm_cvtps_epi32(offset); -#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -204,13 +175,8 @@ static void gridsample_2d_nearest_align1_border_blob_pack4(const Mat& src, Mat& { const __m128 vImgWf = _mm_set1_ps(src.w); const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#if !((_MSC_VER && __AVX__) || __SSE4_1__) const __m128 vElempackf = _mm_set1_ps(src.elempack); -#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -244,17 +210,9 @@ static void gridsample_2d_nearest_align1_border_blob_pack4(const Mat& src, Mat& gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - __m128i ix = _mm_cvtps_epi32(gx); - __m128i iy = _mm_cvtps_epi32(gy); - -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - 
_mm_set_epi32(3, 2, 1, 0)); -#else __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx), vElempackf), _mm_set_ps(3, 2, 1, 0)); __m128i i_offset = _mm_cvtps_epi32(offset); -#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -270,13 +228,8 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack4(const Mat& src, M { const __m128 vImgWf = _mm_set1_ps(src.w); const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#if !((_MSC_VER && __AVX__) || __SSE4_1__) const __m128 vElempackf = _mm_set1_ps(src.elempack); -#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -331,17 +284,9 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack4(const Mat& src, M gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); } - __m128i ix = _mm_cvtps_epi32(gx); - __m128i iy = _mm_cvtps_epi32(gy); - -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); -#else __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx), vElempackf), _mm_set_ps(3, 2, 1, 0)); __m128i i_offset = _mm_cvtps_epi32(offset); -#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -357,13 +302,8 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack4(const Mat& src, M { const __m128 vImgWf = _mm_set1_ps(src.w); const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#if !((_MSC_VER && __AVX__) || __SSE4_1__) const __m128 vElempackf = _mm_set1_ps(src.elempack); -#endif // !__SSE4_1__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -401,17 +341,9 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack4(const Mat& src, M gy = _mm_sub_ps(border_y, reflecty_v); } - __m128i ix = _mm_cvtps_epi32(gx); - __m128i iy = _mm_cvtps_epi32(gy); - -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128i i_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); -#else __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx), vElempackf), _mm_set_ps(3, 2, 1, 0)); __m128i i_offset = _mm_cvtps_epi32(offset); -#endif // __SSE4_1__ for (int q = 0; q < dst.c; q++) { @@ -500,7 +432,6 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& d const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128i vImgDi = _mm_set1_epi32(src.d); - const __m128i vElempacki = _mm_set1_epi32(src.elempack); const __m128 vElempackf = _mm_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) @@ -564,11 +495,7 @@ static void gridsample_3d_nearest_align0_border_blob_pack4(const Mat& src, Mat& const __m128 vImgWf = _mm_set1_ps(src.w); const __m128 vImgHf = _mm_set1_ps(src.h); const __m128 vImgDf = _mm_set1_ps(src.d); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - const __m128i vImgDi = _mm_set1_epi32(src.d); - const __m128i vElempacki = _mm_set1_epi32(src.elempack); const __m128 vElempackf = _mm_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) @@ -614,10 +541,6 @@ static void 
gridsample_3d_nearest_align0_border_blob_pack4(const Mat& src, Mat& gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); gz = floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); - __m128i ix = _mm_cvtps_epi32(gx); - __m128i iy = _mm_cvtps_epi32(gy); - __m128i iz = _mm_cvtps_epi32(gz); - __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)), vElempackf), @@ -640,11 +563,7 @@ static void gridsample_3d_nearest_align1_border_blob_pack4(const Mat& src, Mat& const __m128 vImgWf = _mm_set1_ps(src.w); const __m128 vImgHf = _mm_set1_ps(src.h); const __m128 vImgDf = _mm_set1_ps(src.d); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - const __m128i vImgDi = _mm_set1_epi32(src.d); - const __m128i vElempacki = _mm_set1_epi32(src.elempack); const __m128 vElempackf = _mm_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) @@ -690,10 +609,6 @@ static void gridsample_3d_nearest_align1_border_blob_pack4(const Mat& src, Mat& gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); gz = floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); - __m128i ix = _mm_cvtps_epi32(gx); - __m128i iy = _mm_cvtps_epi32(gy); - __m128i iz = _mm_cvtps_epi32(gz); - __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)), vElempackf), @@ -716,11 +631,7 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack4(const Mat& src, M const __m128 vImgWf = _mm_set1_ps(src.w); const __m128 vImgHf = _mm_set1_ps(src.h); const __m128 vImgDf = _mm_set1_ps(src.d); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - const __m128i vImgDi = _mm_set1_epi32(src.d); - const __m128i vElempacki = _mm_set1_epi32(src.elempack); const __m128 vElempackf = _mm_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) @@ -797,10 +708,6 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack4(const Mat& src, M gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); } - __m128i ix = _mm_cvtps_epi32(gx); - __m128i iy = _mm_cvtps_epi32(gy); - __m128i iz = _mm_cvtps_epi32(gz); - __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)), vElempackf), @@ -823,11 +730,7 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack4(const Mat& src, M const __m128 vImgWf = _mm_set1_ps(src.w); const __m128 vImgHf = _mm_set1_ps(src.h); const __m128 vImgDf = _mm_set1_ps(src.d); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - const __m128i vImgDi = _mm_set1_epi32(src.d); - const __m128i vElempacki = _mm_set1_epi32(src.elempack); const __m128 vElempackf = _mm_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) @@ -879,10 +782,6 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack4(const Mat& src, M gz = _mm_sub_ps(border_z, reflectz_v); } - __m128i ix = _mm_cvtps_epi32(gx); - __m128i iy = _mm_cvtps_epi32(gy); - __m128i iz = _mm_cvtps_epi32(gz); - __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)), vElempackf), diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 0a27b602041..df08788dd26 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -907,6 +907,7 @@ int 
GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(top_blob.channel(q).depth(y).data) + x / 3, _v); + } } + + nn = grid_size % 24; + #endif // __AVX__ for (int x = grid_size - nn; x < grid_size; x += 3) { float gx = gridptr[x]; float gy = gridptr[x + 1]; float gz = gridptr[x + 2]; + + gx = ((gx + 1) * w - 1) / 2.f; + gy = ((gy + 1) * h - 1) / 2.f; + gz = ((gz + 1) * d - 1) / 2.f; + + // bilinear interpolate + int x0 = (int)floor(gx); + int y0 = (int)floor(gy); + int z0 = (int)floor(gz); + int x1 = x0 + 1; + int y1 = y0 + 1; + int z1 = z0 + 1; + + bool x0_in_range = (x0 > -1) & (x0 < w); + bool y0_in_range = (y0 > -1) & (y0 < h); + bool z0_in_range = (z0 > -1) & (z0 < d); + bool x1_in_range = (x1 > -1) & (x1 < w); + bool y1_in_range = (y1 > -1) & (y1 < h); + bool z1_in_range = (z1 > -1) & (z1 < d); + + bool v00_in_range = x0_in_range & y0_in_range; + bool v01_in_range = x1_in_range & y0_in_range; + bool v10_in_range = x0_in_range & y1_in_range; + bool v11_in_range = x1_in_range & y1_in_range; + + bool v000_in_range = v00_in_range & z0_in_range; + bool v010_in_range = v10_in_range & z0_in_range; + bool v100_in_range = v00_in_range & z1_in_range; + bool v110_in_range = v10_in_range & z1_in_range; + + bool v001_in_range = v01_in_range & z0_in_range; + bool v011_in_range = v11_in_range & z0_in_range; + bool v101_in_range = v01_in_range & z1_in_range; + bool v111_in_range = v11_in_range & z1_in_range; + + float alpha = gx - x0; + float beta = gy - y0; + float gamma = gz - z0; + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + float v000 = v000_in_range ? image.depth(z0).row(y0)[x0] : 0; + float v010 = v010_in_range ? image.depth(z0).row(y1)[x0] : 0; + float v100 = v100_in_range ? image.depth(z1).row(y0)[x0] : 0; + float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; + + float v001 = v001_in_range ? image.depth(z0).row(y0)[x1] : 0; + float v011 = v011_in_range ? image.depth(z0).row(y1)[x1] : 0; + float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; + float v111 = v111_in_range ? 
image.depth(z1).row(y1)[x1] : 0; + + float v00 = v000 * (1 - alpha) + v001 * alpha; + float v01 = v010 * (1 - alpha) + v011 * alpha; + float v10 = v100 * (1 - alpha) + v101 * alpha; + float v11 = v110 * (1 - alpha) + v111 * alpha; + + float v0 = v00 * (1 - beta) + v01 * beta; + float v1 = v10 * (1 - beta) + v11 * beta; + + float v = v0 * (1 - gamma) + v1 * gamma; + top_blob.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; + } } } } else { - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < grid_p1.c; y++) { float* gridptr = grid_p1.channel(y); @@ -3552,13 +3784,253 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(top_blob.channel(q).depth(y).data) + x / 3, _v); + } } + nn = grid_size % 24; #endif // __AVX__ for (int x = grid_size - nn; x < grid_size; x += 3) { float gx = gridptr[x]; float gy = gridptr[x + 1]; float gz = gridptr[x + 2]; + + gx = (gx + 1) / 2.f * (w - 1); + gy = (gy + 1) / 2.f * (h - 1); + gz = (gz + 1) / 2.f * (d - 1); + + // bilinear interpolate + int x0 = (int)floor(gx); + int y0 = (int)floor(gy); + int z0 = (int)floor(gz); + int x1 = x0 + 1; + int y1 = y0 + 1; + int z1 = z0 + 1; + + bool x0_in_range = (x0 > -1) & (x0 < w); + bool y0_in_range = (y0 > -1) & (y0 < h); + bool z0_in_range = (z0 > -1) & (z0 < d); + bool x1_in_range = (x1 > -1) & (x1 < w); + bool y1_in_range = (y1 > -1) & (y1 < h); + bool z1_in_range = (z1 > -1) & (z1 < d); + + bool v00_in_range = x0_in_range & y0_in_range; + bool v01_in_range = x1_in_range & y0_in_range; + bool v10_in_range = x0_in_range & y1_in_range; + bool v11_in_range = x1_in_range & y1_in_range; + + bool v000_in_range = v00_in_range & z0_in_range; + bool v010_in_range = v10_in_range & z0_in_range; + bool v100_in_range = v00_in_range & z1_in_range; + bool v110_in_range = v10_in_range & z1_in_range; + + bool v001_in_range = v01_in_range & z0_in_range; + bool v011_in_range = v11_in_range & z0_in_range; + bool v101_in_range = v01_in_range & z1_in_range; + bool v111_in_range = v11_in_range & z1_in_range; + + float alpha = gx - x0; + float beta = gy - y0; + float gamma = gz - z0; + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + float v000 = v000_in_range ? image.depth(z0).row(y0)[x0] : 0; + float v010 = v010_in_range ? image.depth(z0).row(y1)[x0] : 0; + float v100 = v100_in_range ? image.depth(z1).row(y0)[x0] : 0; + float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; + + float v001 = v001_in_range ? image.depth(z0).row(y0)[x1] : 0; + float v011 = v011_in_range ? image.depth(z0).row(y1)[x1] : 0; + float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; + float v111 = v111_in_range ? 
image.depth(z1).row(y1)[x1] : 0; + + float v00 = v000 * (1 - alpha) + v001 * alpha; + float v01 = v010 * (1 - alpha) + v011 * alpha; + float v10 = v100 * (1 - alpha) + v101 * alpha; + float v11 = v110 * (1 - alpha) + v111 * alpha; + + float v0 = v00 * (1 - beta) + v01 * beta; + float v1 = v10 * (1 - beta) + v11 * beta; + + float v = v0 * (1 - gamma) + v1 * gamma; + top_blob.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; + } } } } @@ -3567,7 +4039,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(top_blob.channel(q).depth(y).data) + x / 3, _v); + } } + nn = grid_size % 24; #endif // __AVX__ for (int x = grid_size - nn; x < grid_size; x += 3) { float gx = gridptr[x]; float gy = gridptr[x + 1]; float gz = gridptr[x + 2]; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ + + gx = ((gx + 1) * w - 1) / 2.f; + gy = ((gy + 1) * h - 1) / 2.f; + gz = ((gz + 1) * d - 1) / 2.f; + + gx = std::min(w - 1.0f, std::max(gx, 0.0f)); + gy = std::min(h - 1.0f, std::max(gy, 0.0f)); + gz = std::min(d - 1.0f, std::max(gz, 0.0f)); + + // bilinear interpolate + int x0 = (int)floor(gx); + int y0 = (int)floor(gy); + int z0 = (int)floor(gz); + int x1 = x0 + 1; + int y1 = y0 + 1; + int z1 = z0 + 1; + + bool x1_in_range = (x1 > -1) & (x1 < w); + bool y1_in_range = (y1 > -1) & (y1 < h); + bool z1_in_range = (z1 > -1) & (z1 < d); + + + bool v11_in_range = x1_in_range & y1_in_range; + + bool v110_in_range = y1_in_range & z1_in_range; + + bool v101_in_range = x1_in_range & z1_in_range; + bool v111_in_range = v11_in_range & z1_in_range; + + float alpha = gx - x0; + float beta = gy - y0; + float gamma = gz - z0; + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + float v000 = image.depth(z0).row(y0)[x0]; + float v010 = y1_in_range ? image.depth(z0).row(y1)[x0] : 0; + float v100 = z1_in_range ? image.depth(z1).row(y0)[x0] : 0; + float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; + + float v001 = x1_in_range ? image.depth(z0).row(y0)[x1] : 0; + float v011 = v11_in_range ? image.depth(z0).row(y1)[x1] : 0; + float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; + float v111 = v111_in_range ? 
image.depth(z1).row(y1)[x1] : 0; + + float v00 = v000 * (1 - alpha) + v001 * alpha; + float v01 = v010 * (1 - alpha) + v011 * alpha; + float v10 = v100 * (1 - alpha) + v101 * alpha; + float v11 = v110 * (1 - alpha) + v111 * alpha; + + float v0 = v00 * (1 - beta) + v01 * beta; + float v1 = v10 * (1 - beta) + v11 * beta; + + float v = v0 * (1 - gamma) + v1 * gamma; + top_blob.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; + } + } + } + } + else + { +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid_p1.c; y++) + { + float* gridptr = grid_p1.channel(y); + int nn = grid_size; +#if __AVX__ for (int x = 0; x + 23 < nn; x += 24) { __m256 tmp_x = _mm256_loadu_ps(gridptr + x); @@ -3623,13 +4274,191 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(top_blob.channel(q).depth(y).data) + x / 3, _v); + } } + nn = grid_size % 24; #endif // __AVX__ for (int x = grid_size - nn; x < grid_size; x += 3) { float gx = gridptr[x]; float gy = gridptr[x + 1]; float gz = gridptr[x + 2]; + + gx = (gx + 1) / 2.f * (w - 1); + gy = (gy + 1) / 2.f * (h - 1); + gz = (gz + 1) / 2.f * (d - 1); + + gx = std::min(w - 1.0f, std::max(gx, 0.0f)); + gy = std::min(h - 1.0f, std::max(gy, 0.0f)); + gz = std::min(d - 1.0f, std::max(gz, 0.0f)); + + // bilinear interpolate + int x0 = (int)floor(gx); + int y0 = (int)floor(gy); + int z0 = (int)floor(gz); + int x1 = x0 + 1; + int y1 = y0 + 1; + int z1 = z0 + 1; + + bool x1_in_range = (x1 > -1) & (x1 < w); + bool y1_in_range = (y1 > -1) & (y1 < h); + bool z1_in_range = (z1 > -1) & (z1 < d); + + bool v11_in_range = x1_in_range & y1_in_range; + + bool v110_in_range = y1_in_range & z1_in_range; + + bool v101_in_range = x1_in_range & z1_in_range; + bool v111_in_range = v11_in_range & z1_in_range; + + float alpha = gx - x0; + float beta = gy - y0; + float gamma = gz - z0; + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + float v000 = image.depth(z0).row(y0)[x0]; + float v010 = y1_in_range ? image.depth(z0).row(y1)[x0] : 0; + float v100 = z1_in_range ? image.depth(z1).row(y0)[x0] : 0; + float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; + + float v001 = x1_in_range ? image.depth(z0).row(y0)[x1] : 0; + float v011 = v11_in_range ? image.depth(z0).row(y1)[x1] : 0; + float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; + float v111 = v111_in_range ? 
image.depth(z1).row(y1)[x1] : 0; + + float v00 = v000 * (1 - alpha) + v001 * alpha; + float v01 = v010 * (1 - alpha) + v011 * alpha; + float v10 = v100 * (1 - alpha) + v101 * alpha; + float v11 = v110 * (1 - alpha) + v111 * alpha; + + float v0 = v00 * (1 - beta) + v01 * beta; + float v1 = v10 * (1 - beta) + v11 * beta; + + float v = v0 * (1 - gamma) + v1 * gamma; + top_blob.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; + } } } } @@ -3638,7 +4467,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(top_blob.channel(q).depth(y).data) + x / 3, _v); + } } + nn = grid_size % 24; #endif // __AVX__ for (int x = grid_size - nn; x < grid_size; x += 3) { float gx = gridptr[x]; float gy = gridptr[x + 1]; float gz = gridptr[x + 2]; + + gx = ((gx + 1) * w - 1) / 2.f; + gy = ((gy + 1) * h - 1) / 2.f; + gz = ((gz + 1) * d - 1) / 2.f; + + gx = abs(gx + 0.5f); + gx = w - abs(gx - w) - 0.5; + + gy = abs(gy + 0.5f); + gy = h - abs(gy - h) - 0.5; + + gz = abs(gz + 0.5f); + gz = d - abs(gz - d) - 0.5; + + gx = std::min(w - 1.0f, std::max(gx, 0.0f)); + gy = std::min(h - 1.0f, std::max(gy, 0.0f)); + gz = std::min(d - 1.0f, std::max(gz, 0.0f)); + + // bilinear interpolate + int x0 = (int)floor(gx); + int y0 = (int)floor(gy); + int z0 = (int)floor(gz); + int x1 = x0 + 1; + int y1 = y0 + 1; + int z1 = z0 + 1; + + bool x1_in_range = (x1 > -1) & (x1 < w); + bool y1_in_range = (y1 > -1) & (y1 < h); + bool z1_in_range = (z1 > -1) & (z1 < d); + + bool v11_in_range = x1_in_range & y1_in_range; + + bool v110_in_range = y1_in_range & z1_in_range; + + bool v101_in_range = x1_in_range & z1_in_range; + bool v111_in_range = v11_in_range & z1_in_range; + + float alpha = gx - x0; + float beta = gy - y0; + float gamma = gz - z0; + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + float v000 = image.depth(z0).row(y0)[x0]; + float v010 = y1_in_range ? image.depth(z0).row(y1)[x0] : 0; + float v100 = z1_in_range ? image.depth(z1).row(y0)[x0] : 0; + float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; + + float v001 = x1_in_range ? image.depth(z0).row(y0)[x1] : 0; + float v011 = v11_in_range ? image.depth(z0).row(y1)[x1] : 0; + float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; + float v111 = v111_in_range ? 
image.depth(z1).row(y1)[x1] : 0; + + float v00 = v000 * (1 - alpha) + v001 * alpha; + float v01 = v010 * (1 - alpha) + v011 * alpha; + float v10 = v100 * (1 - alpha) + v101 * alpha; + float v11 = v110 * (1 - alpha) + v111 * alpha; + + float v0 = v00 * (1 - beta) + v01 * beta; + float v1 = v10 * (1 - beta) + v11 * beta; + + float v = v0 * (1 - gamma) + v1 * gamma; + top_blob.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; + } } } } else { - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < grid_p1.c; y++) { float* gridptr = grid_p1.channel(y); @@ -3694,13 +4741,206 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(top_blob.channel(q).depth(y).data) + x / 3, _v); + } } + nn = grid_size % 24; #endif // __AVX__ for (int x = grid_size - nn; x < grid_size; x += 3) { float gx = gridptr[x]; float gy = gridptr[x + 1]; float gz = gridptr[x + 2]; + + gx = (gx + 1) / 2.f * (w - 1); + gy = (gy + 1) / 2.f * (h - 1); + gz = (gz + 1) / 2.f * (d - 1); + + gx = abs(gx); + gx = (w - 1) - abs(gx - (w - 1)); + + gy = abs(gy); + gy = (h - 1) - abs(gy - (h - 1)); + + gz = abs(gz); + gz = (d - 1) - abs(gz - (d - 1)); + + gx = std::min(w - 1.0f, std::max(gx, 0.0f)); + gy = std::min(h - 1.0f, std::max(gy, 0.0f)); + gz = std::min(d - 1.0f, std::max(gz, 0.0f)); + + // bilinear interpolate + int x0 = (int)floor(gx); + int y0 = (int)floor(gy); + int z0 = (int)floor(gz); + int x1 = x0 + 1; + int y1 = y0 + 1; + int z1 = z0 + 1; + + bool x1_in_range = (x1 > -1) & (x1 < w); + bool y1_in_range = (y1 > -1) & (y1 < h); + bool z1_in_range = (z1 > -1) & (z1 < d); + + bool v11_in_range = x1_in_range & y1_in_range; + + bool v110_in_range = y1_in_range & z1_in_range; + + bool v101_in_range = x1_in_range & z1_in_range; + bool v111_in_range = v11_in_range & z1_in_range; + + float alpha = gx - x0; + float beta = gy - y0; + float gamma = gz - z0; + + for (int q = 0; q < channels; q++) + { + const Mat& image = bottom_blob.channel(q); + float v000 = image.depth(z0).row(y0)[x0]; + float v010 = y1_in_range ? image.depth(z0).row(y1)[x0] : 0; + float v100 = z1_in_range ? image.depth(z1).row(y0)[x0] : 0; + float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; + + float v001 = x1_in_range ? image.depth(z0).row(y0)[x1] : 0; + float v011 = v11_in_range ? image.depth(z0).row(y1)[x1] : 0; + float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; + float v111 = v111_in_range ? 
image.depth(z1).row(y1)[x1] : 0; + + float v00 = v000 * (1 - alpha) + v001 * alpha; + float v01 = v010 * (1 - alpha) + v011 * alpha; + float v10 = v100 * (1 - alpha) + v101 * alpha; + float v11 = v110 * (1 - alpha) + v111 * alpha; + + float v0 = v00 * (1 - beta) + v01 * beta; + float v1 = v10 * (1 - beta) + v11 * beta; + + float v = v0 * (1 - gamma) + v1 * gamma; + top_blob.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; + } } } } @@ -3712,7 +4952,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(top_blob.channel(q).depth(y).data) + x / 3, _v); + } } + + nn = grid_size % 24; #endif // __AVX__ for (int x = grid_size - nn; x < grid_size; x += 3) { float gx = gridptr[x]; float gy = gridptr[x + 1]; float gz = gridptr[x + 2]; + + gx = ((gx + 1) * w - 1) / 2.f; + gy = ((gy + 1) * h - 1) / 2.f; + gz = ((gz + 1) * d - 1) / 2.f; + + // bilinear interpolate + int x0 = static_cast(floor(gx + 0.5f)); + int y0 = static_cast(floor(gy + 0.5f)); + int z0 = static_cast(floor(gz + 0.5f)); + + bool v_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h) && (z0 > -1) && (z0 < bottom_blob.d); + + for (int q = 0; q < channels; q++) + { + top_blob.channel(q).depth(y)[x / 3] = v_in_range ? bottom_blob.channel(q).depth(z0).row(y0)[x0] : 0; + } } } } else { - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < grid_p1.c; y++) { float* gridptr = grid_p1.channel(y); @@ -3769,13 +5060,62 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(top_blob.channel(q).depth(y).data) + x / 3, _v); + } } + nn = grid_size % 24; #endif // __AVX__ for (int x = grid_size - nn; x < grid_size; x += 3) { float gx = gridptr[x]; float gy = gridptr[x + 1]; float gz = gridptr[x + 2]; + + gx = (gx + 1) / 2.f * (w - 1); + gy = (gy + 1) / 2.f * (h - 1); + gz = (gz + 1) / 2.f * (d - 1); + + int x0 = static_cast(floor(gx + 0.5f)); + int y0 = static_cast(floor(gy + 0.5f)); + int z0 = static_cast(floor(gz + 0.5f)); + + bool v_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h) && (z0 > -1) && (z0 < bottom_blob.d); + + for (int q = 0; q < channels; q++) + { + top_blob.channel(q).depth(y)[x / 3] = v_in_range ? 
bottom_blob.channel(q).depth(z0).row(y0)[x0] : 0; + } } } } @@ -3784,7 +5124,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(top_blob.channel(q).depth(y).data) + x / 3, _v); + } } + nn = grid_size % 24; #endif // __AVX__ for (int x = grid_size - nn; x < grid_size; x += 3) { float gx = gridptr[x]; float gy = gridptr[x + 1]; float gz = gridptr[x + 2]; + + gx = ((gx + 1) * w - 1) / 2.f; + gy = ((gy + 1) * h - 1) / 2.f; + gz = ((gz + 1) * d - 1) / 2.f; + + gx = std::min(w - 1.0f, std::max(gx, 0.0f)); + gy = std::min(h - 1.0f, std::max(gy, 0.0f)); + gz = std::min(d - 1.0f, std::max(gz, 0.0f)); + + int x0 = static_cast(floor(gx + 0.5f)); + int y0 = static_cast(floor(gy + 0.5f)); + int z0 = static_cast(floor(gz + 0.5f)); + + for (int q = 0; q < channels; q++) + { + top_blob.channel(q).depth(y)[x / 3] = bottom_blob.channel(q).depth(z0).row(y0)[x0]; + } } } } else { - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < grid_p1.c; y++) { float* gridptr = grid_p1.channel(y); @@ -3840,13 +5239,101 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(top_blob.channel(q).depth(y).data) + x / 3, _v); + } } + nn = grid_size % 24; #endif // __AVX__ for (int x = grid_size - nn; x < grid_size; x += 3) { float gx = gridptr[x]; float gy = gridptr[x + 1]; float gz = gridptr[x + 2]; + + gx = (gx + 1) / 2.f * (w - 1); + gy = (gy + 1) / 2.f * (h - 1); + gz = (gz + 1) / 2.f * (d - 1); + + gx = std::min(w - 1.0f, std::max(gx, 0.0f)); + gy = std::min(h - 1.0f, std::max(gy, 0.0f)); + gz = std::min(d - 1.0f, std::max(gz, 0.0f)); + + int x0 = static_cast(floor(gx + 0.5f)); + int y0 = static_cast(floor(gy + 0.5f)); + int z0 = static_cast(floor(gz + 0.5f)); + + for (int q = 0; q < channels; q++) + { + top_blob.channel(q).depth(y)[x / 3] = bottom_blob.channel(q).depth(z0).row(y0)[x0]; + } } } } @@ -3855,7 +5342,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(top_blob.channel(q).depth(y).data) + x / 3, _v); + } } + nn = grid_size % 24; #endif // __AVX__ for (int x = grid_size - nn; x < grid_size; x += 3) { float gx = gridptr[x]; float gy = gridptr[x + 1]; float gz = gridptr[x + 2]; + + gx = ((gx + 1) * w - 1) / 2.f; + gy = ((gy + 1) * h - 1) / 2.f; + gz = ((gz + 1) * d - 1) / 2.f; + + gx = floor(gx + 0.5f); + gy = floor(gy + 0.5f); + gz = floor(gz + 0.5f); + + gx = abs(gx + 0.5f); + gx = w - abs(gx - w) - 0.5; + + gy = abs(gy + 0.5f); + gy = h - abs(gy - h) - 0.5; + + gz = abs(gz + 0.5f); + gz = d - abs(gz - d) - 0.5; + + int x0 = std::min(w - 1.0f, std::max(gx, 0.0f)); + int y0 = std::min(h - 1.0f, std::max(gy, 0.0f)); + int z0 = std::min(d - 1.0f, std::max(gz, 0.0f)); + + for (int q = 0; q < channels; q++) + { + top_blob.channel(q).depth(y)[x / 3] = bottom_blob.channel(q).depth(z0).row(y0)[x0]; + } } } } else { - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < grid_p1.c; y++) { float* gridptr = grid_p1.channel(y); @@ -3911,13 +5496,89 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(top_blob.channel(q).depth(y).data) + x / 3, _v); + } } + nn = grid_size % 24; #endif // __AVX__ for (int x = grid_size - nn; x < grid_size; x += 3) { float gx = gridptr[x]; float gy = gridptr[x + 1]; float gz = gridptr[x + 2]; + + gx = (gx + 1) / 2.f * (w - 1); + gy = (gy + 1) / 2.f * (h - 1); + gz = (gz + 1) / 2.f * (d - 1); + 
+ gx = floor(gx + 0.5f); + gy = floor(gy + 0.5f); + gz = floor(gz + 0.5f); + + gx = abs(gx); + gx = (w - 1) - abs(gx - (w - 1)); + + gy = abs(gy); + gy = (h - 1) - abs(gy - (h - 1)); + + gz = abs(gz); + gz = (d - 1) - abs(gz - (d - 1)); + + int x0 = std::min(w - 1.0f, std::max(gx, 0.0f)); + int y0 = std::min(h - 1.0f, std::max(gy, 0.0f)); + int z0 = std::min(d - 1.0f, std::max(gz, 0.0f)); + + for (int q = 0; q < channels; q++) + { + top_blob.channel(q).depth(y)[x / 3] = bottom_blob.channel(q).depth(z0).row(y0)[x0]; + } } } } @@ -3932,10 +5593,8 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector Date: Wed, 7 Dec 2022 15:07:38 +0000 Subject: [PATCH 046/127] apply code-format changes --- src/layer/x86/gridsample_x86.cpp | 72 ++++++++++++++++---------------- 1 file changed, 35 insertions(+), 37 deletions(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index df08788dd26..60254ce80a0 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -924,7 +924,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector -1) & (y1 < h); bool z1_in_range = (z1 > -1) & (z1 < d); - bool v11_in_range = x1_in_range & y1_in_range; bool v110_in_range = y1_in_range & z1_in_range; @@ -4252,7 +4251,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Wed, 7 Dec 2022 23:13:14 +0800 Subject: [PATCH 047/127] fix macro typo --- src/layer/x86/sse_mathfun.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layer/x86/sse_mathfun.h b/src/layer/x86/sse_mathfun.h index c8434c62f97..b4f0fb3ef32 100644 --- a/src/layer/x86/sse_mathfun.h +++ b/src/layer/x86/sse_mathfun.h @@ -740,7 +740,7 @@ static NCNN_FORCEINLINE __m128 pow_ps(__m128 a, __m128 b) static NCNN_FORCEINLINE __m128 floor_ps(const __m128 x) { -#if (_MSC_VER && __AVX__) || (__SSE4_1__ && !__APPLE__ && !__ABDRIUD__) +#if (_MSC_VER && __AVX__) || (__SSE4_1__ && !__APPLE__ && !__ANDRIOD__) return _mm_floor_ps(x); #endif // __SSE4_1__ @@ -781,7 +781,7 @@ static NCNN_FORCEINLINE __m128 floor_ps(const __m128 x) static NCNN_FORCEINLINE __m128 ceil_ps(const __m128 x) { -#if (_MSC_VER && __AVX__) || (__SSE4_1__ && !__APPLE__ && !__ABDRIUD__) +#if (_MSC_VER && __AVX__) || (__SSE4_1__ && !__APPLE__ && !__ANDRIOD__) return _mm_ceil_ps(x); #endif // __SSE4_1__ From 33e066a178f06c71096aef174d8e2c7129c1b864 Mon Sep 17 00:00:00 2001 From: Yoh Date: Thu, 8 Dec 2022 01:38:47 +0800 Subject: [PATCH 048/127] fix avx register copy bug --- src/layer/x86/gridsample_x86.cpp | 36 ++++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 
deletions(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 60254ce80a0..35cf4e61d4c 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -937,7 +937,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Thu, 8 Dec 2022 16:22:05 +0800 Subject: [PATCH 049/127] optimize unused variable and fix floor&ceil macro in sse_mathfun --- src/layer/x86/gridsample_bicubic_pack16.h | 22 -- src/layer/x86/gridsample_bicubic_pack4.h | 5 - src/layer/x86/gridsample_bicubic_pack8.h | 23 -- src/layer/x86/gridsample_bilinear_pack4.h | 37 +-- src/layer/x86/gridsample_bilinear_pack8.h | 40 +--- src/layer/x86/gridsample_nearest_pack16.h | 8 - src/layer/x86/gridsample_nearest_pack4.h | 1 - src/layer/x86/gridsample_nearest_pack8.h | 33 --- src/layer/x86/gridsample_x86.cpp | 260 ++++++++-------------- src/layer/x86/sse_mathfun.h | 4 +- 10 files changed, 135 insertions(+), 298 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_pack16.h b/src/layer/x86/gridsample_bicubic_pack16.h index 5076252bb30..ea3d7950bd0 100644 --- a/src/layer/x86/gridsample_bicubic_pack16.h +++ b/src/layer/x86/gridsample_bicubic_pack16.h @@ -241,8 +241,6 @@ static void gridsample_2d_bicubic_align0_border_blob_pack16(const Mat& src, Mat& { const __m512 vImgWf = _mm512_set1_ps(src.w); const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); const __m512 vElempackf = _mm512_set1_ps(src.elempack); @@ -291,8 +289,6 @@ static void gridsample_2d_bicubic_align0_border_blob_pack16(const Mat& src, Mat& gy = _mm512_add_ps(gy_floor, _mm512_set1_ps(-1.0f + i)); gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); - __m512i y = _mm512_cvtps_epi32(gy); - __m512 v0_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx1), vElempackf), @@ -332,8 +328,6 @@ static void gridsample_2d_bicubic_align1_border_blob_pack16(const Mat& src, Mat& { const __m512 vImgWf = _mm512_set1_ps(src.w); const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); const __m512 vElempackf = _mm512_set1_ps(src.elempack); @@ -383,8 +377,6 @@ static void gridsample_2d_bicubic_align1_border_blob_pack16(const Mat& src, Mat& gy = _mm512_add_ps(gy_floor, _mm512_set1_ps(-1.0f + i)); gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); - __m512i y = _mm512_cvtps_epi32(gy); - __m512 v0_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, 
vImgWf), gx1), vElempackf), @@ -424,8 +416,6 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack16(const Mat& src, { const __m512 vImgWf = _mm512_set1_ps(src.w); const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); const __m512 vElempackf = _mm512_set1_ps(src.elempack); @@ -460,8 +450,6 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack16(const Mat& src, const __m512 v0p5fp16 = _mm512_set1_ps(0.5f); { // x0 - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - gx0 = _mm512_add_ps(gx0, v0p5fp16); gx0 = _mm512_and_ps(gx0, *(__m512*)_ps512_inv_sign_mask); @@ -546,8 +534,6 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack16(const Mat& src, gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); } - __m512i y = _mm512_cvtps_epi32(gy); - __m512 v0_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx1), vElempackf), @@ -589,8 +575,6 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack16(const Mat& src, const __m512 vImgWf = _mm512_set1_ps(src.w); const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); const __m512 vElempackf = _mm512_set1_ps(src.elempack); @@ -626,8 +610,6 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack16(const Mat& src, const __m512 v0p5fp16 = _mm512_set1_ps(0.5f); { // x0 - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - gx0 = _mm512_and_ps(gx0, *(__m512*)_ps512_inv_sign_mask); __m512 reflectx0_v = _mm512_and_ps(_mm512_sub_ps(gx0, border_x), *(__m512*)_ps512_inv_sign_mask); gx0 = _mm512_sub_ps(border_x, reflectx0_v); @@ -663,16 +645,12 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack16(const Mat& src, { //y - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); __m512 reflecty_v = _mm512_and_ps(_mm512_sub_ps(gy, border_y), *(__m512*)_ps512_inv_sign_mask); gy = _mm512_sub_ps(border_y, reflecty_v); } - __m512i y = _mm512_cvtps_epi32(gy); - __m512 v0_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); __m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx1), vElempackf), diff --git a/src/layer/x86/gridsample_bicubic_pack4.h b/src/layer/x86/gridsample_bicubic_pack4.h index 0e815ffbb35..624f9005ecb 100644 --- a/src/layer/x86/gridsample_bicubic_pack4.h +++ b/src/layer/x86/gridsample_bicubic_pack4.h @@ -554,8 +554,6 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, M static void gridsample_2d_bicubic_align1_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - float* outptr = static_cast(dst.data); - const __m128 vImgWf = _mm_set1_ps(src.w); const __m128 vImgHf = _mm_set1_ps(src.h); @@ -590,7 +588,6 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack4(const Mat& src, M __m128 gx1 = gx_floor; __m128 gx2 = _mm_add_ps(gx_floor, v1fp4); __m128 gx3 = _mm_add_ps(gx_floor, 
_mm_set1_ps(2.0f)); - const __m128 v0p5fp4 = _mm_set1_ps(0.5f); { // x0 gx0 = _mm_and_ps(gx0, *(__m128*)_ps_inv_sign_mask); @@ -623,8 +620,6 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack4(const Mat& src, M { //y - const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); __m128 reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps_inv_sign_mask); diff --git a/src/layer/x86/gridsample_bicubic_pack8.h b/src/layer/x86/gridsample_bicubic_pack8.h index 5623dd3ee96..1d33f0f527d 100644 --- a/src/layer/x86/gridsample_bicubic_pack8.h +++ b/src/layer/x86/gridsample_bicubic_pack8.h @@ -16,8 +16,6 @@ static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& d { const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -112,8 +110,6 @@ static void gridsample_2d_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& d { const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -205,8 +201,6 @@ static void gridsample_2d_bicubic_align0_border_blob_pack8(const Mat& src, Mat& { const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -289,8 +283,6 @@ static void gridsample_2d_bicubic_align1_border_blob_pack8(const Mat& src, Mat& { const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -374,8 +366,6 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M { const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -410,8 +400,6 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M const __m256 v0p5fp8 = _mm256_set1_ps(0.5f); { // x0 - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - gx0 = _mm256_add_ps(gx0, v0p5fp8); gx0 = _mm256_and_ps(gx0, *(__m256*)_ps256_inv_sign_mask); @@ -475,8 +463,6 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M { //y - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - gy = _mm256_add_ps(gy, v0p5fp8); gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); @@ -528,12 +514,8 @@ static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, M static void gridsample_2d_bicubic_align1_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { - float* outptr = static_cast(dst.data); - const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -566,11 +548,8 @@ static void 
gridsample_2d_bicubic_align1_reflection_blob_pack8(const Mat& src, M __m256 gx1 = gx_floor; __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - const __m256 v0p5fp8 = _mm256_set1_ps(0.5f); { // x0 - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - gx0 = _mm256_and_ps(gx0, *(__m256*)_ps256_inv_sign_mask); __m256 reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, border_x), *(__m256*)_ps256_inv_sign_mask); gx0 = _mm256_sub_ps(border_x, reflectx0_v); @@ -601,8 +580,6 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack8(const Mat& src, M { //y - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); diff --git a/src/layer/x86/gridsample_bilinear_pack4.h b/src/layer/x86/gridsample_bilinear_pack4.h index c24ab0aadf3..416711c12b2 100644 --- a/src/layer/x86/gridsample_bilinear_pack4.h +++ b/src/layer/x86/gridsample_bilinear_pack4.h @@ -19,8 +19,9 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& const __m128i vImgWi = _mm_set1_epi32(src.w); const __m128i vImgHi = _mm_set1_epi32(src.h); +#if ((_MSC_VER && __AVX__) || __SSE4_1__) const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#if !((_MSC_VER && __AVX__) || __SSE4_1__) +#else const __m128 vElempackf = _mm_set1_ps(src.elempack); #endif // !__SSE4_1__ @@ -123,8 +124,9 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& const __m128i vImgWi = _mm_set1_epi32(src.w); const __m128i vImgHi = _mm_set1_epi32(src.h); +#if ((_MSC_VER && __AVX__) || __SSE4_1__) const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#if !((_MSC_VER && __AVX__) || __SSE4_1__) +#else const __m128 vElempackf = _mm_set1_ps(src.elempack); #endif // !__SSE4_1__ @@ -227,8 +229,9 @@ static void gridsample_2d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& const __m128i vImgWi = _mm_set1_epi32(src.w); const __m128i vImgHi = _mm_set1_epi32(src.h); +#if ((_MSC_VER && __AVX__) || __SSE4_1__) const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#if !((_MSC_VER && __AVX__) || __SSE4_1__) +#else const __m128 vElempackf = _mm_set1_ps(src.elempack); #endif // !__SSE4_1__ @@ -334,8 +337,9 @@ static void gridsample_2d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& const __m128i vImgWi = _mm_set1_epi32(src.w); const __m128i vImgHi = _mm_set1_epi32(src.h); +#if ((_MSC_VER && __AVX__) || __SSE4_1__) const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#if !((_MSC_VER && __AVX__) || __SSE4_1__) +#else const __m128 vElempackf = _mm_set1_ps(src.elempack); #endif // !__SSE4_1__ @@ -441,8 +445,9 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack4(const Mat& src, const __m128i vImgWi = _mm_set1_epi32(src.w); const __m128i vImgHi = _mm_set1_epi32(src.h); +#if ((_MSC_VER && __AVX__) || __SSE4_1__) const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#if !((_MSC_VER && __AVX__) || __SSE4_1__) +#else const __m128 vElempackf = _mm_set1_ps(src.elempack); #endif // !__SSE4_1__ @@ -571,8 +576,9 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack4(const Mat& src, const __m128i vImgWi = _mm_set1_epi32(src.w); const __m128i vImgHi = _mm_set1_epi32(src.h); +#if ((_MSC_VER && __AVX__) || __SSE4_1__) const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#if !((_MSC_VER && __AVX__) || __SSE4_1__) +#else const __m128 
vElempackf = _mm_set1_ps(src.elempack); #endif // !__SSE4_1__ @@ -686,8 +692,9 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128i vImgDi = _mm_set1_epi32(src.d); +#if ((_MSC_VER && __AVX__) || __SSE4_1__) const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#if !((_MSC_VER && __AVX__) || __SSE4_1__) +#else const __m128 vElempackf = _mm_set1_ps(src.elempack); #endif // !__SSE4_1__ @@ -856,8 +863,9 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128i vImgDi = _mm_set1_epi32(src.d); +#if ((_MSC_VER && __AVX__) || __SSE4_1__) const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#if !((_MSC_VER && __AVX__) || __SSE4_1__) +#else const __m128 vElempackf = _mm_set1_ps(src.elempack); #endif // !__SSE4_1__ @@ -1026,8 +1034,9 @@ static void gridsample_3d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128i vImgDi = _mm_set1_epi32(src.d); +#if ((_MSC_VER && __AVX__) || __SSE4_1__) const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#if !((_MSC_VER && __AVX__) || __SSE4_1__) +#else const __m128 vElempackf = _mm_set1_ps(src.elempack); #endif // !__SSE4_1__ @@ -1192,8 +1201,9 @@ static void gridsample_3d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128i vImgDi = _mm_set1_epi32(src.d); +#if ((_MSC_VER && __AVX__) || __SSE4_1__) const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#if !((_MSC_VER && __AVX__) || __SSE4_1__) +#else const __m128 vElempackf = _mm_set1_ps(src.elempack); #endif // !__SSE4_1__ @@ -1358,8 +1368,9 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128i vImgDi = _mm_set1_epi32(src.d); +#if ((_MSC_VER && __AVX__) || __SSE4_1__) const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#if !((_MSC_VER && __AVX__) || __SSE4_1__) +#else const __m128 vElempackf = _mm_set1_ps(src.elempack); #endif // !__SSE4_1__ @@ -1554,9 +1565,9 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack4(const Mat& src, const __m128i vImgWi = _mm_set1_epi32(src.w); const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128i vImgDi = _mm_set1_epi32(src.d); - +#if ((_MSC_VER && __AVX__) || __SSE4_1__) const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#if !((_MSC_VER && __AVX__) || __SSE4_1__) +#else const __m128 vElempackf = _mm_set1_ps(src.elempack); #endif // !__SSE4_1__ diff --git a/src/layer/x86/gridsample_bilinear_pack8.h b/src/layer/x86/gridsample_bilinear_pack8.h index abd6b441a5f..944ccf5c639 100644 --- a/src/layer/x86/gridsample_bilinear_pack8.h +++ b/src/layer/x86/gridsample_bilinear_pack8.h @@ -16,13 +16,14 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& { const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); +#if __AVX2__ const __m256i vImgWi = _mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#if !__AVX2__ +#else const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#endif // !!__AVX2__ +#endif // __AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -134,13 +135,14 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& { const __m256 vImgWf 
= _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); +#if __AVX2__ const __m256i vImgWi = _mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#if !__AVX2__ +#else const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#endif // !!__AVX2__ +#endif // __AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < dst.h; y++) @@ -252,8 +254,6 @@ static void gridsample_2d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& { const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -340,8 +340,6 @@ static void gridsample_2d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& { const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -428,8 +426,6 @@ static void gridsample_2d_bilinear_align0_reflection_blob_pack8(const Mat& src, { const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -539,8 +535,6 @@ static void gridsample_2d_bilinear_align1_reflection_blob_pack8(const Mat& src, { const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -634,14 +628,15 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); const __m256 vImgDf = _mm256_set1_ps(src.d); +#if __AVX2__ const __m256i vImgWi = _mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vImgDi = _mm256_set1_epi32(src.d); const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#if !__AVX2__ +#else const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#endif // !!__AVX2__ +#endif // __AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -842,14 +837,15 @@ static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); const __m256 vImgDf = _mm256_set1_ps(src.d); +#if __AVX2__ const __m256i vImgWi = _mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256i vImgDi = _mm256_set1_epi32(src.d); const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#if !__AVX2__ +#else const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#endif // !!__AVX2__ +#endif // __AVX2__ #pragma omp parallel for num_threads(opt.num_threads) for (int z = 0; z < dst.d; z++) @@ -1050,9 +1046,6 @@ static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); const __m256 vImgDf = _mm256_set1_ps(src.d); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vImgDi = 
_mm256_set1_epi32(src.d); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -1199,9 +1192,6 @@ static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); const __m256 vImgDf = _mm256_set1_ps(src.d); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vImgDi = _mm256_set1_epi32(src.d); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -1348,9 +1338,6 @@ static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); const __m256 vImgDf = _mm256_set1_ps(src.d); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vImgDi = _mm256_set1_epi32(src.d); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -1528,9 +1515,6 @@ static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); const __m256 vImgDf = _mm256_set1_ps(src.d); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vImgDi = _mm256_set1_epi32(src.d); const __m256 vElempackf = _mm256_set1_ps(src.elempack); diff --git a/src/layer/x86/gridsample_nearest_pack16.h b/src/layer/x86/gridsample_nearest_pack16.h index 8c8ea888263..8969fac63e2 100644 --- a/src/layer/x86/gridsample_nearest_pack16.h +++ b/src/layer/x86/gridsample_nearest_pack16.h @@ -119,7 +119,6 @@ static void gridsample_2d_nearest_align0_border_blob_pack16(const Mat& src, Mat& const __m512 vImgWf = _mm512_set1_ps(src.w); const __m512 vImgHf = _mm512_set1_ps(src.h); const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); const __m512i vElempacki = _mm512_set1_epi32(src.elempack); @@ -176,7 +175,6 @@ static void gridsample_2d_nearest_align1_border_blob_pack16(const Mat& src, Mat& const __m512 vImgWf = _mm512_set1_ps(src.w); const __m512 vImgHf = _mm512_set1_ps(src.h); const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); const __m512i vElempacki = _mm512_set1_epi32(src.elempack); @@ -233,7 +231,6 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack16(const Mat& src, const __m512 vImgWf = _mm512_set1_ps(src.w); const __m512 vImgHf = _mm512_set1_ps(src.h); const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); const __m512i vElempacki = _mm512_set1_epi32(src.elempack); @@ -311,7 +308,6 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack16(const Mat& src, const __m512 vImgWf = _mm512_set1_ps(src.w); const __m512 vImgHf = _mm512_set1_ps(src.h); const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); const __m512i vElempacki = _mm512_set1_epi32(src.elempack); @@ -498,7 +494,6 @@ static void gridsample_3d_nearest_align0_border_blob_pack16(const Mat& src, Mat& const __m512 vImgDf = _mm512_set1_ps(src.d); const __m512i vImgWi = _mm512_set1_epi32(src.w); const __m512i vImgHi = _mm512_set1_epi32(src.h); - const __m512i vImgDi = _mm512_set1_epi32(src.d); const __m512i vElempacki = _mm512_set1_epi32(src.elempack); @@ -569,7 +564,6 @@ static void gridsample_3d_nearest_align1_border_blob_pack16(const Mat& src, Mat& const __m512 vImgDf = 
_mm512_set1_ps(src.d); const __m512i vImgWi = _mm512_set1_epi32(src.w); const __m512i vImgHi = _mm512_set1_epi32(src.h); - const __m512i vImgDi = _mm512_set1_epi32(src.d); const __m512i vElempacki = _mm512_set1_epi32(src.elempack); @@ -640,7 +634,6 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack16(const Mat& src, const __m512 vImgDf = _mm512_set1_ps(src.d); const __m512i vImgWi = _mm512_set1_epi32(src.w); const __m512i vImgHi = _mm512_set1_epi32(src.h); - const __m512i vImgDi = _mm512_set1_epi32(src.d); const __m512i vElempacki = _mm512_set1_epi32(src.elempack); @@ -742,7 +735,6 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack16(const Mat& src, const __m512 vImgDf = _mm512_set1_ps(src.d); const __m512i vImgWi = _mm512_set1_epi32(src.w); const __m512i vImgHi = _mm512_set1_epi32(src.h); - const __m512i vImgDi = _mm512_set1_epi32(src.d); const __m512i vElempacki = _mm512_set1_epi32(src.elempack); diff --git a/src/layer/x86/gridsample_nearest_pack4.h b/src/layer/x86/gridsample_nearest_pack4.h index 4b9561d368c..6d44dd9b822 100644 --- a/src/layer/x86/gridsample_nearest_pack4.h +++ b/src/layer/x86/gridsample_nearest_pack4.h @@ -364,7 +364,6 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& d const __m128i vImgHi = _mm_set1_epi32(src.h); const __m128i vImgDi = _mm_set1_epi32(src.d); - const __m128i vElempacki = _mm_set1_epi32(src.elempack); const __m128 vElempackf = _mm_set1_ps(src.elempack); #pragma omp parallel for num_threads(opt.num_threads) diff --git a/src/layer/x86/gridsample_nearest_pack8.h b/src/layer/x86/gridsample_nearest_pack8.h index e7fb13fa1e9..a8baf3bc1dc 100644 --- a/src/layer/x86/gridsample_nearest_pack8.h +++ b/src/layer/x86/gridsample_nearest_pack8.h @@ -16,8 +16,6 @@ static void gridsample_2d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d { const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -67,8 +65,6 @@ static void gridsample_2d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d { const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -118,8 +114,6 @@ static void gridsample_2d_nearest_align0_border_blob_pack8(const Mat& src, Mat& { const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -174,8 +168,6 @@ static void gridsample_2d_nearest_align1_border_blob_pack8(const Mat& src, Mat& { const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -230,8 +222,6 @@ static void gridsample_2d_nearest_align0_reflection_blob_pack8(const Mat& src, M { const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -307,8 +297,6 @@ static void 
gridsample_2d_nearest_align1_reflection_blob_pack8(const Mat& src, M { const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -348,9 +336,6 @@ static void gridsample_2d_nearest_align1_reflection_blob_pack8(const Mat& src, M gy = _mm256_sub_ps(border_y, reflecty_v); } - __m256i ix = _mm256_cvtps_epi32(gx); - __m256i iy = _mm256_cvtps_epi32(gy); - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf), _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); @@ -371,9 +356,6 @@ static void gridsample_3d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& d const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); const __m256 vImgDf = _mm256_set1_ps(src.d); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vImgDi = _mm256_set1_epi32(src.d); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -435,9 +417,6 @@ static void gridsample_3d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& d const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); const __m256 vImgDf = _mm256_set1_ps(src.d); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vImgDi = _mm256_set1_epi32(src.d); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -499,9 +478,6 @@ static void gridsample_3d_nearest_align0_border_blob_pack8(const Mat& src, Mat& const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); const __m256 vImgDf = _mm256_set1_ps(src.d); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vImgDi = _mm256_set1_epi32(src.d); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -571,9 +547,6 @@ static void gridsample_3d_nearest_align1_border_blob_pack8(const Mat& src, Mat& const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); const __m256 vImgDf = _mm256_set1_ps(src.d); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vImgDi = _mm256_set1_epi32(src.d); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -643,9 +616,6 @@ static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, M const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); const __m256 vImgDf = _mm256_set1_ps(src.d); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vImgDi = _mm256_set1_epi32(src.d); const __m256 vElempackf = _mm256_set1_ps(src.elempack); @@ -746,9 +716,6 @@ static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, M const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); const __m256 vImgDf = _mm256_set1_ps(src.d); - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vImgDi = _mm256_set1_epi32(src.d); const __m256 vElempackf = _mm256_set1_ps(src.elempack); diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 35cf4e61d4c..6651cf6f12b 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ 
b/src/layer/x86/gridsample_x86.cpp @@ -50,6 +50,7 @@ _PI32_CONST512(n1, -1); #endif // __AVX512F__ _PS256_CONST(n1, -1.0f); +_PS256_CONST(2, 2.0f); _PI32_CONST256(n1, -1); static NCNN_FORCEINLINE __m256 mask_gather_ps256(const float* ptr, __m256i offset, __m256 mask) @@ -554,10 +555,6 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Thu, 8 Dec 2022 08:23:56 +0000 Subject: [PATCH 050/127] apply code-format changes --- src/layer/x86/gridsample_x86.cpp | 34 +------------------------------- 1 file changed, 1 insertion(+), 33 deletions(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 6651cf6f12b..851288f4802 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -942,8 +942,6 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Fri, 9 Dec 2022 17:41:27 +0800 Subject: [PATCH 051/127] fix cmake -DNCNN_SSE2=OFF bug --- src/layer/x86/gridsample_x86.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 6651cf6f12b..617b44319d9 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -5524,6 +5524,8 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector Date: Mon, 19 Dec 2022 16:44:17 +0800 Subject: [PATCH 052/127] separate pack1 to file --- src/layer/x86/gridsample_bicubic_pack1.h | 1093 +++++ src/layer/x86/gridsample_bilinear_pack1.h | 2514 ++++++++++++ src/layer/x86/gridsample_nearest_pack1.h | 1167 ++++++ src/layer/x86/gridsample_x86.cpp | 4485 +-------------------- 4 files changed, 4829 insertions(+), 4430 
deletions(-) create mode 100644 src/layer/x86/gridsample_bicubic_pack1.h create mode 100644 src/layer/x86/gridsample_bilinear_pack1.h create mode 100644 src/layer/x86/gridsample_nearest_pack1.h diff --git a/src/layer/x86/gridsample_bicubic_pack1.h b/src/layer/x86/gridsample_bicubic_pack1.h new file mode 100644 index 00000000000..ca40d150019 --- /dev/null +++ b/src/layer/x86/gridsample_bicubic_pack1.h @@ -0,0 +1,1093 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void gridsample_2d_bicubic_align0_zeros_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + } + + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); + + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(gx0, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx0, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(gx1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx1, _CMP_GT_OS)); + __m256 x2_in_range = _mm256_and_ps(_mm256_cmp_ps(gx2, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx2, _CMP_GT_OS)); + __m256 x3_in_range = _mm256_and_ps(_mm256_cmp_ps(gx3, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx3, _CMP_GT_OS)); + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + __m256 v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + + __m256 y_in_range = 
_mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS)); + + v0_in_range[i] = _mm256_and_ps(x0_in_range, y_in_range); + v1_in_range[i] = _mm256_and_ps(x1_in_range, y_in_range); + v2_in_range[i] = _mm256_and_ps(x2_in_range, y_in_range); + v3_in_range[i] = _mm256_and_ps(x3_in_range, y_in_range); + + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < src.c; q++) + { + for (int i = 0; i < 4; i++) + { + __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], v0_in_range[i]); + __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], v1_in_range[i]); + __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], v2_in_range[i]); + __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], v3_in_range[i]); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = ((sample_x + 1) * src.w - 1) / 2.f; + sample_y = ((sample_y + 1) * src.h - 1) / 2.f; + + int x1 = floor(sample_x); + int y1 = floor(sample_y); + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool y1_in_range = (y1 > -1) & (y1 < src.h); + bool x0_in_range = (x0 > -1) & (x0 < src.w); + bool y0_in_range = (y0 > -1) & (y0 < src.h); + bool x2_in_range = (x2 > -1) & (x2 < src.w); + bool y2_in_range = (y2 > -1) & (y2 < src.h); + bool x3_in_range = (x3 > -1) & (x3 < src.w); + bool y3_in_range = (y3 > -1) & (y3 < src.h); + + bool v00_in_range = x0_in_range & y0_in_range; + bool v01_in_range = x1_in_range & y0_in_range; + bool v02_in_range = x2_in_range & y0_in_range; + bool v03_in_range = x3_in_range & y0_in_range; + bool v10_in_range = x0_in_range & y1_in_range; + bool v11_in_range = x1_in_range & y1_in_range; + bool v12_in_range = x2_in_range & y1_in_range; + bool v13_in_range = x3_in_range & y1_in_range; + bool v20_in_range = x0_in_range & y2_in_range; + bool v21_in_range = x1_in_range & y2_in_range; + bool v22_in_range = x2_in_range & y2_in_range; + bool v23_in_range = x3_in_range & y2_in_range; + bool v30_in_range = x0_in_range & y3_in_range; + bool v31_in_range = x1_in_range & y3_in_range; + bool v32_in_range = x2_in_range & y3_in_range; + bool v33_in_range = x3_in_range & y3_in_range; + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + + float v00 = v00_in_range ? image.row(y0)[x0] : 0; + float v01 = v01_in_range ? image.row(y0)[x1] : 0; + float v02 = v02_in_range ? image.row(y0)[x2] : 0; + float v03 = v03_in_range ? image.row(y0)[x3] : 0; + float v10 = v10_in_range ? image.row(y1)[x0] : 0; + float v11 = v11_in_range ? image.row(y1)[x1] : 0; + float v12 = v12_in_range ? 
image.row(y1)[x2] : 0; + float v13 = v13_in_range ? image.row(y1)[x3] : 0; + float v20 = v20_in_range ? image.row(y2)[x0] : 0; + float v21 = v21_in_range ? image.row(y2)[x1] : 0; + float v22 = v22_in_range ? image.row(y2)[x2] : 0; + float v23 = v23_in_range ? image.row(y2)[x3] : 0; + float v30 = v30_in_range ? image.row(y3)[x0] : 0; + float v31 = v31_in_range ? image.row(y3)[x1] : 0; + float v32 = v32_in_range ? image.row(y3)[x2] : 0; + float v33 = v33_in_range ? image.row(y3)[x3] : 0; + + float x_coeffs[4]; + float y_coeffs[4]; + interpolate_cubic(sample_x - x1, x_coeffs); + interpolate_cubic(sample_y - y1, y_coeffs); + + float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; + float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; + float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; + float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; + + dst.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + } + } + } +} + +static void gridsample_2d_bicubic_align1_zeros_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); + } + + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); + + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(gx0, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx0, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(gx1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx1, _CMP_GT_OS)); + __m256 x2_in_range = _mm256_and_ps(_mm256_cmp_ps(gx2, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx2, _CMP_GT_OS)); + __m256 x3_in_range = _mm256_and_ps(_mm256_cmp_ps(gx3, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx3, _CMP_GT_OS)); + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + __m256 v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + + __m256 y_in_range = 
_mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS)); + + v0_in_range[i] = _mm256_and_ps(x0_in_range, y_in_range); + v1_in_range[i] = _mm256_and_ps(x1_in_range, y_in_range); + v2_in_range[i] = _mm256_and_ps(x2_in_range, y_in_range); + v3_in_range[i] = _mm256_and_ps(x3_in_range, y_in_range); + + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < src.c; q++) + { + for (int i = 0; i < 4; i++) + { + __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], v0_in_range[i]); + __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], v1_in_range[i]); + __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], v2_in_range[i]); + __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], v3_in_range[i]); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = (sample_x + 1) / 2.f * (src.w - 1); + sample_y = (sample_y + 1) / 2.f * (src.h - 1); + + int x1 = floor(sample_x); + int y1 = floor(sample_y); + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool y1_in_range = (y1 > -1) & (y1 < src.h); + bool x0_in_range = (x0 > -1) & (x0 < src.w); + bool y0_in_range = (y0 > -1) & (y0 < src.h); + bool x2_in_range = (x2 > -1) & (x2 < src.w); + bool y2_in_range = (y2 > -1) & (y2 < src.h); + bool x3_in_range = (x3 > -1) & (x3 < src.w); + bool y3_in_range = (y3 > -1) & (y3 < src.h); + + bool v00_in_range = x0_in_range & y0_in_range; + bool v01_in_range = x1_in_range & y0_in_range; + bool v02_in_range = x2_in_range & y0_in_range; + bool v03_in_range = x3_in_range & y0_in_range; + bool v10_in_range = x0_in_range & y1_in_range; + bool v11_in_range = x1_in_range & y1_in_range; + bool v12_in_range = x2_in_range & y1_in_range; + bool v13_in_range = x3_in_range & y1_in_range; + bool v20_in_range = x0_in_range & y2_in_range; + bool v21_in_range = x1_in_range & y2_in_range; + bool v22_in_range = x2_in_range & y2_in_range; + bool v23_in_range = x3_in_range & y2_in_range; + bool v30_in_range = x0_in_range & y3_in_range; + bool v31_in_range = x1_in_range & y3_in_range; + bool v32_in_range = x2_in_range & y3_in_range; + bool v33_in_range = x3_in_range & y3_in_range; + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + + float v00 = v00_in_range ? image.row(y0)[x0] : 0; + float v01 = v01_in_range ? image.row(y0)[x1] : 0; + float v02 = v02_in_range ? image.row(y0)[x2] : 0; + float v03 = v03_in_range ? image.row(y0)[x3] : 0; + float v10 = v10_in_range ? image.row(y1)[x0] : 0; + float v11 = v11_in_range ? image.row(y1)[x1] : 0; + float v12 = v12_in_range ? 
image.row(y1)[x2] : 0; + float v13 = v13_in_range ? image.row(y1)[x3] : 0; + float v20 = v20_in_range ? image.row(y2)[x0] : 0; + float v21 = v21_in_range ? image.row(y2)[x1] : 0; + float v22 = v22_in_range ? image.row(y2)[x2] : 0; + float v23 = v23_in_range ? image.row(y2)[x3] : 0; + float v30 = v30_in_range ? image.row(y3)[x0] : 0; + float v31 = v31_in_range ? image.row(y3)[x1] : 0; + float v32 = v32_in_range ? image.row(y3)[x2] : 0; + float v33 = v33_in_range ? image.row(y3)[x3] : 0; + + float x_coeffs[4]; + float y_coeffs[4]; + interpolate_cubic(sample_x - x1, x_coeffs); + interpolate_cubic(sample_y - y1, y_coeffs); + + float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; + float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; + float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; + float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; + + dst.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + } + } + } +} + +static void gridsample_2d_bicubic_align0_border_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); + + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + + gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); + gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); + gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); + gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), 
gx2); + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < src.c; q++) + { + for (int i = 0; i < 4; i++) + { + __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], *(__m256*)_ps256_n1); + __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], *(__m256*)_ps256_n1); + __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], *(__m256*)_ps256_n1); + __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], *(__m256*)_ps256_n1); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = ((sample_x + 1) * src.w - 1) / 2.f; + sample_y = ((sample_y + 1) * src.h - 1) / 2.f; + + int x_floor = floor(sample_x); + int y_floor = floor(sample_y); + + int x1 = x_floor; + int y1 = y_floor; + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + x1 = std::min(src.w - 1, std::max(x1, 0)); + y1 = std::min(src.h - 1, std::max(y1, 0)); + x0 = std::min(src.w - 1, std::max(x0, 0)); + y0 = std::min(src.h - 1, std::max(y0, 0)); + x2 = std::min(src.w - 1, std::max(x2, 0)); + y2 = std::min(src.h - 1, std::max(y2, 0)); + x3 = std::min(src.w - 1, std::max(x3, 0)); + y3 = std::min(src.h - 1, std::max(y3, 0)); + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + + float v00 = image.row(y0)[x0]; + float v01 = image.row(y0)[x1]; + float v02 = image.row(y0)[x2]; + float v03 = image.row(y0)[x3]; + float v10 = image.row(y1)[x0]; + float v11 = image.row(y1)[x1]; + float v12 = image.row(y1)[x2]; + float v13 = image.row(y1)[x3]; + float v20 = image.row(y2)[x0]; + float v21 = image.row(y2)[x1]; + float v22 = image.row(y2)[x2]; + float v23 = image.row(y2)[x3]; + float v30 = image.row(y3)[x0]; + float v31 = image.row(y3)[x1]; + float v32 = image.row(y3)[x2]; + float v33 = image.row(y3)[x3]; + + float x_coeffs[4]; + float y_coeffs[4]; + interpolate_cubic(sample_x - x_floor, x_coeffs); + interpolate_cubic(sample_y - y_floor, y_coeffs); + + float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; + float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; + float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; + float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; + + dst.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + } + } + } +} + +static void gridsample_2d_bicubic_align1_border_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 
15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); + + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); + + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + + gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); + gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); + gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); + gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < src.c; q++) + { + for (int i = 0; i < 4; i++) + { + __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], *(__m256*)_ps256_n1); + __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], *(__m256*)_ps256_n1); + __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], *(__m256*)_ps256_n1); + __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], *(__m256*)_ps256_n1); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = (sample_x + 1) / 2.f * (src.w - 1); + sample_y = (sample_y + 1) / 2.f * (src.h - 1); + + int x_floor = floor(sample_x); + int y_floor = floor(sample_y); + + int x1 = x_floor; + int y1 = y_floor; + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + x1 = std::min(src.w - 1, std::max(x1, 0)); + y1 = std::min(src.h - 1, std::max(y1, 0)); + x0 = std::min(src.w - 1, std::max(x0, 
0)); + y0 = std::min(src.h - 1, std::max(y0, 0)); + x2 = std::min(src.w - 1, std::max(x2, 0)); + y2 = std::min(src.h - 1, std::max(y2, 0)); + x3 = std::min(src.w - 1, std::max(x3, 0)); + y3 = std::min(src.h - 1, std::max(y3, 0)); + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + + float v00 = image.row(y0)[x0]; + float v01 = image.row(y0)[x1]; + float v02 = image.row(y0)[x2]; + float v03 = image.row(y0)[x3]; + float v10 = image.row(y1)[x0]; + float v11 = image.row(y1)[x1]; + float v12 = image.row(y1)[x2]; + float v13 = image.row(y1)[x3]; + float v20 = image.row(y2)[x0]; + float v21 = image.row(y2)[x1]; + float v22 = image.row(y2)[x2]; + float v23 = image.row(y2)[x3]; + float v30 = image.row(y3)[x0]; + float v31 = image.row(y3)[x1]; + float v32 = image.row(y3)[x2]; + float v33 = image.row(y3)[x3]; + + float x_coeffs[4]; + float y_coeffs[4]; + interpolate_cubic(sample_x - x_floor, x_coeffs); + interpolate_cubic(sample_y - y_floor, y_coeffs); + + float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; + float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; + float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; + float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; + + dst.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + } + } + } +} + +static void gridsample_2d_bicubic_align0_reflection_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); + + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + const __m256 v0p5fp8 = _mm256_set1_ps(0.5f); + { + // x0 + gx0 = _mm256_add_ps(gx0, v0p5fp8); + + gx0 = _mm256_and_ps(gx0, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx0 = _mm256_sub_ps(vImgWf, reflectx0_v); + + gx0 = _mm256_sub_ps(gx0, v0p5fp8); + + _mm256_sub_ps(gx0, v0p5fp8); + + 
gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); + + // x1 + gx1 = _mm256_add_ps(gx1, v0p5fp8); + + gx1 = _mm256_and_ps(gx1, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx1_v = _mm256_and_ps(_mm256_sub_ps(gx1, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx1 = _mm256_sub_ps(vImgWf, reflectx1_v); + + gx1 = _mm256_sub_ps(gx1, v0p5fp8); + + _mm256_sub_ps(gx1, v0p5fp8); + + gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); + + // x2 + gx2 = _mm256_add_ps(gx2, v0p5fp8); + + gx2 = _mm256_and_ps(gx2, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx2_v = _mm256_and_ps(_mm256_sub_ps(gx2, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx2 = _mm256_sub_ps(vImgWf, reflectx2_v); + + gx2 = _mm256_sub_ps(gx2, v0p5fp8); + + _mm256_sub_ps(gx2, v0p5fp8); + + gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); + + // x3 + gx3 = _mm256_add_ps(gx3, v0p5fp8); + + gx3 = _mm256_and_ps(gx3, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx3_v = _mm256_and_ps(_mm256_sub_ps(gx3, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx3 = _mm256_sub_ps(vImgWf, reflectx3_v); + + gx3 = _mm256_sub_ps(gx3, v0p5fp8); + + _mm256_sub_ps(gx3, v0p5fp8); + + gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); + } + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + + { + //y + gy = _mm256_add_ps(gy, v0p5fp8); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(vImgHf, reflecty_v); + + gy = _mm256_sub_ps(gy, v0p5fp8); + + _mm256_sub_ps(gy, v0p5fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } + + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < src.c; q++) + { + for (int i = 0; i < 4; i++) + { + __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], *(__m256*)_ps256_n1); + __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], *(__m256*)_ps256_n1); + __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], *(__m256*)_ps256_n1); + __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], *(__m256*)_ps256_n1); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = ((sample_x + 1) * src.w - 1) / 2.f; + sample_y = ((sample_y + 1) * src.h - 1) / 2.f; + + int x_floor = floor(sample_x); + int y_floor = floor(sample_y); + + int x1 = x_floor; + int y1 = y_floor; + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + x0 = static_cast(reflect_coord(x0 + 0.5, src.w) - 0.5); 
+ + y0 = static_cast(reflect_coord(y0 + 0.5, src.h) - 0.5); + + x0 = std::min(src.w - 1, std::max(x0, 0)); + y0 = std::min(src.h - 1, std::max(y0, 0)); + + x1 = static_cast(reflect_coord(x1 + 0.5, src.w) - 0.5); + + y1 = static_cast(reflect_coord(y1 + 0.5, src.h) - 0.5); + + x1 = std::min(src.w - 1, std::max(x1, 0)); + y1 = std::min(src.h - 1, std::max(y1, 0)); + + x2 = static_cast(reflect_coord(x2 + 0.5, src.w) - 0.5); + + y2 = static_cast(reflect_coord(y2 + 0.5, src.h) - 0.5); + + x2 = std::min(src.w - 1, std::max(x2, 0)); + y2 = std::min(src.h - 1, std::max(y2, 0)); + + x3 = static_cast(reflect_coord(x3 + 0.5, src.w) - 0.5); + + y3 = static_cast(reflect_coord(y3 + 0.5, src.h) - 0.5); + + x3 = std::min(src.w - 1, std::max(x3, 0)); + y3 = std::min(src.h - 1, std::max(y3, 0)); + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + + float v00 = image.row(y0)[x0]; + float v01 = image.row(y0)[x1]; + float v02 = image.row(y0)[x2]; + float v03 = image.row(y0)[x3]; + float v10 = image.row(y1)[x0]; + float v11 = image.row(y1)[x1]; + float v12 = image.row(y1)[x2]; + float v13 = image.row(y1)[x3]; + float v20 = image.row(y2)[x0]; + float v21 = image.row(y2)[x1]; + float v22 = image.row(y2)[x2]; + float v23 = image.row(y2)[x3]; + float v30 = image.row(y3)[x0]; + float v31 = image.row(y3)[x1]; + float v32 = image.row(y3)[x2]; + float v33 = image.row(y3)[x3]; + + float x_coeffs[4]; + float y_coeffs[4]; + interpolate_cubic(sample_x - x_floor, x_coeffs); + interpolate_cubic(sample_y - y_floor, y_coeffs); + + float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; + float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; + float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; + float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; + + dst.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + } + } + } +} + +static void gridsample_2d_bicubic_align1_reflection_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#endif // __AVX__ + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); + + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); + + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + 
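+            // bicubic reads four x taps at gx_floor-1 .. gx_floor+2; with align_corners=1 each tap is reflected around 0 and W-1 directly (no half-pixel shift)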
__m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + { + // x0 + gx0 = _mm256_and_ps(gx0, *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, border_x), *(__m256*)_ps256_inv_sign_mask); + gx0 = _mm256_sub_ps(border_x, reflectx0_v); + + // x1 + gx1 = _mm256_and_ps(gx1, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx1_v = _mm256_and_ps(_mm256_sub_ps(gx1, border_x), *(__m256*)_ps256_inv_sign_mask); + gx1 = _mm256_sub_ps(border_x, reflectx1_v); + + // x2 + gx2 = _mm256_and_ps(gx2, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx2_v = _mm256_and_ps(_mm256_sub_ps(gx2, border_x), *(__m256*)_ps256_inv_sign_mask); + gx2 = _mm256_sub_ps(border_x, reflectx2_v); + + // x3 + gx3 = _mm256_and_ps(gx3, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx3_v = _mm256_and_ps(_mm256_sub_ps(gx3, border_x), *(__m256*)_ps256_inv_sign_mask); + gx3 = _mm256_sub_ps(border_x, reflectx3_v); + } + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + + { + //y + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(border_y, reflecty_v); + } + + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + for (int q = 0; q < src.c; q++) + { + for (int i = 0; i < 4; i++) + { + __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], *(__m256*)_ps256_n1); + __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], *(__m256*)_ps256_n1); + __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], *(__m256*)_ps256_n1); + __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], *(__m256*)_ps256_n1); + + coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); + } + + __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + + _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = (sample_x + 1) / 2.f * (src.w - 1); + sample_y = (sample_y + 1) / 2.f * (src.h - 1); + + int x_floor = floor(sample_x); + int y_floor = floor(sample_y); + + int x1 = x_floor; + int y1 = y_floor; + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + x0 = static_cast(reflect_coord(x0, src.w - 1)); + y0 = static_cast(reflect_coord(y0, src.h - 1)); + x1 = static_cast(reflect_coord(x1, src.w - 1)); + y1 = static_cast(reflect_coord(y1, src.h - 1)); + x2 = static_cast(reflect_coord(x2, src.w - 1)); + y2 = static_cast(reflect_coord(y2, src.h - 1)); + x3 = static_cast(reflect_coord(x3, src.w - 1)); + y3 = static_cast(reflect_coord(y3, src.h - 1)); + + for (int q = 0; q < src.c; 
q++) + { + const Mat& image = src.channel(q); + + float v00 = image.row(y0)[x0]; + float v01 = image.row(y0)[x1]; + float v02 = image.row(y0)[x2]; + float v03 = image.row(y0)[x3]; + float v10 = image.row(y1)[x0]; + float v11 = image.row(y1)[x1]; + float v12 = image.row(y1)[x2]; + float v13 = image.row(y1)[x3]; + float v20 = image.row(y2)[x0]; + float v21 = image.row(y2)[x1]; + float v22 = image.row(y2)[x2]; + float v23 = image.row(y2)[x3]; + float v30 = image.row(y3)[x0]; + float v31 = image.row(y3)[x1]; + float v32 = image.row(y3)[x2]; + float v33 = image.row(y3)[x3]; + + float x_coeffs[4]; + float y_coeffs[4]; + interpolate_cubic(sample_x - x_floor, x_coeffs); + interpolate_cubic(sample_y - y_floor, y_coeffs); + + float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; + float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; + float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; + float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; + + dst.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + } + } + } +} \ No newline at end of file diff --git a/src/layer/x86/gridsample_bilinear_pack1.h b/src/layer/x86/gridsample_bilinear_pack1.h new file mode 100644 index 00000000000..23e63216f29 --- /dev/null +++ b/src/layer/x86/gridsample_bilinear_pack1.h @@ -0,0 +1,2514 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
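+
+// The samplers below un-normalize grid coordinates from [-1, 1] to pixel space:
+//   align_corners == 0:  x = ((gx + 1) * W - 1) / 2
+//   align_corners == 1:  x = (gx + 1) / 2 * (W - 1)
+// and then apply the padding rule named in the function (zeros / border / reflection).
+// The AVX path consumes 16 grid floats (8 x/y pairs) per iteration; the scalar tail handles the remainder.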
+ +static void gridsample_2d_bilinear_align0_zeros_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#if __AVX2__ + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); +#endif // __AVX2__ +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); + + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); + +#if __AVX2__ + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); + + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + + __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, *(__m256i*)_pi32_256_1); + __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); + __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, *(__m256i*)_pi32_256_1); +#else + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); + __m256 y1_in_range = 
_mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + + __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); + __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); + __m256 ne_offset = _mm256_add_ps(nw_offset, *(__m256*)_ps256_1); + __m256 sw_offset = _mm256_add_ps(nw_offset, vImgWf); + __m256 se_offset = _mm256_add_ps(sw_offset, *(__m256*)_ps256_1); + + __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); + __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); + __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); + __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); +#endif // __AVX2__ + + for (int q = 0; q < src.c; q++) + { +#if __AVX2__ + __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, _mm256_castsi256_ps(v00_in_range)); + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, _mm256_castsi256_ps(v10_in_range)); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, _mm256_castsi256_ps(v01_in_range)); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); +#else + __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, v00_in_range); + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, v10_in_range); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, v01_in_range); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); +#endif // __AVX2__ + + __m256 _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = ((sample_x + 1) * src.w - 1) / 2.f; + sample_y = ((sample_y + 1) * src.h - 1) / 2.f; + + // bilinear interpolate + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int x1 = x0 + 1; + int y1 = y0 + 1; + + bool v00_in_range = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h); + bool v01_in_range = (x1 > -1) & (x1 < src.w) & (y0 > -1) & (y0 < src.h); + bool v10_in_range = (x0 > -1) & (x0 < src.w) & (y1 > -1) & (y1 < src.h); + bool v11_in_range = (x1 > -1) & (x1 < src.w) & (y1 > -1) & (y1 < src.h); + + float alpha = sample_x - x0; + float beta = sample_y - y0; + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + float v00 = v00_in_range ? image.row(y0)[x0] : 0; + float v01 = v01_in_range ? image.row(y0)[x1] : 0; + float v10 = v10_in_range ? image.row(y1)[x0] : 0; + float v11 = v11_in_range ? 
image.row(y1)[x1] : 0; + + float v0 = v00 * (1 - alpha) + v01 * alpha; + float v1 = v10 * (1 - alpha) + v11 * alpha; + + dst.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; + } + } + } +} + + +static void gridsample_2d_bilinear_align1_zeros_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#if __AVX2__ + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); +#endif // __AVX2__ +#endif // __AVX__ + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); + + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); + +#if __AVX2__ + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); + + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + + __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, *(__m256i*)_pi32_256_1); + __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); + __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, *(__m256i*)_pi32_256_1); +#else + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), 
_mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + + __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); + __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); + __m256 ne_offset = _mm256_add_ps(nw_offset, *(__m256*)_ps256_1); + __m256 sw_offset = _mm256_add_ps(nw_offset, vImgWf); + __m256 se_offset = _mm256_add_ps(sw_offset, *(__m256*)_ps256_1); + + __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); + __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); + __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); + __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); +#endif // __AVX2__ + + for (int q = 0; q < src.c; q++) + { +#if __AVX2__ + __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, _mm256_castsi256_ps(v00_in_range)); + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, _mm256_castsi256_ps(v10_in_range)); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, _mm256_castsi256_ps(v01_in_range)); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); +#else + __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, v00_in_range); + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, v10_in_range); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, v01_in_range); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); +#endif // __AVX2__ + + __m256 _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = (sample_x + 1) / 2.f * (src.w - 1); + sample_y = (sample_y + 1) / 2.f * (src.h - 1); + + // bilinear interpolate + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int x1 = x0 + 1; + int y1 = y0 + 1; + + bool v00_in_range = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h); + bool v01_in_range = (x1 > -1) & (x1 < src.w) & (y0 > -1) & (y0 < src.h); + bool v10_in_range = (x0 > -1) & (x0 < src.w) & (y1 > -1) & (y1 < src.h); + bool v11_in_range = (x1 > -1) & (x1 < src.w) & (y1 > -1) & (y1 < src.h); + + float alpha = sample_x - x0; + float beta = sample_y - y0; + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + float v00 = v00_in_range ? image.row(y0)[x0] : 0; + float v01 = v01_in_range ? image.row(y0)[x1] : 0; + float v10 = v10_in_range ? image.row(y1)[x0] : 0; + float v11 = v11_in_range ? 
image.row(y1)[x1] : 0; + + float v0 = v00 * (1 - alpha) + v01 * alpha; + float v1 = v10 * (1 - alpha) + v11 * alpha; + + dst.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; + } + } + } +} + +static void gridsample_2d_bilinear_align0_border_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#if __AVX2__ + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); +#endif // __AVX2__ +#endif // __AVX__ + +#pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); + + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); + +#if __AVX2__ + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); + + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, *(__m256i*)_pi32_256_1); + __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); + __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, *(__m256i*)_pi32_256_1); +#else + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); + __m256 
ne_offset = _mm256_add_ps(nw_offset, *(__m256*)_ps256_1); + __m256 sw_offset = _mm256_add_ps(nw_offset, vImgWf); + __m256 se_offset = _mm256_add_ps(sw_offset, *(__m256*)_ps256_1); + + __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); + __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); + __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); + __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); +#endif + + for (int q = 0; q < src.c; q++) + { + __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *(__m256*)_ps256_n1); +#if __AVX2__ + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, _mm256_castsi256_ps(x1_in_range)); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, _mm256_castsi256_ps(y1_in_range)); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); +#else + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, x1_in_range); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, y1_in_range); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); +#endif + + __m256 _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = ((sample_x + 1) * src.w - 1) / 2.f; + sample_y = ((sample_y + 1) * src.h - 1) / 2.f; + + sample_x = std::min(src.w - 1.0f, std::max(sample_x, 0.0f)); + sample_y = std::min(src.h - 1.0f, std::max(sample_y, 0.0f)); + + // bilinear interpolate + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int x1 = x0 + 1; + int y1 = y0 + 1; + + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool y1_in_range = (y1 > -1) & (y1 < src.h); + bool v11_in_range = x1_in_range & y1_in_range; + + float alpha = sample_x - x0; + float beta = sample_y - y0; + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + float v00 = image.row(y0)[x0]; + float v01 = x1_in_range ? image.row(y0)[x1] : 0; + float v10 = y1_in_range ? image.row(y1)[x0] : 0; + float v11 = v11_in_range ? 
image.row(y1)[x1] : 0; + + float v0 = v00 * (1 - alpha) + v01 * alpha; + float v1 = v10 * (1 - alpha) + v11 * alpha; + + dst.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; + } + } + } +} + +static void gridsample_2d_bilinear_align1_border_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#if __AVX2__ + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); +#endif // __AVX2__ +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); + + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); + + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); + + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); + +#if __AVX2__ + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); + + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, *(__m256i*)_pi32_256_1); + __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); + __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, *(__m256i*)_pi32_256_1); +#else + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, 
vImgWf), x_w); + __m256 ne_offset = _mm256_add_ps(nw_offset, *(__m256*)_ps256_1); + __m256 sw_offset = _mm256_add_ps(nw_offset, vImgWf); + __m256 se_offset = _mm256_add_ps(sw_offset, *(__m256*)_ps256_1); + + __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); + __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); + __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); + __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); +#endif + + for (int q = 0; q < src.c; q++) + { + __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *(__m256*)_ps256_n1); +#if __AVX2__ + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, _mm256_castsi256_ps(x1_in_range)); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, _mm256_castsi256_ps(y1_in_range)); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); +#else + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, x1_in_range); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, y1_in_range); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); +#endif + + __m256 _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = (sample_x + 1) / 2.f * (src.w - 1); + sample_y = (sample_y + 1) / 2.f * (src.h - 1); + + sample_x = std::min(src.w - 1.0f, std::max(sample_x, 0.0f)); + sample_y = std::min(src.h - 1.0f, std::max(sample_y, 0.0f)); + + // bilinear interpolate + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int x1 = x0 + 1; + int y1 = y0 + 1; + + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool y1_in_range = (y1 > -1) & (y1 < src.h); + bool v11_in_range = x1_in_range & y1_in_range; + + float alpha = sample_x - x0; + float beta = sample_y - y0; + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + float v00 = image.row(y0)[x0]; + float v01 = x1_in_range ? image.row(y0)[x1] : 0; + float v10 = y1_in_range ? image.row(y1)[x0] : 0; + float v11 = v11_in_range ? 
image.row(y1)[x1] : 0; + + float v0 = v00 * (1 - alpha) + v01 * alpha; + float v1 = v10 * (1 - alpha) + v11 * alpha; + + dst.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; + } + } + } +} + +static void gridsample_2d_bilinear_align0_reflection_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#if __AVX2__ + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); +#endif // __AVX2__ +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + + __m256 v0p5fp8 = _mm256_set1_ps(0.5f); + gx = _mm256_add_ps(gx, v0p5fp8); + + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(vImgWf, reflectx_v); + + gx = _mm256_sub_ps(gx, v0p5fp8); + + _mm256_sub_ps(gx, v0p5fp8); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + + gy = _mm256_add_ps(gy, v0p5fp8); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(vImgHf, reflecty_v); + + gy = _mm256_sub_ps(gy, v0p5fp8); + + _mm256_sub_ps(gy, v0p5fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); + + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); + +#if __AVX2__ + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); + + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, *(__m256i*)_pi32_256_1); + 
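+            // ne is one element to the right of nw; sw/se sit one row (vImgWi elements) below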
__m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); + __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, *(__m256i*)_pi32_256_1); +#else + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); + __m256 ne_offset = _mm256_add_ps(nw_offset, *(__m256*)_ps256_1); + __m256 sw_offset = _mm256_add_ps(nw_offset, vImgWf); + __m256 se_offset = _mm256_add_ps(sw_offset, *(__m256*)_ps256_1); + + __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); + __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); + __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); + __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); +#endif + + for (int q = 0; q < src.c; q++) + { + __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *(__m256*)_ps256_n1); +#if __AVX2__ + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, _mm256_castsi256_ps(x1_in_range)); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, _mm256_castsi256_ps(y1_in_range)); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); +#else + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, x1_in_range); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, y1_in_range); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); +#endif + + __m256 _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = ((sample_x + 1) * src.w - 1) / 2.f; + sample_y = ((sample_y + 1) * src.h - 1) / 2.f; + + sample_x = abs(sample_x + 0.5f); + sample_x = src.w - abs(sample_x - src.w) - 0.5; + + sample_y = abs(sample_y + 0.5f); + sample_y = src.h - abs(sample_y - src.h) - 0.5; + + sample_x = std::min(src.w - 1.0f, std::max(sample_x, 0.0f)); + sample_y = std::min(src.h - 1.0f, std::max(sample_y, 0.0f)); + + // bilinear interpolate + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int x1 = x0 + 1; + int y1 = y0 + 1; + + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool y1_in_range = (y1 > -1) & (y1 < src.h); + bool v11_in_range = x1_in_range & y1_in_range; + + float alpha = sample_x - x0; + float beta = sample_y - y0; + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + float v00 = image.row(y0)[x0]; + float v01 = x1_in_range ? image.row(y0)[x1] : 0; + float v10 = y1_in_range ? image.row(y1)[x0] : 0; + float v11 = v11_in_range ? 
image.row(y1)[x1] : 0; + + float v0 = v00 * (1 - alpha) + v01 * alpha; + float v1 = v10 * (1 - alpha) + v11 * alpha; + + dst.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; + } + } + } +} + +static void gridsample_2d_bilinear_align1_reflection_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#if __AVX2__ + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); +#endif // __AVX2__ +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); + + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(border_x, reflectx_v); + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); + + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(border_y, reflecty_v); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); + + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); + +#if __AVX2__ + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); + + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, *(__m256i*)_pi32_256_1); + __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); + __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, *(__m256i*)_pi32_256_1); +#else + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, 
_CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); + __m256 ne_offset = _mm256_add_ps(nw_offset, *(__m256*)_ps256_1); + __m256 sw_offset = _mm256_add_ps(nw_offset, vImgWf); + __m256 se_offset = _mm256_add_ps(sw_offset, *(__m256*)_ps256_1); + + __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); + __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); + __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); + __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); +#endif + + for (int q = 0; q < src.c; q++) + { + __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *(__m256*)_ps256_n1); +#if __AVX2__ + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, _mm256_castsi256_ps(x1_in_range)); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, _mm256_castsi256_ps(y1_in_range)); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); +#else + __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, x1_in_range); + __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, y1_in_range); + __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); +#endif + + __m256 _v = _mm256_mul_ps(nw_val, nw); + _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); + _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); + _v = _mm256_comp_fmadd_ps(se_val, se, _v); + + _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = (sample_x + 1) / 2.f * (src.w - 1); + sample_y = (sample_y + 1) / 2.f * (src.h - 1); + + sample_x = abs(sample_x); + sample_x = (src.w - 1) - abs(sample_x - (src.w - 1)); + + sample_y = abs(sample_y); + sample_y = (src.h - 1) - abs(sample_y - (src.h - 1)); + + sample_x = std::min(src.w - 1.0f, std::max(sample_x, 0.0f)); + sample_y = std::min(src.h - 1.0f, std::max(sample_y, 0.0f)); + + // bilinear interpolate + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int x1 = x0 + 1; + int y1 = y0 + 1; + + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool y1_in_range = (y1 > -1) & (y1 < src.h); + bool v11_in_range = x1_in_range & y1_in_range; + + float alpha = sample_x - x0; + float beta = sample_y - y0; + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + float v00 = image.row(y0)[x0]; + float v01 = x1_in_range ? image.row(y0)[x1] : 0; + float v10 = y1_in_range ? image.row(y1)[x0] : 0; + float v11 = v11_in_range ? 
image.row(y1)[x1] : 0; + + float v0 = v00 * (1 - alpha) + v01 * alpha; + float v1 = v10 * (1 - alpha) + v11 * alpha; + + dst.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; + } + } + } +} + +static void gridsample_3d_bilinear_align0_zeros_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h * grid.d; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); +#if __AVX2__ + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); + const __m256i vImgDi = _mm256_set1_epi32(src.d); +#endif // __AVX2__ +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 23 < nn; x += 24) + { + //upzip (3) + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); + __m256 gz = _mm256_loadu_ps(gridptr + x + 16); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); + __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); + gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); + + tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); + tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); + + gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); + gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); + gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); + + // compute coord + { + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + // z + gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + __m256 z_t = _mm256_floor_ps(gz); + + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); + __m256 t = _mm256_sub_ps(gz, z_t); + __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); + + __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); + + tnw = _mm256_mul_ps(b, nw); + tne = _mm256_mul_ps(b, ne); + tsw = _mm256_mul_ps(b, sw); + tse = _mm256_mul_ps(b, se); + + bnw = _mm256_mul_ps(t, nw); + bne = _mm256_mul_ps(t, ne); + bsw = _mm256_mul_ps(t, sw); + bse = _mm256_mul_ps(t, se); + } + +#if __AVX2__ + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i z0 = _mm256_cvtps_epi32(z_t); + + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); + __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); + + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); + __m256i y1_in_range = 
_mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z0)); + __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); + + __m256i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + v000_in_range = _mm256_and_si256(v00_in_range, z0_in_range); + v010_in_range = _mm256_and_si256(v01_in_range, z0_in_range); + v100_in_range = _mm256_and_si256(v10_in_range, z0_in_range); + v110_in_range = _mm256_and_si256(v11_in_range, z0_in_range); + + v001_in_range = _mm256_and_si256(v00_in_range, z1_in_range); + v011_in_range = _mm256_and_si256(v01_in_range, z1_in_range); + v101_in_range = _mm256_and_si256(v10_in_range, z1_in_range); + v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); + } + + __m256i i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)); + __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, *(__m256i*)_pi32_256_1); + __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, vImgWi); + __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, *(__m256i*)_pi32_256_1); + + __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), i_tnw_offset); + __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, *(__m256i*)_pi32_256_1); + __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, vImgWi); + __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, *(__m256i*)_pi32_256_1); +#else + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + __m256 z0_in_range = _mm256_and_ps(_mm256_cmp_ps(z_t, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z_t, _CMP_GT_OS)); + __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); + + __m256 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); + __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + v000_in_range = _mm256_and_ps(v00_in_range, z0_in_range); + v010_in_range = _mm256_and_ps(v01_in_range, z0_in_range); + v100_in_range = _mm256_and_ps(v10_in_range, z0_in_range); + v110_in_range = _mm256_and_ps(v11_in_range, 
z0_in_range); + + v001_in_range = _mm256_and_ps(v00_in_range, z1_in_range); + v011_in_range = _mm256_and_ps(v01_in_range, z1_in_range); + v101_in_range = _mm256_and_ps(v10_in_range, z1_in_range); + v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); + } + + __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), + _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)); + __m256 tne_offset = _mm256_add_ps(tnw_offset, *(__m256*)_ps256_1); + __m256 tsw_offset = _mm256_add_ps(tnw_offset, vImgWf); + __m256 tse_offset = _mm256_add_ps(tsw_offset, *(__m256*)_ps256_1); + + __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(vImgWf, vImgHf), tnw_offset); + __m256 bne_offset = _mm256_add_ps(bnw_offset, *(__m256*)_ps256_1); + __m256 bsw_offset = _mm256_add_ps(bnw_offset, vImgWf); + __m256 bse_offset = _mm256_add_ps(bsw_offset, *(__m256*)_ps256_1); + + __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); + __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); + __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); + __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); + + __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); + __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); + __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); + __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); +#endif // __AVX2__ + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); +#if __AVX2__ + __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, _mm256_castsi256_ps(v000_in_range)); + __m256 tne_val = mask_gather_ps256(image, i_tne_offset, _mm256_castsi256_ps(v100_in_range)); + __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, _mm256_castsi256_ps(v010_in_range)); + __m256 tse_val = mask_gather_ps256(image, i_tse_offset, _mm256_castsi256_ps(v110_in_range)); + + __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, _mm256_castsi256_ps(v001_in_range)); + __m256 bne_val = mask_gather_ps256(image, i_bne_offset, _mm256_castsi256_ps(v101_in_range)); + __m256 bsw_val = mask_gather_ps256(image, i_bsw_offset, _mm256_castsi256_ps(v011_in_range)); + __m256 bse_val = mask_gather_ps256(image, i_bse_offset, _mm256_castsi256_ps(v111_in_range)); +#else + __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, v000_in_range); + __m256 tne_val = mask_gather_ps256(image, i_tne_offset, v100_in_range); + __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, v010_in_range); + __m256 tse_val = mask_gather_ps256(image, i_tse_offset, v110_in_range); + + __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, v001_in_range); + __m256 bne_val = mask_gather_ps256(image, i_bne_offset, v101_in_range); + __m256 bsw_val = mask_gather_ps256(image, i_bsw_offset, v011_in_range); + __m256 bse_val = mask_gather_ps256(image, i_bse_offset, v111_in_range); +#endif + + __m256 _v = _mm256_mul_ps(tnw_val, tnw); + _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); + + _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); + } + } + + nn = grid_size % 24; + +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 3) + { + float gx = gridptr[x]; + float gy = gridptr[x + 1]; + float gz = gridptr[x + 2]; + + gx = ((gx + 1) * src.w - 1) / 2.f; + gy = ((gy + 1) * src.h - 1) / 2.f; + gz = ((gz + 1) 
* src.d - 1) / 2.f; + + // bilinear interpolate + int x0 = (int)floor(gx); + int y0 = (int)floor(gy); + int z0 = (int)floor(gz); + int x1 = x0 + 1; + int y1 = y0 + 1; + int z1 = z0 + 1; + + bool x0_in_range = (x0 > -1) & (x0 < src.w); + bool y0_in_range = (y0 > -1) & (y0 < src.h); + bool z0_in_range = (z0 > -1) & (z0 < src.d); + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool y1_in_range = (y1 > -1) & (y1 < src.h); + bool z1_in_range = (z1 > -1) & (z1 < src.d); + + bool v00_in_range = x0_in_range & y0_in_range; + bool v01_in_range = x1_in_range & y0_in_range; + bool v10_in_range = x0_in_range & y1_in_range; + bool v11_in_range = x1_in_range & y1_in_range; + + bool v000_in_range = v00_in_range & z0_in_range; + bool v010_in_range = v10_in_range & z0_in_range; + bool v100_in_range = v00_in_range & z1_in_range; + bool v110_in_range = v10_in_range & z1_in_range; + + bool v001_in_range = v01_in_range & z0_in_range; + bool v011_in_range = v11_in_range & z0_in_range; + bool v101_in_range = v01_in_range & z1_in_range; + bool v111_in_range = v11_in_range & z1_in_range; + + float alpha = gx - x0; + float beta = gy - y0; + float gamma = gz - z0; + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + float v000 = v000_in_range ? image.depth(z0).row(y0)[x0] : 0; + float v010 = v010_in_range ? image.depth(z0).row(y1)[x0] : 0; + float v100 = v100_in_range ? image.depth(z1).row(y0)[x0] : 0; + float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; + + float v001 = v001_in_range ? image.depth(z0).row(y0)[x1] : 0; + float v011 = v011_in_range ? image.depth(z0).row(y1)[x1] : 0; + float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; + float v111 = v111_in_range ? image.depth(z1).row(y1)[x1] : 0; + + float v00 = v000 * (1 - alpha) + v001 * alpha; + float v01 = v010 * (1 - alpha) + v011 * alpha; + float v10 = v100 * (1 - alpha) + v101 * alpha; + float v11 = v110 * (1 - alpha) + v111 * alpha; + + float v0 = v00 * (1 - beta) + v01 * beta; + float v1 = v10 * (1 - beta) + v11 * beta; + + dst.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; + } + } + } +} + +static void gridsample_3d_bilinear_align1_zeros_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h * grid.d; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); +#if __AVX2__ + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); + const __m256i vImgDi = _mm256_set1_epi32(src.d); +#endif // __AVX2__ +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 23 < nn; x += 24) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); + __m256 gz = _mm256_loadu_ps(gridptr + x + 16); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); + __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); + gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); + + tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); + tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); + + gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); + gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); + gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); + + // compute coord + { + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, 
*(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); + + // z + gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + __m256 z_t = _mm256_floor_ps(gz); + + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); + __m256 t = _mm256_sub_ps(gz, z_t); + __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); + + __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); + + tnw = _mm256_mul_ps(b, nw); + tne = _mm256_mul_ps(b, ne); + tsw = _mm256_mul_ps(b, sw); + tse = _mm256_mul_ps(b, se); + + bnw = _mm256_mul_ps(t, nw); + bne = _mm256_mul_ps(t, ne); + bsw = _mm256_mul_ps(t, sw); + bse = _mm256_mul_ps(t, se); + } + +#if __AVX2__ + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i z0 = _mm256_cvtps_epi32(z_t); + + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); + __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); + + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z0)); + __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); + + __m256i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + v000_in_range = _mm256_and_si256(v00_in_range, z0_in_range); + v010_in_range = _mm256_and_si256(v01_in_range, z0_in_range); + v100_in_range = _mm256_and_si256(v10_in_range, z0_in_range); + v110_in_range = _mm256_and_si256(v11_in_range, z0_in_range); + + v001_in_range = _mm256_and_si256(v00_in_range, z1_in_range); + v011_in_range = _mm256_and_si256(v01_in_range, z1_in_range); + v101_in_range = _mm256_and_si256(v10_in_range, z1_in_range); + v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); + } + + __m256i i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)); + __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, *(__m256i*)_pi32_256_1); + __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, vImgWi); + __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, 
*(__m256i*)_pi32_256_1); + + __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), i_tnw_offset); + __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, *(__m256i*)_pi32_256_1); + __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, vImgWi); + __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, *(__m256i*)_pi32_256_1); +#else + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + __m256 z0_in_range = _mm256_and_ps(_mm256_cmp_ps(z_t, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z_t, _CMP_GT_OS)); + __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); + + __m256 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); + __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + v000_in_range = _mm256_and_ps(v00_in_range, z0_in_range); + v010_in_range = _mm256_and_ps(v01_in_range, z0_in_range); + v100_in_range = _mm256_and_ps(v10_in_range, z0_in_range); + v110_in_range = _mm256_and_ps(v11_in_range, z0_in_range); + + v001_in_range = _mm256_and_ps(v00_in_range, z1_in_range); + v011_in_range = _mm256_and_ps(v01_in_range, z1_in_range); + v101_in_range = _mm256_and_ps(v10_in_range, z1_in_range); + v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); + } + + __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), + _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)); + __m256 tne_offset = _mm256_add_ps(tnw_offset, *(__m256*)_ps256_1); + __m256 tsw_offset = _mm256_add_ps(tnw_offset, vImgWf); + __m256 tse_offset = _mm256_add_ps(tsw_offset, *(__m256*)_ps256_1); + + __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(vImgWf, vImgHf), tnw_offset); + __m256 bne_offset = _mm256_add_ps(bnw_offset, *(__m256*)_ps256_1); + __m256 bsw_offset = _mm256_add_ps(bnw_offset, vImgWf); + __m256 bse_offset = _mm256_add_ps(bsw_offset, *(__m256*)_ps256_1); + + __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); + __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); + __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); + __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); + + __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); + __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); + __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); + __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); +#endif // __AVX2__ + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); +#if __AVX2__ + __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, _mm256_castsi256_ps(v000_in_range)); + __m256 tne_val = mask_gather_ps256(image, i_tne_offset, _mm256_castsi256_ps(v100_in_range)); 
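+                // the remaining corner gathers below follow the same pattern: mask_gather_ps256
+                // is expected to return 0.f in lanes whose in-range mask is false, which is
+                // exactly what zeros padding needs; the scalar tail further down makes the
+                // same selection explicitly with the v*_in_range flags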
+ __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, _mm256_castsi256_ps(v010_in_range)); + __m256 tse_val = mask_gather_ps256(image, i_tse_offset, _mm256_castsi256_ps(v110_in_range)); + + __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, _mm256_castsi256_ps(v001_in_range)); + __m256 bne_val = mask_gather_ps256(image, i_bne_offset, _mm256_castsi256_ps(v101_in_range)); + __m256 bsw_val = mask_gather_ps256(image, i_bsw_offset, _mm256_castsi256_ps(v011_in_range)); + __m256 bse_val = mask_gather_ps256(image, i_bse_offset, _mm256_castsi256_ps(v111_in_range)); +#else + __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, v000_in_range); + __m256 tne_val = mask_gather_ps256(image, i_tne_offset, v100_in_range); + __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, v010_in_range); + __m256 tse_val = mask_gather_ps256(image, i_tse_offset, v110_in_range); + + __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, v001_in_range); + __m256 bne_val = mask_gather_ps256(image, i_bne_offset, v101_in_range); + __m256 bsw_val = mask_gather_ps256(image, i_bsw_offset, v011_in_range); + __m256 bse_val = mask_gather_ps256(image, i_bse_offset, v111_in_range); +#endif + + __m256 _v = _mm256_mul_ps(tnw_val, tnw); + _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); + + _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); + } + } + nn = grid_size % 24; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 3) + { + float gx = gridptr[x]; + float gy = gridptr[x + 1]; + float gz = gridptr[x + 2]; + + gx = (gx + 1) / 2.f * (src.w - 1); + gy = (gy + 1) / 2.f * (src.h - 1); + gz = (gz + 1) / 2.f * (src.d - 1); + + // bilinear interpolate + int x0 = (int)floor(gx); + int y0 = (int)floor(gy); + int z0 = (int)floor(gz); + int x1 = x0 + 1; + int y1 = y0 + 1; + int z1 = z0 + 1; + + bool x0_in_range = (x0 > -1) & (x0 < src.w); + bool y0_in_range = (y0 > -1) & (y0 < src.h); + bool z0_in_range = (z0 > -1) & (z0 < src.d); + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool y1_in_range = (y1 > -1) & (y1 < src.h); + bool z1_in_range = (z1 > -1) & (z1 < src.d); + + bool v00_in_range = x0_in_range & y0_in_range; + bool v01_in_range = x1_in_range & y0_in_range; + bool v10_in_range = x0_in_range & y1_in_range; + bool v11_in_range = x1_in_range & y1_in_range; + + bool v000_in_range = v00_in_range & z0_in_range; + bool v010_in_range = v10_in_range & z0_in_range; + bool v100_in_range = v00_in_range & z1_in_range; + bool v110_in_range = v10_in_range & z1_in_range; + + bool v001_in_range = v01_in_range & z0_in_range; + bool v011_in_range = v11_in_range & z0_in_range; + bool v101_in_range = v01_in_range & z1_in_range; + bool v111_in_range = v11_in_range & z1_in_range; + + float alpha = gx - x0; + float beta = gy - y0; + float gamma = gz - z0; + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + float v000 = v000_in_range ? image.depth(z0).row(y0)[x0] : 0; + float v010 = v010_in_range ? image.depth(z0).row(y1)[x0] : 0; + float v100 = v100_in_range ? image.depth(z1).row(y0)[x0] : 0; + float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; + + float v001 = v001_in_range ? image.depth(z0).row(y0)[x1] : 0; + float v011 = v011_in_range ? 
image.depth(z0).row(y1)[x1] : 0; + float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; + float v111 = v111_in_range ? image.depth(z1).row(y1)[x1] : 0; + + float v00 = v000 * (1 - alpha) + v001 * alpha; + float v01 = v010 * (1 - alpha) + v011 * alpha; + float v10 = v100 * (1 - alpha) + v101 * alpha; + float v11 = v110 * (1 - alpha) + v111 * alpha; + + float v0 = v00 * (1 - beta) + v01 * beta; + float v1 = v10 * (1 - beta) + v11 * beta; + + dst.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; + } + } + } +} + +static void gridsample_3d_bilinear_align0_border_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h * grid.d; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 23 < nn; x += 24) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); + __m256 gz = _mm256_loadu_ps(gridptr + x + 16); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); + __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); + gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); + + tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); + tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); + + gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); + gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); + gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); + + // compute coord + { + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + + // z + gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); + + gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + __m256 z_t = _mm256_floor_ps(gz); + + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); + __m256 t = _mm256_sub_ps(gz, z_t); + __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); + + __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); + + tnw = _mm256_mul_ps(b, nw); + tne = _mm256_mul_ps(b, ne); + tsw = _mm256_mul_ps(b, sw); + tse = _mm256_mul_ps(b, se); + + bnw = _mm256_mul_ps(t, nw); + bne = _mm256_mul_ps(t, ne); + bsw = _mm256_mul_ps(t, sw); + bse = _mm256_mul_ps(t, se); + } + + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); + + 
__m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); + + __m256 v110_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + v110_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + v011_in_range = _mm256_and_ps(y1_in_range, z1_in_range); + v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); + v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); + } + + __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), + _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)); + __m256 tne_offset = _mm256_add_ps(tnw_offset, *(__m256*)_ps256_1); + __m256 tsw_offset = _mm256_add_ps(tnw_offset, vImgWf); + __m256 tse_offset = _mm256_add_ps(tsw_offset, *(__m256*)_ps256_1); + + __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(vImgWf, vImgHf), tnw_offset); + __m256 bne_offset = _mm256_add_ps(bnw_offset, *(__m256*)_ps256_1); + __m256 bsw_offset = _mm256_add_ps(bnw_offset, vImgWf); + __m256 bse_offset = _mm256_add_ps(bsw_offset, *(__m256*)_ps256_1); + + __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); + __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); + __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); + __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); + + __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); + __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); + __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); + __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, *(__m256*)_ps256_n1); + __m256 tne_val = mask_gather_ps256(image, i_tne_offset, x1_in_range); + __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, y1_in_range); + __m256 tse_val = mask_gather_ps256(image, i_tse_offset, v110_in_range); + + __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, z1_in_range); + __m256 bne_val = mask_gather_ps256(image, i_bne_offset, v101_in_range); + __m256 bsw_val = mask_gather_ps256(image, i_bsw_offset, v011_in_range); + __m256 bse_val = mask_gather_ps256(image, i_bse_offset, v111_in_range); + + __m256 _v = _mm256_mul_ps(tnw_val, tnw); + _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); + + _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); + } + } + nn = grid_size % 24; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 3) + { + float gx = gridptr[x]; + float gy = gridptr[x + 1]; + float gz = gridptr[x + 2]; + + gx = ((gx + 1) * src.w - 1) / 2.f; + gy = ((gy + 1) * src.h - 1) / 2.f; + gz = ((gz + 1) * src.d - 1) / 2.f; + + gx = std::min(src.w - 1.0f, std::max(gx, 0.0f)); + gy = std::min(src.h - 1.0f, std::max(gy, 0.0f)); + gz = std::min(src.d - 1.0f, std::max(gz, 0.0f)); + + // bilinear interpolate + int x0 = (int)floor(gx); + int y0 = (int)floor(gy); + int z0 = (int)floor(gz); + int x1 = x0 + 1; + 
int y1 = y0 + 1; + int z1 = z0 + 1; + + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool y1_in_range = (y1 > -1) & (y1 < src.h); + bool z1_in_range = (z1 > -1) & (z1 < src.d); + + bool v11_in_range = x1_in_range & y1_in_range; + + bool v110_in_range = y1_in_range & z1_in_range; + + bool v101_in_range = x1_in_range & z1_in_range; + bool v111_in_range = v11_in_range & z1_in_range; + + float alpha = gx - x0; + float beta = gy - y0; + float gamma = gz - z0; + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + float v000 = image.depth(z0).row(y0)[x0]; + float v010 = y1_in_range ? image.depth(z0).row(y1)[x0] : 0; + float v100 = z1_in_range ? image.depth(z1).row(y0)[x0] : 0; + float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; + + float v001 = x1_in_range ? image.depth(z0).row(y0)[x1] : 0; + float v011 = v11_in_range ? image.depth(z0).row(y1)[x1] : 0; + float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; + float v111 = v111_in_range ? image.depth(z1).row(y1)[x1] : 0; + + float v00 = v000 * (1 - alpha) + v001 * alpha; + float v01 = v010 * (1 - alpha) + v011 * alpha; + float v10 = v100 * (1 - alpha) + v101 * alpha; + float v11 = v110 * (1 - alpha) + v111 * alpha; + + float v0 = v00 * (1 - beta) + v01 * beta; + float v1 = v10 * (1 - beta) + v11 * beta; + + dst.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; + } + } + } +} + +static void gridsample_3d_bilinear_align1_border_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h * grid.d; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 23 < nn; x += 24) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); + __m256 gz = _mm256_loadu_ps(gridptr + x + 16); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); + __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); + gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); + + tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); + tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); + + gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); + gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); + gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); + + // compute coord + { + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); + + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); + + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + + // z + gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); + + const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); + + gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + __m256 z_t = 
_mm256_floor_ps(gz); + + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); + __m256 t = _mm256_sub_ps(gz, z_t); + __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); + + __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); + + tnw = _mm256_mul_ps(b, nw); + tne = _mm256_mul_ps(b, ne); + tsw = _mm256_mul_ps(b, sw); + tse = _mm256_mul_ps(b, se); + + bnw = _mm256_mul_ps(t, nw); + bne = _mm256_mul_ps(t, ne); + bsw = _mm256_mul_ps(t, sw); + bse = _mm256_mul_ps(t, se); + } + + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); + + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); + + __m256 v110_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + v110_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + v011_in_range = _mm256_and_ps(y1_in_range, z1_in_range); + v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); + v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); + } + + __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), + _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)); + __m256 tne_offset = _mm256_add_ps(tnw_offset, *(__m256*)_ps256_1); + __m256 tsw_offset = _mm256_add_ps(tnw_offset, vImgWf); + __m256 tse_offset = _mm256_add_ps(tsw_offset, *(__m256*)_ps256_1); + + __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(vImgWf, vImgHf), tnw_offset); + __m256 bne_offset = _mm256_add_ps(bnw_offset, *(__m256*)_ps256_1); + __m256 bsw_offset = _mm256_add_ps(bnw_offset, vImgWf); + __m256 bse_offset = _mm256_add_ps(bsw_offset, *(__m256*)_ps256_1); + + __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); + __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); + __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); + __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); + + __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); + __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); + __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); + __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, *(__m256*)_ps256_n1); + __m256 tne_val = mask_gather_ps256(image, i_tne_offset, x1_in_range); + __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, y1_in_range); + __m256 tse_val = mask_gather_ps256(image, i_tse_offset, v110_in_range); + + __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, z1_in_range); + __m256 bne_val = mask_gather_ps256(image, i_bne_offset, v101_in_range); + __m256 bsw_val = mask_gather_ps256(image, i_bsw_offset, v011_in_range); + __m256 bse_val = mask_gather_ps256(image, i_bse_offset, v111_in_range); + + __m256 _v = _mm256_mul_ps(tnw_val, tnw); + _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); + _v 
= _mm256_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); + + _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); + } + } + nn = grid_size % 24; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 3) + { + float gx = gridptr[x]; + float gy = gridptr[x + 1]; + float gz = gridptr[x + 2]; + + gx = (gx + 1) / 2.f * (src.w - 1); + gy = (gy + 1) / 2.f * (src.h - 1); + gz = (gz + 1) / 2.f * (src.d - 1); + + gx = std::min(src.w - 1.0f, std::max(gx, 0.0f)); + gy = std::min(src.h - 1.0f, std::max(gy, 0.0f)); + gz = std::min(src.d - 1.0f, std::max(gz, 0.0f)); + + // bilinear interpolate + int x0 = (int)floor(gx); + int y0 = (int)floor(gy); + int z0 = (int)floor(gz); + int x1 = x0 + 1; + int y1 = y0 + 1; + int z1 = z0 + 1; + + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool y1_in_range = (y1 > -1) & (y1 < src.h); + bool z1_in_range = (z1 > -1) & (z1 < src.d); + + bool v11_in_range = x1_in_range & y1_in_range; + + bool v110_in_range = y1_in_range & z1_in_range; + + bool v101_in_range = x1_in_range & z1_in_range; + bool v111_in_range = v11_in_range & z1_in_range; + + float alpha = gx - x0; + float beta = gy - y0; + float gamma = gz - z0; + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + float v000 = image.depth(z0).row(y0)[x0]; + float v010 = y1_in_range ? image.depth(z0).row(y1)[x0] : 0; + float v100 = z1_in_range ? image.depth(z1).row(y0)[x0] : 0; + float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; + + float v001 = x1_in_range ? image.depth(z0).row(y0)[x1] : 0; + float v011 = v11_in_range ? image.depth(z0).row(y1)[x1] : 0; + float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; + float v111 = v111_in_range ? 
image.depth(z1).row(y1)[x1] : 0; + + float v00 = v000 * (1 - alpha) + v001 * alpha; + float v01 = v010 * (1 - alpha) + v011 * alpha; + float v10 = v100 * (1 - alpha) + v101 * alpha; + float v11 = v110 * (1 - alpha) + v111 * alpha; + + float v0 = v00 * (1 - beta) + v01 * beta; + float v1 = v10 * (1 - beta) + v11 * beta; + + dst.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; + } + } + } +} + +static void gridsample_3d_bilinear_align0_reflection_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h * grid.d; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 23 < nn; x += 24) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); + __m256 gz = _mm256_loadu_ps(gridptr + x + 16); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); + __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); + gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); + + tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); + tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); + + gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); + gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); + gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); + + // compute coord + { + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + + __m256 v0p5fp8 = _mm256_set1_ps(0.5f); + gx = _mm256_add_ps(gx, v0p5fp8); + + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(vImgWf, reflectx_v); + + gx = _mm256_sub_ps(gx, v0p5fp8); + + _mm256_sub_ps(gx, v0p5fp8); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + + gy = _mm256_add_ps(gy, v0p5fp8); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(vImgHf, reflecty_v); + + gy = _mm256_sub_ps(gy, v0p5fp8); + + _mm256_sub_ps(gy, v0p5fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + + // z + gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); + + gz = _mm256_add_ps(gz, v0p5fp8); + + gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, vImgDf), *(__m256*)_ps256_inv_sign_mask); + gz = _mm256_sub_ps(vImgDf, reflectz_v); + + gz = _mm256_sub_ps(gz, v0p5fp8); + + _mm256_sub_ps(gz, v0p5fp8); + + gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + __m256 z_t = _mm256_floor_ps(gz); + + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = 
_mm256_sub_ps(*(__m256*)_ps256_1, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); + __m256 t = _mm256_sub_ps(gz, z_t); + __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); + + __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); + + tnw = _mm256_mul_ps(b, nw); + tne = _mm256_mul_ps(b, ne); + tsw = _mm256_mul_ps(b, sw); + tse = _mm256_mul_ps(b, se); + + bnw = _mm256_mul_ps(t, nw); + bne = _mm256_mul_ps(t, ne); + bsw = _mm256_mul_ps(t, sw); + bse = _mm256_mul_ps(t, se); + } + + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); + + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); + + __m256 v110_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + v110_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + v011_in_range = _mm256_and_ps(y1_in_range, z1_in_range); + v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); + v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); + } + + __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), + _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)); + __m256 tne_offset = _mm256_add_ps(tnw_offset, *(__m256*)_ps256_1); + __m256 tsw_offset = _mm256_add_ps(tnw_offset, vImgWf); + __m256 tse_offset = _mm256_add_ps(tsw_offset, *(__m256*)_ps256_1); + + __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(vImgWf, vImgHf), tnw_offset); + __m256 bne_offset = _mm256_add_ps(bnw_offset, *(__m256*)_ps256_1); + __m256 bsw_offset = _mm256_add_ps(bnw_offset, vImgWf); + __m256 bse_offset = _mm256_add_ps(bsw_offset, *(__m256*)_ps256_1); + + __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); + __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); + __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); + __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); + + __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); + __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); + __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); + __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, *(__m256*)_ps256_n1); + __m256 tne_val = mask_gather_ps256(image, i_tne_offset, x1_in_range); + __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, y1_in_range); + __m256 tse_val = mask_gather_ps256(image, i_tse_offset, v110_in_range); + + __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, z1_in_range); + __m256 bne_val = mask_gather_ps256(image, i_bne_offset, v101_in_range); + __m256 bsw_val = mask_gather_ps256(image, i_bsw_offset, v011_in_range); + __m256 bse_val = mask_gather_ps256(image, i_bse_offset, v111_in_range); + + __m256 _v = _mm256_mul_ps(tnw_val, tnw); + _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); + + _v = 
_mm256_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); + + _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); + } + } + nn = grid_size % 24; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 3) + { + float gx = gridptr[x]; + float gy = gridptr[x + 1]; + float gz = gridptr[x + 2]; + + gx = ((gx + 1) * src.w - 1) / 2.f; + gy = ((gy + 1) * src.h - 1) / 2.f; + gz = ((gz + 1) * src.d - 1) / 2.f; + + gx = abs(gx + 0.5f); + gx = src.w - abs(gx - src.w) - 0.5; + + gy = abs(gy + 0.5f); + gy = src.h - abs(gy - src.h) - 0.5; + + gz = abs(gz + 0.5f); + gz = src.d - abs(gz - src.d) - 0.5; + + gx = std::min(src.w - 1.0f, std::max(gx, 0.0f)); + gy = std::min(src.h - 1.0f, std::max(gy, 0.0f)); + gz = std::min(src.d - 1.0f, std::max(gz, 0.0f)); + + // bilinear interpolate + int x0 = (int)floor(gx); + int y0 = (int)floor(gy); + int z0 = (int)floor(gz); + int x1 = x0 + 1; + int y1 = y0 + 1; + int z1 = z0 + 1; + + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool y1_in_range = (y1 > -1) & (y1 < src.h); + bool z1_in_range = (z1 > -1) & (z1 < src.d); + + bool v11_in_range = x1_in_range & y1_in_range; + + bool v110_in_range = y1_in_range & z1_in_range; + + bool v101_in_range = x1_in_range & z1_in_range; + bool v111_in_range = v11_in_range & z1_in_range; + + float alpha = gx - x0; + float beta = gy - y0; + float gamma = gz - z0; + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + float v000 = image.depth(z0).row(y0)[x0]; + float v010 = y1_in_range ? image.depth(z0).row(y1)[x0] : 0; + float v100 = z1_in_range ? image.depth(z1).row(y0)[x0] : 0; + float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; + + float v001 = x1_in_range ? image.depth(z0).row(y0)[x1] : 0; + float v011 = v11_in_range ? image.depth(z0).row(y1)[x1] : 0; + float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; + float v111 = v111_in_range ? 
image.depth(z1).row(y1)[x1] : 0; + + float v00 = v000 * (1 - alpha) + v001 * alpha; + float v01 = v010 * (1 - alpha) + v011 * alpha; + float v10 = v100 * (1 - alpha) + v101 * alpha; + float v11 = v110 * (1 - alpha) + v111 * alpha; + + float v0 = v00 * (1 - beta) + v01 * beta; + float v1 = v10 * (1 - beta) + v11 * beta; + + dst.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; + } + } + } +} + +static void gridsample_3d_bilinear_align1_reflection_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h * grid.d; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 23 < nn; x += 24) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); + __m256 gz = _mm256_loadu_ps(gridptr + x + 16); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); + __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); + gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); + + tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); + tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); + + gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); + gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); + gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); + + // compute coord + { + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(border_x, reflectx_v); + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(border_y, reflecty_v); + + // z + gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); + const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); + + gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, border_z), *(__m256*)_ps256_inv_sign_mask); + gz = _mm256_sub_ps(border_z, reflectz_v); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + __m256 z_t = _mm256_floor_ps(gz); + + __m256 w = _mm256_sub_ps(gx, x_w); + __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); + __m256 n = _mm256_sub_ps(gy, y_n); + __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); + __m256 t = _mm256_sub_ps(gz, z_t); + __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); + + __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; + { + __m256 nw = _mm256_mul_ps(s, e); + __m256 ne = _mm256_mul_ps(s, w); + __m256 sw = _mm256_mul_ps(n, e); + __m256 se = _mm256_mul_ps(n, w); + + tnw = _mm256_mul_ps(b, nw); + tne = _mm256_mul_ps(b, ne); + tsw = _mm256_mul_ps(b, sw); + tse = _mm256_mul_ps(b, se); + + bnw = 
_mm256_mul_ps(t, nw); + bne = _mm256_mul_ps(t, ne); + bsw = _mm256_mul_ps(t, sw); + bse = _mm256_mul_ps(t, se); + } + + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); + + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); + + __m256 v110_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + v110_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + v011_in_range = _mm256_and_ps(y1_in_range, z1_in_range); + v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); + v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); + } + + __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), + _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)); + __m256 tne_offset = _mm256_add_ps(tnw_offset, *(__m256*)_ps256_1); + __m256 tsw_offset = _mm256_add_ps(tnw_offset, vImgWf); + __m256 tse_offset = _mm256_add_ps(tsw_offset, *(__m256*)_ps256_1); + + __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(vImgWf, vImgHf), tnw_offset); + __m256 bne_offset = _mm256_add_ps(bnw_offset, *(__m256*)_ps256_1); + __m256 bsw_offset = _mm256_add_ps(bnw_offset, vImgWf); + __m256 bse_offset = _mm256_add_ps(bsw_offset, *(__m256*)_ps256_1); + + __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); + __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); + __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); + __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); + + __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); + __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); + __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); + __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, *(__m256*)_ps256_n1); + __m256 tne_val = mask_gather_ps256(image, i_tne_offset, x1_in_range); + __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, y1_in_range); + __m256 tse_val = mask_gather_ps256(image, i_tse_offset, v110_in_range); + + __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, z1_in_range); + __m256 bne_val = mask_gather_ps256(image, i_bne_offset, v101_in_range); + __m256 bsw_val = mask_gather_ps256(image, i_bsw_offset, v011_in_range); + __m256 bse_val = mask_gather_ps256(image, i_bse_offset, v111_in_range); + + __m256 _v = _mm256_mul_ps(tnw_val, tnw); + _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); + _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); + _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); + + _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); + _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); + _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); + _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); + + _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); + } + } + nn = grid_size % 24; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 3) + { + float gx = gridptr[x]; + float gy = gridptr[x + 1]; + float gz = gridptr[x + 2]; + + gx = (gx + 1) / 2.f * (src.w - 1); + gy = (gy + 1) / 2.f * (src.h - 1); + gz = (gz + 1) / 2.f * (src.d - 1); 
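+
+            // reflection padding with align_corners=1: fold the coordinate back into
+            // [0, size - 1] by mirroring once about each border (abs() handles the low side,
+            // (size - 1) - abs(g - (size - 1)) the high side); the clamp afterwards catches
+            // coordinates that are still out of range after a single reflection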
+ + gx = abs(gx); + gx = (src.w - 1) - abs(gx - (src.w - 1)); + + gy = abs(gy); + gy = (src.h - 1) - abs(gy - (src.h - 1)); + + gz = abs(gz); + gz = (src.d - 1) - abs(gz - (src.d - 1)); + + gx = std::min(src.w - 1.0f, std::max(gx, 0.0f)); + gy = std::min(src.h - 1.0f, std::max(gy, 0.0f)); + gz = std::min(src.d - 1.0f, std::max(gz, 0.0f)); + + // bilinear interpolate + int x0 = (int)floor(gx); + int y0 = (int)floor(gy); + int z0 = (int)floor(gz); + int x1 = x0 + 1; + int y1 = y0 + 1; + int z1 = z0 + 1; + + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool y1_in_range = (y1 > -1) & (y1 < src.h); + bool z1_in_range = (z1 > -1) & (z1 < src.d); + + bool v11_in_range = x1_in_range & y1_in_range; + + bool v110_in_range = y1_in_range & z1_in_range; + + bool v101_in_range = x1_in_range & z1_in_range; + bool v111_in_range = v11_in_range & z1_in_range; + + float alpha = gx - x0; + float beta = gy - y0; + float gamma = gz - z0; + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + float v000 = image.depth(z0).row(y0)[x0]; + float v010 = y1_in_range ? image.depth(z0).row(y1)[x0] : 0; + float v100 = z1_in_range ? image.depth(z1).row(y0)[x0] : 0; + float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; + + float v001 = x1_in_range ? image.depth(z0).row(y0)[x1] : 0; + float v011 = v11_in_range ? image.depth(z0).row(y1)[x1] : 0; + float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; + float v111 = v111_in_range ? image.depth(z1).row(y1)[x1] : 0; + + float v00 = v000 * (1 - alpha) + v001 * alpha; + float v01 = v010 * (1 - alpha) + v011 * alpha; + float v10 = v100 * (1 - alpha) + v101 * alpha; + float v11 = v110 * (1 - alpha) + v111 * alpha; + + float v0 = v00 * (1 - beta) + v01 * beta; + float v1 = v10 * (1 - beta) + v11 * beta; + + dst.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; + } + } + } +} \ No newline at end of file diff --git a/src/layer/x86/gridsample_nearest_pack1.h b/src/layer/x86/gridsample_nearest_pack1.h new file mode 100644 index 00000000000..88586e6ce93 --- /dev/null +++ b/src/layer/x86/gridsample_nearest_pack1.h @@ -0,0 +1,1167 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
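+// All kernels in this file share the same coordinate transform: a normalized grid
+// value in [-1, 1] is mapped to a pixel coordinate with
+//     align_corners == 0 :  x = ((gx + 1) * w - 1) / 2
+//     align_corners == 1 :  x = (gx + 1) / 2 * (w - 1)
+// and nearest sampling then picks the pixel at floor(x + 0.5). The helper below is
+// only an illustrative scalar reference for these two formulas; its name is made up
+// for this sketch and it is not called by the vectorized kernels, which compute the
+// same mapping eight lanes at a time with AVX.
+static inline float grid_sample_unnormalize_ref(float coord, int size, bool align_corners)
+{
+    return align_corners ? (coord + 1.f) / 2.f * (size - 1)
+                         : ((coord + 1.f) * size - 1.f) / 2.f;
+}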
+ +static void gridsample_2d_nearest_align0_zeros_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), + _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); + + __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); + __m256i i_offset = _mm256_cvtps_epi32(offset); + + for (int q = 0; q < src.c; q++) + { + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, v_in_range); + + _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = ((sample_x + 1) * src.w - 1) / 2.f; + sample_y = ((sample_y + 1) * src.h - 1) / 2.f; + + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); + + bool v00_in_range = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h); + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + + dst.channel(q).row(y)[x / 2] = v00_in_range ? 
image.row(y0)[x0] : 0; + } + } + } +} + +static void gridsample_2d_nearest_align1_zeros_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), + _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); + + __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); + __m256i i_offset = _mm256_cvtps_epi32(offset); + + for (int q = 0; q < src.c; q++) + { + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, v_in_range); + + _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = (sample_x + 1) / 2.f * (src.w - 1); + sample_y = (sample_y + 1) / 2.f * (src.h - 1); + + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); + + bool v00_in_range = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h); + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + + dst.channel(q).row(y)[x / 2] = v00_in_range ? 
image.row(y0)[x0] : 0; + } + } + } +} + +static void gridsample_2d_nearest_align0_border_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); + __m256i i_offset = _mm256_cvtps_epi32(offset); + for (int q = 0; q < src.c; q++) + { + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *(__m256*)_ps256_n1); + + _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = ((sample_x + 1) * src.w - 1) / 2.f; + sample_y = ((sample_y + 1) * src.h - 1) / 2.f; + + sample_x = std::min(src.w - 1.0f, std::max(sample_x, 0.0f)); + sample_y = std::min(src.h - 1.0f, std::max(sample_y, 0.0f)); + + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + + dst.channel(q).row(y)[x / 2] = image.row(y0)[x0]; + } + } + } +} + +static void gridsample_2d_nearest_align1_border_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, 
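The _mm256_permute2f128_ps / _mm256_shuffle_ps preamble at the top of every 2-D kernel de-interleaves the 16 loaded floats (eight x/y pairs) into one register of x coordinates and one of y coordinates. A scalar picture of the same rearrangement (sketch only):

    static void deinterleave_xy8(const float* gridptr, float gx[8], float gy[8])
    {
        for (int i = 0; i < 8; i++)
        {
            gx[i] = gridptr[2 * i];     // even elements become the gx lanes
            gy[i] = gridptr[2 * i + 1]; // odd elements become the gy lanes
        }
    }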
*(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); + + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); + + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); + __m256i i_offset = _mm256_cvtps_epi32(offset); + for (int q = 0; q < src.c; q++) + { + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *(__m256*)_ps256_n1); + + _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = (sample_x + 1) / 2.f * (src.w - 1); + sample_y = (sample_y + 1) / 2.f * (src.h - 1); + + sample_x = std::min(src.w - 1.0f, std::max(sample_x, 0.0f)); + sample_y = std::min(src.h - 1.0f, std::max(sample_y, 0.0f)); + + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + + dst.channel(q).row(y)[x / 2] = image.row(y0)[x0]; + } + } + } +} + +static void gridsample_2d_nearest_align0_reflection_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + // compute coord + { + // x + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + + __m256 v0p5fp8 = _mm256_set1_ps(0.5f); + gx = _mm256_add_ps(gx, v0p5fp8); + + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(vImgWf, reflectx_v); + + gx = _mm256_sub_ps(gx, v0p5fp8); + + _mm256_sub_ps(gx, v0p5fp8); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + // y + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + + gy = _mm256_add_ps(gy, v0p5fp8); + + gy = _mm256_and_ps(gy, 
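Border padding amounts to clamping the unnormalized coordinate into the valid range, which is what the min/max pairs above and the std::min/std::max lines in the scalar tails implement. Per element (sketch, helper name chosen here):

    #include <algorithm>

    static float clamp_to_border(float coord, int size)
    {
        return std::min(size - 1.0f, std::max(coord, 0.0f));
    }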
*(__m256*)_ps256_inv_sign_mask); + + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(vImgHf, reflecty_v); + + gy = _mm256_sub_ps(gy, v0p5fp8); + + _mm256_sub_ps(gy, v0p5fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + } + + __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); + __m256i i_offset = _mm256_cvtps_epi32(offset); + for (int q = 0; q < src.c; q++) + { + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *(__m256*)_ps256_n1); + + _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = ((sample_x + 1) * src.w - 1) / 2.f; + sample_y = ((sample_y + 1) * src.h - 1) / 2.f; + + sample_x = floor(sample_x + 0.5f); + sample_y = floor(sample_y + 0.5f); + + sample_x = abs(sample_x + 0.5f); + sample_x = src.w - abs(sample_x - src.w) - 0.5; + + sample_y = abs(sample_y + 0.5f); + sample_y = src.h - abs(sample_y - src.h) - 0.5; + + int x0 = std::min(src.w - 1.0f, std::max(sample_x, 0.0f)); + int y0 = std::min(src.h - 1.0f, std::max(sample_y, 0.0f)); + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + + dst.channel(q).row(y)[x / 2] = image.row(y0)[x0]; + } + } + } +} + +static void gridsample_2d_nearest_align1_reflection_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < grid_size; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + // compute coord + { + // x + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(border_x, reflectx_v); + + // y + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(border_y, reflecty_v); + } + + __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); + __m256i i_offset = _mm256_cvtps_epi32(offset); + for (int q = 0; q < src.c; q++) + { + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *(__m256*)_ps256_n1); + + 
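For align_corner == 0 the reflection path folds the coordinate around the image edges at -0.5 and size - 0.5 and then clamps; note that the standalone _mm256_sub_ps(gx, v0p5fp8); and _mm256_sub_ps(gy, v0p5fp8); statements above discard their result and therefore have no effect. A scalar equivalent of the fold (sketch):

    #include <algorithm>
    #include <cmath>

    static float reflect_align0(float coord, int size)
    {
        coord = std::fabs(coord + 0.5f);
        coord = size - std::fabs(coord - size) - 0.5f;

        // final clamp, mirroring the min/max in the vector path above
        return std::min(size - 1.0f, std::max(coord, 0.0f));
    }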
_mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); + } + } + + nn = grid_size & 15; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = gridptr[x]; + float sample_y = gridptr[x + 1]; + + sample_x = (sample_x + 1) / 2.f * (src.w - 1); + sample_y = (sample_y + 1) / 2.f * (src.h - 1); + + sample_x = floor(sample_x + 0.5f); + sample_y = floor(sample_y + 0.5f); + + sample_x = abs(sample_x); + int x0 = (src.w - 1) - abs(sample_x - (src.w - 1)); + + sample_y = abs(sample_y); + int y0 = (src.h - 1) - abs(sample_y - (src.h - 1)); + + for (int q = 0; q < src.c; q++) + { + const Mat& image = src.channel(q); + + dst.channel(q).row(y)[x / 2] = image.row(y0)[x0]; + } + } + } +} + +static void gridsample_3d_nearest_align0_zeros_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h * grid.d; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 23 < nn; x += 24) + { + //upzip (3) + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); + __m256 gz = _mm256_loadu_ps(gridptr + x + 16); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); + __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); + gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); + + tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); + tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); + + gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); + gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); + gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); + + // compute coord + { + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + // z + gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); + + __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), + _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); + v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, gz, _CMP_GT_OS))); + + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)); + __m256i i_offset = _mm256_cvtps_epi32(offset); + + for (int q = 0; q < src.c; q++) + { + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, v_in_range); + + _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); + } + } + + nn = grid_size % 24; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 3) + { + float gx = gridptr[x]; + float gy = gridptr[x + 1]; + float gz = gridptr[x + 2]; + + gx = ((gx + 1) * src.w - 1) / 2.f; + gy = ((gy + 1) * src.h - 
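The 3-D kernels consume (x, y, z) triplets, so the vector loop advances by 24 floats (eight triplets) per iteration and the leftover count needs grid_size % 24 rather than the power-of-two mask used in the 2-D case. The flat offset into a w*h*d volume computed above reduces per element to (sketch):

    static int volume_offset(int x0, int y0, int z0, int w, int h)
    {
        // matches gz * (W * H) + gy * W + gx in the vector path
        return z0 * w * h + y0 * w + x0;
    }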
1) / 2.f; + gz = ((gz + 1) * src.d - 1) / 2.f; + + // bilinear interpolate + int x0 = static_cast(floor(gx + 0.5f)); + int y0 = static_cast(floor(gy + 0.5f)); + int z0 = static_cast(floor(gz + 0.5f)); + + bool v_in_range = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h) && (z0 > -1) && (z0 < src.d); + + for (int q = 0; q < src.c; q++) + { + dst.channel(q).depth(y)[x / 3] = v_in_range ? src.channel(q).depth(z0).row(y0)[x0] : 0; + } + } + } +} + +static void gridsample_3d_nearest_align1_zeros_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h * grid.d; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 23 < nn; x += 24) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); + __m256 gz = _mm256_loadu_ps(gridptr + x + 16); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); + __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); + gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); + + tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); + tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); + + gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); + gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); + gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); + + // compute coord + { + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); + + // z + gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); + + __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), + _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); + v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, gz, _CMP_GT_OS))); + + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)); + __m256i i_offset = _mm256_cvtps_epi32(offset); + + for (int q = 0; q < src.c; q++) + { + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, v_in_range); + + _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); + } + } + nn = grid_size % 24; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 3) + { + float gx = gridptr[x]; + float gy = gridptr[x + 1]; + float gz = gridptr[x + 2]; + + gx = (gx + 1) / 2.f * (src.w - 1); + gy = (gy + 1) / 2.f * (src.h - 1); + gz = (gz + 1) / 2.f * (src.d - 1); + + int x0 = static_cast(floor(gx + 0.5f)); + int y0 = static_cast(floor(gy + 0.5f)); + int z0 = static_cast(floor(gz + 0.5f)); + + bool v_in_range = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h) && (z0 > -1) 
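The scalar bounds test here mixes bitwise & with logical &&; both behave the same on bool operands, but a uniform form is easier to read (sketch of the equivalent check):

    static bool in_range_3d(int x0, int y0, int z0, int w, int h, int d)
    {
        return x0 > -1 && x0 < w
               && y0 > -1 && y0 < h
               && z0 > -1 && z0 < d;
    }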
&& (z0 < src.d); + + for (int q = 0; q < src.c; q++) + { + dst.channel(q).depth(y)[x / 3] = v_in_range ? src.channel(q).depth(z0).row(y0)[x0] : 0; + } + } + } +} + +static void gridsample_3d_nearest_align0_border_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h * grid.d; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 23 < nn; x += 24) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); + __m256 gz = _mm256_loadu_ps(gridptr + x + 16); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); + __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); + gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); + + tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); + tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); + + gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); + gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); + gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); + + // compute coord + { + // x + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + // y + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + + // z + gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); + + gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); + + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)); + __m256i i_offset = _mm256_cvtps_epi32(offset); + + for (int q = 0; q < src.c; q++) + { + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *(__m256*)_ps256_n1); + + _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); + } + } + nn = grid_size % 24; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 3) + { + float gx = gridptr[x]; + float gy = gridptr[x + 1]; + float gz = gridptr[x + 2]; + + gx = ((gx + 1) * src.w - 1) / 2.f; + gy = ((gy + 1) * src.h - 1) / 2.f; + gz = ((gz + 1) * src.d - 1) / 2.f; + + gx = std::min(src.w - 1.0f, std::max(gx, 0.0f)); + gy = std::min(src.h - 1.0f, std::max(gy, 0.0f)); + gz = std::min(src.d - 1.0f, std::max(gz, 0.0f)); + + int x0 = static_cast(floor(gx + 0.5f)); + int y0 = static_cast(floor(gy + 0.5f)); + int z0 = static_cast(floor(gz + 0.5f)); + + for (int q = 0; q < src.c; q++) + { + dst.channel(q).depth(y)[x / 3] = src.channel(q).depth(z0).row(y0)[x0]; + } + } + } +} + +static void gridsample_3d_nearest_align1_border_blob_pack1(const 
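Every kernel funnels its loads through mask_gather_ps256(ptr, offsets, mask), used here as "gather eight floats at the given offsets and return 0 in lanes whose mask is clear"; the border and reflection variants pass the all-ones mask _ps256_n1 so every lane is fetched. A scalar emulation of that contract (sketch; the real helper is defined elsewhere in this patch and presumably uses a hardware gather when AVX2 is available):

    static void mask_gather8(const float* ptr, const int offset[8], const bool mask[8], float out[8])
    {
        for (int i = 0; i < 8; i++)
            out[i] = mask[i] ? ptr[offset[i]] : 0.0f;
    }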
Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h * grid.d; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 23 < nn; x += 24) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); + __m256 gz = _mm256_loadu_ps(gridptr + x + 16); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); + __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); + gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); + + tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); + tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); + + gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); + gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); + gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); + + // compute coord + { + // x + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); + + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + // y + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); + + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + + // z + gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); + + const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); + + gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); + + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)); + __m256i i_offset = _mm256_cvtps_epi32(offset); + + for (int q = 0; q < src.c; q++) + { + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *(__m256*)_ps256_n1); + + _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); + } + } + nn = grid_size % 24; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 3) + { + float gx = gridptr[x]; + float gy = gridptr[x + 1]; + float gz = gridptr[x + 2]; + + gx = (gx + 1) / 2.f * (src.w - 1); + gy = (gy + 1) / 2.f * (src.h - 1); + gz = (gz + 1) / 2.f * (src.d - 1); + + gx = std::min(src.w - 1.0f, std::max(gx, 0.0f)); + gy = std::min(src.h - 1.0f, std::max(gy, 0.0f)); + gz = std::min(src.d - 1.0f, std::max(gz, 0.0f)); + + int x0 = static_cast(floor(gx + 0.5f)); + int y0 = static_cast(floor(gy + 0.5f)); + int z0 = static_cast(floor(gz + 0.5f)); + + for (int q = 0; q < src.c; q++) + { + dst.channel(q).depth(y)[x / 3] = src.channel(q).depth(z0).row(y0)[x0]; + } + } + } +} + +static void gridsample_3d_nearest_align0_reflection_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h * grid.d; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = 
_mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 23 < nn; x += 24) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); + __m256 gz = _mm256_loadu_ps(gridptr + x + 16); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); + __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); + gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); + + tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); + tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); + + gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); + gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); + gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); + + gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); + + { + // x + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + + __m256 v0p5fp8 = _mm256_set1_ps(0.5f); + gx = _mm256_add_ps(gx, v0p5fp8); + + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(vImgWf, reflectx_v); + + gx = _mm256_sub_ps(gx, v0p5fp8); + + _mm256_sub_ps(gx, v0p5fp8); + + gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); + + // y + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + + gy = _mm256_add_ps(gy, v0p5fp8); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(vImgHf, reflecty_v); + + gy = _mm256_sub_ps(gy, v0p5fp8); + + _mm256_sub_ps(gy, v0p5fp8); + + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + + // z + const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); + + gz = _mm256_add_ps(gz, v0p5fp8); + + gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, vImgDf), *(__m256*)_ps256_inv_sign_mask); + gz = _mm256_sub_ps(vImgDf, reflectz_v); + + gz = _mm256_sub_ps(gz, v0p5fp8); + + _mm256_sub_ps(gz, v0p5fp8); + + gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); + } + + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)); + __m256i i_offset = _mm256_cvtps_epi32(offset); + + for (int q = 0; q < src.c; q++) + { + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *(__m256*)_ps256_n1); + + _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); + } + } + nn = grid_size % 24; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 3) + { + float gx = gridptr[x]; + float gy = gridptr[x + 1]; + float gz = gridptr[x + 2]; + + gx = ((gx + 1) * src.w - 1) / 2.f; + gy = ((gy + 1) * src.h - 1) / 2.f; + gz = 
((gz + 1) * src.d - 1) / 2.f; + + gx = floor(gx + 0.5f); + gy = floor(gy + 0.5f); + gz = floor(gz + 0.5f); + + gx = abs(gx + 0.5f); + gx = src.w - abs(gx - src.w) - 0.5; + + gy = abs(gy + 0.5f); + gy = src.h - abs(gy - src.h) - 0.5; + + gz = abs(gz + 0.5f); + gz = src.d - abs(gz - src.d) - 0.5; + + int x0 = std::min(src.w - 1.0f, std::max(gx, 0.0f)); + int y0 = std::min(src.h - 1.0f, std::max(gy, 0.0f)); + int z0 = std::min(src.d - 1.0f, std::max(gz, 0.0f)); + + for (int q = 0; q < src.c; q++) + { + dst.channel(q).depth(y)[x / 3] = src.channel(q).depth(z0).row(y0)[x0]; + } + } + } +} + +static void gridsample_3d_nearest_align1_reflection_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) +{ + const int grid_size = grid.w * grid.h * grid.d; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 23 < nn; x += 24) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); + __m256 gz = _mm256_loadu_ps(gridptr + x + 16); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); + __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); + gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); + + tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); + tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); + + gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); + gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); + gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); + + gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); + gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); + gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); + + // compute coord + { + // x + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + + gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); + gx = _mm256_sub_ps(border_x, reflectx_v); + + // y + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + + gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); + gy = _mm256_sub_ps(border_y, reflecty_v); + + // z + const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); + + gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, border_z), *(__m256*)_ps256_inv_sign_mask); + gz = _mm256_sub_ps(border_z, reflectz_v); + } + + __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), + _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)); + __m256i i_offset = _mm256_cvtps_epi32(offset); + + for (int q = 0; q < src.c; q++) + { + __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *(__m256*)_ps256_n1); + + 
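For align_corner == 1 the reflection folds around the first and last pixel centers instead, matching the abs / (size - 1) - abs(...) lines in the scalar tails. Per element (sketch):

    #include <cmath>

    static float reflect_align1(float coord, int size)
    {
        coord = std::fabs(coord);
        return (size - 1) - std::fabs(coord - (size - 1));
    }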
_mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); + } + } + nn = grid_size % 24; +#endif // __AVX__ + for (int x = grid_size - nn; x < grid_size; x += 3) + { + float gx = gridptr[x]; + float gy = gridptr[x + 1]; + float gz = gridptr[x + 2]; + + gx = (gx + 1) / 2.f * (src.w - 1); + gy = (gy + 1) / 2.f * (src.h - 1); + gz = (gz + 1) / 2.f * (src.d - 1); + + gx = floor(gx + 0.5f); + gy = floor(gy + 0.5f); + gz = floor(gz + 0.5f); + + gx = abs(gx); + gx = (src.w - 1) - abs(gx - (src.w - 1)); + + gy = abs(gy); + gy = (src.h - 1) - abs(gy - (src.h - 1)); + + gz = abs(gz); + gz = (src.d - 1) - abs(gz - (src.d - 1)); + + int x0 = std::min(src.w - 1.0f, std::max(gx, 0.0f)); + int y0 = std::min(src.h - 1.0f, std::max(gy, 0.0f)); + int z0 = std::min(src.d - 1.0f, std::max(gz, 0.0f)); + + for (int q = 0; q < src.c; q++) + { + dst.channel(q).depth(y)[x / 3] = src.channel(q).depth(z0).row(y0)[x0]; + } + } + } +} \ No newline at end of file diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 5c872d25f55..30170d31a62 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -134,9 +134,9 @@ static NCNN_FORCEINLINE __m128 mask_gather_ps(const float* ptr, __m128i offset, return v; } -#include "gridsample_bicubic_pack4.h" #include "gridsample_bilinear_pack4.h" #include "gridsample_nearest_pack4.h" +#include "gridsample_bicubic_pack4.h" static inline void interpolate_cubic(float fx, float* coeffs) { @@ -162,6 +162,10 @@ static inline float reflect_coord(float x, int high) #endif // __SSE2__ +#include "gridsample_bilinear_pack1.h" +#include "gridsample_nearest_pack1.h" +#include "gridsample_bicubic_pack1.h" + int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { const Mat& bottom_blob = bottom_blobs[0]; @@ -169,9 +173,6 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); - bool v01_in_range = (x1 > -1) & (x1 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); - bool v10_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y1 > -1) & (y1 < bottom_blob.h); - bool v11_in_range = (x1 > -1) & (x1 < bottom_blob.w) & (y1 > -1) & (y1 < bottom_blob.h); - - float alpha = sample_x - x0; - float beta = sample_y - y0; - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - float v00 = v00_in_range ? image.row(y0)[x0] : 0; - float v01 = v01_in_range ? image.row(y0)[x1] : 0; - float v10 = v10_in_range ? image.row(y1)[x0] : 0; - float v11 = v11_in_range ? 
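From this hunk on, GridSample_x86::forward is reworked so that each inline sampling loop is replaced by a call to one of the new pack1 helpers; the removed code doubles as a per-element reference for what those helpers are expected to compute. For the zeros-padded bilinear path deleted here, one output element works out to (a sketch; sample_bilinear_zeros is an illustrative name):

    #include <cmath>

    static float sample_bilinear_zeros(const float* image, int w, int h, float sx, float sy)
    {
        int x0 = (int)std::floor(sx), y0 = (int)std::floor(sy);
        int x1 = x0 + 1, y1 = y0 + 1;
        float alpha = sx - x0, beta = sy - y0;

        // fetch the four neighbours, substituting 0 for out-of-range taps
        float v00 = (x0 > -1 && x0 < w && y0 > -1 && y0 < h) ? image[y0 * w + x0] : 0.f;
        float v01 = (x1 > -1 && x1 < w && y0 > -1 && y0 < h) ? image[y0 * w + x1] : 0.f;
        float v10 = (x0 > -1 && x0 < w && y1 > -1 && y1 < h) ? image[y1 * w + x0] : 0.f;
        float v11 = (x1 > -1 && x1 < w && y1 > -1 && y1 < h) ? image[y1 * w + x1] : 0.f;

        // lerp horizontally, then vertically
        float v0 = v00 * (1 - alpha) + v01 * alpha;
        float v1 = v10 * (1 - alpha) + v11 * alpha;
        return v0 * (1 - beta) + v1 * beta;
    }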
image.row(y1)[x1] : 0; - - float v0 = v00 * (1 - alpha) + v01 * alpha; - float v1 = v10 * (1 - alpha) + v11 * alpha; - - top_blob.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; - } - } - } + gridsample_2d_bilinear_align0_zeros_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } else { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - - __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); - __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, *(__m256i*)_pi32_256_1); - __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); - __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, *(__m256i*)_pi32_256_1); -#else - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - - __m256 v00_in_range = 
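The nw/ne/sw/se factors built in the removed vector code are the standard bilinear corner weights derived from the fractional parts of the coordinate. Per element (sketch):

    #include <cmath>

    static void bilinear_weights(float gx, float gy, float weights[4])
    {
        float w = gx - std::floor(gx); // fractional x
        float n = gy - std::floor(gy); // fractional y
        float e = 1.f - w, s = 1.f - n;

        weights[0] = s * e; // nw
        weights[1] = s * w; // ne
        weights[2] = n * e; // sw
        weights[3] = n * w; // se
    }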
_mm256_and_ps(x0_in_range, y0_in_range); - __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); - __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); - __m256 ne_offset = _mm256_add_ps(nw_offset, *(__m256*)_ps256_1); - __m256 sw_offset = _mm256_add_ps(nw_offset, vImgWf); - __m256 se_offset = _mm256_add_ps(sw_offset, *(__m256*)_ps256_1); - - __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); - __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); - __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); -#endif // __AVX2__ - - for (int q = 0; q < channels; q++) - { -#if __AVX2__ - __m256 nw_val = mask_gather_ps256(bottom_blob.channel(q), i_nw_offset, _mm256_castsi256_ps(v00_in_range)); - __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, _mm256_castsi256_ps(v10_in_range)); - __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, _mm256_castsi256_ps(v01_in_range)); - __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); -#else - __m256 nw_val = mask_gather_ps256(bottom_blob.channel(q), i_nw_offset, v00_in_range); - __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, v10_in_range); - __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, v01_in_range); - __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, v11_in_range); -#endif // __AVX2__ - - __m256 _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = (sample_x + 1) / 2.f * (w - 1); - sample_y = (sample_y + 1) / 2.f * (h - 1); - - // bilinear interpolate - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); - int x1 = x0 + 1; - int y1 = y0 + 1; - - bool v00_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); - bool v01_in_range = (x1 > -1) & (x1 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); - bool v10_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y1 > -1) & (y1 < bottom_blob.h); - bool v11_in_range = (x1 > -1) & (x1 < bottom_blob.w) & (y1 > -1) & (y1 < bottom_blob.h); - - float alpha = sample_x - x0; - float beta = sample_y - y0; - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - float v00 = v00_in_range ? image.row(y0)[x0] : 0; - float v01 = v01_in_range ? image.row(y0)[x1] : 0; - float v10 = v10_in_range ? image.row(y1)[x0] : 0; - float v11 = v11_in_range ? 
image.row(y1)[x1] : 0; - - float v0 = v00 * (1 - alpha) + v01 * alpha; - float v1 = v10 * (1 - alpha) + v11 * alpha; - - top_blob.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; - } - } - } + gridsample_2d_bilinear_align1_zeros_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } } else if (padding_mode == 2) { if (align_corner == 0) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); - __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, *(__m256i*)_pi32_256_1); - __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); - __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, *(__m256i*)_pi32_256_1); -#else - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); - __m256 ne_offset = _mm256_add_ps(nw_offset, *(__m256*)_ps256_1); - __m256 sw_offset = _mm256_add_ps(nw_offset, vImgWf); - __m256 se_offset = _mm256_add_ps(sw_offset, *(__m256*)_ps256_1); - - __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - __m256i i_ne_offset = 
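The gather offsets in the removed code address the four taps relative to the north-west corner: +1 steps one pixel east and +w one row south. Per element (sketch):

    static void bilinear_offsets(int x0, int y0, int w, int offsets[4])
    {
        offsets[0] = y0 * w + x0;    // nw
        offsets[1] = offsets[0] + 1; // ne
        offsets[2] = offsets[0] + w; // sw
        offsets[3] = offsets[2] + 1; // se
    }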
_mm256_cvtps_epi32(ne_offset); - __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); - __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); -#endif - - for (int q = 0; q < channels; q++) - { - __m256 nw_val = mask_gather_ps256(bottom_blob.channel(q), i_nw_offset, *(__m256*)_ps256_n1); -#if __AVX2__ - __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, _mm256_castsi256_ps(x1_in_range)); - __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, _mm256_castsi256_ps(y1_in_range)); - __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); -#else - __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, x1_in_range); - __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, y1_in_range); - __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, v11_in_range); -#endif - - __m256 _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = ((sample_x + 1) * w - 1) / 2.f; - sample_y = ((sample_y + 1) * h - 1) / 2.f; - - sample_x = std::min(w - 1.0f, std::max(sample_x, 0.0f)); - sample_y = std::min(h - 1.0f, std::max(sample_y, 0.0f)); - - // bilinear interpolate - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); - int x1 = x0 + 1; - int y1 = y0 + 1; - - bool x1_in_range = (x1 > -1) & (x1 < bottom_blob.w); - bool y1_in_range = (y1 > -1) & (y1 < bottom_blob.h); - bool v11_in_range = x1_in_range & y1_in_range; - - float alpha = sample_x - x0; - float beta = sample_y - y0; - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - float v00 = image.row(y0)[x0]; - float v01 = x1_in_range ? image.row(y0)[x1] : 0; - float v10 = y1_in_range ? image.row(y1)[x0] : 0; - float v11 = v11_in_range ? 
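Because border (and reflection) padding clamps the coordinate into the image, the removed scalar code reads the north-west tap unconditionally and only range-checks the +1 neighbours; the vector path uses the same shortcut by gathering nw with the all-ones _ps256_n1 mask. Per element (sketch):

    #include <cmath>

    static float sample_bilinear_clamped(const float* image, int w, int h, float sx, float sy)
    {
        int x0 = (int)std::floor(sx), y0 = (int)std::floor(sy);
        int x1 = x0 + 1, y1 = y0 + 1;

        bool x1_in_range = x1 > -1 && x1 < w;
        bool y1_in_range = y1 > -1 && y1 < h;

        float alpha = sx - x0, beta = sy - y0;

        float v00 = image[y0 * w + x0]; // always valid after the clamp
        float v01 = x1_in_range ? image[y0 * w + x1] : 0.f;
        float v10 = y1_in_range ? image[y1 * w + x0] : 0.f;
        float v11 = (x1_in_range && y1_in_range) ? image[y1 * w + x1] : 0.f;

        float v0 = v00 * (1 - alpha) + v01 * alpha;
        float v1 = v10 * (1 - alpha) + v11 * alpha;
        return v0 * (1 - beta) + v1 * beta;
    }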
image.row(y1)[x1] : 0; - - float v0 = v00 * (1 - alpha) + v01 * alpha; - float v1 = v10 * (1 - alpha) + v11 * alpha; - - top_blob.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; - } - } - } + gridsample_2d_bilinear_align0_border_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } else { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); - __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, *(__m256i*)_pi32_256_1); - __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); - __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, *(__m256i*)_pi32_256_1); -#else - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); - __m256 ne_offset = _mm256_add_ps(nw_offset, *(__m256*)_ps256_1); - __m256 sw_offset = _mm256_add_ps(nw_offset, vImgWf); - __m256 se_offset = _mm256_add_ps(sw_offset, *(__m256*)_ps256_1); - - __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); - __m256i 
i_sw_offset = _mm256_cvtps_epi32(sw_offset); - __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); -#endif - - for (int q = 0; q < channels; q++) - { - __m256 nw_val = mask_gather_ps256(bottom_blob.channel(q), i_nw_offset, *(__m256*)_ps256_n1); -#if __AVX2__ - __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, _mm256_castsi256_ps(x1_in_range)); - __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, _mm256_castsi256_ps(y1_in_range)); - __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); -#else - __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, x1_in_range); - __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, y1_in_range); - __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, v11_in_range); -#endif - - __m256 _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = (sample_x + 1) / 2.f * (w - 1); - sample_y = (sample_y + 1) / 2.f * (h - 1); - - sample_x = std::min(w - 1.0f, std::max(sample_x, 0.0f)); - sample_y = std::min(h - 1.0f, std::max(sample_y, 0.0f)); - - // bilinear interpolate - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); - int x1 = x0 + 1; - int y1 = y0 + 1; - - bool x1_in_range = (x1 > -1) & (x1 < bottom_blob.w); - bool y1_in_range = (y1 > -1) & (y1 < bottom_blob.h); - bool v11_in_range = x1_in_range & y1_in_range; - - float alpha = sample_x - x0; - float beta = sample_y - y0; - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - float v00 = image.row(y0)[x0]; - float v01 = x1_in_range ? image.row(y0)[x1] : 0; - float v10 = y1_in_range ? image.row(y1)[x0] : 0; - float v11 = v11_in_range ? 
image.row(y1)[x1] : 0; - - float v0 = v00 * (1 - alpha) + v01 * alpha; - float v1 = v10 * (1 - alpha) + v11 * alpha; - - top_blob.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; - } - } - } + gridsample_2d_bilinear_align1_border_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } } else if (padding_mode == 3) { if (align_corner == 0) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - __m256 v0p5fp8 = _mm256_set1_ps(0.5f); - gx = _mm256_add_ps(gx, v0p5fp8); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(vImgWf, reflectx_v); - - gx = _mm256_sub_ps(gx, v0p5fp8); - - _mm256_sub_ps(gx, v0p5fp8); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_add_ps(gy, v0p5fp8); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(vImgHf, reflecty_v); - - gy = _mm256_sub_ps(gy, v0p5fp8); - - _mm256_sub_ps(gy, v0p5fp8); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); - __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, *(__m256i*)_pi32_256_1); - __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); - __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, *(__m256i*)_pi32_256_1); -#else - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - - __m256 x1_in_range = 
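The integer codes branched on here follow the layer's parameter convention: padding_mode 2 selects the border helpers above and 3 the reflection helpers below, with the zeros pair handled by the first branch; align_corner 0/1 then picks the align0/align1 variant. A compact reminder of the mapping (sketch, not part of the patch):

    static const char* padding_mode_name(int padding_mode)
    {
        switch (padding_mode)
        {
        case 1: return "zeros";
        case 2: return "border";
        case 3: return "reflection";
        default: return "unknown";
        }
    }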
_mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); - __m256 ne_offset = _mm256_add_ps(nw_offset, *(__m256*)_ps256_1); - __m256 sw_offset = _mm256_add_ps(nw_offset, vImgWf); - __m256 se_offset = _mm256_add_ps(sw_offset, *(__m256*)_ps256_1); - - __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); - __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); - __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); -#endif - - for (int q = 0; q < channels; q++) - { - __m256 nw_val = mask_gather_ps256(bottom_blob.channel(q), i_nw_offset, *(__m256*)_ps256_n1); -#if __AVX2__ - __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, _mm256_castsi256_ps(x1_in_range)); - __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, _mm256_castsi256_ps(y1_in_range)); - __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); -#else - __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, x1_in_range); - __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, y1_in_range); - __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, v11_in_range); -#endif - - __m256 _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = ((sample_x + 1) * w - 1) / 2.f; - sample_y = ((sample_y + 1) * h - 1) / 2.f; - - sample_x = abs(sample_x + 0.5f); - sample_x = w - abs(sample_x - w) - 0.5; - - sample_y = abs(sample_y + 0.5f); - sample_y = h - abs(sample_y - h) - 0.5; - - sample_x = std::min(w - 1.0f, std::max(sample_x, 0.0f)); - sample_y = std::min(h - 1.0f, std::max(sample_y, 0.0f)); - - // bilinear interpolate - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); - int x1 = x0 + 1; - int y1 = y0 + 1; - - bool x1_in_range = (x1 > -1) & (x1 < bottom_blob.w); - bool y1_in_range = (y1 > -1) & (y1 < bottom_blob.h); - bool v11_in_range = x1_in_range & y1_in_range; - - float alpha = sample_x - x0; - float beta = sample_y - y0; - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - float v00 = image.row(y0)[x0]; - float v01 = x1_in_range ? image.row(y0)[x1] : 0; - float v10 = y1_in_range ? image.row(y1)[x0] : 0; - float v11 = v11_in_range ? 
image.row(y1)[x1] : 0; - - float v0 = v00 * (1 - alpha) + v01 * alpha; - float v1 = v10 * (1 - alpha) + v11 * alpha; - - top_blob.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; - } - } - } + gridsample_2d_bilinear_align0_reflection_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } else { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(border_x, reflectx_v); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(border_y, reflecty_v); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); - __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, *(__m256i*)_pi32_256_1); - __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); - __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, *(__m256i*)_pi32_256_1); -#else - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); - __m256 ne_offset = _mm256_add_ps(nw_offset, 
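// Illustrative scalar sketch (not part of this patch): every bilinear branch ends in the
// same 2x2 blend once the coordinate has been unnormalized and padded; only the
// out-of-range rule differs. Zeros padding shown here; border/reflection clamp or mirror
// the coordinate first and then read unconditionally. Helper name hypothetical.
static inline float bilinear_sample_zeros(const Mat& image, float sx, float sy)
{
    int x0 = (int)floorf(sx);
    int y0 = (int)floorf(sy);
    float alpha = sx - x0; // horizontal fraction
    float beta = sy - y0;  // vertical fraction

    auto at = [&](int y, int x) -> float {
        if (x < 0 || x >= image.w || y < 0 || y >= image.h)
            return 0.f; // zeros padding
        return image.row(y)[x];
    };

    float v0 = at(y0, x0) * (1 - alpha) + at(y0, x0 + 1) * alpha;
    float v1 = at(y0 + 1, x0) * (1 - alpha) + at(y0 + 1, x0 + 1) * alpha;
    return v0 * (1 - beta) + v1 * beta;
}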
*(__m256*)_ps256_1); - __m256 sw_offset = _mm256_add_ps(nw_offset, vImgWf); - __m256 se_offset = _mm256_add_ps(sw_offset, *(__m256*)_ps256_1); - - __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); - __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); - __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); -#endif - - for (int q = 0; q < channels; q++) - { - __m256 nw_val = mask_gather_ps256(bottom_blob.channel(q), i_nw_offset, *(__m256*)_ps256_n1); -#if __AVX2__ - __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, _mm256_castsi256_ps(x1_in_range)); - __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, _mm256_castsi256_ps(y1_in_range)); - __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); -#else - __m256 ne_val = mask_gather_ps256(bottom_blob.channel(q), i_ne_offset, x1_in_range); - __m256 sw_val = mask_gather_ps256(bottom_blob.channel(q), i_sw_offset, y1_in_range); - __m256 se_val = mask_gather_ps256(bottom_blob.channel(q), i_se_offset, v11_in_range); -#endif - - __m256 _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = (sample_x + 1) / 2.f * (w - 1); - sample_y = (sample_y + 1) / 2.f * (h - 1); - - sample_x = abs(sample_x); - sample_x = (w - 1) - abs(sample_x - (w - 1)); - - sample_y = abs(sample_y); - sample_y = (h - 1) - abs(sample_y - (h - 1)); - - sample_x = std::min(w - 1.0f, std::max(sample_x, 0.0f)); - sample_y = std::min(h - 1.0f, std::max(sample_y, 0.0f)); - - // bilinear interpolate - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); - int x1 = x0 + 1; - int y1 = y0 + 1; - - bool x1_in_range = (x1 > -1) & (x1 < bottom_blob.w); - bool y1_in_range = (y1 > -1) & (y1 < bottom_blob.h); - bool v11_in_range = x1_in_range & y1_in_range; - - float alpha = sample_x - x0; - float beta = sample_y - y0; - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - float v00 = image.row(y0)[x0]; - float v01 = x1_in_range ? image.row(y0)[x1] : 0; - float v10 = y1_in_range ? image.row(y1)[x0] : 0; - float v11 = v11_in_range ? image.row(y1)[x1] : 0; - - float v0 = v00 * (1 - alpha) + v01 * alpha; - float v1 = v10 * (1 - alpha) + v11 * alpha; - - top_blob.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; - } - } - } + gridsample_2d_bilinear_align1_reflection_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } } else @@ -1879,488 +954,33 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); - - bool v00_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - - top_blob.channel(q).row(y)[x / 2] = v00_in_range ? 
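// Summary of the refactor in this hunk: each removed inline loop becomes a single call to
// a specialized helper named
//     gridsample_{2d,3d}_{bilinear,nearest,bicubic}_align{0,1}_{zeros,border,reflection}_blob_pack1(bottom_blob, top_blob, grid_p1, opt)
// selected by the same sample_type / padding_mode / align_corner if-else ladder that was
// already here. The helper names and the argument list are exactly those of the calls
// added in this patch; the unsupported padding_mode branch still logs and returns -100.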
image.row(y0)[x0] : 0; - } - } - } + gridsample_2d_nearest_align0_zeros_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } else { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - - __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), - _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < bottom_blob.c; q++) - { - __m256 _v = mask_gather_ps256(bottom_blob.channel(q), i_offset, v_in_range); - - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = (sample_x + 1) / 2.f * (w - 1); - sample_y = (sample_y + 1) / 2.f * (h - 1); - - int x0 = static_cast(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); - - bool v00_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h); - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - - top_blob.channel(q).row(y)[x / 2] = v00_in_range ? 
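// The deinterleave used at the top of every 2D AVX loop in this file, reproduced here with
// comments only (the instruction sequence is unchanged). The grid blob stores packed
// (x, y) pairs, so 16 loaded floats are split into one register of x and one of y.
// The wrapper name is illustrative, not an ncnn symbol.
#if __AVX__
static inline void load_grid_xy_ps256(const float* gridptr, __m256& gx, __m256& gy)
{
    __m256 tmp_x = _mm256_loadu_ps(gridptr);     // x0 y0 x1 y1 | x2 y2 x3 y3
    gy = _mm256_loadu_ps(gridptr + 8);           // x4 y4 x5 y5 | x6 y6 x7 y7

    gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); // x0 y0 x1 y1 | x4 y4 x5 y5
    gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); // x2 y2 x3 y3 | x6 y6 x7 y7
    tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps());      // keep a copy of gx before the shuffle

    gx = _mm256_shuffle_ps(gx, gy, 0b10001000);    // even lanes: x0 x1 x2 x3 | x4 x5 x6 x7
    gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); // odd lanes:  y0 y1 y2 y3 | y4 y5 y6 y7
}
#endif // __AVX__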
image.row(y0)[x0] : 0; - } - } - } + gridsample_2d_nearest_align1_zeros_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } } else if (padding_mode == 2) { if (align_corner == 0) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); - __m256i i_offset = _mm256_cvtps_epi32(offset); - for (int q = 0; q < bottom_blob.c; q++) - { - __m256 _v = mask_gather_ps256(bottom_blob.channel(q), i_offset, *(__m256*)_ps256_n1); - - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = ((sample_x + 1) * w - 1) / 2.f; - sample_y = ((sample_y + 1) * h - 1) / 2.f; - - sample_x = std::min(w - 1.0f, std::max(sample_x, 0.0f)); - sample_y = std::min(h - 1.0f, std::max(sample_y, 0.0f)); - - int x0 = static_cast(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - - top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0]; - } - } - } + gridsample_2d_nearest_align0_border_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } else { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), 
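// Illustrative scalar sketch (not part of this patch) of the nearest-neighbour branches:
// round the unnormalized coordinate, then either zero out-of-range samples (the zeros
// branch) or clamp to the border; the reflection branch reaches the clamped read with an
// already-mirrored coordinate. Helper name hypothetical.
static inline float nearest_sample(const Mat& image, float sx, float sy, int padding_mode)
{
    int x0 = (int)floorf(sx + 0.5f); // round to the nearest pixel
    int y0 = (int)floorf(sy + 0.5f);

    if (padding_mode == 1) // zeros
    {
        bool in_range = (x0 >= 0) && (x0 < image.w) && (y0 >= 0) && (y0 < image.h);
        return in_range ? image.row(y0)[x0] : 0.f;
    }

    // border / reflection: read the clamped pixel unconditionally
    x0 = std::min(image.w - 1, std::max(x0, 0));
    y0 = std::min(image.h - 1, std::max(y0, 0));
    return image.row(y0)[x0];
}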
_mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); - __m256i i_offset = _mm256_cvtps_epi32(offset); - for (int q = 0; q < bottom_blob.c; q++) - { - __m256 _v = mask_gather_ps256(bottom_blob.channel(q), i_offset, *(__m256*)_ps256_n1); - - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = (sample_x + 1) / 2.f * (w - 1); - sample_y = (sample_y + 1) / 2.f * (h - 1); - - sample_x = std::min(w - 1.0f, std::max(sample_x, 0.0f)); - sample_y = std::min(h - 1.0f, std::max(sample_y, 0.0f)); - - int x0 = static_cast(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - - top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0]; - } - } - } + gridsample_2d_nearest_align1_border_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } } else if (padding_mode == 3) { if (align_corner == 0) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - - // compute coord - { - // x - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - __m256 v0p5fp8 = _mm256_set1_ps(0.5f); - gx = _mm256_add_ps(gx, v0p5fp8); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(vImgWf, reflectx_v); - - gx = _mm256_sub_ps(gx, v0p5fp8); - - _mm256_sub_ps(gx, v0p5fp8); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_add_ps(gy, v0p5fp8); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(vImgHf, reflecty_v); - - gy = _mm256_sub_ps(gy, v0p5fp8); - - _mm256_sub_ps(gy, v0p5fp8); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); - __m256i i_offset = _mm256_cvtps_epi32(offset); - for (int q = 0; q < bottom_blob.c; q++) - { - __m256 _v 
= mask_gather_ps256(bottom_blob.channel(q), i_offset, *(__m256*)_ps256_n1); - - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = ((sample_x + 1) * w - 1) / 2.f; - sample_y = ((sample_y + 1) * h - 1) / 2.f; - - sample_x = floor(sample_x + 0.5f); - sample_y = floor(sample_y + 0.5f); - - sample_x = abs(sample_x + 0.5f); - sample_x = w - abs(sample_x - w) - 0.5; - - sample_y = abs(sample_y + 0.5f); - sample_y = h - abs(sample_y - h) - 0.5; - - int x0 = std::min(w - 1.0f, std::max(sample_x, 0.0f)); - int y0 = std::min(h - 1.0f, std::max(sample_y, 0.0f)); - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - - top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0]; - } - } - } + gridsample_2d_nearest_align0_reflection_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } else { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - - // compute coord - { - // x - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(border_x, reflectx_v); - - // y - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(border_y, reflecty_v); - } - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); - __m256i i_offset = _mm256_cvtps_epi32(offset); - for (int q = 0; q < bottom_blob.c; q++) - { - __m256 _v = mask_gather_ps256(bottom_blob.channel(q), i_offset, *(__m256*)_ps256_n1); - - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = (sample_x + 1) / 2.f * (w - 1); - sample_y = (sample_y + 1) / 2.f * (h - 1); - - sample_x = floor(sample_x + 0.5f); - sample_y = floor(sample_y + 0.5f); - - sample_x = abs(sample_x); - int x0 = (w - 1) - abs(sample_x - (w - 1)); - - sample_y = abs(sample_y); - int y0 = (h - 1) - abs(sample_y - (h - 1)); - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - - top_blob.channel(q).row(y)[x / 2] = image.row(y0)[x0]; 
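// mask_gather_ps256(src, offsets, mask), used throughout the vector paths above, fetches
// eight pixels at once and forces lanes with a cleared mask to 0 instead of touching
// memory. Per lane it behaves like this scalar model (the real helper presumably maps to
// _mm256_mask_i32gather_ps on AVX2 and a lane-by-lane fallback otherwise):
static inline float mask_gather_lane(const float* src, int offset, bool lane_valid)
{
    return lane_valid ? src[offset] : 0.f;
}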
- } - } - } + gridsample_2d_nearest_align1_reflection_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } } else @@ -2375,1070 +995,48 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector -1) & (x1 < w); - bool y1_in_range = (y1 > -1) & (y1 < h); - bool x0_in_range = (x0 > -1) & (x0 < w); - bool y0_in_range = (y0 > -1) & (y0 < h); - bool x2_in_range = (x2 > -1) & (x2 < w); - bool y2_in_range = (y2 > -1) & (y2 < h); - bool x3_in_range = (x3 > -1) & (x3 < w); - bool y3_in_range = (y3 > -1) & (y3 < h); - - bool v00_in_range = x0_in_range & y0_in_range; - bool v01_in_range = x1_in_range & y0_in_range; - bool v02_in_range = x2_in_range & y0_in_range; - bool v03_in_range = x3_in_range & y0_in_range; - bool v10_in_range = x0_in_range & y1_in_range; - bool v11_in_range = x1_in_range & y1_in_range; - bool v12_in_range = x2_in_range & y1_in_range; - bool v13_in_range = x3_in_range & y1_in_range; - bool v20_in_range = x0_in_range & y2_in_range; - bool v21_in_range = x1_in_range & y2_in_range; - bool v22_in_range = x2_in_range & y2_in_range; - bool v23_in_range = x3_in_range & y2_in_range; - bool v30_in_range = x0_in_range & y3_in_range; - bool v31_in_range = x1_in_range & y3_in_range; - bool v32_in_range = x2_in_range & y3_in_range; - bool v33_in_range = x3_in_range & y3_in_range; - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - - float v00 = v00_in_range ? image.row(y0)[x0] : 0; - float v01 = v01_in_range ? image.row(y0)[x1] : 0; - float v02 = v02_in_range ? image.row(y0)[x2] : 0; - float v03 = v03_in_range ? image.row(y0)[x3] : 0; - float v10 = v10_in_range ? image.row(y1)[x0] : 0; - float v11 = v11_in_range ? image.row(y1)[x1] : 0; - float v12 = v12_in_range ? image.row(y1)[x2] : 0; - float v13 = v13_in_range ? image.row(y1)[x3] : 0; - float v20 = v20_in_range ? image.row(y2)[x0] : 0; - float v21 = v21_in_range ? image.row(y2)[x1] : 0; - float v22 = v22_in_range ? image.row(y2)[x2] : 0; - float v23 = v23_in_range ? image.row(y2)[x3] : 0; - float v30 = v30_in_range ? image.row(y3)[x0] : 0; - float v31 = v31_in_range ? image.row(y3)[x1] : 0; - float v32 = v32_in_range ? image.row(y3)[x2] : 0; - float v33 = v33_in_range ? 
image.row(y3)[x3] : 0; - - float x_coeffs[4]; - float y_coeffs[4]; - interpolate_cubic(sample_x - x1, x_coeffs); - interpolate_cubic(sample_y - y1, y_coeffs); - - float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; - float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; - float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; - float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; - - top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; - } - } - } + gridsample_2d_bicubic_align0_zeros_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } else { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - } - - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); - - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); - - __m256 coefficients[4]; - - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(gx0, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx0, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(gx1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx1, _CMP_GT_OS)); - __m256 x2_in_range = _mm256_and_ps(_mm256_cmp_ps(gx2, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx2, _CMP_GT_OS)); - __m256 x3_in_range = _mm256_and_ps(_mm256_cmp_ps(gx3, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx3, _CMP_GT_OS)); - - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - __m256 v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - - __m256 y_in_range = _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS)); - - v0_in_range[i] = _mm256_and_ps(x0_in_range, y_in_range); - v1_in_range[i] = _mm256_and_ps(x1_in_range, y_in_range); - v2_in_range[i] = _mm256_and_ps(x2_in_range, y_in_range); - v3_in_range[i] = _mm256_and_ps(x3_in_range, y_in_range); - - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); - - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] 
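// Illustrative scalar sketch (not part of this patch) of the cubic weights that
// interpolate_cubic() / cubic_interp1d_p8() evaluate for a fractional offset t in [0, 1).
// A = -0.75 is the usual cubic-convolution constant (the value PyTorch uses for bicubic
// grid_sample); check interpolate_cubic for the exact constant used by ncnn.
static inline void cubic_coeffs_sketch(float t, float coeffs[4])
{
    const float A = -0.75f;
    // weights for the taps at offsets -1, 0, +1, +2 from floor(x)
    coeffs[0] = ((A * (t + 1) - 5 * A) * (t + 1) + 8 * A) * (t + 1) - 4 * A;
    coeffs[1] = ((A + 2) * t - (A + 3)) * t * t + 1;
    coeffs[2] = ((A + 2) * (1 - t) - (A + 3)) * (1 - t) * (1 - t) + 1;
    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; // weights sum to 1
}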
= _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < bottom_blob.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m256 x0_val = mask_gather_ps256(bottom_blob.channel(q), v0_offset[i], v0_in_range[i]); - __m256 x1_val = mask_gather_ps256(bottom_blob.channel(q), v1_offset[i], v1_in_range[i]); - __m256 x2_val = mask_gather_ps256(bottom_blob.channel(q), v2_offset[i], v2_in_range[i]); - __m256 x3_val = mask_gather_ps256(bottom_blob.channel(q), v3_offset[i], v3_in_range[i]); - - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = (sample_x + 1) / 2.f * (w - 1); - sample_y = (sample_y + 1) / 2.f * (h - 1); - - int x1 = floor(sample_x); - int y1 = floor(sample_y); - int x0 = x1 - 1; - int y0 = y1 - 1; - int x2 = x1 + 1; - int y2 = y1 + 1; - int x3 = x1 + 2; - int y3 = y1 + 2; - - bool x1_in_range = (x1 > -1) & (x1 < w); - bool y1_in_range = (y1 > -1) & (y1 < h); - bool x0_in_range = (x0 > -1) & (x0 < w); - bool y0_in_range = (y0 > -1) & (y0 < h); - bool x2_in_range = (x2 > -1) & (x2 < w); - bool y2_in_range = (y2 > -1) & (y2 < h); - bool x3_in_range = (x3 > -1) & (x3 < w); - bool y3_in_range = (y3 > -1) & (y3 < h); - - bool v00_in_range = x0_in_range & y0_in_range; - bool v01_in_range = x1_in_range & y0_in_range; - bool v02_in_range = x2_in_range & y0_in_range; - bool v03_in_range = x3_in_range & y0_in_range; - bool v10_in_range = x0_in_range & y1_in_range; - bool v11_in_range = x1_in_range & y1_in_range; - bool v12_in_range = x2_in_range & y1_in_range; - bool v13_in_range = x3_in_range & y1_in_range; - bool v20_in_range = x0_in_range & y2_in_range; - bool v21_in_range = x1_in_range & y2_in_range; - bool v22_in_range = x2_in_range & y2_in_range; - bool v23_in_range = x3_in_range & y2_in_range; - bool v30_in_range = x0_in_range & y3_in_range; - bool v31_in_range = x1_in_range & y3_in_range; - bool v32_in_range = x2_in_range & y3_in_range; - bool v33_in_range = x3_in_range & y3_in_range; - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - - float v00 = v00_in_range ? image.row(y0)[x0] : 0; - float v01 = v01_in_range ? image.row(y0)[x1] : 0; - float v02 = v02_in_range ? image.row(y0)[x2] : 0; - float v03 = v03_in_range ? image.row(y0)[x3] : 0; - float v10 = v10_in_range ? image.row(y1)[x0] : 0; - float v11 = v11_in_range ? image.row(y1)[x1] : 0; - float v12 = v12_in_range ? image.row(y1)[x2] : 0; - float v13 = v13_in_range ? image.row(y1)[x3] : 0; - float v20 = v20_in_range ? image.row(y2)[x0] : 0; - float v21 = v21_in_range ? image.row(y2)[x1] : 0; - float v22 = v22_in_range ? image.row(y2)[x2] : 0; - float v23 = v23_in_range ? image.row(y2)[x3] : 0; - float v30 = v30_in_range ? image.row(y3)[x0] : 0; - float v31 = v31_in_range ? image.row(y3)[x1] : 0; - float v32 = v32_in_range ? image.row(y3)[x2] : 0; - float v33 = v33_in_range ? 
image.row(y3)[x3] : 0; - - float x_coeffs[4]; - float y_coeffs[4]; - interpolate_cubic(sample_x - x1, x_coeffs); - interpolate_cubic(sample_y - y1, y_coeffs); - - float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; - float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; - float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; - float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; - - top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; - } - } - } + gridsample_2d_bicubic_align1_zeros_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } } else if (padding_mode == 2) { if (align_corner == 0) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); - - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); - - __m256 coefficients[4]; - - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - - gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); - gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); - gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); - gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); - - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); - - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < bottom_blob.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m256 x0_val = mask_gather_ps256(bottom_blob.channel(q), v0_offset[i], *(__m256*)_ps256_n1); - __m256 x1_val = mask_gather_ps256(bottom_blob.channel(q), v1_offset[i], *(__m256*)_ps256_n1); - __m256 x2_val = mask_gather_ps256(bottom_blob.channel(q), 
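// Illustrative scalar sketch (not part of this patch): with the 4x4 neighbourhood and the
// two coefficient sets in hand, bicubic sampling is the separable combine used in the
// scalar tail above. Helper name hypothetical.
static inline float bicubic_combine(const float v[4][4], const float x_coeffs[4], const float y_coeffs[4])
{
    float rows[4];
    for (int i = 0; i < 4; i++) // interpolate each of the four rows along x
        rows[i] = v[i][0] * x_coeffs[0] + v[i][1] * x_coeffs[1] + v[i][2] * x_coeffs[2] + v[i][3] * x_coeffs[3];

    // then interpolate the four row results along y
    return rows[0] * y_coeffs[0] + rows[1] * y_coeffs[1] + rows[2] * y_coeffs[2] + rows[3] * y_coeffs[3];
}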
v2_offset[i], *(__m256*)_ps256_n1); - __m256 x3_val = mask_gather_ps256(bottom_blob.channel(q), v3_offset[i], *(__m256*)_ps256_n1); - - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = ((sample_x + 1) * w - 1) / 2.f; - sample_y = ((sample_y + 1) * h - 1) / 2.f; - - int x_floor = floor(sample_x); - int y_floor = floor(sample_y); - - int x1 = x_floor; - int y1 = y_floor; - int x0 = x1 - 1; - int y0 = y1 - 1; - int x2 = x1 + 1; - int y2 = y1 + 1; - int x3 = x1 + 2; - int y3 = y1 + 2; - - x1 = std::min(w - 1, std::max(x1, 0)); - y1 = std::min(h - 1, std::max(y1, 0)); - x0 = std::min(w - 1, std::max(x0, 0)); - y0 = std::min(h - 1, std::max(y0, 0)); - x2 = std::min(w - 1, std::max(x2, 0)); - y2 = std::min(h - 1, std::max(y2, 0)); - x3 = std::min(w - 1, std::max(x3, 0)); - y3 = std::min(h - 1, std::max(y3, 0)); - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - - float v00 = image.row(y0)[x0]; - float v01 = image.row(y0)[x1]; - float v02 = image.row(y0)[x2]; - float v03 = image.row(y0)[x3]; - float v10 = image.row(y1)[x0]; - float v11 = image.row(y1)[x1]; - float v12 = image.row(y1)[x2]; - float v13 = image.row(y1)[x3]; - float v20 = image.row(y2)[x0]; - float v21 = image.row(y2)[x1]; - float v22 = image.row(y2)[x2]; - float v23 = image.row(y2)[x3]; - float v30 = image.row(y3)[x0]; - float v31 = image.row(y3)[x1]; - float v32 = image.row(y3)[x2]; - float v33 = image.row(y3)[x3]; - - float x_coeffs[4]; - float y_coeffs[4]; - interpolate_cubic(sample_x - x_floor, x_coeffs); - interpolate_cubic(sample_y - y_floor, y_coeffs); - - float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; - float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; - float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; - float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; - - top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; - } - } - } + gridsample_2d_bicubic_align0_border_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } else { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, 
*(__m256*)_ps256_1)); - - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); - - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); - - __m256 coefficients[4]; - - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - - gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); - gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); - gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); - gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); - - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); - - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < bottom_blob.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m256 x0_val = mask_gather_ps256(bottom_blob.channel(q), v0_offset[i], *(__m256*)_ps256_n1); - __m256 x1_val = mask_gather_ps256(bottom_blob.channel(q), v1_offset[i], *(__m256*)_ps256_n1); - __m256 x2_val = mask_gather_ps256(bottom_blob.channel(q), v2_offset[i], *(__m256*)_ps256_n1); - __m256 x3_val = mask_gather_ps256(bottom_blob.channel(q), v3_offset[i], *(__m256*)_ps256_n1); - - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = (sample_x + 1) / 2.f * (w - 1); - sample_y = (sample_y + 1) / 2.f * (h - 1); - - int x_floor = floor(sample_x); - int y_floor = floor(sample_y); - - int x1 = x_floor; - int y1 = y_floor; - int x0 = x1 - 1; - int y0 = y1 - 1; - int x2 = x1 + 1; - int y2 = y1 + 1; - int x3 = x1 + 2; - int y3 = y1 + 2; - - x1 = std::min(w - 1, std::max(x1, 0)); - y1 = std::min(h - 1, std::max(y1, 0)); - x0 = std::min(w - 1, std::max(x0, 0)); - y0 = std::min(h - 1, std::max(y0, 0)); - x2 = std::min(w - 1, std::max(x2, 0)); - y2 = std::min(h - 1, std::max(y2, 0)); - x3 = std::min(w - 1, std::max(x3, 0)); - y3 = std::min(h - 1, std::max(y3, 0)); - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - - float v00 = image.row(y0)[x0]; - float v01 = image.row(y0)[x1]; - float v02 = image.row(y0)[x2]; - float v03 = image.row(y0)[x3]; - float v10 = image.row(y1)[x0]; - float v11 = image.row(y1)[x1]; - float v12 = image.row(y1)[x2]; - float v13 = image.row(y1)[x3]; - float v20 = image.row(y2)[x0]; - float v21 = image.row(y2)[x1]; - float v22 = image.row(y2)[x2]; - float v23 = image.row(y2)[x3]; - float v30 = image.row(y3)[x0]; - float v31 = image.row(y3)[x1]; - 
float v32 = image.row(y3)[x2]; - float v33 = image.row(y3)[x3]; - - float x_coeffs[4]; - float y_coeffs[4]; - interpolate_cubic(sample_x - x_floor, x_coeffs); - interpolate_cubic(sample_y - y_floor, y_coeffs); - - float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; - float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; - float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; - float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; - - top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; - } - } - } + gridsample_2d_bicubic_align1_border_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } } else if (padding_mode == 3) { if (align_corner == 0) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); - - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); - - __m256 coefficients[4]; - - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - const __m256 v0p5fp8 = _mm256_set1_ps(0.5f); - { - // x0 - gx0 = _mm256_add_ps(gx0, v0p5fp8); - - gx0 = _mm256_and_ps(gx0, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx0 = _mm256_sub_ps(vImgWf, reflectx0_v); - - gx0 = _mm256_sub_ps(gx0, v0p5fp8); - - _mm256_sub_ps(gx0, v0p5fp8); - - gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); - - // x1 - gx1 = _mm256_add_ps(gx1, v0p5fp8); + gridsample_2d_bicubic_align0_reflection_blob_pack1(bottom_blob, top_blob, grid_p1, opt); + } + else + { + gridsample_2d_bicubic_align1_reflection_blob_pack1(bottom_blob, top_blob, grid_p1, opt); + } + } + else + { + NCNN_LOGE("gridsample padding_mode error\n"); + return -100; + } + } + } - gx1 = _mm256_and_ps(gx1, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx1_v = _mm256_and_ps(_mm256_sub_ps(gx1, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx1 = _mm256_sub_ps(vImgWf, reflectx1_v); - - gx1 = _mm256_sub_ps(gx1, v0p5fp8); - - _mm256_sub_ps(gx1, v0p5fp8); - - gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); - - // x2 - gx2 = _mm256_add_ps(gx2, v0p5fp8); - - gx2 = _mm256_and_ps(gx2, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx2_v = 
_mm256_and_ps(_mm256_sub_ps(gx2, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx2 = _mm256_sub_ps(vImgWf, reflectx2_v); - - gx2 = _mm256_sub_ps(gx2, v0p5fp8); - - _mm256_sub_ps(gx2, v0p5fp8); - - gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); - - // x3 - gx3 = _mm256_add_ps(gx3, v0p5fp8); - - gx3 = _mm256_and_ps(gx3, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx3_v = _mm256_and_ps(_mm256_sub_ps(gx3, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx3 = _mm256_sub_ps(vImgWf, reflectx3_v); - - gx3 = _mm256_sub_ps(gx3, v0p5fp8); - - _mm256_sub_ps(gx3, v0p5fp8); - - gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); - } - - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - - { - //y - gy = _mm256_add_ps(gy, v0p5fp8); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(vImgHf, reflecty_v); - - gy = _mm256_sub_ps(gy, v0p5fp8); - - _mm256_sub_ps(gy, v0p5fp8); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } - - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); - - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < bottom_blob.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m256 x0_val = mask_gather_ps256(bottom_blob.channel(q), v0_offset[i], *(__m256*)_ps256_n1); - __m256 x1_val = mask_gather_ps256(bottom_blob.channel(q), v1_offset[i], *(__m256*)_ps256_n1); - __m256 x2_val = mask_gather_ps256(bottom_blob.channel(q), v2_offset[i], *(__m256*)_ps256_n1); - __m256 x3_val = mask_gather_ps256(bottom_blob.channel(q), v3_offset[i], *(__m256*)_ps256_n1); - - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = ((sample_x + 1) * w - 1) / 2.f; - sample_y = ((sample_y + 1) * h - 1) / 2.f; - - int x_floor = floor(sample_x); - int y_floor = floor(sample_y); - - int x1 = x_floor; - int y1 = y_floor; - int x0 = x1 - 1; - int y0 = y1 - 1; - int x2 = x1 + 1; - int y2 = y1 + 1; - int x3 = x1 + 2; - int y3 = y1 + 2; - - x0 = static_cast(reflect_coord(x0 + 0.5, w) - 0.5); - - y0 = static_cast(reflect_coord(y0 + 0.5, h) - 0.5); - - x0 = std::min(w - 1, std::max(x0, 0)); - y0 = std::min(h - 1, std::max(y0, 0)); - - x1 = static_cast(reflect_coord(x1 + 0.5, w) - 0.5); - - y1 = static_cast(reflect_coord(y1 + 0.5, h) - 0.5); - - x1 = std::min(w - 1, std::max(x1, 0)); - y1 = std::min(h - 1, std::max(y1, 0)); - - x2 = static_cast(reflect_coord(x2 + 0.5, w) - 0.5); - - y2 = static_cast(reflect_coord(y2 + 0.5, h) - 0.5); - - x2 = std::min(w - 1, std::max(x2, 0)); - y2 = std::min(h - 1, std::max(y2, 0)); - - x3 = static_cast(reflect_coord(x3 + 
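// reflect_coord() is called by the scalar bicubic tails but not defined in this hunk.
// Judging from the abs()-based reflection used elsewhere in this file, it is presumably
// the single-bounce triangular reflection of x into [0, high]; treat this as a sketch:
static inline float reflect_coord_sketch(float x, int high)
{
    x = fabsf(x);               // mirror across 0
    x = high - fabsf(x - high); // mirror back across high
    return x;
}
// align_corners == 0 mirrors across the half-pixel edges, hence the calls above use
// reflect_coord(x + 0.5, w) - 0.5 followed by a clamp to [0, w - 1], while
// align_corners == 1 mirrors across the pixel centres via reflect_coord(x, w - 1).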
0.5, w) - 0.5); - - y3 = static_cast(reflect_coord(y3 + 0.5, h) - 0.5); - - x3 = std::min(w - 1, std::max(x3, 0)); - y3 = std::min(h - 1, std::max(y3, 0)); - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - - float v00 = image.row(y0)[x0]; - float v01 = image.row(y0)[x1]; - float v02 = image.row(y0)[x2]; - float v03 = image.row(y0)[x3]; - float v10 = image.row(y1)[x0]; - float v11 = image.row(y1)[x1]; - float v12 = image.row(y1)[x2]; - float v13 = image.row(y1)[x3]; - float v20 = image.row(y2)[x0]; - float v21 = image.row(y2)[x1]; - float v22 = image.row(y2)[x2]; - float v23 = image.row(y2)[x3]; - float v30 = image.row(y3)[x0]; - float v31 = image.row(y3)[x1]; - float v32 = image.row(y3)[x2]; - float v33 = image.row(y3)[x3]; - - float x_coeffs[4]; - float y_coeffs[4]; - interpolate_cubic(sample_x - x_floor, x_coeffs); - interpolate_cubic(sample_y - y_floor, y_coeffs); - - float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; - float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; - float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; - float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; - - top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; - } - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); - - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); - - __m256 coefficients[4]; - - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - { - // x0 - gx0 = _mm256_and_ps(gx0, *(__m256*)_ps256_inv_sign_mask); - __m256 reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, border_x), *(__m256*)_ps256_inv_sign_mask); - gx0 = _mm256_sub_ps(border_x, reflectx0_v); - - // x1 - gx1 = _mm256_and_ps(gx1, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx1_v = _mm256_and_ps(_mm256_sub_ps(gx1, border_x), *(__m256*)_ps256_inv_sign_mask); - gx1 = _mm256_sub_ps(border_x, reflectx1_v); - - // x2 - gx2 = _mm256_and_ps(gx2, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx2_v = _mm256_and_ps(_mm256_sub_ps(gx2, border_x), *(__m256*)_ps256_inv_sign_mask); - gx2 = _mm256_sub_ps(border_x, reflectx2_v); - - // x3 - gx3 = _mm256_and_ps(gx3, 
*(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx3_v = _mm256_and_ps(_mm256_sub_ps(gx3, border_x), *(__m256*)_ps256_inv_sign_mask); - gx3 = _mm256_sub_ps(border_x, reflectx3_v); - } - - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - - { - //y - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(border_y, reflecty_v); - } - - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); - - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < bottom_blob.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m256 x0_val = mask_gather_ps256(bottom_blob.channel(q), v0_offset[i], *(__m256*)_ps256_n1); - __m256 x1_val = mask_gather_ps256(bottom_blob.channel(q), v1_offset[i], *(__m256*)_ps256_n1); - __m256 x2_val = mask_gather_ps256(bottom_blob.channel(q), v2_offset[i], *(__m256*)_ps256_n1); - __m256 x3_val = mask_gather_ps256(bottom_blob.channel(q), v3_offset[i], *(__m256*)_ps256_n1); - - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm256_storeu_ps(top_blob.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = (sample_x + 1) / 2.f * (w - 1); - sample_y = (sample_y + 1) / 2.f * (h - 1); - - int x_floor = floor(sample_x); - int y_floor = floor(sample_y); - - int x1 = x_floor; - int y1 = y_floor; - int x0 = x1 - 1; - int y0 = y1 - 1; - int x2 = x1 + 1; - int y2 = y1 + 1; - int x3 = x1 + 2; - int y3 = y1 + 2; - - x0 = static_cast(reflect_coord(x0, w - 1)); - y0 = static_cast(reflect_coord(y0, h - 1)); - x1 = static_cast(reflect_coord(x1, w - 1)); - y1 = static_cast(reflect_coord(y1, h - 1)); - x2 = static_cast(reflect_coord(x2, w - 1)); - y2 = static_cast(reflect_coord(y2, h - 1)); - x3 = static_cast(reflect_coord(x3, w - 1)); - y3 = static_cast(reflect_coord(y3, h - 1)); - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - - float v00 = image.row(y0)[x0]; - float v01 = image.row(y0)[x1]; - float v02 = image.row(y0)[x2]; - float v03 = image.row(y0)[x3]; - float v10 = image.row(y1)[x0]; - float v11 = image.row(y1)[x1]; - float v12 = image.row(y1)[x2]; - float v13 = image.row(y1)[x3]; - float v20 = image.row(y2)[x0]; - float v21 = image.row(y2)[x1]; - float v22 = image.row(y2)[x2]; - float v23 = image.row(y2)[x3]; - float v30 = image.row(y3)[x0]; - float v31 = image.row(y3)[x1]; - float v32 = image.row(y3)[x2]; - float v33 = image.row(y3)[x3]; - - float x_coeffs[4]; - float y_coeffs[4]; - interpolate_cubic(sample_x - x_floor, x_coeffs); - interpolate_cubic(sample_y - y_floor, y_coeffs); - - float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; - float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * 
x_coeffs[2] + v13 * x_coeffs[3]; - float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; - float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; - - top_blob.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; - } - } - } - } - } - else - { - NCNN_LOGE("gridsample padding_mode error\n"); - return -100; - } - } - } - - if (dims == 4) - { -#if __AVX__ - const __m256 vImgDf = _mm256_set1_ps(d); -#if __AVX2__ - const __m256i vImgDi = _mm256_set1_epi32(d); -#endif // __AVX2__ -#endif // __AVX__ - int grid_size = grid_p1.w * grid_p1.h * grid_p1.d; - - top_blob.create(grid_p1.h, grid_p1.d, grid_p1.c, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + if (dims == 4) + { + top_blob.create(grid_p1.h, grid_p1.d, grid_p1.c, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; if (sample_type == 1) { @@ -3446,1445 +1044,33 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(top_blob.channel(q).depth(y).data) + x / 3, _v); - } - } - - nn = grid_size % 24; - -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = ((gx + 1) * w - 1) / 2.f; - gy = ((gy + 1) * h - 1) / 2.f; - gz = ((gz + 1) * d - 1) / 2.f; - - // bilinear interpolate - int x0 = (int)floor(gx); - int y0 = (int)floor(gy); - int z0 = (int)floor(gz); - int x1 = x0 + 1; - int y1 = y0 + 1; - int z1 = z0 + 1; - - bool x0_in_range = (x0 > -1) & (x0 < w); - bool y0_in_range = (y0 > -1) & (y0 < h); - bool z0_in_range = (z0 > -1) & (z0 < d); - bool x1_in_range = (x1 > -1) & (x1 < w); - bool y1_in_range = (y1 > -1) & (y1 < h); - bool z1_in_range = (z1 > -1) & (z1 < d); - - bool v00_in_range = x0_in_range & y0_in_range; - bool v01_in_range = x1_in_range & y0_in_range; - bool v10_in_range = x0_in_range & y1_in_range; - bool v11_in_range = x1_in_range & y1_in_range; - - bool v000_in_range = v00_in_range & z0_in_range; - bool v010_in_range = v10_in_range & z0_in_range; - bool v100_in_range = v00_in_range & z1_in_range; - bool v110_in_range = v10_in_range & z1_in_range; - - bool v001_in_range = v01_in_range & z0_in_range; - bool v011_in_range = v11_in_range & z0_in_range; - bool v101_in_range = v01_in_range & z1_in_range; - bool v111_in_range = v11_in_range & z1_in_range; - - float alpha = gx - x0; - float beta = gy - y0; - float gamma = gz - z0; - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - float v000 = v000_in_range ? image.depth(z0).row(y0)[x0] : 0; - float v010 = v010_in_range ? image.depth(z0).row(y1)[x0] : 0; - float v100 = v100_in_range ? image.depth(z1).row(y0)[x0] : 0; - float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; - - float v001 = v001_in_range ? image.depth(z0).row(y0)[x1] : 0; - float v011 = v011_in_range ? image.depth(z0).row(y1)[x1] : 0; - float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; - float v111 = v111_in_range ? 
image.depth(z1).row(y1)[x1] : 0; - - float v00 = v000 * (1 - alpha) + v001 * alpha; - float v01 = v010 * (1 - alpha) + v011 * alpha; - float v10 = v100 * (1 - alpha) + v101 * alpha; - float v11 = v110 * (1 - alpha) + v111 * alpha; - - float v0 = v00 * (1 - beta) + v01 * beta; - float v1 = v10 * (1 - beta) + v11 * beta; - - top_blob.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; - } - } - } + gridsample_3d_bilinear_align0_zeros_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } else { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - // z - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - __m256 z_t = _mm256_floor_ps(gz); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); - - __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - - tnw = _mm256_mul_ps(b, nw); - tne = _mm256_mul_ps(b, ne); - tsw = _mm256_mul_ps(b, sw); - tse = _mm256_mul_ps(b, se); - - bnw = _mm256_mul_ps(t, nw); - bne = _mm256_mul_ps(t, ne); - bsw = _mm256_mul_ps(t, sw); - bse = _mm256_mul_ps(t, se); - } - -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i z0 = _mm256_cvtps_epi32(z_t); - - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); - - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - __m256i z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z0)); - __m256i z1_in_range = 
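// Illustrative scalar sketch (not part of this patch): the dims == 4 path is the 3D
// extension of the bilinear case, blending the eight corners of the surrounding cell with
// the fractional offsets alpha/beta/gamma (zeros padding shown, as in the tail above).
// Helper name hypothetical.
static inline float trilinear_sample_zeros(const Mat& image, float sx, float sy, float sz)
{
    int x0 = (int)floorf(sx), y0 = (int)floorf(sy), z0 = (int)floorf(sz);
    float alpha = sx - x0, beta = sy - y0, gamma = sz - z0;

    auto at = [&](int z, int y, int x) -> float {
        if (x < 0 || x >= image.w || y < 0 || y >= image.h || z < 0 || z >= image.d)
            return 0.f; // zeros padding
        return image.depth(z).row(y)[x];
    };

    float v00 = at(z0, y0, x0) * (1 - alpha) + at(z0, y0, x0 + 1) * alpha;
    float v01 = at(z0, y0 + 1, x0) * (1 - alpha) + at(z0, y0 + 1, x0 + 1) * alpha;
    float v10 = at(z0 + 1, y0, x0) * (1 - alpha) + at(z0 + 1, y0, x0 + 1) * alpha;
    float v11 = at(z0 + 1, y0 + 1, x0) * (1 - alpha) + at(z0 + 1, y0 + 1, x0 + 1) * alpha;

    float v0 = v00 * (1 - beta) + v01 * beta;
    float v1 = v10 * (1 - beta) + v11 * beta;
    return v0 * (1 - gamma) + v1 * gamma;
}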
_mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); - - __m256i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - v000_in_range = _mm256_and_si256(v00_in_range, z0_in_range); - v010_in_range = _mm256_and_si256(v01_in_range, z0_in_range); - v100_in_range = _mm256_and_si256(v10_in_range, z0_in_range); - v110_in_range = _mm256_and_si256(v11_in_range, z0_in_range); - - v001_in_range = _mm256_and_si256(v00_in_range, z1_in_range); - v011_in_range = _mm256_and_si256(v01_in_range, z1_in_range); - v101_in_range = _mm256_and_si256(v10_in_range, z1_in_range); - v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); - } - - __m256i i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)); - __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, *(__m256i*)_pi32_256_1); - __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, vImgWi); - __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, *(__m256i*)_pi32_256_1); - - __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), i_tnw_offset); - __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, *(__m256i*)_pi32_256_1); - __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, vImgWi); - __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, *(__m256i*)_pi32_256_1); -#else - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); - - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - __m256 z0_in_range = _mm256_and_ps(_mm256_cmp_ps(z_t, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z_t, _CMP_GT_OS)); - __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); - - __m256 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); - __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); - __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v000_in_range = _mm256_and_ps(v00_in_range, z0_in_range); - v010_in_range = _mm256_and_ps(v01_in_range, z0_in_range); - v100_in_range = _mm256_and_ps(v10_in_range, z0_in_range); - v110_in_range = _mm256_and_ps(v11_in_range, z0_in_range); - - v001_in_range = _mm256_and_ps(v00_in_range, z1_in_range); - v011_in_range = _mm256_and_ps(v01_in_range, z1_in_range); - v101_in_range = _mm256_and_ps(v10_in_range, z1_in_range); - v111_in_range = _mm256_and_ps(v11_in_range, 
z1_in_range); - } - - __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, *(__m256*)_ps256_1); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, vImgWf); - __m256 tse_offset = _mm256_add_ps(tsw_offset, *(__m256*)_ps256_1); - - __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(vImgWf, vImgHf), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, *(__m256*)_ps256_1); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, vImgWf); - __m256 bse_offset = _mm256_add_ps(bsw_offset, *(__m256*)_ps256_1); - - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); - __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); - __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); - __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); - - __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); - __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); - __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); - __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); -#endif // __AVX2__ - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); -#if __AVX2__ - __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, _mm256_castsi256_ps(v000_in_range)); - __m256 tne_val = mask_gather_ps256(image, i_tne_offset, _mm256_castsi256_ps(v100_in_range)); - __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, _mm256_castsi256_ps(v010_in_range)); - __m256 tse_val = mask_gather_ps256(image, i_tse_offset, _mm256_castsi256_ps(v110_in_range)); - - __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, _mm256_castsi256_ps(v001_in_range)); - __m256 bne_val = mask_gather_ps256(image, i_bne_offset, _mm256_castsi256_ps(v101_in_range)); - __m256 bsw_val = mask_gather_ps256(image, i_bsw_offset, _mm256_castsi256_ps(v011_in_range)); - __m256 bse_val = mask_gather_ps256(image, i_bse_offset, _mm256_castsi256_ps(v111_in_range)); -#else - __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, v000_in_range); - __m256 tne_val = mask_gather_ps256(image, i_tne_offset, v100_in_range); - __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, v010_in_range); - __m256 tse_val = mask_gather_ps256(image, i_tse_offset, v110_in_range); - - __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, v001_in_range); - __m256 bne_val = mask_gather_ps256(image, i_bne_offset, v101_in_range); - __m256 bsw_val = mask_gather_ps256(image, i_bsw_offset, v011_in_range); - __m256 bse_val = mask_gather_ps256(image, i_bse_offset, v111_in_range); -#endif - - __m256 _v = _mm256_mul_ps(tnw_val, tnw); - _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); - - _mm256_storeu_ps(static_cast(top_blob.channel(q).depth(y).data) + x / 3, _v); - } - } - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = (gx + 1) / 2.f * (w - 1); - gy = (gy + 1) / 2.f * (h - 1); - gz = (gz + 1) / 2.f * (d - 1); - - // bilinear interpolate - int x0 = (int)floor(gx); - int y0 = (int)floor(gy); - int z0 = (int)floor(gz); - int x1 = x0 + 1; - int y1 = y0 + 1; - int z1 = z0 + 1; - - bool x0_in_range = (x0 > -1) & (x0 < w); - bool 
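// Note on this hunk: the bilinear/nearest branches that used to inline AVX and
// scalar sampling code here now dispatch to one small helper per combination of
// interpolation, align_corners and padding_mode, following the naming scheme
// gridsample_3d_<bilinear|nearest>_align<0|1>_<zeros|border|reflection>_blob_pack1
// declared in the gridsample_*_pack1.h headers of this series.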
y0_in_range = (y0 > -1) & (y0 < h); - bool z0_in_range = (z0 > -1) & (z0 < d); - bool x1_in_range = (x1 > -1) & (x1 < w); - bool y1_in_range = (y1 > -1) & (y1 < h); - bool z1_in_range = (z1 > -1) & (z1 < d); - - bool v00_in_range = x0_in_range & y0_in_range; - bool v01_in_range = x1_in_range & y0_in_range; - bool v10_in_range = x0_in_range & y1_in_range; - bool v11_in_range = x1_in_range & y1_in_range; - - bool v000_in_range = v00_in_range & z0_in_range; - bool v010_in_range = v10_in_range & z0_in_range; - bool v100_in_range = v00_in_range & z1_in_range; - bool v110_in_range = v10_in_range & z1_in_range; - - bool v001_in_range = v01_in_range & z0_in_range; - bool v011_in_range = v11_in_range & z0_in_range; - bool v101_in_range = v01_in_range & z1_in_range; - bool v111_in_range = v11_in_range & z1_in_range; - - float alpha = gx - x0; - float beta = gy - y0; - float gamma = gz - z0; - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - float v000 = v000_in_range ? image.depth(z0).row(y0)[x0] : 0; - float v010 = v010_in_range ? image.depth(z0).row(y1)[x0] : 0; - float v100 = v100_in_range ? image.depth(z1).row(y0)[x0] : 0; - float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; - - float v001 = v001_in_range ? image.depth(z0).row(y0)[x1] : 0; - float v011 = v011_in_range ? image.depth(z0).row(y1)[x1] : 0; - float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; - float v111 = v111_in_range ? image.depth(z1).row(y1)[x1] : 0; - - float v00 = v000 * (1 - alpha) + v001 * alpha; - float v01 = v010 * (1 - alpha) + v011 * alpha; - float v10 = v100 * (1 - alpha) + v101 * alpha; - float v11 = v110 * (1 - alpha) + v111 * alpha; - - float v0 = v00 * (1 - beta) + v01 * beta; - float v1 = v10 * (1 - beta) + v11 * beta; - - top_blob.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; - } - } - } + gridsample_3d_bilinear_align1_zeros_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } } else if (padding_mode == 2) { if (align_corner == 0) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - // z - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const 
__m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - __m256 z_t = _mm256_floor_ps(gz); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); - - __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - - tnw = _mm256_mul_ps(b, nw); - tne = _mm256_mul_ps(b, ne); - tsw = _mm256_mul_ps(b, sw); - tse = _mm256_mul_ps(b, se); - - bnw = _mm256_mul_ps(t, nw); - bne = _mm256_mul_ps(t, ne); - bsw = _mm256_mul_ps(t, sw); - bse = _mm256_mul_ps(t, se); - } - - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); - - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); - - __m256 v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v110_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v011_in_range = _mm256_and_ps(y1_in_range, z1_in_range); - v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); - v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); - } - - __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, *(__m256*)_ps256_1); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, vImgWf); - __m256 tse_offset = _mm256_add_ps(tsw_offset, *(__m256*)_ps256_1); - - __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(vImgWf, vImgHf), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, *(__m256*)_ps256_1); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, vImgWf); - __m256 bse_offset = _mm256_add_ps(bsw_offset, *(__m256*)_ps256_1); - - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); - __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); - __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); - __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); - - __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); - __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); - __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); - __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, *(__m256*)_ps256_n1); - __m256 tne_val = mask_gather_ps256(image, i_tne_offset, x1_in_range); - __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, y1_in_range); - __m256 tse_val = mask_gather_ps256(image, i_tse_offset, v110_in_range); - - __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, z1_in_range); - __m256 bne_val = mask_gather_ps256(image, i_bne_offset, v101_in_range); - __m256 bsw_val = mask_gather_ps256(image, 
i_bsw_offset, v011_in_range); - __m256 bse_val = mask_gather_ps256(image, i_bse_offset, v111_in_range); - - __m256 _v = _mm256_mul_ps(tnw_val, tnw); - _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); - - _mm256_storeu_ps(static_cast(top_blob.channel(q).depth(y).data) + x / 3, _v); - } - } - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = ((gx + 1) * w - 1) / 2.f; - gy = ((gy + 1) * h - 1) / 2.f; - gz = ((gz + 1) * d - 1) / 2.f; - - gx = std::min(w - 1.0f, std::max(gx, 0.0f)); - gy = std::min(h - 1.0f, std::max(gy, 0.0f)); - gz = std::min(d - 1.0f, std::max(gz, 0.0f)); - - // bilinear interpolate - int x0 = (int)floor(gx); - int y0 = (int)floor(gy); - int z0 = (int)floor(gz); - int x1 = x0 + 1; - int y1 = y0 + 1; - int z1 = z0 + 1; - - bool x1_in_range = (x1 > -1) & (x1 < w); - bool y1_in_range = (y1 > -1) & (y1 < h); - bool z1_in_range = (z1 > -1) & (z1 < d); - - bool v11_in_range = x1_in_range & y1_in_range; - - bool v110_in_range = y1_in_range & z1_in_range; - - bool v101_in_range = x1_in_range & z1_in_range; - bool v111_in_range = v11_in_range & z1_in_range; - - float alpha = gx - x0; - float beta = gy - y0; - float gamma = gz - z0; - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - float v000 = image.depth(z0).row(y0)[x0]; - float v010 = y1_in_range ? image.depth(z0).row(y1)[x0] : 0; - float v100 = z1_in_range ? image.depth(z1).row(y0)[x0] : 0; - float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; - - float v001 = x1_in_range ? image.depth(z0).row(y0)[x1] : 0; - float v011 = v11_in_range ? image.depth(z0).row(y1)[x1] : 0; - float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; - float v111 = v111_in_range ? 
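// Border padding in scalar form, matching the clamps in the tail loop above:
// after unnormalization the coordinate is clamped into the image, so every tap
// the *_border_* helpers read is guaranteed to be in range. Sketch only; the
// real helpers also carry the AVX path.
#include <algorithm> // std::min, std::max (only needed if compiled standalone)
static inline float grid_sample_clamp_to_border(float coord, int size)
{
    return std::min(size - 1.0f, std::max(coord, 0.0f));
}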
image.depth(z1).row(y1)[x1] : 0; - - float v00 = v000 * (1 - alpha) + v001 * alpha; - float v01 = v010 * (1 - alpha) + v011 * alpha; - float v10 = v100 * (1 - alpha) + v101 * alpha; - float v11 = v110 * (1 - alpha) + v111 * alpha; - - float v0 = v00 * (1 - beta) + v01 * beta; - float v1 = v10 * (1 - beta) + v11 * beta; - - top_blob.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; - } - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - // z - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); - - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - __m256 z_t = _mm256_floor_ps(gz); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); - - __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - - tnw = _mm256_mul_ps(b, nw); - tne = _mm256_mul_ps(b, ne); - tsw = _mm256_mul_ps(b, sw); - tse = _mm256_mul_ps(b, se); - - bnw = _mm256_mul_ps(t, nw); - bne = _mm256_mul_ps(t, ne); - bsw = _mm256_mul_ps(t, sw); - bse = _mm256_mul_ps(t, se); - } - - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); - - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); - - __m256 v110_in_range, v011_in_range, v101_in_range, 
v111_in_range; - { - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v110_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v011_in_range = _mm256_and_ps(y1_in_range, z1_in_range); - v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); - v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); - } - - __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, *(__m256*)_ps256_1); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, vImgWf); - __m256 tse_offset = _mm256_add_ps(tsw_offset, *(__m256*)_ps256_1); - - __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(vImgWf, vImgHf), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, *(__m256*)_ps256_1); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, vImgWf); - __m256 bse_offset = _mm256_add_ps(bsw_offset, *(__m256*)_ps256_1); - - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); - __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); - __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); - __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); - - __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); - __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); - __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); - __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, *(__m256*)_ps256_n1); - __m256 tne_val = mask_gather_ps256(image, i_tne_offset, x1_in_range); - __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, y1_in_range); - __m256 tse_val = mask_gather_ps256(image, i_tse_offset, v110_in_range); - - __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, z1_in_range); - __m256 bne_val = mask_gather_ps256(image, i_bne_offset, v101_in_range); - __m256 bsw_val = mask_gather_ps256(image, i_bsw_offset, v011_in_range); - __m256 bse_val = mask_gather_ps256(image, i_bse_offset, v111_in_range); - - __m256 _v = _mm256_mul_ps(tnw_val, tnw); - _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); - - _mm256_storeu_ps(static_cast(top_blob.channel(q).depth(y).data) + x / 3, _v); - } - } - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = (gx + 1) / 2.f * (w - 1); - gy = (gy + 1) / 2.f * (h - 1); - gz = (gz + 1) / 2.f * (d - 1); - - gx = std::min(w - 1.0f, std::max(gx, 0.0f)); - gy = std::min(h - 1.0f, std::max(gy, 0.0f)); - gz = std::min(d - 1.0f, std::max(gz, 0.0f)); - - // bilinear interpolate - int x0 = (int)floor(gx); - int y0 = (int)floor(gy); - int z0 = (int)floor(gz); - int x1 = x0 + 1; - int y1 = y0 + 1; - int z1 = z0 + 1; - - bool x1_in_range = (x1 > -1) & (x1 < w); - bool y1_in_range = (y1 > -1) & (y1 < h); - bool z1_in_range = (z1 > -1) & (z1 < d); - - bool v11_in_range = x1_in_range & y1_in_range; - - bool v110_in_range = y1_in_range & z1_in_range; - - bool v101_in_range = x1_in_range & z1_in_range; - bool v111_in_range = v11_in_range & z1_in_range; - - float alpha = gx - x0; - float beta = gy - y0; - float gamma = gz - z0; 
- - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - float v000 = image.depth(z0).row(y0)[x0]; - float v010 = y1_in_range ? image.depth(z0).row(y1)[x0] : 0; - float v100 = z1_in_range ? image.depth(z1).row(y0)[x0] : 0; - float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; - - float v001 = x1_in_range ? image.depth(z0).row(y0)[x1] : 0; - float v011 = v11_in_range ? image.depth(z0).row(y1)[x1] : 0; - float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; - float v111 = v111_in_range ? image.depth(z1).row(y1)[x1] : 0; - - float v00 = v000 * (1 - alpha) + v001 * alpha; - float v01 = v010 * (1 - alpha) + v011 * alpha; - float v10 = v100 * (1 - alpha) + v101 * alpha; - float v11 = v110 * (1 - alpha) + v111 * alpha; - - float v0 = v00 * (1 - beta) + v01 * beta; - float v1 = v10 * (1 - beta) + v11 * beta; - - top_blob.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; - } - } - } + gridsample_3d_bilinear_align0_border_blob_pack1(bottom_blob, top_blob, grid_p1, opt); + } + else + { + gridsample_3d_bilinear_align1_border_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } } else if (padding_mode == 3) { if (align_corner == 0) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - __m256 v0p5fp8 = _mm256_set1_ps(0.5f); - gx = _mm256_add_ps(gx, v0p5fp8); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(vImgWf, reflectx_v); - - gx = _mm256_sub_ps(gx, v0p5fp8); - - _mm256_sub_ps(gx, v0p5fp8); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_add_ps(gy, v0p5fp8); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(vImgHf, reflecty_v); - - gy = _mm256_sub_ps(gy, v0p5fp8); - - _mm256_sub_ps(gy, v0p5fp8); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - // z - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_add_ps(gz, v0p5fp8); - - gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); - - __m256 
reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, vImgDf), *(__m256*)_ps256_inv_sign_mask); - gz = _mm256_sub_ps(vImgDf, reflectz_v); - - gz = _mm256_sub_ps(gz, v0p5fp8); - - _mm256_sub_ps(gz, v0p5fp8); - - gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - __m256 z_t = _mm256_floor_ps(gz); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); - - __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - - tnw = _mm256_mul_ps(b, nw); - tne = _mm256_mul_ps(b, ne); - tsw = _mm256_mul_ps(b, sw); - tse = _mm256_mul_ps(b, se); - - bnw = _mm256_mul_ps(t, nw); - bne = _mm256_mul_ps(t, ne); - bsw = _mm256_mul_ps(t, sw); - bse = _mm256_mul_ps(t, se); - } - - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); - - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); - - __m256 v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v110_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v011_in_range = _mm256_and_ps(y1_in_range, z1_in_range); - v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); - v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); - } - - __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, *(__m256*)_ps256_1); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, vImgWf); - __m256 tse_offset = _mm256_add_ps(tsw_offset, *(__m256*)_ps256_1); - - __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(vImgWf, vImgHf), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, *(__m256*)_ps256_1); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, vImgWf); - __m256 bse_offset = _mm256_add_ps(bsw_offset, *(__m256*)_ps256_1); - - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); - __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); - __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); - __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); - - __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); - __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); - __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); - __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, *(__m256*)_ps256_n1); - __m256 tne_val = mask_gather_ps256(image, i_tne_offset, x1_in_range); - __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, y1_in_range); - __m256 tse_val = mask_gather_ps256(image, i_tse_offset, v110_in_range); - - __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, 
z1_in_range); - __m256 bne_val = mask_gather_ps256(image, i_bne_offset, v101_in_range); - __m256 bsw_val = mask_gather_ps256(image, i_bsw_offset, v011_in_range); - __m256 bse_val = mask_gather_ps256(image, i_bse_offset, v111_in_range); - - __m256 _v = _mm256_mul_ps(tnw_val, tnw); - _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); - - _mm256_storeu_ps(static_cast(top_blob.channel(q).depth(y).data) + x / 3, _v); - } - } - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = ((gx + 1) * w - 1) / 2.f; - gy = ((gy + 1) * h - 1) / 2.f; - gz = ((gz + 1) * d - 1) / 2.f; - - gx = abs(gx + 0.5f); - gx = w - abs(gx - w) - 0.5; - - gy = abs(gy + 0.5f); - gy = h - abs(gy - h) - 0.5; - - gz = abs(gz + 0.5f); - gz = d - abs(gz - d) - 0.5; - - gx = std::min(w - 1.0f, std::max(gx, 0.0f)); - gy = std::min(h - 1.0f, std::max(gy, 0.0f)); - gz = std::min(d - 1.0f, std::max(gz, 0.0f)); - - // bilinear interpolate - int x0 = (int)floor(gx); - int y0 = (int)floor(gy); - int z0 = (int)floor(gz); - int x1 = x0 + 1; - int y1 = y0 + 1; - int z1 = z0 + 1; - - bool x1_in_range = (x1 > -1) & (x1 < w); - bool y1_in_range = (y1 > -1) & (y1 < h); - bool z1_in_range = (z1 > -1) & (z1 < d); - - bool v11_in_range = x1_in_range & y1_in_range; - - bool v110_in_range = y1_in_range & z1_in_range; - - bool v101_in_range = x1_in_range & z1_in_range; - bool v111_in_range = v11_in_range & z1_in_range; - - float alpha = gx - x0; - float beta = gy - y0; - float gamma = gz - z0; - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - float v000 = image.depth(z0).row(y0)[x0]; - float v010 = y1_in_range ? image.depth(z0).row(y1)[x0] : 0; - float v100 = z1_in_range ? image.depth(z1).row(y0)[x0] : 0; - float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; - - float v001 = x1_in_range ? image.depth(z0).row(y0)[x1] : 0; - float v011 = v11_in_range ? image.depth(z0).row(y1)[x1] : 0; - float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; - float v111 = v111_in_range ? 
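// Reflection padding (align_corners == 0) in scalar form, mirroring the tail
// loop above: the coordinate is reflected across the half-pixel borders at
// -0.5 and size - 0.5 and then clamped. For align_corners == 1 the reflection
// axes are the corner pixel centers 0 and size - 1 instead. Sketch only.
#include <algorithm> // std::min, std::max
#include <cmath>     // fabsf
static inline float grid_sample_reflect_align0(float coord, int size)
{
    coord = fabsf(coord + 0.5f);
    coord = size - fabsf(coord - size) - 0.5f;
    return std::min(size - 1.0f, std::max(coord, 0.0f));
}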
image.depth(z1).row(y1)[x1] : 0; - - float v00 = v000 * (1 - alpha) + v001 * alpha; - float v01 = v010 * (1 - alpha) + v011 * alpha; - float v10 = v100 * (1 - alpha) + v101 * alpha; - float v11 = v110 * (1 - alpha) + v111 * alpha; - - float v0 = v00 * (1 - beta) + v01 * beta; - float v1 = v10 * (1 - beta) + v11 * beta; - - top_blob.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; - } - } - } + gridsample_3d_bilinear_align0_reflection_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } else { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(border_x, reflectx_v); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(border_y, reflecty_v); - - // z - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, border_z), *(__m256*)_ps256_inv_sign_mask); - gz = _mm256_sub_ps(border_z, reflectz_v); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - __m256 z_t = _mm256_floor_ps(gz); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); - - __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - - tnw = _mm256_mul_ps(b, nw); - tne = _mm256_mul_ps(b, ne); - tsw = _mm256_mul_ps(b, sw); - tse = _mm256_mul_ps(b, se); - - bnw = _mm256_mul_ps(t, nw); - bne = _mm256_mul_ps(t, ne); - bsw = _mm256_mul_ps(t, sw); - bse = _mm256_mul_ps(t, se); - } - - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - __m256 z1 = _mm256_add_ps(z_t, 
*(__m256*)_ps256_1); - - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); - - __m256 v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v110_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v011_in_range = _mm256_and_ps(y1_in_range, z1_in_range); - v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); - v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); - } - - __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, *(__m256*)_ps256_1); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, vImgWf); - __m256 tse_offset = _mm256_add_ps(tsw_offset, *(__m256*)_ps256_1); - - __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(vImgWf, vImgHf), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, *(__m256*)_ps256_1); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, vImgWf); - __m256 bse_offset = _mm256_add_ps(bsw_offset, *(__m256*)_ps256_1); - - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); - __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); - __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); - __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); - - __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); - __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); - __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); - __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, *(__m256*)_ps256_n1); - __m256 tne_val = mask_gather_ps256(image, i_tne_offset, x1_in_range); - __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, y1_in_range); - __m256 tse_val = mask_gather_ps256(image, i_tse_offset, v110_in_range); - - __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, z1_in_range); - __m256 bne_val = mask_gather_ps256(image, i_bne_offset, v101_in_range); - __m256 bsw_val = mask_gather_ps256(image, i_bsw_offset, v011_in_range); - __m256 bse_val = mask_gather_ps256(image, i_bse_offset, v111_in_range); - - __m256 _v = _mm256_mul_ps(tnw_val, tnw); - _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); - - _mm256_storeu_ps(static_cast(top_blob.channel(q).depth(y).data) + x / 3, _v); - } - } - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = (gx + 1) / 2.f * (w - 1); - gy = (gy + 1) / 2.f * (h - 1); - gz = (gz + 1) / 2.f * (d - 1); - - gx = abs(gx); - gx = (w - 1) - abs(gx - (w - 1)); - - gy = abs(gy); - gy = (h - 1) - abs(gy - (h - 1)); - - gz = abs(gz); - gz = (d - 1) - abs(gz - (d - 1)); - - gx = std::min(w - 1.0f, std::max(gx, 0.0f)); - gy = std::min(h - 1.0f, std::max(gy, 
0.0f)); - gz = std::min(d - 1.0f, std::max(gz, 0.0f)); - - // bilinear interpolate - int x0 = (int)floor(gx); - int y0 = (int)floor(gy); - int z0 = (int)floor(gz); - int x1 = x0 + 1; - int y1 = y0 + 1; - int z1 = z0 + 1; - - bool x1_in_range = (x1 > -1) & (x1 < w); - bool y1_in_range = (y1 > -1) & (y1 < h); - bool z1_in_range = (z1 > -1) & (z1 < d); - - bool v11_in_range = x1_in_range & y1_in_range; - - bool v110_in_range = y1_in_range & z1_in_range; - - bool v101_in_range = x1_in_range & z1_in_range; - bool v111_in_range = v11_in_range & z1_in_range; - - float alpha = gx - x0; - float beta = gy - y0; - float gamma = gz - z0; - - for (int q = 0; q < channels; q++) - { - const Mat& image = bottom_blob.channel(q); - float v000 = image.depth(z0).row(y0)[x0]; - float v010 = y1_in_range ? image.depth(z0).row(y1)[x0] : 0; - float v100 = z1_in_range ? image.depth(z1).row(y0)[x0] : 0; - float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; - - float v001 = x1_in_range ? image.depth(z0).row(y0)[x1] : 0; - float v011 = v11_in_range ? image.depth(z0).row(y1)[x1] : 0; - float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; - float v111 = v111_in_range ? image.depth(z1).row(y1)[x1] : 0; - - float v00 = v000 * (1 - alpha) + v001 * alpha; - float v01 = v010 * (1 - alpha) + v011 * alpha; - float v10 = v100 * (1 - alpha) + v101 * alpha; - float v11 = v110 * (1 - alpha) + v111 * alpha; - - float v0 = v00 * (1 - beta) + v01 * beta; - float v1 = v10 * (1 - beta) + v11 * beta; - - top_blob.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; - } - } - } + gridsample_3d_bilinear_align1_reflection_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } } } @@ -4894,594 +1080,33 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(top_blob.channel(q).depth(y).data) + x / 3, _v); - } - } - - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = ((gx + 1) * w - 1) / 2.f; - gy = ((gy + 1) * h - 1) / 2.f; - gz = ((gz + 1) * d - 1) / 2.f; - - // bilinear interpolate - int x0 = static_cast(floor(gx + 0.5f)); - int y0 = static_cast(floor(gy + 0.5f)); - int z0 = static_cast(floor(gz + 0.5f)); - - bool v_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h) && (z0 > -1) && (z0 < bottom_blob.d); - - for (int q = 0; q < channels; q++) - { - top_blob.channel(q).depth(y)[x / 3] = v_in_range ? 
bottom_blob.channel(q).depth(z0).row(y0)[x0] : 0; - } - } - } + gridsample_3d_nearest_align0_zeros_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } else { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - // z - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - - __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), - _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); - v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, gz, _CMP_GT_OS))); - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)); - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < channels; q++) - { - __m256 _v = mask_gather_ps256(bottom_blob.channel(q), i_offset, v_in_range); - - _mm256_storeu_ps(static_cast(top_blob.channel(q).depth(y).data) + x / 3, _v); - } - } - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = (gx + 1) / 2.f * (w - 1); - gy = (gy + 1) / 2.f * (h - 1); - gz = (gz + 1) / 2.f * (d - 1); - - int x0 = static_cast(floor(gx + 0.5f)); - int y0 = static_cast(floor(gy + 0.5f)); - int z0 = static_cast(floor(gz + 0.5f)); - - bool v_in_range = (x0 > -1) & (x0 < bottom_blob.w) & (y0 > -1) & (y0 < bottom_blob.h) && (z0 > -1) && (z0 < bottom_blob.d); - - for (int q = 0; q < channels; q++) - { - top_blob.channel(q).depth(y)[x / 3] = v_in_range ? 
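// Nearest sampling with zeros padding in scalar form, as in the tail loop
// above: the unnormalized coordinate is rounded with floor(x + 0.5f) and one
// in-range test picks between the source voxel and zero. The flat indexing and
// the function name are illustrative; the pack1 helper adds the AVX fast path.
#include <cmath> // floorf
static inline float grid_sample_nearest_zeros(const float* image, int w, int h, int d,
                                              float gx, float gy, float gz)
{
    int x0 = (int)floorf(gx + 0.5f);
    int y0 = (int)floorf(gy + 0.5f);
    int z0 = (int)floorf(gz + 0.5f);
    const bool in_range = x0 >= 0 && x0 < w && y0 >= 0 && y0 < h && z0 >= 0 && z0 < d;
    return in_range ? image[(z0 * h + y0) * w + x0] : 0.f;
}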
bottom_blob.channel(q).depth(z0).row(y0)[x0] : 0; - } - } - } + gridsample_3d_nearest_align1_zeros_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } } else if (padding_mode == 2) { if (align_corner == 0) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - // z - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)); - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < channels; q++) - { - __m256 _v = mask_gather_ps256(bottom_blob.channel(q), i_offset, *(__m256*)_ps256_n1); - - _mm256_storeu_ps(static_cast(top_blob.channel(q).depth(y).data) + x / 3, _v); - } - } - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = ((gx + 1) * w - 1) / 2.f; - gy = ((gy + 1) * h - 1) / 2.f; - gz = ((gz + 1) * d - 1) / 2.f; - - gx = std::min(w - 1.0f, std::max(gx, 0.0f)); - gy = std::min(h - 1.0f, std::max(gy, 0.0f)); - gz = std::min(d - 1.0f, std::max(gz, 0.0f)); - - int x0 = static_cast(floor(gx + 0.5f)); - int y0 = static_cast(floor(gy + 0.5f)); - int z0 = static_cast(floor(gz + 0.5f)); - - for (int q = 0; q < channels; q++) - { - top_blob.channel(q).depth(y)[x / 3] = bottom_blob.channel(q).depth(z0).row(y0)[x0]; - } - } - } + gridsample_3d_nearest_align0_border_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } else { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y 
= _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - // z - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); - - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)); - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < channels; q++) - { - __m256 _v = mask_gather_ps256(bottom_blob.channel(q), i_offset, *(__m256*)_ps256_n1); - - _mm256_storeu_ps(static_cast(top_blob.channel(q).depth(y).data) + x / 3, _v); - } - } - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = (gx + 1) / 2.f * (w - 1); - gy = (gy + 1) / 2.f * (h - 1); - gz = (gz + 1) / 2.f * (d - 1); - - gx = std::min(w - 1.0f, std::max(gx, 0.0f)); - gy = std::min(h - 1.0f, std::max(gy, 0.0f)); - gz = std::min(d - 1.0f, std::max(gz, 0.0f)); - - int x0 = static_cast(floor(gx + 0.5f)); - int y0 = static_cast(floor(gy + 0.5f)); - int z0 = static_cast(floor(gz + 0.5f)); - - for (int q = 0; q < channels; q++) - { - top_blob.channel(q).depth(y)[x / 3] = bottom_blob.channel(q).depth(z0).row(y0)[x0]; - } - } - } + gridsample_3d_nearest_align1_border_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } } else if (padding_mode == 3) { if (align_corner == 0) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 
0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - - { - // x - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - __m256 v0p5fp8 = _mm256_set1_ps(0.5f); - gx = _mm256_add_ps(gx, v0p5fp8); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(vImgWf, reflectx_v); - - gx = _mm256_sub_ps(gx, v0p5fp8); - - _mm256_sub_ps(gx, v0p5fp8); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_add_ps(gy, v0p5fp8); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(vImgHf, reflecty_v); - - gy = _mm256_sub_ps(gy, v0p5fp8); - - _mm256_sub_ps(gy, v0p5fp8); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - // z - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_add_ps(gz, v0p5fp8); - - gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, vImgDf), *(__m256*)_ps256_inv_sign_mask); - gz = _mm256_sub_ps(vImgDf, reflectz_v); - - gz = _mm256_sub_ps(gz, v0p5fp8); - - _mm256_sub_ps(gz, v0p5fp8); - - gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); - } - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)); - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < channels; q++) - { - __m256 _v = mask_gather_ps256(bottom_blob.channel(q), i_offset, *(__m256*)_ps256_n1); - - _mm256_storeu_ps(static_cast(top_blob.channel(q).depth(y).data) + x / 3, _v); - } - } - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = ((gx + 1) * w - 1) / 2.f; - gy = ((gy + 1) * h - 1) / 2.f; - gz = ((gz + 1) * d - 1) / 2.f; - - gx = floor(gx + 0.5f); - gy = floor(gy + 0.5f); - gz = floor(gz + 0.5f); - - gx = abs(gx + 0.5f); - gx = w - abs(gx - w) - 0.5; - - gy = abs(gy + 0.5f); - gy = h - abs(gy - h) - 0.5; - - gz = abs(gz + 0.5f); - gz = d - abs(gz - d) - 0.5; - - int x0 = std::min(w - 1.0f, std::max(gx, 0.0f)); - int y0 = std::min(h - 1.0f, std::max(gy, 0.0f)); - int z0 = std::min(d - 1.0f, std::max(gz, 0.0f)); - - for (int q = 0; q < channels; q++) - { - top_blob.channel(q).depth(y)[x / 3] = bottom_blob.channel(q).depth(z0).row(y0)[x0]; - } - } - } + gridsample_3d_nearest_align0_reflection_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } else { - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid_p1.c; y++) - { - 
float* gridptr = grid_p1.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - - // compute coord - { - // x - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(border_x, reflectx_v); - - // y - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(border_y, reflecty_v); - - // z - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, border_z), *(__m256*)_ps256_inv_sign_mask); - gz = _mm256_sub_ps(border_z, reflectz_v); - } - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)); - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < channels; q++) - { - __m256 _v = mask_gather_ps256(bottom_blob.channel(q), i_offset, *(__m256*)_ps256_n1); - - _mm256_storeu_ps(static_cast(top_blob.channel(q).depth(y).data) + x / 3, _v); - } - } - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = (gx + 1) / 2.f * (w - 1); - gy = (gy + 1) / 2.f * (h - 1); - gz = (gz + 1) / 2.f * (d - 1); - - gx = floor(gx + 0.5f); - gy = floor(gy + 0.5f); - gz = floor(gz + 0.5f); - - gx = abs(gx); - gx = (w - 1) - abs(gx - (w - 1)); - - gy = abs(gy); - gy = (h - 1) - abs(gy - (h - 1)); - - gz = abs(gz); - gz = (d - 1) - abs(gz - (d - 1)); - - int x0 = std::min(w - 1.0f, std::max(gx, 0.0f)); - int y0 = std::min(h - 1.0f, std::max(gy, 0.0f)); - int z0 = std::min(d - 1.0f, std::max(gz, 0.0f)); - - for (int q = 0; q < channels; q++) - { - top_blob.channel(q).depth(y)[x / 3] = bottom_blob.channel(q).depth(z0).row(y0)[x0]; - } - } - } + gridsample_3d_nearest_align1_reflection_blob_pack1(bottom_blob, top_blob, grid_p1, opt); } } } From b492ec6d0aa66968c3af84f3eefb31fe98a32ecb Mon 
Sep 17 00:00:00 2001 From: Yoh-Z Date: Mon, 19 Dec 2022 08:49:10 +0000 Subject: [PATCH 053/127] apply code-format changes --- src/layer/x86/gridsample_bicubic_pack1.h | 2 +- src/layer/x86/gridsample_bilinear_pack1.h | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_pack1.h b/src/layer/x86/gridsample_bicubic_pack1.h index ca40d150019..07b2b846305 100644 --- a/src/layer/x86/gridsample_bicubic_pack1.h +++ b/src/layer/x86/gridsample_bicubic_pack1.h @@ -921,7 +921,7 @@ static void gridsample_2d_bicubic_align1_reflection_blob_pack1(const Mat& src, M const __m256 vImgHf = _mm256_set1_ps(src.h); #endif // __AVX__ -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); diff --git a/src/layer/x86/gridsample_bilinear_pack1.h b/src/layer/x86/gridsample_bilinear_pack1.h index 23e63216f29..3b88c20bfa7 100644 --- a/src/layer/x86/gridsample_bilinear_pack1.h +++ b/src/layer/x86/gridsample_bilinear_pack1.h @@ -116,7 +116,7 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack1(const Mat& src, Mat& __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, _mm256_castsi256_ps(v10_in_range)); __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, _mm256_castsi256_ps(v01_in_range)); __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); -#else +#else __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, v00_in_range); __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, v10_in_range); __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, v01_in_range); @@ -174,7 +174,6 @@ static void gridsample_2d_bilinear_align0_zeros_blob_pack1(const Mat& src, Mat& } } - static void gridsample_2d_bilinear_align1_zeros_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) { const int grid_size = grid.w * grid.h; @@ -187,7 +186,7 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack1(const Mat& src, Mat& #endif // __AVX2__ #endif // __AVX__ -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); @@ -279,7 +278,7 @@ static void gridsample_2d_bilinear_align1_zeros_blob_pack1(const Mat& src, Mat& __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, _mm256_castsi256_ps(v10_in_range)); __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, _mm256_castsi256_ps(v01_in_range)); __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); -#else +#else __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, v00_in_range); __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, v10_in_range); __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, v01_in_range); @@ -348,7 +347,7 @@ static void gridsample_2d_bilinear_align0_border_blob_pack1(const Mat& src, Mat& #endif // __AVX2__ #endif // __AVX__ -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); @@ -1033,7 +1032,7 @@ static void gridsample_3d_bilinear_align0_zeros_blob_pack1(const Mat& src, Mat& #endif // __AVX2__ #endif // __AVX__ - #pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for 
(int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); From 3f79b972ff8ea6ce423cc7ef7e646f0efd8ca324 Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Mon, 30 Jan 2023 21:21:31 +0800 Subject: [PATCH 054/127] complete the permute and gridsample operator fusion --- tools/pnnx/src/CMakeLists.txt | 1 + tools/pnnx/src/pass_ncnn.cpp | 2 + tools/pnnx/src/pass_ncnn/F_grid_sample.cpp | 1 + .../src/pass_ncnn/fuse_permute_gridsample.cpp | 125 ++++++++++++++++++ .../src/pass_ncnn/fuse_permute_gridsample.h | 25 ++++ tools/pnnx/src/pass_ncnn/torch_permute.cpp | 3 + 6 files changed, 157 insertions(+) create mode 100644 tools/pnnx/src/pass_ncnn/fuse_permute_gridsample.cpp create mode 100644 tools/pnnx/src/pass_ncnn/fuse_permute_gridsample.h diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index 5b7b00372b6..556ac9e1c32 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -364,6 +364,7 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/fuse_innerproduct_activation.cpp pass_ncnn/fuse_transpose_matmul.cpp pass_ncnn/fuse_binaryop_eltwise.cpp + pass_ncnn/fuse_permute_gridsample.cpp pass_ncnn/insert_reshape_linear.cpp pass_ncnn/insert_reshape_pooling.cpp diff --git a/tools/pnnx/src/pass_ncnn.cpp b/tools/pnnx/src/pass_ncnn.cpp index 14dedf0e859..28f49dfac5c 100644 --- a/tools/pnnx/src/pass_ncnn.cpp +++ b/tools/pnnx/src/pass_ncnn.cpp @@ -43,6 +43,7 @@ #include "pass_ncnn/fuse_innerproduct_activation.h" #include "pass_ncnn/fuse_transpose_matmul.h" #include "pass_ncnn/fuse_binaryop_eltwise.h" +#include "pass_ncnn/fuse_permute_gridsample.h" #include "pass_ncnn/insert_reshape_linear.h" #include "pass_ncnn/insert_reshape_pooling.h" @@ -120,6 +121,7 @@ void pass_ncnn(Graph& g) ncnn::fuse_deconvolution_activation(g); ncnn::fuse_deconvolutiondepthwise_activation(g); ncnn::fuse_innerproduct_activation(g); + ncnn::fuse_permute_gridsample(g); ncnn::eliminate_tail_reshape_permute(g); dead_code_elimination(g); diff --git a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp index 7c681eb08b7..7b9872b1842 100644 --- a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp +++ b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp @@ -60,6 +60,7 @@ pnnx.Output output 1 0 out op->params["1"] = 3; op->params["2"] = captured_params.at("align_corners").b ? 1 : 0; + op->params["3"] = 0; } }; diff --git a/tools/pnnx/src/pass_ncnn/fuse_permute_gridsample.cpp b/tools/pnnx/src/pass_ncnn/fuse_permute_gridsample.cpp new file mode 100644 index 00000000000..4d448f114e2 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/fuse_permute_gridsample.cpp @@ -0,0 +1,125 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
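// The pass below folds an NHWC permute of the sampling grid into GridSample itself
// (param 3, permute_fusion=1), so the grid can be consumed in its channel-first
// layout instead of materialising the permuted tensor first.
// A minimal standalone sketch of the two grid layouts the fused operator has to
// handle, using plain float arrays instead of ncnn::Mat; the helper names are
// illustrative only and are not part of this patch.

// permute_fusion == 0: x and y are interleaved along the innermost dimension.
static void load_grid_interleaved(const float* grid, int count,
                                  float* sample_x, float* sample_y)
{
    for (int i = 0; i < count; i++)
    {
        sample_x[i] = grid[2 * i + 0];
        sample_y[i] = grid[2 * i + 1];
    }
}

// permute_fusion == 1: the permute has been folded away, so x and y stay in two
// separate contiguous planes (grid.channel(0) / grid.channel(1) in ncnn terms).
static void load_grid_planar(const float* grid_x, const float* grid_y,
                             int count, float* sample_x, float* sample_y)
{
    for (int i = 0; i < count; i++)
    {
        sample_x[i] = grid_x[i];
        sample_y[i] = grid_y[i];
    }
}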
+ +#include "fuse_permute_gridsample.h" + +#include "pass_level2.h" + +#include + +namespace pnnx { + +namespace ncnn { + +class fuse_permute_gridsample_4d_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input_a 0 1 a +pnnx.Input input_b 0 1 b +Permute op_0 1 1 b b1 0=3 1=2 +GridSample op_1 2 1 a b1 out 0=%c 1=%d 2=%e 3=0 +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "GridSample"; + } + + const char* name_str() const + { + return "permutegridsample"; + } + + void write(Operator* op, const std::map& captured_params) const + { + int mode = 0; + int padding_mode = 0; + int align_corner = 0; + if (captured_params.at("c").type == 2) + mode = captured_params.at("c").i; + if (captured_params.at("d").type == 2) + padding_mode = captured_params.at("d").i; + if (captured_params.at("e").type == 2) + align_corner = captured_params.at("e").i; + + op->params["0"] = mode; + op->params["1"] = padding_mode; + op->params["2"] = align_corner; + op->params["3"] = 1; + } +}; + +class fuse_permute_gridsample_5d_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input_a 0 1 a +pnnx.Input input_b 0 1 b +Permute op_0 1 1 b b1 0=9 1=3 +GridSample op_1 2 1 a b1 out 0=%c 1=%d 2=%e 3=0 +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "GridSample"; + } + + const char* name_str() const + { + return "permutegridsample"; + } + + void write(Operator* op, const std::map& captured_params) const + { + int mode = 0; + int padding_mode = 0; + int align_corner = 0; + if (captured_params.at("c").type == 2) + mode = captured_params.at("c").i; + if (captured_params.at("d").type == 2) + padding_mode = captured_params.at("d").i; + if (captured_params.at("e").type == 2) + align_corner = captured_params.at("e").i; + + op->params["0"] = mode; + op->params["1"] = padding_mode; + op->params["2"] = align_corner; + op->params["3"] = 1; + } +}; + +void fuse_permute_gridsample(Graph& graph) +{ + fuse_permute_gridsample_4d_pass a; + fuse_permute_gridsample_5d_pass b; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); + pnnx_graph_rewrite(graph, &b, opindex); +} + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/fuse_permute_gridsample.h b/tools/pnnx/src/pass_ncnn/fuse_permute_gridsample.h new file mode 100644 index 00000000000..61499dea32e --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/fuse_permute_gridsample.h @@ -0,0 +1,25 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
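// Declares the fusion entry point defined in fuse_permute_gridsample.cpp;
// pass_ncnn.cpp invokes ncnn::fuse_permute_gridsample(g) right before the
// tail reshape/permute elimination.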
+ +#include "ir.h" + +namespace pnnx { + +namespace ncnn { + +void fuse_permute_gridsample(Graph& graph); + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/torch_permute.cpp b/tools/pnnx/src/pass_ncnn/torch_permute.cpp index 13705dc9be6..30546cfad1c 100644 --- a/tools/pnnx/src/pass_ncnn/torch_permute.cpp +++ b/tools/pnnx/src/pass_ncnn/torch_permute.cpp @@ -94,6 +94,7 @@ pnnx.Output output 1 0 out op->type = "Noop"; else if (new_dims == std::vector{1, 0}) op->params["0"] = 1; + op->params["1"] = 1; } if (input_rank == 3) { @@ -109,6 +110,7 @@ pnnx.Output output 1 0 out op->params["0"] = 4; else if (new_dims == std::vector{2, 1, 0}) op->params["0"] = 5; + op->params["1"] = 2; } if (input_rank == 4) { @@ -160,6 +162,7 @@ pnnx.Output output 1 0 out op->params["0"] = 22; else if (new_dims == std::vector{3, 2, 1, 0}) op->params["0"] = 23; + op->params["1"] = 3; } } }; From c88d1a2584af7f26d59f9dacbcd7b32386f09f50 Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Mon, 30 Jan 2023 21:24:19 +0800 Subject: [PATCH 055/127] spilt calculation into two stages and support permute fusion --- src/layer/gridsample.cpp | 225 ++++++++++++++++++++++++++++++--------- src/layer/gridsample.h | 4 +- 2 files changed, 178 insertions(+), 51 deletions(-) diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index 83e73eecb3d..fce559d02a4 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -1,6 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // coord compliance with the License. 
You may obtain a copy of the License at @@ -28,6 +28,7 @@ int GridSample::load_param(const ParamDict& pd) sample_type = pd.get(0, 1); padding_mode = pd.get(1, 1); align_corner = pd.get(2, 0); + permute_fusion = pd.get(3, 0); if (sample_type < 1 || sample_type > 3) { @@ -166,25 +167,79 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& if (top_blob.empty()) return -100; + Mat offset_blob; + offset_blob.create(outw, outh, grid.c, elemsize, opt.blob_allocator); + + //1 pre-calculate all interpolation offsets for each x y, unpack grid on-the-fly + if (permute_fusion == 0) + { + float* offsetptr_x = offset_blob.channel(0); + float* offsetptr_y = offset_blob.channel(1); + + for (int y = 0; y < outh; y++) + { + const float* gridptr = grid.channel(y); + for (int x = 0; x < outw; x++) + { + float sample_x = gridptr[0]; + float sample_y = gridptr[1]; + + sample_x = grid_sample_unormalize(w, sample_x, align_corner); + sample_y = grid_sample_unormalize(h, sample_y, align_corner); + + *offsetptr_x = sample_x; + *offsetptr_y = sample_y; + + gridptr += 2; + offsetptr_x++; + offsetptr_y++; + } + } + } + else + { + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); + float* offsetptr_x = offset_blob.channel(0); + float* offsetptr_y = offset_blob.channel(1); + + for (int y = 0; y < outh; y++) + { + for (int x = 0; x < outw; x++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; + + sample_x = grid_sample_unormalize(w, sample_x, align_corner); + sample_y = grid_sample_unormalize(h, sample_y, align_corner); + + *offsetptr_x = sample_x; + *offsetptr_y = sample_y; + + gridptr_x++; + gridptr_y++; + offsetptr_x++; + offsetptr_y++; + } + } + } + if (sample_type == 1) // bilinear { - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const Mat image = bottom_blob.channel(q); float* outptr = top_blob.channel(q); + const float* offsetptr_x = offset_blob.channel(0); + const float* offsetptr_y = offset_blob.channel(1); for (int y = 0; y < outh; y++) { - const float* gridptr = grid.channel(y); - for (int x = 0; x < outw; x++) { - float sample_x = gridptr[0]; - float sample_y = gridptr[1]; - - sample_x = grid_sample_unormalize(w, sample_x, align_corner); - sample_y = grid_sample_unormalize(h, sample_y, align_corner); + float sample_x = *offsetptr_x; + float sample_y = *offsetptr_y; // bilinear interpolate float v; @@ -211,30 +266,28 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& outptr[0] = v; outptr += 1; - gridptr += 2; + offsetptr_x++; + offsetptr_y++; } } } } else if (sample_type == 2) // nearest { - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const Mat image = bottom_blob.channel(q); float* outptr = top_blob.channel(q); + const float* offsetptr_x = offset_blob.channel(0); + const float* offsetptr_y = offset_blob.channel(1); for (int y = 0; y < outh; y++) { - const float* gridptr = grid.channel(y); - for (int x = 0; x < outw; x++) { - float sample_x = gridptr[0]; - float sample_y = gridptr[1]; - - sample_x = grid_sample_unormalize(w, sample_x, align_corner); - sample_y = grid_sample_unormalize(h, sample_y, align_corner); + float sample_x = *offsetptr_x; + float sample_y = *offsetptr_y; int x0 = static_cast(floor(sample_x + 0.5f)); int y0 = static_cast(floor(sample_y + 0.5f)); @@ -244,30 +297,28 @@ int GridSample::forward(const 
std::vector& bottom_blobs, std::vector& outptr[0] = v; outptr += 1; - gridptr += 2; + offsetptr_x++; + offsetptr_y++; } } } } else if (sample_type == 3) // bicubic { - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const Mat image = bottom_blob.channel(q); float* outptr = top_blob.channel(q); + const float* offsetptr_x = offset_blob.channel(0); + const float* offsetptr_y = offset_blob.channel(1); for (int y = 0; y < outh; y++) { - const float* gridptr = grid.channel(y); - for (int x = 0; x < outw; x++) { - float sample_x = gridptr[0]; - float sample_y = gridptr[1]; - - sample_x = grid_sample_unormalize(w, sample_x, align_corner); - sample_y = grid_sample_unormalize(h, sample_y, align_corner); + float sample_x = *offsetptr_x; + float sample_y = *offsetptr_y; // bicubic interpolate float v; @@ -314,7 +365,8 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& outptr[0] = v; outptr += 1; - gridptr += 2; + offsetptr_x++; + offsetptr_y++; } } } @@ -331,29 +383,101 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& if (top_blob.empty()) return -100; + Mat offset_blob; + offset_blob.create(outw, outh, outd, grid.c, elemsize, opt.blob_allocator); + + //1 pre-calculate all interpolation offsets for each x y, unpack grid on-the-fly + if (permute_fusion == 0) + { + float* offsetptr_x = offset_blob.channel(0); + float* offsetptr_y = offset_blob.channel(1); + float* offsetptr_z = offset_blob.channel(2); + + for (int z = 0; z < outd; z++) + { + const float* gridptr = grid.channel(z); + for (int y = 0; y < outh; y++) + { + for (int x = 0; x < outw; x++) + { + float sample_x = gridptr[0]; + float sample_y = gridptr[1]; + float sample_z = gridptr[2]; + + sample_x = grid_sample_unormalize(w, sample_x, align_corner); + sample_y = grid_sample_unormalize(h, sample_y, align_corner); + sample_z = grid_sample_unormalize(d, sample_z, align_corner); + + *offsetptr_x = sample_x; + *offsetptr_y = sample_y; + *offsetptr_z = sample_z; + + gridptr += 3; + offsetptr_x++; + offsetptr_y++; + offsetptr_z++; + } + } + } + } + else + { + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); + const float* gridptr_z = grid.channel(2); + float* offsetptr_x = offset_blob.channel(0); + float* offsetptr_y = offset_blob.channel(1); + float* offsetptr_z = offset_blob.channel(2); + + for (int z = 0; z < outd; z++) + { + for (int y = 0; y < outh; y++) + { + for (int x = 0; x < outw; x++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; + float sample_z = *gridptr_z; + + sample_x = grid_sample_unormalize(w, sample_x, align_corner); + sample_y = grid_sample_unormalize(h, sample_y, align_corner); + sample_z = grid_sample_unormalize(d, sample_z, align_corner); + + *offsetptr_x = sample_x; + *offsetptr_y = sample_y; + *offsetptr_z = sample_z; + + gridptr_x++; + gridptr_y++; + gridptr_z++; + offsetptr_x++; + offsetptr_y++; + offsetptr_z++; + } + } + } + } + if (sample_type == 1) // bilinear { - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const Mat image = bottom_blob.channel(q); float* outptr = top_blob.channel(q); + const float* offsetptr_x = offset_blob.channel(0); + const float* offsetptr_y = offset_blob.channel(1); + const float* offsetptr_z = offset_blob.channel(2); for (int z = 0; z < outd; z++) { - const float* gridptr = grid.channel(z); - for 
(int y = 0; y < outh; y++) { for (int x = 0; x < outw; x++) { - float sample_x = gridptr[0]; - float sample_y = gridptr[1]; - float sample_z = gridptr[2]; - - sample_x = grid_sample_unormalize(w, sample_x, align_corner); - sample_y = grid_sample_unormalize(h, sample_y, align_corner); - sample_z = grid_sample_unormalize(d, sample_z, align_corner); + float sample_x = *offsetptr_x; + float sample_y = *offsetptr_y; + float sample_z = *offsetptr_z; // bilinear interpolate float v; @@ -392,7 +516,9 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& outptr[0] = v; outptr += 1; - gridptr += 3; + offsetptr_x++; + offsetptr_y++; + offsetptr_z++; } } } @@ -400,27 +526,24 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& } else if (sample_type == 2) // nearest { - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const Mat image = bottom_blob.channel(q); float* outptr = top_blob.channel(q); + const float* offsetptr_x = offset_blob.channel(0); + const float* offsetptr_y = offset_blob.channel(1); + const float* offsetptr_z = offset_blob.channel(2); for (int z = 0; z < outd; z++) { - const float* gridptr = grid.channel(z); - for (int y = 0; y < outh; y++) { for (int x = 0; x < outw; x++) { - float sample_x = gridptr[0]; - float sample_y = gridptr[1]; - float sample_z = gridptr[2]; - - sample_x = grid_sample_unormalize(w, sample_x, align_corner); - sample_y = grid_sample_unormalize(h, sample_y, align_corner); - sample_z = grid_sample_unormalize(d, sample_z, align_corner); + float sample_x = *offsetptr_x; + float sample_y = *offsetptr_y; + float sample_z = *offsetptr_z; int x0 = static_cast(floor(sample_x + 0.5f)); int y0 = static_cast(floor(sample_y + 0.5f)); @@ -431,7 +554,9 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& outptr[0] = v; outptr += 1; - gridptr += 3; + offsetptr_x++; + offsetptr_y++; + offsetptr_z++; } } } diff --git a/src/layer/gridsample.h b/src/layer/gridsample.h index 0ea540eb4ba..4826fc2c385 100644 --- a/src/layer/gridsample.h +++ b/src/layer/gridsample.h @@ -1,6 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. 
You may obtain a copy of the License at @@ -33,6 +33,8 @@ class GridSample : public Layer int sample_type; // 1=bilinear 2=nearest 3=bicubic int padding_mode; // 1=zeros 2=border 3=reflection int align_corner; + + int permute_fusion; }; } // namespace ncnn From e29fd7e0cfd75cf57aa5e8dbd5ca3146aedc8ced Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Mon, 30 Jan 2023 13:26:08 +0000 Subject: [PATCH 056/127] apply code-format changes --- src/layer/gridsample.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index fce559d02a4..66b9189db30 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -226,7 +226,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& if (sample_type == 1) // bilinear { -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const Mat image = bottom_blob.channel(q); @@ -274,7 +274,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& } else if (sample_type == 2) // nearest { -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const Mat image = bottom_blob.channel(q); @@ -305,7 +305,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& } else if (sample_type == 3) // bicubic { -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const Mat image = bottom_blob.channel(q); @@ -460,7 +460,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& if (sample_type == 1) // bilinear { -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const Mat image = bottom_blob.channel(q); @@ -526,7 +526,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& } else if (sample_type == 2) // nearest { -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const Mat image = bottom_blob.channel(q); From 4e6e9a93c2735d9a527b0d0b9569edda5d6e8e8f Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Thu, 9 Feb 2023 20:56:57 +0800 Subject: [PATCH 057/127] add pnnx_fuse_permute_gridsample unittest --- .../test_pnnx_fuse_permute_gridsample.py | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 tools/pnnx/tests/test_pnnx_fuse_permute_gridsample.py diff --git a/tools/pnnx/tests/test_pnnx_fuse_permute_gridsample.py b/tools/pnnx/tests/test_pnnx_fuse_permute_gridsample.py new file mode 100644 index 00000000000..be95b15a896 --- /dev/null +++ b/tools/pnnx/tests/test_pnnx_fuse_permute_gridsample.py @@ -0,0 +1,75 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, xg1, xg2, y, yg1, yg2): + # norm to -1 ~ 1 + xg1 = xg1 * 2 - 1 + xg2 = xg2 * 2 - 1 + yg1 = yg1 * 2 - 1 + yg2 = yg2 * 2 - 1 + + xg1 = torch.permute(xg1, (0, 2, 3, 1)) + xg2 = torch.permute(xg2, (0, 2, 3, 1)) + yg1 = torch.permute(yg1, (0, 2, 3, 4, 1)) + yg2 = torch.permute(yg2, (0, 2, 3, 4, 1)) + + x = F.grid_sample(x, xg1, mode='bilinear', padding_mode='zeros', align_corners=False) + x = F.grid_sample(x, xg2, mode='bilinear', padding_mode='border', align_corners=False) + + y = F.grid_sample(y, yg1, mode='bilinear', padding_mode='zeros', align_corners=False) + y = F.grid_sample(y, yg2, mode='bilinear', padding_mode='border', align_corners=False) + + return x, y + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 12, 16) + xg1 = torch.rand(1, 2, 21, 27) + xg2 = torch.rand(1, 2, 12, 16) + y = torch.rand(1, 5, 10, 12, 16) + yg1 = torch.rand(1, 3, 10, 21, 27) + yg2 = torch.rand(1, 3, 10, 12, 16) + + a0, a1 = net(x, xg1, xg2, y, yg1, yg2) + + # export torchscript + mod = torch.jit.trace(net, (x, xg1, xg2, y, yg1, yg2)) + mod.save("test_pnnx_fuse_permute_gridsample.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_pnnx_fuse_permute_gridsample.pt inputshape=[1,3,12,16],[1,2,21,27],[1,2,12,16],[1,5,10,12,16],[1,3,10,21,27],[1,3,10,12,16]") + + # pnnx inference + import test_pnnx_fuse_permute_gridsample_pnnx + b0, b1 = test_pnnx_fuse_permute_gridsample_pnnx.test_inference() + + return torch.equal(a0, b0) and torch.equal(a1, b1) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) From 4a544f18e1b80c63fb4638cb476ad56e4e58515a Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Fri, 10 Feb 2023 15:41:53 +0800 Subject: [PATCH 058/127] add ncnn fuse_permute_grdisample unittest --- src/layer/gridsample.cpp | 10 +-- .../ncnn/test_ncnn_fuse_permute_gridsample.py | 75 +++++++++++++++++++ 2 files changed, 80 insertions(+), 5 deletions(-) create mode 100644 tools/pnnx/tests/ncnn/test_ncnn_fuse_permute_gridsample.py diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index 66b9189db30..025dcf8a0dc 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -160,8 +160,8 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& if (dims == 3) { - int outw = grid.h; - int outh = grid.c; + int outw = permute_fusion == 0 ? grid.h : grid.w; + int outh = permute_fusion == 0 ? grid.c : grid.h; top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator); if (top_blob.empty()) @@ -375,9 +375,9 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& if (dims == 4) { - int outw = grid.h; - int outh = grid.d; - int outd = grid.c; + int outw = permute_fusion == 0 ? grid.h : grid.w; + int outh = permute_fusion == 0 ? grid.d : grid.h; + int outd = permute_fusion == 0 ? 
grid.c : grid.d; top_blob.create(outw, outh, outd, channels, elemsize, opt.blob_allocator); if (top_blob.empty()) diff --git a/tools/pnnx/tests/ncnn/test_ncnn_fuse_permute_gridsample.py b/tools/pnnx/tests/ncnn/test_ncnn_fuse_permute_gridsample.py new file mode 100644 index 00000000000..c6448a4f5c0 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_ncnn_fuse_permute_gridsample.py @@ -0,0 +1,75 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, xg1, xg2, y, yg1, yg2): + # norm to -1 ~ 1 + xg1 = xg1 * 2 - 1 + xg2 = xg2 * 2 - 1 + yg1 = yg1 * 2 - 1 + yg2 = yg2 * 2 - 1 + + xg1 = torch.permute(xg1, (0, 2, 3, 1)) + xg2 = torch.permute(xg2, (0, 2, 3, 1)) + yg1 = torch.permute(yg1, (0, 2, 3, 4, 1)) + yg2 = torch.permute(yg2, (0, 2, 3, 4, 1)) + + x = F.grid_sample(x, xg1, mode='bilinear', padding_mode='zeros', align_corners=False) + x = F.grid_sample(x, xg2, mode='bilinear', padding_mode='border', align_corners=False) + + y = F.grid_sample(y, yg1, mode='bilinear', padding_mode='zeros', align_corners=False) + y = F.grid_sample(y, yg2, mode='bilinear', padding_mode='border', align_corners=False) + + return x, y + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 12, 16) + xg1 = torch.rand(1, 2, 21, 27) + xg2 = torch.rand(1, 2, 12, 16) + y = torch.rand(1, 5, 10, 12, 16) + yg1 = torch.rand(1, 3, 10, 21, 27) + yg2 = torch.rand(1, 3, 10, 12, 16) + + a0, a1 = net(x, xg1, xg2, y, yg1, yg2) + + # export torchscript + mod = torch.jit.trace(net, (x, xg1, xg2, y, yg1, yg2)) + mod.save("test_ncnn_fuse_permute_gridsample.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_ncnn_fuse_permute_gridsample.pt inputshape=[1,3,12,16],[1,2,21,27],[1,2,12,16],[1,5,10,12,16],[1,3,10,21,27],[1,3,10,12,16]") + + # ncnn inference + import test_ncnn_fuse_permute_gridsample_ncnn + b0, b1 = test_ncnn_fuse_permute_gridsample_ncnn.test_inference() + + return torch.allclose(a0, b0, 1e-6, 1e-6) and torch.allclose(a1, b1, 1e-6, 1e-6) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) From d6a58518488861cd8f5e78ce52e9f962b1d96e67 Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Tue, 14 Feb 2023 21:07:31 +0800 Subject: [PATCH 059/127] [WIP] x86 support, spilt calculation into two stages and finish bilinear&nearest compute_blob --- src/layer/gridsample.h | 14 + .../x86/gridsample_bicubic_compute_blob.h | 533 ++++ src/layer/x86/gridsample_bicubic_pack1.h | 1093 ------- src/layer/x86/gridsample_bicubic_pack16.h | 687 ----- src/layer/x86/gridsample_bicubic_pack4.h | 662 ----- src/layer/x86/gridsample_bicubic_pack8.h | 622 ---- .../x86/gridsample_bilinear_compute_blob.h | 613 ++++ src/layer/x86/gridsample_bilinear_pack1.h | 2513 
----------------- src/layer/x86/gridsample_bilinear_pack16.h | 1431 ---------- src/layer/x86/gridsample_bilinear_pack4.h | 1730 ------------ src/layer/x86/gridsample_bilinear_pack8.h | 1663 ----------- .../x86/gridsample_nearest_compute_blob.h | 330 +++ src/layer/x86/gridsample_nearest_pack1.h | 1167 -------- src/layer/x86/gridsample_nearest_pack16.h | 805 ------ src/layer/x86/gridsample_nearest_pack4.h | 799 ------ src/layer/x86/gridsample_nearest_pack8.h | 787 ------ src/layer/x86/gridsample_x86.cpp | 1092 ++----- 17 files changed, 1755 insertions(+), 14786 deletions(-) create mode 100644 src/layer/x86/gridsample_bicubic_compute_blob.h delete mode 100644 src/layer/x86/gridsample_bicubic_pack1.h delete mode 100644 src/layer/x86/gridsample_bicubic_pack16.h delete mode 100644 src/layer/x86/gridsample_bicubic_pack4.h delete mode 100644 src/layer/x86/gridsample_bicubic_pack8.h create mode 100644 src/layer/x86/gridsample_bilinear_compute_blob.h delete mode 100644 src/layer/x86/gridsample_bilinear_pack1.h delete mode 100644 src/layer/x86/gridsample_bilinear_pack16.h delete mode 100644 src/layer/x86/gridsample_bilinear_pack4.h delete mode 100644 src/layer/x86/gridsample_bilinear_pack8.h create mode 100644 src/layer/x86/gridsample_nearest_compute_blob.h delete mode 100644 src/layer/x86/gridsample_nearest_pack1.h delete mode 100644 src/layer/x86/gridsample_nearest_pack16.h delete mode 100644 src/layer/x86/gridsample_nearest_pack4.h delete mode 100644 src/layer/x86/gridsample_nearest_pack8.h diff --git a/src/layer/gridsample.h b/src/layer/gridsample.h index 4826fc2c385..96b6b1aeb24 100644 --- a/src/layer/gridsample.h +++ b/src/layer/gridsample.h @@ -28,6 +28,20 @@ class GridSample : public Layer virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + enum InterpolationMode // 1=bilinear 2=nearest 3=bicubic + { + Bilinear = 1, + Nearest = 2, + Bicubic = 3 + }; + + enum PaddingMode // 1=zeros 2=border 3=reflection + { + Zeros = 1, + Border = 2, + Reflection = 3 + }; + public: // param int sample_type; // 1=bilinear 2=nearest 3=bicubic diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h new file mode 100644 index 00000000000..e6e989e1912 --- /dev/null +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -0,0 +1,533 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
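// The compute_blob templates that follow implement the first of the two stages:
// for every output pixel they work out the 4x4 bicubic tap positions (and, for
// zeros padding, whether each tap lands in bounds); the per-channel gather and
// blend runs as a separate stage.
// As a scalar reference for that per-pixel work, a sketch assuming the
// conventional Keys cubic kernel with A = -0.75 (the value used by PyTorch's
// bicubic grid_sample); the helper names are illustrative, not part of this patch.

#include <algorithm>
#include <cmath>

static void cubic_coeffs_sketch(float t, float coeffs[4])
{
    const float A = -0.75f;
    coeffs[0] = ((A * (t + 1) - 5 * A) * (t + 1) + 8 * A) * (t + 1) - 4 * A;
    coeffs[1] = ((A + 2) * t - (A + 3)) * t * t + 1;
    coeffs[2] = ((A + 2) * (1 - t) - (A + 3)) * (1 - t) * (1 - t) + 1;
    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}

// Border padding: clamp the four tap coordinates along each axis; the flat
// gather offset of tap (i, j) is then tap_y[i] * w + tap_x[j].
static void bicubic_taps_border_sketch(float sample_x, float sample_y, int w, int h,
                                       int tap_x[4], int tap_y[4])
{
    const int x1 = (int)std::floor(sample_x);
    const int y1 = (int)std::floor(sample_y);
    for (int i = 0; i < 4; i++)
    {
        tap_x[i] = std::min(w - 1, std::max(x1 - 1 + i, 0));
        tap_y[i] = std::min(h - 1, std::max(y1 - 1 + i, 0));
    }
}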
+ +template +struct gridsample_2d_bicubic_compute_blob +{ + void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) + { + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#if __AVX2__ + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); +#endif // __AVX2__ +#endif // __AVX__ + + int *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; + + for (int i = 0; i < 4; i++) + { + v0_offset_ptr[i * 4 + 0] = offset.channel(i * 4 + 0); + v1_offset_ptr[i * 4 + 1] = offset.channel(i * 4 + 1); + v2_offset_ptr[i * 4 + 2] = offset.channel(i * 4 + 2); + v3_offset_ptr[i * 4 + 3] = offset.channel(i * 4 + 3); + } + + grid_sample_unormalize unormalize; + compute_coord get_coord; + + if (permute_fusion == 0) + { + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + // x + gx = unormalize(vImgWf, gx); + gx = get_coord(vImgWf, gx); + + // y + gy = unormalize(vImgHf, gy); + gy = get_coord(vImgHf, gy); + } + + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); + + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + + gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); + gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); + gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); + gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + gridptr += 16; + } + + nn = grid_size & 15; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = *gridptr; + float sample_y = *(gridptr + 1); + + // x + sample_x = unormalize(src.w, sample_x); + + // y + sample_y = unormalize(src.h, sample_x); + + int x1 = floor(sample_x); + int y1 = 
floor(sample_y); + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + x1 = std::min(src.w - 1, std::max(x1, 0)); + y1 = std::min(src.h - 1, std::max(y1, 0)); + x0 = std::min(src.w - 1, std::max(x0, 0)); + y0 = std::min(src.h - 1, std::max(y0, 0)); + x2 = std::min(src.w - 1, std::max(x2, 0)); + y2 = std::min(src.h - 1, std::max(y2, 0)); + x3 = std::min(src.w - 1, std::max(x3, 0)); + y3 = std::min(src.h - 1, std::max(y3, 0)); + + gridptr += 2; + } + } + } + else + { + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); + + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 7 < nn; x += 8) + { + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); + + // compute coord + { + // x + gx = unormalize(vImgWf, gx); + gx = get_coord(vImgWf, gx); + + // y + gy = unormalize(vImgHf, gy); + gy = get_coord(vImgHf, gy); + } + + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); + + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + + const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); + const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + + gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); + gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); + gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); + gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + gridptr_x += 8; + gridptr_y += 8; + } + + nn = grid_size & 7; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; + + // x + sample_x = unormalize(src.w, sample_x); + + // y + sample_y = unormalize(src.h, sample_x); + + int x1 = floor(sample_x); + int y1 = floor(sample_y); + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + x1 = std::min(src.w - 1, std::max(x1, 0)); + y1 = std::min(src.h - 1, std::max(y1, 0)); + x0 = std::min(src.w - 1, std::max(x0, 0)); + y0 = std::min(src.h - 1, std::max(y0, 0)); + x2 = std::min(src.w - 1, std::max(x2, 0)); + y2 = std::min(src.h - 1, std::max(y2, 0)); + x3 = std::min(src.w - 1, std::max(x3, 0)); + y3 = std::min(src.h - 1, std::max(y3, 0)); + + gridptr_x++; + gridptr_y++; + } + } + } +}; + +template +struct gridsample_2d_bicubic_compute_blob +{ + void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& 
in_bound, Mat& value, int permute_fusion, const Option& opt) + { + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#if __AVX2__ + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); +#endif // __AVX2__ +#endif // __AVX__ + + int *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; + + float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; + + for (int i = 0; i < 4; i ++) + { + v0_offset_ptr[i * 4 + 0] = offset.channel(i * 4 + 0); + v0_offset_ptr[i * 4 + 1] = offset.channel(i * 4 + 1); + v0_offset_ptr[i * 4 + 2] = offset.channel(i * 4 + 2); + v0_offset_ptr[i * 4 + 3] = offset.channel(i * 4 + 3); + + v0_in_bound_ptr[i * 4 + 0] = in_bound.channel(i * 4 + 0); + v0_in_bound_ptr[i * 4 + 1] = in_bound.channel(i * 4 + 1); + v0_in_bound_ptr[i * 4 + 2] = in_bound.channel(i * 4 + 2); + v0_in_bound_ptr[i * 4 + 3] = in_bound.channel(i * 4 + 3); + } + + grid_sample_unormalize unormalize; + + if (permute_fusion == 0) + { + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + // x + gx = unormalize(vImgWf, gx); + // y + gy = unormalize(vImgHf, gy); + } + + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); + + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(gx0, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx0, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(gx1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx1, _CMP_GT_OS)); + __m256 x2_in_range = _mm256_and_ps(_mm256_cmp_ps(gx2, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx2, _CMP_GT_OS)); + __m256 x3_in_range = _mm256_and_ps(_mm256_cmp_ps(gx3, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx3, _CMP_GT_OS)); + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + __m256 v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + + __m256 y_in_range = _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS)); + + v0_in_range[i] = _mm256_and_ps(x0_in_range, y_in_range); + v1_in_range[i] = _mm256_and_ps(x1_in_range, y_in_range); + v2_in_range[i] = _mm256_and_ps(x2_in_range, y_in_range); + v3_in_range[i] = _mm256_and_ps(x3_in_range, y_in_range); + + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + __m256 v3_offset_f = 
_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + gridptr += 16; + } + + nn = grid_size & 15; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = *gridptr; + float sample_y = *(gridptr + 1); + + // x + sample_x = unormalize(src.w, sample_x); + // y + sample_y = unormalize(src.h, sample_x); + + int x1 = floor(sample_x); + int y1 = floor(sample_y); + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool y1_in_range = (y1 > -1) & (y1 < src.h); + bool x0_in_range = (x0 > -1) & (x0 < src.w); + bool y0_in_range = (y0 > -1) & (y0 < src.h); + bool x2_in_range = (x2 > -1) & (x2 < src.w); + bool y2_in_range = (y2 > -1) & (y2 < src.h); + bool x3_in_range = (x3 > -1) & (x3 < src.w); + bool y3_in_range = (y3 > -1) & (y3 < src.h); + + bool v00_in_range = x0_in_range & y0_in_range; + bool v01_in_range = x1_in_range & y0_in_range; + bool v02_in_range = x2_in_range & y0_in_range; + bool v03_in_range = x3_in_range & y0_in_range; + bool v10_in_range = x0_in_range & y1_in_range; + bool v11_in_range = x1_in_range & y1_in_range; + bool v12_in_range = x2_in_range & y1_in_range; + bool v13_in_range = x3_in_range & y1_in_range; + bool v20_in_range = x0_in_range & y2_in_range; + bool v21_in_range = x1_in_range & y2_in_range; + bool v22_in_range = x2_in_range & y2_in_range; + bool v23_in_range = x3_in_range & y2_in_range; + bool v30_in_range = x0_in_range & y3_in_range; + bool v31_in_range = x1_in_range & y3_in_range; + bool v32_in_range = x2_in_range & y3_in_range; + bool v33_in_range = x3_in_range & y3_in_range; + + gridptr += 2; + } + } + } + else + { + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); + + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 7 < nn; x += 8) + { + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); + + // compute coord + { + // x + gx = unormalize(vImgWf, gx); + // y + gy = unormalize(vImgHf, gy); + } + + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); + + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); + + __m256 coefficients[4]; + + __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); + __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(gx0, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx0, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(gx1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx1, _CMP_GT_OS)); + __m256 x2_in_range = _mm256_and_ps(_mm256_cmp_ps(gx2, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx2, _CMP_GT_OS)); + __m256 x3_in_range = _mm256_and_ps(_mm256_cmp_ps(gx3, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx3, _CMP_GT_OS)); + + __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; + __m256 v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + + __m256 y_in_range = _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), 
_mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS)); + + v0_in_range[i] = _mm256_and_ps(x0_in_range, y_in_range); + v1_in_range[i] = _mm256_and_ps(x1_in_range, y_in_range); + v2_in_range[i] = _mm256_and_ps(x2_in_range, y_in_range); + v3_in_range[i] = _mm256_and_ps(x3_in_range, y_in_range); + + __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); + __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); + __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); + __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + + v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); + v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); + v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); + v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + } + + gridptr_x += 8; + gridptr_y += 8; + } + + nn = grid_size & 7; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; + + // x + sample_x = unormalize(src.w, sample_x); + // y + sample_y = unormalize(src.h, sample_x); + + int x1 = floor(sample_x); + int y1 = floor(sample_y); + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool y1_in_range = (y1 > -1) & (y1 < src.h); + bool x0_in_range = (x0 > -1) & (x0 < src.w); + bool y0_in_range = (y0 > -1) & (y0 < src.h); + bool x2_in_range = (x2 > -1) & (x2 < src.w); + bool y2_in_range = (y2 > -1) & (y2 < src.h); + bool x3_in_range = (x3 > -1) & (x3 < src.w); + bool y3_in_range = (y3 > -1) & (y3 < src.h); + + bool v00_in_range = x0_in_range & y0_in_range; + bool v01_in_range = x1_in_range & y0_in_range; + bool v02_in_range = x2_in_range & y0_in_range; + bool v03_in_range = x3_in_range & y0_in_range; + bool v10_in_range = x0_in_range & y1_in_range; + bool v11_in_range = x1_in_range & y1_in_range; + bool v12_in_range = x2_in_range & y1_in_range; + bool v13_in_range = x3_in_range & y1_in_range; + bool v20_in_range = x0_in_range & y2_in_range; + bool v21_in_range = x1_in_range & y2_in_range; + bool v22_in_range = x2_in_range & y2_in_range; + bool v23_in_range = x3_in_range & y2_in_range; + bool v30_in_range = x0_in_range & y3_in_range; + bool v31_in_range = x1_in_range & y3_in_range; + bool v32_in_range = x2_in_range & y3_in_range; + bool v33_in_range = x3_in_range & y3_in_range; + + gridptr_x++; + gridptr_y++; + } + } + } +}; + + diff --git a/src/layer/x86/gridsample_bicubic_pack1.h b/src/layer/x86/gridsample_bicubic_pack1.h deleted file mode 100644 index 07b2b846305..00000000000 --- a/src/layer/x86/gridsample_bicubic_pack1.h +++ /dev/null @@ -1,1093 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
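// For reference, a scalar sketch of the second stage for zeros padding: gather
// the sixteen taps through precomputed flat offsets and in-bound masks (stored
// as float 0/non-zero planes, as in the tables above), blend each row with the
// x coefficients, then blend the rows with the y coefficients. It mirrors the
// scalar tail of the pack1 code removed below; it is not the vectorised
// implementation and the names are illustrative only.

static float bicubic_sample_zeros_sketch(const float* image, const int offset[4][4],
                                         const float in_bound[4][4],
                                         const float x_coeffs[4], const float y_coeffs[4])
{
    float rows[4];
    for (int i = 0; i < 4; i++)
    {
        float v = 0.f;
        for (int j = 0; j < 4; j++)
        {
            const float tap = (in_bound[i][j] != 0.f) ? image[offset[i][j]] : 0.f;
            v += tap * x_coeffs[j];
        }
        rows[i] = v;
    }
    return rows[0] * y_coeffs[0] + rows[1] * y_coeffs[1]
           + rows[2] * y_coeffs[2] + rows[3] * y_coeffs[3];
}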
- -static void gridsample_2d_bicubic_align0_zeros_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - } - - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); - - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); - - __m256 coefficients[4]; - - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(gx0, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx0, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(gx1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx1, _CMP_GT_OS)); - __m256 x2_in_range = _mm256_and_ps(_mm256_cmp_ps(gx2, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx2, _CMP_GT_OS)); - __m256 x3_in_range = _mm256_and_ps(_mm256_cmp_ps(gx3, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx3, _CMP_GT_OS)); - - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - __m256 v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - - __m256 y_in_range = _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS)); - - v0_in_range[i] = _mm256_and_ps(x0_in_range, y_in_range); - v1_in_range[i] = _mm256_and_ps(x1_in_range, y_in_range); - v2_in_range[i] = _mm256_and_ps(x2_in_range, y_in_range); - v3_in_range[i] = _mm256_and_ps(x3_in_range, y_in_range); - - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); - - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < src.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], v0_in_range[i]); - __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], v1_in_range[i]); - __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], v2_in_range[i]); - __m256 x3_val = 
mask_gather_ps256(src.channel(q), v3_offset[i], v3_in_range[i]); - - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = ((sample_x + 1) * src.w - 1) / 2.f; - sample_y = ((sample_y + 1) * src.h - 1) / 2.f; - - int x1 = floor(sample_x); - int y1 = floor(sample_y); - int x0 = x1 - 1; - int y0 = y1 - 1; - int x2 = x1 + 1; - int y2 = y1 + 1; - int x3 = x1 + 2; - int y3 = y1 + 2; - - bool x1_in_range = (x1 > -1) & (x1 < src.w); - bool y1_in_range = (y1 > -1) & (y1 < src.h); - bool x0_in_range = (x0 > -1) & (x0 < src.w); - bool y0_in_range = (y0 > -1) & (y0 < src.h); - bool x2_in_range = (x2 > -1) & (x2 < src.w); - bool y2_in_range = (y2 > -1) & (y2 < src.h); - bool x3_in_range = (x3 > -1) & (x3 < src.w); - bool y3_in_range = (y3 > -1) & (y3 < src.h); - - bool v00_in_range = x0_in_range & y0_in_range; - bool v01_in_range = x1_in_range & y0_in_range; - bool v02_in_range = x2_in_range & y0_in_range; - bool v03_in_range = x3_in_range & y0_in_range; - bool v10_in_range = x0_in_range & y1_in_range; - bool v11_in_range = x1_in_range & y1_in_range; - bool v12_in_range = x2_in_range & y1_in_range; - bool v13_in_range = x3_in_range & y1_in_range; - bool v20_in_range = x0_in_range & y2_in_range; - bool v21_in_range = x1_in_range & y2_in_range; - bool v22_in_range = x2_in_range & y2_in_range; - bool v23_in_range = x3_in_range & y2_in_range; - bool v30_in_range = x0_in_range & y3_in_range; - bool v31_in_range = x1_in_range & y3_in_range; - bool v32_in_range = x2_in_range & y3_in_range; - bool v33_in_range = x3_in_range & y3_in_range; - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - - float v00 = v00_in_range ? image.row(y0)[x0] : 0; - float v01 = v01_in_range ? image.row(y0)[x1] : 0; - float v02 = v02_in_range ? image.row(y0)[x2] : 0; - float v03 = v03_in_range ? image.row(y0)[x3] : 0; - float v10 = v10_in_range ? image.row(y1)[x0] : 0; - float v11 = v11_in_range ? image.row(y1)[x1] : 0; - float v12 = v12_in_range ? image.row(y1)[x2] : 0; - float v13 = v13_in_range ? image.row(y1)[x3] : 0; - float v20 = v20_in_range ? image.row(y2)[x0] : 0; - float v21 = v21_in_range ? image.row(y2)[x1] : 0; - float v22 = v22_in_range ? image.row(y2)[x2] : 0; - float v23 = v23_in_range ? image.row(y2)[x3] : 0; - float v30 = v30_in_range ? image.row(y3)[x0] : 0; - float v31 = v31_in_range ? image.row(y3)[x1] : 0; - float v32 = v32_in_range ? image.row(y3)[x2] : 0; - float v33 = v33_in_range ? 
image.row(y3)[x3] : 0; - - float x_coeffs[4]; - float y_coeffs[4]; - interpolate_cubic(sample_x - x1, x_coeffs); - interpolate_cubic(sample_y - y1, y_coeffs); - - float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; - float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; - float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; - float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; - - dst.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; - } - } - } -} - -static void gridsample_2d_bicubic_align1_zeros_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - } - - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); - - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); - - __m256 coefficients[4]; - - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(gx0, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx0, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(gx1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx1, _CMP_GT_OS)); - __m256 x2_in_range = _mm256_and_ps(_mm256_cmp_ps(gx2, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx2, _CMP_GT_OS)); - __m256 x3_in_range = _mm256_and_ps(_mm256_cmp_ps(gx3, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx3, _CMP_GT_OS)); - - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - __m256 v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - - __m256 y_in_range = _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS)); - - v0_in_range[i] = _mm256_and_ps(x0_in_range, y_in_range); - v1_in_range[i] = _mm256_and_ps(x1_in_range, y_in_range); - v2_in_range[i] = _mm256_and_ps(x2_in_range, y_in_range); - v3_in_range[i] = _mm256_and_ps(x3_in_range, y_in_range); - - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - __m256 
v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); - - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < src.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], v0_in_range[i]); - __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], v1_in_range[i]); - __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], v2_in_range[i]); - __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], v3_in_range[i]); - - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = (sample_x + 1) / 2.f * (src.w - 1); - sample_y = (sample_y + 1) / 2.f * (src.h - 1); - - int x1 = floor(sample_x); - int y1 = floor(sample_y); - int x0 = x1 - 1; - int y0 = y1 - 1; - int x2 = x1 + 1; - int y2 = y1 + 1; - int x3 = x1 + 2; - int y3 = y1 + 2; - - bool x1_in_range = (x1 > -1) & (x1 < src.w); - bool y1_in_range = (y1 > -1) & (y1 < src.h); - bool x0_in_range = (x0 > -1) & (x0 < src.w); - bool y0_in_range = (y0 > -1) & (y0 < src.h); - bool x2_in_range = (x2 > -1) & (x2 < src.w); - bool y2_in_range = (y2 > -1) & (y2 < src.h); - bool x3_in_range = (x3 > -1) & (x3 < src.w); - bool y3_in_range = (y3 > -1) & (y3 < src.h); - - bool v00_in_range = x0_in_range & y0_in_range; - bool v01_in_range = x1_in_range & y0_in_range; - bool v02_in_range = x2_in_range & y0_in_range; - bool v03_in_range = x3_in_range & y0_in_range; - bool v10_in_range = x0_in_range & y1_in_range; - bool v11_in_range = x1_in_range & y1_in_range; - bool v12_in_range = x2_in_range & y1_in_range; - bool v13_in_range = x3_in_range & y1_in_range; - bool v20_in_range = x0_in_range & y2_in_range; - bool v21_in_range = x1_in_range & y2_in_range; - bool v22_in_range = x2_in_range & y2_in_range; - bool v23_in_range = x3_in_range & y2_in_range; - bool v30_in_range = x0_in_range & y3_in_range; - bool v31_in_range = x1_in_range & y3_in_range; - bool v32_in_range = x2_in_range & y3_in_range; - bool v33_in_range = x3_in_range & y3_in_range; - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - - float v00 = v00_in_range ? image.row(y0)[x0] : 0; - float v01 = v01_in_range ? image.row(y0)[x1] : 0; - float v02 = v02_in_range ? image.row(y0)[x2] : 0; - float v03 = v03_in_range ? image.row(y0)[x3] : 0; - float v10 = v10_in_range ? image.row(y1)[x0] : 0; - float v11 = v11_in_range ? image.row(y1)[x1] : 0; - float v12 = v12_in_range ? image.row(y1)[x2] : 0; - float v13 = v13_in_range ? image.row(y1)[x3] : 0; - float v20 = v20_in_range ? image.row(y2)[x0] : 0; - float v21 = v21_in_range ? image.row(y2)[x1] : 0; - float v22 = v22_in_range ? image.row(y2)[x2] : 0; - float v23 = v23_in_range ? image.row(y2)[x3] : 0; - float v30 = v30_in_range ? image.row(y3)[x0] : 0; - float v31 = v31_in_range ? image.row(y3)[x1] : 0; - float v32 = v32_in_range ? image.row(y3)[x2] : 0; - float v33 = v33_in_range ? 
image.row(y3)[x3] : 0; - - float x_coeffs[4]; - float y_coeffs[4]; - interpolate_cubic(sample_x - x1, x_coeffs); - interpolate_cubic(sample_y - y1, y_coeffs); - - float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; - float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; - float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; - float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; - - dst.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; - } - } - } -} - -static void gridsample_2d_bicubic_align0_border_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); - - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); - - __m256 coefficients[4]; - - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - - gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); - gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); - gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); - gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); - - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); - - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < src.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], *(__m256*)_ps256_n1); - __m256 x1_val = 
mask_gather_ps256(src.channel(q), v1_offset[i], *(__m256*)_ps256_n1); - __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], *(__m256*)_ps256_n1); - __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], *(__m256*)_ps256_n1); - - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = ((sample_x + 1) * src.w - 1) / 2.f; - sample_y = ((sample_y + 1) * src.h - 1) / 2.f; - - int x_floor = floor(sample_x); - int y_floor = floor(sample_y); - - int x1 = x_floor; - int y1 = y_floor; - int x0 = x1 - 1; - int y0 = y1 - 1; - int x2 = x1 + 1; - int y2 = y1 + 1; - int x3 = x1 + 2; - int y3 = y1 + 2; - - x1 = std::min(src.w - 1, std::max(x1, 0)); - y1 = std::min(src.h - 1, std::max(y1, 0)); - x0 = std::min(src.w - 1, std::max(x0, 0)); - y0 = std::min(src.h - 1, std::max(y0, 0)); - x2 = std::min(src.w - 1, std::max(x2, 0)); - y2 = std::min(src.h - 1, std::max(y2, 0)); - x3 = std::min(src.w - 1, std::max(x3, 0)); - y3 = std::min(src.h - 1, std::max(y3, 0)); - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - - float v00 = image.row(y0)[x0]; - float v01 = image.row(y0)[x1]; - float v02 = image.row(y0)[x2]; - float v03 = image.row(y0)[x3]; - float v10 = image.row(y1)[x0]; - float v11 = image.row(y1)[x1]; - float v12 = image.row(y1)[x2]; - float v13 = image.row(y1)[x3]; - float v20 = image.row(y2)[x0]; - float v21 = image.row(y2)[x1]; - float v22 = image.row(y2)[x2]; - float v23 = image.row(y2)[x3]; - float v30 = image.row(y3)[x0]; - float v31 = image.row(y3)[x1]; - float v32 = image.row(y3)[x2]; - float v33 = image.row(y3)[x3]; - - float x_coeffs[4]; - float y_coeffs[4]; - interpolate_cubic(sample_x - x_floor, x_coeffs); - interpolate_cubic(sample_y - y_floor, y_coeffs); - - float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; - float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; - float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; - float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; - - dst.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; - } - } - } -} - -static void gridsample_2d_bicubic_align1_border_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - 
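            // Note on the coordinate mapping used below: with align_corners=1 the grid
            // value in [-1, 1] is unnormalized as x = (gx + 1) / 2 * (W - 1), whereas the
            // align0 variants above use x = ((gx + 1) * W - 1) / 2. Border padding then
            // clamps every bicubic tap into [0, W - 1] / [0, H - 1] via the min/max with
            // border_x / border_y, so no per-tap in-range masks are needed and the
            // gathers run with an all-ones mask.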
const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); - - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); - - __m256 coefficients[4]; - - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - - gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); - gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); - gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); - gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); - - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); - - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < src.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], *(__m256*)_ps256_n1); - __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], *(__m256*)_ps256_n1); - __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], *(__m256*)_ps256_n1); - __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], *(__m256*)_ps256_n1); - - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = (sample_x + 1) / 2.f * (src.w - 1); - sample_y = (sample_y + 1) / 2.f * (src.h - 1); - - int x_floor = floor(sample_x); - int y_floor = floor(sample_y); - - int x1 = x_floor; - int y1 = y_floor; - int x0 = x1 - 1; - int y0 = y1 - 1; - int x2 = x1 + 1; - int y2 = y1 + 1; - int x3 = x1 + 2; - int y3 = y1 + 2; - - x1 = std::min(src.w - 1, std::max(x1, 0)); - y1 = std::min(src.h - 1, std::max(y1, 0)); - x0 = std::min(src.w - 1, std::max(x0, 0)); - y0 = std::min(src.h - 1, std::max(y0, 0)); - x2 = std::min(src.w - 1, std::max(x2, 0)); - y2 = std::min(src.h - 1, std::max(y2, 0)); - x3 = std::min(src.w - 1, std::max(x3, 0)); - y3 = std::min(src.h - 1, std::max(y3, 0)); - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - - float v00 = image.row(y0)[x0]; - float v01 = image.row(y0)[x1]; - float v02 = image.row(y0)[x2]; - float v03 = image.row(y0)[x3]; - float v10 = 
image.row(y1)[x0]; - float v11 = image.row(y1)[x1]; - float v12 = image.row(y1)[x2]; - float v13 = image.row(y1)[x3]; - float v20 = image.row(y2)[x0]; - float v21 = image.row(y2)[x1]; - float v22 = image.row(y2)[x2]; - float v23 = image.row(y2)[x3]; - float v30 = image.row(y3)[x0]; - float v31 = image.row(y3)[x1]; - float v32 = image.row(y3)[x2]; - float v33 = image.row(y3)[x3]; - - float x_coeffs[4]; - float y_coeffs[4]; - interpolate_cubic(sample_x - x_floor, x_coeffs); - interpolate_cubic(sample_y - y_floor, y_coeffs); - - float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; - float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; - float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; - float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; - - dst.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; - } - } - } -} - -static void gridsample_2d_bicubic_align0_reflection_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); - - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); - - __m256 coefficients[4]; - - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - const __m256 v0p5fp8 = _mm256_set1_ps(0.5f); - { - // x0 - gx0 = _mm256_add_ps(gx0, v0p5fp8); - - gx0 = _mm256_and_ps(gx0, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx0 = _mm256_sub_ps(vImgWf, reflectx0_v); - - gx0 = _mm256_sub_ps(gx0, v0p5fp8); - - _mm256_sub_ps(gx0, v0p5fp8); - - gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); - - // x1 - gx1 = _mm256_add_ps(gx1, v0p5fp8); - - gx1 = _mm256_and_ps(gx1, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx1_v = _mm256_and_ps(_mm256_sub_ps(gx1, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx1 = _mm256_sub_ps(vImgWf, reflectx1_v); - - gx1 = _mm256_sub_ps(gx1, v0p5fp8); - - _mm256_sub_ps(gx1, v0p5fp8); - - gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, 
_mm256_setzero_ps())); - - // x2 - gx2 = _mm256_add_ps(gx2, v0p5fp8); - - gx2 = _mm256_and_ps(gx2, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx2_v = _mm256_and_ps(_mm256_sub_ps(gx2, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx2 = _mm256_sub_ps(vImgWf, reflectx2_v); - - gx2 = _mm256_sub_ps(gx2, v0p5fp8); - - _mm256_sub_ps(gx2, v0p5fp8); - - gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); - - // x3 - gx3 = _mm256_add_ps(gx3, v0p5fp8); - - gx3 = _mm256_and_ps(gx3, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx3_v = _mm256_and_ps(_mm256_sub_ps(gx3, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx3 = _mm256_sub_ps(vImgWf, reflectx3_v); - - gx3 = _mm256_sub_ps(gx3, v0p5fp8); - - _mm256_sub_ps(gx3, v0p5fp8); - - gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); - } - - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - - { - //y - gy = _mm256_add_ps(gy, v0p5fp8); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(vImgHf, reflecty_v); - - gy = _mm256_sub_ps(gy, v0p5fp8); - - _mm256_sub_ps(gy, v0p5fp8); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } - - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); - - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < src.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], *(__m256*)_ps256_n1); - __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], *(__m256*)_ps256_n1); - __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], *(__m256*)_ps256_n1); - __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], *(__m256*)_ps256_n1); - - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = ((sample_x + 1) * src.w - 1) / 2.f; - sample_y = ((sample_y + 1) * src.h - 1) / 2.f; - - int x_floor = floor(sample_x); - int y_floor = floor(sample_y); - - int x1 = x_floor; - int y1 = y_floor; - int x0 = x1 - 1; - int y0 = y1 - 1; - int x2 = x1 + 1; - int y2 = y1 + 1; - int x3 = x1 + 2; - int y3 = y1 + 2; - - x0 = static_cast(reflect_coord(x0 + 0.5, src.w) - 0.5); - - y0 = static_cast(reflect_coord(y0 + 0.5, src.h) - 0.5); - - x0 = std::min(src.w - 1, std::max(x0, 0)); - y0 = std::min(src.h - 1, std::max(y0, 0)); - - x1 = static_cast(reflect_coord(x1 + 0.5, src.w) - 0.5); - - y1 = static_cast(reflect_coord(y1 + 0.5, src.h) - 0.5); - - x1 = std::min(src.w - 1, std::max(x1, 0)); - y1 = std::min(src.h - 1, std::max(y1, 0)); - - x2 = static_cast(reflect_coord(x2 + 0.5, src.w) - 0.5); - - y2 = 
static_cast(reflect_coord(y2 + 0.5, src.h) - 0.5); - - x2 = std::min(src.w - 1, std::max(x2, 0)); - y2 = std::min(src.h - 1, std::max(y2, 0)); - - x3 = static_cast(reflect_coord(x3 + 0.5, src.w) - 0.5); - - y3 = static_cast(reflect_coord(y3 + 0.5, src.h) - 0.5); - - x3 = std::min(src.w - 1, std::max(x3, 0)); - y3 = std::min(src.h - 1, std::max(y3, 0)); - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - - float v00 = image.row(y0)[x0]; - float v01 = image.row(y0)[x1]; - float v02 = image.row(y0)[x2]; - float v03 = image.row(y0)[x3]; - float v10 = image.row(y1)[x0]; - float v11 = image.row(y1)[x1]; - float v12 = image.row(y1)[x2]; - float v13 = image.row(y1)[x3]; - float v20 = image.row(y2)[x0]; - float v21 = image.row(y2)[x1]; - float v22 = image.row(y2)[x2]; - float v23 = image.row(y2)[x3]; - float v30 = image.row(y3)[x0]; - float v31 = image.row(y3)[x1]; - float v32 = image.row(y3)[x2]; - float v33 = image.row(y3)[x3]; - - float x_coeffs[4]; - float y_coeffs[4]; - interpolate_cubic(sample_x - x_floor, x_coeffs); - interpolate_cubic(sample_y - y_floor, y_coeffs); - - float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; - float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; - float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; - float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; - - dst.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; - } - } - } -} - -static void gridsample_2d_bicubic_align1_reflection_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); - - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); - - __m256 coefficients[4]; - - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - { - // x0 - gx0 = _mm256_and_ps(gx0, *(__m256*)_ps256_inv_sign_mask); - __m256 reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, border_x), *(__m256*)_ps256_inv_sign_mask); - gx0 = _mm256_sub_ps(border_x, reflectx0_v); - - // x1 
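                // (same fold as x0: with align_corners=1 each tap is reflected into
                //  [0, W - 1] by taking |gx| and then border_x - | |gx| - border_x |;
                //  gy is folded the same way against border_y inside the row loop)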
- gx1 = _mm256_and_ps(gx1, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx1_v = _mm256_and_ps(_mm256_sub_ps(gx1, border_x), *(__m256*)_ps256_inv_sign_mask); - gx1 = _mm256_sub_ps(border_x, reflectx1_v); - - // x2 - gx2 = _mm256_and_ps(gx2, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx2_v = _mm256_and_ps(_mm256_sub_ps(gx2, border_x), *(__m256*)_ps256_inv_sign_mask); - gx2 = _mm256_sub_ps(border_x, reflectx2_v); - - // x3 - gx3 = _mm256_and_ps(gx3, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx3_v = _mm256_and_ps(_mm256_sub_ps(gx3, border_x), *(__m256*)_ps256_inv_sign_mask); - gx3 = _mm256_sub_ps(border_x, reflectx3_v); - } - - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - - { - //y - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(border_y, reflecty_v); - } - - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); - - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < src.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], *(__m256*)_ps256_n1); - __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], *(__m256*)_ps256_n1); - __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], *(__m256*)_ps256_n1); - __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], *(__m256*)_ps256_n1); - - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = (sample_x + 1) / 2.f * (src.w - 1); - sample_y = (sample_y + 1) / 2.f * (src.h - 1); - - int x_floor = floor(sample_x); - int y_floor = floor(sample_y); - - int x1 = x_floor; - int y1 = y_floor; - int x0 = x1 - 1; - int y0 = y1 - 1; - int x2 = x1 + 1; - int y2 = y1 + 1; - int x3 = x1 + 2; - int y3 = y1 + 2; - - x0 = static_cast(reflect_coord(x0, src.w - 1)); - y0 = static_cast(reflect_coord(y0, src.h - 1)); - x1 = static_cast(reflect_coord(x1, src.w - 1)); - y1 = static_cast(reflect_coord(y1, src.h - 1)); - x2 = static_cast(reflect_coord(x2, src.w - 1)); - y2 = static_cast(reflect_coord(y2, src.h - 1)); - x3 = static_cast(reflect_coord(x3, src.w - 1)); - y3 = static_cast(reflect_coord(y3, src.h - 1)); - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - - float v00 = image.row(y0)[x0]; - float v01 = image.row(y0)[x1]; - float v02 = image.row(y0)[x2]; - float v03 = image.row(y0)[x3]; - float v10 = image.row(y1)[x0]; - float v11 = image.row(y1)[x1]; - float v12 = image.row(y1)[x2]; - float v13 = image.row(y1)[x3]; - float v20 = image.row(y2)[x0]; - float v21 = image.row(y2)[x1]; - float v22 = image.row(y2)[x2]; - float v23 = 
image.row(y2)[x3]; - float v30 = image.row(y3)[x0]; - float v31 = image.row(y3)[x1]; - float v32 = image.row(y3)[x2]; - float v33 = image.row(y3)[x3]; - - float x_coeffs[4]; - float y_coeffs[4]; - interpolate_cubic(sample_x - x_floor, x_coeffs); - interpolate_cubic(sample_y - y_floor, y_coeffs); - - float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; - float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; - float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; - float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; - - dst.channel(q).row(y)[x / 2] = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; - } - } - } -} \ No newline at end of file diff --git a/src/layer/x86/gridsample_bicubic_pack16.h b/src/layer/x86/gridsample_bicubic_pack16.h deleted file mode 100644 index ea3d7950bd0..00000000000 --- a/src/layer/x86/gridsample_bicubic_pack16.h +++ /dev/null @@ -1,687 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -static NCNN_FORCEINLINE __m512 cubic_interp1d_p16(const __m512& x0_v, const __m512& x1_v, const __m512& x2_v, const __m512& x3_v, const __m512& tx) -{ - const __m512 A = _mm512_set1_ps(-0.75f); - - const __m512 x0 = _mm512_add_ps(tx, *(__m512*)_ps512_1); - const __m512& x1 = tx; - const __m512 x2 = _mm512_sub_ps(*(__m512*)_ps512_1, tx); - //const __m512 x3 = _mm512_add_ps(x2, *(__m512*)_ps512_1); - - const __m512 coeffs0 = _mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(A, x0), _mm512_mul_ps(_mm512_set1_ps(5.0f), A)), x0), _mm512_mul_ps(_mm512_set1_ps(8.0f), A)), x0), _mm512_mul_ps(_mm512_set1_ps(4), A)); - const __m512 coeffs1 = _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(A, _mm512_set1_ps(2.0f)), x1), _mm512_add_ps(A, _mm512_set1_ps(3.0f))), x1), x1), *(__m512*)_ps512_1); - const __m512 coeffs2 = _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(A, _mm512_set1_ps(2.0f)), x2), _mm512_add_ps(A, _mm512_set1_ps(3.0f))), x2), x2), *(__m512*)_ps512_1); - const __m512 coeffs3 = _mm512_sub_ps(_mm512_sub_ps(_mm512_sub_ps(*(__m512*)_ps512_1, coeffs0), coeffs1), coeffs2); - - __m512 _v = _mm512_mul_ps(coeffs0, x0_v); - _v = _mm512_fmadd_ps(coeffs1, x1_v, _v); - _v = _mm512_fmadd_ps(coeffs2, x2_v, _v); - _v = _mm512_fmadd_ps(coeffs3, x3_v, _v); - - return _v; -} - -static void gridsample_2d_bicubic_align0_zeros_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - - const __m512 vElempackf = _mm512_set1_ps(src.elempack); - - 
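    // In the packed layouts the gather offsets are computed in floats as
    // offset = ((gy * W + gx) * elempack) + lane, with lane = 0..15 supplied by the
    // _mm512_set_ps(15, ..., 0) vector below, so one masked gather loads a whole
    // elempack-16 pixel. Zeros padding falls out of the __mmask16 in-range masks:
    // out-of-bounds lanes keep the _mm512_setzero_ps() source.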
#pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); - - // y - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - } - - __m512 gx_floor = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); - __m512 gy_floor = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); - - const __m512 tx = _mm512_sub_ps(gx, gx_floor); - const __m512 ty = _mm512_sub_ps(gy, gy_floor); - - __m512 coefficients[4]; - - __m512 gx0 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_n1); - __m512 gx1 = gx_floor; - __m512 gx2 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_1); - __m512 gx3 = _mm512_add_ps(gx_floor, _mm512_set1_ps(2.0f)); - - __m512i x0 = _mm512_cvtps_epi32(gx0); - __m512i x1 = _mm512_cvtps_epi32(gx1); - __m512i x2 = _mm512_cvtps_epi32(gx2); - __m512i x3 = _mm512_cvtps_epi32(gx3); - - __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x0); - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 x2_in_range = _mm512_cmpgt_epi32_mask(x2, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x2); - __mmask16 x3_in_range = _mm512_cmpgt_epi32_mask(x3, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x3); - - __m512i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - __mmask16 v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm512_add_ps(gy_floor, _mm512_set1_ps(-1.0f + i)); - - __m512i y = _mm512_cvtps_epi32(gy); - - __mmask16 y_in_range = _mm512_cmpgt_epi32_mask(y, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y); - - v0_in_range[i] = x0_in_range & y_in_range; - v1_in_range[i] = x1_in_range & y_in_range; - v2_in_range[i] = x2_in_range & y_in_range; - v3_in_range[i] = x3_in_range & y_in_range; - - __m512 v0_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m512 v2_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m512 v3_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - v0_offset[i] = _mm512_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm512_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm512_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm512_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < dst.c; q++) - { - for (int i = 0; i < 4; i++) - { - 
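                    // i walks the four bicubic rows gy_floor-1 .. gy_floor+2: the four
                    // x-taps of row i are gathered with the offsets/masks prepared above
                    // and reduced along x by cubic_interp1d_p16; the four row results are
                    // then blended along y after this loop.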
__m512 x0_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v0_in_range[i], v0_offset[i], src.channel(q), sizeof(float)); - __m512 x1_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v1_in_range[i], v1_offset[i], src.channel(q), sizeof(float)); - __m512 x2_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v2_in_range[i], v2_offset[i], src.channel(q), sizeof(float)); - __m512 x3_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v3_in_range[i], v3_offset[i], src.channel(q), sizeof(float)); - - coefficients[i] = cubic_interp1d_p16(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m512 _v = cubic_interp1d_p16(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bicubic_align1_zeros_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - - const __m512 vElempackf = _mm512_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - } - - __m512 gx_floor = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); - __m512 gy_floor = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); - - const __m512 tx = _mm512_sub_ps(gx, gx_floor); - const __m512 ty = _mm512_sub_ps(gy, gy_floor); - - __m512 coefficients[4]; - - __m512 gx0 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_n1); - __m512 gx1 = gx_floor; - __m512 gx2 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_1); - __m512 gx3 = _mm512_add_ps(gx_floor, _mm512_set1_ps(2.0f)); - - __m512i x0 = _mm512_cvtps_epi32(gx0); - __m512i x1 = _mm512_cvtps_epi32(gx1); - __m512i x2 = _mm512_cvtps_epi32(gx2); - __m512i x3 = _mm512_cvtps_epi32(gx3); - - __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x0); - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 x2_in_range = _mm512_cmpgt_epi32_mask(x2, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x2); - __mmask16 x3_in_range = _mm512_cmpgt_epi32_mask(x3, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x3); - - __m512i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - __mmask16 v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm512_add_ps(gy_floor, _mm512_set1_ps(-1.0f + i)); - - __m512i y = _mm512_cvtps_epi32(gy); - - __mmask16 y_in_range = _mm512_cmpgt_epi32_mask(y, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y); - - v0_in_range[i] = x0_in_range & y_in_range; - v1_in_range[i] = x1_in_range & y_in_range; - v2_in_range[i] = x2_in_range & y_in_range; - v3_in_range[i] = x3_in_range & y_in_range; - - __m512 v0_offset_f = 
_mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m512 v2_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m512 v3_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - v0_offset[i] = _mm512_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm512_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm512_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm512_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < dst.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m512 x0_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v0_in_range[i], v0_offset[i], src.channel(q), sizeof(float)); - __m512 x1_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v1_in_range[i], v1_offset[i], src.channel(q), sizeof(float)); - __m512 x2_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v2_in_range[i], v2_offset[i], src.channel(q), sizeof(float)); - __m512 x3_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v3_in_range[i], v3_offset[i], src.channel(q), sizeof(float)); - - coefficients[i] = cubic_interp1d_p16(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m512 _v = cubic_interp1d_p16(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bicubic_align0_border_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - - const __m512 vElempackf = _mm512_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - - const __m512 two = _mm512_set1_ps(2.f); - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - - __m512 gx_floor = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); - __m512 gy_floor = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); - - const __m512 tx = _mm512_sub_ps(gx, gx_floor); - const __m512 ty = _mm512_sub_ps(gy, gy_floor); - - __m512 coefficients[4]; - - __m512 gx0 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_n1); - __m512 gx1 = gx_floor; - __m512 gx2 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_1); - __m512 gx3 = _mm512_add_ps(gx_floor, _mm512_set1_ps(2.0f)); - - gx0 = _mm512_min_ps(border_x, _mm512_max_ps(gx0, _mm512_setzero_ps())); - gx1 = 
_mm512_min_ps(border_x, _mm512_max_ps(gx1, _mm512_setzero_ps())); - gx2 = _mm512_min_ps(border_x, _mm512_max_ps(gx2, _mm512_setzero_ps())); - gx3 = _mm512_min_ps(border_x, _mm512_max_ps(gx3, _mm512_setzero_ps())); - - __m512i x0 = _mm512_cvtps_epi32(gx0); - __m512i x1 = _mm512_cvtps_epi32(gx1); - __m512i x2 = _mm512_cvtps_epi32(gx2); - __m512i x3 = _mm512_cvtps_epi32(gx3); - - __m512i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm512_add_ps(gy_floor, _mm512_set1_ps(-1.0f + i)); - gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); - - __m512 v0_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m512 v2_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m512 v3_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - v0_offset[i] = _mm512_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm512_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm512_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm512_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < dst.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m512 x0_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v0_offset[i], src.channel(q), sizeof(float)); - __m512 x1_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v1_offset[i], src.channel(q), sizeof(float)); - __m512 x2_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v2_offset[i], src.channel(q), sizeof(float)); - __m512 x3_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v3_offset[i], src.channel(q), sizeof(float)); - - coefficients[i] = cubic_interp1d_p16(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m512 _v = cubic_interp1d_p16(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bicubic_align1_border_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - - const __m512 vElempackf = _mm512_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - - const __m512 two = _mm512_set1_ps(2.f); - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, 
*(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - - __m512 gx_floor = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); - __m512 gy_floor = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); - - const __m512 tx = _mm512_sub_ps(gx, gx_floor); - const __m512 ty = _mm512_sub_ps(gy, gy_floor); - - __m512 coefficients[4]; - - __m512 gx0 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_n1); - __m512 gx1 = gx_floor; - __m512 gx2 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_1); - __m512 gx3 = _mm512_add_ps(gx_floor, _mm512_set1_ps(2.0f)); - - gx0 = _mm512_min_ps(border_x, _mm512_max_ps(gx0, _mm512_setzero_ps())); - gx1 = _mm512_min_ps(border_x, _mm512_max_ps(gx1, _mm512_setzero_ps())); - gx2 = _mm512_min_ps(border_x, _mm512_max_ps(gx2, _mm512_setzero_ps())); - gx3 = _mm512_min_ps(border_x, _mm512_max_ps(gx3, _mm512_setzero_ps())); - - __m512i x0 = _mm512_cvtps_epi32(gx0); - __m512i x1 = _mm512_cvtps_epi32(gx1); - __m512i x2 = _mm512_cvtps_epi32(gx2); - __m512i x3 = _mm512_cvtps_epi32(gx3); - - __m512i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm512_add_ps(gy_floor, _mm512_set1_ps(-1.0f + i)); - gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); - - __m512 v0_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m512 v2_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m512 v3_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - v0_offset[i] = _mm512_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm512_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm512_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm512_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < dst.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m512 x0_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v0_offset[i], src.channel(q), sizeof(float)); - __m512 x1_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v1_offset[i], src.channel(q), sizeof(float)); - __m512 x2_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v2_offset[i], src.channel(q), sizeof(float)); - __m512 x3_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v3_offset[i], src.channel(q), sizeof(float)); - - coefficients[i] = cubic_interp1d_p16(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m512 _v = cubic_interp1d_p16(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bicubic_align0_reflection_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - - const __m512 vElempackf = _mm512_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < 
dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - - const __m512 two = _mm512_set1_ps(2.f); - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - - __m512 gx_floor = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); - __m512 gy_floor = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); - - const __m512 tx = _mm512_sub_ps(gx, gx_floor); - const __m512 ty = _mm512_sub_ps(gy, gy_floor); - - __m512 coefficients[4]; - - __m512 gx0 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_n1); - __m512 gx1 = gx_floor; - __m512 gx2 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_1); - __m512 gx3 = _mm512_add_ps(gx_floor, _mm512_set1_ps(2.0f)); - const __m512 v0p5fp16 = _mm512_set1_ps(0.5f); - { - // x0 - gx0 = _mm512_add_ps(gx0, v0p5fp16); - - gx0 = _mm512_and_ps(gx0, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflectx0_v = _mm512_and_ps(_mm512_sub_ps(gx0, vImgWf), *(__m512*)_ps512_inv_sign_mask); - gx0 = _mm512_sub_ps(vImgWf, reflectx0_v); - - gx0 = _mm512_sub_ps(gx0, v0p5fp16); - - _mm512_sub_ps(gx0, v0p5fp16); - - gx0 = _mm512_min_ps(border_x, _mm512_max_ps(gx0, _mm512_setzero_ps())); - - // x1 - gx1 = _mm512_add_ps(gx1, v0p5fp16); - - gx1 = _mm512_and_ps(gx1, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflectx1_v = _mm512_and_ps(_mm512_sub_ps(gx1, vImgWf), *(__m512*)_ps512_inv_sign_mask); - gx1 = _mm512_sub_ps(vImgWf, reflectx1_v); - - gx1 = _mm512_sub_ps(gx1, v0p5fp16); - - _mm512_sub_ps(gx1, v0p5fp16); - - gx1 = _mm512_min_ps(border_x, _mm512_max_ps(gx1, _mm512_setzero_ps())); - - // x2 - gx2 = _mm512_add_ps(gx2, v0p5fp16); - - gx2 = _mm512_and_ps(gx2, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflectx2_v = _mm512_and_ps(_mm512_sub_ps(gx2, vImgWf), *(__m512*)_ps512_inv_sign_mask); - gx2 = _mm512_sub_ps(vImgWf, reflectx2_v); - - gx2 = _mm512_sub_ps(gx2, v0p5fp16); - - _mm512_sub_ps(gx2, v0p5fp16); - - gx2 = _mm512_min_ps(border_x, _mm512_max_ps(gx2, _mm512_setzero_ps())); - - // x3 - gx3 = _mm512_add_ps(gx3, v0p5fp16); - - gx3 = _mm512_and_ps(gx3, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflectx3_v = _mm512_and_ps(_mm512_sub_ps(gx3, vImgWf), *(__m512*)_ps512_inv_sign_mask); - gx3 = _mm512_sub_ps(vImgWf, reflectx3_v); - - gx3 = _mm512_sub_ps(gx3, v0p5fp16); - - _mm512_sub_ps(gx3, v0p5fp16); - - gx3 = _mm512_min_ps(border_x, _mm512_max_ps(gx3, _mm512_setzero_ps())); - } - - __m512i x0 = _mm512_cvtps_epi32(gx0); - __m512i x1 = _mm512_cvtps_epi32(gx1); - __m512i x2 = _mm512_cvtps_epi32(gx2); - __m512i x3 = _mm512_cvtps_epi32(gx3); - - __m512i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm512_add_ps(gy_floor, _mm512_set1_ps(-1.0f + i)); - - { - //y - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - - gy = _mm512_add_ps(gy, v0p5fp16); - - gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflecty_v = _mm512_and_ps(_mm512_sub_ps(gy, vImgHf), *(__m512*)_ps512_inv_sign_mask); - gy = _mm512_sub_ps(vImgHf, reflecty_v); - - gy = _mm512_sub_ps(gy, v0p5fp16); - - _mm512_sub_ps(gy, v0p5fp16); - - gy = 
_mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); - } - - __m512 v0_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m512 v2_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m512 v3_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - v0_offset[i] = _mm512_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm512_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm512_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm512_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < dst.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m512 x0_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v0_offset[i], src.channel(q), sizeof(float)); - __m512 x1_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v1_offset[i], src.channel(q), sizeof(float)); - __m512 x2_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v2_offset[i], src.channel(q), sizeof(float)); - __m512 x3_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v3_offset[i], src.channel(q), sizeof(float)); - - coefficients[i] = cubic_interp1d_p16(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m512 _v = cubic_interp1d_p16(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bicubic_align1_reflection_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - float* outptr = static_cast(dst.data); - - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - - const __m512 vElempackf = _mm512_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - - const __m512 two = _mm512_set1_ps(2.f); - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - - __m512 gx_floor = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); - __m512 gy_floor = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); - - const __m512 tx = _mm512_sub_ps(gx, gx_floor); - const __m512 ty = _mm512_sub_ps(gy, gy_floor); - - __m512 coefficients[4]; - - __m512 gx0 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_n1); - __m512 gx1 = gx_floor; - __m512 gx2 = _mm512_add_ps(gx_floor, *(__m512*)_ps512_1); - __m512 gx3 = 
_mm512_add_ps(gx_floor, _mm512_set1_ps(2.0f)); - const __m512 v0p5fp16 = _mm512_set1_ps(0.5f); - { - // x0 - gx0 = _mm512_and_ps(gx0, *(__m512*)_ps512_inv_sign_mask); - __m512 reflectx0_v = _mm512_and_ps(_mm512_sub_ps(gx0, border_x), *(__m512*)_ps512_inv_sign_mask); - gx0 = _mm512_sub_ps(border_x, reflectx0_v); - - // x1 - gx1 = _mm512_and_ps(gx1, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflectx1_v = _mm512_and_ps(_mm512_sub_ps(gx1, border_x), *(__m512*)_ps512_inv_sign_mask); - gx1 = _mm512_sub_ps(border_x, reflectx1_v); - - // x2 - gx2 = _mm512_and_ps(gx2, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflectx2_v = _mm512_and_ps(_mm512_sub_ps(gx2, border_x), *(__m512*)_ps512_inv_sign_mask); - gx2 = _mm512_sub_ps(border_x, reflectx2_v); - - // x3 - gx3 = _mm512_and_ps(gx3, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflectx3_v = _mm512_and_ps(_mm512_sub_ps(gx3, border_x), *(__m512*)_ps512_inv_sign_mask); - gx3 = _mm512_sub_ps(border_x, reflectx3_v); - } - - __m512i x0 = _mm512_cvtps_epi32(gx0); - __m512i x1 = _mm512_cvtps_epi32(gx1); - __m512i x2 = _mm512_cvtps_epi32(gx2); - __m512i x3 = _mm512_cvtps_epi32(gx3); - - __m512i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm512_add_ps(gy_floor, _mm512_set1_ps(-1.0f + i)); - - { - //y - gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflecty_v = _mm512_and_ps(_mm512_sub_ps(gy, border_y), *(__m512*)_ps512_inv_sign_mask); - gy = _mm512_sub_ps(border_y, reflecty_v); - } - - __m512 v0_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m512 v1_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m512 v2_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m512 v3_offset_f = _mm512_add_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - v0_offset[i] = _mm512_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm512_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm512_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm512_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < dst.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m512 x0_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v0_offset[i], src.channel(q), sizeof(float)); - __m512 x1_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v1_offset[i], src.channel(q), sizeof(float)); - __m512 x2_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v2_offset[i], src.channel(q), sizeof(float)); - __m512 x3_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, v3_offset[i], src.channel(q), sizeof(float)); - - coefficients[i] = cubic_interp1d_p16(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m512 _v = cubic_interp1d_p16(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} \ No newline at end of file diff --git a/src/layer/x86/gridsample_bicubic_pack4.h 
b/src/layer/x86/gridsample_bicubic_pack4.h deleted file mode 100644 index 624f9005ecb..00000000000 --- a/src/layer/x86/gridsample_bicubic_pack4.h +++ /dev/null @@ -1,662 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -static NCNN_FORCEINLINE __m128 cubic_interp1d_p4(const __m128& x0_v, const __m128& x1_v, const __m128& x2_v, const __m128& x3_v, const __m128& tx) -{ - const __m128 A = _mm_set1_ps(-0.75f); - - const __m128 x0 = _mm_add_ps(tx, v1fp4); - const __m128& x1 = tx; - const __m128 x2 = _mm_sub_ps(v1fp4, tx); - //const __m128 x3 = _mm_add_ps(x2, v1fp4); - - const __m128 coeffs0 = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(A, x0), _mm_mul_ps(_mm_set1_ps(5.0f), A)), x0), _mm_mul_ps(_mm_set1_ps(8.0f), A)), x0), _mm_mul_ps(_mm_set1_ps(4), A)); - const __m128 coeffs1 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set1_ps(2.0f)), x1), _mm_add_ps(A, _mm_set1_ps(3.0f))), x1), x1), v1fp4); - const __m128 coeffs2 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set1_ps(2.0f)), x2), _mm_add_ps(A, _mm_set1_ps(3.0f))), x2), x2), v1fp4); - const __m128 coeffs3 = _mm_sub_ps(_mm_sub_ps(_mm_sub_ps(v1fp4, coeffs0), coeffs1), coeffs2); - - __m128 _v = _mm_mul_ps(coeffs0, x0_v); - _v = _mm_comp_fmadd_ps(coeffs1, x1_v, _v); - _v = _mm_comp_fmadd_ps(coeffs2, x2_v, _v); - _v = _mm_comp_fmadd_ps(coeffs3, x3_v, _v); - - return _v; -} - -static void gridsample_2d_bicubic_align0_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - - const __m128 vElempackf = _mm_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - // x - gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); - - // y - gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - } - - __m128 gx_floor = floor_ps(gx); - __m128 gy_floor = floor_ps(gy); - - const __m128 tx = _mm_sub_ps(gx, gx_floor); - const __m128 ty = _mm_sub_ps(gy, gy_floor); - - __m128 coefficients[4]; - - __m128 gx0 = _mm_add_ps(gx_floor, vn1fp4); - __m128 gx1 = gx_floor; - __m128 gx2 = _mm_add_ps(gx_floor, v1fp4); - __m128 gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); - - __m128i x0 = _mm_cvtps_epi32(gx0); - __m128i x1 = _mm_cvtps_epi32(gx1); - __m128i x2 = _mm_cvtps_epi32(gx2); - __m128i x3 = 
_mm_cvtps_epi32(gx3); - - __m128i x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); - __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - __m128i x2_in_range = _mm_and_si128(_mm_cmpgt_epi32(x2, vn1ip4), _mm_cmpgt_epi32(vImgWi, x2)); - __m128i x3_in_range = _mm_and_si128(_mm_cmpgt_epi32(x3, vn1ip4), _mm_cmpgt_epi32(vImgWi, x3)); - - __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], - v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)); - - __m128i y = _mm_cvtps_epi32(gy); - - __m128i y_in_range = _mm_and_si128(_mm_cmpgt_epi32(y, vn1ip4), _mm_cmpgt_epi32(vImgHi, y)); - - v0_in_range[i] = _mm_and_si128(x0_in_range, y_in_range); - v1_in_range[i] = _mm_and_si128(x1_in_range, y_in_range); - v2_in_range[i] = _mm_and_si128(x2_in_range, y_in_range); - v3_in_range[i] = _mm_and_si128(x3_in_range, y_in_range); - - __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - __m128 v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - __m128 v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - __m128 v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - - v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < dst.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m128 x0_val = mask_gather_ps(src.channel(q), v0_offset[i], _mm_castsi128_ps(v0_in_range[i])); - __m128 x1_val = mask_gather_ps(src.channel(q), v1_offset[i], _mm_castsi128_ps(v1_in_range[i])); - __m128 x2_val = mask_gather_ps(src.channel(q), v2_offset[i], _mm_castsi128_ps(v2_in_range[i])); - __m128 x3_val = mask_gather_ps(src.channel(q), v3_offset[i], _mm_castsi128_ps(v3_in_range[i])); - - coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m128 _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bicubic_align1_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - - const __m128 vElempackf = _mm_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); - gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - } - - __m128 gx_floor = floor_ps(gx); - __m128 gy_floor = floor_ps(gy); - - const __m128 tx = _mm_sub_ps(gx, gx_floor); 
- const __m128 ty = _mm_sub_ps(gy, gy_floor); - - __m128 coefficients[4]; - - __m128 gx0 = _mm_add_ps(gx_floor, vn1fp4); - __m128 gx1 = gx_floor; - __m128 gx2 = _mm_add_ps(gx_floor, v1fp4); - __m128 gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); - - __m128i x0 = _mm_cvtps_epi32(gx0); - __m128i x1 = _mm_cvtps_epi32(gx1); - __m128i x2 = _mm_cvtps_epi32(gx2); - __m128i x3 = _mm_cvtps_epi32(gx3); - - __m128i x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); - __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - __m128i x2_in_range = _mm_and_si128(_mm_cmpgt_epi32(x2, vn1ip4), _mm_cmpgt_epi32(vImgWi, x2)); - __m128i x3_in_range = _mm_and_si128(_mm_cmpgt_epi32(x3, vn1ip4), _mm_cmpgt_epi32(vImgWi, x3)); - - __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4], - v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)); - - __m128i y = _mm_cvtps_epi32(gy); - - __m128i y_in_range = _mm_and_si128(_mm_cmpgt_epi32(y, vn1ip4), _mm_cmpgt_epi32(vImgHi, y)); - - v0_in_range[i] = _mm_and_si128(x0_in_range, y_in_range); - v1_in_range[i] = _mm_and_si128(x1_in_range, y_in_range); - v2_in_range[i] = _mm_and_si128(x2_in_range, y_in_range); - v3_in_range[i] = _mm_and_si128(x3_in_range, y_in_range); - - __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - __m128 v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - __m128 v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - __m128 v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - - v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < dst.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m128 x0_val = mask_gather_ps(src.channel(q), v0_offset[i], _mm_castsi128_ps(v0_in_range[i])); - __m128 x1_val = mask_gather_ps(src.channel(q), v1_offset[i], _mm_castsi128_ps(v1_in_range[i])); - __m128 x2_val = mask_gather_ps(src.channel(q), v2_offset[i], _mm_castsi128_ps(v2_in_range[i])); - __m128 x3_val = mask_gather_ps(src.channel(q), v3_offset[i], _mm_castsi128_ps(v3_in_range[i])); - - coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m128 _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bicubic_align0_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - - const __m128 vElempackf = _mm_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - - const __m128 two = _mm_set1_ps(2.f); - const __m128 border_y = 
_mm_sub_ps(vImgHf, v1fp4); - const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); - gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - - __m128 gx_floor = floor_ps(gx); - __m128 gy_floor = floor_ps(gy); - - const __m128 tx = _mm_sub_ps(gx, gx_floor); - const __m128 ty = _mm_sub_ps(gy, gy_floor); - - __m128 coefficients[4]; - - __m128 gx0 = _mm_add_ps(gx_floor, vn1fp4); - __m128 gx1 = gx_floor; - __m128 gx2 = _mm_add_ps(gx_floor, v1fp4); - __m128 gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); - - gx0 = _mm_min_ps(border_x, _mm_max_ps(gx0, _mm_setzero_ps())); - gx1 = _mm_min_ps(border_x, _mm_max_ps(gx1, _mm_setzero_ps())); - gx2 = _mm_min_ps(border_x, _mm_max_ps(gx2, _mm_setzero_ps())); - gx3 = _mm_min_ps(border_x, _mm_max_ps(gx3, _mm_setzero_ps())); - - __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)); - gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - - __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - __m128 v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - __m128 v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - __m128 v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - - v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < dst.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m128 x0_val = mask_gather_ps(src.channel(q), v0_offset[i], vn1fp4); - __m128 x1_val = mask_gather_ps(src.channel(q), v1_offset[i], vn1fp4); - __m128 x2_val = mask_gather_ps(src.channel(q), v2_offset[i], vn1fp4); - __m128 x3_val = mask_gather_ps(src.channel(q), v3_offset[i], vn1fp4); - - coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m128 _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bicubic_align1_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - - const __m128 vElempackf = _mm_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - - const __m128 two = _mm_set1_ps(2.f); - const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - - gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); - gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - - __m128 gx_floor = floor_ps(gx); - __m128 gy_floor = floor_ps(gy); - - const __m128 tx = _mm_sub_ps(gx, gx_floor); - const __m128 ty = _mm_sub_ps(gy, 
gy_floor); - - __m128 coefficients[4]; - - __m128 gx0 = _mm_add_ps(gx_floor, vn1fp4); - __m128 gx1 = gx_floor; - __m128 gx2 = _mm_add_ps(gx_floor, v1fp4); - __m128 gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); - - gx0 = _mm_min_ps(border_x, _mm_max_ps(gx0, _mm_setzero_ps())); - gx1 = _mm_min_ps(border_x, _mm_max_ps(gx1, _mm_setzero_ps())); - gx2 = _mm_min_ps(border_x, _mm_max_ps(gx2, _mm_setzero_ps())); - gx3 = _mm_min_ps(border_x, _mm_max_ps(gx3, _mm_setzero_ps())); - - __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)); - gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - - __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - __m128 v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - __m128 v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - __m128 v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - - v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < dst.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m128 x0_val = mask_gather_ps(src.channel(q), v0_offset[i], vn1fp4); - __m128 x1_val = mask_gather_ps(src.channel(q), v1_offset[i], vn1fp4); - __m128 x2_val = mask_gather_ps(src.channel(q), v2_offset[i], vn1fp4); - __m128 x3_val = mask_gather_ps(src.channel(q), v3_offset[i], vn1fp4); - - coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m128 _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bicubic_align0_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - - const __m128 vElempackf = _mm_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - - const __m128 two = _mm_set1_ps(2.f); - const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); - gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - - __m128 gx_floor = floor_ps(gx); - __m128 gy_floor = floor_ps(gy); - - const __m128 tx = _mm_sub_ps(gx, gx_floor); - const __m128 ty = _mm_sub_ps(gy, gy_floor); - - __m128 coefficients[4]; - - __m128 gx0 = _mm_add_ps(gx_floor, vn1fp4); - __m128 gx1 = gx_floor; - __m128 gx2 = _mm_add_ps(gx_floor, v1fp4); - __m128 gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); - const __m128 v0p5fp4 = _mm_set1_ps(0.5f); - { - // x0 - gx0 = _mm_add_ps(gx0, v0p5fp4); - - gx0 = _mm_and_ps(gx0, *(__m128*)_ps_inv_sign_mask); - - __m128 reflectx0_v = 
_mm_and_ps(_mm_sub_ps(gx0, vImgWf), *(__m128*)_ps_inv_sign_mask); - gx0 = _mm_sub_ps(vImgWf, reflectx0_v); - - gx0 = _mm_sub_ps(gx0, v0p5fp4); - - _mm_sub_ps(gx0, v0p5fp4); - - gx0 = _mm_min_ps(border_x, _mm_max_ps(gx0, _mm_setzero_ps())); - - // x1 - gx1 = _mm_add_ps(gx1, v0p5fp4); - - gx1 = _mm_and_ps(gx1, *(__m128*)_ps_inv_sign_mask); - - __m128 reflectx1_v = _mm_and_ps(_mm_sub_ps(gx1, vImgWf), *(__m128*)_ps_inv_sign_mask); - gx1 = _mm_sub_ps(vImgWf, reflectx1_v); - - gx1 = _mm_sub_ps(gx1, v0p5fp4); - - _mm_sub_ps(gx1, v0p5fp4); - - gx1 = _mm_min_ps(border_x, _mm_max_ps(gx1, _mm_setzero_ps())); - - // x2 - gx2 = _mm_add_ps(gx2, v0p5fp4); - - gx2 = _mm_and_ps(gx2, *(__m128*)_ps_inv_sign_mask); - - __m128 reflectx2_v = _mm_and_ps(_mm_sub_ps(gx2, vImgWf), *(__m128*)_ps_inv_sign_mask); - gx2 = _mm_sub_ps(vImgWf, reflectx2_v); - - gx2 = _mm_sub_ps(gx2, v0p5fp4); - - _mm_sub_ps(gx2, v0p5fp4); - - gx2 = _mm_min_ps(border_x, _mm_max_ps(gx2, _mm_setzero_ps())); - - // x3 - gx3 = _mm_add_ps(gx3, v0p5fp4); - - gx3 = _mm_and_ps(gx3, *(__m128*)_ps_inv_sign_mask); - - __m128 reflectx3_v = _mm_and_ps(_mm_sub_ps(gx3, vImgWf), *(__m128*)_ps_inv_sign_mask); - gx3 = _mm_sub_ps(vImgWf, reflectx3_v); - - gx3 = _mm_sub_ps(gx3, v0p5fp4); - - _mm_sub_ps(gx3, v0p5fp4); - - gx3 = _mm_min_ps(border_x, _mm_max_ps(gx3, _mm_setzero_ps())); - } - - __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)); - - { - //y - gy = _mm_add_ps(gy, v0p5fp4); - - gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); - - __m128 reflecty_v = _mm_and_ps(_mm_sub_ps(gy, vImgHf), *(__m128*)_ps_inv_sign_mask); - gy = _mm_sub_ps(vImgHf, reflecty_v); - - gy = _mm_sub_ps(gy, v0p5fp4); - - _mm_sub_ps(gy, v0p5fp4); - - gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - } - - __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - __m128 v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - __m128 v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - __m128 v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - - v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < dst.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m128 x0_val = mask_gather_ps(src.channel(q), v0_offset[i], vn1fp4); - __m128 x1_val = mask_gather_ps(src.channel(q), v1_offset[i], vn1fp4); - __m128 x2_val = mask_gather_ps(src.channel(q), v2_offset[i], vn1fp4); - __m128 x3_val = mask_gather_ps(src.channel(q), v3_offset[i], vn1fp4); - - coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m128 _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bicubic_align1_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - - const __m128 vElempackf = _mm_set1_ps(src.elempack); - - #pragma omp parallel for 
num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - - const __m128 two = _mm_set1_ps(2.f); - const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - - gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); - gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - - __m128 gx_floor = floor_ps(gx); - __m128 gy_floor = floor_ps(gy); - - const __m128 tx = _mm_sub_ps(gx, gx_floor); - const __m128 ty = _mm_sub_ps(gy, gy_floor); - - __m128 coefficients[4]; - - __m128 gx0 = _mm_add_ps(gx_floor, vn1fp4); - __m128 gx1 = gx_floor; - __m128 gx2 = _mm_add_ps(gx_floor, v1fp4); - __m128 gx3 = _mm_add_ps(gx_floor, _mm_set1_ps(2.0f)); - { - // x0 - gx0 = _mm_and_ps(gx0, *(__m128*)_ps_inv_sign_mask); - __m128 reflectx0_v = _mm_and_ps(_mm_sub_ps(gx0, border_x), *(__m128*)_ps_inv_sign_mask); - gx0 = _mm_sub_ps(border_x, reflectx0_v); - - // x1 - gx1 = _mm_and_ps(gx1, *(__m128*)_ps_inv_sign_mask); - - __m128 reflectx1_v = _mm_and_ps(_mm_sub_ps(gx1, border_x), *(__m128*)_ps_inv_sign_mask); - gx1 = _mm_sub_ps(border_x, reflectx1_v); - - // x2 - gx2 = _mm_and_ps(gx2, *(__m128*)_ps_inv_sign_mask); - - __m128 reflectx2_v = _mm_and_ps(_mm_sub_ps(gx2, border_x), *(__m128*)_ps_inv_sign_mask); - gx2 = _mm_sub_ps(border_x, reflectx2_v); - - // x3 - gx3 = _mm_and_ps(gx3, *(__m128*)_ps_inv_sign_mask); - - __m128 reflectx3_v = _mm_and_ps(_mm_sub_ps(gx3, border_x), *(__m128*)_ps_inv_sign_mask); - gx3 = _mm_sub_ps(border_x, reflectx3_v); - } - - __m128i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm_add_ps(gy_floor, _mm_set1_ps(-1.0f + i)); - - { - //y - gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); - - __m128 reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps_inv_sign_mask); - gy = _mm_sub_ps(border_y, reflecty_v); - } - - __m128 v0_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - __m128 v1_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - __m128 v2_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - __m128 v3_offset_f = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); - - v0_offset[i] = _mm_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < dst.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m128 x0_val = mask_gather_ps(src.channel(q), v0_offset[i], vn1fp4); - __m128 x1_val = mask_gather_ps(src.channel(q), v1_offset[i], vn1fp4); - __m128 x2_val = mask_gather_ps(src.channel(q), v2_offset[i], vn1fp4); - __m128 x3_val = mask_gather_ps(src.channel(q), v3_offset[i], vn1fp4); - - coefficients[i] = cubic_interp1d_p4(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m128 _v = cubic_interp1d_p4(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} \ No newline at end 
of file diff --git a/src/layer/x86/gridsample_bicubic_pack8.h b/src/layer/x86/gridsample_bicubic_pack8.h deleted file mode 100644 index 1d33f0f527d..00000000000 --- a/src/layer/x86/gridsample_bicubic_pack8.h +++ /dev/null @@ -1,622 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -static void gridsample_2d_bicubic_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); - } - - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); - - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); - - __m256 coefficients[4]; - - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(gx0, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx0, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(gx1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx1, _CMP_GT_OS)); - __m256 x2_in_range = _mm256_and_ps(_mm256_cmp_ps(gx2, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx2, _CMP_GT_OS)); - __m256 x3_in_range = _mm256_and_ps(_mm256_cmp_ps(gx3, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx3, _CMP_GT_OS)); - - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - __m256 v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - - __m256 y_in_range = _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS)); - - v0_in_range[i] = _mm256_and_ps(x0_in_range, y_in_range); - v1_in_range[i] = _mm256_and_ps(x1_in_range, y_in_range); - v2_in_range[i] = _mm256_and_ps(x2_in_range, y_in_range); - v3_in_range[i] = _mm256_and_ps(x3_in_range, y_in_range); - - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), 
gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < dst.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], v0_in_range[i]); - __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], v1_in_range[i]); - __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], v2_in_range[i]); - __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], v3_in_range[i]); - - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bicubic_align1_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - } - - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); - - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); - - __m256 coefficients[4]; - - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(gx0, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx0, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(gx1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx1, _CMP_GT_OS)); - __m256 x2_in_range = _mm256_and_ps(_mm256_cmp_ps(gx2, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx2, _CMP_GT_OS)); - __m256 x3_in_range = _mm256_and_ps(_mm256_cmp_ps(gx3, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx3, _CMP_GT_OS)); - - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - __m256 v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; - for (int i = 0; 
i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - - __m256 y_in_range = _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS)); - - v0_in_range[i] = _mm256_and_ps(x0_in_range, y_in_range); - v1_in_range[i] = _mm256_and_ps(x1_in_range, y_in_range); - v2_in_range[i] = _mm256_and_ps(x2_in_range, y_in_range); - v3_in_range[i] = _mm256_and_ps(x3_in_range, y_in_range); - - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < dst.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], v0_in_range[i]); - __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], v1_in_range[i]); - __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], v2_in_range[i]); - __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], v3_in_range[i]); - - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bicubic_align0_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - - const __m256 two = _mm256_set1_ps(2.f); - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); - - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); - - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); - - __m256 coefficients[4]; - - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - - gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, 
_mm256_setzero_ps())); - gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); - gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); - gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); - - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < dst.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], *(__m256*)_ps256_n1); - __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], *(__m256*)_ps256_n1); - __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], *(__m256*)_ps256_n1); - __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], *(__m256*)_ps256_n1); - - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bicubic_align1_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - - const __m256 two = _mm256_set1_ps(2.f); - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); - - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); - - __m256 coefficients[4]; - - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx_floor, 
_mm256_set1_ps(2.0f)); - - gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); - gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); - gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); - gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); - - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < dst.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], *(__m256*)_ps256_n1); - __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], *(__m256*)_ps256_n1); - __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], *(__m256*)_ps256_n1); - __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], *(__m256*)_ps256_n1); - - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bicubic_align0_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - - const __m256 two = _mm256_set1_ps(2.f); - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); - - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); - - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); - - __m256 coefficients[4]; - - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, 
*(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - const __m256 v0p5fp8 = _mm256_set1_ps(0.5f); - { - // x0 - gx0 = _mm256_add_ps(gx0, v0p5fp8); - - gx0 = _mm256_and_ps(gx0, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx0 = _mm256_sub_ps(vImgWf, reflectx0_v); - - gx0 = _mm256_sub_ps(gx0, v0p5fp8); - - _mm256_sub_ps(gx0, v0p5fp8); - - gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); - - // x1 - gx1 = _mm256_add_ps(gx1, v0p5fp8); - - gx1 = _mm256_and_ps(gx1, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx1_v = _mm256_and_ps(_mm256_sub_ps(gx1, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx1 = _mm256_sub_ps(vImgWf, reflectx1_v); - - gx1 = _mm256_sub_ps(gx1, v0p5fp8); - - _mm256_sub_ps(gx1, v0p5fp8); - - gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); - - // x2 - gx2 = _mm256_add_ps(gx2, v0p5fp8); - - gx2 = _mm256_and_ps(gx2, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx2_v = _mm256_and_ps(_mm256_sub_ps(gx2, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx2 = _mm256_sub_ps(vImgWf, reflectx2_v); - - gx2 = _mm256_sub_ps(gx2, v0p5fp8); - - _mm256_sub_ps(gx2, v0p5fp8); - - gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); - - // x3 - gx3 = _mm256_add_ps(gx3, v0p5fp8); - - gx3 = _mm256_and_ps(gx3, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx3_v = _mm256_and_ps(_mm256_sub_ps(gx3, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx3 = _mm256_sub_ps(vImgWf, reflectx3_v); - - gx3 = _mm256_sub_ps(gx3, v0p5fp8); - - _mm256_sub_ps(gx3, v0p5fp8); - - gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); - } - - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - - { - //y - gy = _mm256_add_ps(gy, v0p5fp8); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(vImgHf, reflecty_v); - - gy = _mm256_sub_ps(gy, v0p5fp8); - - _mm256_sub_ps(gy, v0p5fp8); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } - - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < dst.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], *(__m256*)_ps256_n1); - __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], *(__m256*)_ps256_n1); - __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], *(__m256*)_ps256_n1); - 
__m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], *(__m256*)_ps256_n1); - - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bicubic_align1_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - - const __m256 two = _mm256_set1_ps(2.f); - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); - - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); - - __m256 coefficients[4]; - - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); - { - // x0 - gx0 = _mm256_and_ps(gx0, *(__m256*)_ps256_inv_sign_mask); - __m256 reflectx0_v = _mm256_and_ps(_mm256_sub_ps(gx0, border_x), *(__m256*)_ps256_inv_sign_mask); - gx0 = _mm256_sub_ps(border_x, reflectx0_v); - - // x1 - gx1 = _mm256_and_ps(gx1, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx1_v = _mm256_and_ps(_mm256_sub_ps(gx1, border_x), *(__m256*)_ps256_inv_sign_mask); - gx1 = _mm256_sub_ps(border_x, reflectx1_v); - - // x2 - gx2 = _mm256_and_ps(gx2, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx2_v = _mm256_and_ps(_mm256_sub_ps(gx2, border_x), *(__m256*)_ps256_inv_sign_mask); - gx2 = _mm256_sub_ps(border_x, reflectx2_v); - - // x3 - gx3 = _mm256_and_ps(gx3, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx3_v = _mm256_and_ps(_mm256_sub_ps(gx3, border_x), *(__m256*)_ps256_inv_sign_mask); - gx3 = _mm256_sub_ps(border_x, reflectx3_v); - } - - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - - { - //y - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(border_y, reflecty_v); - } - - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2), 
vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } - - for (int q = 0; q < dst.c; q++) - { - for (int i = 0; i < 4; i++) - { - __m256 x0_val = mask_gather_ps256(src.channel(q), v0_offset[i], *(__m256*)_ps256_n1); - __m256 x1_val = mask_gather_ps256(src.channel(q), v1_offset[i], *(__m256*)_ps256_n1); - __m256 x2_val = mask_gather_ps256(src.channel(q), v2_offset[i], *(__m256*)_ps256_n1); - __m256 x3_val = mask_gather_ps256(src.channel(q), v3_offset[i], *(__m256*)_ps256_n1); - - coefficients[i] = cubic_interp1d_p8(x0_val, x1_val, x2_val, x3_val, tx); - } - - __m256 _v = cubic_interp1d_p8(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); - - _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} \ No newline at end of file diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h new file mode 100644 index 00000000000..5c6089cfc69 --- /dev/null +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -0,0 +1,613 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +template +struct gridsample_2d_bilinear_compute_blob +{ + void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) + { + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#if __AVX2__ + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); +#endif // __AVX2__ +#endif // __AVX__ + + int* offset_ptr = offset.channel(0); + + float* in_bound_ptr_00 = in_bound.channel(0); + float* in_bound_ptr_01 = in_bound.channel(1); + float* in_bound_ptr_10 = in_bound.channel(2); + float* in_bound_ptr_11 = in_bound.channel(3); + + float* value_ptr_alpha = value.channel(0); + float* value_ptr_beta = value.channel(1); + + grid_sample_unormalize unormalize; + compute_coord get_coord; + + if (permute_fusion == 0) + { + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + // x + gx = unormalize(vImgWf, gx); + gx = get_coord(vImgWf, gx); + + // y + gy = unormalize(vImgHf, gy); + gy = get_coord(vImgHf, gy); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + +#if __AVX2__ + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); + + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + + _mm256_storeu_epi32(in_bound_ptr_00, *(__m256i*)_pi32_256_1); + _mm256_storeu_epi32(in_bound_ptr_01, x1_in_range); + _mm256_storeu_epi32(in_bound_ptr_10, y1_in_range); + _mm256_storeu_epi32(in_bound_ptr_11, v11_in_range); +#else + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); + __m256 ne_offset = _mm256_add_ps(nw_offset, *(__m256*)_ps256_1); + __m256 sw_offset = _mm256_add_ps(nw_offset, vImgWf); + __m256 se_offset = _mm256_add_ps(sw_offset, *(__m256*)_ps256_1); + + __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); + + _mm256_storeu_ps(in_bound_ptr_00, *(__m256*)_ps256_n1); + _mm256_storeu_ps(in_bound_ptr_01, x1_in_range); + _mm256_storeu_ps(in_bound_ptr_10, y1_in_range); + _mm256_storeu_ps(in_bound_ptr_11, v11_in_range); +#endif + + _mm256_storeu_epi32(offset_ptr, i_nw_offset); + + 
__m256 alpha = _mm256_sub_ps(gx, x_w); + __m256 beta = _mm256_sub_ps(gy, y_n); + + _mm256_storeu_ps(value_ptr_alpha, alpha); + _mm256_storeu_ps(value_ptr_beta, beta); + + gridptr += 16; + + offset_ptr += 8; + + in_bound_ptr_00 += 8; + in_bound_ptr_01 += 8; + in_bound_ptr_10 += 8; + in_bound_ptr_11 += 8; + + value_ptr_alpha += 8; + value_ptr_beta += 8; + } + + nn = grid_size & 15; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = *gridptr; + float sample_y = *(gridptr + 1); + + // x + sample_x = unormalize(src.w, sample_x); + sample_x = get_coord(src.w, sample_x); + + // y + sample_y = unormalize(src.h, sample_y); + sample_y = get_coord(src.h, sample_y); + + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int x1 = x0 + 1; + int y1 = y0 + 1; + + *in_bound_ptr_00 = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h); + *in_bound_ptr_01 = (x1 > -1) & (x1 < src.w) & (y0 > -1) & (y0 < src.h); + *in_bound_ptr_10 = (x0 > -1) & (x0 < src.w) & (y1 > -1) & (y1 < src.h); + *in_bound_ptr_11 = (x1 > -1) & (x1 < src.w) & (y1 > -1) & (y1 < src.h); + + *offset_ptr = x0 + y0 * src.w; + + *value_ptr_alpha = sample_x - x0; + *value_ptr_beta = sample_y - y0; + + gridptr += 2; + + offset_ptr++; + + in_bound_ptr_00++; + in_bound_ptr_01++; + in_bound_ptr_10++; + in_bound_ptr_11++; + + value_ptr_alpha++; + value_ptr_beta++; + } + } + } + else + { + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); + + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 7 < nn; x += 8) + { + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); + + // compute coord + { + // x + gx = unormalize(vImgWf, gx); + gx = get_coord(vImgWf, gx); + + // y + gy = unormalize(vImgHf, gy); + gy = get_coord(vImgHf, gy); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + +#if __AVX2__ + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); + + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + + _mm256_storeu_epi32(in_bound_ptr_00, *(__m256i*)_pi32_256_1); + _mm256_storeu_epi32(in_bound_ptr_01, x1_in_range); + _mm256_storeu_epi32(in_bound_ptr_10, y1_in_range); + _mm256_storeu_epi32(in_bound_ptr_11, v11_in_range); +#else + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); + + __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); + + _mm256_storeu_ps(in_bound_ptr_00, *(__m256*)_ps256_n1); + _mm256_storeu_ps(in_bound_ptr_01, x1_in_range); + _mm256_storeu_ps(in_bound_ptr_10, y1_in_range); + _mm256_storeu_ps(in_bound_ptr_11, v11_in_range); +#endif + + 
_mm256_storeu_epi32(offset_ptr, i_nw_offset); + + __m256 alpha = _mm256_sub_ps(gx, x_w); + __m256 beta = _mm256_sub_ps(gy, y_n); + + _mm256_storeu_ps(value_ptr_alpha, alpha); + _mm256_storeu_ps(value_ptr_beta, beta); + + gridptr_x += 8; + gridptr_y += 8; + + offset_ptr += 8; + + in_bound_ptr_00 += 8; + in_bound_ptr_01 += 8; + in_bound_ptr_10 += 8; + in_bound_ptr_11 += 8; + + value_ptr_alpha += 8; + value_ptr_beta += 8; + } + + nn = grid_size & 7; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; + + // x + sample_x = unormalize(src.w, sample_x); + sample_x = get_coord(src.w, sample_x); + + // y + sample_y = unormalize(src.h, sample_y); + sample_y = get_coord(src.h, sample_y); + + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int x1 = x0 + 1; + int y1 = y0 + 1; + + *in_bound_ptr_00 = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h); + *in_bound_ptr_01 = (x1 > -1) & (x1 < src.w) & (y0 > -1) & (y0 < src.h); + *in_bound_ptr_10 = (x0 > -1) & (x0 < src.w) & (y1 > -1) & (y1 < src.h); + *in_bound_ptr_11 = (x1 > -1) & (x1 < src.w) & (y1 > -1) & (y1 < src.h); + + *offset_ptr = x0 + y0 * src.w; + + *value_ptr_alpha = sample_x - x0; + *value_ptr_beta = sample_y - y0; + + gridptr_x++; + gridptr_y++; + + offset_ptr++; + + in_bound_ptr_00++; + in_bound_ptr_01++; + in_bound_ptr_10++; + in_bound_ptr_11++; + + value_ptr_alpha++; + value_ptr_beta++; + } + } + } +}; + +template +struct gridsample_2d_bilinear_compute_blob +{ + void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) + { + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#if __AVX2__ + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); +#endif // __AVX2__ +#endif // __AVX__ + + int* offset_ptr = offset.channel(0); + + float* in_bound_ptr_00 = in_bound.channel(0); + float* in_bound_ptr_01 = in_bound.channel(1); + float* in_bound_ptr_10 = in_bound.channel(2); + float* in_bound_ptr_11 = in_bound.channel(3); + + float* value_ptr_alpha = value.channel(0); + float* value_ptr_beta = value.channel(1); + + grid_sample_unormalize unormalize; + + if (permute_fusion == 0) + { + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + // x + gx = unormalize(vImgWf, gx); + // y + gy = unormalize(vImgHf, gy); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + +#if __AVX2__ + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); + + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i 
y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + + __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + + _mm256_storeu_ps(in_bound_ptr_00, _mm256_castsi256_ps(v00_in_range)); + _mm256_storeu_ps(in_bound_ptr_01, _mm256_castsi256_ps(v01_in_range)); + _mm256_storeu_ps(in_bound_ptr_10, _mm256_castsi256_ps(v10_in_range)); + _mm256_storeu_ps(in_bound_ptr_11, _mm256_castsi256_ps(v11_in_range)); +#else + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + + __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); + __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); + + __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); + + _mm256_storeu_ps(in_bound_ptr_00, v00_in_range); + _mm256_storeu_ps(in_bound_ptr_01, v01_in_range); + _mm256_storeu_ps(in_bound_ptr_10, v10_in_range); + _mm256_storeu_ps(in_bound_ptr_11, v11_in_range); +#endif // __AVX2__ + + _mm256_storeu_epi32(offset_ptr, i_nw_offset); + + __m256 alpha = _mm256_sub_ps(gx, x_w); + __m256 beta = _mm256_sub_ps(gy, y_n); + + _mm256_storeu_ps(value_ptr_alpha, alpha); + _mm256_storeu_ps(value_ptr_beta, beta); + + gridptr += 16; + + offset_ptr += 8; + + in_bound_ptr_00 += 8; + in_bound_ptr_01 += 8; + in_bound_ptr_10 += 8; + in_bound_ptr_11 += 8; + + value_ptr_alpha += 8; + value_ptr_beta += 8; + } + + nn = grid_size & 15; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = *gridptr; + float sample_y = *(gridptr + 1); + + // x + sample_x = unormalize(src.w, sample_x); + // y + sample_y = unormalize(src.h, sample_y); + + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int x1 = x0 + 1; + int y1 = y0 + 1; + + *in_bound_ptr_00 = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h); + *in_bound_ptr_01 = (x1 > -1) & (x1 < src.w) & (y0 > -1) & (y0 < src.h); + *in_bound_ptr_10 = (x0 > -1) & (x0 < src.w) & (y1 > -1) & (y1 < src.h); + *in_bound_ptr_11 = (x1 > -1) & (x1 < src.w) & (y1 > -1) & (y1 < src.h); + + *offset_ptr = x0 + y0 * src.w; + + *value_ptr_alpha = sample_x - x0; + *value_ptr_beta = sample_y - y0; + + gridptr += 2; + + offset_ptr++; + + in_bound_ptr_00++; + in_bound_ptr_01++; + in_bound_ptr_10++; + in_bound_ptr_11++; + + value_ptr_alpha++; + value_ptr_beta++; + } + } + } + else + { + const float* gridptr_x = 
grid.channel(0); + const float* gridptr_y = grid.channel(1); + + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 7 < nn; x += 8) + { + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); + + // compute coord + { + // x + gx = unormalize(vImgWf, gx); + // y + gy = unormalize(vImgHf, gy); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + +#if __AVX2__ + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); + + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + + __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + + _mm256_storeu_ps(in_bound_ptr_00, _mm256_castsi256_ps(v00_in_range)); + _mm256_storeu_ps(in_bound_ptr_01, _mm256_castsi256_ps(v01_in_range)); + _mm256_storeu_ps(in_bound_ptr_10, _mm256_castsi256_ps(v10_in_range)); + _mm256_storeu_ps(in_bound_ptr_11, _mm256_castsi256_ps(v11_in_range)); +#else + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + + __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); + __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); + + __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); + + _mm256_storeu_ps(in_bound_ptr_00, v00_in_range); + _mm256_storeu_ps(in_bound_ptr_01, v01_in_range); + _mm256_storeu_ps(in_bound_ptr_10, v10_in_range); + _mm256_storeu_ps(in_bound_ptr_11, v11_in_range); +#endif // __AVX2__ + + _mm256_storeu_epi32(offset_ptr, i_nw_offset); + + __m256 alpha = _mm256_sub_ps(gx, x_w); + __m256 beta = _mm256_sub_ps(gy, y_n); + + _mm256_storeu_ps(value_ptr_alpha, alpha); + _mm256_storeu_ps(value_ptr_beta, beta); + + gridptr_x += 8; + gridptr_y += 8; + + offset_ptr += 8; + + in_bound_ptr_00 += 8; + in_bound_ptr_01 += 8; + in_bound_ptr_10 += 8; + in_bound_ptr_11 += 8; + + value_ptr_alpha += 8; + value_ptr_beta += 8; + } + + nn = grid_size & 7; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x++) + { + float 
sample_x = *gridptr_x; + float sample_y = *gridptr_y; + + // x + sample_x = unormalize(src.w, sample_x); + // y + sample_y = unormalize(src.h, sample_y); + + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int x1 = x0 + 1; + int y1 = y0 + 1; + + *in_bound_ptr_00 = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h); + *in_bound_ptr_01 = (x1 > -1) & (x1 < src.w) & (y0 > -1) & (y0 < src.h); + *in_bound_ptr_10 = (x0 > -1) & (x0 < src.w) & (y1 > -1) & (y1 < src.h); + *in_bound_ptr_11 = (x1 > -1) & (x1 < src.w) & (y1 > -1) & (y1 < src.h); + + *offset_ptr = x0 + y0 * src.w; + + *value_ptr_alpha = sample_x - x0; + *value_ptr_beta = sample_y - y0; + + gridptr_x++; + gridptr_y++; + + offset_ptr++; + + in_bound_ptr_00++; + in_bound_ptr_01++; + in_bound_ptr_10++; + in_bound_ptr_11++; + + value_ptr_alpha++; + value_ptr_beta++; + } + } + } +}; \ No newline at end of file diff --git a/src/layer/x86/gridsample_bilinear_pack1.h b/src/layer/x86/gridsample_bilinear_pack1.h deleted file mode 100644 index 3b88c20bfa7..00000000000 --- a/src/layer/x86/gridsample_bilinear_pack1.h +++ /dev/null @@ -1,2513 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
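// For reference while reading this removal: each *_pack1 function below hard-codes
// one (align_corners, padding_mode) combination of the same bilinear sampling math
// that the templated compute_blob structs above now precompute per sample point
// (unnormalized coordinates, the four in-bound masks, and the alpha/beta fractions).
// A minimal scalar sketch of that math follows, assuming align_corners=0 and zeros
// padding; grid_sample_bilinear_ref and the at() lambda are illustrative names for
// this example only, not ncnn API.

#include <cmath>

static float grid_sample_bilinear_ref(const float* image, int w, int h, float gx, float gy)
{
    // unnormalize: map gx,gy from [-1, 1] to pixel coordinates (align_corners == 0)
    float sample_x = ((gx + 1.f) * w - 1.f) / 2.f;
    float sample_y = ((gy + 1.f) * h - 1.f) / 2.f;

    // the four neighbouring taps and the interpolation fractions
    int x0 = (int)std::floor(sample_x);
    int y0 = (int)std::floor(sample_y);
    int x1 = x0 + 1;
    int y1 = y0 + 1;
    float alpha = sample_x - x0;
    float beta = sample_y - y0;

    // zeros padding: an out-of-range tap contributes 0
    auto at = [&](int x, int y) -> float {
        return (x > -1 && x < w && y > -1 && y < h) ? image[y * w + x] : 0.f;
    };

    // two horizontal lerps, then one vertical lerp
    float v0 = at(x0, y0) * (1.f - alpha) + at(x1, y0) * alpha;
    float v1 = at(x0, y1) * (1.f - alpha) + at(x1, y1) * alpha;
    return v0 * (1.f - beta) + v1 * beta;
}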
- -static void gridsample_2d_bilinear_align0_zeros_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); -#if __AVX2__ - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); -#endif // __AVX2__ -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - - __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); - __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, *(__m256i*)_pi32_256_1); - __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); - __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, *(__m256i*)_pi32_256_1); -#else - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); - __m256 y1_in_range = 
_mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - - __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); - __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); - __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); - __m256 ne_offset = _mm256_add_ps(nw_offset, *(__m256*)_ps256_1); - __m256 sw_offset = _mm256_add_ps(nw_offset, vImgWf); - __m256 se_offset = _mm256_add_ps(sw_offset, *(__m256*)_ps256_1); - - __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); - __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); - __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); -#endif // __AVX2__ - - for (int q = 0; q < src.c; q++) - { -#if __AVX2__ - __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, _mm256_castsi256_ps(v00_in_range)); - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, _mm256_castsi256_ps(v10_in_range)); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, _mm256_castsi256_ps(v01_in_range)); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); -#else - __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, v00_in_range); - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, v10_in_range); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, v01_in_range); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); -#endif // __AVX2__ - - __m256 _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = ((sample_x + 1) * src.w - 1) / 2.f; - sample_y = ((sample_y + 1) * src.h - 1) / 2.f; - - // bilinear interpolate - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); - int x1 = x0 + 1; - int y1 = y0 + 1; - - bool v00_in_range = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h); - bool v01_in_range = (x1 > -1) & (x1 < src.w) & (y0 > -1) & (y0 < src.h); - bool v10_in_range = (x0 > -1) & (x0 < src.w) & (y1 > -1) & (y1 < src.h); - bool v11_in_range = (x1 > -1) & (x1 < src.w) & (y1 > -1) & (y1 < src.h); - - float alpha = sample_x - x0; - float beta = sample_y - y0; - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - float v00 = v00_in_range ? image.row(y0)[x0] : 0; - float v01 = v01_in_range ? image.row(y0)[x1] : 0; - float v10 = v10_in_range ? image.row(y1)[x0] : 0; - float v11 = v11_in_range ? 
image.row(y1)[x1] : 0; - - float v0 = v00 * (1 - alpha) + v01 * alpha; - float v1 = v10 * (1 - alpha) + v11 * alpha; - - dst.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; - } - } - } -} - -static void gridsample_2d_bilinear_align1_zeros_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); -#if __AVX2__ - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); -#endif // __AVX2__ -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - - __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); - __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, *(__m256i*)_pi32_256_1); - __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); - __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, *(__m256i*)_pi32_256_1); -#else - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), 
_mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - - __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); - __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); - __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); - __m256 ne_offset = _mm256_add_ps(nw_offset, *(__m256*)_ps256_1); - __m256 sw_offset = _mm256_add_ps(nw_offset, vImgWf); - __m256 se_offset = _mm256_add_ps(sw_offset, *(__m256*)_ps256_1); - - __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); - __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); - __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); -#endif // __AVX2__ - - for (int q = 0; q < src.c; q++) - { -#if __AVX2__ - __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, _mm256_castsi256_ps(v00_in_range)); - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, _mm256_castsi256_ps(v10_in_range)); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, _mm256_castsi256_ps(v01_in_range)); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); -#else - __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, v00_in_range); - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, v10_in_range); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, v01_in_range); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); -#endif // __AVX2__ - - __m256 _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = (sample_x + 1) / 2.f * (src.w - 1); - sample_y = (sample_y + 1) / 2.f * (src.h - 1); - - // bilinear interpolate - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); - int x1 = x0 + 1; - int y1 = y0 + 1; - - bool v00_in_range = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h); - bool v01_in_range = (x1 > -1) & (x1 < src.w) & (y0 > -1) & (y0 < src.h); - bool v10_in_range = (x0 > -1) & (x0 < src.w) & (y1 > -1) & (y1 < src.h); - bool v11_in_range = (x1 > -1) & (x1 < src.w) & (y1 > -1) & (y1 < src.h); - - float alpha = sample_x - x0; - float beta = sample_y - y0; - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - float v00 = v00_in_range ? image.row(y0)[x0] : 0; - float v01 = v01_in_range ? image.row(y0)[x1] : 0; - float v10 = v10_in_range ? image.row(y1)[x0] : 0; - float v11 = v11_in_range ? 
image.row(y1)[x1] : 0; - - float v0 = v00 * (1 - alpha) + v01 * alpha; - float v1 = v10 * (1 - alpha) + v11 * alpha; - - dst.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; - } - } - } -} - -static void gridsample_2d_bilinear_align0_border_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); -#if __AVX2__ - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); -#endif // __AVX2__ -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); - __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, *(__m256i*)_pi32_256_1); - __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); - __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, *(__m256i*)_pi32_256_1); -#else - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); - __m256 
ne_offset = _mm256_add_ps(nw_offset, *(__m256*)_ps256_1); - __m256 sw_offset = _mm256_add_ps(nw_offset, vImgWf); - __m256 se_offset = _mm256_add_ps(sw_offset, *(__m256*)_ps256_1); - - __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); - __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); - __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); -#endif - - for (int q = 0; q < src.c; q++) - { - __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *(__m256*)_ps256_n1); -#if __AVX2__ - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, _mm256_castsi256_ps(x1_in_range)); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, _mm256_castsi256_ps(y1_in_range)); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); -#else - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, x1_in_range); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, y1_in_range); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); -#endif - - __m256 _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = ((sample_x + 1) * src.w - 1) / 2.f; - sample_y = ((sample_y + 1) * src.h - 1) / 2.f; - - sample_x = std::min(src.w - 1.0f, std::max(sample_x, 0.0f)); - sample_y = std::min(src.h - 1.0f, std::max(sample_y, 0.0f)); - - // bilinear interpolate - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); - int x1 = x0 + 1; - int y1 = y0 + 1; - - bool x1_in_range = (x1 > -1) & (x1 < src.w); - bool y1_in_range = (y1 > -1) & (y1 < src.h); - bool v11_in_range = x1_in_range & y1_in_range; - - float alpha = sample_x - x0; - float beta = sample_y - y0; - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - float v00 = image.row(y0)[x0]; - float v01 = x1_in_range ? image.row(y0)[x1] : 0; - float v10 = y1_in_range ? image.row(y1)[x0] : 0; - float v11 = v11_in_range ? 
image.row(y1)[x1] : 0; - - float v0 = v00 * (1 - alpha) + v01 * alpha; - float v1 = v10 * (1 - alpha) + v11 * alpha; - - dst.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; - } - } - } -} - -static void gridsample_2d_bilinear_align1_border_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); -#if __AVX2__ - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); -#endif // __AVX2__ -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); - __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, *(__m256i*)_pi32_256_1); - __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); - __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, *(__m256i*)_pi32_256_1); -#else - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, 
vImgWf), x_w); - __m256 ne_offset = _mm256_add_ps(nw_offset, *(__m256*)_ps256_1); - __m256 sw_offset = _mm256_add_ps(nw_offset, vImgWf); - __m256 se_offset = _mm256_add_ps(sw_offset, *(__m256*)_ps256_1); - - __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); - __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); - __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); -#endif - - for (int q = 0; q < src.c; q++) - { - __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *(__m256*)_ps256_n1); -#if __AVX2__ - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, _mm256_castsi256_ps(x1_in_range)); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, _mm256_castsi256_ps(y1_in_range)); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); -#else - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, x1_in_range); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, y1_in_range); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); -#endif - - __m256 _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = (sample_x + 1) / 2.f * (src.w - 1); - sample_y = (sample_y + 1) / 2.f * (src.h - 1); - - sample_x = std::min(src.w - 1.0f, std::max(sample_x, 0.0f)); - sample_y = std::min(src.h - 1.0f, std::max(sample_y, 0.0f)); - - // bilinear interpolate - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); - int x1 = x0 + 1; - int y1 = y0 + 1; - - bool x1_in_range = (x1 > -1) & (x1 < src.w); - bool y1_in_range = (y1 > -1) & (y1 < src.h); - bool v11_in_range = x1_in_range & y1_in_range; - - float alpha = sample_x - x0; - float beta = sample_y - y0; - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - float v00 = image.row(y0)[x0]; - float v01 = x1_in_range ? image.row(y0)[x1] : 0; - float v10 = y1_in_range ? image.row(y1)[x0] : 0; - float v11 = v11_in_range ? 
image.row(y1)[x1] : 0; - - float v0 = v00 * (1 - alpha) + v01 * alpha; - float v1 = v10 * (1 - alpha) + v11 * alpha; - - dst.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; - } - } - } -} - -static void gridsample_2d_bilinear_align0_reflection_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); -#if __AVX2__ - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); -#endif // __AVX2__ -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - __m256 v0p5fp8 = _mm256_set1_ps(0.5f); - gx = _mm256_add_ps(gx, v0p5fp8); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(vImgWf, reflectx_v); - - gx = _mm256_sub_ps(gx, v0p5fp8); - - _mm256_sub_ps(gx, v0p5fp8); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_add_ps(gy, v0p5fp8); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(vImgHf, reflecty_v); - - gy = _mm256_sub_ps(gy, v0p5fp8); - - _mm256_sub_ps(gy, v0p5fp8); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); - __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, *(__m256i*)_pi32_256_1); - 
__m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); - __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, *(__m256i*)_pi32_256_1); -#else - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); - __m256 ne_offset = _mm256_add_ps(nw_offset, *(__m256*)_ps256_1); - __m256 sw_offset = _mm256_add_ps(nw_offset, vImgWf); - __m256 se_offset = _mm256_add_ps(sw_offset, *(__m256*)_ps256_1); - - __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); - __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); - __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); -#endif - - for (int q = 0; q < src.c; q++) - { - __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *(__m256*)_ps256_n1); -#if __AVX2__ - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, _mm256_castsi256_ps(x1_in_range)); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, _mm256_castsi256_ps(y1_in_range)); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); -#else - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, x1_in_range); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, y1_in_range); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); -#endif - - __m256 _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = ((sample_x + 1) * src.w - 1) / 2.f; - sample_y = ((sample_y + 1) * src.h - 1) / 2.f; - - sample_x = abs(sample_x + 0.5f); - sample_x = src.w - abs(sample_x - src.w) - 0.5; - - sample_y = abs(sample_y + 0.5f); - sample_y = src.h - abs(sample_y - src.h) - 0.5; - - sample_x = std::min(src.w - 1.0f, std::max(sample_x, 0.0f)); - sample_y = std::min(src.h - 1.0f, std::max(sample_y, 0.0f)); - - // bilinear interpolate - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); - int x1 = x0 + 1; - int y1 = y0 + 1; - - bool x1_in_range = (x1 > -1) & (x1 < src.w); - bool y1_in_range = (y1 > -1) & (y1 < src.h); - bool v11_in_range = x1_in_range & y1_in_range; - - float alpha = sample_x - x0; - float beta = sample_y - y0; - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - float v00 = image.row(y0)[x0]; - float v01 = x1_in_range ? image.row(y0)[x1] : 0; - float v10 = y1_in_range ? image.row(y1)[x0] : 0; - float v11 = v11_in_range ? 
image.row(y1)[x1] : 0; - - float v0 = v00 * (1 - alpha) + v01 * alpha; - float v1 = v10 * (1 - alpha) + v11 * alpha; - - dst.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; - } - } - } -} - -static void gridsample_2d_bilinear_align1_reflection_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); -#if __AVX2__ - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); -#endif // __AVX2__ -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(border_x, reflectx_v); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(border_y, reflecty_v); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); - __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, *(__m256i*)_pi32_256_1); - __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, vImgWi); - __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, *(__m256i*)_pi32_256_1); -#else - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, 
_CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); - __m256 ne_offset = _mm256_add_ps(nw_offset, *(__m256*)_ps256_1); - __m256 sw_offset = _mm256_add_ps(nw_offset, vImgWf); - __m256 se_offset = _mm256_add_ps(sw_offset, *(__m256*)_ps256_1); - - __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); - __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); - __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); -#endif - - for (int q = 0; q < src.c; q++) - { - __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *(__m256*)_ps256_n1); -#if __AVX2__ - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, _mm256_castsi256_ps(x1_in_range)); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, _mm256_castsi256_ps(y1_in_range)); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); -#else - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, x1_in_range); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, y1_in_range); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); -#endif - - __m256 _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = (sample_x + 1) / 2.f * (src.w - 1); - sample_y = (sample_y + 1) / 2.f * (src.h - 1); - - sample_x = abs(sample_x); - sample_x = (src.w - 1) - abs(sample_x - (src.w - 1)); - - sample_y = abs(sample_y); - sample_y = (src.h - 1) - abs(sample_y - (src.h - 1)); - - sample_x = std::min(src.w - 1.0f, std::max(sample_x, 0.0f)); - sample_y = std::min(src.h - 1.0f, std::max(sample_y, 0.0f)); - - // bilinear interpolate - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); - int x1 = x0 + 1; - int y1 = y0 + 1; - - bool x1_in_range = (x1 > -1) & (x1 < src.w); - bool y1_in_range = (y1 > -1) & (y1 < src.h); - bool v11_in_range = x1_in_range & y1_in_range; - - float alpha = sample_x - x0; - float beta = sample_y - y0; - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - float v00 = image.row(y0)[x0]; - float v01 = x1_in_range ? image.row(y0)[x1] : 0; - float v10 = y1_in_range ? image.row(y1)[x0] : 0; - float v11 = v11_in_range ? 
image.row(y1)[x1] : 0; - - float v0 = v00 * (1 - alpha) + v01 * alpha; - float v1 = v10 * (1 - alpha) + v11 * alpha; - - dst.channel(q).row(y)[x / 2] = v0 * (1 - beta) + v1 * beta; - } - } - } -} - -static void gridsample_3d_bilinear_align0_zeros_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h * grid.d; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); -#if __AVX2__ - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vImgDi = _mm256_set1_epi32(src.d); -#endif // __AVX2__ -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - //upzip (3) - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - // z - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - __m256 z_t = _mm256_floor_ps(gz); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); - - __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - - tnw = _mm256_mul_ps(b, nw); - tne = _mm256_mul_ps(b, ne); - tsw = _mm256_mul_ps(b, sw); - tse = _mm256_mul_ps(b, se); - - bnw = _mm256_mul_ps(t, nw); - bne = _mm256_mul_ps(t, ne); - bsw = _mm256_mul_ps(t, sw); - bse = _mm256_mul_ps(t, se); - } - -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i z0 = _mm256_cvtps_epi32(z_t); - - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); - - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); - __m256i y1_in_range = 
_mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - __m256i z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z0)); - __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); - - __m256i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - v000_in_range = _mm256_and_si256(v00_in_range, z0_in_range); - v010_in_range = _mm256_and_si256(v01_in_range, z0_in_range); - v100_in_range = _mm256_and_si256(v10_in_range, z0_in_range); - v110_in_range = _mm256_and_si256(v11_in_range, z0_in_range); - - v001_in_range = _mm256_and_si256(v00_in_range, z1_in_range); - v011_in_range = _mm256_and_si256(v01_in_range, z1_in_range); - v101_in_range = _mm256_and_si256(v10_in_range, z1_in_range); - v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); - } - - __m256i i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)); - __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, *(__m256i*)_pi32_256_1); - __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, vImgWi); - __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, *(__m256i*)_pi32_256_1); - - __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), i_tnw_offset); - __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, *(__m256i*)_pi32_256_1); - __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, vImgWi); - __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, *(__m256i*)_pi32_256_1); -#else - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); - - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - __m256 z0_in_range = _mm256_and_ps(_mm256_cmp_ps(z_t, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z_t, _CMP_GT_OS)); - __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); - - __m256 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); - __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); - __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v000_in_range = _mm256_and_ps(v00_in_range, z0_in_range); - v010_in_range = _mm256_and_ps(v01_in_range, z0_in_range); - v100_in_range = _mm256_and_ps(v10_in_range, z0_in_range); - v110_in_range = _mm256_and_ps(v11_in_range, 
z0_in_range); - - v001_in_range = _mm256_and_ps(v00_in_range, z1_in_range); - v011_in_range = _mm256_and_ps(v01_in_range, z1_in_range); - v101_in_range = _mm256_and_ps(v10_in_range, z1_in_range); - v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); - } - - __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, *(__m256*)_ps256_1); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, vImgWf); - __m256 tse_offset = _mm256_add_ps(tsw_offset, *(__m256*)_ps256_1); - - __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(vImgWf, vImgHf), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, *(__m256*)_ps256_1); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, vImgWf); - __m256 bse_offset = _mm256_add_ps(bsw_offset, *(__m256*)_ps256_1); - - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); - __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); - __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); - __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); - - __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); - __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); - __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); - __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); -#endif // __AVX2__ - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); -#if __AVX2__ - __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, _mm256_castsi256_ps(v000_in_range)); - __m256 tne_val = mask_gather_ps256(image, i_tne_offset, _mm256_castsi256_ps(v100_in_range)); - __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, _mm256_castsi256_ps(v010_in_range)); - __m256 tse_val = mask_gather_ps256(image, i_tse_offset, _mm256_castsi256_ps(v110_in_range)); - - __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, _mm256_castsi256_ps(v001_in_range)); - __m256 bne_val = mask_gather_ps256(image, i_bne_offset, _mm256_castsi256_ps(v101_in_range)); - __m256 bsw_val = mask_gather_ps256(image, i_bsw_offset, _mm256_castsi256_ps(v011_in_range)); - __m256 bse_val = mask_gather_ps256(image, i_bse_offset, _mm256_castsi256_ps(v111_in_range)); -#else - __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, v000_in_range); - __m256 tne_val = mask_gather_ps256(image, i_tne_offset, v100_in_range); - __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, v010_in_range); - __m256 tse_val = mask_gather_ps256(image, i_tse_offset, v110_in_range); - - __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, v001_in_range); - __m256 bne_val = mask_gather_ps256(image, i_bne_offset, v101_in_range); - __m256 bsw_val = mask_gather_ps256(image, i_bsw_offset, v011_in_range); - __m256 bse_val = mask_gather_ps256(image, i_bse_offset, v111_in_range); -#endif - - __m256 _v = _mm256_mul_ps(tnw_val, tnw); - _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); - - _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); - } - } - - nn = grid_size % 24; - -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = ((gx + 1) * src.w - 1) / 2.f; - gy = ((gy + 1) * src.h - 1) / 2.f; - gz = ((gz + 1) 
* src.d - 1) / 2.f; - - // bilinear interpolate - int x0 = (int)floor(gx); - int y0 = (int)floor(gy); - int z0 = (int)floor(gz); - int x1 = x0 + 1; - int y1 = y0 + 1; - int z1 = z0 + 1; - - bool x0_in_range = (x0 > -1) & (x0 < src.w); - bool y0_in_range = (y0 > -1) & (y0 < src.h); - bool z0_in_range = (z0 > -1) & (z0 < src.d); - bool x1_in_range = (x1 > -1) & (x1 < src.w); - bool y1_in_range = (y1 > -1) & (y1 < src.h); - bool z1_in_range = (z1 > -1) & (z1 < src.d); - - bool v00_in_range = x0_in_range & y0_in_range; - bool v01_in_range = x1_in_range & y0_in_range; - bool v10_in_range = x0_in_range & y1_in_range; - bool v11_in_range = x1_in_range & y1_in_range; - - bool v000_in_range = v00_in_range & z0_in_range; - bool v010_in_range = v10_in_range & z0_in_range; - bool v100_in_range = v00_in_range & z1_in_range; - bool v110_in_range = v10_in_range & z1_in_range; - - bool v001_in_range = v01_in_range & z0_in_range; - bool v011_in_range = v11_in_range & z0_in_range; - bool v101_in_range = v01_in_range & z1_in_range; - bool v111_in_range = v11_in_range & z1_in_range; - - float alpha = gx - x0; - float beta = gy - y0; - float gamma = gz - z0; - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - float v000 = v000_in_range ? image.depth(z0).row(y0)[x0] : 0; - float v010 = v010_in_range ? image.depth(z0).row(y1)[x0] : 0; - float v100 = v100_in_range ? image.depth(z1).row(y0)[x0] : 0; - float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; - - float v001 = v001_in_range ? image.depth(z0).row(y0)[x1] : 0; - float v011 = v011_in_range ? image.depth(z0).row(y1)[x1] : 0; - float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; - float v111 = v111_in_range ? image.depth(z1).row(y1)[x1] : 0; - - float v00 = v000 * (1 - alpha) + v001 * alpha; - float v01 = v010 * (1 - alpha) + v011 * alpha; - float v10 = v100 * (1 - alpha) + v101 * alpha; - float v11 = v110 * (1 - alpha) + v111 * alpha; - - float v0 = v00 * (1 - beta) + v01 * beta; - float v1 = v10 * (1 - beta) + v11 * beta; - - dst.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; - } - } - } -} - -static void gridsample_3d_bilinear_align1_zeros_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h * grid.d; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); -#if __AVX2__ - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vImgDi = _mm256_set1_epi32(src.d); -#endif // __AVX2__ -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, 
*(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - // z - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - __m256 z_t = _mm256_floor_ps(gz); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); - - __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - - tnw = _mm256_mul_ps(b, nw); - tne = _mm256_mul_ps(b, ne); - tsw = _mm256_mul_ps(b, sw); - tse = _mm256_mul_ps(b, se); - - bnw = _mm256_mul_ps(t, nw); - bne = _mm256_mul_ps(t, ne); - bsw = _mm256_mul_ps(t, sw); - bse = _mm256_mul_ps(t, se); - } - -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i z0 = _mm256_cvtps_epi32(z_t); - - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); - - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - __m256i z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z0)); - __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); - - __m256i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - v000_in_range = _mm256_and_si256(v00_in_range, z0_in_range); - v010_in_range = _mm256_and_si256(v01_in_range, z0_in_range); - v100_in_range = _mm256_and_si256(v10_in_range, z0_in_range); - v110_in_range = _mm256_and_si256(v11_in_range, z0_in_range); - - v001_in_range = _mm256_and_si256(v00_in_range, z1_in_range); - v011_in_range = _mm256_and_si256(v01_in_range, z1_in_range); - v101_in_range = _mm256_and_si256(v10_in_range, z1_in_range); - v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); - } - - __m256i i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)); - __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, *(__m256i*)_pi32_256_1); - __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, vImgWi); - __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, 
*(__m256i*)_pi32_256_1); - - __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), i_tnw_offset); - __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, *(__m256i*)_pi32_256_1); - __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, vImgWi); - __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, *(__m256i*)_pi32_256_1); -#else - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); - - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - __m256 z0_in_range = _mm256_and_ps(_mm256_cmp_ps(z_t, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z_t, _CMP_GT_OS)); - __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); - - __m256 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); - __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); - __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v000_in_range = _mm256_and_ps(v00_in_range, z0_in_range); - v010_in_range = _mm256_and_ps(v01_in_range, z0_in_range); - v100_in_range = _mm256_and_ps(v10_in_range, z0_in_range); - v110_in_range = _mm256_and_ps(v11_in_range, z0_in_range); - - v001_in_range = _mm256_and_ps(v00_in_range, z1_in_range); - v011_in_range = _mm256_and_ps(v01_in_range, z1_in_range); - v101_in_range = _mm256_and_ps(v10_in_range, z1_in_range); - v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); - } - - __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, *(__m256*)_ps256_1); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, vImgWf); - __m256 tse_offset = _mm256_add_ps(tsw_offset, *(__m256*)_ps256_1); - - __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(vImgWf, vImgHf), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, *(__m256*)_ps256_1); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, vImgWf); - __m256 bse_offset = _mm256_add_ps(bsw_offset, *(__m256*)_ps256_1); - - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); - __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); - __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); - __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); - - __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); - __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); - __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); - __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); -#endif // __AVX2__ - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); -#if __AVX2__ - __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, _mm256_castsi256_ps(v000_in_range)); - __m256 tne_val = mask_gather_ps256(image, i_tne_offset, _mm256_castsi256_ps(v100_in_range)); 
- __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, _mm256_castsi256_ps(v010_in_range)); - __m256 tse_val = mask_gather_ps256(image, i_tse_offset, _mm256_castsi256_ps(v110_in_range)); - - __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, _mm256_castsi256_ps(v001_in_range)); - __m256 bne_val = mask_gather_ps256(image, i_bne_offset, _mm256_castsi256_ps(v101_in_range)); - __m256 bsw_val = mask_gather_ps256(image, i_bsw_offset, _mm256_castsi256_ps(v011_in_range)); - __m256 bse_val = mask_gather_ps256(image, i_bse_offset, _mm256_castsi256_ps(v111_in_range)); -#else - __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, v000_in_range); - __m256 tne_val = mask_gather_ps256(image, i_tne_offset, v100_in_range); - __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, v010_in_range); - __m256 tse_val = mask_gather_ps256(image, i_tse_offset, v110_in_range); - - __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, v001_in_range); - __m256 bne_val = mask_gather_ps256(image, i_bne_offset, v101_in_range); - __m256 bsw_val = mask_gather_ps256(image, i_bsw_offset, v011_in_range); - __m256 bse_val = mask_gather_ps256(image, i_bse_offset, v111_in_range); -#endif - - __m256 _v = _mm256_mul_ps(tnw_val, tnw); - _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); - - _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); - } - } - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = (gx + 1) / 2.f * (src.w - 1); - gy = (gy + 1) / 2.f * (src.h - 1); - gz = (gz + 1) / 2.f * (src.d - 1); - - // bilinear interpolate - int x0 = (int)floor(gx); - int y0 = (int)floor(gy); - int z0 = (int)floor(gz); - int x1 = x0 + 1; - int y1 = y0 + 1; - int z1 = z0 + 1; - - bool x0_in_range = (x0 > -1) & (x0 < src.w); - bool y0_in_range = (y0 > -1) & (y0 < src.h); - bool z0_in_range = (z0 > -1) & (z0 < src.d); - bool x1_in_range = (x1 > -1) & (x1 < src.w); - bool y1_in_range = (y1 > -1) & (y1 < src.h); - bool z1_in_range = (z1 > -1) & (z1 < src.d); - - bool v00_in_range = x0_in_range & y0_in_range; - bool v01_in_range = x1_in_range & y0_in_range; - bool v10_in_range = x0_in_range & y1_in_range; - bool v11_in_range = x1_in_range & y1_in_range; - - bool v000_in_range = v00_in_range & z0_in_range; - bool v010_in_range = v10_in_range & z0_in_range; - bool v100_in_range = v00_in_range & z1_in_range; - bool v110_in_range = v10_in_range & z1_in_range; - - bool v001_in_range = v01_in_range & z0_in_range; - bool v011_in_range = v11_in_range & z0_in_range; - bool v101_in_range = v01_in_range & z1_in_range; - bool v111_in_range = v11_in_range & z1_in_range; - - float alpha = gx - x0; - float beta = gy - y0; - float gamma = gz - z0; - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - float v000 = v000_in_range ? image.depth(z0).row(y0)[x0] : 0; - float v010 = v010_in_range ? image.depth(z0).row(y1)[x0] : 0; - float v100 = v100_in_range ? image.depth(z1).row(y0)[x0] : 0; - float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; - - float v001 = v001_in_range ? image.depth(z0).row(y0)[x1] : 0; - float v011 = v011_in_range ? 
image.depth(z0).row(y1)[x1] : 0; - float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; - float v111 = v111_in_range ? image.depth(z1).row(y1)[x1] : 0; - - float v00 = v000 * (1 - alpha) + v001 * alpha; - float v01 = v010 * (1 - alpha) + v011 * alpha; - float v10 = v100 * (1 - alpha) + v101 * alpha; - float v11 = v110 * (1 - alpha) + v111 * alpha; - - float v0 = v00 * (1 - beta) + v01 * beta; - float v1 = v10 * (1 - beta) + v11 * beta; - - dst.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; - } - } - } -} - -static void gridsample_3d_bilinear_align0_border_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h * grid.d; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - // z - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - __m256 z_t = _mm256_floor_ps(gz); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); - - __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - - tnw = _mm256_mul_ps(b, nw); - tne = _mm256_mul_ps(b, ne); - tsw = _mm256_mul_ps(b, sw); - tse = _mm256_mul_ps(b, se); - - bnw = _mm256_mul_ps(t, nw); - bne = _mm256_mul_ps(t, ne); - bsw = _mm256_mul_ps(t, sw); - bse = _mm256_mul_ps(t, se); - } - - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); - - 
__m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); - - __m256 v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v110_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v011_in_range = _mm256_and_ps(y1_in_range, z1_in_range); - v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); - v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); - } - - __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, *(__m256*)_ps256_1); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, vImgWf); - __m256 tse_offset = _mm256_add_ps(tsw_offset, *(__m256*)_ps256_1); - - __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(vImgWf, vImgHf), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, *(__m256*)_ps256_1); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, vImgWf); - __m256 bse_offset = _mm256_add_ps(bsw_offset, *(__m256*)_ps256_1); - - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); - __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); - __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); - __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); - - __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); - __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); - __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); - __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, *(__m256*)_ps256_n1); - __m256 tne_val = mask_gather_ps256(image, i_tne_offset, x1_in_range); - __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, y1_in_range); - __m256 tse_val = mask_gather_ps256(image, i_tse_offset, v110_in_range); - - __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, z1_in_range); - __m256 bne_val = mask_gather_ps256(image, i_bne_offset, v101_in_range); - __m256 bsw_val = mask_gather_ps256(image, i_bsw_offset, v011_in_range); - __m256 bse_val = mask_gather_ps256(image, i_bse_offset, v111_in_range); - - __m256 _v = _mm256_mul_ps(tnw_val, tnw); - _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); - - _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); - } - } - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = ((gx + 1) * src.w - 1) / 2.f; - gy = ((gy + 1) * src.h - 1) / 2.f; - gz = ((gz + 1) * src.d - 1) / 2.f; - - gx = std::min(src.w - 1.0f, std::max(gx, 0.0f)); - gy = std::min(src.h - 1.0f, std::max(gy, 0.0f)); - gz = std::min(src.d - 1.0f, std::max(gz, 0.0f)); - - // bilinear interpolate - int x0 = (int)floor(gx); - int y0 = (int)floor(gy); - int z0 = (int)floor(gz); - int x1 = x0 + 1; - 
int y1 = y0 + 1; - int z1 = z0 + 1; - - bool x1_in_range = (x1 > -1) & (x1 < src.w); - bool y1_in_range = (y1 > -1) & (y1 < src.h); - bool z1_in_range = (z1 > -1) & (z1 < src.d); - - bool v11_in_range = x1_in_range & y1_in_range; - - bool v110_in_range = y1_in_range & z1_in_range; - - bool v101_in_range = x1_in_range & z1_in_range; - bool v111_in_range = v11_in_range & z1_in_range; - - float alpha = gx - x0; - float beta = gy - y0; - float gamma = gz - z0; - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - float v000 = image.depth(z0).row(y0)[x0]; - float v010 = y1_in_range ? image.depth(z0).row(y1)[x0] : 0; - float v100 = z1_in_range ? image.depth(z1).row(y0)[x0] : 0; - float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; - - float v001 = x1_in_range ? image.depth(z0).row(y0)[x1] : 0; - float v011 = v11_in_range ? image.depth(z0).row(y1)[x1] : 0; - float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; - float v111 = v111_in_range ? image.depth(z1).row(y1)[x1] : 0; - - float v00 = v000 * (1 - alpha) + v001 * alpha; - float v01 = v010 * (1 - alpha) + v011 * alpha; - float v10 = v100 * (1 - alpha) + v101 * alpha; - float v11 = v110 * (1 - alpha) + v111 * alpha; - - float v0 = v00 * (1 - beta) + v01 * beta; - float v1 = v10 * (1 - beta) + v11 * beta; - - dst.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; - } - } - } -} - -static void gridsample_3d_bilinear_align1_border_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h * grid.d; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - // z - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); - - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - __m256 z_t = 
_mm256_floor_ps(gz); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); - - __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - - tnw = _mm256_mul_ps(b, nw); - tne = _mm256_mul_ps(b, ne); - tsw = _mm256_mul_ps(b, sw); - tse = _mm256_mul_ps(b, se); - - bnw = _mm256_mul_ps(t, nw); - bne = _mm256_mul_ps(t, ne); - bsw = _mm256_mul_ps(t, sw); - bse = _mm256_mul_ps(t, se); - } - - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); - - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); - - __m256 v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v110_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v011_in_range = _mm256_and_ps(y1_in_range, z1_in_range); - v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); - v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); - } - - __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, *(__m256*)_ps256_1); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, vImgWf); - __m256 tse_offset = _mm256_add_ps(tsw_offset, *(__m256*)_ps256_1); - - __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(vImgWf, vImgHf), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, *(__m256*)_ps256_1); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, vImgWf); - __m256 bse_offset = _mm256_add_ps(bsw_offset, *(__m256*)_ps256_1); - - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); - __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); - __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); - __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); - - __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); - __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); - __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); - __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, *(__m256*)_ps256_n1); - __m256 tne_val = mask_gather_ps256(image, i_tne_offset, x1_in_range); - __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, y1_in_range); - __m256 tse_val = mask_gather_ps256(image, i_tse_offset, v110_in_range); - - __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, z1_in_range); - __m256 bne_val = mask_gather_ps256(image, i_bne_offset, v101_in_range); - __m256 bsw_val = mask_gather_ps256(image, i_bsw_offset, v011_in_range); - __m256 bse_val = mask_gather_ps256(image, i_bse_offset, v111_in_range); - - __m256 _v = _mm256_mul_ps(tnw_val, tnw); - _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); - _v 
= _mm256_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); - - _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); - } - } - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = (gx + 1) / 2.f * (src.w - 1); - gy = (gy + 1) / 2.f * (src.h - 1); - gz = (gz + 1) / 2.f * (src.d - 1); - - gx = std::min(src.w - 1.0f, std::max(gx, 0.0f)); - gy = std::min(src.h - 1.0f, std::max(gy, 0.0f)); - gz = std::min(src.d - 1.0f, std::max(gz, 0.0f)); - - // bilinear interpolate - int x0 = (int)floor(gx); - int y0 = (int)floor(gy); - int z0 = (int)floor(gz); - int x1 = x0 + 1; - int y1 = y0 + 1; - int z1 = z0 + 1; - - bool x1_in_range = (x1 > -1) & (x1 < src.w); - bool y1_in_range = (y1 > -1) & (y1 < src.h); - bool z1_in_range = (z1 > -1) & (z1 < src.d); - - bool v11_in_range = x1_in_range & y1_in_range; - - bool v110_in_range = y1_in_range & z1_in_range; - - bool v101_in_range = x1_in_range & z1_in_range; - bool v111_in_range = v11_in_range & z1_in_range; - - float alpha = gx - x0; - float beta = gy - y0; - float gamma = gz - z0; - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - float v000 = image.depth(z0).row(y0)[x0]; - float v010 = y1_in_range ? image.depth(z0).row(y1)[x0] : 0; - float v100 = z1_in_range ? image.depth(z1).row(y0)[x0] : 0; - float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; - - float v001 = x1_in_range ? image.depth(z0).row(y0)[x1] : 0; - float v011 = v11_in_range ? image.depth(z0).row(y1)[x1] : 0; - float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; - float v111 = v111_in_range ? 
image.depth(z1).row(y1)[x1] : 0; - - float v00 = v000 * (1 - alpha) + v001 * alpha; - float v01 = v010 * (1 - alpha) + v011 * alpha; - float v10 = v100 * (1 - alpha) + v101 * alpha; - float v11 = v110 * (1 - alpha) + v111 * alpha; - - float v0 = v00 * (1 - beta) + v01 * beta; - float v1 = v10 * (1 - beta) + v11 * beta; - - dst.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; - } - } - } -} - -static void gridsample_3d_bilinear_align0_reflection_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h * grid.d; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - __m256 v0p5fp8 = _mm256_set1_ps(0.5f); - gx = _mm256_add_ps(gx, v0p5fp8); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(vImgWf, reflectx_v); - - gx = _mm256_sub_ps(gx, v0p5fp8); - - _mm256_sub_ps(gx, v0p5fp8); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_add_ps(gy, v0p5fp8); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(vImgHf, reflecty_v); - - gy = _mm256_sub_ps(gy, v0p5fp8); - - _mm256_sub_ps(gy, v0p5fp8); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - // z - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_add_ps(gz, v0p5fp8); - - gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, vImgDf), *(__m256*)_ps256_inv_sign_mask); - gz = _mm256_sub_ps(vImgDf, reflectz_v); - - gz = _mm256_sub_ps(gz, v0p5fp8); - - _mm256_sub_ps(gz, v0p5fp8); - - gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - __m256 z_t = _mm256_floor_ps(gz); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = 
_mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); - - __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - - tnw = _mm256_mul_ps(b, nw); - tne = _mm256_mul_ps(b, ne); - tsw = _mm256_mul_ps(b, sw); - tse = _mm256_mul_ps(b, se); - - bnw = _mm256_mul_ps(t, nw); - bne = _mm256_mul_ps(t, ne); - bsw = _mm256_mul_ps(t, sw); - bse = _mm256_mul_ps(t, se); - } - - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); - - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); - - __m256 v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v110_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v011_in_range = _mm256_and_ps(y1_in_range, z1_in_range); - v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); - v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); - } - - __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, *(__m256*)_ps256_1); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, vImgWf); - __m256 tse_offset = _mm256_add_ps(tsw_offset, *(__m256*)_ps256_1); - - __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(vImgWf, vImgHf), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, *(__m256*)_ps256_1); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, vImgWf); - __m256 bse_offset = _mm256_add_ps(bsw_offset, *(__m256*)_ps256_1); - - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); - __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); - __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); - __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); - - __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); - __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); - __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); - __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, *(__m256*)_ps256_n1); - __m256 tne_val = mask_gather_ps256(image, i_tne_offset, x1_in_range); - __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, y1_in_range); - __m256 tse_val = mask_gather_ps256(image, i_tse_offset, v110_in_range); - - __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, z1_in_range); - __m256 bne_val = mask_gather_ps256(image, i_bne_offset, v101_in_range); - __m256 bsw_val = mask_gather_ps256(image, i_bsw_offset, v011_in_range); - __m256 bse_val = mask_gather_ps256(image, i_bse_offset, v111_in_range); - - __m256 _v = _mm256_mul_ps(tnw_val, tnw); - _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); - - _v = 
_mm256_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); - - _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); - } - } - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = ((gx + 1) * src.w - 1) / 2.f; - gy = ((gy + 1) * src.h - 1) / 2.f; - gz = ((gz + 1) * src.d - 1) / 2.f; - - gx = abs(gx + 0.5f); - gx = src.w - abs(gx - src.w) - 0.5; - - gy = abs(gy + 0.5f); - gy = src.h - abs(gy - src.h) - 0.5; - - gz = abs(gz + 0.5f); - gz = src.d - abs(gz - src.d) - 0.5; - - gx = std::min(src.w - 1.0f, std::max(gx, 0.0f)); - gy = std::min(src.h - 1.0f, std::max(gy, 0.0f)); - gz = std::min(src.d - 1.0f, std::max(gz, 0.0f)); - - // bilinear interpolate - int x0 = (int)floor(gx); - int y0 = (int)floor(gy); - int z0 = (int)floor(gz); - int x1 = x0 + 1; - int y1 = y0 + 1; - int z1 = z0 + 1; - - bool x1_in_range = (x1 > -1) & (x1 < src.w); - bool y1_in_range = (y1 > -1) & (y1 < src.h); - bool z1_in_range = (z1 > -1) & (z1 < src.d); - - bool v11_in_range = x1_in_range & y1_in_range; - - bool v110_in_range = y1_in_range & z1_in_range; - - bool v101_in_range = x1_in_range & z1_in_range; - bool v111_in_range = v11_in_range & z1_in_range; - - float alpha = gx - x0; - float beta = gy - y0; - float gamma = gz - z0; - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - float v000 = image.depth(z0).row(y0)[x0]; - float v010 = y1_in_range ? image.depth(z0).row(y1)[x0] : 0; - float v100 = z1_in_range ? image.depth(z1).row(y0)[x0] : 0; - float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; - - float v001 = x1_in_range ? image.depth(z0).row(y0)[x1] : 0; - float v011 = v11_in_range ? image.depth(z0).row(y1)[x1] : 0; - float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; - float v111 = v111_in_range ? 
image.depth(z1).row(y1)[x1] : 0; - - float v00 = v000 * (1 - alpha) + v001 * alpha; - float v01 = v010 * (1 - alpha) + v011 * alpha; - float v10 = v100 * (1 - alpha) + v101 * alpha; - float v11 = v110 * (1 - alpha) + v111 * alpha; - - float v0 = v00 * (1 - beta) + v01 * beta; - float v1 = v10 * (1 - beta) + v11 * beta; - - dst.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; - } - } - } -} - -static void gridsample_3d_bilinear_align1_reflection_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h * grid.d; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(border_x, reflectx_v); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(border_y, reflecty_v); - - // z - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, border_z), *(__m256*)_ps256_inv_sign_mask); - gz = _mm256_sub_ps(border_z, reflectz_v); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - __m256 z_t = _mm256_floor_ps(gz); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); - - __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - - tnw = _mm256_mul_ps(b, nw); - tne = _mm256_mul_ps(b, ne); - tsw = _mm256_mul_ps(b, sw); - tse = _mm256_mul_ps(b, se); - - bnw = 
_mm256_mul_ps(t, nw); - bne = _mm256_mul_ps(t, ne); - bsw = _mm256_mul_ps(t, sw); - bse = _mm256_mul_ps(t, se); - } - - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); - - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); - - __m256 v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v110_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v011_in_range = _mm256_and_ps(y1_in_range, z1_in_range); - v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); - v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); - } - - __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, *(__m256*)_ps256_1); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, vImgWf); - __m256 tse_offset = _mm256_add_ps(tsw_offset, *(__m256*)_ps256_1); - - __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(vImgWf, vImgHf), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, *(__m256*)_ps256_1); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, vImgWf); - __m256 bse_offset = _mm256_add_ps(bsw_offset, *(__m256*)_ps256_1); - - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); - __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); - __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); - __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); - - __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); - __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); - __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); - __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - __m256 tnw_val = mask_gather_ps256(image, i_tnw_offset, *(__m256*)_ps256_n1); - __m256 tne_val = mask_gather_ps256(image, i_tne_offset, x1_in_range); - __m256 tsw_val = mask_gather_ps256(image, i_tsw_offset, y1_in_range); - __m256 tse_val = mask_gather_ps256(image, i_tse_offset, v110_in_range); - - __m256 bnw_val = mask_gather_ps256(image, i_bnw_offset, z1_in_range); - __m256 bne_val = mask_gather_ps256(image, i_bne_offset, v101_in_range); - __m256 bsw_val = mask_gather_ps256(image, i_bsw_offset, v011_in_range); - __m256 bse_val = mask_gather_ps256(image, i_bse_offset, v111_in_range); - - __m256 _v = _mm256_mul_ps(tnw_val, tnw); - _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); - - _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); - } - } - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = (gx + 1) / 2.f * (src.w - 1); - gy = (gy + 1) / 2.f * (src.h - 1); - gz = (gz + 1) / 2.f * (src.d - 1); 
- - gx = abs(gx); - gx = (src.w - 1) - abs(gx - (src.w - 1)); - - gy = abs(gy); - gy = (src.h - 1) - abs(gy - (src.h - 1)); - - gz = abs(gz); - gz = (src.d - 1) - abs(gz - (src.d - 1)); - - gx = std::min(src.w - 1.0f, std::max(gx, 0.0f)); - gy = std::min(src.h - 1.0f, std::max(gy, 0.0f)); - gz = std::min(src.d - 1.0f, std::max(gz, 0.0f)); - - // bilinear interpolate - int x0 = (int)floor(gx); - int y0 = (int)floor(gy); - int z0 = (int)floor(gz); - int x1 = x0 + 1; - int y1 = y0 + 1; - int z1 = z0 + 1; - - bool x1_in_range = (x1 > -1) & (x1 < src.w); - bool y1_in_range = (y1 > -1) & (y1 < src.h); - bool z1_in_range = (z1 > -1) & (z1 < src.d); - - bool v11_in_range = x1_in_range & y1_in_range; - - bool v110_in_range = y1_in_range & z1_in_range; - - bool v101_in_range = x1_in_range & z1_in_range; - bool v111_in_range = v11_in_range & z1_in_range; - - float alpha = gx - x0; - float beta = gy - y0; - float gamma = gz - z0; - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - float v000 = image.depth(z0).row(y0)[x0]; - float v010 = y1_in_range ? image.depth(z0).row(y1)[x0] : 0; - float v100 = z1_in_range ? image.depth(z1).row(y0)[x0] : 0; - float v110 = v110_in_range ? image.depth(z1).row(y1)[x0] : 0; - - float v001 = x1_in_range ? image.depth(z0).row(y0)[x1] : 0; - float v011 = v11_in_range ? image.depth(z0).row(y1)[x1] : 0; - float v101 = v101_in_range ? image.depth(z1).row(y0)[x1] : 0; - float v111 = v111_in_range ? image.depth(z1).row(y1)[x1] : 0; - - float v00 = v000 * (1 - alpha) + v001 * alpha; - float v01 = v010 * (1 - alpha) + v011 * alpha; - float v10 = v100 * (1 - alpha) + v101 * alpha; - float v11 = v110 * (1 - alpha) + v111 * alpha; - - float v0 = v00 * (1 - beta) + v01 * beta; - float v1 = v10 * (1 - beta) + v11 * beta; - - dst.channel(q).depth(y)[x / 3] = v0 * (1 - gamma) + v1 * gamma; - } - } - } -} \ No newline at end of file diff --git a/src/layer/x86/gridsample_bilinear_pack16.h b/src/layer/x86/gridsample_bilinear_pack16.h deleted file mode 100644 index 66d7c87ea84..00000000000 --- a/src/layer/x86/gridsample_bilinear_pack16.h +++ /dev/null @@ -1,1431 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
- -static void gridsample_2d_bilinear_align0_zeros_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); - - // y - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - } - - __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); - __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); - - __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); - __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); - - __m512 nw = _mm512_mul_ps(s, e); - __m512 ne = _mm512_mul_ps(s, w); - __m512 sw = _mm512_mul_ps(n, e); - __m512 se = _mm512_mul_ps(n, w); - - __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); - __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); - - __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x0); - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y0_in_range = _mm512_cmpgt_epi32_mask(y0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y0); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - - __mmask16 v00_in_range = x0_in_range & y0_in_range; - __mmask16 v01_in_range = x0_in_range & y1_in_range; - __mmask16 v10_in_range = x1_in_range & y0_in_range; - __mmask16 v11_in_range = x1_in_range & y1_in_range; - - // (W*y + x) * elempack + vec(8) - __m512i i_nw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i i_ne_offset = _mm512_add_epi32(i_nw_offset, vElempacki); - __m512i i_sw_offset = _mm512_add_epi32(i_nw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); - __m512i i_se_offset = _mm512_add_epi32(i_sw_offset, vElempacki); - - for (int q = 0; q < dst.c; q++) - { - __m512 nw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v00_in_range, i_nw_offset, src.channel(q), sizeof(float)); - __m512 ne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v10_in_range, i_ne_offset, src.channel(q), sizeof(float)); - __m512 sw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v01_in_range, i_sw_offset, src.channel(q), sizeof(float)); - __m512 se_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v11_in_range, i_se_offset, src.channel(q), sizeof(float)); - - __m512 _v = _mm512_mul_ps(nw_val, nw); - _v = _mm512_fmadd_ps(ne_val, ne, _v); - _v = _mm512_fmadd_ps(sw_val, sw, _v); - _v = _mm512_fmadd_ps(se_val, se, _v); 
- - _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bilinear_align1_zeros_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - // x - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); - - // y - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - } - - __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); - __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); - - __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); - __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); - - __m512 nw = _mm512_mul_ps(s, e); - __m512 ne = _mm512_mul_ps(s, w); - __m512 sw = _mm512_mul_ps(n, e); - __m512 se = _mm512_mul_ps(n, w); - - __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); - __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); - - __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x0); - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y0_in_range = _mm512_cmpgt_epi32_mask(y0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y0); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - - __mmask16 v00_in_range = x0_in_range & y0_in_range; - __mmask16 v01_in_range = x0_in_range & y1_in_range; - __mmask16 v10_in_range = x1_in_range & y0_in_range; - __mmask16 v11_in_range = x1_in_range & y1_in_range; - - // (W*y + x) * elempack + vec(8) - __m512i i_nw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i i_ne_offset = _mm512_add_epi32(i_nw_offset, vElempacki); - __m512i i_sw_offset = _mm512_add_epi32(i_nw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); - __m512i i_se_offset = _mm512_add_epi32(i_sw_offset, vElempacki); - - for (int q = 0; q < dst.c; q++) - { - __m512 nw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v00_in_range, i_nw_offset, src.channel(q), sizeof(float)); - __m512 ne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v10_in_range, i_ne_offset, src.channel(q), sizeof(float)); - __m512 sw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v01_in_range, i_sw_offset, src.channel(q), sizeof(float)); - __m512 se_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v11_in_range, i_se_offset, src.channel(q), sizeof(float)); - - __m512 _v = _mm512_mul_ps(nw_val, nw); - _v = 
_mm512_fmadd_ps(ne_val, ne, _v); - _v = _mm512_fmadd_ps(sw_val, sw, _v); - _v = _mm512_fmadd_ps(se_val, se, _v); - - _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bilinear_align0_border_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); - - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - - gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); - - // y - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - - gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); - } - - __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); - __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); - - __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); - __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); - - __m512 nw = _mm512_mul_ps(s, e); - __m512 ne = _mm512_mul_ps(s, w); - __m512 sw = _mm512_mul_ps(n, e); - __m512 se = _mm512_mul_ps(n, w); - - __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); - __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); - - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - - __mmask16 v11_in_range = x1_in_range & y1_in_range; - - // (W*y + x) * elempack + vec(8) - __m512i i_nw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i i_ne_offset = _mm512_add_epi32(i_nw_offset, vElempacki); - __m512i i_sw_offset = _mm512_add_epi32(i_nw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); - __m512i i_se_offset = _mm512_add_epi32(i_sw_offset, vElempacki); - - for (int q = 0; q < dst.c; q++) - { - __m512 nw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 0b1111111111111111, i_nw_offset, src.channel(q), sizeof(float)); - __m512 ne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), x1_in_range, i_ne_offset, src.channel(q), sizeof(float)); - __m512 sw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), y1_in_range, i_sw_offset, src.channel(q), sizeof(float)); - __m512 se_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v11_in_range, i_se_offset, src.channel(q), sizeof(float)); - - __m512 _v = _mm512_mul_ps(nw_val, nw); - _v = 
_mm512_fmadd_ps(ne_val, ne, _v); - _v = _mm512_fmadd_ps(sw_val, sw, _v); - _v = _mm512_fmadd_ps(se_val, se, _v); - - _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bilinear_align1_border_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - // x - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); - - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - - gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); - - // y - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - - gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); - } - - __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); - __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); - - __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); - __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); - - __m512 nw = _mm512_mul_ps(s, e); - __m512 ne = _mm512_mul_ps(s, w); - __m512 sw = _mm512_mul_ps(n, e); - __m512 se = _mm512_mul_ps(n, w); - - __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); - __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); - - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - - __mmask16 v11_in_range = x1_in_range & y1_in_range; - - // (W*y + x) * elempack + vec(8) - __m512i i_nw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i i_ne_offset = _mm512_add_epi32(i_nw_offset, vElempacki); - __m512i i_sw_offset = _mm512_add_epi32(i_nw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); - __m512i i_se_offset = _mm512_add_epi32(i_sw_offset, vElempacki); - - for (int q = 0; q < dst.c; q++) - { - __m512 nw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 0b1111111111111111, i_nw_offset, src.channel(q), sizeof(float)); - __m512 ne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), x1_in_range, i_ne_offset, src.channel(q), sizeof(float)); - __m512 sw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), y1_in_range, i_sw_offset, src.channel(q), sizeof(float)); - __m512 se_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v11_in_range, i_se_offset, src.channel(q), sizeof(float)); - - __m512 _v = _mm512_mul_ps(nw_val, nw); 
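The border variants above differ from the zeros variants in only two places: the unnormalized coordinate is clamped into [0, size - 1] before flooring (the min/max against border_x / border_y), so the floor tap (x0, y0) is always valid and its gather can use the all-ones mask 0b1111111111111111, leaving range masks only for the +1 neighbours; and align_corners=true unnormalizes with (g + 1) / 2 * (size - 1) instead of ((g + 1) * size - 1) / 2. A scalar sketch under the same assumptions (clampf is a local helper, not ncnn API):

#include <math.h>

static inline float clampf(float v, float lo, float hi)
{
    return v < lo ? lo : (v > hi ? hi : v);
}

// sx/sy are already unnormalized grid coordinates.
static float bilinear_sample_border(const float* image, int w, int h, float sx, float sy)
{
    sx = clampf(sx, 0.f, (float)(w - 1));
    sy = clampf(sy, 0.f, (float)(h - 1));

    int x0 = (int)floorf(sx);
    int y0 = (int)floorf(sy);
    int x1 = x0 + 1;
    int y1 = y0 + 1;

    float fx = sx - x0;
    float fy = sy - y0;

    // (x0, y0) is always in range after clamping; a +1 tap can only fall
    // outside the image when its weight is already 0, so fetching 0 there
    // is harmless, exactly as with the masked gathers above.
    float v00 = image[y0 * w + x0];
    float v10 = (x1 < w) ? image[y0 * w + x1] : 0.f;
    float v01 = (y1 < h) ? image[y1 * w + x0] : 0.f;
    float v11 = (x1 < w && y1 < h) ? image[y1 * w + x1] : 0.f;

    return v00 * (1.f - fy) * (1.f - fx)
           + v10 * (1.f - fy) * fx
           + v01 * fy * (1.f - fx)
           + v11 * fy * fx;
}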
- _v = _mm512_fmadd_ps(ne_val, ne, _v); - _v = _mm512_fmadd_ps(sw_val, sw, _v); - _v = _mm512_fmadd_ps(se_val, se, _v); - - _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bilinear_align0_reflection_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); - - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - - __m512 v0p5fp16 = _mm512_set1_ps(0.5f); - gx = _mm512_add_ps(gx, v0p5fp16); - - gx = _mm512_and_ps(gx, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflectx_v = _mm512_and_ps(_mm512_sub_ps(gx, vImgWf), *(__m512*)_ps512_inv_sign_mask); - gx = _mm512_sub_ps(vImgWf, reflectx_v); - - gx = _mm512_sub_ps(gx, v0p5fp16); - - _mm512_sub_ps(gx, v0p5fp16); - - gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); - - // y - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - - gy = _mm512_add_ps(gy, v0p5fp16); - - gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflecty_v = _mm512_and_ps(_mm512_sub_ps(gy, vImgHf), *(__m512*)_ps512_inv_sign_mask); - gy = _mm512_sub_ps(vImgHf, reflecty_v); - - gy = _mm512_sub_ps(gy, v0p5fp16); - - _mm512_sub_ps(gy, v0p5fp16); - - gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); - } - - __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); - __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); - - __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); - __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); - - __m512 nw = _mm512_mul_ps(s, e); - __m512 ne = _mm512_mul_ps(s, w); - __m512 sw = _mm512_mul_ps(n, e); - __m512 se = _mm512_mul_ps(n, w); - - __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); - __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); - - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - - __mmask16 v11_in_range = x1_in_range & y1_in_range; - - // (W*y + x) * elempack + vec(8) - __m512i i_nw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i i_ne_offset = _mm512_add_epi32(i_nw_offset, vElempacki); - __m512i i_sw_offset = _mm512_add_epi32(i_nw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); - 
__m512i i_se_offset = _mm512_add_epi32(i_sw_offset, vElempacki); - - for (int q = 0; q < dst.c; q++) - { - __m512 nw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 0b1111111111111111, i_nw_offset, src.channel(q), sizeof(float)); - __m512 ne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), x1_in_range, i_ne_offset, src.channel(q), sizeof(float)); - __m512 sw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), y1_in_range, i_sw_offset, src.channel(q), sizeof(float)); - __m512 se_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v11_in_range, i_se_offset, src.channel(q), sizeof(float)); - - __m512 _v = _mm512_mul_ps(nw_val, nw); - _v = _mm512_fmadd_ps(ne_val, ne, _v); - _v = _mm512_fmadd_ps(sw_val, sw, _v); - _v = _mm512_fmadd_ps(se_val, se, _v); - - _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bilinear_align1_reflection_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - // x - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); - - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - - gx = _mm512_and_ps(gx, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflectx_v = _mm512_and_ps(_mm512_sub_ps(gx, border_x), *(__m512*)_ps512_inv_sign_mask); - gx = _mm512_sub_ps(border_x, reflectx_v); - - // y - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - - gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflecty_v = _mm512_and_ps(_mm512_sub_ps(gy, border_y), *(__m512*)_ps512_inv_sign_mask); - gy = _mm512_sub_ps(border_y, reflecty_v); - } - - __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); - __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); - - __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); - __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); - - __m512 nw = _mm512_mul_ps(s, e); - __m512 ne = _mm512_mul_ps(s, w); - __m512 sw = _mm512_mul_ps(n, e); - __m512 se = _mm512_mul_ps(n, w); - - __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); - __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); - - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - - __mmask16 v11_in_range = x1_in_range & y1_in_range; - - // (W*y + x) * elempack + vec(8) - __m512i i_nw_offset = 
_mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i i_ne_offset = _mm512_add_epi32(i_nw_offset, vElempacki); - __m512i i_sw_offset = _mm512_add_epi32(i_nw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); - __m512i i_se_offset = _mm512_add_epi32(i_sw_offset, vElempacki); - - for (int q = 0; q < dst.c; q++) - { - __m512 nw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 0b1111111111111111, i_nw_offset, src.channel(q), sizeof(float)); - __m512 ne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), x1_in_range, i_ne_offset, src.channel(q), sizeof(float)); - __m512 sw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), y1_in_range, i_sw_offset, src.channel(q), sizeof(float)); - __m512 se_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v11_in_range, i_se_offset, src.channel(q), sizeof(float)); - - __m512 _v = _mm512_mul_ps(nw_val, nw); - _v = _mm512_fmadd_ps(ne_val, ne, _v); - _v = _mm512_fmadd_ps(sw_val, sw, _v); - _v = _mm512_fmadd_ps(se_val, se, _v); - - _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_3d_bilinear_align0_zeros_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512 vImgDf = _mm512_set1_ps(src.d); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - const __m512i vImgDi = _mm512_set1_epi32(src.d); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); - - // y - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - - // z - gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), vImgDf, *(__m512*)_ps512_1), two); - } - - __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); - __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); - __m512 z_t = _mm512_roundscale_ps(gz, _MM_FROUND_TO_NEG_INF); - - __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); - __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); - __m512 t = _mm512_sub_ps(gz, z_t); - __m512 b = _mm512_sub_ps(*(__m512*)_ps512_1, t); - - __m512 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m512 nw = _mm512_mul_ps(s, e); - __m512 ne = _mm512_mul_ps(s, w); - __m512 sw = _mm512_mul_ps(n, e); - __m512 se = _mm512_mul_ps(n, w); - - tnw = _mm512_mul_ps(b, nw); - tne = _mm512_mul_ps(b, ne); - tsw = _mm512_mul_ps(b, sw); - tse = _mm512_mul_ps(b, se); - - bnw = _mm512_mul_ps(t, nw); - bne = _mm512_mul_ps(t, ne); - bsw = _mm512_mul_ps(t, sw); - bse = _mm512_mul_ps(t, se); - } - - __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, 
*(__m512i*)_pi32_512_1); - __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); - __m512i z0 = _mm512_cvtps_epi32(z_t); - __m512i z1 = _mm512_add_epi32(z0, *(__m512i*)_pi32_512_1); - - __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x0); - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y0_in_range = _mm512_cmpgt_epi32_mask(y0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y0); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - __mmask16 z0_in_range = _mm512_cmpgt_epi32_mask(z0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgDi, z0); - __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgDi, z1); - - __mmask16 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __mmask16 v00_in_range = x0_in_range & y0_in_range; - __mmask16 v01_in_range = x0_in_range & y1_in_range; - __mmask16 v10_in_range = x1_in_range & y0_in_range; - __mmask16 v11_in_range = x1_in_range & y1_in_range; - - v000_in_range = v00_in_range & z0_in_range; - v010_in_range = v01_in_range & z0_in_range; - v100_in_range = v10_in_range & z0_in_range; - v110_in_range = v11_in_range & z0_in_range; - - v001_in_range = v00_in_range & z1_in_range; - v011_in_range = v01_in_range & z1_in_range; - v101_in_range = v10_in_range & z1_in_range; - v111_in_range = v11_in_range & z1_in_range; - } - - // (W*H*z + W*y + x) * elempack + vec(8) - __m512i i_tnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), z0), _mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i i_tne_offset = _mm512_add_epi32(i_tnw_offset, vElempacki); - __m512i i_tsw_offset = _mm512_add_epi32(i_tnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); - __m512i i_tse_offset = _mm512_add_epi32(i_tsw_offset, vElempacki); - - __m512i i_bnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m512i i_bne_offset = _mm512_add_epi32(i_bnw_offset, vElempacki); - __m512i i_bsw_offset = _mm512_add_epi32(i_bnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); - __m512i i_bse_offset = _mm512_add_epi32(i_bsw_offset, vElempacki); - - for (int q = 0; q < dst.c; q++) - { - __m512 tnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v000_in_range, i_tnw_offset, src.channel(q), sizeof(float)); - __m512 tne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v100_in_range, i_tne_offset, src.channel(q), sizeof(float)); - __m512 tsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v010_in_range, i_tsw_offset, src.channel(q), sizeof(float)); - __m512 tse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v110_in_range, i_tse_offset, src.channel(q), sizeof(float)); - - __m512 bnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v001_in_range, i_bnw_offset, src.channel(q), sizeof(float)); - __m512 bne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v101_in_range, i_bne_offset, src.channel(q), sizeof(float)); - __m512 bsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v011_in_range, i_bsw_offset, src.channel(q), sizeof(float)); - __m512 bse_val = 
_mm512_mask_i32gather_ps(_mm512_setzero_ps(), v111_in_range, i_bse_offset, src.channel(q), sizeof(float)); - - __m512 _v = _mm512_mul_ps(tnw_val, tnw); - _v = _mm512_fmadd_ps(tne_val, tne, _v); - _v = _mm512_fmadd_ps(tsw_val, tsw, _v); - _v = _mm512_fmadd_ps(tse_val, tse, _v); - - _v = _mm512_fmadd_ps(bnw_val, bnw, _v); - _v = _mm512_fmadd_ps(bne_val, bne, _v); - _v = _mm512_fmadd_ps(bsw_val, bsw, _v); - _v = _mm512_fmadd_ps(bse_val, bse, _v); - - _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_bilinear_align1_zeros_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512 vImgDf = _mm512_set1_ps(src.d); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - const __m512i vImgDi = _mm512_set1_epi32(src.d); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - // x - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); - - // y - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - - // z - gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1)); - } - - __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); - __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); - __m512 z_t = _mm512_roundscale_ps(gz, _MM_FROUND_TO_NEG_INF); - - __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); - __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); - __m512 t = _mm512_sub_ps(gz, z_t); - __m512 b = _mm512_sub_ps(*(__m512*)_ps512_1, t); - - __m512 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m512 nw = _mm512_mul_ps(s, e); - __m512 ne = _mm512_mul_ps(s, w); - __m512 sw = _mm512_mul_ps(n, e); - __m512 se = _mm512_mul_ps(n, w); - - tnw = _mm512_mul_ps(b, nw); - tne = _mm512_mul_ps(b, ne); - tsw = _mm512_mul_ps(b, sw); - tse = _mm512_mul_ps(b, se); - - bnw = _mm512_mul_ps(t, nw); - bne = _mm512_mul_ps(t, ne); - bsw = _mm512_mul_ps(t, sw); - bse = _mm512_mul_ps(t, se); - } - - __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); - __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); - __m512i z0 = _mm512_cvtps_epi32(z_t); - __m512i z1 = _mm512_add_epi32(z0, *(__m512i*)_pi32_512_1); - - __mmask16 x0_in_range = _mm512_cmpgt_epi32_mask(x0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x0); - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y0_in_range = _mm512_cmpgt_epi32_mask(y0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y0); - __mmask16 y1_in_range = 
_mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - __mmask16 z0_in_range = _mm512_cmpgt_epi32_mask(z0, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgDi, z0); - __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgDi, z1); - - __mmask16 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __mmask16 v00_in_range = x0_in_range & y0_in_range; - __mmask16 v01_in_range = x0_in_range & y1_in_range; - __mmask16 v10_in_range = x1_in_range & y0_in_range; - __mmask16 v11_in_range = x1_in_range & y1_in_range; - - v000_in_range = v00_in_range & z0_in_range; - v010_in_range = v01_in_range & z0_in_range; - v100_in_range = v10_in_range & z0_in_range; - v110_in_range = v11_in_range & z0_in_range; - - v001_in_range = v00_in_range & z1_in_range; - v011_in_range = v01_in_range & z1_in_range; - v101_in_range = v10_in_range & z1_in_range; - v111_in_range = v11_in_range & z1_in_range; - } - - // (W*H*z + W*y + x) * elempack + vec(8) - __m512i i_tnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), z0), _mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i i_tne_offset = _mm512_add_epi32(i_tnw_offset, vElempacki); - __m512i i_tsw_offset = _mm512_add_epi32(i_tnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); - __m512i i_tse_offset = _mm512_add_epi32(i_tsw_offset, vElempacki); - - __m512i i_bnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m512i i_bne_offset = _mm512_add_epi32(i_bnw_offset, vElempacki); - __m512i i_bsw_offset = _mm512_add_epi32(i_bnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); - __m512i i_bse_offset = _mm512_add_epi32(i_bsw_offset, vElempacki); - - for (int q = 0; q < dst.c; q++) - { - __m512 tnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v000_in_range, i_tnw_offset, src.channel(q), sizeof(float)); - __m512 tne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v100_in_range, i_tne_offset, src.channel(q), sizeof(float)); - __m512 tsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v010_in_range, i_tsw_offset, src.channel(q), sizeof(float)); - __m512 tse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v110_in_range, i_tse_offset, src.channel(q), sizeof(float)); - - __m512 bnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v001_in_range, i_bnw_offset, src.channel(q), sizeof(float)); - __m512 bne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v101_in_range, i_bne_offset, src.channel(q), sizeof(float)); - __m512 bsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v011_in_range, i_bsw_offset, src.channel(q), sizeof(float)); - __m512 bse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v111_in_range, i_bse_offset, src.channel(q), sizeof(float)); - - __m512 _v = _mm512_mul_ps(tnw_val, tnw); - _v = _mm512_fmadd_ps(tne_val, tne, _v); - _v = _mm512_fmadd_ps(tsw_val, tsw, _v); - _v = _mm512_fmadd_ps(tse_val, tse, _v); - - _v = _mm512_fmadd_ps(bnw_val, bnw, _v); - _v = _mm512_fmadd_ps(bne_val, bne, _v); - _v = _mm512_fmadd_ps(bsw_val, bsw, _v); - _v = _mm512_fmadd_ps(bse_val, bse, _v); - - _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_bilinear_align0_border_blob_pack16(const Mat& src, Mat& 
dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512 vImgDf = _mm512_set1_ps(src.d); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - const __m512i vImgDi = _mm512_set1_epi32(src.d); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); - - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - - gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); - - // y - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - - gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); - - // z - gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), vImgDf, *(__m512*)_ps512_1), two); - - const __m512 border_z = _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1); - - gz = _mm512_min_ps(border_z, _mm512_max_ps(gz, _mm512_setzero_ps())); - } - - __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); - __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); - __m512 z_t = _mm512_roundscale_ps(gz, _MM_FROUND_TO_NEG_INF); - - __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); - __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); - __m512 t = _mm512_sub_ps(gz, z_t); - __m512 b = _mm512_sub_ps(*(__m512*)_ps512_1, t); - - __m512 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m512 nw = _mm512_mul_ps(s, e); - __m512 ne = _mm512_mul_ps(s, w); - __m512 sw = _mm512_mul_ps(n, e); - __m512 se = _mm512_mul_ps(n, w); - - tnw = _mm512_mul_ps(b, nw); - tne = _mm512_mul_ps(b, ne); - tsw = _mm512_mul_ps(b, sw); - tse = _mm512_mul_ps(b, se); - - bnw = _mm512_mul_ps(t, nw); - bne = _mm512_mul_ps(t, ne); - bsw = _mm512_mul_ps(t, sw); - bse = _mm512_mul_ps(t, se); - } - - __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); - __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); - __m512i z0 = _mm512_cvtps_epi32(z_t); - __m512i z1 = _mm512_add_epi32(z0, *(__m512i*)_pi32_512_1); - - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgDi, z1); - - __mmask16 v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __mmask16 v11_in_range = x1_in_range & y1_in_range; - - v110_in_range = x1_in_range & y1_in_range; - - v011_in_range = y1_in_range & z1_in_range; - v101_in_range = x1_in_range & z1_in_range; 
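For the 3D border case these masks follow the same pattern as in 2D: after clamping, the floor corner (x0, y0, z0) is always in range, so only corners that touch x1, y1 or z1 need a mask, which is exactly the set gathered with something other than the full 0b1111111111111111 mask below. Spelled out as a scalar helper (illustrative only):

// Which of the eight trilinear taps needs a bounds check once the sampling
// point has been clamped (border padding): only those touching x1/y1/z1.
struct CornerMasks
{
    bool tnw, tne, tsw, tse, bnw, bne, bsw, bse;
};

static CornerMasks make_corner_masks(bool x1_in, bool y1_in, bool z1_in)
{
    CornerMasks m;
    m.tnw = true;                     // (x0, y0, z0) always valid after clamping
    m.tne = x1_in;                    // (x1, y0, z0)
    m.tsw = y1_in;                    // (x0, y1, z0)
    m.tse = x1_in && y1_in;           // (x1, y1, z0) -> v110_in_range
    m.bnw = z1_in;                    // (x0, y0, z1)
    m.bne = x1_in && z1_in;           // (x1, y0, z1) -> v101_in_range
    m.bsw = y1_in && z1_in;           // (x0, y1, z1) -> v011_in_range
    m.bse = x1_in && y1_in && z1_in;  // (x1, y1, z1) -> v111_in_range
    return m;
}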
- v111_in_range = v11_in_range & z1_in_range; - } - - // (W*H*z + W*y + x) * elempack + vec(8) - __m512i i_tnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), z0), _mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i i_tne_offset = _mm512_add_epi32(i_tnw_offset, vElempacki); - __m512i i_tsw_offset = _mm512_add_epi32(i_tnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); - __m512i i_tse_offset = _mm512_add_epi32(i_tsw_offset, vElempacki); - - __m512i i_bnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m512i i_bne_offset = _mm512_add_epi32(i_bnw_offset, vElempacki); - __m512i i_bsw_offset = _mm512_add_epi32(i_bnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); - __m512i i_bse_offset = _mm512_add_epi32(i_bsw_offset, vElempacki); - - for (int q = 0; q < dst.c; q++) - { - __m512 tnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 0b1111111111111111, i_tnw_offset, src.channel(q), sizeof(float)); - __m512 tne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), x1_in_range, i_tne_offset, src.channel(q), sizeof(float)); - __m512 tsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), y1_in_range, i_tsw_offset, src.channel(q), sizeof(float)); - __m512 tse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v110_in_range, i_tse_offset, src.channel(q), sizeof(float)); - - __m512 bnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), z1_in_range, i_bnw_offset, src.channel(q), sizeof(float)); - __m512 bne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v101_in_range, i_bne_offset, src.channel(q), sizeof(float)); - __m512 bsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v011_in_range, i_bsw_offset, src.channel(q), sizeof(float)); - __m512 bse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v111_in_range, i_bse_offset, src.channel(q), sizeof(float)); - - __m512 _v = _mm512_mul_ps(tnw_val, tnw); - _v = _mm512_fmadd_ps(tne_val, tne, _v); - _v = _mm512_fmadd_ps(tsw_val, tsw, _v); - _v = _mm512_fmadd_ps(tse_val, tse, _v); - - _v = _mm512_fmadd_ps(bnw_val, bnw, _v); - _v = _mm512_fmadd_ps(bne_val, bne, _v); - _v = _mm512_fmadd_ps(bsw_val, bsw, _v); - _v = _mm512_fmadd_ps(bse_val, bse, _v); - - _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_bilinear_align1_border_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512 vImgDf = _mm512_set1_ps(src.d); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - const __m512i vImgDi = _mm512_set1_epi32(src.d); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - // x - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), 
_mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); - - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - - gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); - - // y - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - - gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); - - // z - gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1)); - - const __m512 border_z = _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1); - - gz = _mm512_min_ps(border_z, _mm512_max_ps(gz, _mm512_setzero_ps())); - } - - __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); - __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); - __m512 z_t = _mm512_roundscale_ps(gz, _MM_FROUND_TO_NEG_INF); - - __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); - __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); - __m512 t = _mm512_sub_ps(gz, z_t); - __m512 b = _mm512_sub_ps(*(__m512*)_ps512_1, t); - - __m512 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m512 nw = _mm512_mul_ps(s, e); - __m512 ne = _mm512_mul_ps(s, w); - __m512 sw = _mm512_mul_ps(n, e); - __m512 se = _mm512_mul_ps(n, w); - - tnw = _mm512_mul_ps(b, nw); - tne = _mm512_mul_ps(b, ne); - tsw = _mm512_mul_ps(b, sw); - tse = _mm512_mul_ps(b, se); - - bnw = _mm512_mul_ps(t, nw); - bne = _mm512_mul_ps(t, ne); - bsw = _mm512_mul_ps(t, sw); - bse = _mm512_mul_ps(t, se); - } - - __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); - __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); - __m512i z0 = _mm512_cvtps_epi32(z_t); - __m512i z1 = _mm512_add_epi32(z0, *(__m512i*)_pi32_512_1); - - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgDi, z1); - - __mmask16 v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __mmask16 v11_in_range = x1_in_range & y1_in_range; - - v110_in_range = x1_in_range & y1_in_range; - - v011_in_range = y1_in_range & z1_in_range; - v101_in_range = x1_in_range & z1_in_range; - v111_in_range = v11_in_range & z1_in_range; - } - - // (W*H*z + W*y + x) * elempack + vec(8) - __m512i i_tnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), z0), _mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i i_tne_offset = _mm512_add_epi32(i_tnw_offset, vElempacki); - __m512i i_tsw_offset = _mm512_add_epi32(i_tnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); - __m512i i_tse_offset = _mm512_add_epi32(i_tsw_offset, vElempacki); - - __m512i i_bnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m512i i_bne_offset = _mm512_add_epi32(i_bnw_offset, vElempacki); - __m512i i_bsw_offset = _mm512_add_epi32(i_bnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); - __m512i i_bse_offset = _mm512_add_epi32(i_bsw_offset, vElempacki); 
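The eight offset vectors above encode the packed blob layout: within one channel, element (x, y, z) starts at ((z * H + y) * W + x) * elempack floats, the per-lane constant 0..15 picks the lane inside that packed element, and the remaining corners are reached by adding elempack (x + 1), W * elempack (y + 1) or W * H * elempack (z + 1). A scalar sketch of the same address arithmetic (hypothetical helper, not ncnn API):

// Float index of lane k of packed element (x, y, z) inside one channel of a
// blob that stores elempack consecutive floats per spatial location.
static int packed_offset(int w, int h, int x, int y, int z, int elempack, int k)
{
    return ((z * h + y) * w + x) * elempack + k;
}

// The displacements used above then fall out directly, e.g.:
//   i_tne_offset = i_tnw_offset + elempack;          // x + 1
//   i_tsw_offset = i_tnw_offset + w * elempack;      // y + 1
//   i_bnw_offset = i_tnw_offset + w * h * elempack;  // z + 1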
- - for (int q = 0; q < dst.c; q++) - { - __m512 tnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 0b1111111111111111, i_tnw_offset, src.channel(q), sizeof(float)); - __m512 tne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), x1_in_range, i_tne_offset, src.channel(q), sizeof(float)); - __m512 tsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), y1_in_range, i_tsw_offset, src.channel(q), sizeof(float)); - __m512 tse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v110_in_range, i_tse_offset, src.channel(q), sizeof(float)); - - __m512 bnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), z1_in_range, i_bnw_offset, src.channel(q), sizeof(float)); - __m512 bne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v101_in_range, i_bne_offset, src.channel(q), sizeof(float)); - __m512 bsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v011_in_range, i_bsw_offset, src.channel(q), sizeof(float)); - __m512 bse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v111_in_range, i_bse_offset, src.channel(q), sizeof(float)); - - __m512 _v = _mm512_mul_ps(tnw_val, tnw); - _v = _mm512_fmadd_ps(tne_val, tne, _v); - _v = _mm512_fmadd_ps(tsw_val, tsw, _v); - _v = _mm512_fmadd_ps(tse_val, tse, _v); - - _v = _mm512_fmadd_ps(bnw_val, bnw, _v); - _v = _mm512_fmadd_ps(bne_val, bne, _v); - _v = _mm512_fmadd_ps(bsw_val, bsw, _v); - _v = _mm512_fmadd_ps(bse_val, bse, _v); - - _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_bilinear_align0_reflection_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512 vImgDf = _mm512_set1_ps(src.d); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - const __m512i vImgDi = _mm512_set1_epi32(src.d); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - - __m512 v0p5fp16 = _mm512_set1_ps(0.5f); - gx = _mm512_add_ps(gx, v0p5fp16); - - gx = _mm512_and_ps(gx, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflectx_v = _mm512_and_ps(_mm512_sub_ps(gx, vImgWf), *(__m512*)_ps512_inv_sign_mask); - gx = _mm512_sub_ps(vImgWf, reflectx_v); - - gx = _mm512_sub_ps(gx, v0p5fp16); - - _mm512_sub_ps(gx, v0p5fp16); - - gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); - - // y - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - - gy = _mm512_add_ps(gy, v0p5fp16); - - gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflecty_v = _mm512_and_ps(_mm512_sub_ps(gy, vImgHf), *(__m512*)_ps512_inv_sign_mask); - gy = _mm512_sub_ps(vImgHf, reflecty_v); - - gy = 
_mm512_sub_ps(gy, v0p5fp16); - - _mm512_sub_ps(gy, v0p5fp16); - - gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); - - // z - gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), vImgDf, *(__m512*)_ps512_1), two); - const __m512 border_z = _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1); - - gz = _mm512_add_ps(gz, v0p5fp16); - - gz = _mm512_and_ps(gz, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflectz_v = _mm512_and_ps(_mm512_sub_ps(gz, vImgDf), *(__m512*)_ps512_inv_sign_mask); - gz = _mm512_sub_ps(vImgDf, reflectz_v); - - gz = _mm512_sub_ps(gz, v0p5fp16); - - _mm512_sub_ps(gz, v0p5fp16); - - gz = _mm512_min_ps(border_z, _mm512_max_ps(gz, _mm512_setzero_ps())); - } - - __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); - __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); - __m512 z_t = _mm512_roundscale_ps(gz, _MM_FROUND_TO_NEG_INF); - - __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); - __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); - __m512 t = _mm512_sub_ps(gz, z_t); - __m512 b = _mm512_sub_ps(*(__m512*)_ps512_1, t); - - __m512 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m512 nw = _mm512_mul_ps(s, e); - __m512 ne = _mm512_mul_ps(s, w); - __m512 sw = _mm512_mul_ps(n, e); - __m512 se = _mm512_mul_ps(n, w); - - tnw = _mm512_mul_ps(b, nw); - tne = _mm512_mul_ps(b, ne); - tsw = _mm512_mul_ps(b, sw); - tse = _mm512_mul_ps(b, se); - - bnw = _mm512_mul_ps(t, nw); - bne = _mm512_mul_ps(t, ne); - bsw = _mm512_mul_ps(t, sw); - bse = _mm512_mul_ps(t, se); - } - - __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); - __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); - __m512i z0 = _mm512_cvtps_epi32(z_t); - __m512i z1 = _mm512_add_epi32(z0, *(__m512i*)_pi32_512_1); - - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgDi, z1); - - __mmask16 v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __mmask16 v11_in_range = x1_in_range & y1_in_range; - - v110_in_range = x1_in_range & y1_in_range; - - v011_in_range = y1_in_range & z1_in_range; - v101_in_range = x1_in_range & z1_in_range; - v111_in_range = v11_in_range & z1_in_range; - } - - // (W*H*z + W*y + x) * elempack + vec(8) - __m512i i_tnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), z0), _mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i i_tne_offset = _mm512_add_epi32(i_tnw_offset, vElempacki); - __m512i i_tsw_offset = _mm512_add_epi32(i_tnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); - __m512i i_tse_offset = _mm512_add_epi32(i_tsw_offset, vElempacki); - - __m512i i_bnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m512i i_bne_offset = _mm512_add_epi32(i_bnw_offset, vElempacki); - __m512i i_bsw_offset = _mm512_add_epi32(i_bnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); - __m512i i_bse_offset = _mm512_add_epi32(i_bsw_offset, vElempacki); - - for (int q = 0; q < dst.c; q++) 
- { - __m512 tnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 0b1111111111111111, i_tnw_offset, src.channel(q), sizeof(float)); - __m512 tne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), x1_in_range, i_tne_offset, src.channel(q), sizeof(float)); - __m512 tsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), y1_in_range, i_tsw_offset, src.channel(q), sizeof(float)); - __m512 tse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v110_in_range, i_tse_offset, src.channel(q), sizeof(float)); - - __m512 bnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), z1_in_range, i_bnw_offset, src.channel(q), sizeof(float)); - __m512 bne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v101_in_range, i_bne_offset, src.channel(q), sizeof(float)); - __m512 bsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v011_in_range, i_bsw_offset, src.channel(q), sizeof(float)); - __m512 bse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v111_in_range, i_bse_offset, src.channel(q), sizeof(float)); - - __m512 _v = _mm512_mul_ps(tnw_val, tnw); - _v = _mm512_fmadd_ps(tne_val, tne, _v); - _v = _mm512_fmadd_ps(tsw_val, tsw, _v); - _v = _mm512_fmadd_ps(tse_val, tse, _v); - - _v = _mm512_fmadd_ps(bnw_val, bnw, _v); - _v = _mm512_fmadd_ps(bne_val, bne, _v); - _v = _mm512_fmadd_ps(bsw_val, bsw, _v); - _v = _mm512_fmadd_ps(bse_val, bse, _v); - - _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_bilinear_align1_reflection_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512 vImgDf = _mm512_set1_ps(src.d); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - const __m512i vImgDi = _mm512_set1_epi32(src.d); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - // x - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - - gx = _mm512_and_ps(gx, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflectx_v = _mm512_and_ps(_mm512_sub_ps(gx, border_x), *(__m512*)_ps512_inv_sign_mask); - gx = _mm512_sub_ps(border_x, reflectx_v); - - // y - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - - gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflecty_v = _mm512_and_ps(_mm512_sub_ps(gy, border_y), *(__m512*)_ps512_inv_sign_mask); - gy = _mm512_sub_ps(border_y, reflecty_v); - - // z - gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1)); - const __m512 border_z = _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1); - - gz = _mm512_and_ps(gz, *(__m512*)_ps512_inv_sign_mask); - - __m512 
reflectz_v = _mm512_and_ps(_mm512_sub_ps(gz, border_z), *(__m512*)_ps512_inv_sign_mask); - gz = _mm512_sub_ps(border_z, reflectz_v); - } - - __m512 x_w = _mm512_roundscale_ps(gx, _MM_FROUND_TO_NEG_INF); - __m512 y_n = _mm512_roundscale_ps(gy, _MM_FROUND_TO_NEG_INF); - __m512 z_t = _mm512_roundscale_ps(gz, _MM_FROUND_TO_NEG_INF); - - __m512 w = _mm512_sub_ps(gx, x_w); - __m512 e = _mm512_sub_ps(*(__m512*)_ps512_1, w); - __m512 n = _mm512_sub_ps(gy, y_n); - __m512 s = _mm512_sub_ps(*(__m512*)_ps512_1, n); - __m512 t = _mm512_sub_ps(gz, z_t); - __m512 b = _mm512_sub_ps(*(__m512*)_ps512_1, t); - - __m512 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m512 nw = _mm512_mul_ps(s, e); - __m512 ne = _mm512_mul_ps(s, w); - __m512 sw = _mm512_mul_ps(n, e); - __m512 se = _mm512_mul_ps(n, w); - - tnw = _mm512_mul_ps(b, nw); - tne = _mm512_mul_ps(b, ne); - tsw = _mm512_mul_ps(b, sw); - tse = _mm512_mul_ps(b, se); - - bnw = _mm512_mul_ps(t, nw); - bne = _mm512_mul_ps(t, ne); - bsw = _mm512_mul_ps(t, sw); - bse = _mm512_mul_ps(t, se); - } - - __m512i x0 = _mm512_cvtps_epi32(x_w); - __m512i x1 = _mm512_add_epi32(x0, *(__m512i*)_pi32_512_1); - __m512i y0 = _mm512_cvtps_epi32(y_n); - __m512i y1 = _mm512_add_epi32(y0, *(__m512i*)_pi32_512_1); - __m512i z0 = _mm512_cvtps_epi32(z_t); - __m512i z1 = _mm512_add_epi32(z0, *(__m512i*)_pi32_512_1); - - __mmask16 x1_in_range = _mm512_cmpgt_epi32_mask(x1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, x1); - __mmask16 y1_in_range = _mm512_cmpgt_epi32_mask(y1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, y1); - __mmask16 z1_in_range = _mm512_cmpgt_epi32_mask(z1, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgDi, z1); - - __mmask16 v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __mmask16 v11_in_range = x1_in_range & y1_in_range; - - v110_in_range = x1_in_range & y1_in_range; - - v011_in_range = y1_in_range & z1_in_range; - v101_in_range = x1_in_range & z1_in_range; - v111_in_range = v11_in_range & z1_in_range; - } - - // (W*H*z + W*y + x) * elempack + vec(8) - __m512i i_tnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), z0), _mm512_add_epi32(_mm512_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i i_tne_offset = _mm512_add_epi32(i_tnw_offset, vElempacki); - __m512i i_tsw_offset = _mm512_add_epi32(i_tnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); - __m512i i_tse_offset = _mm512_add_epi32(i_tsw_offset, vElempacki); - - __m512i i_bnw_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m512i i_bne_offset = _mm512_add_epi32(i_bnw_offset, vElempacki); - __m512i i_bsw_offset = _mm512_add_epi32(i_bnw_offset, _mm512_mullo_epi32(vImgWi, vElempacki)); - __m512i i_bse_offset = _mm512_add_epi32(i_bsw_offset, vElempacki); - - for (int q = 0; q < dst.c; q++) - { - __m512 tnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 0b1111111111111111, i_tnw_offset, src.channel(q), sizeof(float)); - __m512 tne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), x1_in_range, i_tne_offset, src.channel(q), sizeof(float)); - __m512 tsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), y1_in_range, i_tsw_offset, src.channel(q), sizeof(float)); - __m512 tse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v110_in_range, i_tse_offset, src.channel(q), sizeof(float)); - - __m512 bnw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 
z1_in_range, i_bnw_offset, src.channel(q), sizeof(float)); - __m512 bne_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v101_in_range, i_bne_offset, src.channel(q), sizeof(float)); - __m512 bsw_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v011_in_range, i_bsw_offset, src.channel(q), sizeof(float)); - __m512 bse_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v111_in_range, i_bse_offset, src.channel(q), sizeof(float)); - - __m512 _v = _mm512_mul_ps(tnw_val, tnw); - _v = _mm512_fmadd_ps(tne_val, tne, _v); - _v = _mm512_fmadd_ps(tsw_val, tsw, _v); - _v = _mm512_fmadd_ps(tse_val, tse, _v); - - _v = _mm512_fmadd_ps(bnw_val, bnw, _v); - _v = _mm512_fmadd_ps(bne_val, bne, _v); - _v = _mm512_fmadd_ps(bsw_val, bsw, _v); - _v = _mm512_fmadd_ps(bse_val, bse, _v); - - _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} \ No newline at end of file diff --git a/src/layer/x86/gridsample_bilinear_pack4.h b/src/layer/x86/gridsample_bilinear_pack4.h deleted file mode 100644 index 416711c12b2..00000000000 --- a/src/layer/x86/gridsample_bilinear_pack4.h +++ /dev/null @@ -1,1730 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
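The pack4 variants that follow mirror the pack16 kernels with __m128 vectors but must branch on instruction availability: _mm_floor_ps and _mm_mullo_epi32 need SSE4.1 (hence the floor_ps and float-offset fallbacks), and pre-AVX2 x86 has no gather instruction at all, so mask_gather_ps has to be emulated lane by lane. A rough sketch of what such a 4-lane masked gather does, under the assumption that the mask lanes come from integer compares (all bits set when valid); this is an illustration, not ncnn's actual implementation:

#include <emmintrin.h> // SSE2

// Load base[offset[k]] for each lane whose mask sign bit is set, else 0.
static __m128 mask_gather_ps_sketch(const float* base, __m128i offset, __m128 mask)
{
    int offs[4];
    float vals[4];

    _mm_storeu_si128((__m128i*)offs, offset);

    // Integer compare results reinterpreted as floats have their sign bit set,
    // so movemask yields one valid/invalid bit per lane.
    int bits = _mm_movemask_ps(mask);

    for (int k = 0; k < 4; k++)
    {
        vals[k] = (bits & (1 << k)) ? base[offs[k]] : 0.f;
    }

    return _mm_loadu_ps(vals);
}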
- -static void gridsample_2d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - -#if ((_MSC_VER && __AVX__) || __SSE4_1__) - const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#else - const __m128 vElempackf = _mm_set1_ps(src.elempack); -#endif // !__SSE4_1__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - // x - gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); - - // y - gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - } - -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128 x_w = _mm_floor_ps(gx); - __m128 y_n = _mm_floor_ps(gy); -#else - __m128 x_w = floor_ps(gx); - __m128 y_n = floor_ps(gy); -#endif // __SSE4_1__ - - __m128 w = _mm_sub_ps(gx, x_w); - __m128 e = _mm_sub_ps(v1fp4, w); - __m128 n = _mm_sub_ps(gy, y_n); - __m128 s = _mm_sub_ps(v1fp4, n); - - __m128 nw = _mm_mul_ps(s, e); - __m128 ne = _mm_mul_ps(s, w); - __m128 sw = _mm_mul_ps(n, e); - __m128 se = _mm_mul_ps(n, w); - - __m128i x0 = _mm_cvtps_epi32(x_w); - __m128i x1 = _mm_add_epi32(x0, v1ip4); - __m128i y0 = _mm_cvtps_epi32(y_n); - __m128i y1 = _mm_add_epi32(y0, v1ip4); - - __m128i x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); - __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - __m128i y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHi, y0)); - __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - - __m128i v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); - __m128i v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); - __m128i v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); - __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - // (W*y + x) * elempack + vec(4) -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); - __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); - __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); -#else - __m128 nw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w), vElempackf), - _mm_set_ps(3, 2, 1, 0)); - __m128 ne_offset = _mm_add_ps(nw_offset, vElempackf); - __m128 sw_offset = _mm_add_ps(nw_offset, _mm_mul_ps(vImgWf, vElempackf)); - __m128 se_offset = _mm_add_ps(sw_offset, vElempackf); - - __m128i i_nw_offset = _mm_cvtps_epi32(nw_offset); - __m128i i_ne_offset = _mm_cvtps_epi32(ne_offset); - __m128i i_sw_offset = _mm_cvtps_epi32(sw_offset); - __m128i i_se_offset = _mm_cvtps_epi32(se_offset); -#endif // __SSE4_1__ - - for (int q = 0; q < dst.c; q++) - { - __m128 nw_val = mask_gather_ps(src.channel(q), i_nw_offset, _mm_castsi128_ps(v00_in_range)); - __m128 ne_val = mask_gather_ps(src.channel(q), 
i_ne_offset, _mm_castsi128_ps(v10_in_range)); - __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, _mm_castsi128_ps(v01_in_range)); - __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, _mm_castsi128_ps(v11_in_range)); - - __m128 _v = _mm_mul_ps(nw_val, nw); - _v = _mm_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm_comp_fmadd_ps(se_val, se, _v); - - _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - -#if ((_MSC_VER && __AVX__) || __SSE4_1__) - const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#else - const __m128 vElempackf = _mm_set1_ps(src.elempack); -#endif // !__SSE4_1__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - // x - gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); - - // y - gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - } - -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128 x_w = _mm_floor_ps(gx); - __m128 y_n = _mm_floor_ps(gy); -#else - __m128 x_w = floor_ps(gx); - __m128 y_n = floor_ps(gy); -#endif // __SSE4_1__ - - __m128 w = _mm_sub_ps(gx, x_w); - __m128 e = _mm_sub_ps(v1fp4, w); - __m128 n = _mm_sub_ps(gy, y_n); - __m128 s = _mm_sub_ps(v1fp4, n); - - __m128 nw = _mm_mul_ps(s, e); - __m128 ne = _mm_mul_ps(s, w); - __m128 sw = _mm_mul_ps(n, e); - __m128 se = _mm_mul_ps(n, w); - - __m128i x0 = _mm_cvtps_epi32(x_w); - __m128i x1 = _mm_add_epi32(x0, v1ip4); - __m128i y0 = _mm_cvtps_epi32(y_n); - __m128i y1 = _mm_add_epi32(y0, v1ip4); - - __m128i x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); - __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - __m128i y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHi, y0)); - __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - - __m128i v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); - __m128i v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); - __m128i v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); - __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - // (W*y + x) * elempack + vec(4) -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); - __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); - __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); -#else - __m128 nw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w), vElempackf), - _mm_set_ps(3, 2, 1, 0)); - __m128 ne_offset = _mm_add_ps(nw_offset, vElempackf); - __m128 sw_offset = _mm_add_ps(nw_offset, _mm_mul_ps(vImgWf, 
vElempackf)); - __m128 se_offset = _mm_add_ps(sw_offset, vElempackf); - - __m128i i_nw_offset = _mm_cvtps_epi32(nw_offset); - __m128i i_ne_offset = _mm_cvtps_epi32(ne_offset); - __m128i i_sw_offset = _mm_cvtps_epi32(sw_offset); - __m128i i_se_offset = _mm_cvtps_epi32(se_offset); -#endif // __SSE4_1__ - - for (int q = 0; q < dst.c; q++) - { - __m128 nw_val = mask_gather_ps(src.channel(q), i_nw_offset, _mm_castsi128_ps(v00_in_range)); - __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, _mm_castsi128_ps(v10_in_range)); - __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, _mm_castsi128_ps(v01_in_range)); - __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, _mm_castsi128_ps(v11_in_range)); - - __m128 _v = _mm_mul_ps(nw_val, nw); - _v = _mm_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm_comp_fmadd_ps(se_val, se, _v); - - _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - -#if ((_MSC_VER && __AVX__) || __SSE4_1__) - const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#else - const __m128 vElempackf = _mm_set1_ps(src.elempack); -#endif // !__SSE4_1__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - // x - gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); - - const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - - gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - - // y - gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - - const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - - gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - } - -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128 x_w = _mm_floor_ps(gx); - __m128 y_n = _mm_floor_ps(gy); -#else - __m128 x_w = floor_ps(gx); - __m128 y_n = floor_ps(gy); -#endif // __SSE4_1__ - - __m128 w = _mm_sub_ps(gx, x_w); - __m128 e = _mm_sub_ps(v1fp4, w); - __m128 n = _mm_sub_ps(gy, y_n); - __m128 s = _mm_sub_ps(v1fp4, n); - - __m128 nw = _mm_mul_ps(s, e); - __m128 ne = _mm_mul_ps(s, w); - __m128 sw = _mm_mul_ps(n, e); - __m128 se = _mm_mul_ps(n, w); - - __m128i x0 = _mm_cvtps_epi32(x_w); - __m128i x1 = _mm_add_epi32(x0, v1ip4); - __m128i y0 = _mm_cvtps_epi32(y_n); - __m128i y1 = _mm_add_epi32(y0, v1ip4); - - __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - - __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - // (W*y + x) * elempack + vec(4) -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); - __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); - __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, 
vElempacki)); - __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); -#else - __m128 nw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w), vElempackf), - _mm_set_ps(3, 2, 1, 0)); - __m128 ne_offset = _mm_add_ps(nw_offset, vElempackf); - __m128 sw_offset = _mm_add_ps(nw_offset, _mm_mul_ps(vImgWf, vElempackf)); - __m128 se_offset = _mm_add_ps(sw_offset, vElempackf); - - __m128i i_nw_offset = _mm_cvtps_epi32(nw_offset); - __m128i i_ne_offset = _mm_cvtps_epi32(ne_offset); - __m128i i_sw_offset = _mm_cvtps_epi32(sw_offset); - __m128i i_se_offset = _mm_cvtps_epi32(se_offset); -#endif // __SSE4_1__ - - for (int q = 0; q < dst.c; q++) - { - __m128 nw_val = mask_gather_ps(src.channel(q), i_nw_offset, vn1fp4); - __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, _mm_castsi128_ps(x1_in_range)); - __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, _mm_castsi128_ps(y1_in_range)); - __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, _mm_castsi128_ps(v11_in_range)); - - __m128 _v = _mm_mul_ps(nw_val, nw); - _v = _mm_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm_comp_fmadd_ps(se_val, se, _v); - - _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - -#if ((_MSC_VER && __AVX__) || __SSE4_1__) - const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#else - const __m128 vElempackf = _mm_set1_ps(src.elempack); -#endif // !__SSE4_1__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - // x - gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); - - const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - - gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - - // y - gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - - const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - - gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - } - -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128 x_w = _mm_floor_ps(gx); - __m128 y_n = _mm_floor_ps(gy); -#else - __m128 x_w = floor_ps(gx); - __m128 y_n = floor_ps(gy); -#endif // __SSE4_1__ - - __m128 w = _mm_sub_ps(gx, x_w); - __m128 e = _mm_sub_ps(v1fp4, w); - __m128 n = _mm_sub_ps(gy, y_n); - __m128 s = _mm_sub_ps(v1fp4, n); - - __m128 nw = _mm_mul_ps(s, e); - __m128 ne = _mm_mul_ps(s, w); - __m128 sw = _mm_mul_ps(n, e); - __m128 se = _mm_mul_ps(n, w); - - __m128i x0 = _mm_cvtps_epi32(x_w); - __m128i x1 = _mm_add_epi32(x0, v1ip4); - __m128i y0 = _mm_cvtps_epi32(y_n); - __m128i y1 = _mm_add_epi32(y0, v1ip4); - - __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - - __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - // (W*y + x) * elempack + vec(4) -#if (_MSC_VER && 
__AVX__) || __SSE4_1__ - __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); - __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); - __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); -#else - __m128 nw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w), vElempackf), - _mm_set_ps(3, 2, 1, 0)); - __m128 ne_offset = _mm_add_ps(nw_offset, vElempackf); - __m128 sw_offset = _mm_add_ps(nw_offset, _mm_mul_ps(vImgWf, vElempackf)); - __m128 se_offset = _mm_add_ps(sw_offset, vElempackf); - - __m128i i_nw_offset = _mm_cvtps_epi32(nw_offset); - __m128i i_ne_offset = _mm_cvtps_epi32(ne_offset); - __m128i i_sw_offset = _mm_cvtps_epi32(sw_offset); - __m128i i_se_offset = _mm_cvtps_epi32(se_offset); -#endif // __SSE4_1__ - - for (int q = 0; q < dst.c; q++) - { - __m128 nw_val = mask_gather_ps(src.channel(q), i_nw_offset, vn1fp4); - __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, _mm_castsi128_ps(x1_in_range)); - __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, _mm_castsi128_ps(y1_in_range)); - __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, _mm_castsi128_ps(v11_in_range)); - - __m128 _v = _mm_mul_ps(nw_val, nw); - _v = _mm_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm_comp_fmadd_ps(se_val, se, _v); - - _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bilinear_align0_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - -#if ((_MSC_VER && __AVX__) || __SSE4_1__) - const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#else - const __m128 vElempackf = _mm_set1_ps(src.elempack); -#endif // !__SSE4_1__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - // x - gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); - - const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - - __m128 v0p5fp4 = _mm_set1_ps(0.5f); - gx = _mm_add_ps(gx, v0p5fp4); - - gx = _mm_and_ps(gx, *(__m128*)_ps_inv_sign_mask); - - __m128 reflectx_v = _mm_and_ps(_mm_sub_ps(gx, vImgWf), *(__m128*)_ps_inv_sign_mask); - gx = _mm_sub_ps(vImgWf, reflectx_v); - - gx = _mm_sub_ps(gx, v0p5fp4); - - _mm_sub_ps(gx, v0p5fp4); - - gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - - // y - gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - - const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - - gy = _mm_add_ps(gy, v0p5fp4); - - gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); - - __m128 reflecty_v = _mm_and_ps(_mm_sub_ps(gy, vImgHf), *(__m128*)_ps_inv_sign_mask); - gy = _mm_sub_ps(vImgHf, reflecty_v); - - gy = _mm_sub_ps(gy, v0p5fp4); - - _mm_sub_ps(gy, v0p5fp4); - - gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - } - -#if (_MSC_VER && __AVX__) || __SSE4_1__ - 
__m128 x_w = _mm_floor_ps(gx); - __m128 y_n = _mm_floor_ps(gy); -#else - __m128 x_w = floor_ps(gx); - __m128 y_n = floor_ps(gy); -#endif // __SSE4_1__ - - __m128 w = _mm_sub_ps(gx, x_w); - __m128 e = _mm_sub_ps(v1fp4, w); - __m128 n = _mm_sub_ps(gy, y_n); - __m128 s = _mm_sub_ps(v1fp4, n); - - __m128 nw = _mm_mul_ps(s, e); - __m128 ne = _mm_mul_ps(s, w); - __m128 sw = _mm_mul_ps(n, e); - __m128 se = _mm_mul_ps(n, w); - - __m128i x0 = _mm_cvtps_epi32(x_w); - __m128i x1 = _mm_add_epi32(x0, v1ip4); - __m128i y0 = _mm_cvtps_epi32(y_n); - __m128i y1 = _mm_add_epi32(y0, v1ip4); - - __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - - __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - // (W*y + x) * elempack + vec(4) -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); - __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); - __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); -#else - __m128 nw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w), vElempackf), - _mm_set_ps(3, 2, 1, 0)); - __m128 ne_offset = _mm_add_ps(nw_offset, vElempackf); - __m128 sw_offset = _mm_add_ps(nw_offset, _mm_mul_ps(vImgWf, vElempackf)); - __m128 se_offset = _mm_add_ps(sw_offset, vElempackf); - - __m128i i_nw_offset = _mm_cvtps_epi32(nw_offset); - __m128i i_ne_offset = _mm_cvtps_epi32(ne_offset); - __m128i i_sw_offset = _mm_cvtps_epi32(sw_offset); - __m128i i_se_offset = _mm_cvtps_epi32(se_offset); -#endif // __SSE4_1__ - - for (int q = 0; q < dst.c; q++) - { - __m128 nw_val = mask_gather_ps(src.channel(q), i_nw_offset, vn1fp4); - __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, _mm_castsi128_ps(x1_in_range)); - __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, _mm_castsi128_ps(y1_in_range)); - __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, _mm_castsi128_ps(v11_in_range)); - - __m128 _v = _mm_mul_ps(nw_val, nw); - _v = _mm_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm_comp_fmadd_ps(se_val, se, _v); - - _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bilinear_align1_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - -#if ((_MSC_VER && __AVX__) || __SSE4_1__) - const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#else - const __m128 vElempackf = _mm_set1_ps(src.elempack); -#endif // !__SSE4_1__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - // x - gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); - - const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - - gx = 
_mm_and_ps(gx, *(__m128*)_ps_inv_sign_mask); - - __m128 reflectx_v = _mm_and_ps(_mm_sub_ps(gx, border_x), *(__m128*)_ps_inv_sign_mask); - gx = _mm_sub_ps(border_x, reflectx_v); - - // y - gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - - const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - - gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); - - __m128 reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps_inv_sign_mask); - gy = _mm_sub_ps(border_y, reflecty_v); - } - -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128 x_w = _mm_floor_ps(gx); - __m128 y_n = _mm_floor_ps(gy); -#else - __m128 x_w = floor_ps(gx); - __m128 y_n = floor_ps(gy); -#endif // __SSE4_1__ - - __m128 w = _mm_sub_ps(gx, x_w); - __m128 e = _mm_sub_ps(v1fp4, w); - __m128 n = _mm_sub_ps(gy, y_n); - __m128 s = _mm_sub_ps(v1fp4, n); - - __m128 nw = _mm_mul_ps(s, e); - __m128 ne = _mm_mul_ps(s, w); - __m128 sw = _mm_mul_ps(n, e); - __m128 se = _mm_mul_ps(n, w); - - __m128i x0 = _mm_cvtps_epi32(x_w); - __m128i x1 = _mm_add_epi32(x0, v1ip4); - __m128i y0 = _mm_cvtps_epi32(y_n); - __m128i y1 = _mm_add_epi32(y0, v1ip4); - - __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - - __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - // (W*y + x) * elempack + vec(4) -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128i i_nw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm_set_epi32(3, 2, 1, 0)); - __m128i i_ne_offset = _mm_add_epi32(i_nw_offset, vElempacki); - __m128i i_sw_offset = _mm_add_epi32(i_nw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - __m128i i_se_offset = _mm_add_epi32(i_sw_offset, vElempacki); -#else - __m128 nw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w), vElempackf), - _mm_set_ps(3, 2, 1, 0)); - __m128 ne_offset = _mm_add_ps(nw_offset, vElempackf); - __m128 sw_offset = _mm_add_ps(nw_offset, _mm_mul_ps(vImgWf, vElempackf)); - __m128 se_offset = _mm_add_ps(sw_offset, vElempackf); - - __m128i i_nw_offset = _mm_cvtps_epi32(nw_offset); - __m128i i_ne_offset = _mm_cvtps_epi32(ne_offset); - __m128i i_sw_offset = _mm_cvtps_epi32(sw_offset); - __m128i i_se_offset = _mm_cvtps_epi32(se_offset); -#endif // __SSE4_1__ - - for (int q = 0; q < dst.c; q++) - { - __m128 nw_val = mask_gather_ps(src.channel(q), i_nw_offset, vn1fp4); - __m128 ne_val = mask_gather_ps(src.channel(q), i_ne_offset, _mm_castsi128_ps(x1_in_range)); - __m128 sw_val = mask_gather_ps(src.channel(q), i_sw_offset, _mm_castsi128_ps(y1_in_range)); - __m128 se_val = mask_gather_ps(src.channel(q), i_se_offset, _mm_castsi128_ps(v11_in_range)); - - __m128 _v = _mm_mul_ps(nw_val, nw); - _v = _mm_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm_comp_fmadd_ps(se_val, se, _v); - - _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_3d_bilinear_align0_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128 vImgDf = _mm_set1_ps(src.d); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - const __m128i vImgDi = _mm_set1_epi32(src.d); - -#if ((_MSC_VER && __AVX__) || __SSE4_1__) - const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#else - 
const __m128 vElempackf = _mm_set1_ps(src.elempack); -#endif // !__SSE4_1__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - // x - gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); - - // y - gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - - // z - gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); - } - -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128 x_w = _mm_floor_ps(gx); - __m128 y_n = _mm_floor_ps(gy); - __m128 z_t = _mm_floor_ps(gz); -#else - __m128 x_w = floor_ps(gx); - __m128 y_n = floor_ps(gy); - __m128 z_t = floor_ps(gz); -#endif // __SSE4_1__ - - __m128 w = _mm_sub_ps(gx, x_w); - __m128 e = _mm_sub_ps(v1fp4, w); - __m128 n = _mm_sub_ps(gy, y_n); - __m128 s = _mm_sub_ps(v1fp4, n); - __m128 t = _mm_sub_ps(gz, z_t); - __m128 b = _mm_sub_ps(v1fp4, t); - - __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m128 nw = _mm_mul_ps(s, e); - __m128 ne = _mm_mul_ps(s, w); - __m128 sw = _mm_mul_ps(n, e); - __m128 se = _mm_mul_ps(n, w); - - tnw = _mm_mul_ps(b, nw); - tne = _mm_mul_ps(b, ne); - tsw = _mm_mul_ps(b, sw); - tse = _mm_mul_ps(b, se); - - bnw = _mm_mul_ps(t, nw); - bne = _mm_mul_ps(t, ne); - bsw = _mm_mul_ps(t, sw); - bse = _mm_mul_ps(t, se); - } - - __m128i x0 = _mm_cvtps_epi32(x_w); - __m128i x1 = _mm_add_epi32(x0, v1ip4); - __m128i y0 = _mm_cvtps_epi32(y_n); - __m128i y1 = _mm_add_epi32(y0, v1ip4); - __m128i z0 = _mm_cvtps_epi32(z_t); - __m128i z1 = _mm_add_epi32(z0, v1ip4); - - __m128i x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); - __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - __m128i y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHi, y0)); - __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - __m128i z0_in_range = _mm_and_si128(_mm_cmpgt_epi32(z0, vn1ip4), _mm_cmpgt_epi32(vImgDi, z0)); - __m128i z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); - - __m128i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m128i v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); - __m128i v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); - __m128i v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); - __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - v000_in_range = _mm_and_si128(v00_in_range, z0_in_range); - v010_in_range = _mm_and_si128(v01_in_range, z0_in_range); - v100_in_range = _mm_and_si128(v10_in_range, z0_in_range); - v110_in_range = _mm_and_si128(v11_in_range, z0_in_range); - - v001_in_range = _mm_and_si128(v00_in_range, z1_in_range); - v011_in_range = _mm_and_si128(v01_in_range, z1_in_range); - v101_in_range = _mm_and_si128(v10_in_range, z1_in_range); - v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); - } - - // (W*H*z + W*y + x) * elempack + vec(4) -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128i i_tnw_offset = 
_mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); - __m128i i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); - __m128i i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - __m128i i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); - - __m128i i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m128i i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); - __m128i i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - __m128i i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); -#else - __m128 tnw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), z_t), _mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w)), vElempackf), _mm_set_ps(3, 2, 1, 0)); - __m128 tne_offset = _mm_add_ps(tnw_offset, vElempackf); - __m128 tsw_offset = _mm_add_ps(tnw_offset, _mm_mul_ps(vImgWf, vElempackf)); - __m128 tse_offset = _mm_add_ps(tsw_offset, vElempackf); - - __m128 bnw_offset = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); - __m128 bne_offset = _mm_add_ps(bnw_offset, vElempackf); - __m128 bsw_offset = _mm_add_ps(bnw_offset, _mm_mul_ps(vImgWf, vElempackf)); - __m128 bse_offset = _mm_add_ps(bsw_offset, vElempackf); - - __m128i i_tnw_offset = _mm_cvtps_epi32(tnw_offset); - __m128i i_tne_offset = _mm_cvtps_epi32(tne_offset); - __m128i i_tsw_offset = _mm_cvtps_epi32(tsw_offset); - __m128i i_tse_offset = _mm_cvtps_epi32(tse_offset); - - __m128i i_bnw_offset = _mm_cvtps_epi32(bnw_offset); - __m128i i_bne_offset = _mm_cvtps_epi32(bne_offset); - __m128i i_bsw_offset = _mm_cvtps_epi32(bsw_offset); - __m128i i_bse_offset = _mm_cvtps_epi32(bse_offset); -#endif // __SSE4_1__ - - for (int q = 0; q < dst.c; q++) - { - __m128 tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, _mm_castsi128_ps(v000_in_range)); - __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, _mm_castsi128_ps(v100_in_range)); - __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, _mm_castsi128_ps(v010_in_range)); - __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, _mm_castsi128_ps(v110_in_range)); - - __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, _mm_castsi128_ps(v001_in_range)); - __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, _mm_castsi128_ps(v101_in_range)); - __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, _mm_castsi128_ps(v011_in_range)); - __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, _mm_castsi128_ps(v111_in_range)); - - __m128 _v = _mm_mul_ps(tnw_val, tnw); - _v = _mm_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm_comp_fmadd_ps(bse_val, bse, _v); - - _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_bilinear_align1_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128 vImgDf = _mm_set1_ps(src.d); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - const __m128i vImgDi = 
_mm_set1_epi32(src.d); - -#if ((_MSC_VER && __AVX__) || __SSE4_1__) - const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#else - const __m128 vElempackf = _mm_set1_ps(src.elempack); -#endif // !__SSE4_1__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - // x - gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); - - // y - gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - - // z - gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); - } - -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128 x_w = _mm_floor_ps(gx); - __m128 y_n = _mm_floor_ps(gy); - __m128 z_t = _mm_floor_ps(gz); -#else - __m128 x_w = floor_ps(gx); - __m128 y_n = floor_ps(gy); - __m128 z_t = floor_ps(gz); -#endif // __SSE4_1__ - - __m128 w = _mm_sub_ps(gx, x_w); - __m128 e = _mm_sub_ps(v1fp4, w); - __m128 n = _mm_sub_ps(gy, y_n); - __m128 s = _mm_sub_ps(v1fp4, n); - __m128 t = _mm_sub_ps(gz, z_t); - __m128 b = _mm_sub_ps(v1fp4, t); - - __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m128 nw = _mm_mul_ps(s, e); - __m128 ne = _mm_mul_ps(s, w); - __m128 sw = _mm_mul_ps(n, e); - __m128 se = _mm_mul_ps(n, w); - - tnw = _mm_mul_ps(b, nw); - tne = _mm_mul_ps(b, ne); - tsw = _mm_mul_ps(b, sw); - tse = _mm_mul_ps(b, se); - - bnw = _mm_mul_ps(t, nw); - bne = _mm_mul_ps(t, ne); - bsw = _mm_mul_ps(t, sw); - bse = _mm_mul_ps(t, se); - } - - __m128i x0 = _mm_cvtps_epi32(x_w); - __m128i x1 = _mm_add_epi32(x0, v1ip4); - __m128i y0 = _mm_cvtps_epi32(y_n); - __m128i y1 = _mm_add_epi32(y0, v1ip4); - __m128i z0 = _mm_cvtps_epi32(z_t); - __m128i z1 = _mm_add_epi32(z0, v1ip4); - - __m128i x0_in_range = _mm_and_si128(_mm_cmpgt_epi32(x0, vn1ip4), _mm_cmpgt_epi32(vImgWi, x0)); - __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - __m128i y0_in_range = _mm_and_si128(_mm_cmpgt_epi32(y0, vn1ip4), _mm_cmpgt_epi32(vImgHi, y0)); - __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - __m128i z0_in_range = _mm_and_si128(_mm_cmpgt_epi32(z0, vn1ip4), _mm_cmpgt_epi32(vImgDi, z0)); - __m128i z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); - - __m128i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m128i v00_in_range = _mm_and_si128(x0_in_range, y0_in_range); - __m128i v01_in_range = _mm_and_si128(x0_in_range, y1_in_range); - __m128i v10_in_range = _mm_and_si128(x1_in_range, y0_in_range); - __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - v000_in_range = _mm_and_si128(v00_in_range, z0_in_range); - v010_in_range = _mm_and_si128(v01_in_range, z0_in_range); - v100_in_range = _mm_and_si128(v10_in_range, z0_in_range); - v110_in_range = _mm_and_si128(v11_in_range, z0_in_range); - - v001_in_range = _mm_and_si128(v00_in_range, z1_in_range); - v011_in_range = _mm_and_si128(v01_in_range, z1_in_range); - v101_in_range = _mm_and_si128(v10_in_range, z1_in_range); - v111_in_range = 
_mm_and_si128(v11_in_range, z1_in_range); - } - - // (W*H*z + W*y + x) * elempack + vec(4) -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128i i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); - __m128i i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); - __m128i i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - __m128i i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); - - __m128i i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m128i i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); - __m128i i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - __m128i i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); -#else - __m128 tnw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), z_t), _mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w)), vElempackf), _mm_set_ps(3, 2, 1, 0)); - __m128 tne_offset = _mm_add_ps(tnw_offset, vElempackf); - __m128 tsw_offset = _mm_add_ps(tnw_offset, _mm_mul_ps(vImgWf, vElempackf)); - __m128 tse_offset = _mm_add_ps(tsw_offset, vElempackf); - - __m128 bnw_offset = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); - __m128 bne_offset = _mm_add_ps(bnw_offset, vElempackf); - __m128 bsw_offset = _mm_add_ps(bnw_offset, _mm_mul_ps(vImgWf, vElempackf)); - __m128 bse_offset = _mm_add_ps(bsw_offset, vElempackf); - - __m128i i_tnw_offset = _mm_cvtps_epi32(tnw_offset); - __m128i i_tne_offset = _mm_cvtps_epi32(tne_offset); - __m128i i_tsw_offset = _mm_cvtps_epi32(tsw_offset); - __m128i i_tse_offset = _mm_cvtps_epi32(tse_offset); - - __m128i i_bnw_offset = _mm_cvtps_epi32(bnw_offset); - __m128i i_bne_offset = _mm_cvtps_epi32(bne_offset); - __m128i i_bsw_offset = _mm_cvtps_epi32(bsw_offset); - __m128i i_bse_offset = _mm_cvtps_epi32(bse_offset); -#endif // __SSE4_1__ - - for (int q = 0; q < dst.c; q++) - { - __m128 tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, _mm_castsi128_ps(v000_in_range)); - __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, _mm_castsi128_ps(v100_in_range)); - __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, _mm_castsi128_ps(v010_in_range)); - __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, _mm_castsi128_ps(v110_in_range)); - - __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, _mm_castsi128_ps(v001_in_range)); - __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, _mm_castsi128_ps(v101_in_range)); - __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, _mm_castsi128_ps(v011_in_range)); - __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, _mm_castsi128_ps(v111_in_range)); - - __m128 _v = _mm_mul_ps(tnw_val, tnw); - _v = _mm_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm_comp_fmadd_ps(bse_val, bse, _v); - - _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_bilinear_align0_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128 vImgDf = 
_mm_set1_ps(src.d); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - const __m128i vImgDi = _mm_set1_epi32(src.d); - -#if ((_MSC_VER && __AVX__) || __SSE4_1__) - const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#else - const __m128 vElempackf = _mm_set1_ps(src.elempack); -#endif // !__SSE4_1__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - // x - gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); - - const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - - gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - - // y - gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - - const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - - gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - - // z - gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); - - const __m128 border_z = _mm_sub_ps(vImgDf, v1fp4); - - gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); - } - - __m128 x_w = floor_ps(gx); - __m128 y_n = floor_ps(gy); - __m128 z_t = floor_ps(gz); - - __m128 w = _mm_sub_ps(gx, x_w); - __m128 e = _mm_sub_ps(v1fp4, w); - __m128 n = _mm_sub_ps(gy, y_n); - __m128 s = _mm_sub_ps(v1fp4, n); - __m128 t = _mm_sub_ps(gz, z_t); - __m128 b = _mm_sub_ps(v1fp4, t); - - __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m128 nw = _mm_mul_ps(s, e); - __m128 ne = _mm_mul_ps(s, w); - __m128 sw = _mm_mul_ps(n, e); - __m128 se = _mm_mul_ps(n, w); - - tnw = _mm_mul_ps(b, nw); - tne = _mm_mul_ps(b, ne); - tsw = _mm_mul_ps(b, sw); - tse = _mm_mul_ps(b, se); - - bnw = _mm_mul_ps(t, nw); - bne = _mm_mul_ps(t, ne); - bsw = _mm_mul_ps(t, sw); - bse = _mm_mul_ps(t, se); - } - - __m128i x0 = _mm_cvtps_epi32(x_w); - __m128i x1 = _mm_add_epi32(x0, v1ip4); - __m128i y0 = _mm_cvtps_epi32(y_n); - __m128i y1 = _mm_add_epi32(y0, v1ip4); - __m128i z0 = _mm_cvtps_epi32(z_t); - __m128i z1 = _mm_add_epi32(z0, v1ip4); - - __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - __m128i z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); - - __m128i v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - v110_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - v011_in_range = _mm_and_si128(y1_in_range, z1_in_range); - v101_in_range = _mm_and_si128(x1_in_range, z1_in_range); - v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); - } - - // (W*H*z + W*y + x) * elempack + vec(4) -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128i i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); - __m128i i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); - __m128i i_tsw_offset = _mm_add_epi32(i_tnw_offset, 
_mm_mullo_epi32(vImgWi, vElempacki)); - __m128i i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); - - __m128i i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m128i i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); - __m128i i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - __m128i i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); -#else - __m128 tnw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), z_t), _mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w)), vElempackf), _mm_set_ps(3, 2, 1, 0)); - __m128 tne_offset = _mm_add_ps(tnw_offset, vElempackf); - __m128 tsw_offset = _mm_add_ps(tnw_offset, _mm_mul_ps(vImgWf, vElempackf)); - __m128 tse_offset = _mm_add_ps(tsw_offset, vElempackf); - - __m128 bnw_offset = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); - __m128 bne_offset = _mm_add_ps(bnw_offset, vElempackf); - __m128 bsw_offset = _mm_add_ps(bnw_offset, _mm_mul_ps(vImgWf, vElempackf)); - __m128 bse_offset = _mm_add_ps(bsw_offset, vElempackf); - - __m128i i_tnw_offset = _mm_cvtps_epi32(tnw_offset); - __m128i i_tne_offset = _mm_cvtps_epi32(tne_offset); - __m128i i_tsw_offset = _mm_cvtps_epi32(tsw_offset); - __m128i i_tse_offset = _mm_cvtps_epi32(tse_offset); - - __m128i i_bnw_offset = _mm_cvtps_epi32(bnw_offset); - __m128i i_bne_offset = _mm_cvtps_epi32(bne_offset); - __m128i i_bsw_offset = _mm_cvtps_epi32(bsw_offset); - __m128i i_bse_offset = _mm_cvtps_epi32(bse_offset); -#endif // __SSE4_1__ - - for (int q = 0; q < dst.c; q++) - { - __m128 tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); - __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, _mm_castsi128_ps(x1_in_range)); - __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, _mm_castsi128_ps(y1_in_range)); - __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, _mm_castsi128_ps(v110_in_range)); - - __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, _mm_castsi128_ps(z1_in_range)); - __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, _mm_castsi128_ps(v101_in_range)); - __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, _mm_castsi128_ps(v011_in_range)); - __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, _mm_castsi128_ps(v111_in_range)); - - __m128 _v = _mm_mul_ps(tnw_val, tnw); - _v = _mm_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm_comp_fmadd_ps(bse_val, bse, _v); - - _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_bilinear_align1_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128 vImgDf = _mm_set1_ps(src.d); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - const __m128i vImgDi = _mm_set1_epi32(src.d); - -#if ((_MSC_VER && __AVX__) || __SSE4_1__) - const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#else - const __m128 vElempackf = _mm_set1_ps(src.elempack); -#endif // !__SSE4_1__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for 
(int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - // x - gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); - - const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - - gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - - // y - gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - - const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - - gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - - // z - gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); - - const __m128 border_z = _mm_sub_ps(vImgDf, v1fp4); - - gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); - } - - __m128 x_w = floor_ps(gx); - __m128 y_n = floor_ps(gy); - __m128 z_t = floor_ps(gz); - - __m128 w = _mm_sub_ps(gx, x_w); - __m128 e = _mm_sub_ps(v1fp4, w); - __m128 n = _mm_sub_ps(gy, y_n); - __m128 s = _mm_sub_ps(v1fp4, n); - __m128 t = _mm_sub_ps(gz, z_t); - __m128 b = _mm_sub_ps(v1fp4, t); - - __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m128 nw = _mm_mul_ps(s, e); - __m128 ne = _mm_mul_ps(s, w); - __m128 sw = _mm_mul_ps(n, e); - __m128 se = _mm_mul_ps(n, w); - - tnw = _mm_mul_ps(b, nw); - tne = _mm_mul_ps(b, ne); - tsw = _mm_mul_ps(b, sw); - tse = _mm_mul_ps(b, se); - - bnw = _mm_mul_ps(t, nw); - bne = _mm_mul_ps(t, ne); - bsw = _mm_mul_ps(t, sw); - bse = _mm_mul_ps(t, se); - } - - __m128i x0 = _mm_cvtps_epi32(x_w); - __m128i x1 = _mm_add_epi32(x0, v1ip4); - __m128i y0 = _mm_cvtps_epi32(y_n); - __m128i y1 = _mm_add_epi32(y0, v1ip4); - __m128i z0 = _mm_cvtps_epi32(z_t); - __m128i z1 = _mm_add_epi32(z0, v1ip4); - - __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - __m128i z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); - - __m128i v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - v110_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - v011_in_range = _mm_and_si128(y1_in_range, z1_in_range); - v101_in_range = _mm_and_si128(x1_in_range, z1_in_range); - v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); - } - - // (W*H*z + W*y + x) * elempack + vec(4) -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128i i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); - __m128i i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); - __m128i i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - __m128i i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); - - __m128i i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m128i i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); - __m128i i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - __m128i i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); -#else - __m128 tnw_offset = 
_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), z_t), _mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w)), vElempackf), _mm_set_ps(3, 2, 1, 0)); - __m128 tne_offset = _mm_add_ps(tnw_offset, vElempackf); - __m128 tsw_offset = _mm_add_ps(tnw_offset, _mm_mul_ps(vImgWf, vElempackf)); - __m128 tse_offset = _mm_add_ps(tsw_offset, vElempackf); - - __m128 bnw_offset = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); - __m128 bne_offset = _mm_add_ps(bnw_offset, vElempackf); - __m128 bsw_offset = _mm_add_ps(bnw_offset, _mm_mul_ps(vImgWf, vElempackf)); - __m128 bse_offset = _mm_add_ps(bsw_offset, vElempackf); - - __m128i i_tnw_offset = _mm_cvtps_epi32(tnw_offset); - __m128i i_tne_offset = _mm_cvtps_epi32(tne_offset); - __m128i i_tsw_offset = _mm_cvtps_epi32(tsw_offset); - __m128i i_tse_offset = _mm_cvtps_epi32(tse_offset); - - __m128i i_bnw_offset = _mm_cvtps_epi32(bnw_offset); - __m128i i_bne_offset = _mm_cvtps_epi32(bne_offset); - __m128i i_bsw_offset = _mm_cvtps_epi32(bsw_offset); - __m128i i_bse_offset = _mm_cvtps_epi32(bse_offset); -#endif // __SSE4_1__ - - for (int q = 0; q < dst.c; q++) - { - __m128 tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); - __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, _mm_castsi128_ps(x1_in_range)); - __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, _mm_castsi128_ps(y1_in_range)); - __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, _mm_castsi128_ps(v110_in_range)); - - __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, _mm_castsi128_ps(z1_in_range)); - __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, _mm_castsi128_ps(v101_in_range)); - __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, _mm_castsi128_ps(v011_in_range)); - __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, _mm_castsi128_ps(v111_in_range)); - - __m128 _v = _mm_mul_ps(tnw_val, tnw); - _v = _mm_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm_comp_fmadd_ps(bse_val, bse, _v); - - _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_bilinear_align0_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128 vImgDf = _mm_set1_ps(src.d); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - const __m128i vImgDi = _mm_set1_epi32(src.d); - -#if ((_MSC_VER && __AVX__) || __SSE4_1__) - const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#else - const __m128 vElempackf = _mm_set1_ps(src.elempack); -#endif // !__SSE4_1__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - // x - gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); - const __m128 
border_x = _mm_sub_ps(vImgWf, v1fp4); - - __m128 v0p5fp4 = _mm_set1_ps(0.5f); - gx = _mm_add_ps(gx, v0p5fp4); - - gx = _mm_and_ps(gx, *(__m128*)_ps_inv_sign_mask); - - __m128 reflectx_v = _mm_and_ps(_mm_sub_ps(gx, vImgWf), *(__m128*)_ps_inv_sign_mask); - gx = _mm_sub_ps(vImgWf, reflectx_v); - - gx = _mm_sub_ps(gx, v0p5fp4); - - _mm_sub_ps(gx, v0p5fp4); - - gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - - // y - gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - - gy = _mm_add_ps(gy, v0p5fp4); - - gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); - - __m128 reflecty_v = _mm_and_ps(_mm_sub_ps(gy, vImgHf), *(__m128*)_ps_inv_sign_mask); - gy = _mm_sub_ps(vImgHf, reflecty_v); - - gy = _mm_sub_ps(gy, v0p5fp4); - - _mm_sub_ps(gy, v0p5fp4); - - gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - - // z - gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); - const __m128 border_z = _mm_sub_ps(vImgDf, v1fp4); - - gz = _mm_add_ps(gz, v0p5fp4); - - gz = _mm_and_ps(gz, *(__m128*)_ps_inv_sign_mask); - - __m128 reflectz_v = _mm_and_ps(_mm_sub_ps(gz, vImgDf), *(__m128*)_ps_inv_sign_mask); - gz = _mm_sub_ps(vImgDf, reflectz_v); - - gz = _mm_sub_ps(gz, v0p5fp4); - - _mm_sub_ps(gz, v0p5fp4); - - gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); - } - - __m128 x_w = floor_ps(gx); - __m128 y_n = floor_ps(gy); - __m128 z_t = floor_ps(gz); - - __m128 w = _mm_sub_ps(gx, x_w); - __m128 e = _mm_sub_ps(v1fp4, w); - __m128 n = _mm_sub_ps(gy, y_n); - __m128 s = _mm_sub_ps(v1fp4, n); - __m128 t = _mm_sub_ps(gz, z_t); - __m128 b = _mm_sub_ps(v1fp4, t); - - __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m128 nw = _mm_mul_ps(s, e); - __m128 ne = _mm_mul_ps(s, w); - __m128 sw = _mm_mul_ps(n, e); - __m128 se = _mm_mul_ps(n, w); - - tnw = _mm_mul_ps(b, nw); - tne = _mm_mul_ps(b, ne); - tsw = _mm_mul_ps(b, sw); - tse = _mm_mul_ps(b, se); - - bnw = _mm_mul_ps(t, nw); - bne = _mm_mul_ps(t, ne); - bsw = _mm_mul_ps(t, sw); - bse = _mm_mul_ps(t, se); - } - - __m128i x0 = _mm_cvtps_epi32(x_w); - __m128i x1 = _mm_add_epi32(x0, v1ip4); - __m128i y0 = _mm_cvtps_epi32(y_n); - __m128i y1 = _mm_add_epi32(y0, v1ip4); - __m128i z0 = _mm_cvtps_epi32(z_t); - __m128i z1 = _mm_add_epi32(z0, v1ip4); - - __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - __m128i z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); - - __m128i v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - v110_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - v011_in_range = _mm_and_si128(y1_in_range, z1_in_range); - v101_in_range = _mm_and_si128(x1_in_range, z1_in_range); - v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); - } - - // (W*H*z + W*y + x) * elempack + vec(4) -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128i i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); - __m128i i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); - __m128i i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - __m128i i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); - - __m128i 
i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m128i i_bne_offset = _mm_add_epi32(i_bnw_offset, vElempacki); - __m128i i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - __m128i i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); -#else - __m128 tnw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), z_t), _mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w)), vElempackf), _mm_set_ps(3, 2, 1, 0)); - __m128 tne_offset = _mm_add_ps(tnw_offset, vElempackf); - __m128 tsw_offset = _mm_add_ps(tnw_offset, _mm_mul_ps(vImgWf, vElempackf)); - __m128 tse_offset = _mm_add_ps(tsw_offset, vElempackf); - - __m128 bnw_offset = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); - __m128 bne_offset = _mm_add_ps(bnw_offset, vElempackf); - __m128 bsw_offset = _mm_add_ps(bnw_offset, _mm_mul_ps(vImgWf, vElempackf)); - __m128 bse_offset = _mm_add_ps(bsw_offset, vElempackf); - - __m128i i_tnw_offset = _mm_cvtps_epi32(tnw_offset); - __m128i i_tne_offset = _mm_cvtps_epi32(tne_offset); - __m128i i_tsw_offset = _mm_cvtps_epi32(tsw_offset); - __m128i i_tse_offset = _mm_cvtps_epi32(tse_offset); - - __m128i i_bnw_offset = _mm_cvtps_epi32(bnw_offset); - __m128i i_bne_offset = _mm_cvtps_epi32(bne_offset); - __m128i i_bsw_offset = _mm_cvtps_epi32(bsw_offset); - __m128i i_bse_offset = _mm_cvtps_epi32(bse_offset); -#endif // __SSE4_1__ - - for (int q = 0; q < dst.c; q++) - { - __m128 tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); - __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, _mm_castsi128_ps(x1_in_range)); - __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, _mm_castsi128_ps(y1_in_range)); - __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, _mm_castsi128_ps(v110_in_range)); - - __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, _mm_castsi128_ps(z1_in_range)); - __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, _mm_castsi128_ps(v101_in_range)); - __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, _mm_castsi128_ps(v011_in_range)); - __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, _mm_castsi128_ps(v111_in_range)); - - __m128 _v = _mm_mul_ps(tnw_val, tnw); - _v = _mm_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm_comp_fmadd_ps(bse_val, bse, _v); - - _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_bilinear_align1_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128 vImgDf = _mm_set1_ps(src.d); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - const __m128i vImgDi = _mm_set1_epi32(src.d); -#if ((_MSC_VER && __AVX__) || __SSE4_1__) - const __m128i vElempacki = _mm_set1_epi32(src.elempack); -#else - const __m128 vElempackf = _mm_set1_ps(src.elempack); -#endif // !__SSE4_1__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / 
grid.elempack).depth(y).row(x) + z % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - // x - gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); - const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - - gx = _mm_and_ps(gx, *(__m128*)_ps_inv_sign_mask); - - __m128 reflectx_v = _mm_and_ps(_mm_sub_ps(gx, border_x), *(__m128*)_ps_inv_sign_mask); - gx = _mm_sub_ps(border_x, reflectx_v); - - // y - gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - - gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); - - __m128 reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps_inv_sign_mask); - gy = _mm_sub_ps(border_y, reflecty_v); - - // z - gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); - const __m128 border_z = _mm_sub_ps(vImgDf, v1fp4); - - gz = _mm_and_ps(gz, *(__m128*)_ps_inv_sign_mask); - - __m128 reflectz_v = _mm_and_ps(_mm_sub_ps(gz, border_z), *(__m128*)_ps_inv_sign_mask); - gz = _mm_sub_ps(border_z, reflectz_v); - } - - __m128 x_w = floor_ps(gx); - __m128 y_n = floor_ps(gy); - __m128 z_t = floor_ps(gz); - - __m128 w = _mm_sub_ps(gx, x_w); - __m128 e = _mm_sub_ps(v1fp4, w); - __m128 n = _mm_sub_ps(gy, y_n); - __m128 s = _mm_sub_ps(v1fp4, n); - __m128 t = _mm_sub_ps(gz, z_t); - __m128 b = _mm_sub_ps(v1fp4, t); - - __m128 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m128 nw = _mm_mul_ps(s, e); - __m128 ne = _mm_mul_ps(s, w); - __m128 sw = _mm_mul_ps(n, e); - __m128 se = _mm_mul_ps(n, w); - - tnw = _mm_mul_ps(b, nw); - tne = _mm_mul_ps(b, ne); - tsw = _mm_mul_ps(b, sw); - tse = _mm_mul_ps(b, se); - - bnw = _mm_mul_ps(t, nw); - bne = _mm_mul_ps(t, ne); - bsw = _mm_mul_ps(t, sw); - bse = _mm_mul_ps(t, se); - } - - __m128i x0 = _mm_cvtps_epi32(x_w); - __m128i x1 = _mm_add_epi32(x0, v1ip4); - __m128i y0 = _mm_cvtps_epi32(y_n); - __m128i y1 = _mm_add_epi32(y0, v1ip4); - __m128i z0 = _mm_cvtps_epi32(z_t); - __m128i z1 = _mm_add_epi32(z0, v1ip4); - - __m128i x1_in_range = _mm_and_si128(_mm_cmpgt_epi32(x1, vn1ip4), _mm_cmpgt_epi32(vImgWi, x1)); - __m128i y1_in_range = _mm_and_si128(_mm_cmpgt_epi32(y1, vn1ip4), _mm_cmpgt_epi32(vImgHi, y1)); - __m128i z1_in_range = _mm_and_si128(_mm_cmpgt_epi32(z1, vn1ip4), _mm_cmpgt_epi32(vImgDi, z1)); - - __m128i v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m128i v11_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - v110_in_range = _mm_and_si128(x1_in_range, y1_in_range); - - v011_in_range = _mm_and_si128(y1_in_range, z1_in_range); - v101_in_range = _mm_and_si128(x1_in_range, z1_in_range); - v111_in_range = _mm_and_si128(v11_in_range, z1_in_range); - } - - // (W*H*z + W*y + x) * elempack + vec(4) -#if (_MSC_VER && __AVX__) || __SSE4_1__ - __m128i i_tnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), z0), _mm_add_epi32(_mm_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm_set_epi32(3, 2, 1, 0)); - __m128i i_tne_offset = _mm_add_epi32(i_tnw_offset, vElempacki); - __m128i i_tsw_offset = _mm_add_epi32(i_tnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - __m128i i_tse_offset = _mm_add_epi32(i_tsw_offset, vElempacki); - - __m128i i_bnw_offset = _mm_add_epi32(_mm_mullo_epi32(_mm_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m128i i_bne_offset = 
_mm_add_epi32(i_bnw_offset, vElempacki); - __m128i i_bsw_offset = _mm_add_epi32(i_bnw_offset, _mm_mullo_epi32(vImgWi, vElempacki)); - __m128i i_bse_offset = _mm_add_epi32(i_bsw_offset, vElempacki); -#else - __m128 tnw_offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), z_t), _mm_add_ps(_mm_mul_ps(y_n, vImgWf), x_w)), vElempackf), _mm_set_ps(3, 2, 1, 0)); - __m128 tne_offset = _mm_add_ps(tnw_offset, vElempackf); - __m128 tsw_offset = _mm_add_ps(tnw_offset, _mm_mul_ps(vImgWf, vElempackf)); - __m128 tse_offset = _mm_add_ps(tsw_offset, vElempackf); - - __m128 bnw_offset = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); - __m128 bne_offset = _mm_add_ps(bnw_offset, vElempackf); - __m128 bsw_offset = _mm_add_ps(bnw_offset, _mm_mul_ps(vImgWf, vElempackf)); - __m128 bse_offset = _mm_add_ps(bsw_offset, vElempackf); - - __m128i i_tnw_offset = _mm_cvtps_epi32(tnw_offset); - __m128i i_tne_offset = _mm_cvtps_epi32(tne_offset); - __m128i i_tsw_offset = _mm_cvtps_epi32(tsw_offset); - __m128i i_tse_offset = _mm_cvtps_epi32(tse_offset); - - __m128i i_bnw_offset = _mm_cvtps_epi32(bnw_offset); - __m128i i_bne_offset = _mm_cvtps_epi32(bne_offset); - __m128i i_bsw_offset = _mm_cvtps_epi32(bsw_offset); - __m128i i_bse_offset = _mm_cvtps_epi32(bse_offset); -#endif // __SSE4_1__ - - for (int q = 0; q < dst.c; q++) - { - __m128 tnw_val = mask_gather_ps(src.channel(q), i_tnw_offset, vn1fp4); - __m128 tne_val = mask_gather_ps(src.channel(q), i_tne_offset, _mm_castsi128_ps(x1_in_range)); - __m128 tsw_val = mask_gather_ps(src.channel(q), i_tsw_offset, _mm_castsi128_ps(y1_in_range)); - __m128 tse_val = mask_gather_ps(src.channel(q), i_tse_offset, _mm_castsi128_ps(v110_in_range)); - - __m128 bnw_val = mask_gather_ps(src.channel(q), i_bnw_offset, _mm_castsi128_ps(z1_in_range)); - __m128 bne_val = mask_gather_ps(src.channel(q), i_bne_offset, _mm_castsi128_ps(v101_in_range)); - __m128 bsw_val = mask_gather_ps(src.channel(q), i_bsw_offset, _mm_castsi128_ps(v011_in_range)); - __m128 bse_val = mask_gather_ps(src.channel(q), i_bse_offset, _mm_castsi128_ps(v111_in_range)); - - __m128 _v = _mm_mul_ps(tnw_val, tnw); - _v = _mm_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm_comp_fmadd_ps(bse_val, bse, _v); - - _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} \ No newline at end of file diff --git a/src/layer/x86/gridsample_bilinear_pack8.h b/src/layer/x86/gridsample_bilinear_pack8.h deleted file mode 100644 index 944ccf5c639..00000000000 --- a/src/layer/x86/gridsample_bilinear_pack8.h +++ /dev/null @@ -1,1663 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations under the License. - -static void gridsample_2d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); -#if __AVX2__ - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); - - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#else - const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#endif // __AVX2__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - - __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); -#else - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); - 
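The "compute coord" block above maps the normalized grid values in [-1, 1] onto source pixel coordinates: the align0_* kernels use ((g + 1) * size - 1) / 2, while the align1_* kernels further down use (g + 1) / 2 * (size - 1). A minimal scalar sketch of that mapping (the helper name grid_unnormalize is illustrative, not taken from this patch):

static inline float grid_unnormalize(float coord, int size, bool align_corners)
{
    if (align_corners)
        return (coord + 1.f) / 2.f * (size - 1); // align1_* kernels: -1/+1 land on the first/last pixel centers
    return ((coord + 1.f) * size - 1.f) / 2.f;   // align0_* kernels: -1/+1 land on the outer pixel edges
}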
__m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - - __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); - __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); - __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 ne_offset = _mm256_add_ps(nw_offset, vElempackf); - __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(vImgWf, vElempackf)); - __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); - - __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); - __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); - __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); -#endif // __AVX2__ - - for (int q = 0; q < dst.c; q++) - { -#if __AVX2__ - __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, _mm256_castsi256_ps(v00_in_range)); - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, _mm256_castsi256_ps(v10_in_range)); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, _mm256_castsi256_ps(v01_in_range)); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); -#else - __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, v00_in_range); - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, v10_in_range); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, v01_in_range); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); -#endif // __AVX2__ - - __m256 _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); -#if __AVX2__ - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); - - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#else - const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#endif // __AVX2__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = 
_mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - - __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki), - _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); -#else - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - - __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); - __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); - __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 ne_offset = _mm256_add_ps(nw_offset, vElempackf); - __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(vImgWf, vElempackf)); - __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); - - __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); - __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); - __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); -#endif // __AVX2__ - - for (int q = 0; q < dst.c; q++) - { -#if __AVX2__ - __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, _mm256_castsi256_ps(v00_in_range)); - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, _mm256_castsi256_ps(v10_in_range)); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, _mm256_castsi256_ps(v01_in_range)); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, _mm256_castsi256_ps(v11_in_range)); -#else - __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, v00_in_range); - 
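The v00..v11 in-range masks and the mask_gather_ps256 calls around here implement padding_mode=zeros without branching: lanes whose corner falls outside the source simply gather 0. A scalar sketch of the same rule, assuming a hypothetical fetch helper over a plain row-major channel:

static inline float fetch_or_zero(const float* channel, int w, int h, int x, int y)
{
    // mirrors the (coord > -1) && (size > coord) tests used to build the masks
    const bool in_range = x > -1 && x < w && y > -1 && y < h;
    return in_range ? channel[y * w + x] : 0.f;
}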
__m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, v10_in_range); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, v01_in_range); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); -#endif // __AVX2__ - - __m256 _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 ne_offset = _mm256_add_ps(nw_offset, vElempackf); - __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(vImgWf, vElempackf)); - __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); - - __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); - __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); - __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); - - for (int q = 0; q < dst.c; q++) - { - __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *(__m256*)_ps256_n1); - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, x1_in_range); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, y1_in_range); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); 
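The fmadd chain that follows is a straight bilinear blend: the four gathered corner samples are weighted by products of the per-axis fractional parts. A scalar equivalent, assuming fx = gx - floor(gx), fy = gy - floor(gy) and corner samples named v00..v11 for illustration:

static inline float bilinear_blend(float v00, float v10, float v01, float v11, float fx, float fy)
{
    const float nw = (1.f - fy) * (1.f - fx); // top-left     (x0, y0)
    const float ne = (1.f - fy) * fx;         // top-right    (x1, y0)
    const float sw = fy * (1.f - fx);         // bottom-left  (x0, y1)
    const float se = fy * fx;                 // bottom-right (x1, y1)
    return v00 * nw + v10 * ne + v01 * sw + v11 * se;
}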
- - __m256 _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 ne_offset = _mm256_add_ps(nw_offset, vElempackf); - __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(vImgWf, vElempackf)); - __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); - - __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); - __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); - __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); - - for (int q = 0; q < dst.c; q++) - { - __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *(__m256*)_ps256_n1); - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, x1_in_range); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, y1_in_range); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); - - __m256 _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, 
_v); - } - } - } -} - -static void gridsample_2d_bilinear_align0_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - __m256 v0p5fp8 = _mm256_set1_ps(0.5f); - gx = _mm256_add_ps(gx, v0p5fp8); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(vImgWf, reflectx_v); - - gx = _mm256_sub_ps(gx, v0p5fp8); - - _mm256_sub_ps(gx, v0p5fp8); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_add_ps(gy, v0p5fp8); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(vImgHf, reflecty_v); - - gy = _mm256_sub_ps(gy, v0p5fp8); - - _mm256_sub_ps(gy, v0p5fp8); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 ne_offset = _mm256_add_ps(nw_offset, vElempackf); - __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(vImgWf, vElempackf)); - __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); - - __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); - __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); - __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); - - for (int q = 0; q < dst.c; q++) - { - __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *(__m256*)_ps256_n1); - __m256 ne_val = mask_gather_ps256(src.channel(q), i_ne_offset, x1_in_range); 
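The reflection handling in the "compute coord" block above shifts the coordinate by 0.5, folds it once with absolute values, shifts it back and clamps, which covers padding_mode=reflection with align_corners=0 for coordinates within one reflection period. A scalar sketch of that folding, assuming x is the already un-normalized coordinate and w is src.w:

#include <algorithm>
#include <cmath>

static inline float reflect_coord_align0(float x, int w)
{
    x = fabsf(x + 0.5f);                // fold the negative side about -0.5
    x = (float)w - fabsf(x - (float)w); // mirror once about the right edge
    x -= 0.5f;
    return std::min((float)(w - 1), std::max(x, 0.f)); // final clamp to the valid range
}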
- __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, y1_in_range); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); - - __m256 _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_bilinear_align1_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(border_x, reflectx_v); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(border_y, reflecty_v); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 ne_offset = _mm256_add_ps(nw_offset, vElempackf); - __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(vImgWf, vElempackf)); - __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); - - __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); - __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); - __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); - - for (int q = 0; q < dst.c; q++) - { - __m256 nw_val = mask_gather_ps256(src.channel(q), i_nw_offset, *(__m256*)_ps256_n1); - __m256 ne_val = mask_gather_ps256(src.channel(q), 
i_ne_offset, x1_in_range); - __m256 sw_val = mask_gather_ps256(src.channel(q), i_sw_offset, y1_in_range); - __m256 se_val = mask_gather_ps256(src.channel(q), i_se_offset, v11_in_range); - - __m256 _v = _mm256_mul_ps(nw_val, nw); - _v = _mm256_comp_fmadd_ps(ne_val, ne, _v); - _v = _mm256_comp_fmadd_ps(sw_val, sw, _v); - _v = _mm256_comp_fmadd_ps(se_val, se, _v); - - _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_3d_bilinear_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); -#if __AVX2__ - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vImgDi = _mm256_set1_epi32(src.d); - - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#else - const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#endif // __AVX2__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); - - // z - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), two); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - __m256 z_t = _mm256_floor_ps(gz); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); - - __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - - tnw = _mm256_mul_ps(b, nw); - tne = _mm256_mul_ps(b, ne); - tsw = _mm256_mul_ps(b, sw); - tse = _mm256_mul_ps(b, se); - - bnw = _mm256_mul_ps(t, nw); - bne = _mm256_mul_ps(t, ne); - bsw = _mm256_mul_ps(t, sw); - bse = _mm256_mul_ps(t, se); - } - -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i z0 = _mm256_cvtps_epi32(z_t); - - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); - - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), 
_mm256_cmpgt_epi32(vImgHi, y1)); - __m256i z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z0)); - __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); - - __m256i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - v000_in_range = _mm256_and_si256(v00_in_range, z0_in_range); - v010_in_range = _mm256_and_si256(v01_in_range, z0_in_range); - v100_in_range = _mm256_and_si256(v10_in_range, z0_in_range); - v110_in_range = _mm256_and_si256(v11_in_range, z0_in_range); - - v001_in_range = _mm256_and_si256(v00_in_range, z1_in_range); - v011_in_range = _mm256_and_si256(v01_in_range, z1_in_range); - v101_in_range = _mm256_and_si256(v10_in_range, z1_in_range); - v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); - } - - __m256i i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); - __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - - __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); - __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); -#else - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); - - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - __m256 z0_in_range = _mm256_and_ps(_mm256_cmp_ps(z_t, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z_t, _CMP_GT_OS)); - __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); - - __m256 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); - __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); - __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v000_in_range = _mm256_and_ps(v00_in_range, z0_in_range); - v010_in_range = _mm256_and_ps(v01_in_range, z0_in_range); - v100_in_range = 
_mm256_and_ps(v10_in_range, z0_in_range); - v110_in_range = _mm256_and_ps(v11_in_range, z0_in_range); - - v001_in_range = _mm256_and_ps(v00_in_range, z1_in_range); - v011_in_range = _mm256_and_ps(v01_in_range, z1_in_range); - v101_in_range = _mm256_and_ps(v10_in_range, z1_in_range); - v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); - } - - __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)), - vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); - __m256 tse_offset = _mm256_add_ps(tsw_offset, vElempackf); - - __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, vElempackf); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); - __m256 bse_offset = _mm256_add_ps(bsw_offset, vElempackf); - - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); - __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); - __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); - __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); - - __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); - __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); - __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); - __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); -#endif // __AVX2__ - - for (int q = 0; q < dst.c; q++) - { -#if __AVX2__ - __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, _mm256_castsi256_ps(v000_in_range)); - __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, _mm256_castsi256_ps(v100_in_range)); - __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, _mm256_castsi256_ps(v010_in_range)); - __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, _mm256_castsi256_ps(v110_in_range)); - - __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, _mm256_castsi256_ps(v001_in_range)); - __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, _mm256_castsi256_ps(v101_in_range)); - __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, _mm256_castsi256_ps(v011_in_range)); - __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, _mm256_castsi256_ps(v111_in_range)); -#else - __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, v000_in_range); - __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, v100_in_range); - __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, v010_in_range); - __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, v110_in_range); - - __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, v001_in_range); - __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, v101_in_range); - __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, v011_in_range); - __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, v111_in_range); -#endif - - __m256 _v = _mm256_mul_ps(tnw_val, tnw); - _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); - - 
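The 3D kernels extend the same idea to eight corners: tnw..bse are products of the three per-axis fractions, and the fmadd chain just above accumulates the gathered corner values with those weights. A compact scalar version, assuming c[dz][dy][dx] holds the eight corner samples and fx/fy/fz are the fractional parts:

static inline float trilinear_blend(const float c[2][2][2], float fx, float fy, float fz)
{
    float v = 0.f;
    for (int dz = 0; dz < 2; dz++)
        for (int dy = 0; dy < 2; dy++)
            for (int dx = 0; dx < 2; dx++)
            {
                // weight of corner (x0+dx, y0+dy, z0+dz)
                const float wgt = (dx ? fx : 1.f - fx) * (dy ? fy : 1.f - fy) * (dz ? fz : 1.f - fz);
                v += c[dz][dy][dx] * wgt;
            }
    return v;
}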
_mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_bilinear_align1_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); -#if __AVX2__ - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vImgDi = _mm256_set1_epi32(src.d); - - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#else - const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#endif // __AVX2__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - // z - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - __m256 z_t = _mm256_floor_ps(gz); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); - - __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - - tnw = _mm256_mul_ps(b, nw); - tne = _mm256_mul_ps(b, ne); - tsw = _mm256_mul_ps(b, sw); - tse = _mm256_mul_ps(b, se); - - bnw = _mm256_mul_ps(t, nw); - bne = _mm256_mul_ps(t, ne); - bsw = _mm256_mul_ps(t, sw); - bse = _mm256_mul_ps(t, se); - } - -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i z0 = _mm256_cvtps_epi32(z_t); - - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); - - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - __m256i z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z0)); - __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); - - __m256i v000_in_range, v010_in_range, v100_in_range, v110_in_range, 
v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - v000_in_range = _mm256_and_si256(v00_in_range, z0_in_range); - v010_in_range = _mm256_and_si256(v01_in_range, z0_in_range); - v100_in_range = _mm256_and_si256(v10_in_range, z0_in_range); - v110_in_range = _mm256_and_si256(v11_in_range, z0_in_range); - - v001_in_range = _mm256_and_si256(v00_in_range, z1_in_range); - v011_in_range = _mm256_and_si256(v01_in_range, z1_in_range); - v101_in_range = _mm256_and_si256(v10_in_range, z1_in_range); - v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); - } - - __m256i i_tnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); - __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - - __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); - __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); -#else - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); - - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - __m256 z0_in_range = _mm256_and_ps(_mm256_cmp_ps(z_t, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z_t, _CMP_GT_OS)); - __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); - - __m256 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); - __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); - __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v000_in_range = _mm256_and_ps(v00_in_range, z0_in_range); - v010_in_range = _mm256_and_ps(v01_in_range, z0_in_range); - v100_in_range = _mm256_and_ps(v10_in_range, z0_in_range); - v110_in_range = _mm256_and_ps(v11_in_range, z0_in_range); - - v001_in_range = _mm256_and_ps(v00_in_range, z1_in_range); - v011_in_range = _mm256_and_ps(v01_in_range, z1_in_range); - v101_in_range = _mm256_and_ps(v10_in_range, z1_in_range); - v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); - } - - __m256 
tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)), - vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); - __m256 tse_offset = _mm256_add_ps(tsw_offset, vElempackf); - - __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, vElempackf); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); - __m256 bse_offset = _mm256_add_ps(bsw_offset, vElempackf); - - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); - __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); - __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); - __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); - - __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); - __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); - __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); - __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); -#endif // __AVX2__ - - for (int q = 0; q < dst.c; q++) - { -#if __AVX2__ - __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, _mm256_castsi256_ps(v000_in_range)); - __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, _mm256_castsi256_ps(v100_in_range)); - __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, _mm256_castsi256_ps(v010_in_range)); - __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, _mm256_castsi256_ps(v110_in_range)); - - __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, _mm256_castsi256_ps(v001_in_range)); - __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, _mm256_castsi256_ps(v101_in_range)); - __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, _mm256_castsi256_ps(v011_in_range)); - __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, _mm256_castsi256_ps(v111_in_range)); -#else - __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, v000_in_range); - __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, v100_in_range); - __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, v010_in_range); - __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, v110_in_range); - - __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, v001_in_range); - __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, v101_in_range); - __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, v011_in_range); - __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, v111_in_range); -#endif - - __m256 _v = _mm256_mul_ps(tnw_val, tnw); - _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); - - _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_bilinear_align0_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); - - 
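The gather offsets built further down follow the packed-blob layout noted in the pack4 code, (W*H*z + W*y + x) * elempack plus the lane index. A scalar form of that addressing, with the helper name chosen for illustration only:

static inline int packed_offset(int x, int y, int z, int w, int h, int elempack, int lane)
{
    // each spatial element stores `elempack` consecutive floats inside the channel
    return (w * h * z + w * y + x) * elempack + lane;
}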
const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - // z - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), two); - - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - __m256 z_t = _mm256_floor_ps(gz); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); - - __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - - tnw = _mm256_mul_ps(b, nw); - tne = _mm256_mul_ps(b, ne); - tsw = _mm256_mul_ps(b, sw); - tse = _mm256_mul_ps(b, se); - - bnw = _mm256_mul_ps(t, nw); - bne = _mm256_mul_ps(t, ne); - bsw = _mm256_mul_ps(t, sw); - bse = _mm256_mul_ps(t, se); - } - - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); - - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); - - __m256 v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v110_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v011_in_range = _mm256_and_ps(y1_in_range, z1_in_range); - v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); - v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); - } - __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)), - vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); - __m256 
tse_offset = _mm256_add_ps(tsw_offset, vElempackf); - - __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, vElempackf); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); - __m256 bse_offset = _mm256_add_ps(bsw_offset, vElempackf); - - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); - __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); - __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); - __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); - - __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); - __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); - __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); - __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); - - for (int q = 0; q < dst.c; q++) - { - __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, *(__m256*)_ps256_n1); - __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, x1_in_range); - __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, y1_in_range); - __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, v110_in_range); - - __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, z1_in_range); - __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, v101_in_range); - __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, v011_in_range); - __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, v111_in_range); - - __m256 _v = _mm256_mul_ps(tnw_val, tnw); - _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); - - _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_bilinear_align1_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - // z - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); - - const 
__m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - __m256 z_t = _mm256_floor_ps(gz); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); - - __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - - tnw = _mm256_mul_ps(b, nw); - tne = _mm256_mul_ps(b, ne); - tsw = _mm256_mul_ps(b, sw); - tse = _mm256_mul_ps(b, se); - - bnw = _mm256_mul_ps(t, nw); - bne = _mm256_mul_ps(t, ne); - bsw = _mm256_mul_ps(t, sw); - bse = _mm256_mul_ps(t, se); - } - - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); - - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); - - __m256 v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v110_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v011_in_range = _mm256_and_ps(y1_in_range, z1_in_range); - v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); - v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); - } - __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)), - vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); - __m256 tse_offset = _mm256_add_ps(tsw_offset, vElempackf); - - __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, vElempackf); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); - __m256 bse_offset = _mm256_add_ps(bsw_offset, vElempackf); - - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); - __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); - __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); - __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); - - __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); - __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); - __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); - __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); - - for (int q = 0; q < dst.c; q++) - { - __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, *(__m256*)_ps256_n1); - __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, x1_in_range); - __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, y1_in_range); - __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, v110_in_range); - - __m256 bnw_val = mask_gather_ps256(src.channel(q), 
i_bnw_offset, z1_in_range); - __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, v101_in_range); - __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, v011_in_range); - __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, v111_in_range); - - __m256 _v = _mm256_mul_ps(tnw_val, tnw); - _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); - - _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_bilinear_align0_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - __m256 v0p5fp8 = _mm256_set1_ps(0.5f); - gx = _mm256_add_ps(gx, v0p5fp8); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(vImgWf, reflectx_v); - - gx = _mm256_sub_ps(gx, v0p5fp8); - - _mm256_sub_ps(gx, v0p5fp8); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_add_ps(gy, v0p5fp8); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(vImgHf, reflecty_v); - - gy = _mm256_sub_ps(gy, v0p5fp8); - - _mm256_sub_ps(gy, v0p5fp8); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - // z - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), two); - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_add_ps(gz, v0p5fp8); - - gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, vImgDf), *(__m256*)_ps256_inv_sign_mask); - gz = _mm256_sub_ps(vImgDf, reflectz_v); - - gz = _mm256_sub_ps(gz, v0p5fp8); - - _mm256_sub_ps(gz, v0p5fp8); - - gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - __m256 z_t = _mm256_floor_ps(gz); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = 
_mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); - - __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - - tnw = _mm256_mul_ps(b, nw); - tne = _mm256_mul_ps(b, ne); - tsw = _mm256_mul_ps(b, sw); - tse = _mm256_mul_ps(b, se); - - bnw = _mm256_mul_ps(t, nw); - bne = _mm256_mul_ps(t, ne); - bsw = _mm256_mul_ps(t, sw); - bse = _mm256_mul_ps(t, se); - } - - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); - - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); - - __m256 v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v110_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v011_in_range = _mm256_and_ps(y1_in_range, z1_in_range); - v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); - v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); - } - __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)), - vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); - __m256 tse_offset = _mm256_add_ps(tsw_offset, vElempackf); - - __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, vElempackf); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); - __m256 bse_offset = _mm256_add_ps(bsw_offset, vElempackf); - - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); - __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); - __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); - __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); - - __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); - __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); - __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); - __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); - - for (int q = 0; q < dst.c; q++) - { - __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, *(__m256*)_ps256_n1); - __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, x1_in_range); - __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, y1_in_range); - __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, v110_in_range); - - __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, z1_in_range); - __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, v101_in_range); - __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, v011_in_range); - __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, v111_in_range); - - __m256 _v = 
_mm256_mul_ps(tnw_val, tnw); - _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); - - _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_bilinear_align1_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(border_x, reflectx_v); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(border_y, reflecty_v); - - // z - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, border_z), *(__m256*)_ps256_inv_sign_mask); - gz = _mm256_sub_ps(border_z, reflectz_v); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - __m256 z_t = _mm256_floor_ps(gz); - - __m256 w = _mm256_sub_ps(gx, x_w); - __m256 e = _mm256_sub_ps(*(__m256*)_ps256_1, w); - __m256 n = _mm256_sub_ps(gy, y_n); - __m256 s = _mm256_sub_ps(*(__m256*)_ps256_1, n); - __m256 t = _mm256_sub_ps(gz, z_t); - __m256 b = _mm256_sub_ps(*(__m256*)_ps256_1, t); - - __m256 tnw, tne, tsw, tse, bnw, bne, bsw, bse; - { - __m256 nw = _mm256_mul_ps(s, e); - __m256 ne = _mm256_mul_ps(s, w); - __m256 sw = _mm256_mul_ps(n, e); - __m256 se = _mm256_mul_ps(n, w); - - tnw = _mm256_mul_ps(b, nw); - tne = _mm256_mul_ps(b, ne); - tsw = _mm256_mul_ps(b, sw); - tse = _mm256_mul_ps(b, se); - - bnw = _mm256_mul_ps(t, nw); - bne = _mm256_mul_ps(t, ne); - bsw = _mm256_mul_ps(t, sw); - bse = _mm256_mul_ps(t, se); - } - - __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); - __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); - __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); - - __m256 x1_in_range = 
_mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); - __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); - - __m256 v110_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v110_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v011_in_range = _mm256_and_ps(y1_in_range, z1_in_range); - v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); - v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); - } - __m256 tnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t), - _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w)), - vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); - __m256 tse_offset = _mm256_add_ps(tsw_offset, vElempackf); - - __m256 bnw_offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), vElempackf), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, vElempackf); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); - __m256 bse_offset = _mm256_add_ps(bsw_offset, vElempackf); - - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); - __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); - __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); - __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); - - __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); - __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); - __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); - __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); - - for (int q = 0; q < dst.c; q++) - { - __m256 tnw_val = mask_gather_ps256(src.channel(q), i_tnw_offset, *(__m256*)_ps256_n1); - __m256 tne_val = mask_gather_ps256(src.channel(q), i_tne_offset, x1_in_range); - __m256 tsw_val = mask_gather_ps256(src.channel(q), i_tsw_offset, y1_in_range); - __m256 tse_val = mask_gather_ps256(src.channel(q), i_tse_offset, v110_in_range); - - __m256 bnw_val = mask_gather_ps256(src.channel(q), i_bnw_offset, z1_in_range); - __m256 bne_val = mask_gather_ps256(src.channel(q), i_bne_offset, v101_in_range); - __m256 bsw_val = mask_gather_ps256(src.channel(q), i_bsw_offset, v011_in_range); - __m256 bse_val = mask_gather_ps256(src.channel(q), i_bse_offset, v111_in_range); - - __m256 _v = _mm256_mul_ps(tnw_val, tnw); - _v = _mm256_comp_fmadd_ps(tne_val, tne, _v); - _v = _mm256_comp_fmadd_ps(tsw_val, tsw, _v); - _v = _mm256_comp_fmadd_ps(tse_val, tse, _v); - - _v = _mm256_comp_fmadd_ps(bnw_val, bnw, _v); - _v = _mm256_comp_fmadd_ps(bne_val, bne, _v); - _v = _mm256_comp_fmadd_ps(bsw_val, bsw, _v); - _v = _mm256_comp_fmadd_ps(bse_val, bse, _v); - - _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} \ No newline at end of file diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h new file mode 100644 index 00000000000..c93bf2f1a52 --- /dev/null +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -0,0 +1,330 @@ +// Tencent is pleased to support the open source community by making ncnn available. 
+// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +template +struct gridsample_2d_nearest_compute_blob +{ + void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) + { + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#if __AVX2__ + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); +#endif // __AVX2__ +#endif // __AVX__ + + int* offset_ptr = offset.channel(0); + + grid_sample_unormalize unormalize; + compute_coord get_coord; + + if (permute_fusion == 0) + { + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + // x + gx = unormalize(vImgWf, gx); + gx = get_coord(vImgWf, gx); + + // y + gy = unormalize(vImgHf, gy); + gy = get_coord(vImgHf, gy); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); + __m256i i_offset = _mm256_cvtps_epi32(offset); + + _mm256_storeu_epi32(offset_ptr, i_offset); + + gridptr += 16; + + offset_ptr += 8; + } + + nn = grid_size & 15; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = *gridptr; + float sample_y = *(gridptr + 1); + + // x + sample_x = unormalize(src.w, sample_x); + sample_x = get_coord(src.w, sample_x); + + // y + sample_y = unormalize(src.h, sample_x); + sample_y = get_coord(src.h, sample_x); + + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); + + *offset_ptr = x0 + y0 * src.w; + + gridptr += 2; + + offset_ptr++; + } + } + } + else + { + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); + + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 7 < nn; x += 8) + { + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); + + // compute coord + { + // x + gx = unormalize(vImgWf, gx); + gx = get_coord(vImgWf, gx); + + // y + gy = unormalize(vImgHf, gy); + gy = get_coord(vImgHf, gy); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); + __m256i i_offset = _mm256_cvtps_epi32(offset); + + _mm256_storeu_epi32(offset_ptr, i_offset); + + 
gridptr_x += 8; + gridptr_y += 8; + + offset_ptr += 8; + } + + nn = grid_size & 7; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; + + // x + sample_x = unormalize(src.w, sample_x); + sample_x = get_coord(src.w, sample_x); + + // y + sample_y = unormalize(src.h, sample_x); + sample_y = get_coord(src.h, sample_x); + + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); + + *offset_ptr = x0 + y0 * src.w; + + gridptr_x++; + gridptr_y++; + + offset_ptr++; + } + } + } +}; + +template +struct gridsample_2d_nearest_compute_blob +{ + void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) + { + const int grid_size = grid.w * grid.h; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); +#if __AVX2__ + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); +#endif // __AVX2__ +#endif // __AVX__ + + int* offset_ptr = offset.channel(0); + + float* in_bound_ptr = in_bound.channel(0); + + grid_sample_unormalize unormalize; + + if (permute_fusion == 0) + { + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 15 < nn; x += 16) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr + x); + __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + + // compute coord + { + // x + gx = unormalize(vImgWf, gx); + // y + gy = unormalize(vImgHf, gy); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), + _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); + + __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); + __m256i i_offset = _mm256_cvtps_epi32(offset); + + _mm256_storeu_ps(in_bound_ptr, v_in_range); + _mm256_storeu_epi32(offset_ptr, i_offset); + + gridptr += 16; + offset_ptr += 8; + in_bound_ptr += 8; + } + + nn = grid_size & 15; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 2) + { + float sample_x = *gridptr; + float sample_y = *(gridptr + 1); + + // x + sample_x = unormalize(src.w, sample_x); + // y + sample_y = unormalize(src.h, sample_x); + + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); + + *in_bound_ptr = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h); + *offset_ptr = x0 + y0 * src.w; + + gridptr += 2; + offset_ptr++; + in_bound_ptr++; + } + } + } + else + { + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); + + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 7 < nn; x += 8) + { + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); + + // compute coord + { + // x + gx = unormalize(vImgWf, gx); + // y + gy = unormalize(vImgHf, gy); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, 
_mm256_set1_ps(0.5f))); + + __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), + _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); + + __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); + __m256i i_offset = _mm256_cvtps_epi32(offset); + + _mm256_storeu_ps(in_bound_ptr, v_in_range); + _mm256_storeu_epi32(offset_ptr, i_offset); + + gridptr_x += 8; + gridptr_y += 8; + offset_ptr += 8; + in_bound_ptr += 8; + } + + nn = grid_size & 7; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; + + // x + sample_x = unormalize(src.w, sample_x); + // y + sample_y = unormalize(src.h, sample_x); + + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); + + *in_bound_ptr = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h); + + *offset_ptr = x0 + y0 * src.w; + + gridptr_x++; + gridptr_y++; + + offset_ptr++; + + in_bound_ptr++; + } + } + } +}; + diff --git a/src/layer/x86/gridsample_nearest_pack1.h b/src/layer/x86/gridsample_nearest_pack1.h deleted file mode 100644 index 88586e6ce93..00000000000 --- a/src/layer/x86/gridsample_nearest_pack1.h +++ /dev/null @@ -1,1167 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
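Both the gridsample_2d_nearest_compute_blob templates added above and the per-mode pack1 kernels removed below boil down to the same scalar recipe: unnormalize the grid coordinate from [-1, 1] into pixel space, apply the padding rule, and round to the nearest texel with floor(x + 0.5). The following is a minimal scalar sketch of that recipe for reference only; the function and parameter names are illustrative and not part of ncnn, and the real kernels interleave the rounding and reflection steps slightly differently per variant.

#include <algorithm>
#include <cmath>

// align_corners == false:  x = ((gx + 1) * size - 1) / 2
// align_corners == true :  x = (gx + 1) / 2 * (size - 1)
static float unnormalize_coord(float g, int size, bool align_corners)
{
    return align_corners ? (g + 1.f) / 2.f * (size - 1)
                         : ((g + 1.f) * size - 1.f) / 2.f;
}

// padding_mode: 0 = zeros (left as-is, handled by the in-bound check),
//               1 = border (clamp to [0, size-1]),
//               2 = reflection (mirror, then clamp)
static float apply_padding(float x, int size, int padding_mode, bool align_corners)
{
    if (padding_mode == 2)
    {
        if (align_corners)
        {
            // reflect around 0 and size-1
            x = std::fabs(x);
            x = (size - 1) - std::fabs(x - (size - 1));
        }
        else
        {
            // reflect around -0.5 and size-0.5 (the +0.5 trick in the AVX code)
            x = std::fabs(x + 0.5f);
            x = size - std::fabs(x - size) - 0.5f;
        }
    }
    if (padding_mode == 1 || padding_mode == 2)
        x = std::min(size - 1.f, std::max(x, 0.f));
    return x;
}

static float nearest_sample(const float* image, int w, int h, float gx, float gy,
                            int padding_mode, bool align_corners)
{
    float sx = apply_padding(unnormalize_coord(gx, w, align_corners), w, padding_mode, align_corners);
    float sy = apply_padding(unnormalize_coord(gy, h, align_corners), h, padding_mode, align_corners);

    // round to the nearest texel, matching floor(x + 0.5) in the kernels
    int x0 = (int)std::floor(sx + 0.5f);
    int y0 = (int)std::floor(sy + 0.5f);

    // zeros padding: anything outside the image reads as 0
    bool in_bound = x0 > -1 && x0 < w && y0 > -1 && y0 < h;
    return in_bound ? image[y0 * w + x0] : 0.f;
}

For example, nearest_sample(img, w, h, 0.25f, -0.5f, 1, false) would sample with border padding and align_corners = false.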
- -static void gridsample_2d_nearest_align0_zeros_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - - __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), - _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < src.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, v_in_range); - - _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = ((sample_x + 1) * src.w - 1) / 2.f; - sample_y = ((sample_y + 1) * src.h - 1) / 2.f; - - int x0 = static_cast(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); - - bool v00_in_range = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h); - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - - dst.channel(q).row(y)[x / 2] = v00_in_range ? 
image.row(y0)[x0] : 0; - } - } - } -} - -static void gridsample_2d_nearest_align1_zeros_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - - __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), - _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < src.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, v_in_range); - - _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = (sample_x + 1) / 2.f * (src.w - 1); - sample_y = (sample_y + 1) / 2.f * (src.h - 1); - - int x0 = static_cast(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); - - bool v00_in_range = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h); - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - - dst.channel(q).row(y)[x / 2] = v00_in_range ? 
image.row(y0)[x0] : 0; - } - } - } -} - -static void gridsample_2d_nearest_align0_border_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); - __m256i i_offset = _mm256_cvtps_epi32(offset); - for (int q = 0; q < src.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *(__m256*)_ps256_n1); - - _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = ((sample_x + 1) * src.w - 1) / 2.f; - sample_y = ((sample_y + 1) * src.h - 1) / 2.f; - - sample_x = std::min(src.w - 1.0f, std::max(sample_x, 0.0f)); - sample_y = std::min(src.h - 1.0f, std::max(sample_y, 0.0f)); - - int x0 = static_cast(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - - dst.channel(q).row(y)[x / 2] = image.row(y0)[x0]; - } - } - } -} - -static void gridsample_2d_nearest_align1_border_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, 
*(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); - __m256i i_offset = _mm256_cvtps_epi32(offset); - for (int q = 0; q < src.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *(__m256*)_ps256_n1); - - _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = (sample_x + 1) / 2.f * (src.w - 1); - sample_y = (sample_y + 1) / 2.f * (src.h - 1); - - sample_x = std::min(src.w - 1.0f, std::max(sample_x, 0.0f)); - sample_y = std::min(src.h - 1.0f, std::max(sample_y, 0.0f)); - - int x0 = static_cast(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - - dst.channel(q).row(y)[x / 2] = image.row(y0)[x0]; - } - } - } -} - -static void gridsample_2d_nearest_align0_reflection_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - - // compute coord - { - // x - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - __m256 v0p5fp8 = _mm256_set1_ps(0.5f); - gx = _mm256_add_ps(gx, v0p5fp8); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(vImgWf, reflectx_v); - - gx = _mm256_sub_ps(gx, v0p5fp8); - - _mm256_sub_ps(gx, v0p5fp8); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_add_ps(gy, v0p5fp8); - - gy = _mm256_and_ps(gy, 
*(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(vImgHf, reflecty_v); - - gy = _mm256_sub_ps(gy, v0p5fp8); - - _mm256_sub_ps(gy, v0p5fp8); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); - __m256i i_offset = _mm256_cvtps_epi32(offset); - for (int q = 0; q < src.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *(__m256*)_ps256_n1); - - _mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = ((sample_x + 1) * src.w - 1) / 2.f; - sample_y = ((sample_y + 1) * src.h - 1) / 2.f; - - sample_x = floor(sample_x + 0.5f); - sample_y = floor(sample_y + 0.5f); - - sample_x = abs(sample_x + 0.5f); - sample_x = src.w - abs(sample_x - src.w) - 0.5; - - sample_y = abs(sample_y + 0.5f); - sample_y = src.h - abs(sample_y - src.h) - 0.5; - - int x0 = std::min(src.w - 1.0f, std::max(sample_x, 0.0f)); - int y0 = std::min(src.h - 1.0f, std::max(sample_y, 0.0f)); - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - - dst.channel(q).row(y)[x / 2] = image.row(y0)[x0]; - } - } - } -} - -static void gridsample_2d_nearest_align1_reflection_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - - // compute coord - { - // x - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(border_x, reflectx_v); - - // y - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(border_y, reflecty_v); - } - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); - __m256i i_offset = _mm256_cvtps_epi32(offset); - for (int q = 0; q < src.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *(__m256*)_ps256_n1); - - 
_mm256_storeu_ps(dst.channel(q).row(y) + x / 2, _v); - } - } - - nn = grid_size & 15; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) - { - float sample_x = gridptr[x]; - float sample_y = gridptr[x + 1]; - - sample_x = (sample_x + 1) / 2.f * (src.w - 1); - sample_y = (sample_y + 1) / 2.f * (src.h - 1); - - sample_x = floor(sample_x + 0.5f); - sample_y = floor(sample_y + 0.5f); - - sample_x = abs(sample_x); - int x0 = (src.w - 1) - abs(sample_x - (src.w - 1)); - - sample_y = abs(sample_y); - int y0 = (src.h - 1) - abs(sample_y - (src.h - 1)); - - for (int q = 0; q < src.c; q++) - { - const Mat& image = src.channel(q); - - dst.channel(q).row(y)[x / 2] = image.row(y0)[x0]; - } - } - } -} - -static void gridsample_3d_nearest_align0_zeros_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h * grid.d; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - //upzip (3) - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - // z - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - - __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), - _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); - v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, gz, _CMP_GT_OS))); - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)); - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < src.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, v_in_range); - - _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); - } - } - - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = ((gx + 1) * src.w - 1) / 2.f; - gy = ((gy + 1) * src.h - 
1) / 2.f; - gz = ((gz + 1) * src.d - 1) / 2.f; - - // bilinear interpolate - int x0 = static_cast(floor(gx + 0.5f)); - int y0 = static_cast(floor(gy + 0.5f)); - int z0 = static_cast(floor(gz + 0.5f)); - - bool v_in_range = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h) && (z0 > -1) && (z0 < src.d); - - for (int q = 0; q < src.c; q++) - { - dst.channel(q).depth(y)[x / 3] = v_in_range ? src.channel(q).depth(z0).row(y0)[x0] : 0; - } - } - } -} - -static void gridsample_3d_nearest_align1_zeros_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h * grid.d; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - // z - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - - __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), - _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); - v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, gz, _CMP_GT_OS))); - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)); - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < src.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, v_in_range); - - _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); - } - } - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = (gx + 1) / 2.f * (src.w - 1); - gy = (gy + 1) / 2.f * (src.h - 1); - gz = (gz + 1) / 2.f * (src.d - 1); - - int x0 = static_cast(floor(gx + 0.5f)); - int y0 = static_cast(floor(gy + 0.5f)); - int z0 = static_cast(floor(gz + 0.5f)); - - bool v_in_range = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h) && (z0 > -1) 
&& (z0 < src.d); - - for (int q = 0; q < src.c; q++) - { - dst.channel(q).depth(y)[x / 3] = v_in_range ? src.channel(q).depth(z0).row(y0)[x0] : 0; - } - } - } -} - -static void gridsample_3d_nearest_align0_border_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h * grid.d; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - // z - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)); - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < src.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *(__m256*)_ps256_n1); - - _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); - } - } - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = ((gx + 1) * src.w - 1) / 2.f; - gy = ((gy + 1) * src.h - 1) / 2.f; - gz = ((gz + 1) * src.d - 1) / 2.f; - - gx = std::min(src.w - 1.0f, std::max(gx, 0.0f)); - gy = std::min(src.h - 1.0f, std::max(gy, 0.0f)); - gz = std::min(src.d - 1.0f, std::max(gz, 0.0f)); - - int x0 = static_cast(floor(gx + 0.5f)); - int y0 = static_cast(floor(gy + 0.5f)); - int z0 = static_cast(floor(gz + 0.5f)); - - for (int q = 0; q < src.c; q++) - { - dst.channel(q).depth(y)[x / 3] = src.channel(q).depth(z0).row(y0)[x0]; - } - } - } -} - -static void gridsample_3d_nearest_align1_border_blob_pack1(const 
Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h * grid.d; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - // z - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); - - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)); - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < src.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *(__m256*)_ps256_n1); - - _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); - } - } - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = (gx + 1) / 2.f * (src.w - 1); - gy = (gy + 1) / 2.f * (src.h - 1); - gz = (gz + 1) / 2.f * (src.d - 1); - - gx = std::min(src.w - 1.0f, std::max(gx, 0.0f)); - gy = std::min(src.h - 1.0f, std::max(gy, 0.0f)); - gz = std::min(src.d - 1.0f, std::max(gz, 0.0f)); - - int x0 = static_cast(floor(gx + 0.5f)); - int y0 = static_cast(floor(gy + 0.5f)); - int z0 = static_cast(floor(gz + 0.5f)); - - for (int q = 0; q < src.c; q++) - { - dst.channel(q).depth(y)[x / 3] = src.channel(q).depth(z0).row(y0)[x0]; - } - } - } -} - -static void gridsample_3d_nearest_align0_reflection_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h * grid.d; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = 
_mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), *(__m256*)_ps256_2); - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - - { - // x - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - __m256 v0p5fp8 = _mm256_set1_ps(0.5f); - gx = _mm256_add_ps(gx, v0p5fp8); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(vImgWf, reflectx_v); - - gx = _mm256_sub_ps(gx, v0p5fp8); - - _mm256_sub_ps(gx, v0p5fp8); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_add_ps(gy, v0p5fp8); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(vImgHf, reflecty_v); - - gy = _mm256_sub_ps(gy, v0p5fp8); - - _mm256_sub_ps(gy, v0p5fp8); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - // z - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_add_ps(gz, v0p5fp8); - - gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, vImgDf), *(__m256*)_ps256_inv_sign_mask); - gz = _mm256_sub_ps(vImgDf, reflectz_v); - - gz = _mm256_sub_ps(gz, v0p5fp8); - - _mm256_sub_ps(gz, v0p5fp8); - - gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); - } - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)); - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < src.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *(__m256*)_ps256_n1); - - _mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); - } - } - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = ((gx + 1) * src.w - 1) / 2.f; - gy = ((gy + 1) * src.h - 1) / 2.f; - gz = 
((gz + 1) * src.d - 1) / 2.f; - - gx = floor(gx + 0.5f); - gy = floor(gy + 0.5f); - gz = floor(gz + 0.5f); - - gx = abs(gx + 0.5f); - gx = src.w - abs(gx - src.w) - 0.5; - - gy = abs(gy + 0.5f); - gy = src.h - abs(gy - src.h) - 0.5; - - gz = abs(gz + 0.5f); - gz = src.d - abs(gz - src.d) - 0.5; - - int x0 = std::min(src.w - 1.0f, std::max(gx, 0.0f)); - int y0 = std::min(src.h - 1.0f, std::max(gy, 0.0f)); - int z0 = std::min(src.d - 1.0f, std::max(gz, 0.0f)); - - for (int q = 0; q < src.c; q++) - { - dst.channel(q).depth(y)[x / 3] = src.channel(q).depth(z0).row(y0)[x0]; - } - } - } -} - -static void gridsample_3d_nearest_align1_reflection_blob_pack1(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const int grid_size = grid.w * grid.h * grid.d; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int nn = grid_size; -#if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 tmp_y = _mm256_loadu_ps(gridptr + x + 8); - __m256 gz = _mm256_loadu_ps(gridptr + x + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - - // compute coord - { - // x - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(border_x, reflectx_v); - - // y - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(border_y, reflecty_v); - - // z - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, border_z), *(__m256*)_ps256_inv_sign_mask); - gz = _mm256_sub_ps(border_z, reflectz_v); - } - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)); - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < src.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, *(__m256*)_ps256_n1); - - 
_mm256_storeu_ps(static_cast(dst.channel(q).depth(y).data) + x / 3, _v); - } - } - nn = grid_size % 24; -#endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) - { - float gx = gridptr[x]; - float gy = gridptr[x + 1]; - float gz = gridptr[x + 2]; - - gx = (gx + 1) / 2.f * (src.w - 1); - gy = (gy + 1) / 2.f * (src.h - 1); - gz = (gz + 1) / 2.f * (src.d - 1); - - gx = floor(gx + 0.5f); - gy = floor(gy + 0.5f); - gz = floor(gz + 0.5f); - - gx = abs(gx); - gx = (src.w - 1) - abs(gx - (src.w - 1)); - - gy = abs(gy); - gy = (src.h - 1) - abs(gy - (src.h - 1)); - - gz = abs(gz); - gz = (src.d - 1) - abs(gz - (src.d - 1)); - - int x0 = std::min(src.w - 1.0f, std::max(gx, 0.0f)); - int y0 = std::min(src.h - 1.0f, std::max(gy, 0.0f)); - int z0 = std::min(src.d - 1.0f, std::max(gz, 0.0f)); - - for (int q = 0; q < src.c; q++) - { - dst.channel(q).depth(y)[x / 3] = src.channel(q).depth(z0).row(y0)[x0]; - } - } - } -} \ No newline at end of file diff --git a/src/layer/x86/gridsample_nearest_pack16.h b/src/layer/x86/gridsample_nearest_pack16.h deleted file mode 100644 index 8969fac63e2..00000000000 --- a/src/layer/x86/gridsample_nearest_pack16.h +++ /dev/null @@ -1,805 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
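For orientation, every nearest-neighbor kernel removed in this part of the patch follows the same per-coordinate recipe that the scalar tail loops above spell out: unnormalize the grid value from [-1, 1] into texel space, round to the nearest texel, then apply the padding rule. The sketch below restates that recipe in plain scalar C; the helper name and the padding enum are hypothetical, and the math simply mirrors the scalar fallback paths shown above.

#include <math.h>

/* illustrative padding codes for this sketch only */
enum { PAD_ZEROS = 0, PAD_BORDER = 1, PAD_REFLECTION = 2 };

static int nearest_sample_coord(float g, int w, int align_corners, int pad)
{
    /* unnormalize from [-1, 1]: align_corners picks texel centers vs. image edges */
    float x = align_corners ? (g + 1.f) / 2.f * (w - 1) : ((g + 1.f) * w - 1.f) / 2.f;

    /* round to the nearest texel */
    x = floorf(x + 0.5f);

    if (pad == PAD_REFLECTION)
    {
        if (align_corners)
        {
            /* reflect across the outer texel centers 0 and w-1 */
            x = fabsf(x);
            x = (w - 1) - fabsf(x - (w - 1));
        }
        else
        {
            /* reflect across the image edges at -0.5 and w-0.5 */
            x = fabsf(x + 0.5f);
            x = w - fabsf(x - w) - 0.5f;
        }
    }

    if (pad != PAD_ZEROS)
    {
        /* border (and the tail of reflection) clamps into [0, w-1];
           zeros keeps the raw index and masks out-of-range gathers at load time */
        if (x < 0.f) x = 0.f;
        if (x > w - 1.f) x = w - 1.f;
    }

    return (int)x;
}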
- -static void gridsample_2d_nearest_align0_zeros_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); - - // y - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - } - - gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - - __m512i ix = _mm512_cvtps_epi32(gx); - __m512i iy = _mm512_cvtps_epi32(gy); - - __mmask16 v_in_range = (_mm512_cmpgt_epi32_mask(ix, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, ix)) & (_mm512_cmpgt_epi32_mask(iy, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, iy)); - - __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - - for (int q = 0; q < dst.c; q++) - { - __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v_in_range, i_offset, src.channel(q), sizeof(float)); - - _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_nearest_align1_zeros_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - // x - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); - - // y - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - } - - gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - - __m512i ix = _mm512_cvtps_epi32(gx); - __m512i iy = _mm512_cvtps_epi32(gy); - - __mmask16 v_in_range = (_mm512_cmpgt_epi32_mask(ix, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, ix)) & (_mm512_cmpgt_epi32_mask(iy, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, iy)); - - __m512i i_offset = 
_mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - - for (int q = 0; q < dst.c; q++) - { - __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v_in_range, i_offset, src.channel(q), sizeof(float)); - - _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_nearest_align0_border_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); - - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - - gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); - - // y - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - - gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); - } - - gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - - __m512i ix = _mm512_cvtps_epi32(gx); - __m512i iy = _mm512_cvtps_epi32(gy); - - __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - - for (int q = 0; q < dst.c; q++) - { - __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, i_offset, src.channel(q), sizeof(float)); - - _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_nearest_align1_border_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - // x - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); - - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - - gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); - - // y - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, 
*(__m512*)_ps512_1)); - - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - - gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); - } - - gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - - __m512i ix = _mm512_cvtps_epi32(gx); - __m512i iy = _mm512_cvtps_epi32(gy); - - __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - - for (int q = 0; q < dst.c; q++) - { - __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, i_offset, src.channel(q), sizeof(float)); - - _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_nearest_align0_reflection_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - - const __m512 two = _mm512_set1_ps(2.f); - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - - gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - - // compute coord - { - // x - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - - __m512 v0p5fp16 = _mm512_set1_ps(0.5f); - gx = _mm512_add_ps(gx, v0p5fp16); - - gx = _mm512_and_ps(gx, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflectx_v = _mm512_and_ps(_mm512_sub_ps(gx, vImgWf), *(__m512*)_ps512_inv_sign_mask); - gx = _mm512_sub_ps(vImgWf, reflectx_v); - - gx = _mm512_sub_ps(gx, v0p5fp16); - - _mm512_sub_ps(gx, v0p5fp16); - - gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); - - // y - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - - gy = _mm512_add_ps(gy, v0p5fp16); - - gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflecty_v = _mm512_and_ps(_mm512_sub_ps(gy, vImgHf), *(__m512*)_ps512_inv_sign_mask); - gy = _mm512_sub_ps(vImgHf, reflecty_v); - - gy = _mm512_sub_ps(gy, v0p5fp16); - - _mm512_sub_ps(gy, v0p5fp16); - - gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); - } - - __m512i ix = _mm512_cvtps_epi32(gx); - __m512i iy = _mm512_cvtps_epi32(gy); - - __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - - for (int q = 0; q < dst.c; q++) - { - __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, i_offset, src.channel(q), sizeof(float)); - - _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void 
gridsample_2d_nearest_align1_reflection_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - - const __m512 two = _mm512_set1_ps(2.f); - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - - gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - - // compute coord - { - // x - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - - gx = _mm512_and_ps(gx, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflectx_v = _mm512_and_ps(_mm512_sub_ps(gx, border_x), *(__m512*)_ps512_inv_sign_mask); - gx = _mm512_sub_ps(border_x, reflectx_v); - - // y - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - - gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflecty_v = _mm512_and_ps(_mm512_sub_ps(gy, border_y), *(__m512*)_ps512_inv_sign_mask); - gy = _mm512_sub_ps(border_y, reflecty_v); - } - - __m512i ix = _mm512_cvtps_epi32(gx); - __m512i iy = _mm512_cvtps_epi32(gy); - - __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix), vElempacki), - _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - - for (int q = 0; q < dst.c; q++) - { - __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, i_offset, src.channel(q), sizeof(float)); - - _mm512_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_3d_nearest_align0_zeros_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512 vImgDf = _mm512_set1_ps(src.d); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - const __m512i vImgDi = _mm512_set1_epi32(src.d); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); - - // y - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - - // z - gz = 
_mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), vImgDf, *(__m512*)_ps512_1), two); - } - - gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - gz = _mm512_roundscale_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - - __m512i ix = _mm512_cvtps_epi32(gx); - __m512i iy = _mm512_cvtps_epi32(gy); - __m512i iz = _mm512_cvtps_epi32(gz); - - __mmask16 v_in_range = (_mm512_cmpgt_epi32_mask(ix, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, ix)) & (_mm512_cmpgt_epi32_mask(iy, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, iy)); - v_in_range = v_in_range & (_mm512_cmpgt_epi32_mask(iz, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgDi, iz)); - - __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), iz), _mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - - for (int q = 0; q < dst.c; q++) - { - __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v_in_range, i_offset, src.channel(q), sizeof(float)); - - _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_nearest_align1_zeros_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512 vImgDf = _mm512_set1_ps(src.d); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - const __m512i vImgDi = _mm512_set1_epi32(src.d); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - // x - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); - - // y - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - - // z - gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1)); - } - - gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - gz = _mm512_roundscale_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - - __m512i ix = _mm512_cvtps_epi32(gx); - __m512i iy = _mm512_cvtps_epi32(gy); - __m512i iz = _mm512_cvtps_epi32(gz); - - __mmask16 v_in_range = (_mm512_cmpgt_epi32_mask(ix, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgWi, ix)) & (_mm512_cmpgt_epi32_mask(iy, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgHi, iy)); - v_in_range = v_in_range & (_mm512_cmpgt_epi32_mask(iz, *(__m512i*)_pi32_512_n1) & _mm512_cmpgt_epi32_mask(vImgDi, iz)); - - __m512i i_offset = 
_mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), iz), _mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - - for (int q = 0; q < dst.c; q++) - { - __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), v_in_range, i_offset, src.channel(q), sizeof(float)); - - _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_nearest_align0_border_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512 vImgDf = _mm512_set1_ps(src.d); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - // x - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); - - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - - gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); - - // y - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - - gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); - - // z - gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), vImgDf, *(__m512*)_ps512_1), two); - - const __m512 border_z = _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1); - - gz = _mm512_min_ps(border_z, _mm512_max_ps(gz, _mm512_setzero_ps())); - } - - gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - gz = _mm512_roundscale_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - - __m512i ix = _mm512_cvtps_epi32(gx); - __m512i iy = _mm512_cvtps_epi32(gy); - __m512i iz = _mm512_cvtps_epi32(gz); - - __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), iz), _mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - - for (int q = 0; q < dst.c; q++) - { - __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, i_offset, src.channel(q), sizeof(float)); - - _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_nearest_align1_border_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512 vImgDf = _mm512_set1_ps(src.d); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - - 
const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m512 two = _mm512_set1_ps(2.f); - - // x - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); - - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - - gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); - - // y - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - - gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); - - // z - gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1)); - - const __m512 border_z = _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1); - - gz = _mm512_min_ps(border_z, _mm512_max_ps(gz, _mm512_setzero_ps())); - } - - gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - gz = _mm512_roundscale_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - - __m512i ix = _mm512_cvtps_epi32(gx); - __m512i iy = _mm512_cvtps_epi32(gy); - __m512i iz = _mm512_cvtps_epi32(gz); - - __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), iz), _mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - - for (int q = 0; q < dst.c; q++) - { - __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, i_offset, src.channel(q), sizeof(float)); - - _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_nearest_align0_reflection_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512 vImgDf = _mm512_set1_ps(src.d); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); - - const __m512 two = _mm512_set1_ps(2.f); - gx = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), vImgWf, *(__m512*)_ps512_1), two); - gy = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gy, *(__m512*)_ps512_1), vImgHf, *(__m512*)_ps512_1), two); - gz = _mm512_div_ps(_mm512_fmsub_ps(_mm512_add_ps(gz, 
*(__m512*)_ps512_1), vImgDf, *(__m512*)_ps512_1), two); - - gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - gz = _mm512_roundscale_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - - // compute coord - { - // x - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - - __m512 v0p5fp16 = _mm512_set1_ps(0.5f); - gx = _mm512_add_ps(gx, v0p5fp16); - - gx = _mm512_and_ps(gx, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflectx_v = _mm512_and_ps(_mm512_sub_ps(gx, vImgWf), *(__m512*)_ps512_inv_sign_mask); - gx = _mm512_sub_ps(vImgWf, reflectx_v); - - gx = _mm512_sub_ps(gx, v0p5fp16); - - _mm512_sub_ps(gx, v0p5fp16); - - gx = _mm512_min_ps(border_x, _mm512_max_ps(gx, _mm512_setzero_ps())); - - // y - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - - gy = _mm512_add_ps(gy, v0p5fp16); - - gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflecty_v = _mm512_and_ps(_mm512_sub_ps(gy, vImgHf), *(__m512*)_ps512_inv_sign_mask); - gy = _mm512_sub_ps(vImgHf, reflecty_v); - - gy = _mm512_sub_ps(gy, v0p5fp16); - - _mm512_sub_ps(gy, v0p5fp16); - - gy = _mm512_min_ps(border_y, _mm512_max_ps(gy, _mm512_setzero_ps())); - - // z - const __m512 border_z = _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1); - - gz = _mm512_add_ps(gz, v0p5fp16); - - gz = _mm512_and_ps(gz, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflectz_v = _mm512_and_ps(_mm512_sub_ps(gz, vImgDf), *(__m512*)_ps512_inv_sign_mask); - gz = _mm512_sub_ps(vImgDf, reflectz_v); - - gz = _mm512_sub_ps(gz, v0p5fp16); - - _mm512_sub_ps(gz, v0p5fp16); - - gz = _mm512_min_ps(border_z, _mm512_max_ps(gz, _mm512_setzero_ps())); - } - - __m512i ix = _mm512_cvtps_epi32(gx); - __m512i iy = _mm512_cvtps_epi32(gy); - __m512i iz = _mm512_cvtps_epi32(gz); - - __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), iz), _mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - - for (int q = 0; q < dst.c; q++) - { - __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, i_offset, src.channel(q), sizeof(float)); - - _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_nearest_align1_reflection_blob_pack16(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m512 vImgWf = _mm512_set1_ps(src.w); - const __m512 vImgHf = _mm512_set1_ps(src.h); - const __m512 vImgDf = _mm512_set1_ps(src.d); - const __m512i vImgWi = _mm512_set1_epi32(src.w); - const __m512i vImgHi = _mm512_set1_epi32(src.h); - - const __m512i vElempacki = _mm512_set1_epi32(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m512 gx = _mm512_set1_ps(gridptr[0]); - __m512 gy = _mm512_set1_ps(gridptr[grid.elempack]); - __m512 gz = _mm512_set1_ps(gridptr[grid.elempack * 2]); - - const __m512 two = _mm512_set1_ps(2.f); - gx = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gx, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1)); - gy = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gy, 
*(__m512*)_ps512_1), two), _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1)); - gz = _mm512_mul_ps(_mm512_div_ps(_mm512_add_ps(gz, *(__m512*)_ps512_1), two), _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1)); - - gx = _mm512_roundscale_ps(_mm512_add_ps(gx, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - gy = _mm512_roundscale_ps(_mm512_add_ps(gy, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - gz = _mm512_roundscale_ps(_mm512_add_ps(gz, _mm512_set1_ps(0.5f)), _MM_FROUND_TO_NEG_INF); - - // compute coord - { - // x - const __m512 border_x = _mm512_sub_ps(vImgWf, *(__m512*)_ps512_1); - - gx = _mm512_and_ps(gx, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflectx_v = _mm512_and_ps(_mm512_sub_ps(gx, border_x), *(__m512*)_ps512_inv_sign_mask); - gx = _mm512_sub_ps(border_x, reflectx_v); - - // y - const __m512 border_y = _mm512_sub_ps(vImgHf, *(__m512*)_ps512_1); - - gy = _mm512_and_ps(gy, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflecty_v = _mm512_and_ps(_mm512_sub_ps(gy, border_y), *(__m512*)_ps512_inv_sign_mask); - gy = _mm512_sub_ps(border_y, reflecty_v); - - // z - const __m512 border_z = _mm512_sub_ps(vImgDf, *(__m512*)_ps512_1); - - gz = _mm512_and_ps(gz, *(__m512*)_ps512_inv_sign_mask); - - __m512 reflectz_v = _mm512_and_ps(_mm512_sub_ps(gz, border_z), *(__m512*)_ps512_inv_sign_mask); - gz = _mm512_sub_ps(border_z, reflectz_v); - } - - __m512i ix = _mm512_cvtps_epi32(gx); - __m512i iy = _mm512_cvtps_epi32(gy); - __m512i iz = _mm512_cvtps_epi32(gz); - - __m512i i_offset = _mm512_add_epi32(_mm512_mullo_epi32(_mm512_add_epi32(_mm512_mullo_epi32(_mm512_mullo_epi32(vImgWi, vImgHi), iz), _mm512_add_epi32(_mm512_mullo_epi32(iy, vImgWi), ix)), vElempacki), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - - for (int q = 0; q < dst.c; q++) - { - __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), 65535, i_offset, src.channel(q), sizeof(float)); - - _mm512_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} \ No newline at end of file diff --git a/src/layer/x86/gridsample_nearest_pack4.h b/src/layer/x86/gridsample_nearest_pack4.h deleted file mode 100644 index 6d44dd9b822..00000000000 --- a/src/layer/x86/gridsample_nearest_pack4.h +++ /dev/null @@ -1,799 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
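The pack4 kernels that follow use the same addressing scheme as the pack16 ones above: each output position turns its integer texel coordinates into a per-lane offset (iy * w + ix) * elempack + lane, and under zeros padding an in-range mask zeroes out-of-bounds gathers instead of clamping them. Below is a scalar sketch of that addressing with hypothetical names, covering the 2-D case only (the 3-D kernels additionally add (w * h) * iz into the offset).

/* Scalar sketch of the packed gather used by the gridsample_*_pack* kernels.
   Each lane of a packed pixel reads channel[(iy * w + ix) * elempack + lane];
   with zeros padding an out-of-range (ix, iy) contributes 0.f instead. */
static float sample_packed_nearest(const float* channel, int w, int h,
                                   int elempack, int ix, int iy, int lane)
{
    const int in_range = ix > -1 && ix < w && iy > -1 && iy < h;
    if (!in_range)
        return 0.f; /* masked-out lane under zeros padding */

    return channel[(iy * w + ix) * elempack + lane];
}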
- -static void gridsample_2d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - - const __m128 vElempackf = _mm_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - // x - gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); - - // y - gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - } - - gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - - __m128i ix = _mm_cvtps_epi32(gx); - __m128i iy = _mm_cvtps_epi32(gy); - - __m128i v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), - _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); - - __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx), vElempackf), - _mm_set_ps(3, 2, 1, 0)); - __m128i i_offset = _mm_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_castsi128_ps(v_in_range)); - - _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - - const __m128 vElempackf = _mm_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - // x - gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); - - // y - gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - } - - gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - - __m128i ix = _mm_cvtps_epi32(gx); - __m128i iy = _mm_cvtps_epi32(gy); - - __m128i v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), - _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); - - __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx), vElempackf), - _mm_set_ps(3, 2, 1, 0)); - __m128i i_offset = _mm_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_castsi128_ps(v_in_range)); - - _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_nearest_align0_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf 
= _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - - const __m128 vElempackf = _mm_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - // x - gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); - - const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - - gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - - // y - gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - - const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - - gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - } - - gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - - __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx), vElempackf), - _mm_set_ps(3, 2, 1, 0)); - __m128i i_offset = _mm_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); - - _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_nearest_align1_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - - const __m128 vElempackf = _mm_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - // x - gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); - - const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - - gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - - // y - gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - - const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - - gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - } - - gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - - __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx), vElempackf), - _mm_set_ps(3, 2, 1, 0)); - __m128i i_offset = _mm_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); - - _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_nearest_align0_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - - const __m128 vElempackf = _mm_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m128 gx = 
_mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - - const __m128 two = _mm_set1_ps(2.f); - gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); - gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - - gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - - // compute coord - { - // x - const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - - __m128 v0p5fp4 = _mm_set1_ps(0.5f); - gx = _mm_add_ps(gx, v0p5fp4); - - gx = _mm_and_ps(gx, *(__m128*)_ps_inv_sign_mask); - - __m128 reflectx_v = _mm_and_ps(_mm_sub_ps(gx, vImgWf), *(__m128*)_ps_inv_sign_mask); - gx = _mm_sub_ps(vImgWf, reflectx_v); - - gx = _mm_sub_ps(gx, v0p5fp4); - - _mm_sub_ps(gx, v0p5fp4); - - gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - - // y - const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - - gy = _mm_add_ps(gy, v0p5fp4); - - gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); - - __m128 reflecty_v = _mm_and_ps(_mm_sub_ps(gy, vImgHf), *(__m128*)_ps_inv_sign_mask); - gy = _mm_sub_ps(vImgHf, reflecty_v); - - gy = _mm_sub_ps(gy, v0p5fp4); - - _mm_sub_ps(gy, v0p5fp4); - - gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - } - - __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx), vElempackf), - _mm_set_ps(3, 2, 1, 0)); - __m128i i_offset = _mm_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); - - _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_nearest_align1_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - - const __m128 vElempackf = _mm_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - - const __m128 two = _mm_set1_ps(2.f); - gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); - gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - - gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - - // compute coord - { - // x - const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - - gx = _mm_and_ps(gx, *(__m128*)_ps_inv_sign_mask); - - __m128 reflectx_v = _mm_and_ps(_mm_sub_ps(gx, border_x), *(__m128*)_ps_inv_sign_mask); - gx = _mm_sub_ps(border_x, reflectx_v); - - // y - const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - - gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); - - __m128 reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps_inv_sign_mask); - gy = _mm_sub_ps(border_y, reflecty_v); - } - - __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(gy, vImgWf), gx), vElempackf), - _mm_set_ps(3, 2, 1, 0)); - __m128i i_offset = _mm_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); - - _mm_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_3d_nearest_align0_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, 
const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128 vImgDf = _mm_set1_ps(src.d); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - const __m128i vImgDi = _mm_set1_epi32(src.d); - - const __m128 vElempackf = _mm_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - // x - gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); - - // y - gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - - // z - gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); - } - - gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - gz = floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); - - __m128i ix = _mm_cvtps_epi32(gx); - __m128i iy = _mm_cvtps_epi32(gy); - __m128i iz = _mm_cvtps_epi32(gz); - - __m128i v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), - _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); - v_in_range = _mm_and_si128(v_in_range, _mm_and_si128(_mm_cmpgt_epi32(iz, vn1ip4), _mm_cmpgt_epi32(vImgDi, iz))); - - __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), - _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)), - vElempackf), - _mm_set_ps(3, 2, 1, 0)); - __m128i i_offset = _mm_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_castsi128_ps(v_in_range)); - - _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_nearest_align1_zeros_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128 vImgDf = _mm_set1_ps(src.d); - const __m128i vImgWi = _mm_set1_epi32(src.w); - const __m128i vImgHi = _mm_set1_epi32(src.h); - const __m128i vImgDi = _mm_set1_epi32(src.d); - - const __m128 vElempackf = _mm_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - // x - gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); - - // y - gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - - // z - gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); - } - - gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - gz = floor_ps(_mm_add_ps(gz, 
_mm_set1_ps(0.5f))); - - __m128i ix = _mm_cvtps_epi32(gx); - __m128i iy = _mm_cvtps_epi32(gy); - __m128i iz = _mm_cvtps_epi32(gz); - - __m128i v_in_range = _mm_and_si128(_mm_and_si128(_mm_cmpgt_epi32(ix, vn1ip4), _mm_cmpgt_epi32(vImgWi, ix)), - _mm_and_si128(_mm_cmpgt_epi32(iy, vn1ip4), _mm_cmpgt_epi32(vImgHi, iy))); - v_in_range = _mm_and_si128(v_in_range, _mm_and_si128(_mm_cmpgt_epi32(iz, vn1ip4), _mm_cmpgt_epi32(vImgDi, iz))); - - __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), - _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)), - vElempackf), - _mm_set_ps(3, 2, 1, 0)); - __m128i i_offset = _mm_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_castsi128_ps(v_in_range)); - - _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_nearest_align0_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128 vImgDf = _mm_set1_ps(src.d); - - const __m128 vElempackf = _mm_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - // x - gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); - - const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - - gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - - // y - gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - - const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - - gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - - // z - gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); - - const __m128 border_z = _mm_sub_ps(vImgDf, v1fp4); - - gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); - } - - gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - gz = floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); - - __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), - _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)), - vElempackf), - _mm_set_ps(3, 2, 1, 0)); - __m128i i_offset = _mm_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); - - _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_nearest_align1_border_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128 vImgDf = _mm_set1_ps(src.d); - - const __m128 vElempackf = _mm_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - 
__m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m128 two = _mm_set1_ps(2.f); - - // x - gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); - - const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - - gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - - // y - gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - - const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - - gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - - // z - gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); - - const __m128 border_z = _mm_sub_ps(vImgDf, v1fp4); - - gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); - } - - gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - gz = floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); - - __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), - _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)), - vElempackf), - _mm_set_ps(3, 2, 1, 0)); - __m128i i_offset = _mm_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); - - _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_nearest_align0_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128 vImgDf = _mm_set1_ps(src.d); - - const __m128 vElempackf = _mm_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - - const __m128 two = _mm_set1_ps(2.f); - gx = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gx, v1fp4), vImgWf, v1fp4), two); - gy = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gy, v1fp4), vImgHf, v1fp4), two); - gz = _mm_div_ps(_mm_comp_fmsub_ps(_mm_add_ps(gz, v1fp4), vImgDf, v1fp4), two); - - gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - gz = floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); - - // compute coord - { - // x - const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - - __m128 v0p5fp4 = _mm_set1_ps(0.5f); - gx = _mm_add_ps(gx, v0p5fp4); - - gx = _mm_and_ps(gx, *(__m128*)_ps_inv_sign_mask); - - __m128 reflectx_v = _mm_and_ps(_mm_sub_ps(gx, vImgWf), *(__m128*)_ps_inv_sign_mask); - gx = _mm_sub_ps(vImgWf, reflectx_v); - - gx = _mm_sub_ps(gx, v0p5fp4); - - _mm_sub_ps(gx, v0p5fp4); - - gx = _mm_min_ps(border_x, _mm_max_ps(gx, _mm_setzero_ps())); - - // y - const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - - gy = _mm_add_ps(gy, v0p5fp4); - - gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); - - __m128 reflecty_v = _mm_and_ps(_mm_sub_ps(gy, vImgHf), *(__m128*)_ps_inv_sign_mask); - gy = _mm_sub_ps(vImgHf, reflecty_v); - - gy = _mm_sub_ps(gy, v0p5fp4); - - _mm_sub_ps(gy, v0p5fp4); - - gy = _mm_min_ps(border_y, _mm_max_ps(gy, _mm_setzero_ps())); - - // z - const __m128 
border_z = _mm_sub_ps(vImgDf, v1fp4); - - gz = _mm_add_ps(gz, v0p5fp4); - - gz = _mm_and_ps(gz, *(__m128*)_ps_inv_sign_mask); - - __m128 reflectz_v = _mm_and_ps(_mm_sub_ps(gz, vImgDf), *(__m128*)_ps_inv_sign_mask); - gz = _mm_sub_ps(vImgDf, reflectz_v); - - gz = _mm_sub_ps(gz, v0p5fp4); - - _mm_sub_ps(gz, v0p5fp4); - - gz = _mm_min_ps(border_z, _mm_max_ps(gz, _mm_setzero_ps())); - } - - __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), - _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)), - vElempackf), - _mm_set_ps(3, 2, 1, 0)); - __m128i i_offset = _mm_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); - - _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_nearest_align1_reflection_blob_pack4(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m128 vImgWf = _mm_set1_ps(src.w); - const __m128 vImgHf = _mm_set1_ps(src.h); - const __m128 vImgDf = _mm_set1_ps(src.d); - - const __m128 vElempackf = _mm_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m128 gx = _mm_set1_ps(gridptr[0]); - __m128 gy = _mm_set1_ps(gridptr[grid.elempack]); - __m128 gz = _mm_set1_ps(gridptr[grid.elempack * 2]); - - const __m128 two = _mm_set1_ps(2.f); - gx = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gx, v1fp4), two), _mm_sub_ps(vImgWf, v1fp4)); - gy = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gy, v1fp4), two), _mm_sub_ps(vImgHf, v1fp4)); - gz = _mm_mul_ps(_mm_div_ps(_mm_add_ps(gz, v1fp4), two), _mm_sub_ps(vImgDf, v1fp4)); - - gx = floor_ps(_mm_add_ps(gx, _mm_set1_ps(0.5f))); - gy = floor_ps(_mm_add_ps(gy, _mm_set1_ps(0.5f))); - gz = floor_ps(_mm_add_ps(gz, _mm_set1_ps(0.5f))); - - // compute coord - { - // x - const __m128 border_x = _mm_sub_ps(vImgWf, v1fp4); - - gx = _mm_and_ps(gx, *(__m128*)_ps_inv_sign_mask); - - __m128 reflectx_v = _mm_and_ps(_mm_sub_ps(gx, border_x), *(__m128*)_ps_inv_sign_mask); - gx = _mm_sub_ps(border_x, reflectx_v); - - // y - const __m128 border_y = _mm_sub_ps(vImgHf, v1fp4); - - gy = _mm_and_ps(gy, *(__m128*)_ps_inv_sign_mask); - - __m128 reflecty_v = _mm_and_ps(_mm_sub_ps(gy, border_y), *(__m128*)_ps_inv_sign_mask); - gy = _mm_sub_ps(border_y, reflecty_v); - - // z - const __m128 border_z = _mm_sub_ps(vImgDf, v1fp4); - - gz = _mm_and_ps(gz, *(__m128*)_ps_inv_sign_mask); - - __m128 reflectz_v = _mm_and_ps(_mm_sub_ps(gz, border_z), *(__m128*)_ps_inv_sign_mask); - gz = _mm_sub_ps(border_z, reflectz_v); - } - - __m128 offset = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_mul_ps(vImgWf, vImgHf), gz), - _mm_add_ps(_mm_mul_ps(gy, vImgWf), gx)), - vElempackf), - _mm_set_ps(3, 2, 1, 0)); - __m128i i_offset = _mm_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m128 _v = mask_gather_ps(src.channel(q), i_offset, _mm_set1_ps(-1.0f)); - - _mm_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} \ No newline at end of file diff --git a/src/layer/x86/gridsample_nearest_pack8.h b/src/layer/x86/gridsample_nearest_pack8.h deleted file mode 100644 index a8baf3bc1dc..00000000000 --- a/src/layer/x86/gridsample_nearest_pack8.h +++ /dev/null @@ -1,787 +0,0 @@ -// Tencent is pleased to support the open 
source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -static void gridsample_2d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - - __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), - _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, v_in_range); - - _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - - __m256 v_in_range = 
_mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), - _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, v_in_range); - - _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_nearest_align0_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_set1_ps(-1.0f)); - - _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_nearest_align1_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - 
- const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_set1_ps(-1.0f)); - - _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_nearest_align0_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - - const __m256 two = _mm256_set1_ps(2.f); - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - - // compute coord - { - // x - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - __m256 v0p5fp8 = _mm256_set1_ps(0.5f); - gx = _mm256_add_ps(gx, v0p5fp8); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(vImgWf, reflectx_v); - - gx = _mm256_sub_ps(gx, v0p5fp8); - - _mm256_sub_ps(gx, v0p5fp8); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_add_ps(gy, v0p5fp8); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(vImgHf, reflecty_v); - - gy = _mm256_sub_ps(gy, v0p5fp8); - - _mm256_sub_ps(gy, v0p5fp8); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - } - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_set1_ps(-1.0f)); - - _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_2d_nearest_align1_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < dst.h; y++) - { - for (int x 
= 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(y / grid.elempack).row(x) + y % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - - const __m256 two = _mm256_set1_ps(2.f); - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - - // compute coord - { - // x - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(border_x, reflectx_v); - - // y - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(border_y, reflecty_v); - } - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_set1_ps(-1.0f)); - - _mm256_storeu_ps(dst.channel(q).row(y) + x * dst.elempack, _v); - } - } - } -} - -static void gridsample_3d_nearest_align0_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); - - // z - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), two); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - - __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), - _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); - v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, gz, _CMP_GT_OS))); - - __m256 offset = 
_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), - vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, v_in_range); - - _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_nearest_align1_zeros_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - // z - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - - __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), - _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); - v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, gz, _CMP_GT_OS))); - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), - vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, v_in_range); - - _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_nearest_align0_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - 
__m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - // x - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - // z - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), two); - - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), - vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_set1_ps(-1.0f)); - - _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_nearest_align1_border_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); - - // compute coord - { - const __m256 two = _mm256_set1_ps(2.f); - - // x - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - // z - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); - - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - - __m256 offset = 
_mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), - vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_set1_ps(-1.0f)); - - _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_nearest_align0_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); - - const __m256 two = _mm256_set1_ps(2.f); - gx = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), vImgWf, *(__m256*)_ps256_1), two); - gy = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), vImgHf, *(__m256*)_ps256_1), two); - gz = _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), vImgDf, *(__m256*)_ps256_1), two); - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - - // compute coord - { - // x - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - __m256 v0p5fp8 = _mm256_set1_ps(0.5f); - gx = _mm256_add_ps(gx, v0p5fp8); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, vImgWf), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(vImgWf, reflectx_v); - - gx = _mm256_sub_ps(gx, v0p5fp8); - - _mm256_sub_ps(gx, v0p5fp8); - - gx = _mm256_min_ps(border_x, _mm256_max_ps(gx, _mm256_setzero_ps())); - - // y - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_add_ps(gy, v0p5fp8); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, vImgHf), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(vImgHf, reflecty_v); - - gy = _mm256_sub_ps(gy, v0p5fp8); - - _mm256_sub_ps(gy, v0p5fp8); - - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); - - // z - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_add_ps(gz, v0p5fp8); - - gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, vImgDf), *(__m256*)_ps256_inv_sign_mask); - gz = _mm256_sub_ps(vImgDf, reflectz_v); - - gz = _mm256_sub_ps(gz, v0p5fp8); - - _mm256_sub_ps(gz, v0p5fp8); - - gz = _mm256_min_ps(border_z, _mm256_max_ps(gz, _mm256_setzero_ps())); - } - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), - vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - 
__m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_set1_ps(-1.0f)); - - _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} - -static void gridsample_3d_nearest_align1_reflection_blob_pack8(const Mat& src, Mat& dst, const Mat& grid, const Option& opt) -{ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vImgDf = _mm256_set1_ps(src.d); - - const __m256 vElempackf = _mm256_set1_ps(src.elempack); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int z = 0; z < dst.d; z++) - { - for (int y = 0; y < dst.h; y++) - { - for (int x = 0; x < dst.w; x++) - { - //grid tensor has been packed - const float* gridptr = grid.channel(z / grid.elempack).depth(y).row(x) + z % grid.elempack; - __m256 gx = _mm256_set1_ps(gridptr[0]); - __m256 gy = _mm256_set1_ps(gridptr[grid.elempack]); - __m256 gz = _mm256_set1_ps(gridptr[grid.elempack * 2]); - - const __m256 two = _mm256_set1_ps(2.f); - gx = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gx, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1)); - gy = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gy, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1)); - gz = _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(gz, *(__m256*)_ps256_1), two), _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1)); - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - - // compute coord - { - // x - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); - - gx = _mm256_and_ps(gx, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(gx, border_x), *(__m256*)_ps256_inv_sign_mask); - gx = _mm256_sub_ps(border_x, reflectx_v); - - // y - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - - gy = _mm256_and_ps(gy, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflecty_v = _mm256_and_ps(_mm256_sub_ps(gy, border_y), *(__m256*)_ps256_inv_sign_mask); - gy = _mm256_sub_ps(border_y, reflecty_v); - - // z - const __m256 border_z = _mm256_sub_ps(vImgDf, *(__m256*)_ps256_1); - - gz = _mm256_and_ps(gz, *(__m256*)_ps256_inv_sign_mask); - - __m256 reflectz_v = _mm256_and_ps(_mm256_sub_ps(gz, border_z), *(__m256*)_ps256_inv_sign_mask); - gz = _mm256_sub_ps(border_z, reflectz_v); - } - - __m256 offset = _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vImgWf, vImgHf), gz), - _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx)), - vElempackf), - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); - - __m256i i_offset = _mm256_cvtps_epi32(offset); - - for (int q = 0; q < dst.c; q++) - { - __m256 _v = mask_gather_ps256(src.channel(q), i_offset, _mm256_set1_ps(-1.0f)); - - _mm256_storeu_ps(dst.channel(q).depth(z).row(y) + x * dst.elempack, _v); - } - } - } - } -} \ No newline at end of file diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 30170d31a62..26e125a54ec 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -1,6 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. 
// // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at @@ -38,21 +38,131 @@ GridSample_x86::GridSample_x86() #if __SSE2__ #if __AVX__ -#if __AVX512F__ - -_PS512_CONST(n1, -1.0f); -_PI32_CONST512(n1, -1); - -#include "gridsample_bilinear_pack16.h" -#include "gridsample_nearest_pack16.h" -#include "gridsample_bicubic_pack16.h" - -#endif // __AVX512F__ _PS256_CONST(n1, -1.0f); _PS256_CONST(2, 2.0f); _PI32_CONST256(n1, -1); +using PaddingMode = ncnn::GridSample::PaddingMode; + +template +struct grid_sample_unormalize; + +template<> +struct grid_sample_unormalize +{ +#if __AVX__ + __m256 operator()(__m256 length, __m256 coord) + { + return _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(coord, *(__m256*)_ps256_1), length, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + } +#endif // __AVX__ + float operator()(int length, float coord) + { + return (coord + 1) / 2.f * (length - 1); + } +}; + +template<> +struct grid_sample_unormalize +{ +#if __AVX__ + __m256 operator()(__m256 length, __m256 coord) + { + return _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(coord, *(__m256*)_ps256_1), length, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + } +#endif // __AVX__ + float operator()(int length, float coord) + { + return ((coord + 1) * length - 1) / 2.f; + } +}; + +template +struct compute_coord; + +template +struct compute_coord +{ +#if __AVX__ + __m256 operator()(__m256 length, __m256 coord) + { + const __m256 border_x = _mm256_sub_ps(length, *(__m256*)_ps256_1); + + coord = _mm256_min_ps(border_x, _mm256_max_ps(coord, _mm256_setzero_ps())); + + return coord; + } +#endif // __AVX__ + float operator()(int length, float coord) + { + return std::min(length - 1.0f, std::max(coord, 0.0f)); + } +}; + +template<> +struct compute_coord +{ +#if __AVX__ + __m256 operator()(__m256 length, __m256 coord) + { + const __m256 border_x = _mm256_sub_ps(length, *(__m256*)_ps256_1); + + coord = _mm256_and_ps(coord, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(coord, border_x), *(__m256*)_ps256_inv_sign_mask); + coord = _mm256_sub_ps(border_x, reflectx_v); + + return coord; + } +#endif // __AVX__ + float operator()(int length, float coord) + { + coord = abs(coord); + coord = (length - 1) - abs(coord - (length - 1)); + + return std::min(length - 1.0f, std::max(coord, 0.0f)); + } +}; + +template<> +struct compute_coord +{ +#if __AVX__ + __m256 operator()(__m256 length, __m256 coord) + { + const __m256 border_x = _mm256_sub_ps(length, *(__m256*)_ps256_1); + + __m256 v0p5fp8 = _mm256_set1_ps(0.5f); + coord = _mm256_add_ps(coord, v0p5fp8); + + coord = _mm256_and_ps(coord, *(__m256*)_ps256_inv_sign_mask); + + __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(coord, length), *(__m256*)_ps256_inv_sign_mask); + coord = _mm256_sub_ps(length, reflectx_v); + + coord = _mm256_sub_ps(coord, v0p5fp8); + + _mm256_sub_ps(coord, v0p5fp8); + + coord = _mm256_min_ps(border_x, _mm256_max_ps(coord, _mm256_setzero_ps())); + + return coord; + } +#endif // __AVX__ + float operator()(int length, float coord) + { + coord = abs(coord + 0.5f); + coord = length - abs(coord - length) - 0.5; + + return std::min(length - 1.0f, std::max(coord, 0.0f)); + } +}; + +#include "gridsample_bilinear_compute_blob.h" +#include "gridsample_bicubic_compute_blob.h" +#include "gridsample_nearest_compute_blob.h" + static NCNN_FORCEINLINE __m256 mask_gather_ps256(const float* ptr, __m256i offset, __m256 mask) { #if __AVX2__ 
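Aside on the hunk above: the grid_sample_unormalize and compute_coord functors carry the whole coordinate story, first mapping the normalized [-1, 1] grid value into image space and then resolving the padding mode. A minimal stand-alone scalar sketch of the same arithmetic follows; it is an illustration only, not part of the patch, and the helper names (unormalize, reflect_border) are invented for the example.

// Scalar sketch of the grid_sample coordinate mapping used by the functors above.
// Illustration only; not part of the patch. Names are made up for the example.
#include <algorithm>
#include <cmath>
#include <cstdio>

// Map a normalized coordinate in [-1, 1] to image space for a dimension of length w.
static float unormalize(int w, float x, bool align_corner)
{
    // align_corners=true puts -1/+1 on the centers of the first/last sample,
    // align_corners=false puts -1/+1 on the outer edges of the image.
    return align_corner ? (x + 1.f) / 2.f * (w - 1) : ((x + 1.f) * w - 1.f) / 2.f;
}

// Reflection padding followed by a clamp to [0, w-1], mirroring the Reflection
// specializations of compute_coord above.
static float reflect_border(int w, float x, bool align_corner)
{
    if (align_corner)
    {
        x = std::fabs(x);
        x = (w - 1) - std::fabs(x - (w - 1));
    }
    else
    {
        x = std::fabs(x + 0.5f);
        x = w - std::fabs(x - w) - 0.5f;
    }
    return std::min(w - 1.f, std::max(x, 0.f));
}

int main()
{
    // w = 4, normalized x = 0.5: 2.25 with align_corners, 2.5 without.
    printf("%.2f %.2f\n", unormalize(4, 0.5f, true), unormalize(4, 0.5f, false));
    // An out-of-range coordinate is folded back inside [0, 3] by reflection.
    printf("%.2f\n", reflect_border(4, unormalize(4, 1.25f, false), false));
    return 0;
}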
@@ -99,10 +209,6 @@ static NCNN_FORCEINLINE __m256 cubic_interp1d_p8(const __m256& x0_v, const __m25 return _v; } -#include "gridsample_bilinear_pack8.h" -#include "gridsample_nearest_pack8.h" -#include "gridsample_bicubic_pack8.h" - #endif // __AVX__ const __m128 v1fp4 = _mm_set1_ps(1.0f); @@ -134,10 +240,6 @@ static NCNN_FORCEINLINE __m128 mask_gather_ps(const float* ptr, __m128i offset, return v; } -#include "gridsample_bilinear_pack4.h" -#include "gridsample_nearest_pack4.h" -#include "gridsample_bicubic_pack4.h" - static inline void interpolate_cubic(float fx, float* coeffs) { const float A = -0.75f; @@ -162,10 +264,6 @@ static inline float reflect_coord(float x, int high) #endif // __SSE2__ -#include "gridsample_bilinear_pack1.h" -#include "gridsample_nearest_pack1.h" -#include "gridsample_bicubic_pack1.h" - int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { const Mat& bottom_blob = bottom_blobs[0]; @@ -177,950 +275,290 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - NCNN_LOGE("gridsample sample_type error\n"); - return -100; + gridsample_2d_bilinear_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - - if (sample_type == 2) + else if (padding_mode == PaddingMode::Border) { - if (padding_mode == 1) + if (align_corner == 0) { - if (align_corner == 0) - { - gridsample_3d_nearest_align0_zeros_blob_pack16(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_3d_nearest_align1_zeros_blob_pack16(bottom_blob, top_blob, grid, opt); - } + gridsample_2d_bilinear_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } - else if (padding_mode == 2) + else { - if (align_corner == 0) - { - gridsample_3d_nearest_align0_border_blob_pack16(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_3d_nearest_align1_border_blob_pack16(bottom_blob, top_blob, grid, opt); - } + gridsample_2d_bilinear_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } - else if (padding_mode == 3) + } + else if (padding_mode == PaddingMode::Reflection) + { + if (align_corner == 0) { - if (align_corner == 0) - { - gridsample_3d_nearest_align0_reflection_blob_pack16(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_3d_nearest_align1_reflection_blob_pack16(bottom_blob, top_blob, grid, opt); - } + gridsample_2d_bilinear_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - NCNN_LOGE("gridsample sample_type error\n"); - return -100; + gridsample_2d_bilinear_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - - if (sample_type == 3) + else { - NCNN_LOGE("unsupported bicubic when dims == 4"); + NCNN_LOGE("gridsample padding_mode error\n"); return -100; } } - } -#endif // __AVX512F__ -#if __AVX__ - - if (elempack == 8) - { - if (dims == 3) + if (sample_type == InterpolationMode::Nearest) { - top_blob.create(grid.h, grid.c * grid.elempack, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) + offset_blob.create(outw, outh, 1, elemsize, 1, opt.blob_allocator); + in_bound_blob.create(outw, outh, 1, elemsize, 1, opt.blob_allocator); + if (offset_blob.empty() || in_bound_blob.empty()) return -100; - if 
(sample_type == 1) + if (padding_mode == PaddingMode::Zeros) { - if (padding_mode == 1) - { - if (align_corner == 0) - { - gridsample_2d_bilinear_align0_zeros_blob_pack8(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_2d_bilinear_align1_zeros_blob_pack8(bottom_blob, top_blob, grid, opt); - } - } - else if (padding_mode == 2) + if (align_corner == 0) { - if (align_corner == 0) - { - gridsample_2d_bilinear_align0_border_blob_pack8(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_2d_bilinear_align1_border_blob_pack8(bottom_blob, top_blob, grid, opt); - } - } - else if (padding_mode == 3) - { - if (align_corner == 0) - { - gridsample_2d_bilinear_align0_reflection_blob_pack8(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_2d_bilinear_align1_reflection_blob_pack8(bottom_blob, top_blob, grid, opt); - } + gridsample_2d_nearest_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - NCNN_LOGE("gridsample padding_mode error\n"); - return -100; + gridsample_2d_nearest_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - - if (sample_type == 2) + else if (padding_mode == PaddingMode::Border) { - if (padding_mode == 1) + if (align_corner == 0) { - if (align_corner == 0) - { - gridsample_2d_nearest_align0_zeros_blob_pack8(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_2d_nearest_align1_zeros_blob_pack8(bottom_blob, top_blob, grid, opt); - } - } - else if (padding_mode == 2) - { - if (align_corner == 0) - { - gridsample_2d_nearest_align0_border_blob_pack8(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_2d_nearest_align1_border_blob_pack8(bottom_blob, top_blob, grid, opt); - } - } - else if (padding_mode == 3) - { - if (align_corner == 0) - { - gridsample_2d_nearest_align0_reflection_blob_pack8(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_2d_nearest_align1_reflection_blob_pack8(bottom_blob, top_blob, grid, opt); - } + gridsample_2d_nearest_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - NCNN_LOGE("gridsample padding_mode error\n"); - return -100; + gridsample_2d_nearest_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - - if (sample_type == 3) + else if (padding_mode == PaddingMode::Reflection) { - if (padding_mode == 1) - { - if (align_corner == 0) - { - gridsample_2d_bicubic_align0_zeros_blob_pack8(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_2d_bicubic_align1_zeros_blob_pack8(bottom_blob, top_blob, grid, opt); - } - } - else if (padding_mode == 2) - { - if (align_corner == 0) - { - gridsample_2d_bicubic_align0_border_blob_pack8(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_2d_bicubic_align1_border_blob_pack8(bottom_blob, top_blob, grid, opt); - } - } - else if (padding_mode == 3) + if (align_corner == 0) { - if (align_corner == 0) - { - gridsample_2d_bicubic_align0_reflection_blob_pack8(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_2d_bicubic_align1_reflection_blob_pack8(bottom_blob, top_blob, grid, opt); - } + gridsample_2d_nearest_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - NCNN_LOGE("gridsample padding_mode error\n"); - return -100; + gridsample_2d_nearest_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, 
permute_fusion, opt); } } + else + { + NCNN_LOGE("gridsample padding_mode error\n"); + return -100; + } } - if (dims == 4) + if (sample_type == InterpolationMode::Bicubic) { - top_blob.create(grid.h, grid.d, grid.c * grid.elempack, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) + offset_blob.create(outw, outh, 16, elemsize, 1, opt.blob_allocator); + in_bound_blob.create(outw, outh, 16, elemsize, 1, opt.blob_allocator); + if (offset_blob.empty() || in_bound_blob.empty() || value_blob.empty()) return -100; - if (sample_type == 1) + if (padding_mode == PaddingMode::Zeros) { - if (padding_mode == 1) - { - if (align_corner == 0) - { - gridsample_3d_bilinear_align0_zeros_blob_pack8(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_3d_bilinear_align1_zeros_blob_pack8(bottom_blob, top_blob, grid, opt); - } - } - else if (padding_mode == 2) - { - if (align_corner == 0) - { - gridsample_3d_bilinear_align0_border_blob_pack8(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_3d_bilinear_align1_border_blob_pack8(bottom_blob, top_blob, grid, opt); - } - } - else if (padding_mode == 3) + if (align_corner == 0) { - if (align_corner == 0) - { - gridsample_3d_bilinear_align0_reflection_blob_pack8(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_3d_bilinear_align1_reflection_blob_pack8(bottom_blob, top_blob, grid, opt); - } + gridsample_2d_bicubic_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - NCNN_LOGE("gridsample sample_type error\n"); - return -100; + gridsample_2d_bicubic_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - - if (sample_type == 2) + else if (padding_mode == PaddingMode::Border) { - if (padding_mode == 1) + if (align_corner == 0) { - if (align_corner == 0) - { - gridsample_3d_nearest_align0_zeros_blob_pack8(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_3d_nearest_align1_zeros_blob_pack8(bottom_blob, top_blob, grid, opt); - } + gridsample_2d_bicubic_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } - else if (padding_mode == 2) + else { - if (align_corner == 0) - { - gridsample_3d_nearest_align0_border_blob_pack8(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_3d_nearest_align1_border_blob_pack8(bottom_blob, top_blob, grid, opt); - } + gridsample_2d_bicubic_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } - else if (padding_mode == 3) + } + else if (padding_mode == PaddingMode::Reflection) + { + if (align_corner == 0) { - if (align_corner == 0) - { - gridsample_3d_nearest_align0_reflection_blob_pack8(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_3d_nearest_align1_reflection_blob_pack8(bottom_blob, top_blob, grid, opt); - } + gridsample_2d_bicubic_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - NCNN_LOGE("gridsample sample_type error\n"); - return -100; + gridsample_2d_bicubic_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - - if (sample_type == 3) + else { - NCNN_LOGE("unsupported bicubic when dims == 4"); + NCNN_LOGE("gridsample padding_mode error\n"); return -100; } } } + -#endif // __AVX__ - - if (elempack == 4) + /*if (dims == 4) { - if (dims == 3) - { - top_blob.create(grid.h, grid.c * grid.elempack, 
channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + top_blob.create(grid.h, grid.d, grid.c * grid.elempack, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - if (sample_type == 1) - { - if (padding_mode == 1) - { - if (align_corner == 0) - { - gridsample_2d_bilinear_align0_zeros_blob_pack4(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_2d_bilinear_align1_zeros_blob_pack4(bottom_blob, top_blob, grid, opt); - } - } - else if (padding_mode == 2) - { - if (align_corner == 0) - { - gridsample_2d_bilinear_align0_border_blob_pack4(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_2d_bilinear_align1_border_blob_pack4(bottom_blob, top_blob, grid, opt); - } - } - else if (padding_mode == 3) - { - if (align_corner == 0) - { - gridsample_2d_bilinear_align0_reflection_blob_pack4(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_2d_bilinear_align1_reflection_blob_pack4(bottom_blob, top_blob, grid, opt); - } - } - else - { - NCNN_LOGE("gridsample padding_mode error\n"); - return -100; - } - } - - if (sample_type == 2) - { - if (padding_mode == 1) - { - if (align_corner == 0) - { - gridsample_2d_nearest_align0_zeros_blob_pack4(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_2d_nearest_align1_zeros_blob_pack4(bottom_blob, top_blob, grid, opt); - } - } - else if (padding_mode == 2) - { - if (align_corner == 0) - { - gridsample_2d_nearest_align0_border_blob_pack4(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_2d_nearest_align1_border_blob_pack4(bottom_blob, top_blob, grid, opt); - } - } - else if (padding_mode == 3) - { - if (align_corner == 0) - { - gridsample_2d_nearest_align0_reflection_blob_pack4(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_2d_nearest_align1_reflection_blob_pack4(bottom_blob, top_blob, grid, opt); - } - } - else - { - NCNN_LOGE("gridsample sample_type error\n"); - return -100; - } - } - - if (sample_type == 3) + if (sample_type == 1) + { + if (padding_mode == 1) { - if (padding_mode == 1) - { - if (align_corner == 0) - { - gridsample_2d_bicubic_align0_zeros_blob_pack4(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_2d_bicubic_align1_zeros_blob_pack4(bottom_blob, top_blob, grid, opt); - } - } - else if (padding_mode == 2) - { - if (align_corner == 0) - { - gridsample_2d_bicubic_align0_border_blob_pack4(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_2d_bicubic_align1_border_blob_pack4(bottom_blob, top_blob, grid, opt); - } - } - else if (padding_mode == 3) + if (align_corner == 0) { - if (align_corner == 0) - { - gridsample_2d_bicubic_align0_reflection_blob_pack4(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_2d_bicubic_align1_reflection_blob_pack4(bottom_blob, top_blob, grid, opt); - } + gridsample_3d_bilinear_align0_zeros_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - NCNN_LOGE("gridsample padding_mode error\n"); - return -100; + gridsample_3d_bilinear_align1_zeros_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - } - - if (dims == 4) - { - top_blob.create(grid.h, grid.d, grid.c * grid.elempack, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (sample_type == 1) + else if (padding_mode == 2) { - if (padding_mode == 1) - { - if (align_corner == 0) - { - gridsample_3d_bilinear_align0_zeros_blob_pack4(bottom_blob, top_blob, 
grid, opt); - } - else - { - gridsample_3d_bilinear_align1_zeros_blob_pack4(bottom_blob, top_blob, grid, opt); - } - } - else if (padding_mode == 2) + if (align_corner == 0) { - if (align_corner == 0) - { - gridsample_3d_bilinear_align0_border_blob_pack4(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_3d_bilinear_align1_border_blob_pack4(bottom_blob, top_blob, grid, opt); - } - } - else if (padding_mode == 3) - { - if (align_corner == 0) - { - gridsample_3d_bilinear_align0_reflection_blob_pack4(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_3d_bilinear_align1_reflection_blob_pack4(bottom_blob, top_blob, grid, opt); - } + gridsample_3d_bilinear_align0_border_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - NCNN_LOGE("gridsample padding_mode error\n"); - return -100; + gridsample_3d_bilinear_align1_border_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - - if (sample_type == 2) + else if (padding_mode == 3) { - if (padding_mode == 1) - { - if (align_corner == 0) - { - gridsample_3d_nearest_align0_zeros_blob_pack4(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_3d_nearest_align1_zeros_blob_pack4(bottom_blob, top_blob, grid, opt); - } - } - else if (padding_mode == 2) - { - if (align_corner == 0) - { - gridsample_3d_nearest_align0_border_blob_pack4(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_3d_nearest_align1_border_blob_pack4(bottom_blob, top_blob, grid, opt); - } - } - else if (padding_mode == 3) + if (align_corner == 0) { - if (align_corner == 0) - { - gridsample_3d_nearest_align0_reflection_blob_pack4(bottom_blob, top_blob, grid, opt); - } - else - { - gridsample_3d_nearest_align1_reflection_blob_pack4(bottom_blob, top_blob, grid, opt); - } + gridsample_3d_bilinear_align0_reflection_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - NCNN_LOGE("gridsample sample_type error\n"); - return -100; + gridsample_3d_bilinear_align1_reflection_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - - if (sample_type == 3) + else { - NCNN_LOGE("unsupported bicubic when dims == 4"); + NCNN_LOGE("gridsample sample_type error\n"); return -100; } } - } - -#endif // __SSE2__ - - if (elempack == 1) - { -#if __SSE2__ - ncnn::Mat grid_tmp; - - if (grid.elempack != 1) - { - ncnn::convert_packing(grid, grid_tmp, 1, opt); - } - ncnn::Mat grid_p1 = (grid.elempack == 1) ? 
grid : grid_tmp; - - if (dims == 3) + if (sample_type == 2) { - top_blob.create(grid_p1.h, grid_p1.c, channels, elemsize, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (sample_type == 1) + if (padding_mode == 1) { - if (padding_mode == 1) - { - if (align_corner == 0) - { - gridsample_2d_bilinear_align0_zeros_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - else - { - gridsample_2d_bilinear_align1_zeros_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - } - else if (padding_mode == 2) - { - if (align_corner == 0) - { - gridsample_2d_bilinear_align0_border_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - else - { - gridsample_2d_bilinear_align1_border_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - } - else if (padding_mode == 3) + if (align_corner == 0) { - if (align_corner == 0) - { - gridsample_2d_bilinear_align0_reflection_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - else - { - gridsample_2d_bilinear_align1_reflection_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } + gridsample_3d_nearest_align0_zeros_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - NCNN_LOGE("gridsample padding_mode error\n"); - return -100; + gridsample_3d_nearest_align1_zeros_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (sample_type == 2) + else if (padding_mode == 2) { - if (padding_mode == 1) - { - if (align_corner == 0) - { - gridsample_2d_nearest_align0_zeros_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - else - { - gridsample_2d_nearest_align1_zeros_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - } - else if (padding_mode == 2) - { - if (align_corner == 0) - { - gridsample_2d_nearest_align0_border_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - else - { - gridsample_2d_nearest_align1_border_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - } - else if (padding_mode == 3) + if (align_corner == 0) { - if (align_corner == 0) - { - gridsample_2d_nearest_align0_reflection_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - else - { - gridsample_2d_nearest_align1_reflection_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } + gridsample_3d_nearest_align0_border_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - NCNN_LOGE("gridsample padding_mode error\n"); - return -100; + gridsample_3d_nearest_align1_border_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (sample_type == 3) + else if (padding_mode == 3) { - if (padding_mode == 1) - { - if (align_corner == 0) - { - gridsample_2d_bicubic_align0_zeros_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - else - { - gridsample_2d_bicubic_align1_zeros_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - } - else if (padding_mode == 2) + if (align_corner == 0) { - if (align_corner == 0) - { - gridsample_2d_bicubic_align0_border_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - else - { - gridsample_2d_bicubic_align1_border_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - } - else if (padding_mode == 3) - { - if (align_corner == 0) - { - gridsample_2d_bicubic_align0_reflection_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - else - { - gridsample_2d_bicubic_align1_reflection_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } + gridsample_3d_nearest_align0_reflection_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, 
permute_fusion, opt); } else { - NCNN_LOGE("gridsample padding_mode error\n"); - return -100; - } - } - } - - if (dims == 4) - { - top_blob.create(grid_p1.h, grid_p1.d, grid_p1.c, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (sample_type == 1) - { - if (padding_mode == 1) - { - if (align_corner == 0) - { - gridsample_3d_bilinear_align0_zeros_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - else - { - gridsample_3d_bilinear_align1_zeros_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - } - else if (padding_mode == 2) - { - if (align_corner == 0) - { - gridsample_3d_bilinear_align0_border_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - else - { - gridsample_3d_bilinear_align1_border_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - } - else if (padding_mode == 3) - { - if (align_corner == 0) - { - gridsample_3d_bilinear_align0_reflection_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - else - { - gridsample_3d_bilinear_align1_reflection_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - } - } - else if (sample_type == 2) - { - if (padding_mode == 1) - { - if (align_corner == 0) - { - gridsample_3d_nearest_align0_zeros_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - else - { - gridsample_3d_nearest_align1_zeros_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - } - else if (padding_mode == 2) - { - if (align_corner == 0) - { - gridsample_3d_nearest_align0_border_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - else - { - gridsample_3d_nearest_align1_border_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - } - else if (padding_mode == 3) - { - if (align_corner == 0) - { - gridsample_3d_nearest_align0_reflection_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } - else - { - gridsample_3d_nearest_align1_reflection_blob_pack1(bottom_blob, top_blob, grid_p1, opt); - } + gridsample_3d_nearest_align1_reflection_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } else { - NCNN_LOGE("unsupported bicubic when dims == 4"); - return -1; + NCNN_LOGE("gridsample sample_type error\n"); + return -100; } } - return 0; -#else - return GridSample::forward(bottom_blobs, top_blobs, opt); + + if (sample_type == 3) + { + NCNN_LOGE("unsupported bicubic when dims == 4"); + return -100; + } + }*/ + +#endif // __AVX__ + #endif // __SSE2__ - } return 0; } From 4fa3478bc26d2b8f9ea9ac57b25ef25af707a0ce Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Tue, 14 Feb 2023 13:10:49 +0000 Subject: [PATCH 060/127] apply code-format changes --- src/layer/x86/gridsample_bicubic_compute_blob.h | 8 +++----- .../x86/gridsample_bilinear_compute_blob.h | 2 +- src/layer/x86/gridsample_nearest_compute_blob.h | 1 - src/layer/x86/gridsample_x86.cpp | 17 ++++++++--------- 4 files changed, 12 insertions(+), 16 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index e6e989e1912..8ce26bf5036 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -27,7 +27,7 @@ struct gridsample_2d_bicubic_compute_blob #endif // __AVX2__ #endif // __AVX__ - int *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; + int *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; for (int i = 0; i < 4; i++) { @@ -266,11 +266,11 @@ struct gridsample_2d_bicubic_compute_blob #endif // __AVX2__ #endif // __AVX__ - int *v0_offset_ptr[4], 
*v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; + int *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; - for (int i = 0; i < 4; i ++) + for (int i = 0; i < 4; i++) { v0_offset_ptr[i * 4 + 0] = offset.channel(i * 4 + 0); v0_offset_ptr[i * 4 + 1] = offset.channel(i * 4 + 1); @@ -529,5 +529,3 @@ struct gridsample_2d_bicubic_compute_blob } } }; - - diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 5c6089cfc69..3e200d4e88d 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -105,7 +105,7 @@ struct gridsample_2d_bilinear_compute_blob __m256 se_offset = _mm256_add_ps(sw_offset, *(__m256*)_ps256_1); __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - + _mm256_storeu_ps(in_bound_ptr_00, *(__m256*)_ps256_n1); _mm256_storeu_ps(in_bound_ptr_01, x1_in_range); _mm256_storeu_ps(in_bound_ptr_10, y1_in_range); diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index c93bf2f1a52..162ae5c6dfe 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -327,4 +327,3 @@ struct gridsample_2d_nearest_compute_blob } } }; - diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 26e125a54ec..b76c8b07f2d 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -280,15 +280,15 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Thu, 16 Feb 2023 23:53:16 +0800 Subject: [PATCH 061/127] [WIP] finish 2d_compute_blob and pack8_interpolation --- src/layer/gridsample.cpp | 40 +- .../x86/gridsample_bicubic_compute_blob.h | 536 ++++++++++++------ .../x86/gridsample_bilinear_compute_blob.h | 314 +++++++--- .../x86/gridsample_nearest_compute_blob.h | 84 ++- src/layer/x86/gridsample_x86.cpp | 343 +++++++---- tests/test_gridsample.cpp | 191 ++++--- 6 files changed, 1036 insertions(+), 472 deletions(-) diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index 025dcf8a0dc..8059628310f 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -59,9 +59,9 @@ static float grid_sample_unormalize(int w, float coordx, int align_corner) return align_corner ? 
(coordx + 1) / 2.f * (w - 1) : ((coordx + 1) * w - 1) / 2.f; } -static float border_coord(int x, int border) +static float border_coord(float x, float border) { - return std::min(border, std::max(x, 0)); + return std::min(border, std::max(x, 0.0f)); } static float reflect_coord(float x, int high) @@ -71,7 +71,7 @@ static float reflect_coord(float x, int high) return x; } -static int compute_coord(int sx, int w, int padding_mode, int align_corner) +static float compute_coord(float sx, int w, int padding_mode, int align_corner) { if (padding_mode == 2) // border { @@ -85,7 +85,7 @@ static int compute_coord(int sx, int w, int padding_mode, int align_corner) } else { - sx = static_cast(reflect_coord(sx + 0.5, w) - 0.5); + sx = reflect_coord(sx + 0.5, w) - 0.5; sx = border_coord(sx, w - 1); } } @@ -170,7 +170,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& Mat offset_blob; offset_blob.create(outw, outh, grid.c, elemsize, opt.blob_allocator); - //1 pre-calculate all interpolation offsets for each x y, unpack grid on-the-fly + //pre-calculate all interpolation offsets for each x y, unpack grid on-the-fly if (permute_fusion == 0) { float* offsetptr_x = offset_blob.channel(0); @@ -224,7 +224,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& } } - if (sample_type == 1) // bilinear + if (sample_type == InterpolationMode::Bilinear) // bilinear { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) @@ -244,15 +244,17 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& // bilinear interpolate float v; { - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); + sample_x = compute_coord(sample_x, w, padding_mode, align_corner); + sample_y = compute_coord(sample_y, h, padding_mode, align_corner); + int x0 = floor(sample_x); + int y0 = floor(sample_y); int x1 = x0 + 1; int y1 = y0 + 1; - float v00 = get_value_bounded(image, x0, y0, padding_mode, align_corner); - float v01 = get_value_bounded(image, x1, y0, padding_mode, align_corner); - float v10 = get_value_bounded(image, x0, y1, padding_mode, align_corner); - float v11 = get_value_bounded(image, x1, y1, padding_mode, align_corner); + float v00 = get_value_bounded(image, x0, y0); + float v01 = get_value_bounded(image, x1, y0); + float v10 = get_value_bounded(image, x0, y1); + float v11 = get_value_bounded(image, x1, y1); float alpha = sample_x - x0; float beta = sample_y - y0; @@ -272,7 +274,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& } } } - else if (sample_type == 2) // nearest + else if (sample_type == InterpolationMode::Nearest) // nearest { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) @@ -288,11 +290,13 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& { float sample_x = *offsetptr_x; float sample_y = *offsetptr_y; + sample_x = compute_coord(sample_x, w, padding_mode, align_corner); + sample_y = compute_coord(sample_y, h, padding_mode, align_corner); int x0 = static_cast(floor(sample_x + 0.5f)); int y0 = static_cast(floor(sample_y + 0.5f)); - float v = get_value_bounded(image, x0, y0, padding_mode, align_corner); + float v = get_value_bounded(image, x0, y0); outptr[0] = v; outptr += 1; @@ -303,7 +307,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& } } } - else if (sample_type == 3) // bicubic + else if (sample_type == InterpolationMode::Bicubic) // bicubic { #pragma omp parallel for num_threads(opt.num_threads) 
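    // Editor's note (annotation, not part of the original patch): the bicubic branch
    // below weights a 4x4 neighborhood with the cubic convolution kernel (A = -0.75)
    // that this patch also implements in interpolate_cubic() and cubic_interp1d_p8().
    // As a reference sketch, for a fractional offset t in [0, 1] the four per-axis
    // weights are:
    //   w0 = A*(t+1)^3 - 5A*(t+1)^2 + 8A*(t+1) - 4A
    //   w1 = (A+2)*t^3 - (A+3)*t^2 + 1
    //   w2 = (A+2)*(1-t)^3 - (A+3)*(1-t)^2 + 1
    //   w3 = 1 - w0 - w1 - w2
    // and each output sample is sum over i,j of wy_i * wx_j * v(i, j) on that 4x4 patch.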
for (int q = 0; q < channels; q++) @@ -386,7 +390,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& Mat offset_blob; offset_blob.create(outw, outh, outd, grid.c, elemsize, opt.blob_allocator); - //1 pre-calculate all interpolation offsets for each x y, unpack grid on-the-fly + //pre-calculate all interpolation offsets for each x y, unpack grid on-the-fly if (permute_fusion == 0) { float* offsetptr_x = offset_blob.channel(0); @@ -458,7 +462,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& } } - if (sample_type == 1) // bilinear + if (sample_type == InterpolationMode::Bilinear) // bilinear { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) @@ -524,7 +528,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& } } } - else if (sample_type == 2) // nearest + else if (sample_type == InterpolationMode::Nearest) // nearest { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index e6e989e1912..34bf27d919d 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -21,22 +21,22 @@ struct gridsample_2d_bicubic_compute_blob #if __AVX__ const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); -#if __AVX2__ - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); -#endif // __AVX2__ + const __m256 vElempackf = _mm256_set1_ps(src.elempack); #endif // __AVX__ int *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; for (int i = 0; i < 4; i++) { - v0_offset_ptr[i * 4 + 0] = offset.channel(i * 4 + 0); - v1_offset_ptr[i * 4 + 1] = offset.channel(i * 4 + 1); - v2_offset_ptr[i * 4 + 2] = offset.channel(i * 4 + 2); - v3_offset_ptr[i * 4 + 3] = offset.channel(i * 4 + 3); + v0_offset_ptr[i] = offset.channel(i * 4 + 0); + v1_offset_ptr[i] = offset.channel(i * 4 + 1); + v2_offset_ptr[i] = offset.channel(i * 4 + 2); + v3_offset_ptr[i] = offset.channel(i * 4 + 3); } + float* value_x = value.channel(0); + float* value_y = value.channel(1); + grid_sample_unormalize unormalize; compute_coord get_coord; @@ -49,8 +49,8 @@ struct gridsample_2d_bicubic_compute_blob #if __AVX__ for (int x = 0; x + 15 < nn; x += 16) { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + __m256 tmp_x = _mm256_loadu_ps(gridptr); + __m256 gy = _mm256_loadu_ps(gridptr + 8); __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); @@ -63,11 +63,9 @@ struct gridsample_2d_bicubic_compute_blob { // x gx = unormalize(vImgWf, gx); - gx = get_coord(vImgWf, gx); // y gy = unormalize(vImgHf, gy); - gy = get_coord(vImgHf, gy); } __m256 gx_floor = _mm256_floor_ps(gx); @@ -76,38 +74,45 @@ struct gridsample_2d_bicubic_compute_blob const __m256 tx = _mm256_sub_ps(gx, gx_floor); const __m256 ty = _mm256_sub_ps(gy, gy_floor); - __m256 coefficients[4]; - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); __m256 gx1 = gx_floor; __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + __m256 gx3 = _mm256_add_ps(gx2, *(__m256*)_ps256_1); - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + gx0 = 
get_coord(vImgWf, gx0); + gx1 = get_coord(vImgWf, gx1); + gx2 = get_coord(vImgWf, gx2); + gx3 = get_coord(vImgWf, gx3); - gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); - gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); - gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); - gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); - - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; for (int i = 0; i < 4; i++) { gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + gy = get_coord(vImgHf, gy); - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + __m256 gy_offset = _mm256_mul_ps(gy, vImgWf); - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + __m256 v0_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx0), vElempackf); + __m256 v1_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx1), vElempackf); + __m256 v2_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx2), vElempackf); + __m256 v3_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx3), vElempackf); + + _mm256_storeu_epi32(v0_offset_ptr[i], _mm256_cvtps_epi32(v0_offset_f)); + _mm256_storeu_epi32(v1_offset_ptr[i], _mm256_cvtps_epi32(v1_offset_f)); + _mm256_storeu_epi32(v2_offset_ptr[i], _mm256_cvtps_epi32(v2_offset_f)); + _mm256_storeu_epi32(v3_offset_ptr[i], _mm256_cvtps_epi32(v3_offset_f)); + + v0_offset_ptr[i] += 8; + v1_offset_ptr[i] += 8; + v2_offset_ptr[i] += 8; + v3_offset_ptr[i] += 8; } + _mm256_storeu_ps(value_x, tx); + _mm256_storeu_ps(value_y, ty); + + value_x += 8; + value_y += 8; + gridptr += 16; } @@ -123,25 +128,39 @@ struct gridsample_2d_bicubic_compute_blob sample_x = unormalize(src.w, sample_x); // y - sample_y = unormalize(src.h, sample_x); + sample_y = unormalize(src.h, sample_y); int x1 = floor(sample_x); int y1 = floor(sample_y); int x0 = x1 - 1; - int y0 = y1 - 1; int x2 = x1 + 1; - int y2 = y1 + 1; int x3 = x1 + 2; - int y3 = y1 + 2; - x1 = std::min(src.w - 1, std::max(x1, 0)); - y1 = std::min(src.h - 1, std::max(y1, 0)); - x0 = std::min(src.w - 1, std::max(x0, 0)); - y0 = std::min(src.h - 1, std::max(y0, 0)); - x2 = std::min(src.w - 1, std::max(x2, 0)); - y2 = std::min(src.h - 1, std::max(y2, 0)); - x3 = std::min(src.w - 1, std::max(x3, 0)); - y3 = std::min(src.h - 1, std::max(y3, 0)); + *value_x = sample_x - static_cast(x1); + *value_y = sample_y - static_cast(y1); + + x1 = get_coord(src.w, x1); + x0 = get_coord(src.w, x0); + x2 = get_coord(src.w, x2); + x3 = get_coord(src.w, x3); + + for (int i = 0; i < 4; i ++) + { + int offset_y = get_coord(src.h, y1 + i - 1) * src.w; + + *v0_offset_ptr[i] = (offset_y + x0) * src.elempack; + *v1_offset_ptr[i] = (offset_y + x1) * src.elempack; + *v2_offset_ptr[i] = (offset_y + x2) * src.elempack; + *v3_offset_ptr[i] = (offset_y + x3) * src.elempack; + + v0_offset_ptr[i]++; + v1_offset_ptr[i]++; + v2_offset_ptr[i]++; + v3_offset_ptr[i]++; + } + + value_x++; + value_y++; gridptr += 2; } @@ -163,11 +182,9 @@ struct gridsample_2d_bicubic_compute_blob { // x gx = unormalize(vImgWf, gx); - gx = get_coord(vImgWf, gx); // y gy = 
unormalize(vImgHf, gy); - gy = get_coord(vImgHf, gy); } __m256 gx_floor = _mm256_floor_ps(gx); @@ -176,38 +193,45 @@ struct gridsample_2d_bicubic_compute_blob const __m256 tx = _mm256_sub_ps(gx, gx_floor); const __m256 ty = _mm256_sub_ps(gy, gy_floor); - __m256 coefficients[4]; - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); __m256 gx1 = gx_floor; __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + __m256 gx3 = _mm256_add_ps(gx2, *(__m256*)_ps256_1); - const __m256 border_y = _mm256_sub_ps(vImgHf, *(__m256*)_ps256_1); - const __m256 border_x = _mm256_sub_ps(vImgWf, *(__m256*)_ps256_1); + gx0 = get_coord(vImgWf, gx0); + gx1 = get_coord(vImgWf, gx1); + gx2 = get_coord(vImgWf, gx2); + gx3 = get_coord(vImgWf, gx3); - gx0 = _mm256_min_ps(border_x, _mm256_max_ps(gx0, _mm256_setzero_ps())); - gx1 = _mm256_min_ps(border_x, _mm256_max_ps(gx1, _mm256_setzero_ps())); - gx2 = _mm256_min_ps(border_x, _mm256_max_ps(gx2, _mm256_setzero_ps())); - gx3 = _mm256_min_ps(border_x, _mm256_max_ps(gx3, _mm256_setzero_ps())); - - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; for (int i = 0; i < 4; i++) { gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - gy = _mm256_min_ps(border_y, _mm256_max_ps(gy, _mm256_setzero_ps())); + gy = get_coord(vImgHf, gy); - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + __m256 gy_offset = _mm256_mul_ps(gy, vImgWf); - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); + __m256 v0_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx0), vElempackf); + __m256 v1_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx1), vElempackf); + __m256 v2_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx2), vElempackf); + __m256 v3_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx3), vElempackf); + + _mm256_storeu_epi32(v0_offset_ptr[i], _mm256_cvtps_epi32(v0_offset_f)); + _mm256_storeu_epi32(v1_offset_ptr[i], _mm256_cvtps_epi32(v1_offset_f)); + _mm256_storeu_epi32(v2_offset_ptr[i], _mm256_cvtps_epi32(v2_offset_f)); + _mm256_storeu_epi32(v3_offset_ptr[i], _mm256_cvtps_epi32(v3_offset_f)); + + v0_offset_ptr[i] += 8; + v1_offset_ptr[i] += 8; + v2_offset_ptr[i] += 8; + v3_offset_ptr[i] += 8; } + _mm256_storeu_ps(value_x, tx); + _mm256_storeu_ps(value_y, ty); + + value_x += 8; + value_y += 8; + gridptr_x += 8; gridptr_y += 8; } @@ -224,25 +248,39 @@ struct gridsample_2d_bicubic_compute_blob sample_x = unormalize(src.w, sample_x); // y - sample_y = unormalize(src.h, sample_x); + sample_y = unormalize(src.h, sample_y); int x1 = floor(sample_x); int y1 = floor(sample_y); int x0 = x1 - 1; - int y0 = y1 - 1; int x2 = x1 + 1; - int y2 = y1 + 1; int x3 = x1 + 2; - int y3 = y1 + 2; - x1 = std::min(src.w - 1, std::max(x1, 0)); - y1 = std::min(src.h - 1, std::max(y1, 0)); - x0 = std::min(src.w - 1, std::max(x0, 0)); - y0 = std::min(src.h - 1, std::max(y0, 0)); - x2 = std::min(src.w - 1, std::max(x2, 0)); - y2 = std::min(src.h - 1, std::max(y2, 0)); - x3 = std::min(src.w - 1, std::max(x3, 0)); - y3 = std::min(src.h - 1, std::max(y3, 0)); + *value_x = sample_x - static_cast(x1); + *value_y = sample_y - static_cast(y1); + + x1 = 
get_coord(src.w, x1); + x0 = get_coord(src.w, x0); + x2 = get_coord(src.w, x2); + x3 = get_coord(src.w, x3); + + for (int i = 0; i < 4; i++) + { + int offset_y = static_cast(get_coord(src.h, y1 + i - 1)) * src.w; + + *v0_offset_ptr[i] = (offset_y + x0) * src.elempack; + *v1_offset_ptr[i] = (offset_y + x1) * src.elempack; + *v2_offset_ptr[i] = (offset_y + x2) * src.elempack; + *v3_offset_ptr[i] = (offset_y + x3) * src.elempack; + + v0_offset_ptr[i]++; + v1_offset_ptr[i]++; + v2_offset_ptr[i]++; + v3_offset_ptr[i]++; + } + + value_x++; + value_y++; gridptr_x++; gridptr_y++; @@ -260,27 +298,27 @@ struct gridsample_2d_bicubic_compute_blob #if __AVX__ const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); -#if __AVX2__ - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); -#endif // __AVX2__ + const __m256 vElempackf = _mm256_set1_ps(src.elempack); #endif // __AVX__ int *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; + float* value_x = value.channel(0); + float* value_y = value.channel(1); + for (int i = 0; i < 4; i ++) { - v0_offset_ptr[i * 4 + 0] = offset.channel(i * 4 + 0); - v0_offset_ptr[i * 4 + 1] = offset.channel(i * 4 + 1); - v0_offset_ptr[i * 4 + 2] = offset.channel(i * 4 + 2); - v0_offset_ptr[i * 4 + 3] = offset.channel(i * 4 + 3); - - v0_in_bound_ptr[i * 4 + 0] = in_bound.channel(i * 4 + 0); - v0_in_bound_ptr[i * 4 + 1] = in_bound.channel(i * 4 + 1); - v0_in_bound_ptr[i * 4 + 2] = in_bound.channel(i * 4 + 2); - v0_in_bound_ptr[i * 4 + 3] = in_bound.channel(i * 4 + 3); + v0_offset_ptr[i] = offset.channel(i * 4 + 0); + v1_offset_ptr[i] = offset.channel(i * 4 + 1); + v2_offset_ptr[i] = offset.channel(i * 4 + 2); + v3_offset_ptr[i] = offset.channel(i * 4 + 3); + + v0_in_bound_ptr[i] = in_bound.channel(i * 4 + 0); + v1_in_bound_ptr[i] = in_bound.channel(i * 4 + 1); + v2_in_bound_ptr[i] = in_bound.channel(i * 4 + 2); + v3_in_bound_ptr[i] = in_bound.channel(i * 4 + 3); } grid_sample_unormalize unormalize; @@ -294,8 +332,8 @@ struct gridsample_2d_bicubic_compute_blob #if __AVX__ for (int x = 0; x + 15 < nn; x += 16) { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + __m256 tmp_x = _mm256_loadu_ps(gridptr); + __m256 gy = _mm256_loadu_ps(gridptr + 8); __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); @@ -318,41 +356,53 @@ struct gridsample_2d_bicubic_compute_blob const __m256 tx = _mm256_sub_ps(gx, gx_floor); const __m256 ty = _mm256_sub_ps(gy, gy_floor); - __m256 coefficients[4]; - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); __m256 gx1 = gx_floor; __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + __m256 gx3 = _mm256_add_ps(gx2, *(__m256*)_ps256_1); __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(gx0, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx0, _CMP_GT_OS)); __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(gx1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx1, _CMP_GT_OS)); __m256 x2_in_range = _mm256_and_ps(_mm256_cmp_ps(gx2, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx2, _CMP_GT_OS)); __m256 x3_in_range = _mm256_and_ps(_mm256_cmp_ps(gx3, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx3, _CMP_GT_OS)); - __m256i v0_offset[4], v1_offset[4], 
v2_offset[4], v3_offset[4]; - __m256 v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; for (int i = 0; i < 4; i++) { gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); __m256 y_in_range = _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS)); - v0_in_range[i] = _mm256_and_ps(x0_in_range, y_in_range); - v1_in_range[i] = _mm256_and_ps(x1_in_range, y_in_range); - v2_in_range[i] = _mm256_and_ps(x2_in_range, y_in_range); - v3_in_range[i] = _mm256_and_ps(x3_in_range, y_in_range); + _mm256_storeu_ps(v0_in_bound_ptr[i], _mm256_and_ps(x0_in_range, y_in_range)); + _mm256_storeu_ps(v1_in_bound_ptr[i], _mm256_and_ps(x1_in_range, y_in_range)); + _mm256_storeu_ps(v2_in_bound_ptr[i], _mm256_and_ps(x2_in_range, y_in_range)); + _mm256_storeu_ps(v3_in_bound_ptr[i], _mm256_and_ps(x3_in_range, y_in_range)); + + __m256 v0_offset_f = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf); + __m256 v1_offset_f = _mm256_add_ps(v0_offset_f, vElempackf); + __m256 v2_offset_f = _mm256_add_ps(v1_offset_f, vElempackf); + __m256 v3_offset_f = _mm256_add_ps(v2_offset_f, vElempackf); + + _mm256_storeu_epi32(v0_offset_ptr[i], _mm256_cvtps_epi32(v0_offset_f)); + _mm256_storeu_epi32(v1_offset_ptr[i], _mm256_cvtps_epi32(v1_offset_f)); + _mm256_storeu_epi32(v2_offset_ptr[i], _mm256_cvtps_epi32(v2_offset_f)); + _mm256_storeu_epi32(v3_offset_ptr[i], _mm256_cvtps_epi32(v3_offset_f)); + + v0_offset_ptr[i] += 8; + v1_offset_ptr[i] += 8; + v2_offset_ptr[i] += 8; + v3_offset_ptr[i] += 8; + + v0_in_bound_ptr[i] += 8; + v1_in_bound_ptr[i] += 8; + v2_in_bound_ptr[i] += 8; + v3_in_bound_ptr[i] += 8; + } - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + _mm256_storeu_ps(value_x, tx); + _mm256_storeu_ps(value_y, ty); - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } + value_x += 8; + value_y += 8; gridptr += 16; } @@ -368,42 +418,52 @@ struct gridsample_2d_bicubic_compute_blob // x sample_x = unormalize(src.w, sample_x); // y - sample_y = unormalize(src.h, sample_x); + sample_y = unormalize(src.h, sample_y); int x1 = floor(sample_x); int y1 = floor(sample_y); int x0 = x1 - 1; - int y0 = y1 - 1; int x2 = x1 + 1; - int y2 = y1 + 1; int x3 = x1 + 2; - int y3 = y1 + 2; bool x1_in_range = (x1 > -1) & (x1 < src.w); - bool y1_in_range = (y1 > -1) & (y1 < src.h); bool x0_in_range = (x0 > -1) & (x0 < src.w); - bool y0_in_range = (y0 > -1) & (y0 < src.h); bool x2_in_range = (x2 > -1) & (x2 < src.w); - bool y2_in_range = (y2 > -1) & (y2 < src.h); bool x3_in_range = (x3 > -1) & (x3 < src.w); - bool y3_in_range = (y3 > -1) & (y3 < src.h); - - bool v00_in_range = x0_in_range & y0_in_range; - bool v01_in_range = x1_in_range & y0_in_range; - bool v02_in_range = x2_in_range & y0_in_range; - bool v03_in_range = x3_in_range & y0_in_range; - bool v10_in_range = x0_in_range & y1_in_range; - bool v11_in_range = x1_in_range & y1_in_range; - bool v12_in_range = x2_in_range & y1_in_range; - bool v13_in_range = x3_in_range & y1_in_range; - bool v20_in_range = x0_in_range & y2_in_range; - bool v21_in_range = x1_in_range & y2_in_range; - bool v22_in_range = x2_in_range & y2_in_range; - bool 
v23_in_range = x3_in_range & y2_in_range; - bool v30_in_range = x0_in_range & y3_in_range; - bool v31_in_range = x1_in_range & y3_in_range; - bool v32_in_range = x2_in_range & y3_in_range; - bool v33_in_range = x3_in_range & y3_in_range; + + for (int i = 0; i < 4; i ++) + { + int gy = y1 + i - 1; + int offset_y = gy * src.w; + + bool y_in_range = (gy > -1) & (gy < src.h); + + *v0_in_bound_ptr[i] = (x0_in_range & y_in_range) ? 0xFFFFFFFF : 0.0f; + *v1_in_bound_ptr[i] = (x1_in_range & y_in_range) ? 0xFFFFFFFF : 0.0f; + *v2_in_bound_ptr[i] = (x2_in_range & y_in_range) ? 0xFFFFFFFF : 0.0f; + *v3_in_bound_ptr[i] = (x3_in_range & y_in_range) ? 0xFFFFFFFF : 0.0f; + + *v0_offset_ptr[i] = (offset_y + x0) * src.elempack; + *v1_offset_ptr[i] = (offset_y + x1) * src.elempack; + *v2_offset_ptr[i] = (offset_y + x2) * src.elempack; + *v3_offset_ptr[i] = (offset_y + x3) * src.elempack; + + v0_offset_ptr[i]++; + v1_offset_ptr[i]++; + v2_offset_ptr[i]++; + v3_offset_ptr[i]++; + + v0_in_bound_ptr[i]++; + v1_in_bound_ptr[i]++; + v2_in_bound_ptr[i]++; + v3_in_bound_ptr[i]++; + } + + *value_x = sample_x - static_cast(x1); + *value_y = sample_y - static_cast(y1); + + value_x++; + value_y++; gridptr += 2; } @@ -435,41 +495,53 @@ struct gridsample_2d_bicubic_compute_blob const __m256 tx = _mm256_sub_ps(gx, gx_floor); const __m256 ty = _mm256_sub_ps(gy, gy_floor); - __m256 coefficients[4]; - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); __m256 gx1 = gx_floor; __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx_floor, _mm256_set1_ps(2.0f)); + __m256 gx3 = _mm256_add_ps(gx2, *(__m256*)_ps256_1); __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(gx0, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx0, _CMP_GT_OS)); __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(gx1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx1, _CMP_GT_OS)); __m256 x2_in_range = _mm256_and_ps(_mm256_cmp_ps(gx2, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx2, _CMP_GT_OS)); __m256 x3_in_range = _mm256_and_ps(_mm256_cmp_ps(gx3, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx3, _CMP_GT_OS)); - __m256i v0_offset[4], v1_offset[4], v2_offset[4], v3_offset[4]; - __m256 v0_in_range[4], v1_in_range[4], v2_in_range[4], v3_in_range[4]; for (int i = 0; i < 4; i++) { gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); __m256 y_in_range = _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS)); - v0_in_range[i] = _mm256_and_ps(x0_in_range, y_in_range); - v1_in_range[i] = _mm256_and_ps(x1_in_range, y_in_range); - v2_in_range[i] = _mm256_and_ps(x2_in_range, y_in_range); - v3_in_range[i] = _mm256_and_ps(x3_in_range, y_in_range); + _mm256_storeu_ps(v0_in_bound_ptr[i], _mm256_and_ps(x0_in_range, y_in_range)); + _mm256_storeu_ps(v1_in_bound_ptr[i], _mm256_and_ps(x1_in_range, y_in_range)); + _mm256_storeu_ps(v2_in_bound_ptr[i], _mm256_and_ps(x2_in_range, y_in_range)); + _mm256_storeu_ps(v3_in_bound_ptr[i], _mm256_and_ps(x3_in_range, y_in_range)); + + __m256 v0_offset_f = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0), vElempackf); + __m256 v1_offset_f = _mm256_add_ps(v0_offset_f, vElempackf); + __m256 v2_offset_f = _mm256_add_ps(v1_offset_f, vElempackf); + __m256 v3_offset_f = _mm256_add_ps(v2_offset_f, vElempackf); + + _mm256_storeu_epi32(v0_offset_ptr[i], _mm256_cvtps_epi32(v0_offset_f)); + _mm256_storeu_epi32(v1_offset_ptr[i], _mm256_cvtps_epi32(v1_offset_f)); + _mm256_storeu_epi32(v2_offset_ptr[i], 
_mm256_cvtps_epi32(v2_offset_f)); + _mm256_storeu_epi32(v3_offset_ptr[i], _mm256_cvtps_epi32(v3_offset_f)); + + v0_offset_ptr[i] += 8; + v1_offset_ptr[i] += 8; + v2_offset_ptr[i] += 8; + v3_offset_ptr[i] += 8; + + v0_in_bound_ptr[i] += 8; + v1_in_bound_ptr[i] += 8; + v2_in_bound_ptr[i] += 8; + v3_in_bound_ptr[i] += 8; + } - __m256 v0_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx0); - __m256 v1_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx1); - __m256 v2_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx2); - __m256 v3_offset_f = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx3); + _mm256_storeu_ps(value_x, tx); + _mm256_storeu_ps(value_y, ty); - v0_offset[i] = _mm256_cvtps_epi32(v0_offset_f); - v1_offset[i] = _mm256_cvtps_epi32(v1_offset_f); - v2_offset[i] = _mm256_cvtps_epi32(v2_offset_f); - v3_offset[i] = _mm256_cvtps_epi32(v3_offset_f); - } + value_x += 8; + value_y += 8; gridptr_x += 8; gridptr_y += 8; @@ -486,42 +558,52 @@ struct gridsample_2d_bicubic_compute_blob // x sample_x = unormalize(src.w, sample_x); // y - sample_y = unormalize(src.h, sample_x); + sample_y = unormalize(src.h, sample_y); int x1 = floor(sample_x); int y1 = floor(sample_y); int x0 = x1 - 1; - int y0 = y1 - 1; int x2 = x1 + 1; - int y2 = y1 + 1; int x3 = x1 + 2; - int y3 = y1 + 2; bool x1_in_range = (x1 > -1) & (x1 < src.w); - bool y1_in_range = (y1 > -1) & (y1 < src.h); bool x0_in_range = (x0 > -1) & (x0 < src.w); - bool y0_in_range = (y0 > -1) & (y0 < src.h); bool x2_in_range = (x2 > -1) & (x2 < src.w); - bool y2_in_range = (y2 > -1) & (y2 < src.h); bool x3_in_range = (x3 > -1) & (x3 < src.w); - bool y3_in_range = (y3 > -1) & (y3 < src.h); - - bool v00_in_range = x0_in_range & y0_in_range; - bool v01_in_range = x1_in_range & y0_in_range; - bool v02_in_range = x2_in_range & y0_in_range; - bool v03_in_range = x3_in_range & y0_in_range; - bool v10_in_range = x0_in_range & y1_in_range; - bool v11_in_range = x1_in_range & y1_in_range; - bool v12_in_range = x2_in_range & y1_in_range; - bool v13_in_range = x3_in_range & y1_in_range; - bool v20_in_range = x0_in_range & y2_in_range; - bool v21_in_range = x1_in_range & y2_in_range; - bool v22_in_range = x2_in_range & y2_in_range; - bool v23_in_range = x3_in_range & y2_in_range; - bool v30_in_range = x0_in_range & y3_in_range; - bool v31_in_range = x1_in_range & y3_in_range; - bool v32_in_range = x2_in_range & y3_in_range; - bool v33_in_range = x3_in_range & y3_in_range; + + for (int i = 0; i < 4; i++) + { + int gy = y1 + i - 1; + int offset_y = gy * src.w; + + bool y_in_range = (gy > -1) & (gy < src.h); + + *v0_in_bound_ptr[i] = (x0_in_range & y_in_range) ? 0xFFFFFFFF : 0.0f; + *v1_in_bound_ptr[i] = (x1_in_range & y_in_range) ? 0xFFFFFFFF : 0.0f; + *v2_in_bound_ptr[i] = (x2_in_range & y_in_range) ? 0xFFFFFFFF : 0.0f; + *v3_in_bound_ptr[i] = (x3_in_range & y_in_range) ? 
0xFFFFFFFF : 0.0f; + + *v0_offset_ptr[i] = (offset_y + x0) * src.elempack; + *v1_offset_ptr[i] = (offset_y + x1) * src.elempack; + *v2_offset_ptr[i] = (offset_y + x2) * src.elempack; + *v3_offset_ptr[i] = (offset_y + x3) * src.elempack; + + v0_offset_ptr[i]++; + v1_offset_ptr[i]++; + v2_offset_ptr[i]++; + v3_offset_ptr[i]++; + + v0_in_bound_ptr[i]++; + v1_in_bound_ptr[i]++; + v2_in_bound_ptr[i]++; + v3_in_bound_ptr[i]++; + } + + *value_x = sample_x - static_cast(x1); + *value_y = sample_y - static_cast(y1); + + value_x++; + value_y++; gridptr_x++; gridptr_y++; @@ -530,4 +612,100 @@ struct gridsample_2d_bicubic_compute_blob } }; +#if __AVX__ +static void cubic_interp1d_p8(__m256& coeffs0, __m256& coeffs1, __m256& coeffs2, __m256& coeffs3, const __m256& tx) +{ + const __m256 A = _mm256_set1_ps(-0.75f); + + const __m256 x0 = _mm256_add_ps(tx, *(__m256*)_ps256_1); + const __m256& x1 = tx; + const __m256 x2 = _mm256_sub_ps(*(__m256*)_ps256_1, tx); + //const __m256 x3 = _mm256_add_ps(x2, *(__m256*)_ps256_1); + + coeffs0 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x0), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(4), A)); + coeffs1 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x1), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x1), x1), *(__m256*)_ps256_1); + coeffs2 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x2), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x2), x2), *(__m256*)_ps256_1); + coeffs3 = _mm256_sub_ps(_mm256_sub_ps(_mm256_sub_ps(*(__m256*)_ps256_1, coeffs0), coeffs1), coeffs2); +} + +static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& dst, Mat& offset, Mat& in_bound, const Mat& value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + + __m256 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; + __m256 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; + __m256 value_f[4]; + +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + int *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; + + float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; + + for (int i = 0; i < 4; i++) + { + v0_offset_ptr[i] = offset.channel(i * 4 + 0); + v1_offset_ptr[i] = offset.channel(i * 4 + 1); + v2_offset_ptr[i] = offset.channel(i * 4 + 2); + v3_offset_ptr[i] = offset.channel(i * 4 + 3); + + v0_in_bound_ptr[i] = in_bound.channel(i * 4 + 0); + v1_in_bound_ptr[i] = in_bound.channel(i * 4 + 1); + v2_in_bound_ptr[i] = in_bound.channel(i * 4 + 2); + v3_in_bound_ptr[i] = in_bound.channel(i * 4 + 3); + } + + const float* value_x = value.channel(0); + const float* value_y = value.channel(1); + + for (int i = 0; i < grid_size; i++) + { + cubic_interp1d_p8(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm256_set1_ps(*value_x)); + for (int ii = 0; ii < 4; ii++) + { + __m256 x0_val = mask_gather_ps256(srcptr, _mm256_add_epi32(_mm256_set1_epi32(*v0_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)), _mm256_set1_ps(*v0_in_bound_ptr[ii])); + __m256 x1_val = mask_gather_ps256(srcptr, _mm256_add_epi32(_mm256_set1_epi32(*v1_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)), 
_mm256_set1_ps(*v1_in_bound_ptr[ii])); + __m256 x2_val = mask_gather_ps256(srcptr, _mm256_add_epi32(_mm256_set1_epi32(*v2_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)), _mm256_set1_ps(*v2_in_bound_ptr[ii])); + __m256 x3_val = mask_gather_ps256(srcptr, _mm256_add_epi32(_mm256_set1_epi32(*v3_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)), _mm256_set1_ps(*v3_in_bound_ptr[ii])); + + value_f[ii] = _mm256_mul_ps(x_coeffs0, x0_val); + value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); + value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); + value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); + + v0_offset_ptr[ii]++; + v1_offset_ptr[ii]++; + v2_offset_ptr[ii]++; + v3_offset_ptr[ii]++; + + v0_in_bound_ptr[ii]++; + v1_in_bound_ptr[ii]++; + v2_in_bound_ptr[ii]++; + v3_in_bound_ptr[ii]++; + } + + cubic_interp1d_p8(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm256_set1_ps(*value_y)); + + __m256 _v = _mm256_mul_ps(y_coeffs0, value_f[0]); + _v = _mm256_comp_fmadd_ps(y_coeffs1, value_f[1], _v); + _v = _mm256_comp_fmadd_ps(y_coeffs2, value_f[2], _v); + _v = _mm256_comp_fmadd_ps(y_coeffs3, value_f[3], _v); + _mm256_storeu_ps(dstptr, _v); + + value_x++; + value_y++; + + dstptr += 8; + } + + } +} +#endif // __AVX__ \ No newline at end of file diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 5c6089cfc69..421f9492423 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -21,15 +21,19 @@ struct gridsample_2d_bilinear_compute_blob #if __AVX__ const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vElempackf = _mm256_set1_ps(src.elempack); #if __AVX2__ const __m256i vImgWi = _mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #endif // __AVX2__ #endif // __AVX__ - int* offset_ptr = offset.channel(0); + int* offset_ptr_00 = offset.channel(0); + int* offset_ptr_01 = offset.channel(1); + int* offset_ptr_10 = offset.channel(2); + int* offset_ptr_11 = offset.channel(3); - float* in_bound_ptr_00 = in_bound.channel(0); float* in_bound_ptr_01 = in_bound.channel(1); float* in_bound_ptr_10 = in_bound.channel(2); float* in_bound_ptr_11 = in_bound.channel(3); @@ -49,8 +53,8 @@ struct gridsample_2d_bilinear_compute_blob #if __AVX__ for (int x = 0; x + 15 < nn; x += 16) { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + __m256 tmp_x = _mm256_loadu_ps(gridptr); + __m256 gy = _mm256_loadu_ps(gridptr + 8); __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); @@ -70,7 +74,7 @@ struct gridsample_2d_bilinear_compute_blob gy = get_coord(vImgHf, gy); } - __m256 x_w = _mm256_floor_ps(gx); + __m256 x_w = _mm256_floor_ps(gx); __m256 y_n = _mm256_floor_ps(gy); #if __AVX2__ @@ -84,9 +88,11 @@ struct gridsample_2d_bilinear_compute_blob __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + __m256i i_nw_offset = _mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, 
vElempacki); - _mm256_storeu_epi32(in_bound_ptr_00, *(__m256i*)_pi32_256_1); _mm256_storeu_epi32(in_bound_ptr_01, x1_in_range); _mm256_storeu_epi32(in_bound_ptr_10, y1_in_range); _mm256_storeu_epi32(in_bound_ptr_11, v11_in_range); @@ -99,20 +105,25 @@ struct gridsample_2d_bilinear_compute_blob __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); - __m256 ne_offset = _mm256_add_ps(nw_offset, *(__m256*)_ps256_1); - __m256 sw_offset = _mm256_add_ps(nw_offset, vImgWf); - __m256 se_offset = _mm256_add_ps(sw_offset, *(__m256*)_ps256_1); + __m256 nw_offset = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w), vElempackf); + __m256 ne_offset = _mm256_add_ps(nw_offset, vElempackf); + __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - - _mm256_storeu_ps(in_bound_ptr_00, *(__m256*)_ps256_n1); + __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); + __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); + __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); + _mm256_storeu_ps(in_bound_ptr_01, x1_in_range); _mm256_storeu_ps(in_bound_ptr_10, y1_in_range); _mm256_storeu_ps(in_bound_ptr_11, v11_in_range); #endif - _mm256_storeu_epi32(offset_ptr, i_nw_offset); + _mm256_storeu_epi32(offset_ptr_00, i_nw_offset); + _mm256_storeu_epi32(offset_ptr_01, i_ne_offset); + _mm256_storeu_epi32(offset_ptr_10, i_sw_offset); + _mm256_storeu_epi32(offset_ptr_11, i_se_offset); __m256 alpha = _mm256_sub_ps(gx, x_w); __m256 beta = _mm256_sub_ps(gy, y_n); @@ -122,9 +133,11 @@ struct gridsample_2d_bilinear_compute_blob gridptr += 16; - offset_ptr += 8; + offset_ptr_00 += 8; + offset_ptr_01 += 8; + offset_ptr_10 += 8; + offset_ptr_11 += 8; - in_bound_ptr_00 += 8; in_bound_ptr_01 += 8; in_bound_ptr_10 += 8; in_bound_ptr_11 += 8; @@ -154,21 +167,28 @@ struct gridsample_2d_bilinear_compute_blob int x1 = x0 + 1; int y1 = y0 + 1; - *in_bound_ptr_00 = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h); - *in_bound_ptr_01 = (x1 > -1) & (x1 < src.w) & (y0 > -1) & (y0 < src.h); - *in_bound_ptr_10 = (x0 > -1) & (x0 < src.w) & (y1 > -1) & (y1 < src.h); - *in_bound_ptr_11 = (x1 > -1) & (x1 < src.w) & (y1 > -1) & (y1 < src.h); + bool x1_in_bound = (x1 > -1) & (x1 < src.w); + bool y1_in_bound = (y1 > -1) & (y1 < src.h); + + *in_bound_ptr_01 = x1_in_bound ? 0xFFFFFFFF : 0.0f; + *in_bound_ptr_10 = y1_in_bound ? 0xFFFFFFFF : 0.0f; + *in_bound_ptr_11 = (x1_in_bound & y1_in_bound) ? 
0xFFFFFFFF : 0.0f; - *offset_ptr = x0 + y0 * src.w; + *offset_ptr_00 = (x0 + y0 * src.w) * src.elempack; + *offset_ptr_01 = (x1 + y0 * src.w) * src.elempack; + *offset_ptr_10 = (x0 + y1 * src.w) * src.elempack; + *offset_ptr_11 = (x1 + y1 * src.w) * src.elempack; *value_ptr_alpha = sample_x - x0; *value_ptr_beta = sample_y - y0; gridptr += 2; - offset_ptr++; + offset_ptr_00++; + offset_ptr_01++; + offset_ptr_10++; + offset_ptr_11++; - in_bound_ptr_00++; in_bound_ptr_01++; in_bound_ptr_10++; in_bound_ptr_11++; @@ -215,9 +235,11 @@ struct gridsample_2d_bilinear_compute_blob __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + __m256i i_nw_offset = _mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); - _mm256_storeu_epi32(in_bound_ptr_00, *(__m256i*)_pi32_256_1); _mm256_storeu_epi32(in_bound_ptr_01, x1_in_range); _mm256_storeu_epi32(in_bound_ptr_10, y1_in_range); _mm256_storeu_epi32(in_bound_ptr_11, v11_in_range); @@ -230,17 +252,25 @@ struct gridsample_2d_bilinear_compute_blob __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); + __m256 nw_offset = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w), vElempackf); + __m256 ne_offset = _mm256_add_ps(nw_offset, vElempackf); + __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); + __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); + __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); + __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); - _mm256_storeu_ps(in_bound_ptr_00, *(__m256*)_ps256_n1); _mm256_storeu_ps(in_bound_ptr_01, x1_in_range); _mm256_storeu_ps(in_bound_ptr_10, y1_in_range); _mm256_storeu_ps(in_bound_ptr_11, v11_in_range); #endif - _mm256_storeu_epi32(offset_ptr, i_nw_offset); + _mm256_storeu_epi32(offset_ptr_00, i_nw_offset); + _mm256_storeu_epi32(offset_ptr_01, i_ne_offset); + _mm256_storeu_epi32(offset_ptr_10, i_sw_offset); + _mm256_storeu_epi32(offset_ptr_11, i_se_offset); __m256 alpha = _mm256_sub_ps(gx, x_w); __m256 beta = _mm256_sub_ps(gy, y_n); @@ -251,9 +281,11 @@ struct gridsample_2d_bilinear_compute_blob gridptr_x += 8; gridptr_y += 8; - offset_ptr += 8; + offset_ptr_00 += 8; + offset_ptr_01 += 8; + offset_ptr_10 += 8; + offset_ptr_11 += 8; - in_bound_ptr_00 += 8; in_bound_ptr_01 += 8; in_bound_ptr_10 += 8; in_bound_ptr_11 += 8; @@ -283,12 +315,17 @@ struct gridsample_2d_bilinear_compute_blob int x1 = x0 + 1; int y1 = y0 + 1; - *in_bound_ptr_00 = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h); - *in_bound_ptr_01 = (x1 > -1) & (x1 < src.w) & (y0 > -1) & (y0 < src.h); - *in_bound_ptr_10 = (x0 > -1) & (x0 < src.w) & (y1 > -1) & (y1 < src.h); - *in_bound_ptr_11 = (x1 > -1) & (x1 < src.w) & (y1 > -1) & (y1 < src.h); + bool x1_in_bound = (x1 > -1) & (x1 < src.w); + bool y1_in_bound = (y1 > -1) & (y1 < src.h); + + *in_bound_ptr_01 = x1_in_bound ? 0xFFFFFFFF : 0.0f; + *in_bound_ptr_10 = y1_in_bound ? 0xFFFFFFFF : 0.0f; + *in_bound_ptr_11 = (x1_in_bound & y1_in_bound) ? 
0xFFFFFFFF : 0.0f; - *offset_ptr = x0 + y0 * src.w; + *offset_ptr_00 = (x0 + y0 * src.w) * src.elempack; + *offset_ptr_01 = (x1 + y0 * src.w) * src.elempack; + *offset_ptr_10 = (x0 + y1 * src.w) * src.elempack; + *offset_ptr_11 = (x1 + y1 * src.w) * src.elempack; *value_ptr_alpha = sample_x - x0; *value_ptr_beta = sample_y - y0; @@ -296,9 +333,11 @@ struct gridsample_2d_bilinear_compute_blob gridptr_x++; gridptr_y++; - offset_ptr++; + offset_ptr_00++; + offset_ptr_01++; + offset_ptr_10++; + offset_ptr_11++; - in_bound_ptr_00++; in_bound_ptr_01++; in_bound_ptr_10++; in_bound_ptr_11++; @@ -319,13 +358,18 @@ struct gridsample_2d_bilinear_compute_blob #if __AVX__ const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vElempackf = _mm256_set1_ps(src.elempack); #if __AVX2__ const __m256i vImgWi = _mm256_set1_epi32(src.w); const __m256i vImgHi = _mm256_set1_epi32(src.h); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); #endif // __AVX2__ #endif // __AVX__ - int* offset_ptr = offset.channel(0); + int* offset_ptr_00 = offset.channel(0); + int* offset_ptr_01 = offset.channel(1); + int* offset_ptr_10 = offset.channel(2); + int* offset_ptr_11 = offset.channel(3); float* in_bound_ptr_00 = in_bound.channel(0); float* in_bound_ptr_01 = in_bound.channel(1); @@ -346,8 +390,8 @@ struct gridsample_2d_bilinear_compute_blob #if __AVX__ for (int x = 0; x + 15 < nn; x += 16) { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + __m256 tmp_x = _mm256_loadu_ps(gridptr); + __m256 gy = _mm256_loadu_ps(gridptr + 8); __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); @@ -379,11 +423,14 @@ struct gridsample_2d_bilinear_compute_blob __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + __m256i v01_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + __m256i v10_in_range = _mm256_and_si256(x0_in_range, y1_in_range); __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + __m256i i_nw_offset = _mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); _mm256_storeu_ps(in_bound_ptr_00, _mm256_castsi256_ps(v00_in_range)); _mm256_storeu_ps(in_bound_ptr_01, _mm256_castsi256_ps(v01_in_range)); @@ -399,13 +446,19 @@ struct gridsample_2d_bilinear_compute_blob __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); - __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); - __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v10_in_range = _mm256_and_ps(x0_in_range, y1_in_range); __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); + __m256 nw_offset = 
_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w), vElempackf); + __m256 ne_offset = _mm256_add_ps(nw_offset, vElempackf); + __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); + __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); + __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); + __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); _mm256_storeu_ps(in_bound_ptr_00, v00_in_range); _mm256_storeu_ps(in_bound_ptr_01, v01_in_range); @@ -413,7 +466,10 @@ struct gridsample_2d_bilinear_compute_blob _mm256_storeu_ps(in_bound_ptr_11, v11_in_range); #endif // __AVX2__ - _mm256_storeu_epi32(offset_ptr, i_nw_offset); + _mm256_storeu_epi32(offset_ptr_00, i_nw_offset); + _mm256_storeu_epi32(offset_ptr_01, i_ne_offset); + _mm256_storeu_epi32(offset_ptr_10, i_sw_offset); + _mm256_storeu_epi32(offset_ptr_11, i_se_offset); __m256 alpha = _mm256_sub_ps(gx, x_w); __m256 beta = _mm256_sub_ps(gy, y_n); @@ -421,9 +477,15 @@ struct gridsample_2d_bilinear_compute_blob _mm256_storeu_ps(value_ptr_alpha, alpha); _mm256_storeu_ps(value_ptr_beta, beta); + _mm256_storeu_ps(value_ptr_alpha, alpha); + _mm256_storeu_ps(value_ptr_beta, beta); + gridptr += 16; - offset_ptr += 8; + offset_ptr_00 += 8; + offset_ptr_01 += 8; + offset_ptr_10 += 8; + offset_ptr_11 += 8; in_bound_ptr_00 += 8; in_bound_ptr_01 += 8; @@ -433,7 +495,6 @@ struct gridsample_2d_bilinear_compute_blob value_ptr_alpha += 8; value_ptr_beta += 8; } - nn = grid_size & 15; #endif // __AVX__ @@ -444,6 +505,7 @@ struct gridsample_2d_bilinear_compute_blob // x sample_x = unormalize(src.w, sample_x); + // y sample_y = unormalize(src.h, sample_y); @@ -452,19 +514,30 @@ struct gridsample_2d_bilinear_compute_blob int x1 = x0 + 1; int y1 = y0 + 1; - *in_bound_ptr_00 = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h); - *in_bound_ptr_01 = (x1 > -1) & (x1 < src.w) & (y0 > -1) & (y0 < src.h); - *in_bound_ptr_10 = (x0 > -1) & (x0 < src.w) & (y1 > -1) & (y1 < src.h); - *in_bound_ptr_11 = (x1 > -1) & (x1 < src.w) & (y1 > -1) & (y1 < src.h); + bool x0_in_bound = (x0 > -1) & (x0 < src.w); + bool x1_in_bound = (x1 > -1) & (x1 < src.w); + bool y0_in_bound = (y0 > -1) & (y0 < src.h); + bool y1_in_bound = (y1 > -1) & (y1 < src.h); + + *in_bound_ptr_00 = (x0_in_bound & y0_in_bound) ? 0xFFFFFFFF : 0.0f; + *in_bound_ptr_01 = (x1_in_bound & y0_in_bound) ? 0xFFFFFFFF : 0.0f; + *in_bound_ptr_10 = (x0_in_bound & y1_in_bound) ? 0xFFFFFFFF : 0.0f; + *in_bound_ptr_11 = (x1_in_bound & y1_in_bound) ? 
0xFFFFFFFF : 0.0f; - *offset_ptr = x0 + y0 * src.w; + *offset_ptr_00 = (x0 + y0 * src.w) * src.elempack; + *offset_ptr_01 = (x1 + y0 * src.w) * src.elempack; + *offset_ptr_10 = (x0 + y1 * src.w) * src.elempack; + *offset_ptr_11 = (x1 + y1 * src.w) * src.elempack; *value_ptr_alpha = sample_x - x0; *value_ptr_beta = sample_y - y0; gridptr += 2; - offset_ptr++; + offset_ptr_00++; + offset_ptr_01++; + offset_ptr_10++; + offset_ptr_11++; in_bound_ptr_00++; in_bound_ptr_01++; @@ -475,6 +548,7 @@ struct gridsample_2d_bilinear_compute_blob value_ptr_beta++; } } + } else { @@ -511,11 +585,14 @@ struct gridsample_2d_bilinear_compute_blob __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - __m256i v01_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - __m256i v10_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + __m256i v01_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + __m256i v10_in_range = _mm256_and_si256(x0_in_range, y1_in_range); __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - __m256i i_nw_offset = _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0); + __m256i i_nw_offset = _mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki); + __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); + __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); _mm256_storeu_ps(in_bound_ptr_00, _mm256_castsi256_ps(v00_in_range)); _mm256_storeu_ps(in_bound_ptr_01, _mm256_castsi256_ps(v01_in_range)); @@ -531,13 +608,19 @@ struct gridsample_2d_bilinear_compute_blob __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); - __m256 v01_in_range = _mm256_and_ps(x0_in_range, y1_in_range); - __m256 v10_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v10_in_range = _mm256_and_ps(x0_in_range, y1_in_range); __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - __m256 nw_offset = _mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w); + __m256 nw_offset = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w), vElempackf); + __m256 ne_offset = _mm256_add_ps(nw_offset, vElempackf); + __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); + __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); + __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); + __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); _mm256_storeu_ps(in_bound_ptr_00, v00_in_range); _mm256_storeu_ps(in_bound_ptr_01, v01_in_range); @@ -545,7 +628,10 @@ struct gridsample_2d_bilinear_compute_blob _mm256_storeu_ps(in_bound_ptr_11, v11_in_range); #endif // __AVX2__ - _mm256_storeu_epi32(offset_ptr, i_nw_offset); + _mm256_storeu_epi32(offset_ptr_00, i_nw_offset); + _mm256_storeu_epi32(offset_ptr_01, i_ne_offset); + _mm256_storeu_epi32(offset_ptr_10, i_sw_offset); + _mm256_storeu_epi32(offset_ptr_11, i_se_offset); __m256 alpha = _mm256_sub_ps(gx, x_w); __m256 beta = _mm256_sub_ps(gy, y_n); @@ -556,7 +642,10 @@ struct gridsample_2d_bilinear_compute_blob gridptr_x += 8; gridptr_y += 8; - offset_ptr 
+= 8; + offset_ptr_00 += 8; + offset_ptr_01 += 8; + offset_ptr_10 += 8; + offset_ptr_11 += 8; in_bound_ptr_00 += 8; in_bound_ptr_01 += 8; @@ -585,12 +674,20 @@ struct gridsample_2d_bilinear_compute_blob int x1 = x0 + 1; int y1 = y0 + 1; - *in_bound_ptr_00 = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h); - *in_bound_ptr_01 = (x1 > -1) & (x1 < src.w) & (y0 > -1) & (y0 < src.h); - *in_bound_ptr_10 = (x0 > -1) & (x0 < src.w) & (y1 > -1) & (y1 < src.h); - *in_bound_ptr_11 = (x1 > -1) & (x1 < src.w) & (y1 > -1) & (y1 < src.h); + bool x0_in_bound = (x0 > -1) & (x0 < src.w); + bool x1_in_bound = (x1 > -1) & (x1 < src.w); + bool y0_in_bound = (y0 > -1) & (y0 < src.h); + bool y1_in_bound = (y1 > -1) & (y1 < src.h); + + *in_bound_ptr_00 = (x0_in_bound & y0_in_bound) ? 0xFFFFFFFF : 0.0f; + *in_bound_ptr_01 = (x1_in_bound & y0_in_bound) ? 0xFFFFFFFF : 0.0f; + *in_bound_ptr_10 = (x0_in_bound & y1_in_bound) ? 0xFFFFFFFF : 0.0f; + *in_bound_ptr_11 = (x1_in_bound & y1_in_bound) ? 0xFFFFFFFF : 0.0f; - *offset_ptr = x0 + y0 * src.w; + *offset_ptr_00 = (x0 + y0 * src.w) * src.elempack; + *offset_ptr_01 = (x1 + y0 * src.w) * src.elempack; + *offset_ptr_10 = (x0 + y1 * src.w) * src.elempack; + *offset_ptr_11 = (x1 + y1 * src.w) * src.elempack; *value_ptr_alpha = sample_x - x0; *value_ptr_beta = sample_y - y0; @@ -598,7 +695,10 @@ struct gridsample_2d_bilinear_compute_blob gridptr_x++; gridptr_y++; - offset_ptr++; + offset_ptr_00++; + offset_ptr_01++; + offset_ptr_10++; + offset_ptr_11++; in_bound_ptr_00++; in_bound_ptr_01++; @@ -610,4 +710,78 @@ struct gridsample_2d_bilinear_compute_blob } } } -}; \ No newline at end of file + +}; + +#if __AVX__ +static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Mat& value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const int* offset_ptr_00 = offset.channel(0); + const int* offset_ptr_01 = offset.channel(1); + const int* offset_ptr_10 = offset.channel(2); + const int* offset_ptr_11 = offset.channel(3); + + const float* in_bound_ptr_00 = in_bound.channel(0); + const float* in_bound_ptr_01 = in_bound.channel(1); + const float* in_bound_ptr_10 = in_bound.channel(2); + const float* in_bound_ptr_11 = in_bound.channel(3); + + const float* value_ptr_alpha = value.channel(0); + const float* value_ptr_beta = value.channel(1); + + for (int i = 0; i < grid_size; i++) + { + __m256i v00_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_00), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v01_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_01), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v10_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_10), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v11_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_11), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + + __m256 v00_in_range = _mm256_set1_ps(*in_bound_ptr_00); + __m256 v01_in_range = _mm256_set1_ps(*in_bound_ptr_01); + __m256 v10_in_range = _mm256_set1_ps(*in_bound_ptr_10); + __m256 v11_in_range = _mm256_set1_ps(*in_bound_ptr_11); + + __m256 v00_val = mask_gather_ps256(srcptr, v00_offset, v00_in_range); + __m256 v01_val = mask_gather_ps256(srcptr, v01_offset, v01_in_range); + __m256 v10_val = 
mask_gather_ps256(srcptr, v10_offset, v10_in_range); + __m256 v11_val = mask_gather_ps256(srcptr, v11_offset, v11_in_range); + + __m256 alpha = _mm256_set1_ps(*value_ptr_alpha); + __m256 beta = _mm256_set1_ps(*value_ptr_beta); + + __m256 v0 = _mm256_comp_fmadd_ps(v01_val, alpha, _mm256_comp_fnmadd_ps(v00_val, alpha, v00_val)); + __m256 v1 = _mm256_comp_fmadd_ps(v11_val, alpha, _mm256_comp_fnmadd_ps(v10_val, alpha, v10_val)); + + __m256 _v = _mm256_comp_fmadd_ps(v1, beta, _mm256_comp_fnmadd_ps(v0, beta, v0)); + _mm256_storeu_ps(dstptr, _v); + + offset_ptr_00++; + offset_ptr_01++; + offset_ptr_10++; + offset_ptr_11++; + + in_bound_ptr_00++; + in_bound_ptr_01++; + in_bound_ptr_10++; + in_bound_ptr_11++; + + value_ptr_alpha++; + value_ptr_beta++; + + dstptr += 8; + } + + } +} +#endif // __AVX__ \ No newline at end of file diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index c93bf2f1a52..76b13c68f5b 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -21,10 +21,7 @@ struct gridsample_2d_nearest_compute_blob #if __AVX__ const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); -#if __AVX2__ - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); -#endif // __AVX2__ + const __m256 vElempackf = _mm256_set1_ps(src.elempack); #endif // __AVX__ int* offset_ptr = offset.channel(0); @@ -41,8 +38,8 @@ struct gridsample_2d_nearest_compute_blob #if __AVX__ for (int x = 0; x + 15 < nn; x += 16) { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + __m256 tmp_x = _mm256_loadu_ps(gridptr); + __m256 gy = _mm256_loadu_ps(gridptr + 8); __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); @@ -65,7 +62,7 @@ struct gridsample_2d_nearest_compute_blob gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); + __m256 offset = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf); __m256i i_offset = _mm256_cvtps_epi32(offset); _mm256_storeu_epi32(offset_ptr, i_offset); @@ -88,13 +85,13 @@ struct gridsample_2d_nearest_compute_blob sample_x = get_coord(src.w, sample_x); // y - sample_y = unormalize(src.h, sample_x); - sample_y = get_coord(src.h, sample_x); + sample_y = unormalize(src.h, sample_y); + sample_y = get_coord(src.h, sample_y); int x0 = static_cast(floor(sample_x + 0.5f)); int y0 = static_cast(floor(sample_y + 0.5f)); - *offset_ptr = x0 + y0 * src.w; + *offset_ptr = (x0 + y0 * src.w) * src.elempack; gridptr += 2; @@ -128,7 +125,7 @@ struct gridsample_2d_nearest_compute_blob gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); + __m256 offset = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf); __m256i i_offset = _mm256_cvtps_epi32(offset); _mm256_storeu_epi32(offset_ptr, i_offset); @@ -152,13 +149,13 @@ struct gridsample_2d_nearest_compute_blob sample_x = get_coord(src.w, sample_x); // y - sample_y = unormalize(src.h, sample_x); - sample_y = get_coord(src.h, sample_x); + sample_y = unormalize(src.h, sample_y); + sample_y = get_coord(src.h, sample_y); int x0 = static_cast(floor(sample_x + 0.5f)); int 
y0 = static_cast(floor(sample_y + 0.5f)); - *offset_ptr = x0 + y0 * src.w; + *offset_ptr = (x0 + y0 * src.w) * src.elempack; gridptr_x++; gridptr_y++; @@ -178,10 +175,7 @@ struct gridsample_2d_nearest_compute_blob #if __AVX__ const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); -#if __AVX2__ - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); -#endif // __AVX2__ + const __m256 vElempackf = _mm256_set1_ps(src.elempack); #endif // __AVX__ int* offset_ptr = offset.channel(0); @@ -199,8 +193,8 @@ struct gridsample_2d_nearest_compute_blob #if __AVX__ for (int x = 0; x + 15 < nn; x += 16) { - __m256 tmp_x = _mm256_loadu_ps(gridptr + x); - __m256 gy = _mm256_loadu_ps(gridptr + x + 8); + __m256 tmp_x = _mm256_loadu_ps(gridptr); + __m256 gy = _mm256_loadu_ps(gridptr + 8); __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); @@ -223,7 +217,7 @@ struct gridsample_2d_nearest_compute_blob __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); - __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); + __m256 offset = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf); __m256i i_offset = _mm256_cvtps_epi32(offset); _mm256_storeu_ps(in_bound_ptr, v_in_range); @@ -245,13 +239,13 @@ struct gridsample_2d_nearest_compute_blob // x sample_x = unormalize(src.w, sample_x); // y - sample_y = unormalize(src.h, sample_x); + sample_y = unormalize(src.h, sample_y); int x0 = static_cast(floor(sample_x + 0.5f)); int y0 = static_cast(floor(sample_y + 0.5f)); - *in_bound_ptr = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h); - *offset_ptr = x0 + y0 * src.w; + *in_bound_ptr = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h)) ? 0xFFFFFFFF : 0.0f; + *offset_ptr = (x0 + y0 * src.w) * src.elempack; gridptr += 2; offset_ptr++; @@ -285,7 +279,7 @@ struct gridsample_2d_nearest_compute_blob __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); - __m256 offset = _mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx); + __m256 offset = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf); __m256i i_offset = _mm256_cvtps_epi32(offset); _mm256_storeu_ps(in_bound_ptr, v_in_range); @@ -308,14 +302,14 @@ struct gridsample_2d_nearest_compute_blob // x sample_x = unormalize(src.w, sample_x); // y - sample_y = unormalize(src.h, sample_x); + sample_y = unormalize(src.h, sample_y); int x0 = static_cast(floor(sample_x + 0.5f)); int y0 = static_cast(floor(sample_y + 0.5f)); - *in_bound_ptr = (x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h); + *in_bound_ptr = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h)) ? 
0xFFFFFFFF : 0.0f; - *offset_ptr = x0 + y0 * src.w; + *offset_ptr = (x0 + y0 * src.w) * src.elempack; gridptr_x++; gridptr_y++; @@ -328,3 +322,37 @@ struct gridsample_2d_nearest_compute_blob } }; +#if __AVX__ +static void gridsample_2d_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const int* offset_ptr = offset.channel(0); + + const float* in_bound_ptr = in_bound.channel(0); + + for (int i = 0; i < grid_size; i++) + { + __m256 _v = mask_gather_ps256(srcptr, _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)), _mm256_set1_ps(*in_bound_ptr)); + + _mm256_storeu_ps(dstptr, _v); + + offset_ptr++; + + in_bound_ptr++; + + dstptr += 8; + } + } +} + +#endif // __AVX__ \ No newline at end of file diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 26e125a54ec..c9afd483d3c 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -43,7 +43,86 @@ _PS256_CONST(n1, -1.0f); _PS256_CONST(2, 2.0f); _PI32_CONST256(n1, -1); -using PaddingMode = ncnn::GridSample::PaddingMode; +static NCNN_FORCEINLINE __m256 mask_gather_ps256(const float* ptr, __m256i offset, __m256 mask) +{ +#if __AVX2__ + __m256 v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, offset, mask, sizeof(float)); +#else + int offseti[8], maski[8]; + memcpy(offseti, &offset, 8 * sizeof(int)); + memcpy(maski, &mask, 8 * sizeof(int)); + + float data[8] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + for (int i = 0; i < 8; i++) + { + if (maski[i] & 0xF0000000) + { + data[i] = *(ptr + offseti[i]); + } + } + + __m256 v = _mm256_loadu_ps(data); +#endif // __AVX2__ + + return v; +} + +#endif // __AVX__ + +const __m128 v1fp4 = _mm_set1_ps(1.0f); +const __m128 vn1fp4 = _mm_set1_ps(-1.0f); +const __m128i v1ip4 = _mm_set1_epi32(1); +const __m128i vn1ip4 = _mm_set1_epi32(-1); + +static NCNN_FORCEINLINE __m128 mask_gather_ps(const float* ptr, __m128i offset, __m128 mask) +{ +#if __AVX2__ + __m128 v = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, offset, mask, sizeof(float)); +#else + int offseti[4], maski[4]; + memcpy(offseti, &offset, 4 * sizeof(int)); + memcpy(maski, &mask, 4 * sizeof(int)); + + float data[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + for (int i = 0; i < 4; i++) + { + if (maski[i] & 0xF0000000) + { + data[i] = *(ptr + offseti[i]); + } + } + + __m128 v = _mm_loadu_ps(data); +#endif // __AVX__ + + return v; +} + +static inline void interpolate_cubic(float fx, float* coeffs) +{ + const float A = -0.75f; + + float fx0 = fx + 1; + float fx1 = fx; + float fx2 = 1 - fx; + // float fx3 = 2 - fx; + + coeffs[0] = A * fx0 * fx0 * fx0 - 5 * A * fx0 * fx0 + 8 * A * fx0 - 4 * A; + coeffs[1] = (A + 2) * fx1 * fx1 * fx1 - (A + 3) * fx1 * fx1 + 1; + coeffs[2] = (A + 2) * fx2 * fx2 * fx2 - (A + 3) * fx2 * fx2 + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} + +static inline float reflect_coord(float x, int high) +{ + x = abs(x); + x = high - abs(x - high); + return x; +} + +#endif // __SSE2__ + +typedef GridSample::PaddingMode PaddingMode; template struct grid_sample_unormalize; @@ -54,7 +133,7 @@ struct grid_sample_unormalize #if __AVX__ __m256 operator()(__m256 length, __m256 coord) { 
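        // Illustrative note (assuming the PyTorch grid_sample convention): the grid stores
        // coordinates normalized to [-1, 1], and unnormalize maps them into pixel space:
        //   align_corners == true  : x = (coord + 1) / 2 * (length - 1)
        //   align_corners == false : x = ((coord + 1) * length - 1) / 2
        // The vectorized return below computes the align_corners == true form lane-wise.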
- return _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(coord, *(__m256*)_ps256_1), length, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coord, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(length, *(__m256*)_ps256_1)); } #endif // __AVX__ float operator()(int length, float coord) @@ -163,107 +242,6 @@ struct compute_coord #include "gridsample_bicubic_compute_blob.h" #include "gridsample_nearest_compute_blob.h" -static NCNN_FORCEINLINE __m256 mask_gather_ps256(const float* ptr, __m256i offset, __m256 mask) -{ -#if __AVX2__ - __m256 v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, offset, mask, sizeof(float)); -#else - int offseti[8], maski[8]; - memcpy(offseti, &offset, 8 * sizeof(int)); - memcpy(maski, &mask, 8 * sizeof(int)); - - float data[8] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; - for (int i = 0; i < 8; i++) - { - if (maski[i] & 0xF0000000) - { - data[i] = *(ptr + offseti[i]); - } - } - - __m256 v = _mm256_loadu_ps(data); -#endif // __AVX__ - - return v; -} - -static NCNN_FORCEINLINE __m256 cubic_interp1d_p8(const __m256& x0_v, const __m256& x1_v, const __m256& x2_v, const __m256& x3_v, const __m256& tx) -{ - const __m256 A = _mm256_set1_ps(-0.75f); - - const __m256 x0 = _mm256_add_ps(tx, *(__m256*)_ps256_1); - const __m256& x1 = tx; - const __m256 x2 = _mm256_sub_ps(*(__m256*)_ps256_1, tx); - //const __m256 x3 = _mm256_add_ps(x2, *(__m256*)_ps256_1); - - const __m256 coeffs0 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x0), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(4), A)); - const __m256 coeffs1 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x1), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x1), x1), *(__m256*)_ps256_1); - const __m256 coeffs2 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x2), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x2), x2), *(__m256*)_ps256_1); - const __m256 coeffs3 = _mm256_sub_ps(_mm256_sub_ps(_mm256_sub_ps(*(__m256*)_ps256_1, coeffs0), coeffs1), coeffs2); - - __m256 _v = _mm256_mul_ps(coeffs0, x0_v); - _v = _mm256_comp_fmadd_ps(coeffs1, x1_v, _v); - _v = _mm256_comp_fmadd_ps(coeffs2, x2_v, _v); - _v = _mm256_comp_fmadd_ps(coeffs3, x3_v, _v); - - return _v; -} - -#endif // __AVX__ - -const __m128 v1fp4 = _mm_set1_ps(1.0f); -const __m128 vn1fp4 = _mm_set1_ps(-1.0f); -const __m128i v1ip4 = _mm_set1_epi32(1); -const __m128i vn1ip4 = _mm_set1_epi32(-1); - -static NCNN_FORCEINLINE __m128 mask_gather_ps(const float* ptr, __m128i offset, __m128 mask) -{ -#if __AVX2__ - __m128 v = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, offset, mask, sizeof(float)); -#else - int offseti[4], maski[4]; - memcpy(offseti, &offset, 4 * sizeof(int)); - memcpy(maski, &mask, 4 * sizeof(int)); - - float data[4] = {0.0f, 0.0f, 0.0f, 0.0f}; - for (int i = 0; i < 4; i++) - { - if (maski[i] & 0xF0000000) - { - data[i] = *(ptr + offseti[i]); - } - } - - __m128 v = _mm_loadu_ps(data); -#endif // __AVX__ - - return v; -} - -static inline void interpolate_cubic(float fx, float* coeffs) -{ - const float A = -0.75f; - - float fx0 = fx + 1; - float fx1 = fx; - float fx2 = 1 - fx; - // float fx3 = 2 - fx; - - coeffs[0] = A * fx0 * fx0 * fx0 - 5 * A * fx0 * fx0 + 8 * A * fx0 - 4 * A; - coeffs[1] = (A + 2) * fx1 * fx1 * fx1 - (A + 3) * fx1 * fx1 + 1; - coeffs[2] = (A + 2) * fx2 * 
fx2 * fx2 - (A + 3) * fx2 * fx2 + 1; - coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; -} - -static inline float reflect_coord(float x, int high) -{ - x = abs(x); - x = high - abs(x - high); - return x; -} - -#endif // __SSE2__ - int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { const Mat& bottom_blob = bottom_blobs[0]; @@ -275,12 +253,10 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector weights(0); @@ -31,9 +32,9 @@ static int test_gridsample(const ncnn::Mat& a, const ncnn::Mat& grid, int sample int ret = test_layer("GridSample", pd, weights, as); if (ret != 0) { - fprintf(stderr, "test_gridsample failed a.dims=%d a=(%d %d %d %d) grid.dims=%d grid=(%d %d %d %d) sample_type=%d padding_mode=%d align_corner=%d", + fprintf(stderr, "test_gridsample failed a.dims=%d a=(%d %d %d %d) grid.dims=%d grid=(%d %d %d %d) sample_type=%d padding_mode=%d align_corner=%d permute_fusion=%d", a.dims, a.w, a.h, a.d, a.c, grid.dims, grid.w, grid.h, grid.d, grid.c, - sample_type, padding_mode, align_corner); + sample_type, padding_mode, align_corner, permute_fusion); } return ret; @@ -42,81 +43,141 @@ static int test_gridsample(const ncnn::Mat& a, const ncnn::Mat& grid, int sample static int test_gridsample_0() { return 0 - || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 1, 1, 0) - || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 1, 1, 1) - || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 1, 2, 0) - || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 1, 2, 1) - || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 1, 3, 0) - || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 1, 3, 1) - || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 2, 1, 0) - || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 2, 1, 1) - || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 2, 2, 0) - || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 2, 2, 1) - || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 2, 3, 0) - || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 2, 3, 1) - || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 3, 1, 0) - || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 3, 1, 1) - || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 3, 2, 0) - || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 3, 2, 1) - || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 3, 3, 0) - || test_gridsample(RandomMat(16, 12, 12), RandomMat(2, 16, 12), 3, 3, 1); + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 1, 0, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 1, 1, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 2, 0, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 2, 1, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 3, 0, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 3, 1, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 2, 1, 0, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 2, 1, 1, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 2, 2, 0, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 2, 2, 
1, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 2, 3, 0, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 2, 3, 1, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 3, 1, 0, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 3, 1, 1, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 3, 2, 0, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 3, 2, 1, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 3, 3, 0, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 3, 3, 1, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 1, 1, 0, 1) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 1, 1, 1, 1) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 1, 2, 0, 1) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 1, 2, 1, 1) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 1, 3, 0, 1) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 1, 3, 1, 1) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 2, 1, 0, 1) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 2, 1, 1, 1) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 2, 2, 0, 1) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 2, 2, 1, 1) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 2, 3, 0, 1) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 2, 3, 1, 1) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 3, 1, 0, 1) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 3, 1, 1, 1) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 3, 2, 0, 1) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 3, 2, 1, 1) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 3, 3, 0, 1) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 3, 3, 1, 1); } static int test_gridsample_1() { return 0 - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 1, 1, 0) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 1, 1, 1) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 1, 2, 0) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 1, 2, 1) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 1, 3, 0) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 1, 3, 1) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 2, 1, 0) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 2, 1, 1) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 2, 2, 0) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 2, 2, 1) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 2, 3, 0) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 2, 3, 1) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 3, 1, 0) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 3, 1, 1) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 3, 2, 0) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 3, 2, 1) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 3, 3, 0) - || test_gridsample(RandomMat(16, 12, 16), RandomMat(2, 27, 21), 3, 3, 1); + || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 1, 1, 0, 0) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 1, 1, 1, 0) + || 
test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 1, 2, 0, 0) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 1, 2, 1, 0) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 1, 3, 0, 0) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 1, 3, 1, 0) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 2, 1, 0, 0) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 2, 1, 1, 0) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 2, 2, 0, 0) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 2, 2, 1, 0) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 2, 3, 0, 0) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 2, 3, 1, 0) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 3, 1, 0, 0) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 3, 1, 1, 0) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 3, 2, 0, 0) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 3, 2, 1, 0) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 3, 3, 0, 0) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 3, 3, 1, 0) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 1, 1, 0, 1) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 1, 1, 1, 1) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 1, 2, 0, 1) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 1, 2, 1, 1) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 1, 3, 0, 1) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 1, 3, 1, 1) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 2, 1, 0, 1) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 2, 1, 1, 1) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 2, 2, 0, 1) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 2, 2, 1, 1) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 2, 3, 0, 1) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 2, 3, 1, 1) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 3, 1, 0, 1) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 3, 1, 1, 1) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 3, 2, 0, 1) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 3, 2, 1, 1) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 3, 3, 0, 1) + || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 3, 3, 1, 1); } static int test_gridsample_2() { return 0 - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 1, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 1, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 2, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 2, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 3, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 3, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 1, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 1, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 2, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 2, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 
21, 10), 2, 3, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 3, 1); + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 1, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 1, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 2, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 2, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 3, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 3, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 1, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 1, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 2, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 2, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 3, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 3, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 1, 1, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 1, 1, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 1, 2, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 1, 2, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 1, 3, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 1, 3, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 2, 1, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 2, 1, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 2, 2, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 2, 2, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 2, 3, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 2, 3, 1, 1); } static int test_gridsample_3() { return 0 - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 1, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 1, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 2, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 2, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 3, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 3, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 1, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 1, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 2, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 2, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 3, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 3, 1); + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 1, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 1, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 2, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 2, 1, 0) + 
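           // args after the two mats: sample_type (1=bilinear 2=nearest 3=bicubic), padding_mode (1=zeros 2=border 3=reflection), align_corner, permute_fusion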
|| test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 3, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 3, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 1, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 1, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 2, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 2, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 3, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 3, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 1, 1, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 1, 1, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 1, 2, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 1, 2, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 1, 3, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 1, 3, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 2, 1, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 2, 1, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 2, 2, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 2, 2, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 2, 3, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 2, 3, 1, 1); } int main() @@ -126,6 +187,6 @@ int main() return 0 || test_gridsample_0() || test_gridsample_1() - || test_gridsample_2() - || test_gridsample_3(); + /*|| test_gridsample_2() + || test_gridsample_3()*/; } From 5bd541c9381a0fe50ab638621c2f0b59cccc58ba Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Thu, 16 Feb 2023 16:04:32 +0000 Subject: [PATCH 062/127] apply code-format changes --- src/layer/x86/gridsample_bicubic_compute_blob.h | 11 +++++------ src/layer/x86/gridsample_bilinear_compute_blob.h | 7 ++----- src/layer/x86/gridsample_nearest_compute_blob.h | 2 +- src/layer/x86/gridsample_x86.cpp | 10 +--------- tests/test_gridsample.cpp | 3 ++- 5 files changed, 11 insertions(+), 22 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index b292e097fc0..5a42fa93f95 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -144,7 +144,7 @@ struct gridsample_2d_bicubic_compute_blob x2 = get_coord(src.w, x2); x3 = get_coord(src.w, x3); - for (int i = 0; i < 4; i ++) + for (int i = 0; i < 4; i++) { int offset_y = get_coord(src.h, y1 + i - 1) * src.w; @@ -308,7 +308,7 @@ struct gridsample_2d_bicubic_compute_blob float* value_x = value.channel(0); float* value_y = value.channel(1); - for (int i = 0; i < 4; i ++) + for (int i = 0; i < 4; i++) { v0_offset_ptr[i] = offset.channel(i * 4 + 0); v1_offset_ptr[i] = offset.channel(i * 4 + 1); @@ -431,7 +431,7 @@ struct gridsample_2d_bicubic_compute_blob bool x2_in_range = (x2 > -1) & (x2 < src.w); bool x3_in_range = (x3 > -1) & (x3 < src.w); - for (int i = 0; i < 4; i ++) + for (int i = 0; i < 4; i++) { int gy = y1 + i - 1; int offset_y = gy * src.w; @@ -639,7 +639,7 @@ static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& 
ds __m256 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; __m256 value_f[4]; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* srcptr = src.channel(q); @@ -674,7 +674,7 @@ static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& ds __m256 x1_val = mask_gather_ps256(srcptr, _mm256_add_epi32(_mm256_set1_epi32(*v1_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)), _mm256_set1_ps(*v1_in_bound_ptr[ii])); __m256 x2_val = mask_gather_ps256(srcptr, _mm256_add_epi32(_mm256_set1_epi32(*v2_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)), _mm256_set1_ps(*v2_in_bound_ptr[ii])); __m256 x3_val = mask_gather_ps256(srcptr, _mm256_add_epi32(_mm256_set1_epi32(*v3_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)), _mm256_set1_ps(*v3_in_bound_ptr[ii])); - + value_f[ii] = _mm256_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); @@ -704,7 +704,6 @@ static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& ds dstptr += 8; } - } } diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 421f9492423..185c7b0a330 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -74,7 +74,7 @@ struct gridsample_2d_bilinear_compute_blob gy = get_coord(vImgHf, gy); } - __m256 x_w = _mm256_floor_ps(gx); + __m256 x_w = _mm256_floor_ps(gx); __m256 y_n = _mm256_floor_ps(gy); #if __AVX2__ @@ -548,7 +548,6 @@ struct gridsample_2d_bilinear_compute_blob value_ptr_beta++; } } - } else { @@ -710,7 +709,6 @@ struct gridsample_2d_bilinear_compute_blob } } } - }; #if __AVX__ @@ -721,7 +719,7 @@ static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d const int outh = dst.h; const int grid_size = outw * outh; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* srcptr = src.channel(q); @@ -781,7 +779,6 @@ static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d dstptr += 8; } - } } #endif // __AVX__ \ No newline at end of file diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index 50caee89b35..a29ef6d869b 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -330,7 +330,7 @@ static void gridsample_2d_nearest_apply_interpolation_p8(const Mat& src, Mat& ds const int outh = dst.h; const int grid_size = outw * outh; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* srcptr = src.channel(q); diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index ee7a63b00a6..56ef3b49f5a 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -579,7 +579,6 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Fri, 17 Feb 2023 03:44:00 +0800 Subject: [PATCH 063/127] [WIP]fix bug --- .../x86/gridsample_bicubic_compute_blob.h | 54 +++++++++++++++---- 
.../x86/gridsample_bilinear_compute_blob.h | 28 +++++----- .../x86/gridsample_nearest_compute_blob.h | 4 +- src/layer/x86/gridsample_x86.cpp | 6 +-- 4 files changed, 63 insertions(+), 29 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index 5a42fa93f95..1981b91c2c8 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -25,13 +25,19 @@ struct gridsample_2d_bicubic_compute_blob #endif // __AVX__ int *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; - + float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; for (int i = 0; i < 4; i++) { v0_offset_ptr[i] = offset.channel(i * 4 + 0); v1_offset_ptr[i] = offset.channel(i * 4 + 1); v2_offset_ptr[i] = offset.channel(i * 4 + 2); v3_offset_ptr[i] = offset.channel(i * 4 + 3); + + v0_in_bound_ptr[i] = in_bound.channel(i * 4 + 0); + v1_in_bound_ptr[i] = in_bound.channel(i * 4 + 1); + v2_in_bound_ptr[i] = in_bound.channel(i * 4 + 2); + v3_in_bound_ptr[i] = in_bound.channel(i * 4 + 3); + } float* value_x = value.channel(0); @@ -101,6 +107,16 @@ struct gridsample_2d_bicubic_compute_blob _mm256_storeu_epi32(v2_offset_ptr[i], _mm256_cvtps_epi32(v2_offset_f)); _mm256_storeu_epi32(v3_offset_ptr[i], _mm256_cvtps_epi32(v3_offset_f)); + _mm256_storeu_ps(v0_in_bound_ptr[i], *(__m256*)_ps256_n1); + _mm256_storeu_ps(v1_in_bound_ptr[i], *(__m256*)_ps256_n1); + _mm256_storeu_ps(v2_in_bound_ptr[i], *(__m256*)_ps256_n1); + _mm256_storeu_ps(v3_in_bound_ptr[i], *(__m256*)_ps256_n1); + + v0_in_bound_ptr[i] += 8; + v1_in_bound_ptr[i] += 8; + v2_in_bound_ptr[i] += 8; + v3_in_bound_ptr[i] += 8; + v0_offset_ptr[i] += 8; v1_offset_ptr[i] += 8; v2_offset_ptr[i] += 8; @@ -153,6 +169,11 @@ struct gridsample_2d_bicubic_compute_blob *v2_offset_ptr[i] = (offset_y + x2) * src.elempack; *v3_offset_ptr[i] = (offset_y + x3) * src.elempack; + *v0_in_bound_ptr[i]++ = -1.0f; + *v1_in_bound_ptr[i]++ = -1.0f; + *v2_in_bound_ptr[i]++ = -1.0f; + *v3_in_bound_ptr[i]++ = -1.0f; + v0_offset_ptr[i]++; v1_offset_ptr[i]++; v2_offset_ptr[i]++; @@ -220,6 +241,16 @@ struct gridsample_2d_bicubic_compute_blob _mm256_storeu_epi32(v2_offset_ptr[i], _mm256_cvtps_epi32(v2_offset_f)); _mm256_storeu_epi32(v3_offset_ptr[i], _mm256_cvtps_epi32(v3_offset_f)); + _mm256_storeu_ps(v0_in_bound_ptr[i], *(__m256*)_ps256_n1); + _mm256_storeu_ps(v1_in_bound_ptr[i], *(__m256*)_ps256_n1); + _mm256_storeu_ps(v2_in_bound_ptr[i], *(__m256*)_ps256_n1); + _mm256_storeu_ps(v3_in_bound_ptr[i], *(__m256*)_ps256_n1); + + v0_in_bound_ptr[i] += 8; + v1_in_bound_ptr[i] += 8; + v2_in_bound_ptr[i] += 8; + v3_in_bound_ptr[i] += 8; + v0_offset_ptr[i] += 8; v1_offset_ptr[i] += 8; v2_offset_ptr[i] += 8; @@ -273,6 +304,11 @@ struct gridsample_2d_bicubic_compute_blob *v2_offset_ptr[i] = (offset_y + x2) * src.elempack; *v3_offset_ptr[i] = (offset_y + x3) * src.elempack; + *v0_in_bound_ptr[i]++ = -1.0f; + *v1_in_bound_ptr[i]++ = -1.0f; + *v2_in_bound_ptr[i]++ = -1.0f; + *v3_in_bound_ptr[i]++ = -1.0f; + v0_offset_ptr[i]++; v1_offset_ptr[i]++; v2_offset_ptr[i]++; @@ -438,10 +474,10 @@ struct gridsample_2d_bicubic_compute_blob bool y_in_range = (gy > -1) & (gy < src.h); - *v0_in_bound_ptr[i] = (x0_in_range & y_in_range) ? 0xFFFFFFFF : 0.0f; - *v1_in_bound_ptr[i] = (x1_in_range & y_in_range) ? 0xFFFFFFFF : 0.0f; - *v2_in_bound_ptr[i] = (x2_in_range & y_in_range) ? 0xFFFFFFFF : 0.0f; - *v3_in_bound_ptr[i] = (x3_in_range & y_in_range) ? 
0xFFFFFFFF : 0.0f; + *v0_in_bound_ptr[i] = (x0_in_range & y_in_range) ? -1.0f : 0.0f; + *v1_in_bound_ptr[i] = (x1_in_range & y_in_range) ? -1.0f : 0.0f; + *v2_in_bound_ptr[i] = (x2_in_range & y_in_range) ? -1.0f : 0.0f; + *v3_in_bound_ptr[i] = (x3_in_range & y_in_range) ? -1.0f : 0.0f; *v0_offset_ptr[i] = (offset_y + x0) * src.elempack; *v1_offset_ptr[i] = (offset_y + x1) * src.elempack; @@ -578,10 +614,10 @@ struct gridsample_2d_bicubic_compute_blob bool y_in_range = (gy > -1) & (gy < src.h); - *v0_in_bound_ptr[i] = (x0_in_range & y_in_range) ? 0xFFFFFFFF : 0.0f; - *v1_in_bound_ptr[i] = (x1_in_range & y_in_range) ? 0xFFFFFFFF : 0.0f; - *v2_in_bound_ptr[i] = (x2_in_range & y_in_range) ? 0xFFFFFFFF : 0.0f; - *v3_in_bound_ptr[i] = (x3_in_range & y_in_range) ? 0xFFFFFFFF : 0.0f; + *v0_in_bound_ptr[i] = (x0_in_range & y_in_range) ? -1.0f : 0.0f; + *v1_in_bound_ptr[i] = (x1_in_range & y_in_range) ? -1.0f : 0.0f; + *v2_in_bound_ptr[i] = (x2_in_range & y_in_range) ? -1.0f : 0.0f; + *v3_in_bound_ptr[i] = (x3_in_range & y_in_range) ? -1.0f : 0.0f; *v0_offset_ptr[i] = (offset_y + x0) * src.elempack; *v1_offset_ptr[i] = (offset_y + x1) * src.elempack; diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 185c7b0a330..79ed48b5e5f 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -170,9 +170,9 @@ struct gridsample_2d_bilinear_compute_blob bool x1_in_bound = (x1 > -1) & (x1 < src.w); bool y1_in_bound = (y1 > -1) & (y1 < src.h); - *in_bound_ptr_01 = x1_in_bound ? 0xFFFFFFFF : 0.0f; - *in_bound_ptr_10 = y1_in_bound ? 0xFFFFFFFF : 0.0f; - *in_bound_ptr_11 = (x1_in_bound & y1_in_bound) ? 0xFFFFFFFF : 0.0f; + *in_bound_ptr_01 = x1_in_bound ? -1.0f : 0.0f; + *in_bound_ptr_10 = y1_in_bound ? -1.0f : 0.0f; + *in_bound_ptr_11 = (x1_in_bound & y1_in_bound) ? -1.0f : 0.0f; *offset_ptr_00 = (x0 + y0 * src.w) * src.elempack; *offset_ptr_01 = (x1 + y0 * src.w) * src.elempack; @@ -318,9 +318,9 @@ struct gridsample_2d_bilinear_compute_blob bool x1_in_bound = (x1 > -1) & (x1 < src.w); bool y1_in_bound = (y1 > -1) & (y1 < src.h); - *in_bound_ptr_01 = x1_in_bound ? 0xFFFFFFFF : 0.0f; - *in_bound_ptr_10 = y1_in_bound ? 0xFFFFFFFF : 0.0f; - *in_bound_ptr_11 = (x1_in_bound & y1_in_bound) ? 0xFFFFFFFF : 0.0f; + *in_bound_ptr_01 = x1_in_bound ? -1.0f : 0.0f; + *in_bound_ptr_10 = y1_in_bound ? -1.0f : 0.0f; + *in_bound_ptr_11 = (x1_in_bound & y1_in_bound) ? -1.0f : 0.0f; *offset_ptr_00 = (x0 + y0 * src.w) * src.elempack; *offset_ptr_01 = (x1 + y0 * src.w) * src.elempack; @@ -519,10 +519,10 @@ struct gridsample_2d_bilinear_compute_blob bool y0_in_bound = (y0 > -1) & (y0 < src.h); bool y1_in_bound = (y1 > -1) & (y1 < src.h); - *in_bound_ptr_00 = (x0_in_bound & y0_in_bound) ? 0xFFFFFFFF : 0.0f; - *in_bound_ptr_01 = (x1_in_bound & y0_in_bound) ? 0xFFFFFFFF : 0.0f; - *in_bound_ptr_10 = (x0_in_bound & y1_in_bound) ? 0xFFFFFFFF : 0.0f; - *in_bound_ptr_11 = (x1_in_bound & y1_in_bound) ? 0xFFFFFFFF : 0.0f; + *in_bound_ptr_00 = (x0_in_bound & y0_in_bound) ? -1.0f : 0.0f; + *in_bound_ptr_01 = (x1_in_bound & y0_in_bound) ? -1.0f : 0.0f; + *in_bound_ptr_10 = (x0_in_bound & y1_in_bound) ? -1.0f : 0.0f; + *in_bound_ptr_11 = (x1_in_bound & y1_in_bound) ? 
-1.0f : 0.0f; *offset_ptr_00 = (x0 + y0 * src.w) * src.elempack; *offset_ptr_01 = (x1 + y0 * src.w) * src.elempack; @@ -678,10 +678,10 @@ struct gridsample_2d_bilinear_compute_blob bool y0_in_bound = (y0 > -1) & (y0 < src.h); bool y1_in_bound = (y1 > -1) & (y1 < src.h); - *in_bound_ptr_00 = (x0_in_bound & y0_in_bound) ? 0xFFFFFFFF : 0.0f; - *in_bound_ptr_01 = (x1_in_bound & y0_in_bound) ? 0xFFFFFFFF : 0.0f; - *in_bound_ptr_10 = (x0_in_bound & y1_in_bound) ? 0xFFFFFFFF : 0.0f; - *in_bound_ptr_11 = (x1_in_bound & y1_in_bound) ? 0xFFFFFFFF : 0.0f; + *in_bound_ptr_00 = (x0_in_bound & y0_in_bound) ? -1.0f : 0.0f; + *in_bound_ptr_01 = (x1_in_bound & y0_in_bound) ? -1.0f : 0.0f; + *in_bound_ptr_10 = (x0_in_bound & y1_in_bound) ? -1.0f : 0.0f; + *in_bound_ptr_11 = (x1_in_bound & y1_in_bound) ? -1.0f : 0.0f; *offset_ptr_00 = (x0 + y0 * src.w) * src.elempack; *offset_ptr_01 = (x1 + y0 * src.w) * src.elempack; diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index a29ef6d869b..861950b8ff9 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -244,7 +244,7 @@ struct gridsample_2d_nearest_compute_blob int x0 = static_cast(floor(sample_x + 0.5f)); int y0 = static_cast(floor(sample_y + 0.5f)); - *in_bound_ptr = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h)) ? 0xFFFFFFFF : 0.0f; + *in_bound_ptr = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h)) ? -1.0f : 0.0f; *offset_ptr = (x0 + y0 * src.w) * src.elempack; gridptr += 2; @@ -307,7 +307,7 @@ struct gridsample_2d_nearest_compute_blob int x0 = static_cast(floor(sample_x + 0.5f)); int y0 = static_cast(floor(sample_y + 0.5f)); - *in_bound_ptr = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h)) ? 0xFFFFFFFF : 0.0f; + *in_bound_ptr = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h)) ? 
-1.0f : 0.0f; *offset_ptr = (x0 + y0 * src.w) * src.elempack; diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 56ef3b49f5a..8dc036ec726 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -281,7 +281,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Thu, 16 Feb 2023 19:46:22 +0000 Subject: [PATCH 064/127] apply code-format changes --- src/layer/x86/gridsample_bicubic_compute_blob.h | 1 - tests/test_gridsample.cpp | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index 1981b91c2c8..b739fe3ff3c 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -37,7 +37,6 @@ struct gridsample_2d_bicubic_compute_blob v1_in_bound_ptr[i] = in_bound.channel(i * 4 + 1); v2_in_bound_ptr[i] = in_bound.channel(i * 4 + 2); v3_in_bound_ptr[i] = in_bound.channel(i * 4 + 3); - } float* value_x = value.channel(0); diff --git a/tests/test_gridsample.cpp b/tests/test_gridsample.cpp index fd58df84e5a..6580f284617 100644 --- a/tests/test_gridsample.cpp +++ b/tests/test_gridsample.cpp @@ -188,6 +188,6 @@ int main() || test_gridsample_0() || test_gridsample_1() /*|| test_gridsample_2() - || test_gridsample_3()*/ + || test_gridsample_3()*/ ; } From ddce5d83258022cb2d511732f72014ebb58fd146 Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Thu, 23 Feb 2023 14:31:17 +0800 Subject: [PATCH 065/127] [WIP]finish 3d_compute_blob and 3d_interpolation_pack8 --- src/layer/gridsample.cpp | 28 +- .../x86/gridsample_bilinear_compute_blob.h | 1275 ++++++++++++++++- .../x86/gridsample_nearest_compute_blob.h | 391 ++++- src/layer/x86/gridsample_x86.cpp | 113 +- tests/test_gridsample.cpp | 4 +- 5 files changed, 1724 insertions(+), 87 deletions(-) diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index 8059628310f..382f84a9d23 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -409,8 +409,13 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& float sample_z = gridptr[2]; sample_x = grid_sample_unormalize(w, sample_x, align_corner); + sample_x = compute_coord(sample_x, w, padding_mode, align_corner); + sample_y = grid_sample_unormalize(h, sample_y, align_corner); + sample_y = compute_coord(sample_y, h, padding_mode, align_corner); + sample_z = grid_sample_unormalize(d, sample_z, align_corner); + sample_z = compute_coord(sample_z, d, padding_mode, align_corner); *offsetptr_x = sample_x; *offsetptr_y = sample_y; @@ -444,8 +449,13 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& float sample_z = *gridptr_z; sample_x = grid_sample_unormalize(w, sample_x, align_corner); + sample_x = compute_coord(sample_x, w, padding_mode, align_corner); + sample_y = grid_sample_unormalize(h, sample_y, align_corner); + sample_y = compute_coord(sample_y, h, padding_mode, align_corner); + sample_z = grid_sample_unormalize(d, sample_z, align_corner); + sample_z = compute_coord(sample_z, d, padding_mode, align_corner); *offsetptr_x = sample_x; *offsetptr_y = sample_y; @@ -493,14 +503,14 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& int y1 = y0 + 1; int z1 = z0 + 1; - float v000 = get_value_bounded(image, x0, y0, z0, padding_mode, align_corner); - float v001 = get_value_bounded(image, x1, y0, z0, padding_mode, 
align_corner); - float v010 = get_value_bounded(image, x0, y1, z0, padding_mode, align_corner); - float v011 = get_value_bounded(image, x1, y1, z0, padding_mode, align_corner); - float v100 = get_value_bounded(image, x0, y0, z1, padding_mode, align_corner); - float v101 = get_value_bounded(image, x1, y0, z1, padding_mode, align_corner); - float v110 = get_value_bounded(image, x0, y1, z1, padding_mode, align_corner); - float v111 = get_value_bounded(image, x1, y1, z1, padding_mode, align_corner); + float v000 = get_value_bounded(image, x0, y0, z0); + float v001 = get_value_bounded(image, x1, y0, z0); + float v010 = get_value_bounded(image, x0, y1, z0); + float v011 = get_value_bounded(image, x1, y1, z0); + float v100 = get_value_bounded(image, x0, y0, z1); + float v101 = get_value_bounded(image, x1, y0, z1); + float v110 = get_value_bounded(image, x0, y1, z1); + float v111 = get_value_bounded(image, x1, y1, z1); float alpha = sample_x - x0; float beta = sample_y - y0; @@ -553,7 +563,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& int y0 = static_cast(floor(sample_y + 0.5f)); int z0 = static_cast(floor(sample_z + 0.5f)); - float v = get_value_bounded(image, x0, y0, z0, padding_mode, align_corner); + float v = get_value_bounded(image, x0, y0, z0); outptr[0] = v; outptr += 1; diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 79ed48b5e5f..3a5752dade3 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -105,9 +105,9 @@ struct gridsample_2d_bilinear_compute_blob __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - __m256 nw_offset = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w), vElempackf); + __m256 nw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(y_n, vImgWf, x_w), vElempackf); __m256 ne_offset = _mm256_add_ps(nw_offset, vElempackf); - __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 sw_offset = _mm256_comp_fmadd_ps(vImgWf, vElempackf, nw_offset); __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); @@ -450,9 +450,9 @@ struct gridsample_2d_bilinear_compute_blob __m256 v10_in_range = _mm256_and_ps(x0_in_range, y1_in_range); __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - __m256 nw_offset = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w), vElempackf); + __m256 nw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(y_n, vImgWf, x_w), vElempackf); __m256 ne_offset = _mm256_add_ps(nw_offset, vElempackf); - __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 sw_offset = _mm256_comp_fmadd_ps(vImgWf, vElempackf, nw_offset); __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); @@ -611,9 +611,9 @@ struct gridsample_2d_bilinear_compute_blob __m256 v10_in_range = _mm256_and_ps(x0_in_range, y1_in_range); __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - __m256 nw_offset = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(y_n, vImgWf), x_w), vElempackf); + __m256 nw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(y_n, vImgWf, x_w), vElempackf); __m256 ne_offset = _mm256_add_ps(nw_offset, vElempackf); - __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 sw_offset = _mm256_comp_fmadd_ps(vImgWf, vElempackf, nw_offset); __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); 
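            // In scalar terms, with x_w = floor(gx) and y_n = floor(gy), the four offsets above are
            //   nw = (y_n * w + x_w) * elempack    ne = nw + elempack
            //   sw = nw + w * elempack             se = sw + elempack
            // i.e. the flattened float index of each corner of the 2x2 neighbourhood, pre-scaled by
            // elempack so the pack-8 gather can add per-lane offsets 0..7 to the base directly.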
__m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); @@ -711,6 +711,1161 @@ struct gridsample_2d_bilinear_compute_blob } }; +template +struct gridsample_3d_bilinear_compute_blob +{ + void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) + { + const int grid_size = grid.w * grid.h * grid.d; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); + const __m256 vElempackf = _mm256_set1_ps(src.elempack); +#if __AVX2__ + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); + const __m256i vImgDi = _mm256_set1_epi32(src.d); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); +#endif // __AVX2__ +#endif // __AVX__ + + int* offset_ptr_000 = offset.channel(0); + int* offset_ptr_001 = offset.channel(1); + int* offset_ptr_010 = offset.channel(2); + int* offset_ptr_011 = offset.channel(3); + + int* offset_ptr_100 = offset.channel(4); + int* offset_ptr_101 = offset.channel(5); + int* offset_ptr_110 = offset.channel(6); + int* offset_ptr_111 = offset.channel(7); + + float* in_bound_ptr_000 = in_bound.channel(0); + float* in_bound_ptr_001 = in_bound.channel(1); + float* in_bound_ptr_010 = in_bound.channel(2); + float* in_bound_ptr_011 = in_bound.channel(3); + float* in_bound_ptr_100 = in_bound.channel(4); + float* in_bound_ptr_101 = in_bound.channel(5); + float* in_bound_ptr_110 = in_bound.channel(6); + float* in_bound_ptr_111 = in_bound.channel(7); + + float* value_ptr_alpha = value.channel(0); + float* value_ptr_beta = value.channel(1); + float* value_ptr_gamma = value.channel(2); + + grid_sample_unormalize unormalize; + compute_coord get_coord; + + if (permute_fusion == 0) + { + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 23 < nn; x += 24) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr); + __m256 tmp_y = _mm256_loadu_ps(gridptr + 8); + __m256 gz = _mm256_loadu_ps(gridptr + 16); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); + __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); + gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); + + tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); + tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); + + gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); + gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); + gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); + + // compute coord + { + gx = unormalize(vImgWf, gx); + gx = get_coord(vImgWf, gx); + + gy = unormalize(vImgHf, gy); + gy = get_coord(vImgHf, gy); + + gz = unormalize(vImgDf, gz); + gz = get_coord(vImgDf, gz); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + __m256 z_t = _mm256_floor_ps(gz); +#if __AVX2__ + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i z0 = _mm256_cvtps_epi32(z_t); + + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); + __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); + + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), 
_mm256_cmpgt_epi32(vImgDi, z1)); + + __m256i v011_in_range, v110_in_range, v101_in_range, v111_in_range; + { + v011_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + v101_in_range = _mm256_and_si256(x1_in_range, z1_in_range); + v110_in_range = _mm256_and_si256(y1_in_range, z1_in_range); + v111_in_range = _mm256_and_si256(v011_in_range, z1_in_range); + } + + __m256i i_tnw_offset = _mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki); + __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); + __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); + + __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); + __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + + _mm256_storeu_ps(in_bound_ptr_000, *(__m256*)_ps256_n1); + _mm256_storeu_ps(in_bound_ptr_001, _mm256_castsi256_ps(x1_in_range)); + _mm256_storeu_ps(in_bound_ptr_010, _mm256_castsi256_ps(y1_in_range)); + _mm256_storeu_ps(in_bound_ptr_011, _mm256_castsi256_ps(v011_in_range)); + + _mm256_storeu_ps(in_bound_ptr_100, _mm256_castsi256_ps(z1_in_range)); + _mm256_storeu_ps(in_bound_ptr_101, _mm256_castsi256_ps(v101_in_range)); + _mm256_storeu_ps(in_bound_ptr_110, _mm256_castsi256_ps(v110_in_range)); + _mm256_storeu_ps(in_bound_ptr_111, _mm256_castsi256_ps(v111_in_range)); +#else + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + __m256 z0_in_range = _mm256_and_ps(_mm256_cmp_ps(z_t, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z_t, _CMP_GT_OS)); + __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); + + __m256 v011_in_range, v110_in_range, v101_in_range, v111_in_range; + { + v011_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); + v110_in_range = _mm256_and_ps(y1_in_range, z1_in_range); + v111_in_range = _mm256_and_ps(v011_in_range, z1_in_range); + } + + __m256 tnw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t, + _mm256_comp_fmadd_ps(y_n, vImgWf, x_w)), + vElempackf); + __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); + __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 tse_offset = _mm256_add_ps(tsw_offset, vElempackf); + + __m256 bnw_offset = _mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), vElempackf, tnw_offset); + __m256 bne_offset = _mm256_add_ps(bnw_offset, vElempackf); + __m256 bsw_offset = 
_mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 bse_offset = _mm256_add_ps(bsw_offset, vElempackf); + + __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); + __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); + __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); + __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); + + __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); + __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); + __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); + __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); + + _mm256_storeu_ps(in_bound_ptr_000, *(__m256*)_ps256_n1); + _mm256_storeu_ps(in_bound_ptr_001, x1_in_range); + _mm256_storeu_ps(in_bound_ptr_010, y1_in_range); + _mm256_storeu_ps(in_bound_ptr_011, v011_in_range); + + _mm256_storeu_ps(in_bound_ptr_100, z1_in_range); + _mm256_storeu_ps(in_bound_ptr_101, v101_in_range); + _mm256_storeu_ps(in_bound_ptr_110, v110_in_range); + _mm256_storeu_ps(in_bound_ptr_111, v111_in_range); +#endif // __AVX2__ + _mm256_storeu_epi32(offset_ptr_000, i_tnw_offset); + _mm256_storeu_epi32(offset_ptr_001, i_tne_offset); + _mm256_storeu_epi32(offset_ptr_010, i_tsw_offset); + _mm256_storeu_epi32(offset_ptr_011, i_tse_offset); + + _mm256_storeu_epi32(offset_ptr_100, i_bnw_offset); + _mm256_storeu_epi32(offset_ptr_101, i_bne_offset); + _mm256_storeu_epi32(offset_ptr_110, i_bsw_offset); + _mm256_storeu_epi32(offset_ptr_111, i_bse_offset); + + __m256 alpha = _mm256_sub_ps(gx, x_w); + __m256 beta = _mm256_sub_ps(gy, y_n); + __m256 gamma = _mm256_sub_ps(gz, z_t); + + _mm256_storeu_ps(value_ptr_alpha, alpha); + _mm256_storeu_ps(value_ptr_beta, beta); + _mm256_storeu_ps(value_ptr_gamma, gamma); + + gridptr += 24; + + offset_ptr_000 += 8; + offset_ptr_001 += 8; + offset_ptr_010 += 8; + offset_ptr_011 += 8; + + offset_ptr_100 += 8; + offset_ptr_101 += 8; + offset_ptr_110 += 8; + offset_ptr_111 += 8; + + in_bound_ptr_000 += 8; + in_bound_ptr_001 += 8; + in_bound_ptr_010 += 8; + in_bound_ptr_011 += 8; + + in_bound_ptr_100 += 8; + in_bound_ptr_101 += 8; + in_bound_ptr_110 += 8; + in_bound_ptr_111 += 8; + + value_ptr_alpha += 8; + value_ptr_beta += 8; + value_ptr_gamma += 8; + } + nn = grid_size % 24; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 3) + { + float sample_x = *gridptr; + float sample_y = *(gridptr + 1); + float sample_z = *(gridptr + 2); + + sample_x = unormalize(src.w, sample_x); + sample_x = get_coord(src.w, sample_x); + + sample_y = unormalize(src.h, sample_y); + sample_y = get_coord(src.h, sample_y); + + sample_z = unormalize(src.d, sample_z); + sample_z = get_coord(src.d, sample_z); + + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int z0 = (int)floor(sample_z); + int x1 = x0 + 1; + int y1 = y0 + 1; + int z1 = z0 + 1; + + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool y1_in_range = (y1 > -1) & (y1 < src.h); + bool z1_in_range = (z1 > -1) & (z1 < src.d); + + + bool v11_in_range = x1_in_range & y1_in_range; + + *in_bound_ptr_000 = -1.0f; + *in_bound_ptr_001 = x1_in_range ? -1.0f : 0.0f; + *in_bound_ptr_010 = y1_in_range ? -1.0f : 0.0f; + *in_bound_ptr_011 = v11_in_range ? -1.0f : 0.0f; + + *in_bound_ptr_100 = z1_in_range ? -1.0f : 0.0f; + *in_bound_ptr_101 = (x1_in_range & z1_in_range) ? -1.0f : 0.0f; + *in_bound_ptr_110 = (y1_in_range & z1_in_range) ? -1.0f : 0.0f; + *in_bound_ptr_111 = (v11_in_range & z1_in_range) ? 
-1.0f : 0.0f; + + *offset_ptr_000 = (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack; + *offset_ptr_001 = (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack; + *offset_ptr_010 = (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack; + *offset_ptr_011 = (x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack; + + *offset_ptr_100 = (x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack; + *offset_ptr_101 = (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack; + *offset_ptr_110 = (x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack; + *offset_ptr_111 = (x1 + y1 * src.w + z1 * src.w * src.h) * src.elempack; + + *value_ptr_alpha = sample_x - x0; + *value_ptr_beta = sample_y - y0; + *value_ptr_gamma = sample_z - z0; + + gridptr += 3; + + offset_ptr_000++; + offset_ptr_001++; + offset_ptr_010++; + offset_ptr_011++; + + offset_ptr_100++; + offset_ptr_101++; + offset_ptr_110++; + offset_ptr_111++; + + in_bound_ptr_000++; + in_bound_ptr_001++; + in_bound_ptr_010++; + in_bound_ptr_011++; + + in_bound_ptr_100++; + in_bound_ptr_101++; + in_bound_ptr_110++; + in_bound_ptr_111++; + + value_ptr_alpha++; + value_ptr_beta++; + value_ptr_gamma++; + } + } + } + else + { + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); + const float* gridptr_z = grid.channel(2); + + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 7 < nn; x += 8) + { + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); + __m256 gz = _mm256_loadu_ps(gridptr_z); + + // compute coord + { + gx = unormalize(vImgWf, gx); + gx = get_coord(vImgWf, gx); + + gy = unormalize(vImgHf, gy); + gy = get_coord(vImgHf, gy); + + gz = unormalize(vImgDf, gz); + gz = get_coord(vImgDf, gz); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + __m256 z_t = _mm256_floor_ps(gz); +#if __AVX2__ + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i z0 = _mm256_cvtps_epi32(z_t); + + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); + __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); + + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z0)); + __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); + + __m256i v011_in_range, v110_in_range, v101_in_range, v111_in_range; + { + v011_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + v101_in_range = _mm256_and_si256(x1_in_range, z1_in_range); + v110_in_range = _mm256_and_si256(y1_in_range, z1_in_range); + v111_in_range = _mm256_and_si256(v011_in_range, z1_in_range); + } + + __m256i i_tnw_offset = _mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki); + __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); + __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, 
_mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); + + __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); + __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + + _mm256_storeu_ps(in_bound_ptr_000, *(__m256*)_ps256_n1); + _mm256_storeu_ps(in_bound_ptr_001, _mm256_castsi256_ps(x1_in_range)); + _mm256_storeu_ps(in_bound_ptr_010, _mm256_castsi256_ps(y1_in_range)); + _mm256_storeu_ps(in_bound_ptr_011, _mm256_castsi256_ps(v011_in_range)); + + _mm256_storeu_ps(in_bound_ptr_100, _mm256_castsi256_ps(z1_in_range)); + _mm256_storeu_ps(in_bound_ptr_101, _mm256_castsi256_ps(v101_in_range)); + _mm256_storeu_ps(in_bound_ptr_110, _mm256_castsi256_ps(v110_in_range)); + _mm256_storeu_ps(in_bound_ptr_111, _mm256_castsi256_ps(v111_in_range)); +#else + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + __m256 z0_in_range = _mm256_and_ps(_mm256_cmp_ps(z_t, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z_t, _CMP_GT_OS)); + __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); + + __m256 v011_in_range, v110_in_range, v101_in_range, v111_in_range; + { + v011_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); + v110_in_range = _mm256_and_ps(y1_in_range, z1_in_range); + v111_in_range = _mm256_and_ps(v011_in_range, z1_in_range); + } + + __m256 tnw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t, + _mm256_comp_fmadd_ps(y_n, vImgWf, x_w)), + vElempackf); + __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); + __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 tse_offset = _mm256_add_ps(tsw_offset, vElempackf); + + __m256 bnw_offset = _mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), vElempackf, tnw_offset); + __m256 bne_offset = _mm256_add_ps(bnw_offset, vElempackf); + __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 bse_offset = _mm256_add_ps(bsw_offset, vElempackf); + + __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); + __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); + __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); + __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); + + __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); + __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); + __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); + __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); + + _mm256_storeu_ps(in_bound_ptr_000, *(__m256*)_ps256_n1); + _mm256_storeu_ps(in_bound_ptr_001, x1_in_range); + 
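// Note: in this border/reflection variant the sample coordinate is assumed to
// have already been clamped/reflected into [0, size-1] by the get_coord step
// above, so the floored corner (x0, y0, z0) is always inside the volume and its
// mask is forced to -1.0f (all bits set); only the +1 corners need a range test.
// A scalar sketch of the same masks, mirroring the tail loop later in this
// function:
//   *in_bound_ptr_000 = -1.0f;                       // base corner, always valid here
//   *in_bound_ptr_001 = x1_in_range ? -1.0f : 0.0f;  // x+1 neighbour
//   *in_bound_ptr_011 = v11_in_range ? -1.0f : 0.0f; // x+1 and y+1 neighbour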
_mm256_storeu_ps(in_bound_ptr_010, y1_in_range); + _mm256_storeu_ps(in_bound_ptr_011, v011_in_range); + + _mm256_storeu_ps(in_bound_ptr_100, z1_in_range); + _mm256_storeu_ps(in_bound_ptr_101, v101_in_range); + _mm256_storeu_ps(in_bound_ptr_110, v110_in_range); + _mm256_storeu_ps(in_bound_ptr_111, v111_in_range); +#endif // __AVX2__ + _mm256_storeu_epi32(offset_ptr_000, i_tnw_offset); + _mm256_storeu_epi32(offset_ptr_001, i_tne_offset); + _mm256_storeu_epi32(offset_ptr_010, i_tsw_offset); + _mm256_storeu_epi32(offset_ptr_011, i_tse_offset); + + _mm256_storeu_epi32(offset_ptr_100, i_bnw_offset); + _mm256_storeu_epi32(offset_ptr_101, i_bne_offset); + _mm256_storeu_epi32(offset_ptr_110, i_bsw_offset); + _mm256_storeu_epi32(offset_ptr_111, i_bse_offset); + + __m256 alpha = _mm256_sub_ps(gx, x_w); + __m256 beta = _mm256_sub_ps(gy, y_n); + __m256 gamma = _mm256_sub_ps(gz, z_t); + + _mm256_storeu_ps(value_ptr_alpha, alpha); + _mm256_storeu_ps(value_ptr_beta, beta); + _mm256_storeu_ps(value_ptr_gamma, gamma); + + gridptr_x += 8; + gridptr_y += 8; + gridptr_z += 8; + + offset_ptr_000 += 8; + offset_ptr_001 += 8; + offset_ptr_010 += 8; + offset_ptr_011 += 8; + + offset_ptr_100 += 8; + offset_ptr_101 += 8; + offset_ptr_110 += 8; + offset_ptr_111 += 8; + + in_bound_ptr_000 += 8; + in_bound_ptr_001 += 8; + in_bound_ptr_010 += 8; + in_bound_ptr_011 += 8; + + in_bound_ptr_100 += 8; + in_bound_ptr_101 += 8; + in_bound_ptr_110 += 8; + in_bound_ptr_111 += 8; + + value_ptr_alpha += 8; + value_ptr_beta += 8; + value_ptr_gamma += 8; + } + nn = grid_size & 7; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x ++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; + float sample_z = *gridptr_z; + + sample_x = unormalize(src.w, sample_x); + sample_x = get_coord(src.w, sample_x); + + sample_y = unormalize(src.h, sample_y); + sample_y = get_coord(src.h, sample_y); + + sample_z = unormalize(src.d, sample_z); + sample_z = get_coord(src.d, sample_z); + + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int z0 = (int)floor(sample_z); + int x1 = x0 + 1; + int y1 = y0 + 1; + int z1 = z0 + 1; + + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool y1_in_range = (y1 > -1) & (y1 < src.h); + bool z1_in_range = (z1 > -1) & (z1 < src.d); + + bool v11_in_range = x1_in_range & y1_in_range; + + *in_bound_ptr_000 = -1.0f; + *in_bound_ptr_001 = x1_in_range ? -1.0f : 0.0f; + *in_bound_ptr_010 = y1_in_range ? -1.0f : 0.0f; + *in_bound_ptr_011 = v11_in_range ? -1.0f : 0.0f; + + *in_bound_ptr_100 = z1_in_range ? -1.0f : 0.0f; + *in_bound_ptr_101 = (x1_in_range & z1_in_range) ? -1.0f : 0.0f; + *in_bound_ptr_110 = (y1_in_range & z1_in_range) ? -1.0f : 0.0f; + *in_bound_ptr_111 = (v11_in_range & z1_in_range) ? 
-1.0f : 0.0f; + + *offset_ptr_000 = (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack; + *offset_ptr_001 = (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack; + *offset_ptr_010 = (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack; + *offset_ptr_011 = (x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack; + + *offset_ptr_100 = (x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack; + *offset_ptr_101 = (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack; + *offset_ptr_110 = (x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack; + *offset_ptr_111 = (x1 + y1 * src.w + z1 * src.w * src.h) * src.elempack; + + *value_ptr_alpha = sample_x - x0; + *value_ptr_beta = sample_y - y0; + *value_ptr_gamma = sample_z - z0; + + gridptr_x++; + gridptr_y++; + gridptr_z++; + + offset_ptr_000++; + offset_ptr_001++; + offset_ptr_010++; + offset_ptr_011++; + + offset_ptr_100++; + offset_ptr_101++; + offset_ptr_110++; + offset_ptr_111++; + + in_bound_ptr_000++; + in_bound_ptr_001++; + in_bound_ptr_010++; + in_bound_ptr_011++; + + in_bound_ptr_100++; + in_bound_ptr_101++; + in_bound_ptr_110++; + in_bound_ptr_111++; + + value_ptr_alpha++; + value_ptr_beta++; + value_ptr_gamma++; + } + } + } +}; + +template +struct gridsample_3d_bilinear_compute_blob +{ + void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) + { + const int grid_size = grid.w * grid.h * grid.d; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); + const __m256 vElempackf = _mm256_set1_ps(src.elempack); +#if __AVX2__ + const __m256i vImgWi = _mm256_set1_epi32(src.w); + const __m256i vImgHi = _mm256_set1_epi32(src.h); + const __m256i vImgDi = _mm256_set1_epi32(src.d); + const __m256i vElempacki = _mm256_set1_epi32(src.elempack); +#endif // __AVX2__ +#endif // __AVX__ + + int* offset_ptr_000 = offset.channel(0); + int* offset_ptr_001 = offset.channel(1); + int* offset_ptr_010 = offset.channel(2); + int* offset_ptr_011 = offset.channel(3); + + int* offset_ptr_100 = offset.channel(4); + int* offset_ptr_101 = offset.channel(5); + int* offset_ptr_110 = offset.channel(6); + int* offset_ptr_111 = offset.channel(7); + + float* in_bound_ptr_000 = in_bound.channel(0); + float* in_bound_ptr_001 = in_bound.channel(1); + float* in_bound_ptr_010 = in_bound.channel(2); + float* in_bound_ptr_011 = in_bound.channel(3); + float* in_bound_ptr_100 = in_bound.channel(4); + float* in_bound_ptr_101 = in_bound.channel(5); + float* in_bound_ptr_110 = in_bound.channel(6); + float* in_bound_ptr_111 = in_bound.channel(7); + + float* value_ptr_alpha = value.channel(0); + float* value_ptr_beta = value.channel(1); + float* value_ptr_gamma = value.channel(2); + + grid_sample_unormalize unormalize; + + if (permute_fusion == 0) + { + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 23 < nn; x += 24) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr); + __m256 tmp_y = _mm256_loadu_ps(gridptr + 8); + __m256 gz = _mm256_loadu_ps(gridptr + 16); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); + __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); + gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); + + tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); + tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); + + gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); + gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); + gz 
= _mm256_shuffle_ps(tmp_x, gz, 0b11001101); + + // compute coord + { + gx = unormalize(vImgWf, gx); + gy = unormalize(vImgHf, gy); + gz = unormalize(vImgDf, gz); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + __m256 z_t = _mm256_floor_ps(gz); +#if __AVX2__ + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i z0 = _mm256_cvtps_epi32(z_t); + + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); + __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); + + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z0)); + __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); + + __m256i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + __m256i v01_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + __m256i v10_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + v000_in_range = _mm256_and_si256(v00_in_range, z0_in_range); + v001_in_range = _mm256_and_si256(v01_in_range, z0_in_range); + v010_in_range = _mm256_and_si256(v10_in_range, z0_in_range); + v011_in_range = _mm256_and_si256(v11_in_range, z0_in_range); + + v100_in_range = _mm256_and_si256(v00_in_range, z1_in_range); + v101_in_range = _mm256_and_si256(v01_in_range, z1_in_range); + v110_in_range = _mm256_and_si256(v10_in_range, z1_in_range); + v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); + } + + __m256i i_tnw_offset = _mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki); + __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); + __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); + + __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); + __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + + _mm256_storeu_ps(in_bound_ptr_000, _mm256_castsi256_ps(v000_in_range)); + _mm256_storeu_ps(in_bound_ptr_001, _mm256_castsi256_ps(v001_in_range)); + _mm256_storeu_ps(in_bound_ptr_010, _mm256_castsi256_ps(v010_in_range)); + _mm256_storeu_ps(in_bound_ptr_011, _mm256_castsi256_ps(v011_in_range)); + + _mm256_storeu_ps(in_bound_ptr_100, _mm256_castsi256_ps(v100_in_range)); + _mm256_storeu_ps(in_bound_ptr_101, _mm256_castsi256_ps(v101_in_range)); + _mm256_storeu_ps(in_bound_ptr_110, 
_mm256_castsi256_ps(v110_in_range)); + _mm256_storeu_ps(in_bound_ptr_111, _mm256_castsi256_ps(v111_in_range)); +#else + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + __m256 z0_in_range = _mm256_and_ps(_mm256_cmp_ps(z_t, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z_t, _CMP_GT_OS)); + __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); + + __m256 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v10_in_range = _mm256_and_ps(x0_in_range, y1_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + v000_in_range = _mm256_and_ps(v00_in_range, z0_in_range); + v001_in_range = _mm256_and_ps(v01_in_range, z0_in_range); + v010_in_range = _mm256_and_ps(v10_in_range, z0_in_range); + v011_in_range = _mm256_and_ps(v11_in_range, z0_in_range); + + v100_in_range = _mm256_and_ps(v00_in_range, z1_in_range); + v101_in_range = _mm256_and_ps(v01_in_range, z1_in_range); + v110_in_range = _mm256_and_ps(v10_in_range, z1_in_range); + v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); + } + + __m256 tnw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t, + _mm256_comp_fmadd_ps(y_n, vImgWf, x_w)), + vElempackf); + __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); + __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 tse_offset = _mm256_add_ps(tsw_offset, vElempackf); + + __m256 bnw_offset = _mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), vElempackf, tnw_offset); + __m256 bne_offset = _mm256_add_ps(bnw_offset, vElempackf); + __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 bse_offset = _mm256_add_ps(bsw_offset, vElempackf); + + __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); + __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); + __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); + __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); + + __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); + __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); + __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); + __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); + + _mm256_storeu_ps(in_bound_ptr_000, v000_in_range); + _mm256_storeu_ps(in_bound_ptr_001, v001_in_range); + _mm256_storeu_ps(in_bound_ptr_010, v010_in_range); + _mm256_storeu_ps(in_bound_ptr_011, v011_in_range); + + _mm256_storeu_ps(in_bound_ptr_100, v100_in_range); + _mm256_storeu_ps(in_bound_ptr_101, v101_in_range); + _mm256_storeu_ps(in_bound_ptr_110, v110_in_range); + _mm256_storeu_ps(in_bound_ptr_111, v111_in_range); +#endif // __AVX2__ + 
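// Naming note: t/b = top/bottom along depth (z0/z1), n/s = north/south along
// height (y0/y1), w/e = west/east along width (x0/x1), so tnw is the
// (x0, y0, z0) corner and bse is (x1, y1, z1). Each offset stored below is the
// elempack-scaled flat index of one corner, i.e. the vector form of the scalar
// expression used in the tail loop of this function:
//   offset = (x + y * src.w + z * src.w * src.h) * src.elempack;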
_mm256_storeu_epi32(offset_ptr_000, i_tnw_offset); + _mm256_storeu_epi32(offset_ptr_001, i_tne_offset); + _mm256_storeu_epi32(offset_ptr_010, i_tsw_offset); + _mm256_storeu_epi32(offset_ptr_011, i_tse_offset); + + _mm256_storeu_epi32(offset_ptr_100, i_bnw_offset); + _mm256_storeu_epi32(offset_ptr_101, i_bne_offset); + _mm256_storeu_epi32(offset_ptr_110, i_bsw_offset); + _mm256_storeu_epi32(offset_ptr_111, i_bse_offset); + + __m256 alpha = _mm256_sub_ps(gx, x_w); + __m256 beta = _mm256_sub_ps(gy, y_n); + __m256 gamma = _mm256_sub_ps(gz, z_t); + + _mm256_storeu_ps(value_ptr_alpha, alpha); + _mm256_storeu_ps(value_ptr_beta, beta); + _mm256_storeu_ps(value_ptr_gamma, gamma); + + gridptr += 24; + + offset_ptr_000 += 8; + offset_ptr_001 += 8; + offset_ptr_010 += 8; + offset_ptr_011 += 8; + + offset_ptr_100 += 8; + offset_ptr_101 += 8; + offset_ptr_110 += 8; + offset_ptr_111 += 8; + + in_bound_ptr_000 += 8; + in_bound_ptr_001 += 8; + in_bound_ptr_010 += 8; + in_bound_ptr_011 += 8; + + in_bound_ptr_100 += 8; + in_bound_ptr_101 += 8; + in_bound_ptr_110 += 8; + in_bound_ptr_111 += 8; + + value_ptr_alpha += 8; + value_ptr_beta += 8; + value_ptr_gamma += 8; + } + nn = grid_size % 24; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 3) + { + float sample_x = *gridptr; + float sample_y = *(gridptr + 1); + float sample_z = *(gridptr + 2); + + sample_x = unormalize(src.w, sample_x); + sample_y = unormalize(src.h, sample_y); + sample_z = unormalize(src.d, sample_z); + + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int z0 = (int)floor(sample_z); + int x1 = x0 + 1; + int y1 = y0 + 1; + int z1 = z0 + 1; + + bool x0_in_range = (x0 > -1) & (x0 < src.w); + bool y0_in_range = (y0 > -1) & (y0 < src.h); + bool z0_in_range = (z0 > -1) & (z0 < src.d); + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool y1_in_range = (y1 > -1) & (y1 < src.h); + bool z1_in_range = (z1 > -1) & (z1 < src.d); + + bool v00_in_range = x0_in_range & y0_in_range; + bool v01_in_range = x1_in_range & y0_in_range; + bool v10_in_range = x0_in_range & y1_in_range; + bool v11_in_range = x1_in_range & y1_in_range; + + *in_bound_ptr_000 = (v00_in_range & z0_in_range) ? -1.0f : 0.0f; + *in_bound_ptr_001 = (v01_in_range & z0_in_range) ? -1.0f : 0.0f; + *in_bound_ptr_010 = (v10_in_range & z0_in_range) ? -1.0f : 0.0f; + *in_bound_ptr_011 = (v11_in_range & z0_in_range) ? -1.0f : 0.0f; + + *in_bound_ptr_100 = (v00_in_range & z1_in_range) ? -1.0f : 0.0f; + *in_bound_ptr_101 = (v01_in_range & z1_in_range) ? -1.0f : 0.0f; + *in_bound_ptr_110 = (v10_in_range & z1_in_range) ? -1.0f : 0.0f; + *in_bound_ptr_111 = (v11_in_range & z1_in_range) ? 
-1.0f : 0.0f; + + *offset_ptr_000 = (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack; + *offset_ptr_001 = (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack; + *offset_ptr_010 = (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack; + *offset_ptr_011 = (x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack; + + *offset_ptr_100 = (x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack; + *offset_ptr_101 = (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack; + *offset_ptr_110 = (x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack; + *offset_ptr_111 = (x1 + y1 * src.w + z1 * src.w * src.h) * src.elempack; + + *value_ptr_alpha = sample_x - x0; + *value_ptr_beta = sample_y - y0; + *value_ptr_gamma = sample_z - z0; + + gridptr += 3; + + offset_ptr_000++; + offset_ptr_001++; + offset_ptr_010++; + offset_ptr_011++; + + offset_ptr_100++; + offset_ptr_101++; + offset_ptr_110++; + offset_ptr_111++; + + in_bound_ptr_000++; + in_bound_ptr_001++; + in_bound_ptr_010++; + in_bound_ptr_011++; + + in_bound_ptr_100++; + in_bound_ptr_101++; + in_bound_ptr_110++; + in_bound_ptr_111++; + + value_ptr_alpha++; + value_ptr_beta++; + value_ptr_gamma++; + } + } + } + else + { + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); + const float* gridptr_z = grid.channel(2); + + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 7 < nn; x += 8) + { + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); + __m256 gz = _mm256_loadu_ps(gridptr_z); + + // compute coord + { + gx = unormalize(vImgWf, gx); + gy = unormalize(vImgHf, gy); + gz = unormalize(vImgDf, gz); + } + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + __m256 z_t = _mm256_floor_ps(gz); +#if __AVX2__ + __m256i x0 = _mm256_cvtps_epi32(x_w); + __m256i y0 = _mm256_cvtps_epi32(y_n); + __m256i z0 = _mm256_cvtps_epi32(z_t); + + __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); + __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); + __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); + + __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); + __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); + __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); + __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); + __m256i z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z0)); + __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); + + __m256i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); + __m256i v01_in_range = _mm256_and_si256(x1_in_range, y0_in_range); + __m256i v10_in_range = _mm256_and_si256(x0_in_range, y1_in_range); + __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); + + v000_in_range = _mm256_and_si256(v00_in_range, z0_in_range); + v001_in_range = _mm256_and_si256(v01_in_range, z0_in_range); + v010_in_range = _mm256_and_si256(v10_in_range, z0_in_range); + v011_in_range = _mm256_and_si256(v11_in_range, z0_in_range); + + v100_in_range = _mm256_and_si256(v00_in_range, z1_in_range); + v101_in_range = 
_mm256_and_si256(v01_in_range, z1_in_range); + v110_in_range = _mm256_and_si256(v10_in_range, z1_in_range); + v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); + } + + __m256i i_tnw_offset = _mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki); + __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); + __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); + + __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); + __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); + __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); + __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); + + _mm256_storeu_ps(in_bound_ptr_000, _mm256_castsi256_ps(v000_in_range)); + _mm256_storeu_ps(in_bound_ptr_001, _mm256_castsi256_ps(v001_in_range)); + _mm256_storeu_ps(in_bound_ptr_010, _mm256_castsi256_ps(v010_in_range)); + _mm256_storeu_ps(in_bound_ptr_011, _mm256_castsi256_ps(v011_in_range)); + + _mm256_storeu_ps(in_bound_ptr_100, _mm256_castsi256_ps(v100_in_range)); + _mm256_storeu_ps(in_bound_ptr_101, _mm256_castsi256_ps(v101_in_range)); + _mm256_storeu_ps(in_bound_ptr_110, _mm256_castsi256_ps(v110_in_range)); + _mm256_storeu_ps(in_bound_ptr_111, _mm256_castsi256_ps(v111_in_range)); +#else + __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); + __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); + __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x_w, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, x1, _CMP_GT_OS)); + __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y_n, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, y1, _CMP_GT_OS)); + __m256 z0_in_range = _mm256_and_ps(_mm256_cmp_ps(z_t, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z_t, _CMP_GT_OS)); + __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, z1, _CMP_GT_OS)); + + __m256 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v10_in_range = _mm256_and_ps(x0_in_range, y1_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + v000_in_range = _mm256_and_ps(v00_in_range, z0_in_range); + v001_in_range = _mm256_and_ps(v01_in_range, z0_in_range); + v010_in_range = _mm256_and_ps(v10_in_range, z0_in_range); + v011_in_range = _mm256_and_ps(v11_in_range, z0_in_range); + + v100_in_range = _mm256_and_ps(v00_in_range, z1_in_range); + v101_in_range = _mm256_and_ps(v01_in_range, z1_in_range); + v110_in_range = _mm256_and_ps(v10_in_range, z1_in_range); + v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); + } + + __m256 tnw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t, + _mm256_comp_fmadd_ps(y_n, vImgWf, x_w)), + vElempackf); + __m256 
tne_offset = _mm256_add_ps(tnw_offset, vElempackf); + __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 tse_offset = _mm256_add_ps(tsw_offset, vElempackf); + + __m256 bnw_offset = _mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), vElempackf, tnw_offset); + __m256 bne_offset = _mm256_add_ps(bnw_offset, vElempackf); + __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); + __m256 bse_offset = _mm256_add_ps(bsw_offset, vElempackf); + + __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); + __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); + __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); + __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); + + __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); + __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); + __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); + __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); + + _mm256_storeu_ps(in_bound_ptr_000, v000_in_range); + _mm256_storeu_ps(in_bound_ptr_001, v001_in_range); + _mm256_storeu_ps(in_bound_ptr_010, v010_in_range); + _mm256_storeu_ps(in_bound_ptr_011, v011_in_range); + + _mm256_storeu_ps(in_bound_ptr_100, v100_in_range); + _mm256_storeu_ps(in_bound_ptr_101, v101_in_range); + _mm256_storeu_ps(in_bound_ptr_110, v110_in_range); + _mm256_storeu_ps(in_bound_ptr_111, v111_in_range); +#endif // __AVX2__ + _mm256_storeu_epi32(offset_ptr_000, i_tnw_offset); + _mm256_storeu_epi32(offset_ptr_001, i_tne_offset); + _mm256_storeu_epi32(offset_ptr_010, i_tsw_offset); + _mm256_storeu_epi32(offset_ptr_011, i_tse_offset); + + _mm256_storeu_epi32(offset_ptr_100, i_bnw_offset); + _mm256_storeu_epi32(offset_ptr_101, i_bne_offset); + _mm256_storeu_epi32(offset_ptr_110, i_bsw_offset); + _mm256_storeu_epi32(offset_ptr_111, i_bse_offset); + + __m256 alpha = _mm256_sub_ps(gx, x_w); + __m256 beta = _mm256_sub_ps(gy, y_n); + __m256 gamma = _mm256_sub_ps(gz, z_t); + + _mm256_storeu_ps(value_ptr_alpha, alpha); + _mm256_storeu_ps(value_ptr_beta, beta); + _mm256_storeu_ps(value_ptr_gamma, gamma); + + gridptr_x += 8; + gridptr_y += 8; + gridptr_z += 8; + + offset_ptr_000 += 8; + offset_ptr_001 += 8; + offset_ptr_010 += 8; + offset_ptr_011 += 8; + + offset_ptr_100 += 8; + offset_ptr_101 += 8; + offset_ptr_110 += 8; + offset_ptr_111 += 8; + + in_bound_ptr_000 += 8; + in_bound_ptr_001 += 8; + in_bound_ptr_010 += 8; + in_bound_ptr_011 += 8; + + in_bound_ptr_100 += 8; + in_bound_ptr_101 += 8; + in_bound_ptr_110 += 8; + in_bound_ptr_111 += 8; + + value_ptr_alpha += 8; + value_ptr_beta += 8; + value_ptr_gamma += 8; + } + nn = grid_size & 7; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x ++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; + float sample_z = *gridptr_z; + + sample_x = unormalize(src.w, sample_x); + sample_y = unormalize(src.h, sample_y); + sample_z = unormalize(src.d, sample_z); + + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int z0 = (int)floor(sample_z); + int x1 = x0 + 1; + int y1 = y0 + 1; + int z1 = z0 + 1; + + bool x0_in_range = (x0 > -1) & (x0 < src.w); + bool y0_in_range = (y0 > -1) & (y0 < src.h); + bool z0_in_range = (z0 > -1) & (z0 < src.d); + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool y1_in_range = (y1 > -1) & (y1 < src.h); + bool z1_in_range = (z1 > -1) & (z1 < src.d); + + bool v00_in_range = x0_in_range & y0_in_range; + bool v01_in_range = x1_in_range & y0_in_range; + bool v10_in_range = x0_in_range & y1_in_range; + bool v11_in_range = 
x1_in_range & y1_in_range; + + *in_bound_ptr_000 = (v00_in_range & z0_in_range) ? -1.0f : 0.0f; + *in_bound_ptr_001 = (v01_in_range & z0_in_range) ? -1.0f : 0.0f; + *in_bound_ptr_010 = (v10_in_range & z0_in_range) ? -1.0f : 0.0f; + *in_bound_ptr_011 = (v11_in_range & z0_in_range) ? -1.0f : 0.0f; + + *in_bound_ptr_100 = (v00_in_range & z1_in_range) ? -1.0f : 0.0f; + *in_bound_ptr_101 = (v01_in_range & z1_in_range) ? -1.0f : 0.0f; + *in_bound_ptr_110 = (v10_in_range & z1_in_range) ? -1.0f : 0.0f; + *in_bound_ptr_111 = (v11_in_range & z1_in_range) ? -1.0f : 0.0f; + + *offset_ptr_000 = (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack; + *offset_ptr_001 = (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack; + *offset_ptr_010 = (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack; + *offset_ptr_011 = (x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack; + + *offset_ptr_100 = (x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack; + *offset_ptr_101 = (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack; + *offset_ptr_110 = (x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack; + *offset_ptr_111 = (x1 + y1 * src.w + z1 * src.w * src.h) * src.elempack; + + *value_ptr_alpha = sample_x - x0; + *value_ptr_beta = sample_y - y0; + *value_ptr_gamma = sample_z - z0; + + gridptr_x++; + gridptr_y++; + gridptr_z++; + + offset_ptr_000++; + offset_ptr_001++; + offset_ptr_010++; + offset_ptr_011++; + + offset_ptr_100++; + offset_ptr_101++; + offset_ptr_110++; + offset_ptr_111++; + + in_bound_ptr_000++; + in_bound_ptr_001++; + in_bound_ptr_010++; + in_bound_ptr_011++; + + in_bound_ptr_100++; + in_bound_ptr_101++; + in_bound_ptr_110++; + in_bound_ptr_111++; + + value_ptr_alpha++; + value_ptr_beta++; + value_ptr_gamma++; + } + } + } +}; + #if __AVX__ static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Mat& value, const Option& opt) { @@ -781,4 +1936,112 @@ static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d } } } +static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Mat& value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const int* offset_ptr_000 = offset.channel(0); + const int* offset_ptr_001 = offset.channel(1); + const int* offset_ptr_010 = offset.channel(2); + const int* offset_ptr_011 = offset.channel(3); + const int* offset_ptr_100 = offset.channel(4); + const int* offset_ptr_101 = offset.channel(5); + const int* offset_ptr_110 = offset.channel(6); + const int* offset_ptr_111 = offset.channel(7); + + const float* in_bound_ptr_000 = in_bound.channel(0); + const float* in_bound_ptr_001 = in_bound.channel(1); + const float* in_bound_ptr_010 = in_bound.channel(2); + const float* in_bound_ptr_011 = in_bound.channel(3); + const float* in_bound_ptr_100 = in_bound.channel(4); + const float* in_bound_ptr_101 = in_bound.channel(5); + const float* in_bound_ptr_110 = in_bound.channel(6); + const float* in_bound_ptr_111 = in_bound.channel(7); + + const float* value_ptr_alpha = value.channel(0); + const float* value_ptr_beta = value.channel(1); + const float* value_ptr_gamma = value.channel(2); + + for (int i = 0; i < grid_size; 
i++) + { + __m256i v000_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_000), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v001_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_001), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v010_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_010), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v011_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_011), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v100_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_100), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v101_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_101), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v110_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_110), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v111_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_111), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + + __m256 v000_in_range = _mm256_set1_ps(*in_bound_ptr_000); + __m256 v001_in_range = _mm256_set1_ps(*in_bound_ptr_001); + __m256 v010_in_range = _mm256_set1_ps(*in_bound_ptr_010); + __m256 v011_in_range = _mm256_set1_ps(*in_bound_ptr_011); + __m256 v100_in_range = _mm256_set1_ps(*in_bound_ptr_100); + __m256 v101_in_range = _mm256_set1_ps(*in_bound_ptr_101); + __m256 v110_in_range = _mm256_set1_ps(*in_bound_ptr_110); + __m256 v111_in_range = _mm256_set1_ps(*in_bound_ptr_111); + + __m256 v000_val = mask_gather_ps256(srcptr, v000_offset, v000_in_range); + __m256 v001_val = mask_gather_ps256(srcptr, v001_offset, v001_in_range); + __m256 v010_val = mask_gather_ps256(srcptr, v010_offset, v010_in_range); + __m256 v011_val = mask_gather_ps256(srcptr, v011_offset, v011_in_range); + __m256 v100_val = mask_gather_ps256(srcptr, v100_offset, v100_in_range); + __m256 v101_val = mask_gather_ps256(srcptr, v101_offset, v101_in_range); + __m256 v110_val = mask_gather_ps256(srcptr, v110_offset, v110_in_range); + __m256 v111_val = mask_gather_ps256(srcptr, v111_offset, v111_in_range); + + __m256 alpha = _mm256_set1_ps(*value_ptr_alpha); + __m256 beta = _mm256_set1_ps(*value_ptr_beta); + __m256 gamma = _mm256_set1_ps(*value_ptr_gamma); + + __m256 v00 = _mm256_comp_fmadd_ps(v001_val, alpha, _mm256_comp_fnmadd_ps(v000_val, alpha, v000_val)); + __m256 v01 = _mm256_comp_fmadd_ps(v011_val, alpha, _mm256_comp_fnmadd_ps(v010_val, alpha, v010_val)); + __m256 v10 = _mm256_comp_fmadd_ps(v101_val, alpha, _mm256_comp_fnmadd_ps(v100_val, alpha, v100_val)); + __m256 v11 = _mm256_comp_fmadd_ps(v111_val, alpha, _mm256_comp_fnmadd_ps(v110_val, alpha, v110_val)); + + __m256 v0 = _mm256_comp_fmadd_ps(v01, beta, _mm256_comp_fnmadd_ps(v00, beta, v00)); + __m256 v1 = _mm256_comp_fmadd_ps(v11, beta, _mm256_comp_fnmadd_ps(v10, beta, v10)); + + __m256 _v = _mm256_comp_fmadd_ps(v1, gamma, _mm256_comp_fnmadd_ps(v0, gamma, v0)); + _mm256_storeu_ps(dstptr, _v); + + offset_ptr_000++; + offset_ptr_001++; + offset_ptr_010++; + offset_ptr_011++; + + offset_ptr_100++; + offset_ptr_101++; + offset_ptr_110++; + offset_ptr_111++; + + in_bound_ptr_000++; + in_bound_ptr_001++; + in_bound_ptr_010++; + in_bound_ptr_011++; + + in_bound_ptr_100++; + in_bound_ptr_101++; + in_bound_ptr_110++; + in_bound_ptr_111++; + + value_ptr_alpha++; + value_ptr_beta++; + value_ptr_gamma++; + + dstptr += 8; + } + } +} #endif // __AVX__ \ No newline at end of file diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index 861950b8ff9..f90fbc9c2b7 100644 --- 
a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -50,11 +50,9 @@ struct gridsample_2d_nearest_compute_blob // compute coord { - // x gx = unormalize(vImgWf, gx); gx = get_coord(vImgWf, gx); - - // y + gy = unormalize(vImgHf, gy); gy = get_coord(vImgHf, gy); } @@ -62,7 +60,7 @@ struct gridsample_2d_nearest_compute_blob gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - __m256 offset = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf); + __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(gy, vImgWf, gx), vElempackf); __m256i i_offset = _mm256_cvtps_epi32(offset); _mm256_storeu_epi32(offset_ptr, i_offset); @@ -80,11 +78,11 @@ struct gridsample_2d_nearest_compute_blob float sample_x = *gridptr; float sample_y = *(gridptr + 1); - // x + sample_x = unormalize(src.w, sample_x); sample_x = get_coord(src.w, sample_x); - // y + sample_y = unormalize(src.h, sample_y); sample_y = get_coord(src.h, sample_y); @@ -113,11 +111,9 @@ struct gridsample_2d_nearest_compute_blob // compute coord { - // x gx = unormalize(vImgWf, gx); gx = get_coord(vImgWf, gx); - - // y + gy = unormalize(vImgHf, gy); gy = get_coord(vImgHf, gy); } @@ -125,7 +121,7 @@ struct gridsample_2d_nearest_compute_blob gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - __m256 offset = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf); + __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(gy, vImgWf, gx), vElempackf); __m256i i_offset = _mm256_cvtps_epi32(offset); _mm256_storeu_epi32(offset_ptr, i_offset); @@ -143,12 +139,10 @@ struct gridsample_2d_nearest_compute_blob { float sample_x = *gridptr_x; float sample_y = *gridptr_y; - - // x + sample_x = unormalize(src.w, sample_x); sample_x = get_coord(src.w, sample_x); - - // y + sample_y = unormalize(src.h, sample_y); sample_y = get_coord(src.h, sample_y); @@ -205,9 +199,7 @@ struct gridsample_2d_nearest_compute_blob // compute coord { - // x gx = unormalize(vImgWf, gx); - // y gy = unormalize(vImgHf, gy); } @@ -217,7 +209,7 @@ struct gridsample_2d_nearest_compute_blob __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); - __m256 offset = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf); + __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(gy, vImgWf, gx), vElempackf); __m256i i_offset = _mm256_cvtps_epi32(offset); _mm256_storeu_ps(in_bound_ptr, v_in_range); @@ -236,9 +228,9 @@ struct gridsample_2d_nearest_compute_blob float sample_x = *gridptr; float sample_y = *(gridptr + 1); - // x + sample_x = unormalize(src.w, sample_x); - // y + sample_y = unormalize(src.h, sample_y); int x0 = static_cast(floor(sample_x + 0.5f)); @@ -267,9 +259,7 @@ struct gridsample_2d_nearest_compute_blob // compute coord { - // x gx = unormalize(vImgWf, gx); - // y gy = unormalize(vImgHf, gy); } @@ -279,7 +269,7 @@ struct gridsample_2d_nearest_compute_blob __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); - __m256 offset = 
_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, vImgWf), gx), vElempackf); + __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(gy, vImgWf, gx), vElempackf); __m256i i_offset = _mm256_cvtps_epi32(offset); _mm256_storeu_ps(in_bound_ptr, v_in_range); @@ -299,9 +289,7 @@ struct gridsample_2d_nearest_compute_blob float sample_x = *gridptr_x; float sample_y = *gridptr_y; - // x sample_x = unormalize(src.w, sample_x); - // y sample_y = unormalize(src.h, sample_y); int x0 = static_cast(floor(sample_x + 0.5f)); @@ -322,13 +310,364 @@ struct gridsample_2d_nearest_compute_blob } }; +template +struct gridsample_3d_nearest_compute_blob +{ + void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) + { + const int grid_size = grid.w * grid.h * grid.d; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); + const __m256 vElempackf = _mm256_set1_ps(src.elempack); +#endif // __AVX__ + + int* offset_ptr = offset.channel(0); + + grid_sample_unormalize unormalize; + compute_coord get_coord; + + if (permute_fusion == 0) + { + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 23 < nn; x += 24) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr); + __m256 tmp_y = _mm256_loadu_ps(gridptr + 8); + __m256 gz = _mm256_loadu_ps(gridptr + 16); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); + __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); + gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); + + tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); + tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); + + gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); + gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); + gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); + + // compute coord + { + gx = unormalize(vImgWf, gx); + gx = get_coord(vImgWf, gx); + + gy = unormalize(vImgHf, gy); + gy = get_coord(vImgHf, gy); + + gz = unormalize(vImgDf, gz); + gz = get_coord(vImgDf, gz); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); + + __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), gz, + _mm256_comp_fmadd_ps(gy, vImgWf, gx)),vElempackf); + __m256i i_offset = _mm256_cvtps_epi32(offset); + + _mm256_storeu_epi32(offset_ptr, i_offset); + + gridptr += 24; + + offset_ptr += 8; + } + + nn = grid_size % 24; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x += 3) + { + float sample_x = *gridptr; + float sample_y = *(gridptr + 1); + float sample_z = *(gridptr + 2); + + sample_x = unormalize(src.w, sample_x); + sample_x = get_coord(src.w, sample_x); + + sample_y = unormalize(src.h, sample_y); + sample_y = get_coord(src.h, sample_y); + + sample_z = unormalize(src.d, sample_z); + sample_z = get_coord(src.d, sample_z); + + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); + int z0 = static_cast(floor(sample_z + 0.5f)); + + *offset_ptr = (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack; + + gridptr += 3; + + offset_ptr++; + } + } + } + else + { + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); + const float* gridptr_z = grid.channel(2); + + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 7 < nn; x += 8) 
+ { + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); + __m256 gz = _mm256_loadu_ps(gridptr_z); + + // compute coord + { + gx = unormalize(vImgWf, gx); + gx = get_coord(vImgWf, gx); + + + gy = unormalize(vImgHf, gy); + gy = get_coord(vImgHf, gy); + + gz = unormalize(vImgDf, gz); + gz = get_coord(vImgDf, gz); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); + + __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), gz, + _mm256_comp_fmadd_ps(gy, vImgWf, gx)),vElempackf); + __m256i i_offset = _mm256_cvtps_epi32(offset); + + _mm256_storeu_epi32(offset_ptr, i_offset); + + gridptr_x += 8; + gridptr_y += 8; + gridptr_z += 8; + + offset_ptr += 8; + } + + nn = grid_size & 7; +#endif // __AVX__ + + for (int x = grid_size - nn; x < grid_size; x++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; + float sample_z = *gridptr_z; + + sample_x = unormalize(src.w, sample_x); + sample_x = get_coord(src.w, sample_x); + + sample_y = unormalize(src.h, sample_y); + sample_y = get_coord(src.h, sample_y); + + sample_z = unormalize(src.d, sample_z); + sample_z = get_coord(src.d, sample_z); + + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); + int z0 = static_cast(floor(sample_z + 0.5f)); + + *offset_ptr = (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack; + + gridptr_x++; + gridptr_y++; + gridptr_z++; + + offset_ptr++; + } + } + } +}; + +template +struct gridsample_3d_nearest_compute_blob +{ + void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) + { + const int grid_size = grid.w * grid.h * grid.d; +#if __AVX__ + const __m256 vImgWf = _mm256_set1_ps(src.w); + const __m256 vImgHf = _mm256_set1_ps(src.h); + const __m256 vImgDf = _mm256_set1_ps(src.d); + const __m256 vElempackf = _mm256_set1_ps(src.elempack); +#endif // __AVX__ + + int* offset_ptr = offset.channel(0); + + float* in_bound_ptr = in_bound.channel(0); + + grid_sample_unormalize unormalize; + + if (permute_fusion == 0) + { + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int nn = grid_size; +#if __AVX__ + for (int x = 0; x + 23 < nn; x += 24) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr); + __m256 tmp_y = _mm256_loadu_ps(gridptr + 8); + __m256 gz = _mm256_loadu_ps(gridptr + 16); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); + __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); + gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); + + tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); + tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); + + gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); + gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); + gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); + + // compute coord + { + gx = unormalize(vImgWf, gx); + gy = unormalize(vImgHf, gy); + gz = unormalize(vImgDf, gz); + } + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); + + __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)), + _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); + 
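// Note: with zeros padding the rounded nearest coordinate may fall outside the
// volume, so all three axes are range-checked against the open interval
// (-1, size) and the combined mask (-1.0f when in bounds, 0.0f otherwise) is
// stored for the masked gather in the interpolation pass. Scalar equivalent, as
// in the tail loop below:
//   *in_bound_ptr = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h)
//                    & (z0 > -1) & (z0 < src.d)) ? -1.0f : 0.0f;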
v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, gz, _CMP_GT_OS)));
+
+ __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), gz,
+ _mm256_comp_fmadd_ps(gy, vImgWf, gx)), vElempackf);
+ __m256i i_offset = _mm256_cvtps_epi32(offset);
+
+ _mm256_storeu_ps(in_bound_ptr, v_in_range);
+ _mm256_storeu_epi32(offset_ptr, i_offset);
+
+ gridptr += 24;
+ offset_ptr += 8;
+ in_bound_ptr += 8;
+ }
+
+ nn = grid_size % 24;
+#endif // __AVX__
+
+ for (int x = grid_size - nn; x < grid_size; x += 3)
+ {
+ float sample_x = *gridptr;
+ float sample_y = *(gridptr + 1);
+ float sample_z = *(gridptr + 2);
+
+ sample_x = unormalize(src.w, sample_x);
+ sample_y = unormalize(src.h, sample_y);
+ sample_z = unormalize(src.d, sample_z);
+
+ int x0 = static_cast<int>(floor(sample_x + 0.5f));
+ int y0 = static_cast<int>(floor(sample_y + 0.5f));
+ int z0 = static_cast<int>(floor(sample_z + 0.5f));
+
+ *in_bound_ptr = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h) & (z0 > -1) & (z0 < src.d)) ? -1.0f : 0.0f;
+ *offset_ptr = (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack;
+
+ gridptr += 3;
+ offset_ptr++;
+ in_bound_ptr++;
+ }
+ }
+ }
+ else
+ {
+ const float* gridptr_x = grid.channel(0);
+ const float* gridptr_y = grid.channel(1);
+ const float* gridptr_z = grid.channel(2);
+
+ int nn = grid_size;
+#if __AVX__
+ for (int x = 0; x + 7 < nn; x += 8)
+ {
+ __m256 gx = _mm256_loadu_ps(gridptr_x);
+ __m256 gy = _mm256_loadu_ps(gridptr_y);
+ __m256 gz = _mm256_loadu_ps(gridptr_z);
+
+ // compute coord
+ {
+ gx = unormalize(vImgWf, gx);
+ gy = unormalize(vImgHf, gy);
+ gz = unormalize(vImgDf, gz);
+
+ gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f)));
+ gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f)));
+ gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f)));
+ }
+
+ __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgWf, gx, _CMP_GT_OS)),
+ _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS)));
+ v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, gz, _CMP_GT_OS)));
+
+ __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), gz,
+ _mm256_comp_fmadd_ps(gy, vImgWf, gx)),vElempackf);
+ __m256i i_offset = _mm256_cvtps_epi32(offset);
+
+ _mm256_storeu_ps(in_bound_ptr, v_in_range);
+ _mm256_storeu_epi32(offset_ptr, i_offset);
+
+ gridptr_x += 8;
+ gridptr_y += 8;
+ gridptr_z += 8;
+
+ offset_ptr += 8;
+ in_bound_ptr += 8;
+ }
+
+ nn = grid_size & 7;
+#endif // __AVX__
+
+ for (int x = grid_size - nn; x < grid_size; x++)
+ {
+ float sample_x = *gridptr_x;
+ float sample_y = *gridptr_y;
+ float sample_z = *gridptr_z;
+
+ sample_x = unormalize(src.w, sample_x);
+ sample_y = unormalize(src.h, sample_y);
+ sample_z = unormalize(src.d, sample_z);
+
+ int x0 = static_cast<int>(floor(sample_x + 0.5f));
+ int y0 = static_cast<int>(floor(sample_y + 0.5f));
+ int z0 = static_cast<int>(floor(sample_z + 0.5f));
+
+ *in_bound_ptr = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h) & (z0 > -1) & (z0 < src.d)) ?
-1.0f : 0.0f; + + *offset_ptr = (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack; + + gridptr_x++; + gridptr_y++; + gridptr_z++; + + offset_ptr++; + + in_bound_ptr++; + } + } + } +}; + #if __AVX__ -static void gridsample_2d_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Option& opt) +static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Option& opt) { const int channels = dst.c; const int outw = dst.w; const int outh = dst.h; - const int grid_size = outw * outh; + const int outd = dst.d; + const int grid_size = outw * outh * outd; #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 8dc036ec726..55f4f45df3c 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -256,16 +256,16 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_bilinear_align1_zeros_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_bilinear_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == 2) + else if (padding_mode == PaddingMode::Border) { if (align_corner == 0) { - gridsample_3d_bilinear_align0_border_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_bilinear_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_bilinear_align1_border_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_bilinear_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == 3) + else if (padding_mode == PaddingMode::Reflection) { if (align_corner == 0) { - gridsample_3d_bilinear_align0_reflection_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_bilinear_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_bilinear_align1_reflection_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_bilinear_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } else { - NCNN_LOGE("gridsample sample_type error\n"); + NCNN_LOGE("gridsample padding_mode error\n"); return -100; } } - if (sample_type == 2) + if (sample_type == InterpolationMode::Nearest) { - if (padding_mode == 1) + offset_blob.create(outw, outh, outd, 1, elemsize, 1, opt.blob_allocator); + in_bound_blob.create(outw, outh, outd, 1, elemsize, 1, opt.blob_allocator); + if (offset_blob.empty() || in_bound_blob.empty()) + return -100; + + in_bound_blob.fill(-1.0f); + + if (padding_mode == PaddingMode::Zeros) { if (align_corner == 0) { - gridsample_3d_nearest_align0_zeros_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_nearest_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - 
gridsample_3d_nearest_align1_zeros_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_nearest_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == 2) + else if (padding_mode == PaddingMode::Border) { if (align_corner == 0) { - gridsample_3d_nearest_align0_border_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_nearest_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_nearest_align1_border_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_nearest_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == 3) + else if (padding_mode == PaddingMode::Reflection) { if (align_corner == 0) { - gridsample_3d_nearest_align0_reflection_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_nearest_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_nearest_align1_reflection_blob_pack8(bottom_blob, grid, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_nearest_compute_blob op; + op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } else { - NCNN_LOGE("gridsample sample_type error\n"); + NCNN_LOGE("gridsample padding_mode error\n"); return -100; } } @@ -534,7 +565,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Thu, 23 Feb 2023 15:45:18 +0800 Subject: [PATCH 066/127] [WIP]finish interpolation_pack4 --- .../x86/gridsample_bicubic_compute_blob.h | 95 +++++++++ .../x86/gridsample_bilinear_compute_blob.h | 181 +++++++++++++++++- .../x86/gridsample_nearest_compute_blob.h | 37 +++- src/layer/x86/gridsample_x86.cpp | 97 ++++------ 4 files changed, 352 insertions(+), 58 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index b739fe3ff3c..a2d6a69f87a 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -647,6 +647,7 @@ struct gridsample_2d_bicubic_compute_blob } }; +#if __SSE2__ #if __AVX__ static void cubic_interp1d_p8(__m256& coeffs0, __m256& coeffs1, __m256& coeffs2, __m256& coeffs3, const __m256& tx) { @@ -743,3 +744,97 @@ static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& ds } #endif // __AVX__ +static void cubic_interp1d_p4(__m128& coeffs0, __m128& coeffs1, __m128& coeffs2, __m128& coeffs3, const __m128& tx) +{ + const __m128 A = _mm_set1_ps(-0.75f); + + const __m128 x0 = _mm_add_ps(tx, *(__m128*)_ps_1); + const __m128& x1 = tx; + const __m128 x2 = _mm_sub_ps(*(__m128*)_ps_1, tx); + //const __m128 x3 = _mm_add_ps(x2, *(__m128*)_ps_1); + + coeffs0 = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(A, x0), _mm_mul_ps(_mm_set1_ps(5.0f), A)), x0), _mm_mul_ps(_mm_set1_ps(8.0f), A)), x0), _mm_mul_ps(_mm_set1_ps(4), A)); + coeffs1 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set1_ps(2.0f)), x1), _mm_add_ps(A, _mm_set1_ps(3.0f))), x1), x1), *(__m128*)_ps_1); + 
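+    // scalar form of the cubic convolution weights with A = -0.75 (matches cubic_interp1d below):
+    //   w0 = A*(t+1)^3 - 5A*(t+1)^2 + 8A*(t+1) - 4A
+    //   w1 = (A+2)*t^3 - (A+3)*t^2 + 1
+    //   w2 = (A+2)*(1-t)^3 - (A+3)*(1-t)^2 + 1
+    //   w3 = 1 - w0 - w1 - w2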
coeffs2 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set1_ps(2.0f)), x2), _mm_add_ps(A, _mm_set1_ps(3.0f))), x2), x2), *(__m128*)_ps_1); + coeffs3 = _mm_sub_ps(_mm_sub_ps(_mm_sub_ps(*(__m128*)_ps_1, coeffs0), coeffs1), coeffs2); +} + +static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& dst, Mat& offset, Mat& in_bound, const Mat& value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + + __m128 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; + __m128 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; + __m128 value_f[4]; + +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + int *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; + + float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; + + for (int i = 0; i < 4; i++) + { + v0_offset_ptr[i] = offset.channel(i * 4 + 0); + v1_offset_ptr[i] = offset.channel(i * 4 + 1); + v2_offset_ptr[i] = offset.channel(i * 4 + 2); + v3_offset_ptr[i] = offset.channel(i * 4 + 3); + + v0_in_bound_ptr[i] = in_bound.channel(i * 4 + 0); + v1_in_bound_ptr[i] = in_bound.channel(i * 4 + 1); + v2_in_bound_ptr[i] = in_bound.channel(i * 4 + 2); + v3_in_bound_ptr[i] = in_bound.channel(i * 4 + 3); + } + + const float* value_x = value.channel(0); + const float* value_y = value.channel(1); + + for (int i = 0; i < grid_size; i++) + { + cubic_interp1d_p4(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm_set1_ps(*value_x)); + for (int ii = 0; ii < 4; ii++) + { + __m128 x0_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v0_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(*v0_in_bound_ptr[ii])); + __m128 x1_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v1_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(*v1_in_bound_ptr[ii])); + __m128 x2_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v2_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(*v2_in_bound_ptr[ii])); + __m128 x3_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v3_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(*v3_in_bound_ptr[ii])); + + value_f[ii] = _mm_mul_ps(x_coeffs0, x0_val); + value_f[ii] = _mm_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); + value_f[ii] = _mm_comp_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); + value_f[ii] = _mm_comp_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); + + v0_offset_ptr[ii]++; + v1_offset_ptr[ii]++; + v2_offset_ptr[ii]++; + v3_offset_ptr[ii]++; + + v0_in_bound_ptr[ii]++; + v1_in_bound_ptr[ii]++; + v2_in_bound_ptr[ii]++; + v3_in_bound_ptr[ii]++; + } + + cubic_interp1d_p4(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm_set1_ps(*value_y)); + + __m128 _v = _mm_mul_ps(y_coeffs0, value_f[0]); + _v = _mm_comp_fmadd_ps(y_coeffs1, value_f[1], _v); + _v = _mm_comp_fmadd_ps(y_coeffs2, value_f[2], _v); + _v = _mm_comp_fmadd_ps(y_coeffs3, value_f[3], _v); + _mm_storeu_ps(dstptr, _v); + + value_x++; + value_y++; + + dstptr += 4; + } + } +} +#endif // __SSE2__ diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 3a5752dade3..073f5461c30 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -1866,6 +1866,7 @@ struct gridsample_3d_bilinear_compute_blob } }; +#if __SSE2__ #if __AVX__ static 
void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Mat& value, const Option& opt) { @@ -2044,4 +2045,182 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d } } } -#endif // __AVX__ \ No newline at end of file +#endif // __AVX__ +static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Mat& value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const int* offset_ptr_00 = offset.channel(0); + const int* offset_ptr_01 = offset.channel(1); + const int* offset_ptr_10 = offset.channel(2); + const int* offset_ptr_11 = offset.channel(3); + + const float* in_bound_ptr_00 = in_bound.channel(0); + const float* in_bound_ptr_01 = in_bound.channel(1); + const float* in_bound_ptr_10 = in_bound.channel(2); + const float* in_bound_ptr_11 = in_bound.channel(3); + + const float* value_ptr_alpha = value.channel(0); + const float* value_ptr_beta = value.channel(1); + + for (int i = 0; i < grid_size; i++) + { + __m128i v00_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_00), _mm_set_epi32(3, 2, 1, 0)); + __m128i v01_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_01), _mm_set_epi32(3, 2, 1, 0)); + __m128i v10_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_10), _mm_set_epi32(3, 2, 1, 0)); + __m128i v11_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_11), _mm_set_epi32(3, 2, 1, 0)); + + __m128 v00_in_range = _mm_set1_ps(*in_bound_ptr_00); + __m128 v01_in_range = _mm_set1_ps(*in_bound_ptr_01); + __m128 v10_in_range = _mm_set1_ps(*in_bound_ptr_10); + __m128 v11_in_range = _mm_set1_ps(*in_bound_ptr_11); + + __m128 v00_val = mask_gather_ps(srcptr, v00_offset, v00_in_range); + __m128 v01_val = mask_gather_ps(srcptr, v01_offset, v01_in_range); + __m128 v10_val = mask_gather_ps(srcptr, v10_offset, v10_in_range); + __m128 v11_val = mask_gather_ps(srcptr, v11_offset, v11_in_range); + + __m128 alpha = _mm_set1_ps(*value_ptr_alpha); + __m128 beta = _mm_set1_ps(*value_ptr_beta); + + __m128 v0 = _mm_comp_fmadd_ps(v01_val, alpha, _mm_comp_fnmadd_ps(v00_val, alpha, v00_val)); + __m128 v1 = _mm_comp_fmadd_ps(v11_val, alpha, _mm_comp_fnmadd_ps(v10_val, alpha, v10_val)); + + __m128 _v = _mm_comp_fmadd_ps(v1, beta, _mm_comp_fnmadd_ps(v0, beta, v0)); + _mm_storeu_ps(dstptr, _v); + + offset_ptr_00++; + offset_ptr_01++; + offset_ptr_10++; + offset_ptr_11++; + + in_bound_ptr_00++; + in_bound_ptr_01++; + in_bound_ptr_10++; + in_bound_ptr_11++; + + value_ptr_alpha++; + value_ptr_beta++; + + dstptr += 4; + } + } +} +static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Mat& value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const int* offset_ptr_000 = offset.channel(0); + const int* offset_ptr_001 = offset.channel(1); + const int* offset_ptr_010 = offset.channel(2); + const int* offset_ptr_011 = offset.channel(3); + 
const int* offset_ptr_100 = offset.channel(4); + const int* offset_ptr_101 = offset.channel(5); + const int* offset_ptr_110 = offset.channel(6); + const int* offset_ptr_111 = offset.channel(7); + + const float* in_bound_ptr_000 = in_bound.channel(0); + const float* in_bound_ptr_001 = in_bound.channel(1); + const float* in_bound_ptr_010 = in_bound.channel(2); + const float* in_bound_ptr_011 = in_bound.channel(3); + const float* in_bound_ptr_100 = in_bound.channel(4); + const float* in_bound_ptr_101 = in_bound.channel(5); + const float* in_bound_ptr_110 = in_bound.channel(6); + const float* in_bound_ptr_111 = in_bound.channel(7); + + const float* value_ptr_alpha = value.channel(0); + const float* value_ptr_beta = value.channel(1); + const float* value_ptr_gamma = value.channel(2); + + for (int i = 0; i < grid_size; i++) + { + __m128i v000_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_000), _mm_set_epi32(3, 2, 1, 0)); + __m128i v001_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_001), _mm_set_epi32(3, 2, 1, 0)); + __m128i v010_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_010), _mm_set_epi32(3, 2, 1, 0)); + __m128i v011_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_011), _mm_set_epi32(3, 2, 1, 0)); + __m128i v100_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_100), _mm_set_epi32(3, 2, 1, 0)); + __m128i v101_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_101), _mm_set_epi32(3, 2, 1, 0)); + __m128i v110_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_110), _mm_set_epi32(3, 2, 1, 0)); + __m128i v111_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_111), _mm_set_epi32(3, 2, 1, 0)); + + __m128 v000_in_range = _mm_set1_ps(*in_bound_ptr_000); + __m128 v001_in_range = _mm_set1_ps(*in_bound_ptr_001); + __m128 v010_in_range = _mm_set1_ps(*in_bound_ptr_010); + __m128 v011_in_range = _mm_set1_ps(*in_bound_ptr_011); + __m128 v100_in_range = _mm_set1_ps(*in_bound_ptr_100); + __m128 v101_in_range = _mm_set1_ps(*in_bound_ptr_101); + __m128 v110_in_range = _mm_set1_ps(*in_bound_ptr_110); + __m128 v111_in_range = _mm_set1_ps(*in_bound_ptr_111); + + __m128 v000_val = mask_gather_ps(srcptr, v000_offset, v000_in_range); + __m128 v001_val = mask_gather_ps(srcptr, v001_offset, v001_in_range); + __m128 v010_val = mask_gather_ps(srcptr, v010_offset, v010_in_range); + __m128 v011_val = mask_gather_ps(srcptr, v011_offset, v011_in_range); + __m128 v100_val = mask_gather_ps(srcptr, v100_offset, v100_in_range); + __m128 v101_val = mask_gather_ps(srcptr, v101_offset, v101_in_range); + __m128 v110_val = mask_gather_ps(srcptr, v110_offset, v110_in_range); + __m128 v111_val = mask_gather_ps(srcptr, v111_offset, v111_in_range); + + __m128 alpha = _mm_set1_ps(*value_ptr_alpha); + __m128 beta = _mm_set1_ps(*value_ptr_beta); + __m128 gamma = _mm_set1_ps(*value_ptr_gamma); + + __m128 v00 = _mm_comp_fmadd_ps(v001_val, alpha, _mm_comp_fnmadd_ps(v000_val, alpha, v000_val)); + __m128 v01 = _mm_comp_fmadd_ps(v011_val, alpha, _mm_comp_fnmadd_ps(v010_val, alpha, v010_val)); + __m128 v10 = _mm_comp_fmadd_ps(v101_val, alpha, _mm_comp_fnmadd_ps(v100_val, alpha, v100_val)); + __m128 v11 = _mm_comp_fmadd_ps(v111_val, alpha, _mm_comp_fnmadd_ps(v110_val, alpha, v110_val)); + + __m128 v0 = _mm_comp_fmadd_ps(v01, beta, _mm_comp_fnmadd_ps(v00, beta, v00)); + __m128 v1 = _mm_comp_fmadd_ps(v11, beta, _mm_comp_fnmadd_ps(v10, beta, v10)); + + __m128 _v = _mm_comp_fmadd_ps(v1, gamma, _mm_comp_fnmadd_ps(v0, gamma, v0)); + _mm_storeu_ps(dstptr, _v); + + offset_ptr_000++; + offset_ptr_001++; + offset_ptr_010++; + offset_ptr_011++; 
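+            // note: the _mm_comp_fmadd_ps / _mm_comp_fnmadd_ps pairs above evaluate
+            // lerp(a, b, t) = a + t * (b - a) per lane: fnmadd(a, t, a) yields a * (1 - t),
+            // then fmadd(b, t, .) adds b * t.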
+ + offset_ptr_100++; + offset_ptr_101++; + offset_ptr_110++; + offset_ptr_111++; + + in_bound_ptr_000++; + in_bound_ptr_001++; + in_bound_ptr_010++; + in_bound_ptr_011++; + + in_bound_ptr_100++; + in_bound_ptr_101++; + in_bound_ptr_110++; + in_bound_ptr_111++; + + value_ptr_alpha++; + value_ptr_beta++; + value_ptr_gamma++; + + dstptr += 4; + } + } +} +#endif // __SSE2__ \ No newline at end of file diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index f90fbc9c2b7..bc345d15247 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -660,6 +660,7 @@ struct gridsample_3d_nearest_compute_blob } }; +#if __SSE2__ #if __AVX__ static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Option& opt) { @@ -693,5 +694,39 @@ static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, } } } - #endif // __AVX__ +static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const int* offset_ptr = offset.channel(0); + + const float* in_bound_ptr = in_bound.channel(0); + + for (int i = 0; i < grid_size; i++) + { + __m128 _v = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*offset_ptr), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(*in_bound_ptr)); + + _mm_storeu_ps(dstptr, _v); + + offset_ptr++; + + in_bound_ptr++; + + dstptr += 4; + } + } +} + + +#endif // __SSE2__ diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 55f4f45df3c..f682ffbff9f 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -613,85 +613,70 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector Date: Thu, 23 Feb 2023 21:06:29 +0800 Subject: [PATCH 067/127] [WIP]finish interpolation_pack4 and interpolation_pack1 --- .../x86/gridsample_bicubic_compute_blob.h | 189 +++++++++ .../x86/gridsample_bilinear_compute_blob.h | 381 +++++++++++++++++- .../x86/gridsample_nearest_compute_blob.h | 60 ++- src/layer/x86/gridsample_x86.cpp | 31 +- 4 files changed, 630 insertions(+), 31 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index a2d6a69f87a..e6b0f80a0cb 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -838,3 +838,192 @@ static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& ds } } #endif // __SSE2__ + +static inline void cubic_interp1d(float& coeffs0, float& coeffs1, float& coeffs2, float& coeffs3, float fx) +{ + const float A = -0.75f; + + float fx0 = fx + 1; + float fx1 = fx; + float fx2 = 1 - fx; + // float fx3 = 2 - fx; + + coeffs0 = A * fx0 * fx0 * fx0 - 5 * A * fx0 * fx0 + 8 * A * fx0 - 4 * A; + coeffs1 = (A + 2) * fx1 * fx1 * fx1 - (A + 3) * fx1 * fx1 + 1; + coeffs2 = (A + 2) * fx2 * fx2 * fx2 - (A + 3) * fx2 * fx2 + 1; + coeffs3 = 1.f - coeffs0 - coeffs1 - coeffs2; +} + +static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& dst, Mat& offset, Mat& 
in_bound, const Mat& value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + int *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; + + float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; + + for (int i = 0; i < 4; i++) + { + v0_offset_ptr[i] = offset.channel(i * 4 + 0); + v1_offset_ptr[i] = offset.channel(i * 4 + 1); + v2_offset_ptr[i] = offset.channel(i * 4 + 2); + v3_offset_ptr[i] = offset.channel(i * 4 + 3); + + v0_in_bound_ptr[i] = in_bound.channel(i * 4 + 0); + v1_in_bound_ptr[i] = in_bound.channel(i * 4 + 1); + v2_in_bound_ptr[i] = in_bound.channel(i * 4 + 2); + v3_in_bound_ptr[i] = in_bound.channel(i * 4 + 3); + } + + const float* value_x = value.channel(0); + const float* value_y = value.channel(1); + + int nn = grid_size; +#if __SSE2__ +#if __AVX__ + { + __m256 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; + __m256 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; + __m256 value_f[4]; + for (int i = 0; i + 7 < grid_size; i += 8) + { + cubic_interp1d_p8(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm256_loadu_ps(value_x)); + for (int ii = 0; ii < 4; ii++) + { + __m256 x0_val = mask_gather_ps256(srcptr, _mm256_loadu_epi32(v0_offset_ptr[ii]), _mm256_loadu_ps(v0_in_bound_ptr[ii])); + __m256 x1_val = mask_gather_ps256(srcptr, _mm256_loadu_epi32(v1_offset_ptr[ii]), _mm256_loadu_ps(v1_in_bound_ptr[ii])); + __m256 x2_val = mask_gather_ps256(srcptr, _mm256_loadu_epi32(v2_offset_ptr[ii]), _mm256_loadu_ps(v2_in_bound_ptr[ii])); + __m256 x3_val = mask_gather_ps256(srcptr, _mm256_loadu_epi32(v3_offset_ptr[ii]), _mm256_loadu_ps(v3_in_bound_ptr[ii])); + + value_f[ii] = _mm256_mul_ps(x_coeffs0, x0_val); + value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); + value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); + value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); + + v0_offset_ptr[ii] += 8; + v1_offset_ptr[ii] += 8; + v2_offset_ptr[ii] += 8; + v3_offset_ptr[ii] += 8; + + v0_in_bound_ptr[ii] += 8; + v1_in_bound_ptr[ii] += 8; + v2_in_bound_ptr[ii] += 8; + v3_in_bound_ptr[ii] += 8; + } + + cubic_interp1d_p8(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm256_loadu_ps(value_y)); + + __m256 _v = _mm256_mul_ps(y_coeffs0, value_f[0]); + _v = _mm256_comp_fmadd_ps(y_coeffs1, value_f[1], _v); + _v = _mm256_comp_fmadd_ps(y_coeffs2, value_f[2], _v); + _v = _mm256_comp_fmadd_ps(y_coeffs3, value_f[3], _v); + _mm256_storeu_ps(dstptr, _v); + + value_x += 8; + value_y += 8; + + dstptr += 8; + } + } + nn = grid_size & 7; +#endif // __AVX__ + { + __m128 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; + __m128 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; + __m128 value_f[4]; + for (int i = grid_size - nn; i + 3 < grid_size; i += 4) + { + cubic_interp1d_p4(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm_loadu_ps(value_x)); + for (int ii = 0; ii < 4; ii++) + { + __m128 x0_val = mask_gather_ps(srcptr, _mm_loadu_epi32(v0_offset_ptr[ii]), _mm_loadu_ps(v0_in_bound_ptr[ii])); + __m128 x1_val = mask_gather_ps(srcptr, _mm_loadu_epi32(v1_offset_ptr[ii]), _mm_loadu_ps(v1_in_bound_ptr[ii])); + __m128 x2_val = mask_gather_ps(srcptr, _mm_loadu_epi32(v2_offset_ptr[ii]), _mm_loadu_ps(v2_in_bound_ptr[ii])); + __m128 x3_val = mask_gather_ps(srcptr, 
_mm_loadu_epi32(v3_offset_ptr[ii]), _mm_loadu_ps(v3_in_bound_ptr[ii])); + + value_f[ii] = _mm_mul_ps(x_coeffs0, x0_val); + value_f[ii] = _mm_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); + value_f[ii] = _mm_comp_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); + value_f[ii] = _mm_comp_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); + + v0_offset_ptr[ii] += 4; + v1_offset_ptr[ii] += 4; + v2_offset_ptr[ii] += 4; + v3_offset_ptr[ii] += 4; + + v0_in_bound_ptr[ii] += 4; + v1_in_bound_ptr[ii] += 4; + v2_in_bound_ptr[ii] += 4; + v3_in_bound_ptr[ii] += 4; + } + + cubic_interp1d_p4(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm_loadu_ps(value_y)); + + __m128 _v = _mm_mul_ps(y_coeffs0, value_f[0]); + _v = _mm_comp_fmadd_ps(y_coeffs1, value_f[1], _v); + _v = _mm_comp_fmadd_ps(y_coeffs2, value_f[2], _v); + _v = _mm_comp_fmadd_ps(y_coeffs3, value_f[3], _v); + _mm_storeu_ps(dstptr, _v); + + value_x += 4; + value_y += 4; + + dstptr += 4; + } + } + nn = grid_size & 3; +#endif // __SSE2__ + float x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; + float y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; + float value_f[4]; + + for (int i = grid_size - nn; i < grid_size; i++) + { + cubic_interp1d(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, *value_x); + for (int ii = 0; ii < 4; ii++) + { + float x0_val = *v0_in_bound_ptr[ii] < 0 ? *(srcptr + *v0_offset_ptr[ii]) : 0; + float x1_val = *v1_in_bound_ptr[ii] < 0 ? *(srcptr + *v1_offset_ptr[ii]) : 0; + float x2_val = *v2_in_bound_ptr[ii] < 0 ? *(srcptr + *v2_offset_ptr[ii]) : 0; + float x3_val = *v3_in_bound_ptr[ii] < 0 ? *(srcptr + *v3_offset_ptr[ii]) : 0; + + value_f[ii] = x_coeffs0 * x0_val; + value_f[ii] = x_coeffs1 * x1_val + value_f[ii]; + value_f[ii] = x_coeffs2 * x2_val + value_f[ii]; + value_f[ii] = x_coeffs3 * x3_val + value_f[ii]; + + v0_offset_ptr[ii]++; + v1_offset_ptr[ii]++; + v2_offset_ptr[ii]++; + v3_offset_ptr[ii]++; + + v0_in_bound_ptr[ii]++; + v1_in_bound_ptr[ii]++; + v2_in_bound_ptr[ii]++; + v3_in_bound_ptr[ii]++; + } + + cubic_interp1d(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, *value_y); + + float _v = y_coeffs0 * value_f[0]; + _v = y_coeffs1 * value_f[1] + _v; + _v = y_coeffs2 * value_f[2] + _v; + _v = y_coeffs3 * value_f[3] + _v; + *dstptr = _v; + + value_x++; + value_y++; + + dstptr++; + } + } +} \ No newline at end of file diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 073f5461c30..643eb909623 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -2223,4 +2223,383 @@ static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d } } } -#endif // __SSE2__ \ No newline at end of file +#endif // __SSE2__ + +static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Mat& value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const int* offset_ptr_00 = offset.channel(0); + const int* offset_ptr_01 = offset.channel(1); + const int* offset_ptr_10 = offset.channel(2); + const int* offset_ptr_11 = offset.channel(3); + + const float* in_bound_ptr_00 = in_bound.channel(0); + const float* in_bound_ptr_01 = in_bound.channel(1); + const float* in_bound_ptr_10 = in_bound.channel(2); + 
const float* in_bound_ptr_11 = in_bound.channel(3); + + const float* value_ptr_alpha = value.channel(0); + const float* value_ptr_beta = value.channel(1); + + int nn = grid_size; +#if __SSE2__ +#if __AVX__ + + for (int i = 0; i + 7 < grid_size; i += 8) + { + __m256i v00_offset = _mm256_loadu_epi32(offset_ptr_00); + __m256i v01_offset = _mm256_loadu_epi32(offset_ptr_01); + __m256i v10_offset = _mm256_loadu_epi32(offset_ptr_10); + __m256i v11_offset = _mm256_loadu_epi32(offset_ptr_11); + + __m256 v00_in_range = _mm256_loadu_ps(in_bound_ptr_00); + __m256 v01_in_range = _mm256_loadu_ps(in_bound_ptr_01); + __m256 v10_in_range = _mm256_loadu_ps(in_bound_ptr_10); + __m256 v11_in_range = _mm256_loadu_ps(in_bound_ptr_11); + + __m256 v00_val = mask_gather_ps256(srcptr, v00_offset, v00_in_range); + __m256 v01_val = mask_gather_ps256(srcptr, v01_offset, v01_in_range); + __m256 v10_val = mask_gather_ps256(srcptr, v10_offset, v10_in_range); + __m256 v11_val = mask_gather_ps256(srcptr, v11_offset, v11_in_range); + + __m256 alpha = _mm256_loadu_ps(value_ptr_alpha); + __m256 beta = _mm256_loadu_ps(value_ptr_beta); + + __m256 v0 = _mm256_comp_fmadd_ps(v01_val, alpha, _mm256_comp_fnmadd_ps(v00_val, alpha, v00_val)); + __m256 v1 = _mm256_comp_fmadd_ps(v11_val, alpha, _mm256_comp_fnmadd_ps(v10_val, alpha, v10_val)); + + __m256 _v = _mm256_comp_fmadd_ps(v1, beta, _mm256_comp_fnmadd_ps(v0, beta, v0)); + _mm256_storeu_ps(dstptr, _v); + + offset_ptr_00 += 8; + offset_ptr_01 += 8; + offset_ptr_10 += 8; + offset_ptr_11 += 8; + + in_bound_ptr_00 += 8; + in_bound_ptr_01 += 8; + in_bound_ptr_10 += 8; + in_bound_ptr_11 += 8; + + value_ptr_alpha += 8; + value_ptr_beta += 8; + + dstptr += 8; + } + nn = grid_size & 7; +#endif // __AVX__ + for (int i = grid_size - nn; i + 3 < grid_size; i += 4) + { + __m128i v00_offset = _mm_loadu_epi32(offset_ptr_00); + __m128i v01_offset = _mm_loadu_epi32(offset_ptr_01); + __m128i v10_offset = _mm_loadu_epi32(offset_ptr_10); + __m128i v11_offset = _mm_loadu_epi32(offset_ptr_11); + + __m128 v00_in_range = _mm_loadu_ps(in_bound_ptr_00); + __m128 v01_in_range = _mm_loadu_ps(in_bound_ptr_01); + __m128 v10_in_range = _mm_loadu_ps(in_bound_ptr_10); + __m128 v11_in_range = _mm_loadu_ps(in_bound_ptr_11); + + __m128 v00_val = mask_gather_ps(srcptr, v00_offset, v00_in_range); + __m128 v01_val = mask_gather_ps(srcptr, v01_offset, v01_in_range); + __m128 v10_val = mask_gather_ps(srcptr, v10_offset, v10_in_range); + __m128 v11_val = mask_gather_ps(srcptr, v11_offset, v11_in_range); + + __m128 alpha = _mm_loadu_ps(value_ptr_alpha); + __m128 beta = _mm_loadu_ps(value_ptr_beta); + + __m128 v0 = _mm_comp_fmadd_ps(v01_val, alpha, _mm_comp_fnmadd_ps(v00_val, alpha, v00_val)); + __m128 v1 = _mm_comp_fmadd_ps(v11_val, alpha, _mm_comp_fnmadd_ps(v10_val, alpha, v10_val)); + + __m128 _v = _mm_comp_fmadd_ps(v1, beta, _mm_comp_fnmadd_ps(v0, beta, v0)); + _mm_storeu_ps(dstptr, _v); + + offset_ptr_00 += 4; + offset_ptr_01 += 4; + offset_ptr_10 += 4; + offset_ptr_11 += 4; + + in_bound_ptr_00 += 4; + in_bound_ptr_01 += 4; + in_bound_ptr_10 += 4; + in_bound_ptr_11 += 4; + + value_ptr_alpha += 4; + value_ptr_beta += 4; + + dstptr += 4; + } + nn = grid_size & 3; +#endif // __SSE2__ + for (int i = grid_size - nn; i < grid_size; i++) + { + float v00 = *in_bound_ptr_00 < 0 ? *(srcptr + *offset_ptr_00) : 0; + float v01 = *in_bound_ptr_01 < 0 ? *(srcptr + *offset_ptr_01) : 0; + float v10 = *in_bound_ptr_10 < 0 ? *(srcptr + *offset_ptr_10) : 0; + float v11 = *in_bound_ptr_11 < 0 ? 
*(srcptr + *offset_ptr_11) : 0; + + float v0 = v00 * (1 - *value_ptr_alpha) + v01 * *value_ptr_alpha; + float v1 = v10 * (1 - *value_ptr_alpha) + v11 * *value_ptr_alpha; + + *dstptr = v0 * (1 - *value_ptr_beta) + v1 * *value_ptr_beta; + + in_bound_ptr_00++; + in_bound_ptr_01++; + in_bound_ptr_10++; + in_bound_ptr_11++; + + offset_ptr_00++; + offset_ptr_01++; + offset_ptr_10++; + offset_ptr_11++; + + value_ptr_alpha++; + value_ptr_beta++; + dstptr++; + } + } +} +static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Mat& value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const int* offset_ptr_000 = offset.channel(0); + const int* offset_ptr_001 = offset.channel(1); + const int* offset_ptr_010 = offset.channel(2); + const int* offset_ptr_011 = offset.channel(3); + const int* offset_ptr_100 = offset.channel(4); + const int* offset_ptr_101 = offset.channel(5); + const int* offset_ptr_110 = offset.channel(6); + const int* offset_ptr_111 = offset.channel(7); + + const float* in_bound_ptr_000 = in_bound.channel(0); + const float* in_bound_ptr_001 = in_bound.channel(1); + const float* in_bound_ptr_010 = in_bound.channel(2); + const float* in_bound_ptr_011 = in_bound.channel(3); + const float* in_bound_ptr_100 = in_bound.channel(4); + const float* in_bound_ptr_101 = in_bound.channel(5); + const float* in_bound_ptr_110 = in_bound.channel(6); + const float* in_bound_ptr_111 = in_bound.channel(7); + + const float* value_ptr_alpha = value.channel(0); + const float* value_ptr_beta = value.channel(1); + const float* value_ptr_gamma = value.channel(2); + + int nn = grid_size; +#if __SSE2__ +#if __AVX__ + for (int i = 0; i + 7 < grid_size; i += 8) + { + __m256i v000_offset = _mm256_loadu_epi32(offset_ptr_000); + __m256i v001_offset = _mm256_loadu_epi32(offset_ptr_001); + __m256i v010_offset = _mm256_loadu_epi32(offset_ptr_010); + __m256i v011_offset = _mm256_loadu_epi32(offset_ptr_011); + __m256i v100_offset = _mm256_loadu_epi32(offset_ptr_100); + __m256i v101_offset = _mm256_loadu_epi32(offset_ptr_101); + __m256i v110_offset = _mm256_loadu_epi32(offset_ptr_110); + __m256i v111_offset = _mm256_loadu_epi32(offset_ptr_111); + + __m256 v000_in_range = _mm256_loadu_ps(in_bound_ptr_000); + __m256 v001_in_range = _mm256_loadu_ps(in_bound_ptr_001); + __m256 v010_in_range = _mm256_loadu_ps(in_bound_ptr_010); + __m256 v011_in_range = _mm256_loadu_ps(in_bound_ptr_011); + __m256 v100_in_range = _mm256_loadu_ps(in_bound_ptr_100); + __m256 v101_in_range = _mm256_loadu_ps(in_bound_ptr_101); + __m256 v110_in_range = _mm256_loadu_ps(in_bound_ptr_110); + __m256 v111_in_range = _mm256_loadu_ps(in_bound_ptr_111); + + __m256 v000_val = mask_gather_ps256(srcptr, v000_offset, v000_in_range); + __m256 v001_val = mask_gather_ps256(srcptr, v001_offset, v001_in_range); + __m256 v010_val = mask_gather_ps256(srcptr, v010_offset, v010_in_range); + __m256 v011_val = mask_gather_ps256(srcptr, v011_offset, v011_in_range); + __m256 v100_val = mask_gather_ps256(srcptr, v100_offset, v100_in_range); + __m256 v101_val = mask_gather_ps256(srcptr, v101_offset, v101_in_range); + __m256 v110_val = mask_gather_ps256(srcptr, v110_offset, v110_in_range); + __m256 v111_val = 
mask_gather_ps256(srcptr, v111_offset, v111_in_range); + + __m256 alpha = _mm256_loadu_ps(value_ptr_alpha); + __m256 beta = _mm256_loadu_ps(value_ptr_beta); + __m256 gamma = _mm256_loadu_ps(value_ptr_gamma); + + __m256 v00 = _mm256_comp_fmadd_ps(v001_val, alpha, _mm256_comp_fnmadd_ps(v000_val, alpha, v000_val)); + __m256 v01 = _mm256_comp_fmadd_ps(v011_val, alpha, _mm256_comp_fnmadd_ps(v010_val, alpha, v010_val)); + __m256 v10 = _mm256_comp_fmadd_ps(v101_val, alpha, _mm256_comp_fnmadd_ps(v100_val, alpha, v100_val)); + __m256 v11 = _mm256_comp_fmadd_ps(v111_val, alpha, _mm256_comp_fnmadd_ps(v110_val, alpha, v110_val)); + + __m256 v0 = _mm256_comp_fmadd_ps(v01, beta, _mm256_comp_fnmadd_ps(v00, beta, v00)); + __m256 v1 = _mm256_comp_fmadd_ps(v11, beta, _mm256_comp_fnmadd_ps(v10, beta, v10)); + + __m256 _v = _mm256_comp_fmadd_ps(v1, gamma, _mm256_comp_fnmadd_ps(v0, gamma, v0)); + _mm256_storeu_ps(dstptr, _v); + + offset_ptr_000 += 8; + offset_ptr_001 += 8; + offset_ptr_010 += 8; + offset_ptr_011 += 8; + + offset_ptr_100 += 8; + offset_ptr_101 += 8; + offset_ptr_110 += 8; + offset_ptr_111 += 8; + + in_bound_ptr_000 += 8; + in_bound_ptr_001 += 8; + in_bound_ptr_010 += 8; + in_bound_ptr_011 += 8; + + in_bound_ptr_100 += 8; + in_bound_ptr_101 += 8; + in_bound_ptr_110 += 8; + in_bound_ptr_111 += 8; + + value_ptr_alpha += 8; + value_ptr_beta += 8; + value_ptr_gamma += 8; + + dstptr += 8; + } + + nn = grid_size & 7; +#endif // __AVX__ + for (int i = grid_size - nn; i + 3 < grid_size; i += 4) + { + __m128i v000_offset = _mm_loadu_epi32(offset_ptr_000); + __m128i v001_offset = _mm_loadu_epi32(offset_ptr_001); + __m128i v010_offset = _mm_loadu_epi32(offset_ptr_010); + __m128i v011_offset = _mm_loadu_epi32(offset_ptr_011); + __m128i v100_offset = _mm_loadu_epi32(offset_ptr_100); + __m128i v101_offset = _mm_loadu_epi32(offset_ptr_101); + __m128i v110_offset = _mm_loadu_epi32(offset_ptr_110); + __m128i v111_offset = _mm_loadu_epi32(offset_ptr_111); + + __m128 v000_in_range = _mm_loadu_ps(in_bound_ptr_000); + __m128 v001_in_range = _mm_loadu_ps(in_bound_ptr_001); + __m128 v010_in_range = _mm_loadu_ps(in_bound_ptr_010); + __m128 v011_in_range = _mm_loadu_ps(in_bound_ptr_011); + __m128 v100_in_range = _mm_loadu_ps(in_bound_ptr_100); + __m128 v101_in_range = _mm_loadu_ps(in_bound_ptr_101); + __m128 v110_in_range = _mm_loadu_ps(in_bound_ptr_110); + __m128 v111_in_range = _mm_loadu_ps(in_bound_ptr_111); + + __m128 v000_val = mask_gather_ps(srcptr, v000_offset, v000_in_range); + __m128 v001_val = mask_gather_ps(srcptr, v001_offset, v001_in_range); + __m128 v010_val = mask_gather_ps(srcptr, v010_offset, v010_in_range); + __m128 v011_val = mask_gather_ps(srcptr, v011_offset, v011_in_range); + __m128 v100_val = mask_gather_ps(srcptr, v100_offset, v100_in_range); + __m128 v101_val = mask_gather_ps(srcptr, v101_offset, v101_in_range); + __m128 v110_val = mask_gather_ps(srcptr, v110_offset, v110_in_range); + __m128 v111_val = mask_gather_ps(srcptr, v111_offset, v111_in_range); + + __m128 alpha = _mm_loadu_ps(value_ptr_alpha); + __m128 beta = _mm_loadu_ps(value_ptr_beta); + __m128 gamma = _mm_loadu_ps(value_ptr_gamma); + + __m128 v00 = _mm_comp_fmadd_ps(v001_val, alpha, _mm_comp_fnmadd_ps(v000_val, alpha, v000_val)); + __m128 v01 = _mm_comp_fmadd_ps(v011_val, alpha, _mm_comp_fnmadd_ps(v010_val, alpha, v010_val)); + __m128 v10 = _mm_comp_fmadd_ps(v101_val, alpha, _mm_comp_fnmadd_ps(v100_val, alpha, v100_val)); + __m128 v11 = _mm_comp_fmadd_ps(v111_val, alpha, _mm_comp_fnmadd_ps(v110_val, alpha, v110_val)); + + __m128 v0 
= _mm_comp_fmadd_ps(v01, beta, _mm_comp_fnmadd_ps(v00, beta, v00)); + __m128 v1 = _mm_comp_fmadd_ps(v11, beta, _mm_comp_fnmadd_ps(v10, beta, v10)); + + __m128 _v = _mm_comp_fmadd_ps(v1, gamma, _mm_comp_fnmadd_ps(v0, gamma, v0)); + _mm_storeu_ps(dstptr, _v); + + offset_ptr_000 += 4; + offset_ptr_001 += 4; + offset_ptr_010 += 4; + offset_ptr_011 += 4; + + offset_ptr_100 += 4; + offset_ptr_101 += 4; + offset_ptr_110 += 4; + offset_ptr_111 += 4; + + in_bound_ptr_000 += 4; + in_bound_ptr_001 += 4; + in_bound_ptr_010 += 4; + in_bound_ptr_011 += 4; + + in_bound_ptr_100 += 4; + in_bound_ptr_101 += 4; + in_bound_ptr_110 += 4; + in_bound_ptr_111 += 4; + + value_ptr_alpha += 4; + value_ptr_beta += 4; + value_ptr_gamma += 4; + + dstptr += 4; + } + nn = grid_size & 3; +#endif // __SSE2__ + for (int i = grid_size - nn; i < grid_size; i++) + { + float v000 = *in_bound_ptr_000 < 0 ? *(srcptr + *offset_ptr_000) : 0; + float v001 = *in_bound_ptr_001 < 0 ? *(srcptr + *offset_ptr_001) : 0; + float v010 = *in_bound_ptr_010 < 0 ? *(srcptr + *offset_ptr_010) : 0; + float v011 = *in_bound_ptr_011 < 0 ? *(srcptr + *offset_ptr_011) : 0; + + float v100 = *in_bound_ptr_100 < 0 ? *(srcptr + *offset_ptr_100) : 0; + float v101 = *in_bound_ptr_101 < 0 ? *(srcptr + *offset_ptr_101) : 0; + float v110 = *in_bound_ptr_110 < 0 ? *(srcptr + *offset_ptr_110) : 0; + float v111 = *in_bound_ptr_111 < 0 ? *(srcptr + *offset_ptr_111) : 0; + + + float v00 = v000 * (1 - *value_ptr_alpha) + v001 * *value_ptr_alpha; + float v01 = v010 * (1 - *value_ptr_alpha) + v011 * *value_ptr_alpha; + float v10 = v100 * (1 - *value_ptr_alpha) + v101 * *value_ptr_alpha; + float v11 = v110 * (1 - *value_ptr_alpha) + v111 * *value_ptr_alpha; + + float v0 = v00 * (1 - *value_ptr_beta) + v01 * *value_ptr_beta; + float v1 = v10 * (1 - *value_ptr_beta) + v11 * *value_ptr_beta; + + *dstptr = v0 * (1 - *value_ptr_gamma) + v1 * *value_ptr_gamma; + + offset_ptr_000++; + offset_ptr_001++; + offset_ptr_010++; + offset_ptr_011++; + + offset_ptr_100++; + offset_ptr_101++; + offset_ptr_110++; + offset_ptr_111++; + + in_bound_ptr_000++; + in_bound_ptr_001++; + in_bound_ptr_010++; + in_bound_ptr_011++; + + in_bound_ptr_100++; + in_bound_ptr_101++; + in_bound_ptr_110++; + in_bound_ptr_111++; + + value_ptr_alpha++; + value_ptr_beta++; + value_ptr_gamma++; + dstptr++; + } + } +} \ No newline at end of file diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index bc345d15247..0e1e9ee018c 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -687,9 +687,7 @@ static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, _mm256_storeu_ps(dstptr, _v); offset_ptr++; - in_bound_ptr++; - dstptr += 8; } } @@ -720,9 +718,7 @@ static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, _mm_storeu_ps(dstptr, _v); offset_ptr++; - in_bound_ptr++; - dstptr += 4; } } @@ -730,3 +726,59 @@ static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, #endif // __SSE2__ + +static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr 
= dst.channel(q); + + const int* offset_ptr = offset.channel(0); + + const float* in_bound_ptr = in_bound.channel(0); + + int nn = grid_size; +#if __SSE2__ +#if __AVX__ + for (int i = 0; i + 7 < grid_size; i += 8) + { + __m256 _v = mask_gather_ps256(srcptr, _mm256_loadu_epi32(offset_ptr), _mm256_loadu_ps(in_bound_ptr)); + + _mm256_storeu_ps(dstptr, _v); + + offset_ptr += 8; + in_bound_ptr += 8; + dstptr += 8; + } + nn = grid_size & 7; +#endif // __AVX__ + for (int i = grid_size - nn; i + 3 < grid_size; i += 4) + { + __m128 _v = mask_gather_ps(srcptr, _mm_loadu_epi32(offset_ptr), _mm_loadu_ps(in_bound_ptr)); + + _mm_storeu_ps(dstptr, _v); + + offset_ptr += 4; + in_bound_ptr += 4; + dstptr += 4; + } + nn = grid_size & 3; +#endif // __SSE2__ + for (int i = grid_size - nn; i < grid_size; i ++) + { + *dstptr = *in_bound_ptr < 0 ? *(srcptr + *offset_ptr) : 0; + + in_bound_ptr++; + offset_ptr++; + dstptr++; + } + } +} diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index f682ffbff9f..3c68118fce1 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -98,28 +98,6 @@ static NCNN_FORCEINLINE __m128 mask_gather_ps(const float* ptr, __m128i offset, return v; } -static inline void interpolate_cubic(float fx, float* coeffs) -{ - const float A = -0.75f; - - float fx0 = fx + 1; - float fx1 = fx; - float fx2 = 1 - fx; - // float fx3 = 2 - fx; - - coeffs[0] = A * fx0 * fx0 * fx0 - 5 * A * fx0 * fx0 + 8 * A * fx0 - 4 * A; - coeffs[1] = (A + 2) * fx1 * fx1 * fx1 - (A + 3) * fx1 * fx1 + 1; - coeffs[2] = (A + 2) * fx2 * fx2 * fx2 - (A + 3) * fx2 * fx2 + 1; - coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; -} - -static inline float reflect_coord(float x, int high) -{ - x = abs(x); - x = high - abs(x - high); - return x; -} - #endif // __SSE2__ typedef GridSample::PaddingMode PaddingMode; @@ -646,10 +624,6 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Thu, 23 Feb 2023 13:08:40 +0000 Subject: [PATCH 068/127] apply code-format changes --- .../x86/gridsample_bicubic_compute_blob.h | 8 +-- .../x86/gridsample_bilinear_compute_blob.h | 72 +++++++++---------- .../x86/gridsample_nearest_compute_blob.h | 41 ++++++----- tests/test_gridsample.cpp | 3 +- 4 files changed, 60 insertions(+), 64 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index e6b0f80a0cb..ac76752d257 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -770,7 +770,7 @@ static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& ds __m128 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; __m128 value_f[4]; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* srcptr = src.channel(q); @@ -861,7 +861,7 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds const int outh = dst.h; const int grid_size = outw * outh; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* srcptr = src.channel(q); @@ -889,7 +889,7 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds int nn = grid_size; #if __SSE2__ -#if __AVX__ +#if __AVX__ { __m256 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; __m256 y_coeffs0, y_coeffs1, y_coeffs2, 
y_coeffs3; @@ -1014,7 +1014,7 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds cubic_interp1d(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, *value_y); - float _v = y_coeffs0 * value_f[0]; + float _v = y_coeffs0 * value_f[0]; _v = y_coeffs1 * value_f[1] + _v; _v = y_coeffs2 * value_f[2] + _v; _v = y_coeffs3 * value_f[3] + _v; diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 643eb909623..51ec665ee8c 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -856,7 +856,7 @@ struct gridsample_3d_bilinear_compute_blob } __m256 tnw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t, - _mm256_comp_fmadd_ps(y_n, vImgWf, x_w)), + _mm256_comp_fmadd_ps(y_n, vImgWf, x_w)), vElempackf); __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); @@ -960,7 +960,6 @@ struct gridsample_3d_bilinear_compute_blob bool y1_in_range = (y1 > -1) & (y1 < src.h); bool z1_in_range = (z1 > -1) & (z1 < src.d); - bool v11_in_range = x1_in_range & y1_in_range; *in_bound_ptr_000 = -1.0f; @@ -1108,7 +1107,7 @@ struct gridsample_3d_bilinear_compute_blob } __m256 tnw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t, - _mm256_comp_fmadd_ps(y_n, vImgWf, x_w)), + _mm256_comp_fmadd_ps(y_n, vImgWf, x_w)), vElempackf); __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); @@ -1188,7 +1187,7 @@ struct gridsample_3d_bilinear_compute_blob nn = grid_size & 7; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x ++) + for (int x = grid_size - nn; x < grid_size; x++) { float sample_x = *gridptr_x; float sample_y = *gridptr_y; @@ -1426,7 +1425,7 @@ struct gridsample_3d_bilinear_compute_blob v001_in_range = _mm256_and_ps(v01_in_range, z0_in_range); v010_in_range = _mm256_and_ps(v10_in_range, z0_in_range); v011_in_range = _mm256_and_ps(v11_in_range, z0_in_range); - + v100_in_range = _mm256_and_ps(v00_in_range, z1_in_range); v101_in_range = _mm256_and_ps(v01_in_range, z1_in_range); v110_in_range = _mm256_and_ps(v10_in_range, z1_in_range); @@ -1434,7 +1433,7 @@ struct gridsample_3d_bilinear_compute_blob } __m256 tnw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t, - _mm256_comp_fmadd_ps(y_n, vImgWf, x_w)), + _mm256_comp_fmadd_ps(y_n, vImgWf, x_w)), vElempackf); __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); @@ -1469,7 +1468,7 @@ struct gridsample_3d_bilinear_compute_blob _mm256_storeu_epi32(offset_ptr_001, i_tne_offset); _mm256_storeu_epi32(offset_ptr_010, i_tsw_offset); _mm256_storeu_epi32(offset_ptr_011, i_tse_offset); - + _mm256_storeu_epi32(offset_ptr_100, i_bnw_offset); _mm256_storeu_epi32(offset_ptr_101, i_bne_offset); _mm256_storeu_epi32(offset_ptr_110, i_bsw_offset); @@ -1485,15 +1484,15 @@ struct gridsample_3d_bilinear_compute_blob gridptr += 24; - offset_ptr_000 += 8; - offset_ptr_001 += 8; - offset_ptr_010 += 8; - offset_ptr_011 += 8; + offset_ptr_000 += 8; + offset_ptr_001 += 8; + offset_ptr_010 += 8; + offset_ptr_011 += 8; - offset_ptr_100 += 8; - offset_ptr_101 += 8; - offset_ptr_110 += 8; - offset_ptr_111 += 8; + offset_ptr_100 += 8; + offset_ptr_101 += 8; + offset_ptr_110 += 8; + offset_ptr_111 += 8; 
in_bound_ptr_000 += 8; in_bound_ptr_001 += 8; @@ -1581,7 +1580,7 @@ struct gridsample_3d_bilinear_compute_blob in_bound_ptr_001++; in_bound_ptr_010++; in_bound_ptr_011++; - + in_bound_ptr_100++; in_bound_ptr_101++; in_bound_ptr_110++; @@ -1693,7 +1692,7 @@ struct gridsample_3d_bilinear_compute_blob v001_in_range = _mm256_and_ps(v01_in_range, z0_in_range); v010_in_range = _mm256_and_ps(v10_in_range, z0_in_range); v011_in_range = _mm256_and_ps(v11_in_range, z0_in_range); - + v100_in_range = _mm256_and_ps(v00_in_range, z1_in_range); v101_in_range = _mm256_and_ps(v01_in_range, z1_in_range); v110_in_range = _mm256_and_ps(v10_in_range, z1_in_range); @@ -1701,7 +1700,7 @@ struct gridsample_3d_bilinear_compute_blob } __m256 tnw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), z_t, - _mm256_comp_fmadd_ps(y_n, vImgWf, x_w)), + _mm256_comp_fmadd_ps(y_n, vImgWf, x_w)), vElempackf); __m256 tne_offset = _mm256_add_ps(tnw_offset, vElempackf); __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); @@ -1781,7 +1780,7 @@ struct gridsample_3d_bilinear_compute_blob nn = grid_size & 7; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x ++) + for (int x = grid_size - nn; x < grid_size; x++) { float sample_x = *gridptr_x; float sample_y = *gridptr_y; @@ -1945,7 +1944,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d const int outd = dst.d; const int grid_size = outw * outh * outd; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* srcptr = src.channel(q); @@ -1959,7 +1958,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d const int* offset_ptr_101 = offset.channel(5); const int* offset_ptr_110 = offset.channel(6); const int* offset_ptr_111 = offset.channel(7); - + const float* in_bound_ptr_000 = in_bound.channel(0); const float* in_bound_ptr_001 = in_bound.channel(1); const float* in_bound_ptr_010 = in_bound.channel(2); @@ -1968,7 +1967,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d const float* in_bound_ptr_101 = in_bound.channel(5); const float* in_bound_ptr_110 = in_bound.channel(6); const float* in_bound_ptr_111 = in_bound.channel(7); - + const float* value_ptr_alpha = value.channel(0); const float* value_ptr_beta = value.channel(1); const float* value_ptr_gamma = value.channel(2); @@ -2053,7 +2052,7 @@ static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d const int outh = dst.h; const int grid_size = outw * outh; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* srcptr = src.channel(q); @@ -2123,7 +2122,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d const int outd = dst.d; const int grid_size = outw * outh * outd; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* srcptr = src.channel(q); @@ -2232,7 +2231,7 @@ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d const int outh = dst.h; const int grid_size = outw * outh; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* srcptr = src.channel(q); @@ -2253,7 
+2252,7 @@ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d int nn = grid_size; #if __SSE2__ -#if __AVX__ +#if __AVX__ for (int i = 0; i + 7 < grid_size; i += 8) { @@ -2297,7 +2296,7 @@ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d dstptr += 8; } nn = grid_size & 7; -#endif // __AVX__ +#endif // __AVX__ for (int i = grid_size - nn; i + 3 < grid_size; i += 4) { __m128i v00_offset = _mm_loadu_epi32(offset_ptr_00); @@ -2340,7 +2339,7 @@ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d dstptr += 4; } nn = grid_size & 3; -#endif // __SSE2__ +#endif // __SSE2__ for (int i = grid_size - nn; i < grid_size; i++) { float v00 = *in_bound_ptr_00 < 0 ? *(srcptr + *offset_ptr_00) : 0; @@ -2377,7 +2376,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d const int outd = dst.d; const int grid_size = outw * outh * outd; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* srcptr = src.channel(q); @@ -2407,7 +2406,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d int nn = grid_size; #if __SSE2__ -#if __AVX__ +#if __AVX__ for (int i = 0; i + 7 < grid_size; i += 8) { __m256i v000_offset = _mm256_loadu_epi32(offset_ptr_000); @@ -2456,7 +2455,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d offset_ptr_001 += 8; offset_ptr_010 += 8; offset_ptr_011 += 8; - + offset_ptr_100 += 8; offset_ptr_101 += 8; offset_ptr_110 += 8; @@ -2466,7 +2465,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d in_bound_ptr_001 += 8; in_bound_ptr_010 += 8; in_bound_ptr_011 += 8; - + in_bound_ptr_100 += 8; in_bound_ptr_101 += 8; in_bound_ptr_110 += 8; @@ -2480,7 +2479,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d } nn = grid_size & 7; -#endif // __AVX__ +#endif // __AVX__ for (int i = grid_size - nn; i + 3 < grid_size; i += 4) { __m128i v000_offset = _mm_loadu_epi32(offset_ptr_000); @@ -2529,7 +2528,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d offset_ptr_001 += 4; offset_ptr_010 += 4; offset_ptr_011 += 4; - + offset_ptr_100 += 4; offset_ptr_101 += 4; offset_ptr_110 += 4; @@ -2539,7 +2538,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d in_bound_ptr_001 += 4; in_bound_ptr_010 += 4; in_bound_ptr_011 += 4; - + in_bound_ptr_100 += 4; in_bound_ptr_101 += 4; in_bound_ptr_110 += 4; @@ -2552,7 +2551,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d dstptr += 4; } nn = grid_size & 3; -#endif // __SSE2__ +#endif // __SSE2__ for (int i = grid_size - nn; i < grid_size; i++) { float v000 = *in_bound_ptr_000 < 0 ? *(srcptr + *offset_ptr_000) : 0; @@ -2565,7 +2564,6 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d float v110 = *in_bound_ptr_110 < 0 ? *(srcptr + *offset_ptr_110) : 0; float v111 = *in_bound_ptr_111 < 0 ? 
*(srcptr + *offset_ptr_111) : 0; - float v00 = v000 * (1 - *value_ptr_alpha) + v001 * *value_ptr_alpha; float v01 = v010 * (1 - *value_ptr_alpha) + v011 * *value_ptr_alpha; float v10 = v100 * (1 - *value_ptr_alpha) + v101 * *value_ptr_alpha; diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index 0e1e9ee018c..cdb7ec7b6b6 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -52,7 +52,7 @@ struct gridsample_2d_nearest_compute_blob { gx = unormalize(vImgWf, gx); gx = get_coord(vImgWf, gx); - + gy = unormalize(vImgHf, gy); gy = get_coord(vImgHf, gy); } @@ -78,11 +78,9 @@ struct gridsample_2d_nearest_compute_blob float sample_x = *gridptr; float sample_y = *(gridptr + 1); - sample_x = unormalize(src.w, sample_x); sample_x = get_coord(src.w, sample_x); - sample_y = unormalize(src.h, sample_y); sample_y = get_coord(src.h, sample_y); @@ -113,7 +111,7 @@ struct gridsample_2d_nearest_compute_blob { gx = unormalize(vImgWf, gx); gx = get_coord(vImgWf, gx); - + gy = unormalize(vImgHf, gy); gy = get_coord(vImgHf, gy); } @@ -139,10 +137,10 @@ struct gridsample_2d_nearest_compute_blob { float sample_x = *gridptr_x; float sample_y = *gridptr_y; - + sample_x = unormalize(src.w, sample_x); sample_x = get_coord(src.w, sample_x); - + sample_y = unormalize(src.h, sample_y); sample_y = get_coord(src.h, sample_y); @@ -228,9 +226,8 @@ struct gridsample_2d_nearest_compute_blob float sample_x = *gridptr; float sample_y = *(gridptr + 1); - sample_x = unormalize(src.w, sample_x); - + sample_y = unormalize(src.h, sample_y); int x0 = static_cast(floor(sample_x + 0.5f)); @@ -356,7 +353,7 @@ struct gridsample_3d_nearest_compute_blob { gx = unormalize(vImgWf, gx); gx = get_coord(vImgWf, gx); - + gy = unormalize(vImgHf, gy); gy = get_coord(vImgHf, gy); @@ -369,7 +366,8 @@ struct gridsample_3d_nearest_compute_blob gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), gz, - _mm256_comp_fmadd_ps(gy, vImgWf, gx)),vElempackf); + _mm256_comp_fmadd_ps(gy, vImgWf, gx)), + vElempackf); __m256i i_offset = _mm256_cvtps_epi32(offset); _mm256_storeu_epi32(offset_ptr, i_offset); @@ -428,7 +426,6 @@ struct gridsample_3d_nearest_compute_blob gx = unormalize(vImgWf, gx); gx = get_coord(vImgWf, gx); - gy = unormalize(vImgHf, gy); gy = get_coord(vImgHf, gy); @@ -441,7 +438,8 @@ struct gridsample_3d_nearest_compute_blob gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), gz, - _mm256_comp_fmadd_ps(gy, vImgWf, gx)),vElempackf); + _mm256_comp_fmadd_ps(gy, vImgWf, gx)), + vElempackf); __m256i i_offset = _mm256_cvtps_epi32(offset); _mm256_storeu_epi32(offset_ptr, i_offset); @@ -461,10 +459,10 @@ struct gridsample_3d_nearest_compute_blob float sample_x = *gridptr_x; float sample_y = *gridptr_y; float sample_z = *gridptr_z; - + sample_x = unormalize(src.w, sample_x); sample_x = get_coord(src.w, sample_x); - + sample_y = unormalize(src.h, sample_y); sample_y = get_coord(src.h, sample_y); @@ -546,7 +544,8 @@ struct gridsample_3d_nearest_compute_blob v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, gz, _CMP_GT_OS))); __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), gz, - _mm256_comp_fmadd_ps(gy, vImgWf, gx)), vElempackf); + 
_mm256_comp_fmadd_ps(gy, vImgWf, gx)), + vElempackf); __m256i i_offset = _mm256_cvtps_epi32(offset); _mm256_storeu_ps(in_bound_ptr, v_in_range); @@ -613,7 +612,8 @@ struct gridsample_3d_nearest_compute_blob v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgDf, gz, _CMP_GT_OS))); __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), gz, - _mm256_comp_fmadd_ps(gy, vImgWf, gx)),vElempackf); + _mm256_comp_fmadd_ps(gy, vImgWf, gx)), + vElempackf); __m256i i_offset = _mm256_cvtps_epi32(offset); _mm256_storeu_ps(in_bound_ptr, v_in_range); @@ -635,7 +635,7 @@ struct gridsample_3d_nearest_compute_blob float sample_x = *gridptr_x; float sample_y = *gridptr_y; float sample_z = *gridptr_z; - + sample_x = unormalize(src.w, sample_x); sample_y = unormalize(src.h, sample_y); sample_z = unormalize(src.d, sample_z); @@ -701,7 +701,7 @@ static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, const int outd = dst.d; const int grid_size = outw * outh * outd; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* srcptr = src.channel(q); @@ -724,7 +724,6 @@ static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, } } - #endif // __SSE2__ static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Option& opt) @@ -735,7 +734,7 @@ static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, const int outd = dst.d; const int grid_size = outw * outh * outd; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* srcptr = src.channel(q); @@ -772,7 +771,7 @@ static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, } nn = grid_size & 3; #endif // __SSE2__ - for (int i = grid_size - nn; i < grid_size; i ++) + for (int i = grid_size - nn; i < grid_size; i++) { *dstptr = *in_bound_ptr < 0 ? 
*(srcptr + *offset_ptr) : 0; diff --git a/tests/test_gridsample.cpp b/tests/test_gridsample.cpp index 12ed47155a5..4e6d2cb60ed 100644 --- a/tests/test_gridsample.cpp +++ b/tests/test_gridsample.cpp @@ -188,6 +188,5 @@ int main() || test_gridsample_0() || test_gridsample_1() || test_gridsample_2() - || test_gridsample_3() - ; + || test_gridsample_3(); } From 016dbb5c41748ed2882a69f3542bbccd6bc70cf4 Mon Sep 17 00:00:00 2001 From: Yoh Date: Fri, 24 Feb 2023 02:35:17 +0800 Subject: [PATCH 069/127] finish interpolation_pac16 --- .../x86/gridsample_bicubic_compute_blob.h | 111 ++++++++- .../x86/gridsample_bilinear_compute_blob.h | 213 ++++++++++++++++-- .../x86/gridsample_nearest_compute_blob.h | 36 ++- src/layer/x86/gridsample_x86.cpp | 32 ++- src/layer/x86/sse_mathfun.h | 92 +------- src/layer/x86/unaryop_x86.cpp | 70 +++++- 6 files changed, 431 insertions(+), 123 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index ac76752d257..8e687abe2e7 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -649,6 +649,101 @@ struct gridsample_2d_bicubic_compute_blob #if __SSE2__ #if __AVX__ +#if __AVX512F__ +static void cubic_interp1d_p16(__m512& coeffs0, __m512& coeffs1, __m512& coeffs2, __m512& coeffs3, const __m512& tx) +{ + const __m512 A = _mm512_set1_ps(-0.75f); + + const __m512 x0 = _mm512_add_ps(tx, *(__m512*)_ps512_1); + const __m512& x1 = tx; + const __m512 x2 = _mm512_sub_ps(*(__m512*)_ps512_1, tx); + //const __m512 x3 = _mm512_add_ps(x2, *(__m512*)_ps512_1); + + coeffs0 = _mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(A, x0), _mm512_mul_ps(_mm512_set1_ps(5.0f), A)), x0), _mm512_mul_ps(_mm512_set1_ps(8.0f), A)), x0), _mm512_mul_ps(_mm512_set1_ps(4), A)); + coeffs1 = _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(A, _mm512_set1_ps(2.0f)), x1), _mm512_add_ps(A, _mm512_set1_ps(3.0f))), x1), x1), *(__m512*)_ps512_1); + coeffs2 = _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(A, _mm512_set1_ps(2.0f)), x2), _mm512_add_ps(A, _mm512_set1_ps(3.0f))), x2), x2), *(__m512*)_ps512_1); + coeffs3 = _mm512_sub_ps(_mm512_sub_ps(_mm512_sub_ps(*(__m512*)_ps512_1, coeffs0), coeffs1), coeffs2); +} + +static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& dst, Mat& offset, Mat& in_bound, const Mat& value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + + __m512 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; + __m512 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; + __m512 value_f[4]; + +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + int *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; + + float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; + + for (int i = 0; i < 4; i++) + { + v0_offset_ptr[i] = offset.channel(i * 4 + 0); + v1_offset_ptr[i] = offset.channel(i * 4 + 1); + v2_offset_ptr[i] = offset.channel(i * 4 + 2); + v3_offset_ptr[i] = offset.channel(i * 4 + 3); + + v0_in_bound_ptr[i] = in_bound.channel(i * 4 + 0); + v1_in_bound_ptr[i] = in_bound.channel(i * 4 + 1); + v2_in_bound_ptr[i] = in_bound.channel(i * 4 + 2); + v3_in_bound_ptr[i] = in_bound.channel(i * 4 + 3); + } 
+ + const float* value_x = value.channel(0); + const float* value_y = value.channel(1); + + for (int i = 0; i < grid_size; i++) + { + cubic_interp1d_p16(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm512_set1_ps(*value_x)); + for (int ii = 0; ii < 4; ii++) + { + __m512 x0_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v0_in_bound_ptr[ii]) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v0_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + __m512 x1_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v1_in_bound_ptr[ii]) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v1_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + __m512 x2_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v2_in_bound_ptr[ii]) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v2_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + __m512 x3_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v3_in_bound_ptr[ii]) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v3_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + + value_f[ii] = _mm512_mul_ps(x_coeffs0, x0_val); + value_f[ii] = _mm512_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); + value_f[ii] = _mm512_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); + value_f[ii] = _mm512_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); + + v0_offset_ptr[ii]++; + v1_offset_ptr[ii]++; + v2_offset_ptr[ii]++; + v3_offset_ptr[ii]++; + + v0_in_bound_ptr[ii]++; + v1_in_bound_ptr[ii]++; + v2_in_bound_ptr[ii]++; + v3_in_bound_ptr[ii]++; + } + + cubic_interp1d_p16(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm512_set1_ps(*value_y)); + + __m512 _v = _mm512_mul_ps(y_coeffs0, value_f[0]); + _v = _mm512_fmadd_ps(y_coeffs1, value_f[1], _v); + _v = _mm512_fmadd_ps(y_coeffs2, value_f[2], _v); + _v = _mm512_fmadd_ps(y_coeffs3, value_f[3], _v); + _mm512_storeu_ps(dstptr, _v); + + value_x++; + value_y++; + + dstptr += 16; + } + } +} +#endif // __AVX512F__ static void cubic_interp1d_p8(__m256& coeffs0, __m256& coeffs1, __m256& coeffs2, __m256& coeffs3, const __m256& tx) { const __m256 A = _mm256_set1_ps(-0.75f); @@ -945,10 +1040,10 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds cubic_interp1d_p4(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm_loadu_ps(value_x)); for (int ii = 0; ii < 4; ii++) { - __m128 x0_val = mask_gather_ps(srcptr, _mm_loadu_epi32(v0_offset_ptr[ii]), _mm_loadu_ps(v0_in_bound_ptr[ii])); - __m128 x1_val = mask_gather_ps(srcptr, _mm_loadu_epi32(v1_offset_ptr[ii]), _mm_loadu_ps(v1_in_bound_ptr[ii])); - __m128 x2_val = mask_gather_ps(srcptr, _mm_loadu_epi32(v2_offset_ptr[ii]), _mm_loadu_ps(v2_in_bound_ptr[ii])); - __m128 x3_val = mask_gather_ps(srcptr, _mm_loadu_epi32(v3_offset_ptr[ii]), _mm_loadu_ps(v3_in_bound_ptr[ii])); + __m128 x0_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v0_offset_ptr[ii] + 3), *(v0_offset_ptr[ii] + 2), *(v0_offset_ptr[ii] + 1), *v0_offset_ptr[ii]), _mm_loadu_ps(v0_in_bound_ptr[ii])); + __m128 x1_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v1_offset_ptr[ii] + 3), *(v1_offset_ptr[ii] + 2), 
*(v1_offset_ptr[ii] + 1), *v1_offset_ptr[ii]), _mm_loadu_ps(v1_in_bound_ptr[ii])); + __m128 x2_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v2_offset_ptr[ii] + 3), *(v2_offset_ptr[ii] + 2), *(v2_offset_ptr[ii] + 1), *v2_offset_ptr[ii]), _mm_loadu_ps(v2_in_bound_ptr[ii])); + __m128 x3_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v3_offset_ptr[ii] + 3), *(v3_offset_ptr[ii] + 2), *(v3_offset_ptr[ii] + 1), *v3_offset_ptr[ii]), _mm_loadu_ps(v3_in_bound_ptr[ii])); value_f[ii] = _mm_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); @@ -991,10 +1086,10 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds cubic_interp1d(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, *value_x); for (int ii = 0; ii < 4; ii++) { - float x0_val = *v0_in_bound_ptr[ii] < 0 ? *(srcptr + *v0_offset_ptr[ii]) : 0; - float x1_val = *v1_in_bound_ptr[ii] < 0 ? *(srcptr + *v1_offset_ptr[ii]) : 0; - float x2_val = *v2_in_bound_ptr[ii] < 0 ? *(srcptr + *v2_offset_ptr[ii]) : 0; - float x3_val = *v3_in_bound_ptr[ii] < 0 ? *(srcptr + *v3_offset_ptr[ii]) : 0; + float x0_val = *reinterpret_cast(v0_in_bound_ptr[ii]) < 0 ? *(srcptr + *v0_offset_ptr[ii]) : 0; + float x1_val = *reinterpret_cast(v1_in_bound_ptr[ii]) < 0 ? *(srcptr + *v1_offset_ptr[ii]) : 0; + float x2_val = *reinterpret_cast(v2_in_bound_ptr[ii]) < 0 ? *(srcptr + *v2_offset_ptr[ii]) : 0; + float x3_val = *reinterpret_cast(v3_in_bound_ptr[ii]) < 0 ? *(srcptr + *v3_offset_ptr[ii]) : 0; value_f[ii] = x_coeffs0 * x0_val; value_f[ii] = x_coeffs1 * x1_val + value_f[ii]; diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 51ec665ee8c..704b7ccf539 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -1867,6 +1867,177 @@ struct gridsample_3d_bilinear_compute_blob #if __SSE2__ #if __AVX__ +#if __AVX512F__ +static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Mat& value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const int* offset_ptr_00 = offset.channel(0); + const int* offset_ptr_01 = offset.channel(1); + const int* offset_ptr_10 = offset.channel(2); + const int* offset_ptr_11 = offset.channel(3); + + const float* in_bound_ptr_00 = in_bound.channel(0); + const float* in_bound_ptr_01 = in_bound.channel(1); + const float* in_bound_ptr_10 = in_bound.channel(2); + const float* in_bound_ptr_11 = in_bound.channel(3); + + const float* value_ptr_alpha = value.channel(0); + const float* value_ptr_beta = value.channel(1); + + for (int i = 0; i < grid_size; i++) + { + __m512i v00_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_00), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i v01_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_01), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i v10_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_10), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i v11_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_11), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + 
+ __mmask16 mask00 = *reinterpret_cast(in_bound_ptr_00) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); + __mmask16 mask01 = *reinterpret_cast(in_bound_ptr_01) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); + __mmask16 mask10 = *reinterpret_cast(in_bound_ptr_10) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); + __mmask16 mask11 = *reinterpret_cast(in_bound_ptr_11) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); + + __m512 v00_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask00, v00_offset, srcptr, sizeof(float)); + __m512 v01_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask01, v01_offset, srcptr, sizeof(float)); + __m512 v10_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask10, v10_offset, srcptr, sizeof(float)); + __m512 v11_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask11, v11_offset, srcptr, sizeof(float)); + + __m512 alpha = _mm512_set1_ps(*value_ptr_alpha); + __m512 beta = _mm512_set1_ps(*value_ptr_beta); + + __m512 v0 = _mm512_fmadd_ps(v01_val, alpha, _mm512_fnmadd_ps(v00_val, alpha, v00_val)); + __m512 v1 = _mm512_fmadd_ps(v11_val, alpha, _mm512_fnmadd_ps(v10_val, alpha, v10_val)); + + __m512 _v = _mm512_fmadd_ps(v1, beta, _mm512_fnmadd_ps(v0, beta, v0)); + _mm512_storeu_ps(dstptr, _v); + + offset_ptr_00++; + offset_ptr_01++; + offset_ptr_10++; + offset_ptr_11++; + + in_bound_ptr_00++; + in_bound_ptr_01++; + in_bound_ptr_10++; + in_bound_ptr_11++; + + value_ptr_alpha++; + value_ptr_beta++; + + dstptr += 16; + } + } +} +static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Mat& value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const int* offset_ptr_000 = offset.channel(0); + const int* offset_ptr_001 = offset.channel(1); + const int* offset_ptr_010 = offset.channel(2); + const int* offset_ptr_011 = offset.channel(3); + const int* offset_ptr_100 = offset.channel(4); + const int* offset_ptr_101 = offset.channel(5); + const int* offset_ptr_110 = offset.channel(6); + const int* offset_ptr_111 = offset.channel(7); + + const float* in_bound_ptr_000 = in_bound.channel(0); + const float* in_bound_ptr_001 = in_bound.channel(1); + const float* in_bound_ptr_010 = in_bound.channel(2); + const float* in_bound_ptr_011 = in_bound.channel(3); + const float* in_bound_ptr_100 = in_bound.channel(4); + const float* in_bound_ptr_101 = in_bound.channel(5); + const float* in_bound_ptr_110 = in_bound.channel(6); + const float* in_bound_ptr_111 = in_bound.channel(7); + + const float* value_ptr_alpha = value.channel(0); + const float* value_ptr_beta = value.channel(1); + const float* value_ptr_gamma = value.channel(2); + + for (int i = 0; i < grid_size; i++) + { + __m512i v000_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_000), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i v001_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_001), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i v010_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_010), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i 
v011_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_011), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i v100_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_100), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i v101_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_101), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i v110_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_110), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i v111_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_111), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + + __m512 v000_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(in_bound_ptr_000) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v000_offset, srcptr, sizeof(float)); + __m512 v001_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(in_bound_ptr_001) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v001_offset, srcptr, sizeof(float)); + __m512 v010_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(in_bound_ptr_010) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v010_offset, srcptr, sizeof(float)); + __m512 v011_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(in_bound_ptr_011) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v011_offset, srcptr, sizeof(float)); + __m512 v100_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(in_bound_ptr_100) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v100_offset, srcptr, sizeof(float)); + __m512 v101_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(in_bound_ptr_101) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v101_offset, srcptr, sizeof(float)); + __m512 v110_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(in_bound_ptr_110) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v110_offset, srcptr, sizeof(float)); + __m512 v111_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(in_bound_ptr_111) < 0 ? 
static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v111_offset, srcptr, sizeof(float)); + + __m512 alpha = _mm512_set1_ps(*value_ptr_alpha); + __m512 beta = _mm512_set1_ps(*value_ptr_beta); + __m512 gamma = _mm512_set1_ps(*value_ptr_gamma); + + __m512 v00 = _mm512_fmadd_ps(v001_val, alpha, _mm512_fnmadd_ps(v000_val, alpha, v000_val)); + __m512 v01 = _mm512_fmadd_ps(v011_val, alpha, _mm512_fnmadd_ps(v010_val, alpha, v010_val)); + __m512 v10 = _mm512_fmadd_ps(v101_val, alpha, _mm512_fnmadd_ps(v100_val, alpha, v100_val)); + __m512 v11 = _mm512_fmadd_ps(v111_val, alpha, _mm512_fnmadd_ps(v110_val, alpha, v110_val)); + + __m512 v0 = _mm512_fmadd_ps(v01, beta, _mm512_fnmadd_ps(v00, beta, v00)); + __m512 v1 = _mm512_fmadd_ps(v11, beta, _mm512_fnmadd_ps(v10, beta, v10)); + + __m512 _v = _mm512_fmadd_ps(v1, gamma, _mm512_fnmadd_ps(v0, gamma, v0)); + _mm512_storeu_ps(dstptr, _v); + + offset_ptr_000++; + offset_ptr_001++; + offset_ptr_010++; + offset_ptr_011++; + + offset_ptr_100++; + offset_ptr_101++; + offset_ptr_110++; + offset_ptr_111++; + + in_bound_ptr_000++; + in_bound_ptr_001++; + in_bound_ptr_010++; + in_bound_ptr_011++; + + in_bound_ptr_100++; + in_bound_ptr_101++; + in_bound_ptr_110++; + in_bound_ptr_111++; + + value_ptr_alpha++; + value_ptr_beta++; + value_ptr_gamma++; + + dstptr += 16; + } + } +} + +#endif // __AVX512F__ static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Mat& value, const Option& opt) { const int channels = dst.c; @@ -2299,10 +2470,10 @@ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d #endif // __AVX__ for (int i = grid_size - nn; i + 3 < grid_size; i += 4) { - __m128i v00_offset = _mm_loadu_epi32(offset_ptr_00); - __m128i v01_offset = _mm_loadu_epi32(offset_ptr_01); - __m128i v10_offset = _mm_loadu_epi32(offset_ptr_10); - __m128i v11_offset = _mm_loadu_epi32(offset_ptr_11); + __m128i v00_offset = _mm_set_epi32(*(offset_ptr_00 + 3), *(offset_ptr_00 + 2), *(offset_ptr_00 + 1), *offset_ptr_00); + __m128i v01_offset = _mm_set_epi32(*(offset_ptr_01 + 3), *(offset_ptr_01 + 2), *(offset_ptr_01 + 1), *offset_ptr_01); + __m128i v10_offset = _mm_set_epi32(*(offset_ptr_10 + 3), *(offset_ptr_10 + 2), *(offset_ptr_10 + 1), *offset_ptr_10); + __m128i v11_offset = _mm_set_epi32(*(offset_ptr_11 + 3), *(offset_ptr_11 + 2), *(offset_ptr_11 + 1), *offset_ptr_11); __m128 v00_in_range = _mm_loadu_ps(in_bound_ptr_00); __m128 v01_in_range = _mm_loadu_ps(in_bound_ptr_01); @@ -2482,14 +2653,14 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d #endif // __AVX__ for (int i = grid_size - nn; i + 3 < grid_size; i += 4) { - __m128i v000_offset = _mm_loadu_epi32(offset_ptr_000); - __m128i v001_offset = _mm_loadu_epi32(offset_ptr_001); - __m128i v010_offset = _mm_loadu_epi32(offset_ptr_010); - __m128i v011_offset = _mm_loadu_epi32(offset_ptr_011); - __m128i v100_offset = _mm_loadu_epi32(offset_ptr_100); - __m128i v101_offset = _mm_loadu_epi32(offset_ptr_101); - __m128i v110_offset = _mm_loadu_epi32(offset_ptr_110); - __m128i v111_offset = _mm_loadu_epi32(offset_ptr_111); + __m128i v000_offset = _mm_set_epi32(*(offset_ptr_000 + 3), *(offset_ptr_000 + 2), *(offset_ptr_000 + 1), *offset_ptr_000); + __m128i v001_offset = _mm_set_epi32(*(offset_ptr_001 + 3), *(offset_ptr_001 + 2), *(offset_ptr_001 + 1), *offset_ptr_001); + __m128i v010_offset = _mm_set_epi32(*(offset_ptr_010 + 3), *(offset_ptr_010 + 2), *(offset_ptr_010 + 1), *offset_ptr_010); + __m128i 
v011_offset = _mm_set_epi32(*(offset_ptr_011 + 3), *(offset_ptr_011 + 2), *(offset_ptr_011 + 1), *offset_ptr_011); + __m128i v100_offset = _mm_set_epi32(*(offset_ptr_100 + 3), *(offset_ptr_100 + 2), *(offset_ptr_100 + 1), *offset_ptr_100); + __m128i v101_offset = _mm_set_epi32(*(offset_ptr_101 + 3), *(offset_ptr_101 + 2), *(offset_ptr_101 + 1), *offset_ptr_101); + __m128i v110_offset = _mm_set_epi32(*(offset_ptr_110 + 3), *(offset_ptr_110 + 2), *(offset_ptr_110 + 1), *offset_ptr_110); + __m128i v111_offset = _mm_set_epi32(*(offset_ptr_111 + 3), *(offset_ptr_111 + 2), *(offset_ptr_111 + 1), *offset_ptr_111); __m128 v000_in_range = _mm_loadu_ps(in_bound_ptr_000); __m128 v001_in_range = _mm_loadu_ps(in_bound_ptr_001); @@ -2554,15 +2725,15 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d #endif // __SSE2__ for (int i = grid_size - nn; i < grid_size; i++) { - float v000 = *in_bound_ptr_000 < 0 ? *(srcptr + *offset_ptr_000) : 0; - float v001 = *in_bound_ptr_001 < 0 ? *(srcptr + *offset_ptr_001) : 0; - float v010 = *in_bound_ptr_010 < 0 ? *(srcptr + *offset_ptr_010) : 0; - float v011 = *in_bound_ptr_011 < 0 ? *(srcptr + *offset_ptr_011) : 0; - - float v100 = *in_bound_ptr_100 < 0 ? *(srcptr + *offset_ptr_100) : 0; - float v101 = *in_bound_ptr_101 < 0 ? *(srcptr + *offset_ptr_101) : 0; - float v110 = *in_bound_ptr_110 < 0 ? *(srcptr + *offset_ptr_110) : 0; - float v111 = *in_bound_ptr_111 < 0 ? *(srcptr + *offset_ptr_111) : 0; + float v000 = *reinterpret_cast(in_bound_ptr_000) < 0 ? *(srcptr + *offset_ptr_000) : 0; + float v001 = *reinterpret_cast(in_bound_ptr_001) < 0 ? *(srcptr + *offset_ptr_001) : 0; + float v010 = *reinterpret_cast(in_bound_ptr_010) < 0 ? *(srcptr + *offset_ptr_010) : 0; + float v011 = *reinterpret_cast(in_bound_ptr_011) < 0 ? *(srcptr + *offset_ptr_011) : 0; + + float v100 = *reinterpret_cast(in_bound_ptr_100) < 0 ? *(srcptr + *offset_ptr_100) : 0; + float v101 = *reinterpret_cast(in_bound_ptr_101) < 0 ? *(srcptr + *offset_ptr_101) : 0; + float v110 = *reinterpret_cast(in_bound_ptr_110) < 0 ? *(srcptr + *offset_ptr_110) : 0; + float v111 = *reinterpret_cast(in_bound_ptr_111) < 0 ? *(srcptr + *offset_ptr_111) : 0; float v00 = v000 * (1 - *value_ptr_alpha) + v001 * *value_ptr_alpha; float v01 = v010 * (1 - *value_ptr_alpha) + v011 * *value_ptr_alpha; diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index cdb7ec7b6b6..b96304a3a21 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -662,6 +662,38 @@ struct gridsample_3d_nearest_compute_blob #if __SSE2__ #if __AVX__ +#if __AVX512F__ +static void gridsample_nearest_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + +#pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const int* offset_ptr = offset.channel(0); + + const float* in_bound_ptr = in_bound.channel(0); + + for (int i = 0; i < grid_size; i++) + { + __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(in_bound_ptr) < 0 ? 
static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + + _mm512_storeu_ps(dstptr, _v); + + offset_ptr++; + in_bound_ptr++; + dstptr += 16; + } + } +} +#endif // __AVX512F__ static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Option& opt) { const int channels = dst.c; @@ -761,7 +793,7 @@ static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, #endif // __AVX__ for (int i = grid_size - nn; i + 3 < grid_size; i += 4) { - __m128 _v = mask_gather_ps(srcptr, _mm_loadu_epi32(offset_ptr), _mm_loadu_ps(in_bound_ptr)); + __m128 _v = mask_gather_ps(srcptr, _mm_set_epi32(*(offset_ptr + 3), *(offset_ptr + 2), *(offset_ptr + 1), *offset_ptr), _mm_loadu_ps(in_bound_ptr)); _mm_storeu_ps(dstptr, _v); @@ -773,7 +805,7 @@ static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, #endif // __SSE2__ for (int i = grid_size - nn; i < grid_size; i++) { - *dstptr = *in_bound_ptr < 0 ? *(srcptr + *offset_ptr) : 0; + *dstptr = *reinterpret_cast(in_bound_ptr) < 0 ? *(srcptr + *offset_ptr) : 0; in_bound_ptr++; offset_ptr++; diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 3c68118fce1..f57fe8550a5 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -416,7 +416,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(static_cast(absolute)); - __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(absolute)); - - // truncated_with_sign = (truncated || negative_mask); - __m128 truncated_with_sign = _mm_or_ps(truncated, negative_mask); - - // negative_fix = ((x < truncated_with_sign) ? 1.0f : 0.0f); - __m128 negative_fix = _mm_and_ps( - _mm_cmplt_ps(x, truncated_with_sign), - _mm_set_ps1(1.0f)); - - // fixed_result = truncated_with_sign - negative_fix; - __m128 fixed_result = _mm_sub_ps(truncated_with_sign, negative_fix); - - // return ((x && no_fraction) || (!no_fraction && fixed_result)); - return _mm_or_ps( - _mm_and_ps(x, no_fraction), - _mm_andnot_ps(no_fraction, fixed_result)); -} - -static NCNN_FORCEINLINE __m128 ceil_ps(const __m128 x) -{ -#if (_MSC_VER && __AVX__) || (__SSE4_1__ && __linux__) - return _mm_ceil_ps(x); -#endif // __SSE4_1__ - - // Use negative zero as the sign bit mask. - const __m128 magic_negative_zero = _mm_set_ps1(-0.0f); - - // The smallest float number that have no fractional part. (2^23) - const __m128 magic_smallest_no_fraction = _mm_set_ps1(8388608.0f); - - // absolute = abs(x); - __m128 absolute = _mm_andnot_ps(magic_negative_zero, x); - - // negative_mask = magic_negative_zero && x; - __m128 negative_mask = _mm_and_ps(magic_negative_zero, x); - - // no_fraction = (magic_smallest_no_fraction < absolute); - __m128 no_fraction = _mm_cmplt_ps(magic_smallest_no_fraction, absolute); - - // truncated = static_cast(static_cast(absolute)); - __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(absolute)); - - // truncated_with_sign = (truncated || negative_mask); - __m128 truncated_with_sign = _mm_or_ps(truncated, negative_mask); - - // positive_fix = ((x > -0.0f) && (x > truncated_with_sign) ? 
-1.0f : 0.0f); - __m128 positive_fix = _mm_and_ps( - _mm_and_ps( - _mm_cmpgt_ps(x, magic_negative_zero), - _mm_cmpgt_ps(x, truncated_with_sign)), - _mm_set_ps1(-1.0f)); - - // fixed_result = truncated_with_sign - positive_fix; - __m128 fixed_result = _mm_sub_ps(truncated_with_sign, positive_fix); - - // return ((x && no_fraction) || (!no_fraction && fixed_result)); - return _mm_or_ps( - _mm_and_ps(x, no_fraction), - _mm_andnot_ps(no_fraction, fixed_result)); + //TODO sse optimize + float tmpx[4]; + float tmpy[4]; + _mm_storeu_ps(tmpx, a); + _mm_storeu_ps(tmpy, b); + tmpx[0] = atan2(tmpx[0], tmpy[0]); + tmpx[1] = atan2(tmpx[1], tmpy[1]); + tmpx[2] = atan2(tmpx[2], tmpy[2]); + tmpx[3] = atan2(tmpx[3], tmpy[3]); + return _mm_loadu_ps(tmpx); } #endif // SSE_MATHFUN_H diff --git a/src/layer/x86/unaryop_x86.cpp b/src/layer/x86/unaryop_x86.cpp index 9674804d322..4ff17767c2f 100644 --- a/src/layer/x86/unaryop_x86.cpp +++ b/src/layer/x86/unaryop_x86.cpp @@ -158,7 +158,39 @@ struct unary_op_floor #if __SSE2__ __m128 func_pack4(const __m128& x) const { - return floor_ps(x); + // Use negative zero as the sign bit mask. + const __m128 magic_negative_zero = _mm_set_ps1(-0.0f); + + // The smallest float number that have no fractional part. (2^23) + const __m128 magic_smallest_no_fraction = _mm_set_ps1(8388608.0f); + + // absolute = abs(x); + __m128 absolute = _mm_andnot_ps(magic_negative_zero, x); + + // negative_mask = magic_negative_zero && x; + __m128 negative_mask = _mm_and_ps(magic_negative_zero, x); + + // no_fraction = (magic_smallest_no_fraction < absolute); + __m128 no_fraction = _mm_cmplt_ps(magic_smallest_no_fraction, absolute); + + // truncated = static_cast(static_cast(absolute)); + __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(absolute)); + + // truncated_with_sign = (truncated || negative_mask); + __m128 truncated_with_sign = _mm_or_ps(truncated, negative_mask); + + // negative_fix = ((x < truncated_with_sign) ? 1.0f : 0.0f); + __m128 negative_fix = _mm_and_ps( + _mm_cmplt_ps(x, truncated_with_sign), + _mm_set_ps1(1.0f)); + + // fixed_result = truncated_with_sign - negative_fix; + __m128 fixed_result = _mm_sub_ps(truncated_with_sign, negative_fix); + + // return ((x && no_fraction) || (!no_fraction && fixed_result)); + return _mm_or_ps( + _mm_and_ps(x, no_fraction), + _mm_andnot_ps(no_fraction, fixed_result)); } #if __AVX__ __m256 func_pack8(const __m256& x) const @@ -184,7 +216,41 @@ struct unary_op_ceil #if __SSE2__ __m128 func_pack4(const __m128& x) const { - return ceil_ps(x); + // Use negative zero as the sign bit mask. + const __m128 magic_negative_zero = _mm_set_ps1(-0.0f); + + // The smallest float number that have no fractional part. (2^23) + const __m128 magic_smallest_no_fraction = _mm_set_ps1(8388608.0f); + + // absolute = abs(x); + __m128 absolute = _mm_andnot_ps(magic_negative_zero, x); + + // negative_mask = magic_negative_zero && x; + __m128 negative_mask = _mm_and_ps(magic_negative_zero, x); + + // no_fraction = (magic_smallest_no_fraction < absolute); + __m128 no_fraction = _mm_cmplt_ps(magic_smallest_no_fraction, absolute); + + // truncated = static_cast(static_cast(absolute)); + __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(absolute)); + + // truncated_with_sign = (truncated || negative_mask); + __m128 truncated_with_sign = _mm_or_ps(truncated, negative_mask); + + // positive_fix = ((x > -0.0f) && (x > truncated_with_sign) ? 
-1.0f : 0.0f); + __m128 positive_fix = _mm_and_ps( + _mm_and_ps( + _mm_cmpgt_ps(x, magic_negative_zero), + _mm_cmpgt_ps(x, truncated_with_sign)), + _mm_set_ps1(-1.0f)); + + // fixed_result = truncated_with_sign - positive_fix; + __m128 fixed_result = _mm_sub_ps(truncated_with_sign, positive_fix); + + // return ((x && no_fraction) || (!no_fraction && fixed_result)); + return _mm_or_ps( + _mm_and_ps(x, no_fraction), + _mm_andnot_ps(no_fraction, fixed_result)); } #if __AVX__ __m256 func_pack8(const __m256& x) const From 51a1fd53d6d1213a096f045890ca25ae3b078e26 Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Thu, 23 Feb 2023 18:37:15 +0000 Subject: [PATCH 070/127] apply code-format changes --- .../x86/gridsample_bicubic_compute_blob.h | 2 +- .../x86/gridsample_bilinear_compute_blob.h | 4 ++-- .../x86/gridsample_nearest_compute_blob.h | 2 +- src/layer/x86/unaryop_x86.cpp | 20 +++++++++---------- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index 8e687abe2e7..ef3b58472fc 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -676,7 +676,7 @@ static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& d __m512 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; __m512 value_f[4]; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* srcptr = src.channel(q); diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 704b7ccf539..b6ff9c398d0 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -1875,7 +1875,7 @@ static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& const int outh = dst.h; const int grid_size = outw * outh; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* srcptr = src.channel(q); @@ -1945,7 +1945,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& const int outd = dst.d; const int grid_size = outw * outh * outd; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* srcptr = src.channel(q); diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index b96304a3a21..c03ad15151e 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -671,7 +671,7 @@ static void gridsample_nearest_apply_interpolation_p16(const Mat& src, Mat& dst, const int outd = dst.d; const int grid_size = outw * outh * outd; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* srcptr = src.channel(q); diff --git a/src/layer/x86/unaryop_x86.cpp b/src/layer/x86/unaryop_x86.cpp index 4ff17767c2f..4bdb3c10f6d 100644 --- a/src/layer/x86/unaryop_x86.cpp +++ b/src/layer/x86/unaryop_x86.cpp @@ -181,16 +181,16 @@ struct unary_op_floor // negative_fix = ((x < truncated_with_sign) ? 
1.0f : 0.0f); __m128 negative_fix = _mm_and_ps( - _mm_cmplt_ps(x, truncated_with_sign), - _mm_set_ps1(1.0f)); + _mm_cmplt_ps(x, truncated_with_sign), + _mm_set_ps1(1.0f)); // fixed_result = truncated_with_sign - negative_fix; __m128 fixed_result = _mm_sub_ps(truncated_with_sign, negative_fix); // return ((x && no_fraction) || (!no_fraction && fixed_result)); return _mm_or_ps( - _mm_and_ps(x, no_fraction), - _mm_andnot_ps(no_fraction, fixed_result)); + _mm_and_ps(x, no_fraction), + _mm_andnot_ps(no_fraction, fixed_result)); } #if __AVX__ __m256 func_pack8(const __m256& x) const @@ -239,18 +239,18 @@ struct unary_op_ceil // positive_fix = ((x > -0.0f) && (x > truncated_with_sign) ? -1.0f : 0.0f); __m128 positive_fix = _mm_and_ps( - _mm_and_ps( - _mm_cmpgt_ps(x, magic_negative_zero), - _mm_cmpgt_ps(x, truncated_with_sign)), - _mm_set_ps1(-1.0f)); + _mm_and_ps( + _mm_cmpgt_ps(x, magic_negative_zero), + _mm_cmpgt_ps(x, truncated_with_sign)), + _mm_set_ps1(-1.0f)); // fixed_result = truncated_with_sign - positive_fix; __m128 fixed_result = _mm_sub_ps(truncated_with_sign, positive_fix); // return ((x && no_fraction) || (!no_fraction && fixed_result)); return _mm_or_ps( - _mm_and_ps(x, no_fraction), - _mm_andnot_ps(no_fraction, fixed_result)); + _mm_and_ps(x, no_fraction), + _mm_andnot_ps(no_fraction, fixed_result)); } #if __AVX__ __m256 func_pack8(const __m256& x) const From 56fb232269c81572368c03f0a8db24c7855fe316 Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Fri, 24 Feb 2023 19:23:00 +0800 Subject: [PATCH 071/127] fix compile bug --- .../x86/gridsample_bicubic_compute_blob.h | 80 ++- .../x86/gridsample_bilinear_compute_blob.h | 671 +++++------------- .../x86/gridsample_nearest_compute_blob.h | 54 +- src/layer/x86/gridsample_x86.cpp | 2 +- 4 files changed, 233 insertions(+), 574 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index ef3b58472fc..88e01fc4a0b 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -24,7 +24,7 @@ struct gridsample_2d_bicubic_compute_blob const __m256 vElempackf = _mm256_set1_ps(src.elempack); #endif // __AVX__ - int *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; + float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; for (int i = 0; i < 4; i++) { @@ -101,10 +101,10 @@ struct gridsample_2d_bicubic_compute_blob __m256 v2_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx2), vElempackf); __m256 v3_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx3), vElempackf); - _mm256_storeu_epi32(v0_offset_ptr[i], _mm256_cvtps_epi32(v0_offset_f)); - _mm256_storeu_epi32(v1_offset_ptr[i], _mm256_cvtps_epi32(v1_offset_f)); - _mm256_storeu_epi32(v2_offset_ptr[i], _mm256_cvtps_epi32(v2_offset_f)); - _mm256_storeu_epi32(v3_offset_ptr[i], _mm256_cvtps_epi32(v3_offset_f)); + _mm256_storeu_ps(v0_offset_ptr[i], v0_offset_f); + _mm256_storeu_ps(v1_offset_ptr[i], v1_offset_f); + _mm256_storeu_ps(v2_offset_ptr[i], v2_offset_f); + _mm256_storeu_ps(v3_offset_ptr[i], v3_offset_f); _mm256_storeu_ps(v0_in_bound_ptr[i], *(__m256*)_ps256_n1); _mm256_storeu_ps(v1_in_bound_ptr[i], *(__m256*)_ps256_n1); @@ -235,10 +235,10 @@ struct gridsample_2d_bicubic_compute_blob __m256 v2_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx2), vElempackf); __m256 v3_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, 
gx3), vElempackf); - _mm256_storeu_epi32(v0_offset_ptr[i], _mm256_cvtps_epi32(v0_offset_f)); - _mm256_storeu_epi32(v1_offset_ptr[i], _mm256_cvtps_epi32(v1_offset_f)); - _mm256_storeu_epi32(v2_offset_ptr[i], _mm256_cvtps_epi32(v2_offset_f)); - _mm256_storeu_epi32(v3_offset_ptr[i], _mm256_cvtps_epi32(v3_offset_f)); + _mm256_storeu_ps(v0_offset_ptr[i], v0_offset_f); + _mm256_storeu_ps(v1_offset_ptr[i], v1_offset_f); + _mm256_storeu_ps(v2_offset_ptr[i], v2_offset_f); + _mm256_storeu_ps(v3_offset_ptr[i], v3_offset_f); _mm256_storeu_ps(v0_in_bound_ptr[i], *(__m256*)_ps256_n1); _mm256_storeu_ps(v1_in_bound_ptr[i], *(__m256*)_ps256_n1); @@ -336,7 +336,7 @@ struct gridsample_2d_bicubic_compute_blob const __m256 vElempackf = _mm256_set1_ps(src.elempack); #endif // __AVX__ - int *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; + float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; @@ -417,10 +417,10 @@ struct gridsample_2d_bicubic_compute_blob __m256 v2_offset_f = _mm256_add_ps(v1_offset_f, vElempackf); __m256 v3_offset_f = _mm256_add_ps(v2_offset_f, vElempackf); - _mm256_storeu_epi32(v0_offset_ptr[i], _mm256_cvtps_epi32(v0_offset_f)); - _mm256_storeu_epi32(v1_offset_ptr[i], _mm256_cvtps_epi32(v1_offset_f)); - _mm256_storeu_epi32(v2_offset_ptr[i], _mm256_cvtps_epi32(v2_offset_f)); - _mm256_storeu_epi32(v3_offset_ptr[i], _mm256_cvtps_epi32(v3_offset_f)); + _mm256_storeu_ps(v0_offset_ptr[i], v0_offset_f); + _mm256_storeu_ps(v1_offset_ptr[i], v1_offset_f); + _mm256_storeu_ps(v2_offset_ptr[i], v2_offset_f); + _mm256_storeu_ps(v3_offset_ptr[i], v3_offset_f); v0_offset_ptr[i] += 8; v1_offset_ptr[i] += 8; @@ -556,10 +556,10 @@ struct gridsample_2d_bicubic_compute_blob __m256 v2_offset_f = _mm256_add_ps(v1_offset_f, vElempackf); __m256 v3_offset_f = _mm256_add_ps(v2_offset_f, vElempackf); - _mm256_storeu_epi32(v0_offset_ptr[i], _mm256_cvtps_epi32(v0_offset_f)); - _mm256_storeu_epi32(v1_offset_ptr[i], _mm256_cvtps_epi32(v1_offset_f)); - _mm256_storeu_epi32(v2_offset_ptr[i], _mm256_cvtps_epi32(v2_offset_f)); - _mm256_storeu_epi32(v3_offset_ptr[i], _mm256_cvtps_epi32(v3_offset_f)); + _mm256_storeu_ps(v0_offset_ptr[i], v0_offset_f); + _mm256_storeu_ps(v1_offset_ptr[i], v1_offset_f); + _mm256_storeu_ps(v2_offset_ptr[i], v2_offset_f); + _mm256_storeu_ps(v3_offset_ptr[i], v3_offset_f); v0_offset_ptr[i] += 8; v1_offset_ptr[i] += 8; @@ -682,7 +682,7 @@ static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& d const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - int *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; + float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; @@ -776,7 +776,7 @@ static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& ds const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - int *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; + float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; @@ -801,10 +801,22 @@ static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& ds cubic_interp1d_p8(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm256_set1_ps(*value_x)); for (int ii = 
0; ii < 4; ii++) { - __m256 x0_val = mask_gather_ps256(srcptr, _mm256_add_epi32(_mm256_set1_epi32(*v0_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)), _mm256_set1_ps(*v0_in_bound_ptr[ii])); - __m256 x1_val = mask_gather_ps256(srcptr, _mm256_add_epi32(_mm256_set1_epi32(*v1_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)), _mm256_set1_ps(*v1_in_bound_ptr[ii])); - __m256 x2_val = mask_gather_ps256(srcptr, _mm256_add_epi32(_mm256_set1_epi32(*v2_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)), _mm256_set1_ps(*v2_in_bound_ptr[ii])); - __m256 x3_val = mask_gather_ps256(srcptr, _mm256_add_epi32(_mm256_set1_epi32(*v3_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)), _mm256_set1_ps(*v3_in_bound_ptr[ii])); +#if __AVX2__ + __m256i v0_offset = _mm256_add_epi32(_mm256_set1_epi32(*v0_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v1_offset = _mm256_add_epi32(_mm256_set1_epi32(*v1_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v2_offset = _mm256_add_epi32(_mm256_set1_epi32(*v2_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v3_offset = _mm256_add_epi32(_mm256_set1_epi32(*v3_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#else + __m256i v0_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*v0_offset_ptr[ii]), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v1_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*v1_offset_ptr[ii]), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v2_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*v2_offset_ptr[ii]), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v3_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*v3_offset_ptr[ii]), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); +#endif // __AVX2__ + + __m256 x0_val = mask_gather_ps256(srcptr, v0_offset, _mm256_set1_ps(*v0_in_bound_ptr[ii])); + __m256 x1_val = mask_gather_ps256(srcptr, v1_offset, _mm256_set1_ps(*v1_in_bound_ptr[ii])); + __m256 x2_val = mask_gather_ps256(srcptr, v2_offset, _mm256_set1_ps(*v2_in_bound_ptr[ii])); + __m256 x3_val = mask_gather_ps256(srcptr, v3_offset, _mm256_set1_ps(*v3_in_bound_ptr[ii])); value_f[ii] = _mm256_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); @@ -871,7 +883,7 @@ static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& ds const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - int *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; + float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; @@ -962,7 +974,7 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - int *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; + float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; @@ -994,10 +1006,10 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds cubic_interp1d_p8(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm256_loadu_ps(value_x)); for (int ii = 0; ii < 4; ii++) { - __m256 x0_val = mask_gather_ps256(srcptr, _mm256_loadu_epi32(v0_offset_ptr[ii]), _mm256_loadu_ps(v0_in_bound_ptr[ii])); - __m256 x1_val = mask_gather_ps256(srcptr, 
_mm256_loadu_epi32(v1_offset_ptr[ii]), _mm256_loadu_ps(v1_in_bound_ptr[ii])); - __m256 x2_val = mask_gather_ps256(srcptr, _mm256_loadu_epi32(v2_offset_ptr[ii]), _mm256_loadu_ps(v2_in_bound_ptr[ii])); - __m256 x3_val = mask_gather_ps256(srcptr, _mm256_loadu_epi32(v3_offset_ptr[ii]), _mm256_loadu_ps(v3_in_bound_ptr[ii])); + __m256 x0_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v0_offset_ptr[ii] + 7), *(v0_offset_ptr[ii] + 6), *(v0_offset_ptr[ii] + 5), *(v0_offset_ptr[ii] + 4), *(v0_offset_ptr[ii] + 3), *(v0_offset_ptr[ii] + 2), *(v0_offset_ptr[ii] + 1), *v0_offset_ptr[ii]), _mm256_loadu_ps(v0_in_bound_ptr[ii])); + __m256 x1_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v1_offset_ptr[ii] + 7), *(v1_offset_ptr[ii] + 6), *(v1_offset_ptr[ii] + 5), *(v1_offset_ptr[ii] + 4), *(v1_offset_ptr[ii] + 3), *(v1_offset_ptr[ii] + 2), *(v1_offset_ptr[ii] + 1), *v1_offset_ptr[ii]), _mm256_loadu_ps(v1_in_bound_ptr[ii])); + __m256 x2_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v2_offset_ptr[ii] + 7), *(v2_offset_ptr[ii] + 6), *(v2_offset_ptr[ii] + 5), *(v2_offset_ptr[ii] + 4), *(v2_offset_ptr[ii] + 3), *(v2_offset_ptr[ii] + 2), *(v2_offset_ptr[ii] + 1), *v2_offset_ptr[ii]), _mm256_loadu_ps(v2_in_bound_ptr[ii])); + __m256 x3_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v3_offset_ptr[ii] + 7), *(v3_offset_ptr[ii] + 6), *(v3_offset_ptr[ii] + 5), *(v3_offset_ptr[ii] + 4), *(v3_offset_ptr[ii] + 3), *(v3_offset_ptr[ii] + 2), *(v3_offset_ptr[ii] + 1), *v3_offset_ptr[ii]), _mm256_loadu_ps(v3_in_bound_ptr[ii])); value_f[ii] = _mm256_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); @@ -1086,10 +1098,10 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds cubic_interp1d(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, *value_x); for (int ii = 0; ii < 4; ii++) { - float x0_val = *reinterpret_cast(v0_in_bound_ptr[ii]) < 0 ? *(srcptr + *v0_offset_ptr[ii]) : 0; - float x1_val = *reinterpret_cast(v1_in_bound_ptr[ii]) < 0 ? *(srcptr + *v1_offset_ptr[ii]) : 0; - float x2_val = *reinterpret_cast(v2_in_bound_ptr[ii]) < 0 ? *(srcptr + *v2_offset_ptr[ii]) : 0; - float x3_val = *reinterpret_cast(v3_in_bound_ptr[ii]) < 0 ? *(srcptr + *v3_offset_ptr[ii]) : 0; + float x0_val = *reinterpret_cast(v0_in_bound_ptr[ii]) < 0 ? *(srcptr + static_cast(*v0_offset_ptr[ii])) : 0; + float x1_val = *reinterpret_cast(v1_in_bound_ptr[ii]) < 0 ? *(srcptr + static_cast(*v1_offset_ptr[ii])) : 0; + float x2_val = *reinterpret_cast(v2_in_bound_ptr[ii]) < 0 ? *(srcptr + static_cast(*v2_offset_ptr[ii])) : 0; + float x3_val = *reinterpret_cast(v3_in_bound_ptr[ii]) < 0 ? 
*(srcptr + static_cast(*v3_offset_ptr[ii])) : 0; value_f[ii] = x_coeffs0 * x0_val; value_f[ii] = x_coeffs1 * x1_val + value_f[ii]; diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index b6ff9c398d0..912327270b5 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -22,17 +22,12 @@ struct gridsample_2d_bilinear_compute_blob const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#if __AVX2__ - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#endif // __AVX2__ #endif // __AVX__ - int* offset_ptr_00 = offset.channel(0); - int* offset_ptr_01 = offset.channel(1); - int* offset_ptr_10 = offset.channel(2); - int* offset_ptr_11 = offset.channel(3); + float* offset_ptr_00 = offset.channel(0); + float* offset_ptr_01 = offset.channel(1); + float* offset_ptr_10 = offset.channel(2); + float* offset_ptr_11 = offset.channel(3); float* in_bound_ptr_01 = in_bound.channel(1); float* in_bound_ptr_10 = in_bound.channel(2); @@ -77,26 +72,6 @@ struct gridsample_2d_bilinear_compute_blob __m256 x_w = _mm256_floor_ps(gx); __m256 y_n = _mm256_floor_ps(gy); -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - __m256i i_nw_offset = _mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki); - __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); - - _mm256_storeu_epi32(in_bound_ptr_01, x1_in_range); - _mm256_storeu_epi32(in_bound_ptr_10, y1_in_range); - _mm256_storeu_epi32(in_bound_ptr_11, v11_in_range); -#else __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); @@ -110,20 +85,14 @@ struct gridsample_2d_bilinear_compute_blob __m256 sw_offset = _mm256_comp_fmadd_ps(vImgWf, vElempackf, nw_offset); __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); - __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); - __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); - __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); - _mm256_storeu_ps(in_bound_ptr_01, x1_in_range); _mm256_storeu_ps(in_bound_ptr_10, y1_in_range); _mm256_storeu_ps(in_bound_ptr_11, v11_in_range); -#endif - _mm256_storeu_epi32(offset_ptr_00, i_nw_offset); - _mm256_storeu_epi32(offset_ptr_01, i_ne_offset); - _mm256_storeu_epi32(offset_ptr_10, i_sw_offset); - _mm256_storeu_epi32(offset_ptr_11, i_se_offset); + _mm256_storeu_ps(offset_ptr_00, nw_offset); + _mm256_storeu_ps(offset_ptr_01, ne_offset); + _mm256_storeu_ps(offset_ptr_10, sw_offset); + _mm256_storeu_ps(offset_ptr_11, se_offset); __m256 alpha = _mm256_sub_ps(gx, x_w); __m256 beta = 
_mm256_sub_ps(gy, y_n); @@ -224,26 +193,6 @@ struct gridsample_2d_bilinear_compute_blob __m256 x_w = _mm256_floor_ps(gx); __m256 y_n = _mm256_floor_ps(gy); -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - __m256i i_nw_offset = _mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki); - __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); - - _mm256_storeu_epi32(in_bound_ptr_01, x1_in_range); - _mm256_storeu_epi32(in_bound_ptr_10, y1_in_range); - _mm256_storeu_epi32(in_bound_ptr_11, v11_in_range); -#else __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); @@ -257,20 +206,14 @@ struct gridsample_2d_bilinear_compute_blob __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(vImgWf, vElempackf)); __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); - __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); - __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); - __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); - _mm256_storeu_ps(in_bound_ptr_01, x1_in_range); _mm256_storeu_ps(in_bound_ptr_10, y1_in_range); _mm256_storeu_ps(in_bound_ptr_11, v11_in_range); -#endif - _mm256_storeu_epi32(offset_ptr_00, i_nw_offset); - _mm256_storeu_epi32(offset_ptr_01, i_ne_offset); - _mm256_storeu_epi32(offset_ptr_10, i_sw_offset); - _mm256_storeu_epi32(offset_ptr_11, i_se_offset); + _mm256_storeu_ps(offset_ptr_00, nw_offset); + _mm256_storeu_ps(offset_ptr_01, ne_offset); + _mm256_storeu_ps(offset_ptr_10, sw_offset); + _mm256_storeu_ps(offset_ptr_11, se_offset); __m256 alpha = _mm256_sub_ps(gx, x_w); __m256 beta = _mm256_sub_ps(gy, y_n); @@ -359,17 +302,12 @@ struct gridsample_2d_bilinear_compute_blob const __m256 vImgWf = _mm256_set1_ps(src.w); const __m256 vImgHf = _mm256_set1_ps(src.h); const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#if __AVX2__ - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#endif // __AVX2__ #endif // __AVX__ - int* offset_ptr_00 = offset.channel(0); - int* offset_ptr_01 = offset.channel(1); - int* offset_ptr_10 = offset.channel(2); - int* offset_ptr_11 = offset.channel(3); + float* offset_ptr_00 = offset.channel(0); + float* offset_ptr_01 = offset.channel(1); + float* offset_ptr_10 = offset.channel(2); + float* offset_ptr_11 = offset.channel(3); float* in_bound_ptr_00 = in_bound.channel(0); float* in_bound_ptr_01 = in_bound.channel(1); @@ -411,32 +349,6 @@ struct gridsample_2d_bilinear_compute_blob __m256 x_w = _mm256_floor_ps(gx); __m256 y_n = _mm256_floor_ps(gy); -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, 
*(__m256i*)_pi32_256_1); - - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - - __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - __m256i v01_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - __m256i v10_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - __m256i i_nw_offset = _mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki); - __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); - - _mm256_storeu_ps(in_bound_ptr_00, _mm256_castsi256_ps(v00_in_range)); - _mm256_storeu_ps(in_bound_ptr_01, _mm256_castsi256_ps(v01_in_range)); - _mm256_storeu_ps(in_bound_ptr_10, _mm256_castsi256_ps(v10_in_range)); - _mm256_storeu_ps(in_bound_ptr_11, _mm256_castsi256_ps(v11_in_range)); -#else __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); @@ -455,21 +367,15 @@ struct gridsample_2d_bilinear_compute_blob __m256 sw_offset = _mm256_comp_fmadd_ps(vImgWf, vElempackf, nw_offset); __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); - __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); - __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); - __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); - _mm256_storeu_ps(in_bound_ptr_00, v00_in_range); _mm256_storeu_ps(in_bound_ptr_01, v01_in_range); _mm256_storeu_ps(in_bound_ptr_10, v10_in_range); _mm256_storeu_ps(in_bound_ptr_11, v11_in_range); -#endif // __AVX2__ - _mm256_storeu_epi32(offset_ptr_00, i_nw_offset); - _mm256_storeu_epi32(offset_ptr_01, i_ne_offset); - _mm256_storeu_epi32(offset_ptr_10, i_sw_offset); - _mm256_storeu_epi32(offset_ptr_11, i_se_offset); + _mm256_storeu_ps(offset_ptr_00, nw_offset); + _mm256_storeu_ps(offset_ptr_01, ne_offset); + _mm256_storeu_ps(offset_ptr_10, sw_offset); + _mm256_storeu_ps(offset_ptr_11, se_offset); __m256 alpha = _mm256_sub_ps(gx, x_w); __m256 beta = _mm256_sub_ps(gy, y_n); @@ -572,32 +478,6 @@ struct gridsample_2d_bilinear_compute_blob __m256 x_w = _mm256_floor_ps(gx); __m256 y_n = _mm256_floor_ps(gy); -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - - __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - 
__m256i v01_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - __m256i v10_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - __m256i i_nw_offset = _mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0), vElempacki); - __m256i i_ne_offset = _mm256_add_epi32(i_nw_offset, vElempacki); - __m256i i_sw_offset = _mm256_add_epi32(i_nw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_se_offset = _mm256_add_epi32(i_sw_offset, vElempacki); - - _mm256_storeu_ps(in_bound_ptr_00, _mm256_castsi256_ps(v00_in_range)); - _mm256_storeu_ps(in_bound_ptr_01, _mm256_castsi256_ps(v01_in_range)); - _mm256_storeu_ps(in_bound_ptr_10, _mm256_castsi256_ps(v10_in_range)); - _mm256_storeu_ps(in_bound_ptr_11, _mm256_castsi256_ps(v11_in_range)); -#else __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); @@ -616,21 +496,15 @@ struct gridsample_2d_bilinear_compute_blob __m256 sw_offset = _mm256_comp_fmadd_ps(vImgWf, vElempackf, nw_offset); __m256 se_offset = _mm256_add_ps(sw_offset, vElempackf); - __m256i i_nw_offset = _mm256_cvtps_epi32(nw_offset); - __m256i i_ne_offset = _mm256_cvtps_epi32(ne_offset); - __m256i i_sw_offset = _mm256_cvtps_epi32(sw_offset); - __m256i i_se_offset = _mm256_cvtps_epi32(se_offset); - _mm256_storeu_ps(in_bound_ptr_00, v00_in_range); _mm256_storeu_ps(in_bound_ptr_01, v01_in_range); _mm256_storeu_ps(in_bound_ptr_10, v10_in_range); _mm256_storeu_ps(in_bound_ptr_11, v11_in_range); -#endif // __AVX2__ - _mm256_storeu_epi32(offset_ptr_00, i_nw_offset); - _mm256_storeu_epi32(offset_ptr_01, i_ne_offset); - _mm256_storeu_epi32(offset_ptr_10, i_sw_offset); - _mm256_storeu_epi32(offset_ptr_11, i_se_offset); + _mm256_storeu_ps(offset_ptr_00, nw_offset); + _mm256_storeu_ps(offset_ptr_01, ne_offset); + _mm256_storeu_ps(offset_ptr_10, sw_offset); + _mm256_storeu_ps(offset_ptr_11, se_offset); __m256 alpha = _mm256_sub_ps(gx, x_w); __m256 beta = _mm256_sub_ps(gy, y_n); @@ -722,23 +596,17 @@ struct gridsample_3d_bilinear_compute_blob const __m256 vImgHf = _mm256_set1_ps(src.h); const __m256 vImgDf = _mm256_set1_ps(src.d); const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#if __AVX2__ - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vImgDi = _mm256_set1_epi32(src.d); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#endif // __AVX2__ #endif // __AVX__ - int* offset_ptr_000 = offset.channel(0); - int* offset_ptr_001 = offset.channel(1); - int* offset_ptr_010 = offset.channel(2); - int* offset_ptr_011 = offset.channel(3); - - int* offset_ptr_100 = offset.channel(4); - int* offset_ptr_101 = offset.channel(5); - int* offset_ptr_110 = offset.channel(6); - int* offset_ptr_111 = offset.channel(7); + float* offset_ptr_000 = offset.channel(0); + float* offset_ptr_001 = offset.channel(1); + float* offset_ptr_010 = offset.channel(2); + float* offset_ptr_011 = offset.channel(3); + + float* offset_ptr_100 = offset.channel(4); + float* offset_ptr_101 = offset.channel(5); + float* offset_ptr_110 = offset.channel(6); + float* offset_ptr_111 = offset.channel(7); float* in_bound_ptr_000 = in_bound.channel(0); float* in_bound_ptr_001 = in_bound.channel(1); @@ -795,47 +663,7 @@ struct gridsample_3d_bilinear_compute_blob __m256 x_w = _mm256_floor_ps(gx); __m256 y_n = _mm256_floor_ps(gy); __m256 z_t = _mm256_floor_ps(gz); -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i 
y0 = _mm256_cvtps_epi32(y_n); - __m256i z0 = _mm256_cvtps_epi32(z_t); - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); - - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); - - __m256i v011_in_range, v110_in_range, v101_in_range, v111_in_range; - { - v011_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - v101_in_range = _mm256_and_si256(x1_in_range, z1_in_range); - v110_in_range = _mm256_and_si256(y1_in_range, z1_in_range); - v111_in_range = _mm256_and_si256(v011_in_range, z1_in_range); - } - - __m256i i_tnw_offset = _mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki); - __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); - __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - - __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); - __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); - - _mm256_storeu_ps(in_bound_ptr_000, *(__m256*)_ps256_n1); - _mm256_storeu_ps(in_bound_ptr_001, _mm256_castsi256_ps(x1_in_range)); - _mm256_storeu_ps(in_bound_ptr_010, _mm256_castsi256_ps(y1_in_range)); - _mm256_storeu_ps(in_bound_ptr_011, _mm256_castsi256_ps(v011_in_range)); - - _mm256_storeu_ps(in_bound_ptr_100, _mm256_castsi256_ps(z1_in_range)); - _mm256_storeu_ps(in_bound_ptr_101, _mm256_castsi256_ps(v101_in_range)); - _mm256_storeu_ps(in_bound_ptr_110, _mm256_castsi256_ps(v110_in_range)); - _mm256_storeu_ps(in_bound_ptr_111, _mm256_castsi256_ps(v111_in_range)); -#else __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); @@ -867,16 +695,6 @@ struct gridsample_3d_bilinear_compute_blob __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); __m256 bse_offset = _mm256_add_ps(bsw_offset, vElempackf); - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); - __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); - __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); - __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); - - __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); - __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); - __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); - __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); - _mm256_storeu_ps(in_bound_ptr_000, *(__m256*)_ps256_n1); _mm256_storeu_ps(in_bound_ptr_001, x1_in_range); _mm256_storeu_ps(in_bound_ptr_010, y1_in_range); @@ -886,16 +704,16 @@ struct gridsample_3d_bilinear_compute_blob _mm256_storeu_ps(in_bound_ptr_101, v101_in_range); _mm256_storeu_ps(in_bound_ptr_110, v110_in_range); _mm256_storeu_ps(in_bound_ptr_111, v111_in_range); -#endif // __AVX2__ - 
_mm256_storeu_epi32(offset_ptr_000, i_tnw_offset); - _mm256_storeu_epi32(offset_ptr_001, i_tne_offset); - _mm256_storeu_epi32(offset_ptr_010, i_tsw_offset); - _mm256_storeu_epi32(offset_ptr_011, i_tse_offset); - _mm256_storeu_epi32(offset_ptr_100, i_bnw_offset); - _mm256_storeu_epi32(offset_ptr_101, i_bne_offset); - _mm256_storeu_epi32(offset_ptr_110, i_bsw_offset); - _mm256_storeu_epi32(offset_ptr_111, i_bse_offset); + _mm256_storeu_ps(offset_ptr_000, tnw_offset); + _mm256_storeu_ps(offset_ptr_001, tne_offset); + _mm256_storeu_ps(offset_ptr_010, tsw_offset); + _mm256_storeu_ps(offset_ptr_011, tse_offset); + + _mm256_storeu_ps(offset_ptr_100, bnw_offset); + _mm256_storeu_ps(offset_ptr_101, bne_offset); + _mm256_storeu_ps(offset_ptr_110, bsw_offset); + _mm256_storeu_ps(offset_ptr_111, bse_offset); __m256 alpha = _mm256_sub_ps(gx, x_w); __m256 beta = _mm256_sub_ps(gy, y_n); @@ -1043,50 +861,7 @@ struct gridsample_3d_bilinear_compute_blob __m256 x_w = _mm256_floor_ps(gx); __m256 y_n = _mm256_floor_ps(gy); __m256 z_t = _mm256_floor_ps(gz); -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i z0 = _mm256_cvtps_epi32(z_t); - - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); - - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - __m256i z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z0)); - __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); - - __m256i v011_in_range, v110_in_range, v101_in_range, v111_in_range; - { - v011_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - v101_in_range = _mm256_and_si256(x1_in_range, z1_in_range); - v110_in_range = _mm256_and_si256(y1_in_range, z1_in_range); - v111_in_range = _mm256_and_si256(v011_in_range, z1_in_range); - } - __m256i i_tnw_offset = _mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki); - __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); - __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - - __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); - __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); - - _mm256_storeu_ps(in_bound_ptr_000, *(__m256*)_ps256_n1); - _mm256_storeu_ps(in_bound_ptr_001, _mm256_castsi256_ps(x1_in_range)); - _mm256_storeu_ps(in_bound_ptr_010, _mm256_castsi256_ps(y1_in_range)); - _mm256_storeu_ps(in_bound_ptr_011, _mm256_castsi256_ps(v011_in_range)); - - _mm256_storeu_ps(in_bound_ptr_100, 
_mm256_castsi256_ps(z1_in_range)); - _mm256_storeu_ps(in_bound_ptr_101, _mm256_castsi256_ps(v101_in_range)); - _mm256_storeu_ps(in_bound_ptr_110, _mm256_castsi256_ps(v110_in_range)); - _mm256_storeu_ps(in_bound_ptr_111, _mm256_castsi256_ps(v111_in_range)); -#else __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); @@ -1118,16 +893,6 @@ struct gridsample_3d_bilinear_compute_blob __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); __m256 bse_offset = _mm256_add_ps(bsw_offset, vElempackf); - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); - __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); - __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); - __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); - - __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); - __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); - __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); - __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); - _mm256_storeu_ps(in_bound_ptr_000, *(__m256*)_ps256_n1); _mm256_storeu_ps(in_bound_ptr_001, x1_in_range); _mm256_storeu_ps(in_bound_ptr_010, y1_in_range); @@ -1137,16 +902,16 @@ struct gridsample_3d_bilinear_compute_blob _mm256_storeu_ps(in_bound_ptr_101, v101_in_range); _mm256_storeu_ps(in_bound_ptr_110, v110_in_range); _mm256_storeu_ps(in_bound_ptr_111, v111_in_range); -#endif // __AVX2__ - _mm256_storeu_epi32(offset_ptr_000, i_tnw_offset); - _mm256_storeu_epi32(offset_ptr_001, i_tne_offset); - _mm256_storeu_epi32(offset_ptr_010, i_tsw_offset); - _mm256_storeu_epi32(offset_ptr_011, i_tse_offset); - _mm256_storeu_epi32(offset_ptr_100, i_bnw_offset); - _mm256_storeu_epi32(offset_ptr_101, i_bne_offset); - _mm256_storeu_epi32(offset_ptr_110, i_bsw_offset); - _mm256_storeu_epi32(offset_ptr_111, i_bse_offset); + _mm256_storeu_ps(offset_ptr_000, tnw_offset); + _mm256_storeu_ps(offset_ptr_001, tne_offset); + _mm256_storeu_ps(offset_ptr_010, tsw_offset); + _mm256_storeu_ps(offset_ptr_011, tse_offset); + + _mm256_storeu_ps(offset_ptr_100, bnw_offset); + _mm256_storeu_ps(offset_ptr_101, bne_offset); + _mm256_storeu_ps(offset_ptr_110, bsw_offset); + _mm256_storeu_ps(offset_ptr_111, bse_offset); __m256 alpha = _mm256_sub_ps(gx, x_w); __m256 beta = _mm256_sub_ps(gy, y_n); @@ -1282,23 +1047,17 @@ struct gridsample_3d_bilinear_compute_blob const __m256 vImgHf = _mm256_set1_ps(src.h); const __m256 vImgDf = _mm256_set1_ps(src.d); const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#if __AVX2__ - const __m256i vImgWi = _mm256_set1_epi32(src.w); - const __m256i vImgHi = _mm256_set1_epi32(src.h); - const __m256i vImgDi = _mm256_set1_epi32(src.d); - const __m256i vElempacki = _mm256_set1_epi32(src.elempack); -#endif // __AVX2__ #endif // __AVX__ - int* offset_ptr_000 = offset.channel(0); - int* offset_ptr_001 = offset.channel(1); - int* offset_ptr_010 = offset.channel(2); - int* offset_ptr_011 = offset.channel(3); - - int* offset_ptr_100 = offset.channel(4); - int* offset_ptr_101 = offset.channel(5); - int* offset_ptr_110 = offset.channel(6); - int* offset_ptr_111 = offset.channel(7); + float* offset_ptr_000 = offset.channel(0); + float* offset_ptr_001 = offset.channel(1); + float* offset_ptr_010 = offset.channel(2); + float* offset_ptr_011 = offset.channel(3); + + float* offset_ptr_100 = offset.channel(4); + float* offset_ptr_101 = offset.channel(5); + float* offset_ptr_110 = offset.channel(6); + float* offset_ptr_111 = offset.channel(7); 
float* in_bound_ptr_000 = in_bound.channel(0); float* in_bound_ptr_001 = in_bound.channel(1); @@ -1349,60 +1108,7 @@ struct gridsample_3d_bilinear_compute_blob __m256 x_w = _mm256_floor_ps(gx); __m256 y_n = _mm256_floor_ps(gy); __m256 z_t = _mm256_floor_ps(gz); -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i z0 = _mm256_cvtps_epi32(z_t); - - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); - - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - __m256i z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z0)); - __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); - - __m256i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256i v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - __m256i v01_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - __m256i v10_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - v000_in_range = _mm256_and_si256(v00_in_range, z0_in_range); - v001_in_range = _mm256_and_si256(v01_in_range, z0_in_range); - v010_in_range = _mm256_and_si256(v10_in_range, z0_in_range); - v011_in_range = _mm256_and_si256(v11_in_range, z0_in_range); - - v100_in_range = _mm256_and_si256(v00_in_range, z1_in_range); - v101_in_range = _mm256_and_si256(v01_in_range, z1_in_range); - v110_in_range = _mm256_and_si256(v10_in_range, z1_in_range); - v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); - } - __m256i i_tnw_offset = _mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki); - __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); - __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - - __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); - __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); - - _mm256_storeu_ps(in_bound_ptr_000, _mm256_castsi256_ps(v000_in_range)); - _mm256_storeu_ps(in_bound_ptr_001, _mm256_castsi256_ps(v001_in_range)); - _mm256_storeu_ps(in_bound_ptr_010, _mm256_castsi256_ps(v010_in_range)); - _mm256_storeu_ps(in_bound_ptr_011, _mm256_castsi256_ps(v011_in_range)); - - _mm256_storeu_ps(in_bound_ptr_100, _mm256_castsi256_ps(v100_in_range)); - _mm256_storeu_ps(in_bound_ptr_101, _mm256_castsi256_ps(v101_in_range)); - _mm256_storeu_ps(in_bound_ptr_110, _mm256_castsi256_ps(v110_in_range)); - 
_mm256_storeu_ps(in_bound_ptr_111, _mm256_castsi256_ps(v111_in_range)); -#else __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); @@ -1444,16 +1150,6 @@ struct gridsample_3d_bilinear_compute_blob __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); __m256 bse_offset = _mm256_add_ps(bsw_offset, vElempackf); - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); - __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); - __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); - __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); - - __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); - __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); - __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); - __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); - _mm256_storeu_ps(in_bound_ptr_000, v000_in_range); _mm256_storeu_ps(in_bound_ptr_001, v001_in_range); _mm256_storeu_ps(in_bound_ptr_010, v010_in_range); @@ -1463,16 +1159,16 @@ struct gridsample_3d_bilinear_compute_blob _mm256_storeu_ps(in_bound_ptr_101, v101_in_range); _mm256_storeu_ps(in_bound_ptr_110, v110_in_range); _mm256_storeu_ps(in_bound_ptr_111, v111_in_range); -#endif // __AVX2__ - _mm256_storeu_epi32(offset_ptr_000, i_tnw_offset); - _mm256_storeu_epi32(offset_ptr_001, i_tne_offset); - _mm256_storeu_epi32(offset_ptr_010, i_tsw_offset); - _mm256_storeu_epi32(offset_ptr_011, i_tse_offset); - _mm256_storeu_epi32(offset_ptr_100, i_bnw_offset); - _mm256_storeu_epi32(offset_ptr_101, i_bne_offset); - _mm256_storeu_epi32(offset_ptr_110, i_bsw_offset); - _mm256_storeu_epi32(offset_ptr_111, i_bse_offset); + _mm256_storeu_ps(offset_ptr_000, tnw_offset); + _mm256_storeu_ps(offset_ptr_001, tne_offset); + _mm256_storeu_ps(offset_ptr_010, tsw_offset); + _mm256_storeu_ps(offset_ptr_011, tse_offset); + + _mm256_storeu_ps(offset_ptr_100, bnw_offset); + _mm256_storeu_ps(offset_ptr_101, bne_offset); + _mm256_storeu_ps(offset_ptr_110, bsw_offset); + _mm256_storeu_ps(offset_ptr_111, bse_offset); __m256 alpha = _mm256_sub_ps(gx, x_w); __m256 beta = _mm256_sub_ps(gy, y_n); @@ -1616,60 +1312,7 @@ struct gridsample_3d_bilinear_compute_blob __m256 x_w = _mm256_floor_ps(gx); __m256 y_n = _mm256_floor_ps(gy); __m256 z_t = _mm256_floor_ps(gz); -#if __AVX2__ - __m256i x0 = _mm256_cvtps_epi32(x_w); - __m256i y0 = _mm256_cvtps_epi32(y_n); - __m256i z0 = _mm256_cvtps_epi32(z_t); - - __m256i x1 = _mm256_add_epi32(x0, *(__m256i*)_pi32_256_1); - __m256i y1 = _mm256_add_epi32(y0, *(__m256i*)_pi32_256_1); - __m256i z1 = _mm256_add_epi32(z0, *(__m256i*)_pi32_256_1); - - __m256i x0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x0)); - __m256i x1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(x1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgWi, x1)); - __m256i y0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y0)); - __m256i y1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(y1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgHi, y1)); - __m256i z0_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z0, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z0)); - __m256i z1_in_range = _mm256_and_si256(_mm256_cmpgt_epi32(z1, *(__m256i*)_pi32_256_n1), _mm256_cmpgt_epi32(vImgDi, z1)); - - __m256i v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256i 
v00_in_range = _mm256_and_si256(x0_in_range, y0_in_range); - __m256i v01_in_range = _mm256_and_si256(x1_in_range, y0_in_range); - __m256i v10_in_range = _mm256_and_si256(x0_in_range, y1_in_range); - __m256i v11_in_range = _mm256_and_si256(x1_in_range, y1_in_range); - - v000_in_range = _mm256_and_si256(v00_in_range, z0_in_range); - v001_in_range = _mm256_and_si256(v01_in_range, z0_in_range); - v010_in_range = _mm256_and_si256(v10_in_range, z0_in_range); - v011_in_range = _mm256_and_si256(v11_in_range, z0_in_range); - - v100_in_range = _mm256_and_si256(v00_in_range, z1_in_range); - v101_in_range = _mm256_and_si256(v01_in_range, z1_in_range); - v110_in_range = _mm256_and_si256(v10_in_range, z1_in_range); - v111_in_range = _mm256_and_si256(v11_in_range, z1_in_range); - } - __m256i i_tnw_offset = _mm256_mullo_epi32(_mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), z0), _mm256_add_epi32(_mm256_mullo_epi32(y0, vImgWi), x0)), vElempacki); - __m256i i_tne_offset = _mm256_add_epi32(i_tnw_offset, vElempacki); - __m256i i_tsw_offset = _mm256_add_epi32(i_tnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_tse_offset = _mm256_add_epi32(i_tsw_offset, vElempacki); - - __m256i i_bnw_offset = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(vImgWi, vImgHi), vElempacki), i_tnw_offset); - __m256i i_bne_offset = _mm256_add_epi32(i_bnw_offset, vElempacki); - __m256i i_bsw_offset = _mm256_add_epi32(i_bnw_offset, _mm256_mullo_epi32(vImgWi, vElempacki)); - __m256i i_bse_offset = _mm256_add_epi32(i_bsw_offset, vElempacki); - - _mm256_storeu_ps(in_bound_ptr_000, _mm256_castsi256_ps(v000_in_range)); - _mm256_storeu_ps(in_bound_ptr_001, _mm256_castsi256_ps(v001_in_range)); - _mm256_storeu_ps(in_bound_ptr_010, _mm256_castsi256_ps(v010_in_range)); - _mm256_storeu_ps(in_bound_ptr_011, _mm256_castsi256_ps(v011_in_range)); - - _mm256_storeu_ps(in_bound_ptr_100, _mm256_castsi256_ps(v100_in_range)); - _mm256_storeu_ps(in_bound_ptr_101, _mm256_castsi256_ps(v101_in_range)); - _mm256_storeu_ps(in_bound_ptr_110, _mm256_castsi256_ps(v110_in_range)); - _mm256_storeu_ps(in_bound_ptr_111, _mm256_castsi256_ps(v111_in_range)); -#else __m256 x1 = _mm256_add_ps(x_w, *(__m256*)_ps256_1); __m256 y1 = _mm256_add_ps(y_n, *(__m256*)_ps256_1); __m256 z1 = _mm256_add_ps(z_t, *(__m256*)_ps256_1); @@ -1711,16 +1354,6 @@ struct gridsample_3d_bilinear_compute_blob __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(vImgWf, vElempackf)); __m256 bse_offset = _mm256_add_ps(bsw_offset, vElempackf); - __m256i i_tnw_offset = _mm256_cvtps_epi32(tnw_offset); - __m256i i_tne_offset = _mm256_cvtps_epi32(tne_offset); - __m256i i_tsw_offset = _mm256_cvtps_epi32(tsw_offset); - __m256i i_tse_offset = _mm256_cvtps_epi32(tse_offset); - - __m256i i_bnw_offset = _mm256_cvtps_epi32(bnw_offset); - __m256i i_bne_offset = _mm256_cvtps_epi32(bne_offset); - __m256i i_bsw_offset = _mm256_cvtps_epi32(bsw_offset); - __m256i i_bse_offset = _mm256_cvtps_epi32(bse_offset); - _mm256_storeu_ps(in_bound_ptr_000, v000_in_range); _mm256_storeu_ps(in_bound_ptr_001, v001_in_range); _mm256_storeu_ps(in_bound_ptr_010, v010_in_range); @@ -1730,16 +1363,16 @@ struct gridsample_3d_bilinear_compute_blob _mm256_storeu_ps(in_bound_ptr_101, v101_in_range); _mm256_storeu_ps(in_bound_ptr_110, v110_in_range); _mm256_storeu_ps(in_bound_ptr_111, v111_in_range); -#endif // __AVX2__ - _mm256_storeu_epi32(offset_ptr_000, i_tnw_offset); - _mm256_storeu_epi32(offset_ptr_001, i_tne_offset); - _mm256_storeu_epi32(offset_ptr_010, i_tsw_offset); - 
_mm256_storeu_epi32(offset_ptr_011, i_tse_offset); - _mm256_storeu_epi32(offset_ptr_100, i_bnw_offset); - _mm256_storeu_epi32(offset_ptr_101, i_bne_offset); - _mm256_storeu_epi32(offset_ptr_110, i_bsw_offset); - _mm256_storeu_epi32(offset_ptr_111, i_bse_offset); + _mm256_storeu_ps(offset_ptr_000, tnw_offset); + _mm256_storeu_ps(offset_ptr_001, tne_offset); + _mm256_storeu_ps(offset_ptr_010, tsw_offset); + _mm256_storeu_ps(offset_ptr_011, tse_offset); + + _mm256_storeu_ps(offset_ptr_100, bnw_offset); + _mm256_storeu_ps(offset_ptr_101, bne_offset); + _mm256_storeu_ps(offset_ptr_110, bsw_offset); + _mm256_storeu_ps(offset_ptr_111, bse_offset); __m256 alpha = _mm256_sub_ps(gx, x_w); __m256 beta = _mm256_sub_ps(gy, y_n); @@ -1881,10 +1514,10 @@ static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const int* offset_ptr_00 = offset.channel(0); - const int* offset_ptr_01 = offset.channel(1); - const int* offset_ptr_10 = offset.channel(2); - const int* offset_ptr_11 = offset.channel(3); + const float* offset_ptr_00 = offset.channel(0); + const float* offset_ptr_01 = offset.channel(1); + const float* offset_ptr_10 = offset.channel(2); + const float* offset_ptr_11 = offset.channel(3); const float* in_bound_ptr_00 = in_bound.channel(0); const float* in_bound_ptr_01 = in_bound.channel(1); @@ -1951,14 +1584,14 @@ static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const int* offset_ptr_000 = offset.channel(0); - const int* offset_ptr_001 = offset.channel(1); - const int* offset_ptr_010 = offset.channel(2); - const int* offset_ptr_011 = offset.channel(3); - const int* offset_ptr_100 = offset.channel(4); - const int* offset_ptr_101 = offset.channel(5); - const int* offset_ptr_110 = offset.channel(6); - const int* offset_ptr_111 = offset.channel(7); + const float* offset_ptr_000 = offset.channel(0); + const float* offset_ptr_001 = offset.channel(1); + const float* offset_ptr_010 = offset.channel(2); + const float* offset_ptr_011 = offset.channel(3); + const float* offset_ptr_100 = offset.channel(4); + const float* offset_ptr_101 = offset.channel(5); + const float* offset_ptr_110 = offset.channel(6); + const float* offset_ptr_111 = offset.channel(7); const float* in_bound_ptr_000 = in_bound.channel(0); const float* in_bound_ptr_001 = in_bound.channel(1); @@ -2051,10 +1684,10 @@ static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const int* offset_ptr_00 = offset.channel(0); - const int* offset_ptr_01 = offset.channel(1); - const int* offset_ptr_10 = offset.channel(2); - const int* offset_ptr_11 = offset.channel(3); + const float* offset_ptr_00 = offset.channel(0); + const float* offset_ptr_01 = offset.channel(1); + const float* offset_ptr_10 = offset.channel(2); + const float* offset_ptr_11 = offset.channel(3); const float* in_bound_ptr_00 = in_bound.channel(0); const float* in_bound_ptr_01 = in_bound.channel(1); @@ -2066,10 +1699,17 @@ static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d for (int i = 0; i < grid_size; i++) { +#if __AVX2__ __m256i v00_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_00), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); __m256i v01_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_01), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); __m256i v10_offset 
= _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_10), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); __m256i v11_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_11), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#else + __m256i v00_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_00), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v01_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_01), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v10_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_10), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v11_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_11), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); +#endif // __AVX2__ __m256 v00_in_range = _mm256_set1_ps(*in_bound_ptr_00); __m256 v01_in_range = _mm256_set1_ps(*in_bound_ptr_01); @@ -2121,14 +1761,14 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const int* offset_ptr_000 = offset.channel(0); - const int* offset_ptr_001 = offset.channel(1); - const int* offset_ptr_010 = offset.channel(2); - const int* offset_ptr_011 = offset.channel(3); - const int* offset_ptr_100 = offset.channel(4); - const int* offset_ptr_101 = offset.channel(5); - const int* offset_ptr_110 = offset.channel(6); - const int* offset_ptr_111 = offset.channel(7); + const float* offset_ptr_000 = offset.channel(0); + const float* offset_ptr_001 = offset.channel(1); + const float* offset_ptr_010 = offset.channel(2); + const float* offset_ptr_011 = offset.channel(3); + const float* offset_ptr_100 = offset.channel(4); + const float* offset_ptr_101 = offset.channel(5); + const float* offset_ptr_110 = offset.channel(6); + const float* offset_ptr_111 = offset.channel(7); const float* in_bound_ptr_000 = in_bound.channel(0); const float* in_bound_ptr_001 = in_bound.channel(1); @@ -2145,6 +1785,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d for (int i = 0; i < grid_size; i++) { +#if __AVX2__ __m256i v000_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_000), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); __m256i v001_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_001), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); __m256i v010_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_010), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); @@ -2153,6 +1794,16 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d __m256i v101_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_101), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); __m256i v110_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_110), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); __m256i v111_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_111), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#else + __m256i v000_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_000), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v001_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_001), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v010_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_010), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v011_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_011), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v100_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_100), _mm256_set_ps(7, 
6, 5, 4, 3, 2, 1, 0))); + __m256i v101_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_101), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v110_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_110), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v111_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_111), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); +#endif // __AVX2__ __m256 v000_in_range = _mm256_set1_ps(*in_bound_ptr_000); __m256 v001_in_range = _mm256_set1_ps(*in_bound_ptr_001); @@ -2229,10 +1880,10 @@ static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const int* offset_ptr_00 = offset.channel(0); - const int* offset_ptr_01 = offset.channel(1); - const int* offset_ptr_10 = offset.channel(2); - const int* offset_ptr_11 = offset.channel(3); + const float* offset_ptr_00 = offset.channel(0); + const float* offset_ptr_01 = offset.channel(1); + const float* offset_ptr_10 = offset.channel(2); + const float* offset_ptr_11 = offset.channel(3); const float* in_bound_ptr_00 = in_bound.channel(0); const float* in_bound_ptr_01 = in_bound.channel(1); @@ -2408,10 +2059,10 @@ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const int* offset_ptr_00 = offset.channel(0); - const int* offset_ptr_01 = offset.channel(1); - const int* offset_ptr_10 = offset.channel(2); - const int* offset_ptr_11 = offset.channel(3); + const float* offset_ptr_00 = offset.channel(0); + const float* offset_ptr_01 = offset.channel(1); + const float* offset_ptr_10 = offset.channel(2); + const float* offset_ptr_11 = offset.channel(3); const float* in_bound_ptr_00 = in_bound.channel(0); const float* in_bound_ptr_01 = in_bound.channel(1); @@ -2427,10 +2078,10 @@ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d for (int i = 0; i + 7 < grid_size; i += 8) { - __m256i v00_offset = _mm256_loadu_epi32(offset_ptr_00); - __m256i v01_offset = _mm256_loadu_epi32(offset_ptr_01); - __m256i v10_offset = _mm256_loadu_epi32(offset_ptr_10); - __m256i v11_offset = _mm256_loadu_epi32(offset_ptr_11); + __m256i v00_offset = _mm256_set_epi32(*(offset_ptr_00 + 7), *(offset_ptr_00 + 6), *(offset_ptr_00 + 5), *(offset_ptr_00 + 4), *(offset_ptr_00 + 3), *(offset_ptr_00 + 2), *(offset_ptr_00 + 1), *offset_ptr_00); + __m256i v01_offset = _mm256_set_epi32(*(offset_ptr_01 + 7), *(offset_ptr_01 + 6), *(offset_ptr_01 + 5), *(offset_ptr_01 + 4), *(offset_ptr_01 + 3), *(offset_ptr_01 + 2), *(offset_ptr_01 + 1), *offset_ptr_01); + __m256i v10_offset = _mm256_set_epi32(*(offset_ptr_10 + 7), *(offset_ptr_10 + 6), *(offset_ptr_10 + 5), *(offset_ptr_10 + 4), *(offset_ptr_10 + 3), *(offset_ptr_10 + 2), *(offset_ptr_10 + 1), *offset_ptr_10); + __m256i v11_offset = _mm256_set_epi32(*(offset_ptr_11 + 7), *(offset_ptr_11 + 6), *(offset_ptr_11 + 5), *(offset_ptr_11 + 4), *(offset_ptr_11 + 3), *(offset_ptr_11 + 2), *(offset_ptr_11 + 1), *offset_ptr_11); __m256 v00_in_range = _mm256_loadu_ps(in_bound_ptr_00); __m256 v01_in_range = _mm256_loadu_ps(in_bound_ptr_01); @@ -2513,10 +2164,10 @@ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d #endif // __SSE2__ for (int i = grid_size - nn; i < grid_size; i++) { - float v00 = *in_bound_ptr_00 < 0 ? *(srcptr + *offset_ptr_00) : 0; - float v01 = *in_bound_ptr_01 < 0 ? 
*(srcptr + *offset_ptr_01) : 0; - float v10 = *in_bound_ptr_10 < 0 ? *(srcptr + *offset_ptr_10) : 0; - float v11 = *in_bound_ptr_11 < 0 ? *(srcptr + *offset_ptr_11) : 0; + float v00 = *in_bound_ptr_00 < 0 ? *(srcptr + static_cast(*offset_ptr_00)) : 0; + float v01 = *in_bound_ptr_01 < 0 ? *(srcptr + static_cast(*offset_ptr_01)) : 0; + float v10 = *in_bound_ptr_10 < 0 ? *(srcptr + static_cast(*offset_ptr_10)) : 0; + float v11 = *in_bound_ptr_11 < 0 ? *(srcptr + static_cast(*offset_ptr_11)) : 0; float v0 = v00 * (1 - *value_ptr_alpha) + v01 * *value_ptr_alpha; float v1 = v10 * (1 - *value_ptr_alpha) + v11 * *value_ptr_alpha; @@ -2553,14 +2204,14 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const int* offset_ptr_000 = offset.channel(0); - const int* offset_ptr_001 = offset.channel(1); - const int* offset_ptr_010 = offset.channel(2); - const int* offset_ptr_011 = offset.channel(3); - const int* offset_ptr_100 = offset.channel(4); - const int* offset_ptr_101 = offset.channel(5); - const int* offset_ptr_110 = offset.channel(6); - const int* offset_ptr_111 = offset.channel(7); + const float* offset_ptr_000 = offset.channel(0); + const float* offset_ptr_001 = offset.channel(1); + const float* offset_ptr_010 = offset.channel(2); + const float* offset_ptr_011 = offset.channel(3); + const float* offset_ptr_100 = offset.channel(4); + const float* offset_ptr_101 = offset.channel(5); + const float* offset_ptr_110 = offset.channel(6); + const float* offset_ptr_111 = offset.channel(7); const float* in_bound_ptr_000 = in_bound.channel(0); const float* in_bound_ptr_001 = in_bound.channel(1); @@ -2580,14 +2231,14 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d #if __AVX__ for (int i = 0; i + 7 < grid_size; i += 8) { - __m256i v000_offset = _mm256_loadu_epi32(offset_ptr_000); - __m256i v001_offset = _mm256_loadu_epi32(offset_ptr_001); - __m256i v010_offset = _mm256_loadu_epi32(offset_ptr_010); - __m256i v011_offset = _mm256_loadu_epi32(offset_ptr_011); - __m256i v100_offset = _mm256_loadu_epi32(offset_ptr_100); - __m256i v101_offset = _mm256_loadu_epi32(offset_ptr_101); - __m256i v110_offset = _mm256_loadu_epi32(offset_ptr_110); - __m256i v111_offset = _mm256_loadu_epi32(offset_ptr_111); + __m256i v000_offset = _mm256_set_epi32(*(offset_ptr_000 + 7), *(offset_ptr_000 + 6), *(offset_ptr_000 + 5), *(offset_ptr_000 + 4), *(offset_ptr_000 + 3), *(offset_ptr_000 + 2), *(offset_ptr_000 + 1), *offset_ptr_000); + __m256i v001_offset = _mm256_set_epi32(*(offset_ptr_001 + 7), *(offset_ptr_001 + 6), *(offset_ptr_001 + 5), *(offset_ptr_001 + 4), *(offset_ptr_001 + 3), *(offset_ptr_001 + 2), *(offset_ptr_001 + 1), *offset_ptr_001); + __m256i v010_offset = _mm256_set_epi32(*(offset_ptr_010 + 7), *(offset_ptr_010 + 6), *(offset_ptr_010 + 5), *(offset_ptr_010 + 4), *(offset_ptr_010 + 3), *(offset_ptr_010 + 2), *(offset_ptr_010 + 1), *offset_ptr_010); + __m256i v011_offset = _mm256_set_epi32(*(offset_ptr_011 + 7), *(offset_ptr_011 + 6), *(offset_ptr_011 + 5), *(offset_ptr_011 + 4), *(offset_ptr_011 + 3), *(offset_ptr_011 + 2), *(offset_ptr_011 + 1), *offset_ptr_011); + __m256i v100_offset = _mm256_set_epi32(*(offset_ptr_100 + 7), *(offset_ptr_100 + 6), *(offset_ptr_100 + 5), *(offset_ptr_100 + 4), *(offset_ptr_100 + 3), *(offset_ptr_100 + 2), *(offset_ptr_100 + 1), *offset_ptr_100); + __m256i v101_offset = _mm256_set_epi32(*(offset_ptr_101 + 7), *(offset_ptr_101 + 6), 
*(offset_ptr_101 + 5), *(offset_ptr_101 + 4), *(offset_ptr_101 + 3), *(offset_ptr_101 + 2), *(offset_ptr_101 + 1), *offset_ptr_101); + __m256i v110_offset = _mm256_set_epi32(*(offset_ptr_110 + 7), *(offset_ptr_110 + 6), *(offset_ptr_110 + 5), *(offset_ptr_110 + 4), *(offset_ptr_110 + 3), *(offset_ptr_110 + 2), *(offset_ptr_110 + 1), *offset_ptr_110); + __m256i v111_offset = _mm256_set_epi32(*(offset_ptr_111 + 7), *(offset_ptr_111 + 6), *(offset_ptr_111 + 5), *(offset_ptr_111 + 4), *(offset_ptr_111 + 3), *(offset_ptr_111 + 2), *(offset_ptr_111 + 1), *offset_ptr_111); __m256 v000_in_range = _mm256_loadu_ps(in_bound_ptr_000); __m256 v001_in_range = _mm256_loadu_ps(in_bound_ptr_001); @@ -2725,15 +2376,15 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d #endif // __SSE2__ for (int i = grid_size - nn; i < grid_size; i++) { - float v000 = *reinterpret_cast(in_bound_ptr_000) < 0 ? *(srcptr + *offset_ptr_000) : 0; - float v001 = *reinterpret_cast(in_bound_ptr_001) < 0 ? *(srcptr + *offset_ptr_001) : 0; - float v010 = *reinterpret_cast(in_bound_ptr_010) < 0 ? *(srcptr + *offset_ptr_010) : 0; - float v011 = *reinterpret_cast(in_bound_ptr_011) < 0 ? *(srcptr + *offset_ptr_011) : 0; - - float v100 = *reinterpret_cast(in_bound_ptr_100) < 0 ? *(srcptr + *offset_ptr_100) : 0; - float v101 = *reinterpret_cast(in_bound_ptr_101) < 0 ? *(srcptr + *offset_ptr_101) : 0; - float v110 = *reinterpret_cast(in_bound_ptr_110) < 0 ? *(srcptr + *offset_ptr_110) : 0; - float v111 = *reinterpret_cast(in_bound_ptr_111) < 0 ? *(srcptr + *offset_ptr_111) : 0; + float v000 = *reinterpret_cast(in_bound_ptr_000) < 0 ? *(srcptr + static_cast(*offset_ptr_000)) : 0; + float v001 = *reinterpret_cast(in_bound_ptr_001) < 0 ? *(srcptr + static_cast(*offset_ptr_001)) : 0; + float v010 = *reinterpret_cast(in_bound_ptr_010) < 0 ? *(srcptr + static_cast(*offset_ptr_010)) : 0; + float v011 = *reinterpret_cast(in_bound_ptr_011) < 0 ? *(srcptr + static_cast(*offset_ptr_011)) : 0; + + float v100 = *reinterpret_cast(in_bound_ptr_100) < 0 ? *(srcptr + static_cast(*offset_ptr_100)) : 0; + float v101 = *reinterpret_cast(in_bound_ptr_101) < 0 ? *(srcptr + static_cast(*offset_ptr_101)) : 0; + float v110 = *reinterpret_cast(in_bound_ptr_110) < 0 ? *(srcptr + static_cast(*offset_ptr_110)) : 0; + float v111 = *reinterpret_cast(in_bound_ptr_111) < 0 ? 
*(srcptr + static_cast(*offset_ptr_111)) : 0; float v00 = v000 * (1 - *value_ptr_alpha) + v001 * *value_ptr_alpha; float v01 = v010 * (1 - *value_ptr_alpha) + v011 * *value_ptr_alpha; diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index c03ad15151e..ae82054f110 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -24,7 +24,7 @@ struct gridsample_2d_nearest_compute_blob const __m256 vElempackf = _mm256_set1_ps(src.elempack); #endif // __AVX__ - int* offset_ptr = offset.channel(0); + float* offset_ptr = offset.channel(0); grid_sample_unormalize unormalize; compute_coord get_coord; @@ -61,9 +61,8 @@ struct gridsample_2d_nearest_compute_blob gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(gy, vImgWf, gx), vElempackf); - __m256i i_offset = _mm256_cvtps_epi32(offset); - _mm256_storeu_epi32(offset_ptr, i_offset); + _mm256_storeu_ps(offset_ptr, offset); gridptr += 16; @@ -120,9 +119,8 @@ struct gridsample_2d_nearest_compute_blob gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(gy, vImgWf, gx), vElempackf); - __m256i i_offset = _mm256_cvtps_epi32(offset); - _mm256_storeu_epi32(offset_ptr, i_offset); + _mm256_storeu_ps(offset_ptr, offset); gridptr_x += 8; gridptr_y += 8; @@ -170,7 +168,7 @@ struct gridsample_2d_nearest_compute_blob const __m256 vElempackf = _mm256_set1_ps(src.elempack); #endif // __AVX__ - int* offset_ptr = offset.channel(0); + float* offset_ptr = offset.channel(0); float* in_bound_ptr = in_bound.channel(0); @@ -208,10 +206,9 @@ struct gridsample_2d_nearest_compute_blob _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(gy, vImgWf, gx), vElempackf); - __m256i i_offset = _mm256_cvtps_epi32(offset); _mm256_storeu_ps(in_bound_ptr, v_in_range); - _mm256_storeu_epi32(offset_ptr, i_offset); + _mm256_storeu_ps(offset_ptr, offset); gridptr += 16; offset_ptr += 8; @@ -267,10 +264,9 @@ struct gridsample_2d_nearest_compute_blob _mm256_and_ps(_mm256_cmp_ps(gy, *(__m256*)_ps256_n1, _CMP_GT_OS), _mm256_cmp_ps(vImgHf, gy, _CMP_GT_OS))); __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(gy, vImgWf, gx), vElempackf); - __m256i i_offset = _mm256_cvtps_epi32(offset); _mm256_storeu_ps(in_bound_ptr, v_in_range); - _mm256_storeu_epi32(offset_ptr, i_offset); + _mm256_storeu_ps(offset_ptr, offset); gridptr_x += 8; gridptr_y += 8; @@ -320,7 +316,7 @@ struct gridsample_3d_nearest_compute_blob const __m256 vElempackf = _mm256_set1_ps(src.elempack); #endif // __AVX__ - int* offset_ptr = offset.channel(0); + float* offset_ptr = offset.channel(0); grid_sample_unormalize unormalize; compute_coord get_coord; @@ -366,11 +362,9 @@ struct gridsample_3d_nearest_compute_blob gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), gz, - _mm256_comp_fmadd_ps(gy, vImgWf, gx)), - vElempackf); - __m256i i_offset = _mm256_cvtps_epi32(offset); + _mm256_comp_fmadd_ps(gy, vImgWf, gx)),vElempackf); - _mm256_storeu_epi32(offset_ptr, i_offset); + _mm256_storeu_ps(offset_ptr, offset); gridptr += 24; @@ -440,9 +434,8 @@ struct gridsample_3d_nearest_compute_blob __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), gz, _mm256_comp_fmadd_ps(gy, vImgWf, gx)), 
vElempackf); - __m256i i_offset = _mm256_cvtps_epi32(offset); - _mm256_storeu_epi32(offset_ptr, i_offset); + _mm256_storeu_ps(offset_ptr, offset); gridptr_x += 8; gridptr_y += 8; @@ -498,7 +491,7 @@ struct gridsample_3d_nearest_compute_blob const __m256 vElempackf = _mm256_set1_ps(src.elempack); #endif // __AVX__ - int* offset_ptr = offset.channel(0); + float* offset_ptr = offset.channel(0); float* in_bound_ptr = in_bound.channel(0); @@ -546,10 +539,9 @@ struct gridsample_3d_nearest_compute_blob __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), gz, _mm256_comp_fmadd_ps(gy, vImgWf, gx)), vElempackf); - __m256i i_offset = _mm256_cvtps_epi32(offset); _mm256_storeu_ps(in_bound_ptr, v_in_range); - _mm256_storeu_epi32(offset_ptr, i_offset); + _mm256_storeu_ps(offset_ptr, offset); gridptr += 24; offset_ptr += 8; @@ -614,10 +606,9 @@ struct gridsample_3d_nearest_compute_blob __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), gz, _mm256_comp_fmadd_ps(gy, vImgWf, gx)), vElempackf); - __m256i i_offset = _mm256_cvtps_epi32(offset); _mm256_storeu_ps(in_bound_ptr, v_in_range); - _mm256_storeu_epi32(offset_ptr, i_offset); + _mm256_storeu_ps(offset_ptr, offset); gridptr_x += 8; gridptr_y += 8; @@ -677,7 +668,7 @@ static void gridsample_nearest_apply_interpolation_p16(const Mat& src, Mat& dst, const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const int* offset_ptr = offset.channel(0); + const float* offset_ptr = offset.channel(0); const float* in_bound_ptr = in_bound.channel(0); @@ -708,13 +699,18 @@ static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const int* offset_ptr = offset.channel(0); + const float* offset_ptr = offset.channel(0); const float* in_bound_ptr = in_bound.channel(0); for (int i = 0; i < grid_size; i++) { - __m256 _v = mask_gather_ps256(srcptr, _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)), _mm256_set1_ps(*in_bound_ptr)); +#if __AVX2__ + __m256i _offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#else + __m256i _offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); +#endif // __AVX2__ + __m256 _v = mask_gather_ps256(srcptr, _offset, _mm256_set1_ps(*in_bound_ptr)); _mm256_storeu_ps(dstptr, _v); @@ -739,7 +735,7 @@ static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const int* offset_ptr = offset.channel(0); + const float* offset_ptr = offset.channel(0); const float* in_bound_ptr = in_bound.channel(0); @@ -772,7 +768,7 @@ static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const int* offset_ptr = offset.channel(0); + const float* offset_ptr = offset.channel(0); const float* in_bound_ptr = in_bound.channel(0); @@ -781,7 +777,7 @@ static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, #if __AVX__ for (int i = 0; i + 7 < grid_size; i += 8) { - __m256 _v = mask_gather_ps256(srcptr, _mm256_loadu_epi32(offset_ptr), _mm256_loadu_ps(in_bound_ptr)); + __m256 _v = mask_gather_ps256(srcptr, _mm256_set_epi32(*(offset_ptr + 7), *(offset_ptr + 6), *(offset_ptr + 5), *(offset_ptr + 4), *(offset_ptr + 3), *(offset_ptr + 2), *(offset_ptr + 1), 
*offset_ptr), _mm256_loadu_ps(in_bound_ptr)); _mm256_storeu_ps(dstptr, _v); @@ -805,7 +801,7 @@ static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, #endif // __SSE2__ for (int i = grid_size - nn; i < grid_size; i++) { - *dstptr = *reinterpret_cast(in_bound_ptr) < 0 ? *(srcptr + *offset_ptr) : 0; + *dstptr = *reinterpret_cast(in_bound_ptr) < 0 ? *(srcptr + static_cast(*offset_ptr)) : 0; in_bound_ptr++; offset_ptr++; diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index f57fe8550a5..b9aa50d77d2 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -241,7 +241,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector Date: Fri, 24 Feb 2023 11:25:18 +0000 Subject: [PATCH 072/127] apply code-format changes --- src/layer/x86/gridsample_bilinear_compute_blob.h | 6 +++--- src/layer/x86/gridsample_nearest_compute_blob.h | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 912327270b5..88d508841e1 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -602,7 +602,7 @@ struct gridsample_3d_bilinear_compute_blob float* offset_ptr_001 = offset.channel(1); float* offset_ptr_010 = offset.channel(2); float* offset_ptr_011 = offset.channel(3); - + float* offset_ptr_100 = offset.channel(4); float* offset_ptr_101 = offset.channel(5); float* offset_ptr_110 = offset.channel(6); @@ -1053,7 +1053,7 @@ struct gridsample_3d_bilinear_compute_blob float* offset_ptr_001 = offset.channel(1); float* offset_ptr_010 = offset.channel(2); float* offset_ptr_011 = offset.channel(3); - + float* offset_ptr_100 = offset.channel(4); float* offset_ptr_101 = offset.channel(5); float* offset_ptr_110 = offset.channel(6); @@ -2380,7 +2380,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d float v001 = *reinterpret_cast(in_bound_ptr_001) < 0 ? *(srcptr + static_cast(*offset_ptr_001)) : 0; float v010 = *reinterpret_cast(in_bound_ptr_010) < 0 ? *(srcptr + static_cast(*offset_ptr_010)) : 0; float v011 = *reinterpret_cast(in_bound_ptr_011) < 0 ? *(srcptr + static_cast(*offset_ptr_011)) : 0; - + float v100 = *reinterpret_cast(in_bound_ptr_100) < 0 ? *(srcptr + static_cast(*offset_ptr_100)) : 0; float v101 = *reinterpret_cast(in_bound_ptr_101) < 0 ? *(srcptr + static_cast(*offset_ptr_101)) : 0; float v110 = *reinterpret_cast(in_bound_ptr_110) < 0 ? 
*(srcptr + static_cast(*offset_ptr_110)) : 0; diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index ae82054f110..097270c4c2b 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -362,7 +362,8 @@ struct gridsample_3d_nearest_compute_blob gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(vImgWf, vImgHf), gz, - _mm256_comp_fmadd_ps(gy, vImgWf, gx)),vElempackf); + _mm256_comp_fmadd_ps(gy, vImgWf, gx)), + vElempackf); _mm256_storeu_ps(offset_ptr, offset); From 765ca733bc110865ddd633bef1aa418675d36252 Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Fri, 24 Feb 2023 19:35:09 +0800 Subject: [PATCH 073/127] delete test code --- src/layer/x86/gridsample_x86.cpp | 2 +- tests/test_gridsample.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index b9aa50d77d2..f57fe8550a5 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -241,7 +241,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector Date: Sat, 25 Feb 2023 02:08:24 +0800 Subject: [PATCH 074/127] fix sse2 'Access violation' --- src/layer/x86/gridsample_bilinear_compute_blob.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 88d508841e1..17490785d29 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -1950,14 +1950,14 @@ static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const int* offset_ptr_000 = offset.channel(0); - const int* offset_ptr_001 = offset.channel(1); - const int* offset_ptr_010 = offset.channel(2); - const int* offset_ptr_011 = offset.channel(3); - const int* offset_ptr_100 = offset.channel(4); - const int* offset_ptr_101 = offset.channel(5); - const int* offset_ptr_110 = offset.channel(6); - const int* offset_ptr_111 = offset.channel(7); + const float* offset_ptr_000 = offset.channel(0); + const float* offset_ptr_001 = offset.channel(1); + const float* offset_ptr_010 = offset.channel(2); + const float* offset_ptr_011 = offset.channel(3); + const float* offset_ptr_100 = offset.channel(4); + const float* offset_ptr_101 = offset.channel(5); + const float* offset_ptr_110 = offset.channel(6); + const float* offset_ptr_111 = offset.channel(7); const float* in_bound_ptr_000 = in_bound.channel(0); const float* in_bound_ptr_001 = in_bound.channel(1); From d2877cd5c974d87c8aec4ac1d21b71aca7333008 Mon Sep 17 00:00:00 2001 From: Yoh Date: Mon, 27 Feb 2023 07:36:55 +0800 Subject: [PATCH 075/127] move permute_gridsample_fusion into pass_ncnn/F_grid_sample.cpp --- src/layer/x86/unaryop_x86.cpp | 9 ++ tools/pnnx/src/CMakeLists.txt | 1 - tools/pnnx/src/pass_ncnn.cpp | 2 - tools/pnnx/src/pass_ncnn/F_grid_sample.cpp | 92 ++++++++++++- .../src/pass_ncnn/fuse_permute_gridsample.cpp | 125 ------------------ .../src/pass_ncnn/fuse_permute_gridsample.h | 25 ---- tools/pnnx/src/pass_ncnn/torch_permute.cpp | 3 - tools/pnnx/tests/ncnn/test_F_grid_sample.py | 82 +++++++----- .../ncnn/test_ncnn_fuse_permute_gridsample.py | 75 ----------- .../test_pnnx_fuse_permute_gridsample.py | 75 ----------- 
10 files changed, 146 insertions(+), 343 deletions(-) delete mode 100644 tools/pnnx/src/pass_ncnn/fuse_permute_gridsample.cpp delete mode 100644 tools/pnnx/src/pass_ncnn/fuse_permute_gridsample.h delete mode 100644 tools/pnnx/tests/ncnn/test_ncnn_fuse_permute_gridsample.py delete mode 100644 tools/pnnx/tests/test_pnnx_fuse_permute_gridsample.py diff --git a/src/layer/x86/unaryop_x86.cpp b/src/layer/x86/unaryop_x86.cpp index e8dbeee5ef5..6bbb1c76085 100644 --- a/src/layer/x86/unaryop_x86.cpp +++ b/src/layer/x86/unaryop_x86.cpp @@ -158,6 +158,10 @@ struct unary_op_floor #if __SSE2__ __m128 func_pack4(const __m128& x) const { +#if __SSE4_1__ + return _mm_floor_ps(x); +#endif // __SSE4_1__ + // Use negative zero as the sign bit mask. const __m128 magic_negative_zero = _mm_set_ps1(-0.0f); @@ -216,6 +220,11 @@ struct unary_op_ceil #if __SSE2__ __m128 func_pack4(const __m128& x) const { +#if __SSE4_1__ + return _mm_ceil_ps(x); +#endif // __SSE4_1__ + + // Use negative zero as the sign bit mask. const __m128 magic_negative_zero = _mm_set_ps1(-0.0f); diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index 531af9e73ec..eaff15f1f34 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -368,7 +368,6 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/fuse_innerproduct_activation.cpp pass_ncnn/fuse_transpose_matmul.cpp pass_ncnn/fuse_binaryop_eltwise.cpp - pass_ncnn/fuse_permute_gridsample.cpp pass_ncnn/insert_reshape_numpy_binaryop_broadcast.cpp pass_ncnn/insert_reshape_linear.cpp pass_ncnn/insert_reshape_pooling.cpp diff --git a/tools/pnnx/src/pass_ncnn.cpp b/tools/pnnx/src/pass_ncnn.cpp index ebd548d7928..5daac8f4f94 100644 --- a/tools/pnnx/src/pass_ncnn.cpp +++ b/tools/pnnx/src/pass_ncnn.cpp @@ -43,7 +43,6 @@ #include "pass_ncnn/fuse_innerproduct_activation.h" #include "pass_ncnn/fuse_transpose_matmul.h" #include "pass_ncnn/fuse_binaryop_eltwise.h" -#include "pass_ncnn/fuse_permute_gridsample.h" #include "pass_ncnn/insert_reshape_numpy_binaryop_broadcast.h" #include "pass_ncnn/insert_reshape_linear.h" #include "pass_ncnn/insert_reshape_pooling.h" @@ -123,7 +122,6 @@ void pass_ncnn(Graph& g) ncnn::fuse_deconvolution_activation(g); ncnn::fuse_deconvolutiondepthwise_activation(g); ncnn::fuse_innerproduct_activation(g); - ncnn::fuse_permute_gridsample(g); ncnn::eliminate_tail_reshape_permute(g); dead_code_elimination(g); diff --git a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp index 7b9872b1842..a91107041ac 100644 --- a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp +++ b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp @@ -1,6 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. 
You may obtain a copy of the License at @@ -66,6 +66,96 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_grid_sample, 20) +class F_grid_sample_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input_a 0 1 a +pnnx.Input input_b 0 1 b +torch.permute op_0 1 1 b b1 dims=%dims +F.grid_sample op_1 2 1 a b1 out mode=%mode padding_mode=%padding_mode align_corners=%align_corners +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "GridSample"; + } + + const char* name_str() const + { + return "permutegridsample"; + } + + void write(Operator* op, const std::map& captured_params) const + { + const std::string& mode = captured_params.at("mode").s; + if (mode == "bilinear") + op->params["0"] = 1; + if (mode == "nearest") + op->params["0"] = 2; + if (mode == "bicubic") + op->params["0"] = 3; + + const std::string& padding_mode = captured_params.at("padding_mode").s; + if (padding_mode == "zeros") + op->params["1"] = 1; + if (padding_mode == "border") + op->params["1"] = 2; + if (padding_mode == "reflection") + op->params["1"] = 3; + + op->params["2"] = captured_params.at("align_corners").b ? 1 : 0; + + const int batch_index = op->inputs[1]->params["__batch_index"].i; + + const std::vector& dims = captured_params.at("dims").ai; + + int input_rank = (int)op->inputs[0]->shape.size(); + + if (input_rank == 0) + { + // assume input is fine + input_rank = (int)dims.size(); + } + + if (batch_index >= 0 && batch_index < input_rank) + input_rank -= 1; + + if (input_rank > 4) + { + fprintf(stderr, "permute %d-rank tensor is not supported yet!\n", input_rank); + return; + } + + // drop permute batch index + std::vector new_dims; + for (int i = 0; i < (int)dims.size(); i++) + { + if (dims[i] == batch_index) + continue; + + int new_dim = dims[i] > batch_index ? dims[i] - 1 : dims[i]; + new_dims.push_back(new_dim); + } + + if (input_rank != (int)new_dims.size()) + { + fprintf(stderr, "permute %d-rank tensor with %d-rank dims is not possible\n", input_rank, (int)new_dims.size()); + return; + } + + if ((input_rank == 3 && new_dims == std::vector{1, 2, 0}) || (input_rank == 4 && new_dims == std::vector{1, 2, 3, 0})) + op->params["3"] = 1; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_grid_sample_1, 19) + } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/fuse_permute_gridsample.cpp b/tools/pnnx/src/pass_ncnn/fuse_permute_gridsample.cpp deleted file mode 100644 index 4d448f114e2..00000000000 --- a/tools/pnnx/src/pass_ncnn/fuse_permute_gridsample.cpp +++ /dev/null @@ -1,125 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
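// Illustrative sketch (not part of the patch): the core of the batch-index
// handling in the F_grid_sample_1 pass above, pulled out into a standalone
// function. The permute order recorded by torch.permute still contains the
// batch axis; dropping it (batch_index assumed 0 here) and renumbering the
// remaining axes decides whether GridSample's permute_fusion param ("3") is set.
#include <cstdio>
#include <vector>

static bool is_channel_last_permute(const std::vector<int>& dims, int batch_index)
{
    std::vector<int> new_dims;
    for (size_t i = 0; i < dims.size(); i++)
    {
        if (dims[i] == batch_index)
            continue; // drop the batch axis

        new_dims.push_back(dims[i] > batch_index ? dims[i] - 1 : dims[i]);
    }

    // (0,2,3,1) -> (1,2,0) for 4-d grids, (0,2,3,4,1) -> (1,2,3,0) for 5-d grids
    return new_dims == std::vector<int>{1, 2, 0} || new_dims == std::vector<int>{1, 2, 3, 0};
}

int main()
{
    std::vector<int> dims4 = {0, 2, 3, 1};
    std::vector<int> dims5 = {0, 2, 3, 4, 1};
    printf("%d %d\n", (int)is_channel_last_permute(dims4, 0), (int)is_channel_last_permute(dims5, 0)); // prints "1 1"
    return 0;
}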
- -#include "fuse_permute_gridsample.h" - -#include "pass_level2.h" - -#include - -namespace pnnx { - -namespace ncnn { - -class fuse_permute_gridsample_4d_pass : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input_a 0 1 a -pnnx.Input input_b 0 1 b -Permute op_0 1 1 b b1 0=3 1=2 -GridSample op_1 2 1 a b1 out 0=%c 1=%d 2=%e 3=0 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "GridSample"; - } - - const char* name_str() const - { - return "permutegridsample"; - } - - void write(Operator* op, const std::map& captured_params) const - { - int mode = 0; - int padding_mode = 0; - int align_corner = 0; - if (captured_params.at("c").type == 2) - mode = captured_params.at("c").i; - if (captured_params.at("d").type == 2) - padding_mode = captured_params.at("d").i; - if (captured_params.at("e").type == 2) - align_corner = captured_params.at("e").i; - - op->params["0"] = mode; - op->params["1"] = padding_mode; - op->params["2"] = align_corner; - op->params["3"] = 1; - } -}; - -class fuse_permute_gridsample_5d_pass : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input_a 0 1 a -pnnx.Input input_b 0 1 b -Permute op_0 1 1 b b1 0=9 1=3 -GridSample op_1 2 1 a b1 out 0=%c 1=%d 2=%e 3=0 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "GridSample"; - } - - const char* name_str() const - { - return "permutegridsample"; - } - - void write(Operator* op, const std::map& captured_params) const - { - int mode = 0; - int padding_mode = 0; - int align_corner = 0; - if (captured_params.at("c").type == 2) - mode = captured_params.at("c").i; - if (captured_params.at("d").type == 2) - padding_mode = captured_params.at("d").i; - if (captured_params.at("e").type == 2) - align_corner = captured_params.at("e").i; - - op->params["0"] = mode; - op->params["1"] = padding_mode; - op->params["2"] = align_corner; - op->params["3"] = 1; - } -}; - -void fuse_permute_gridsample(Graph& graph) -{ - fuse_permute_gridsample_4d_pass a; - fuse_permute_gridsample_5d_pass b; - int opindex = 0; - - pnnx_graph_rewrite(graph, &a, opindex); - pnnx_graph_rewrite(graph, &b, opindex); -} - -} // namespace ncnn - -} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/fuse_permute_gridsample.h b/tools/pnnx/src/pass_ncnn/fuse_permute_gridsample.h deleted file mode 100644 index 61499dea32e..00000000000 --- a/tools/pnnx/src/pass_ncnn/fuse_permute_gridsample.h +++ /dev/null @@ -1,25 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
- -#include "ir.h" - -namespace pnnx { - -namespace ncnn { - -void fuse_permute_gridsample(Graph& graph); - -} // namespace ncnn - -} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/torch_permute.cpp b/tools/pnnx/src/pass_ncnn/torch_permute.cpp index 30546cfad1c..13705dc9be6 100644 --- a/tools/pnnx/src/pass_ncnn/torch_permute.cpp +++ b/tools/pnnx/src/pass_ncnn/torch_permute.cpp @@ -94,7 +94,6 @@ pnnx.Output output 1 0 out op->type = "Noop"; else if (new_dims == std::vector{1, 0}) op->params["0"] = 1; - op->params["1"] = 1; } if (input_rank == 3) { @@ -110,7 +109,6 @@ pnnx.Output output 1 0 out op->params["0"] = 4; else if (new_dims == std::vector{2, 1, 0}) op->params["0"] = 5; - op->params["1"] = 2; } if (input_rank == 4) { @@ -162,7 +160,6 @@ pnnx.Output output 1 0 out op->params["0"] = 22; else if (new_dims == std::vector{3, 2, 1, 0}) op->params["0"] = 23; - op->params["1"] = 3; } } }; diff --git a/tools/pnnx/tests/ncnn/test_F_grid_sample.py b/tools/pnnx/tests/ncnn/test_F_grid_sample.py index c84d38232b1..1322ca0216e 100644 --- a/tools/pnnx/tests/ncnn/test_F_grid_sample.py +++ b/tools/pnnx/tests/ncnn/test_F_grid_sample.py @@ -29,39 +29,49 @@ def forward(self, x, xg1, xg2, y, yg1, yg2): yg1 = yg1 * 2 - 1 yg2 = yg2 * 2 - 1 - x = F.grid_sample(x, xg1, mode='bilinear', padding_mode='zeros', align_corners=False) - x = F.grid_sample(x, xg2, mode='bilinear', padding_mode='border', align_corners=False) - x = F.grid_sample(x, xg1, mode='bilinear', padding_mode='reflection', align_corners=False) - x = F.grid_sample(x, xg2, mode='nearest', padding_mode='zeros', align_corners=False) - x = F.grid_sample(x, xg1, mode='nearest', padding_mode='border', align_corners=False) - x = F.grid_sample(x, xg2, mode='nearest', padding_mode='reflection', align_corners=False) - x = F.grid_sample(x, xg1, mode='bicubic', padding_mode='zeros', align_corners=False) - x = F.grid_sample(x, xg2, mode='bicubic', padding_mode='border', align_corners=False) - x = F.grid_sample(x, xg1, mode='bicubic', padding_mode='reflection', align_corners=False) - x = F.grid_sample(x, xg2, mode='bilinear', padding_mode='zeros', align_corners=True) - x = F.grid_sample(x, xg1, mode='bilinear', padding_mode='border', align_corners=True) - x = F.grid_sample(x, xg2, mode='bilinear', padding_mode='reflection', align_corners=True) - x = F.grid_sample(x, xg1, mode='nearest', padding_mode='zeros', align_corners=True) - x = F.grid_sample(x, xg2, mode='nearest', padding_mode='border', align_corners=True) - x = F.grid_sample(x, xg1, mode='nearest', padding_mode='reflection', align_corners=True) - x = F.grid_sample(x, xg2, mode='bicubic', padding_mode='zeros', align_corners=True) - x = F.grid_sample(x, xg1, mode='bicubic', padding_mode='border', align_corners=True) - x = F.grid_sample(x, xg2, mode='bicubic', padding_mode='reflection', align_corners=True) - - y = F.grid_sample(y, yg1, mode='bilinear', padding_mode='zeros', align_corners=False) - y = F.grid_sample(y, yg2, mode='bilinear', padding_mode='border', align_corners=False) - y = F.grid_sample(y, yg1, mode='bilinear', padding_mode='reflection', align_corners=False) - y = F.grid_sample(y, yg2, mode='nearest', padding_mode='zeros', align_corners=False) - y = F.grid_sample(y, yg1, mode='nearest', padding_mode='border', align_corners=False) - y = F.grid_sample(y, yg2, mode='nearest', padding_mode='reflection', align_corners=False) - y = F.grid_sample(y, yg1, mode='bilinear', padding_mode='zeros', align_corners=True) - y = F.grid_sample(y, yg2, mode='bilinear', padding_mode='border', 
align_corners=True) - y = F.grid_sample(y, yg1, mode='bilinear', padding_mode='reflection', align_corners=True) - y = F.grid_sample(y, yg2, mode='nearest', padding_mode='zeros', align_corners=True) - y = F.grid_sample(y, yg1, mode='nearest', padding_mode='border', align_corners=True) - y = F.grid_sample(y, yg2, mode='nearest', padding_mode='reflection', align_corners=True) - - return x, y + x0 = F.grid_sample(x0, xg1, mode='bilinear', padding_mode='zeros', align_corners=False) + x0 = F.grid_sample(x0, xg2, mode='bilinear', padding_mode='border', align_corners=False) + x0 = F.grid_sample(x0, xg1, mode='bilinear', padding_mode='reflection', align_corners=False) + x0 = F.grid_sample(x0, xg2, mode='nearest', padding_mode='zeros', align_corners=False) + x0 = F.grid_sample(x0, xg1, mode='nearest', padding_mode='border', align_corners=False) + x0 = F.grid_sample(x0, xg2, mode='nearest', padding_mode='reflection', align_corners=False) + x0 = F.grid_sample(x0, xg1, mode='bicubic', padding_mode='zeros', align_corners=False) + x0 = F.grid_sample(x0, xg2, mode='bicubic', padding_mode='border', align_corners=False) + x0 = F.grid_sample(x0, xg1, mode='bicubic', padding_mode='reflection', align_corners=False) + x0 = F.grid_sample(x0, xg2, mode='bilinear', padding_mode='zeros', align_corners=True) + x0 = F.grid_sample(x0, xg1, mode='bilinear', padding_mode='border', align_corners=True) + x0 = F.grid_sample(x0, xg2, mode='bilinear', padding_mode='reflection', align_corners=True) + x0 = F.grid_sample(x0, xg1, mode='nearest', padding_mode='zeros', align_corners=True) + x0 = F.grid_sample(x0, xg2, mode='nearest', padding_mode='border', align_corners=True) + x0 = F.grid_sample(x0, xg1, mode='nearest', padding_mode='reflection', align_corners=True) + x0 = F.grid_sample(x0, xg2, mode='bicubic', padding_mode='zeros', align_corners=True) + x0 = F.grid_sample(x0, xg1, mode='bicubic', padding_mode='border', align_corners=True) + x0 = F.grid_sample(x0, xg2, mode='bicubic', padding_mode='reflection', align_corners=True) + + y0 = F.grid_sample(y0, yg1, mode='bilinear', padding_mode='zeros', align_corners=False) + y0 = F.grid_sample(y0, yg2, mode='bilinear', padding_mode='border', align_corners=False) + y0 = F.grid_sample(y0, yg1, mode='bilinear', padding_mode='reflection', align_corners=False) + y0 = F.grid_sample(y0, yg2, mode='nearest', padding_mode='zeros', align_corners=False) + y0 = F.grid_sample(y0, yg1, mode='nearest', padding_mode='border', align_corners=False) + y0 = F.grid_sample(y0, yg2, mode='nearest', padding_mode='reflection', align_corners=False) + y0 = F.grid_sample(y0, yg1, mode='bilinear', padding_mode='zeros', align_corners=True) + y0 = F.grid_sample(y0, yg2, mode='bilinear', padding_mode='border', align_corners=True) + y0 = F.grid_sample(y0, yg1, mode='bilinear', padding_mode='reflection', align_corners=True) + y0 = F.grid_sample(y0, yg2, mode='nearest', padding_mode='zeros', align_corners=True) + y0 = F.grid_sample(y0, yg1, mode='nearest', padding_mode='border', align_corners=True) + y0 = F.grid_sample(y0, yg2, mode='nearest', padding_mode='reflection', align_corners=True) + + xg1 = torch.permute(xg1, (0, 2, 3, 1)) + xg2 = torch.permute(xg2, (0, 2, 3, 1)) + yg1 = torch.permute(yg1, (0, 2, 3, 4, 1)) + yg2 = torch.permute(yg2, (0, 2, 3, 4, 1)) + + x1 = F.grid_sample(x, xg1, mode='bilinear', padding_mode='zeros', align_corners=False) + x1 = F.grid_sample(x, xg2, mode='bilinear', padding_mode='border', align_corners=False) + + y1 = F.grid_sample(y, yg1, mode='bilinear', padding_mode='zeros', 
align_corners=False) + y1 = F.grid_sample(y, yg2, mode='bilinear', padding_mode='border', align_corners=False) + return x0, y0, x1, y1 def test(): net = Model() @@ -75,7 +85,7 @@ def test(): yg1 = torch.rand(1, 10, 21, 27, 3) yg2 = torch.rand(1, 10, 12, 16, 3) - a0, a1 = net(x, xg1, xg2, y, yg1, yg2) + a0, a1, a2, a3 = net(x, xg1, xg2, y, yg1, yg2) # export torchscript mod = torch.jit.trace(net, (x, xg1, xg2, y, yg1, yg2)) @@ -87,9 +97,9 @@ def test(): # ncnn inference import test_F_grid_sample_ncnn - b0, b1 = test_F_grid_sample_ncnn.test_inference() + b0, b1, b2, b3 = test_F_grid_sample_ncnn.test_inference() - return torch.allclose(a0, b0, 1e-4, 1e-4) and torch.allclose(a1, b1, 1e-4, 1e-4) + return torch.allclose(a0, b0, 1e-6, 1e-6) and torch.allclose(a1, b1, 1e-6, 1e-6) and torch.allclose(a0, b0, 1e-6, 1e-6) and torch.allclose(a1, b1, 1e-6, 1e-6) if __name__ == "__main__": if test(): diff --git a/tools/pnnx/tests/ncnn/test_ncnn_fuse_permute_gridsample.py b/tools/pnnx/tests/ncnn/test_ncnn_fuse_permute_gridsample.py deleted file mode 100644 index c6448a4f5c0..00000000000 --- a/tools/pnnx/tests/ncnn/test_ncnn_fuse_permute_gridsample.py +++ /dev/null @@ -1,75 +0,0 @@ -# Tencent is pleased to support the open source community by making ncnn available. -# -# Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. 
- -import torch -import torch.nn as nn -import torch.nn.functional as F - -class Model(nn.Module): - def __init__(self): - super(Model, self).__init__() - - def forward(self, x, xg1, xg2, y, yg1, yg2): - # norm to -1 ~ 1 - xg1 = xg1 * 2 - 1 - xg2 = xg2 * 2 - 1 - yg1 = yg1 * 2 - 1 - yg2 = yg2 * 2 - 1 - - xg1 = torch.permute(xg1, (0, 2, 3, 1)) - xg2 = torch.permute(xg2, (0, 2, 3, 1)) - yg1 = torch.permute(yg1, (0, 2, 3, 4, 1)) - yg2 = torch.permute(yg2, (0, 2, 3, 4, 1)) - - x = F.grid_sample(x, xg1, mode='bilinear', padding_mode='zeros', align_corners=False) - x = F.grid_sample(x, xg2, mode='bilinear', padding_mode='border', align_corners=False) - - y = F.grid_sample(y, yg1, mode='bilinear', padding_mode='zeros', align_corners=False) - y = F.grid_sample(y, yg2, mode='bilinear', padding_mode='border', align_corners=False) - - return x, y - -def test(): - net = Model() - net.eval() - - torch.manual_seed(0) - x = torch.rand(1, 3, 12, 16) - xg1 = torch.rand(1, 2, 21, 27) - xg2 = torch.rand(1, 2, 12, 16) - y = torch.rand(1, 5, 10, 12, 16) - yg1 = torch.rand(1, 3, 10, 21, 27) - yg2 = torch.rand(1, 3, 10, 12, 16) - - a0, a1 = net(x, xg1, xg2, y, yg1, yg2) - - # export torchscript - mod = torch.jit.trace(net, (x, xg1, xg2, y, yg1, yg2)) - mod.save("test_ncnn_fuse_permute_gridsample.pt") - - # torchscript to pnnx - import os - os.system("../src/pnnx test_ncnn_fuse_permute_gridsample.pt inputshape=[1,3,12,16],[1,2,21,27],[1,2,12,16],[1,5,10,12,16],[1,3,10,21,27],[1,3,10,12,16]") - - # ncnn inference - import test_ncnn_fuse_permute_gridsample_ncnn - b0, b1 = test_ncnn_fuse_permute_gridsample_ncnn.test_inference() - - return torch.allclose(a0, b0, 1e-6, 1e-6) and torch.allclose(a1, b1, 1e-6, 1e-6) - -if __name__ == "__main__": - if test(): - exit(0) - else: - exit(1) diff --git a/tools/pnnx/tests/test_pnnx_fuse_permute_gridsample.py b/tools/pnnx/tests/test_pnnx_fuse_permute_gridsample.py deleted file mode 100644 index be95b15a896..00000000000 --- a/tools/pnnx/tests/test_pnnx_fuse_permute_gridsample.py +++ /dev/null @@ -1,75 +0,0 @@ -# Tencent is pleased to support the open source community by making ncnn available. -# -# Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. 
- -import torch -import torch.nn as nn -import torch.nn.functional as F - -class Model(nn.Module): - def __init__(self): - super(Model, self).__init__() - - def forward(self, x, xg1, xg2, y, yg1, yg2): - # norm to -1 ~ 1 - xg1 = xg1 * 2 - 1 - xg2 = xg2 * 2 - 1 - yg1 = yg1 * 2 - 1 - yg2 = yg2 * 2 - 1 - - xg1 = torch.permute(xg1, (0, 2, 3, 1)) - xg2 = torch.permute(xg2, (0, 2, 3, 1)) - yg1 = torch.permute(yg1, (0, 2, 3, 4, 1)) - yg2 = torch.permute(yg2, (0, 2, 3, 4, 1)) - - x = F.grid_sample(x, xg1, mode='bilinear', padding_mode='zeros', align_corners=False) - x = F.grid_sample(x, xg2, mode='bilinear', padding_mode='border', align_corners=False) - - y = F.grid_sample(y, yg1, mode='bilinear', padding_mode='zeros', align_corners=False) - y = F.grid_sample(y, yg2, mode='bilinear', padding_mode='border', align_corners=False) - - return x, y - -def test(): - net = Model() - net.eval() - - torch.manual_seed(0) - x = torch.rand(1, 3, 12, 16) - xg1 = torch.rand(1, 2, 21, 27) - xg2 = torch.rand(1, 2, 12, 16) - y = torch.rand(1, 5, 10, 12, 16) - yg1 = torch.rand(1, 3, 10, 21, 27) - yg2 = torch.rand(1, 3, 10, 12, 16) - - a0, a1 = net(x, xg1, xg2, y, yg1, yg2) - - # export torchscript - mod = torch.jit.trace(net, (x, xg1, xg2, y, yg1, yg2)) - mod.save("test_pnnx_fuse_permute_gridsample.pt") - - # torchscript to pnnx - import os - os.system("../src/pnnx test_pnnx_fuse_permute_gridsample.pt inputshape=[1,3,12,16],[1,2,21,27],[1,2,12,16],[1,5,10,12,16],[1,3,10,21,27],[1,3,10,12,16]") - - # pnnx inference - import test_pnnx_fuse_permute_gridsample_pnnx - b0, b1 = test_pnnx_fuse_permute_gridsample_pnnx.test_inference() - - return torch.equal(a0, b0) and torch.equal(a1, b1) - -if __name__ == "__main__": - if test(): - exit(0) - else: - exit(1) From f091bb83a2d2b767c57f80136d3c7d2395fdb2ca Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Sun, 26 Feb 2023 23:38:59 +0000 Subject: [PATCH 076/127] apply code-format changes --- src/layer/x86/unaryop_x86.cpp | 1 - tools/pnnx/src/pass_ncnn/F_grid_sample.cpp | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/layer/x86/unaryop_x86.cpp b/src/layer/x86/unaryop_x86.cpp index 6bbb1c76085..faba25dbd46 100644 --- a/src/layer/x86/unaryop_x86.cpp +++ b/src/layer/x86/unaryop_x86.cpp @@ -224,7 +224,6 @@ struct unary_op_ceil return _mm_ceil_ps(x); #endif // __SSE4_1__ - // Use negative zero as the sign bit mask. const __m128 magic_negative_zero = _mm_set_ps1(-0.0f); diff --git a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp index a91107041ac..9527a20116d 100644 --- a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp +++ b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp @@ -109,8 +109,8 @@ pnnx.Output output 1 0 out if (padding_mode == "reflection") op->params["1"] = 3; - op->params["2"] = captured_params.at("align_corners").b ? 1 : 0; - + op->params["2"] = captured_params.at("align_corners").b ? 
1 : 0; + const int batch_index = op->inputs[1]->params["__batch_index"].i; const std::vector& dims = captured_params.at("dims").ai; From c2476c907925cecaa9c0ed6e04ef770577d95244 Mon Sep 17 00:00:00 2001 From: Yoh Date: Mon, 27 Feb 2023 22:38:42 +0800 Subject: [PATCH 077/127] fix pnnx gridsample unittest --- tools/pnnx/tests/ncnn/test_F_grid_sample.py | 34 ++++++++++++--------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/tools/pnnx/tests/ncnn/test_F_grid_sample.py b/tools/pnnx/tests/ncnn/test_F_grid_sample.py index 1322ca0216e..b3e00ec7dfa 100644 --- a/tools/pnnx/tests/ncnn/test_F_grid_sample.py +++ b/tools/pnnx/tests/ncnn/test_F_grid_sample.py @@ -22,14 +22,14 @@ class Model(nn.Module): def __init__(self): super(Model, self).__init__() - def forward(self, x, xg1, xg2, y, yg1, yg2): + def forward(self, x, xg1, xg2, xgp1, xgp2, y, yg1, yg2, ygp1, ygp2): # norm to -1 ~ 1 xg1 = xg1 * 2 - 1 xg2 = xg2 * 2 - 1 yg1 = yg1 * 2 - 1 yg2 = yg2 * 2 - 1 - x0 = F.grid_sample(x0, xg1, mode='bilinear', padding_mode='zeros', align_corners=False) + x0 = F.grid_sample(x, xg1, mode='bilinear', padding_mode='zeros', align_corners=False) x0 = F.grid_sample(x0, xg2, mode='bilinear', padding_mode='border', align_corners=False) x0 = F.grid_sample(x0, xg1, mode='bilinear', padding_mode='reflection', align_corners=False) x0 = F.grid_sample(x0, xg2, mode='nearest', padding_mode='zeros', align_corners=False) @@ -48,7 +48,7 @@ def forward(self, x, xg1, xg2, y, yg1, yg2): x0 = F.grid_sample(x0, xg1, mode='bicubic', padding_mode='border', align_corners=True) x0 = F.grid_sample(x0, xg2, mode='bicubic', padding_mode='reflection', align_corners=True) - y0 = F.grid_sample(y0, yg1, mode='bilinear', padding_mode='zeros', align_corners=False) + y0 = F.grid_sample(y, yg1, mode='bilinear', padding_mode='zeros', align_corners=False) y0 = F.grid_sample(y0, yg2, mode='bilinear', padding_mode='border', align_corners=False) y0 = F.grid_sample(y0, yg1, mode='bilinear', padding_mode='reflection', align_corners=False) y0 = F.grid_sample(y0, yg2, mode='nearest', padding_mode='zeros', align_corners=False) @@ -61,16 +61,16 @@ def forward(self, x, xg1, xg2, y, yg1, yg2): y0 = F.grid_sample(y0, yg1, mode='nearest', padding_mode='border', align_corners=True) y0 = F.grid_sample(y0, yg2, mode='nearest', padding_mode='reflection', align_corners=True) - xg1 = torch.permute(xg1, (0, 2, 3, 1)) - xg2 = torch.permute(xg2, (0, 2, 3, 1)) - yg1 = torch.permute(yg1, (0, 2, 3, 4, 1)) - yg2 = torch.permute(yg2, (0, 2, 3, 4, 1)) + xgp1 = torch.permute(xgp1, (0, 2, 3, 1)) + xgp2 = torch.permute(xgp2, (0, 2, 3, 1)) + ygp1 = torch.permute(ygp1, (0, 2, 3, 4, 1)) + ygp2 = torch.permute(ygp2, (0, 2, 3, 4, 1)) - x1 = F.grid_sample(x, xg1, mode='bilinear', padding_mode='zeros', align_corners=False) - x1 = F.grid_sample(x, xg2, mode='bilinear', padding_mode='border', align_corners=False) + x1 = F.grid_sample(x, xgp1, mode='bilinear', padding_mode='zeros', align_corners=False) + x1 = F.grid_sample(x, xgp2, mode='bilinear', padding_mode='border', align_corners=False) - y1 = F.grid_sample(y, yg1, mode='bilinear', padding_mode='zeros', align_corners=False) - y1 = F.grid_sample(y, yg2, mode='bilinear', padding_mode='border', align_corners=False) + y1 = F.grid_sample(y, ygp1, mode='bilinear', padding_mode='zeros', align_corners=False) + y1 = F.grid_sample(y, ygp2, mode='bilinear', padding_mode='border', align_corners=False) return x0, y0, x1, y1 def test(): @@ -81,25 +81,29 @@ def test(): x = torch.rand(1, 3, 12, 16) xg1 = torch.rand(1, 21, 27, 2) xg2 = 
torch.rand(1, 12, 16, 2) + xgp1 = torch.rand(1, 2, 21, 27) + xgp2 = torch.rand(1, 2, 12, 16) y = torch.rand(1, 5, 10, 12, 16) yg1 = torch.rand(1, 10, 21, 27, 3) yg2 = torch.rand(1, 10, 12, 16, 3) + ygp1 = torch.rand(1, 3, 10, 21, 27) + ygp2 = torch.rand(1, 3, 10, 12, 16) - a0, a1, a2, a3 = net(x, xg1, xg2, y, yg1, yg2) + a0, a1, a2, a3 = net(x, xg1, xg2, xgp1, xgp2, y, yg1, yg2, ygp1, ygp2) # export torchscript - mod = torch.jit.trace(net, (x, xg1, xg2, y, yg1, yg2)) + mod = torch.jit.trace(net, (x, xg1, xg2, xgp1, xgp2, y, yg1, yg2, ygp1, ygp2)) mod.save("test_F_grid_sample.pt") # torchscript to pnnx import os - os.system("../../src/pnnx test_F_grid_sample.pt inputshape=[1,3,12,16],[1,21,27,2],[1,12,16,2],[1,5,10,12,16],[1,10,21,27,3],[1,10,12,16,3]") + os.system("../src/pnnx test_F_grid_sample.pt inputshape=[1,3,12,16],[1,21,27,2],[1,12,16,2],[1,2,21,27],[1,2,12,16],[1,5,10,12,16],[1,10,21,27,3],[1,10,12,16,3],[1,3,10,21,27],[1,3,10,12,16]") # ncnn inference import test_F_grid_sample_ncnn b0, b1, b2, b3 = test_F_grid_sample_ncnn.test_inference() - return torch.allclose(a0, b0, 1e-6, 1e-6) and torch.allclose(a1, b1, 1e-6, 1e-6) and torch.allclose(a0, b0, 1e-6, 1e-6) and torch.allclose(a1, b1, 1e-6, 1e-6) + return torch.allclose(a0, b0, 1e-7, 1e-7) and torch.allclose(a1, b1, 1e-7, 1e-7) and torch.allclose(a2, b2, 1e-7, 1e-7) and torch.allclose(a3, b3, 1e-7, 1e-7) if __name__ == "__main__": if test(): From b5e70f622e0a9d9970df5192eff348a2c9fdd8a6 Mon Sep 17 00:00:00 2001 From: Yoh Date: Tue, 28 Feb 2023 00:42:54 +0800 Subject: [PATCH 078/127] [WIP]support cpp03 and code test --- src/layer/x86/gridsample_x86.cpp | 140 +++++++++++++++---------------- tests/test_gridsample.cpp | 12 +-- 2 files changed, 76 insertions(+), 76 deletions(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index f57fe8550a5..258e5685fcf 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -251,7 +251,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector op; + gridsample_2d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bilinear_compute_blob op; + gridsample_2d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == PaddingMode::Border) + else if (padding_mode == Border) { if (align_corner == 0) { - gridsample_2d_bilinear_compute_blob op; + gridsample_2d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bilinear_compute_blob op; + gridsample_2d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == PaddingMode::Reflection) + else if (padding_mode == Reflection) { if (align_corner == 0) { - gridsample_2d_bilinear_compute_blob op; + gridsample_2d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bilinear_compute_blob op; + gridsample_2d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } @@ -307,7 +307,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector op; + gridsample_2d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, 
value_blob, permute_fusion, opt); } else { - gridsample_2d_nearest_compute_blob op; + gridsample_2d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == PaddingMode::Border) + else if (padding_mode == Border) { if (align_corner == 0) { - gridsample_2d_nearest_compute_blob op; + gridsample_2d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_nearest_compute_blob op; + gridsample_2d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == PaddingMode::Reflection) + else if (padding_mode == Reflection) { if (align_corner == 0) { - gridsample_2d_nearest_compute_blob op; + gridsample_2d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_nearest_compute_blob op; + gridsample_2d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } @@ -362,7 +362,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector op; + gridsample_2d_bicubic_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bicubic_compute_blob op; + gridsample_2d_bicubic_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == PaddingMode::Border) + else if (padding_mode == Border) { if (align_corner == 0) { - gridsample_2d_bicubic_compute_blob op; + gridsample_2d_bicubic_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bicubic_compute_blob op; + gridsample_2d_bicubic_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == PaddingMode::Reflection) + else if (padding_mode == Reflection) { if (align_corner == 0) { - gridsample_2d_bicubic_compute_blob op; + gridsample_2d_bicubic_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bicubic_compute_blob op; + gridsample_2d_bicubic_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } @@ -427,7 +427,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector op; + gridsample_3d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_bilinear_compute_blob op; + gridsample_3d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == PaddingMode::Border) + else if (padding_mode == Border) { if (align_corner == 0) { - gridsample_3d_bilinear_compute_blob op; + gridsample_3d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_bilinear_compute_blob op; + gridsample_3d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == PaddingMode::Reflection) + else if (padding_mode == Reflection) { if (align_corner == 0) { - 
gridsample_3d_bilinear_compute_blob op; + gridsample_3d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_bilinear_compute_blob op; + gridsample_3d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } @@ -483,7 +483,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector op; + gridsample_3d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_nearest_compute_blob op; + gridsample_3d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == PaddingMode::Border) + else if (padding_mode == Border) { if (align_corner == 0) { - gridsample_3d_nearest_compute_blob op; + gridsample_3d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_nearest_compute_blob op; + gridsample_3d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == PaddingMode::Reflection) + else if (padding_mode == Reflection) { if (align_corner == 0) { - gridsample_3d_nearest_compute_blob op; + gridsample_3d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_nearest_compute_blob op; + gridsample_3d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } @@ -552,26 +552,26 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Tue, 28 Feb 2023 01:24:01 +0800 Subject: [PATCH 079/127] remove the namespace of enum --- src/layer/gridsample.cpp | 10 +++---- src/layer/gridsample.h | 27 +++++++++---------- .../x86/gridsample_bicubic_compute_blob.h | 2 +- .../x86/gridsample_bilinear_compute_blob.h | 4 +-- .../x86/gridsample_nearest_compute_blob.h | 4 +-- src/layer/x86/gridsample_x86.cpp | 8 +++--- tests/test_gridsample.cpp | 12 ++++----- 7 files changed, 32 insertions(+), 35 deletions(-) diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index 382f84a9d23..948fc31c0ff 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -224,7 +224,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& } } - if (sample_type == InterpolationMode::Bilinear) // bilinear + if (sample_type == Bilinear) // bilinear { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) @@ -274,7 +274,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& } } } - else if (sample_type == InterpolationMode::Nearest) // nearest + else if (sample_type == Nearest) // nearest { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) @@ -307,7 +307,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& } } } - else if (sample_type == InterpolationMode::Bicubic) // bicubic + else if (sample_type == Bicubic) // bicubic { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) @@ -472,7 +472,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& } } - if (sample_type == 
InterpolationMode::Bilinear) // bilinear + if (sample_type == Bilinear) // bilinear { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) @@ -538,7 +538,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& } } } - else if (sample_type == InterpolationMode::Nearest) // nearest + else if (sample_type == Nearest) // nearest { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) diff --git a/src/layer/gridsample.h b/src/layer/gridsample.h index 96b6b1aeb24..4b54da7793c 100644 --- a/src/layer/gridsample.h +++ b/src/layer/gridsample.h @@ -18,6 +18,19 @@ #include "layer.h" namespace ncnn { +enum InterpolationMode // 1=bilinear 2=nearest 3=bicubic +{ + Bilinear = 1, + Nearest = 2, + Bicubic = 3 +}; + +enum PaddingMode // 1=zeros 2=border 3=reflection +{ + Zeros = 1, + Border = 2, + Reflection = 3 +}; class GridSample : public Layer { @@ -28,20 +41,6 @@ class GridSample : public Layer virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; - enum InterpolationMode // 1=bilinear 2=nearest 3=bicubic - { - Bilinear = 1, - Nearest = 2, - Bicubic = 3 - }; - - enum PaddingMode // 1=zeros 2=border 3=reflection - { - Zeros = 1, - Border = 2, - Reflection = 3 - }; - public: // param int sample_type; // 1=bilinear 2=nearest 3=bicubic diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index 88e01fc4a0b..beb04c0ea80 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -325,7 +325,7 @@ struct gridsample_2d_bicubic_compute_blob }; template -struct gridsample_2d_bicubic_compute_blob +struct gridsample_2d_bicubic_compute_blob { void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) { diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 17490785d29..f3f89663da7 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -293,7 +293,7 @@ struct gridsample_2d_bilinear_compute_blob }; template -struct gridsample_2d_bilinear_compute_blob +struct gridsample_2d_bilinear_compute_blob { void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) { @@ -1037,7 +1037,7 @@ struct gridsample_3d_bilinear_compute_blob }; template -struct gridsample_3d_bilinear_compute_blob +struct gridsample_3d_bilinear_compute_blob { void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) { diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index 097270c4c2b..9d37b28c84b 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -157,7 +157,7 @@ struct gridsample_2d_nearest_compute_blob }; template -struct gridsample_2d_nearest_compute_blob +struct gridsample_2d_nearest_compute_blob { void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) { @@ -480,7 +480,7 @@ struct gridsample_3d_nearest_compute_blob }; template -struct gridsample_3d_nearest_compute_blob +struct gridsample_3d_nearest_compute_blob { void operator()(const Mat& src, const Mat& grid, Mat& offset, 
Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) { diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 258e5685fcf..8450e5fa42a 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -100,8 +100,6 @@ static NCNN_FORCEINLINE __m128 mask_gather_ps(const float* ptr, __m128i offset, #endif // __SSE2__ -typedef GridSample::PaddingMode PaddingMode; - template struct grid_sample_unormalize; @@ -139,7 +137,7 @@ template struct compute_coord; template -struct compute_coord +struct compute_coord { #if __AVX__ __m256 operator()(__m256 length, __m256 coord) @@ -158,7 +156,7 @@ struct compute_coord }; template<> -struct compute_coord +struct compute_coord { #if __AVX__ __m256 operator()(__m256 length, __m256 coord) @@ -183,7 +181,7 @@ struct compute_coord }; template<> -struct compute_coord +struct compute_coord { #if __AVX__ __m256 operator()(__m256 length, __m256 coord) diff --git a/tests/test_gridsample.cpp b/tests/test_gridsample.cpp index 9876018a2af..51b040b363c 100644 --- a/tests/test_gridsample.cpp +++ b/tests/test_gridsample.cpp @@ -43,12 +43,12 @@ static int test_gridsample(const ncnn::Mat& a, const ncnn::Mat& grid, int sample static int test_gridsample_0() { return 0 - //|| test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 1, 0, 0) - //|| test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 1, 1, 0) - //|| test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 2, 0, 0) - //|| test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 2, 1, 0) - //|| test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 3, 0, 0) - //|| test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 3, 1, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 1, 0, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 1, 1, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 2, 0, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 2, 1, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 3, 0, 0) + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 3, 1, 0) || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 2, 1, 0, 0) || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 2, 1, 1, 0) || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 2, 2, 0, 0) From 54749a2982e423f9f43ece9259f81804217d650f Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Tue, 28 Feb 2023 13:57:33 +0800 Subject: [PATCH 080/127] move enum into gridsample class --- src/layer/gridsample.h | 27 ++-- .../x86/gridsample_bicubic_compute_blob.h | 4 +- .../x86/gridsample_bilinear_compute_blob.h | 8 +- .../x86/gridsample_nearest_compute_blob.h | 8 +- src/layer/x86/gridsample_x86.cpp | 148 +++++++++--------- 5 files changed, 98 insertions(+), 97 deletions(-) diff --git a/src/layer/gridsample.h b/src/layer/gridsample.h index 4b54da7793c..96b6b1aeb24 100644 --- a/src/layer/gridsample.h +++ b/src/layer/gridsample.h @@ -18,19 +18,6 @@ #include "layer.h" namespace ncnn { -enum InterpolationMode // 1=bilinear 2=nearest 3=bicubic -{ - Bilinear = 1, - Nearest = 2, - Bicubic = 3 -}; - -enum PaddingMode // 1=zeros 2=border 3=reflection -{ - Zeros = 1, - Border = 2, - Reflection = 3 -}; class GridSample : public Layer { @@ -41,6 +28,20 @@ class GridSample : public Layer virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + enum InterpolationMode // 
1=bilinear 2=nearest 3=bicubic + { + Bilinear = 1, + Nearest = 2, + Bicubic = 3 + }; + + enum PaddingMode // 1=zeros 2=border 3=reflection + { + Zeros = 1, + Border = 2, + Reflection = 3 + }; + public: // param int sample_type; // 1=bilinear 2=nearest 3=bicubic diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index beb04c0ea80..06028c8d181 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -template +template struct gridsample_2d_bicubic_compute_blob { void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) @@ -325,7 +325,7 @@ struct gridsample_2d_bicubic_compute_blob }; template -struct gridsample_2d_bicubic_compute_blob +struct gridsample_2d_bicubic_compute_blob { void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) { diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index f3f89663da7..d876d15584e 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -template +template struct gridsample_2d_bilinear_compute_blob { void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) @@ -293,7 +293,7 @@ struct gridsample_2d_bilinear_compute_blob }; template -struct gridsample_2d_bilinear_compute_blob +struct gridsample_2d_bilinear_compute_blob { void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) { @@ -585,7 +585,7 @@ struct gridsample_2d_bilinear_compute_blob } }; -template +template struct gridsample_3d_bilinear_compute_blob { void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) @@ -1037,7 +1037,7 @@ struct gridsample_3d_bilinear_compute_blob }; template -struct gridsample_3d_bilinear_compute_blob +struct gridsample_3d_bilinear_compute_blob { void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) { diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index 9d37b28c84b..49e1f70fc2c 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-template +template struct gridsample_2d_nearest_compute_blob { void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) @@ -157,7 +157,7 @@ struct gridsample_2d_nearest_compute_blob }; template -struct gridsample_2d_nearest_compute_blob +struct gridsample_2d_nearest_compute_blob { void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) { @@ -303,7 +303,7 @@ struct gridsample_2d_nearest_compute_blob } }; -template +template struct gridsample_3d_nearest_compute_blob { void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) @@ -480,7 +480,7 @@ struct gridsample_3d_nearest_compute_blob }; template -struct gridsample_3d_nearest_compute_blob +struct gridsample_3d_nearest_compute_blob { void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) { diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 8450e5fa42a..0b0f73e119a 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -133,11 +133,11 @@ struct grid_sample_unormalize } }; -template +template struct compute_coord; template -struct compute_coord +struct compute_coord { #if __AVX__ __m256 operator()(__m256 length, __m256 coord) @@ -156,7 +156,7 @@ struct compute_coord }; template<> -struct compute_coord +struct compute_coord { #if __AVX__ __m256 operator()(__m256 length, __m256 coord) @@ -181,7 +181,7 @@ struct compute_coord }; template<> -struct compute_coord +struct compute_coord { #if __AVX__ __m256 operator()(__m256 length, __m256 coord) @@ -249,7 +249,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector op; + gridsample_2d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bilinear_compute_blob op; + gridsample_2d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == Border) + else if (padding_mode == GridSample::Border) { if (align_corner == 0) { - gridsample_2d_bilinear_compute_blob op; + gridsample_2d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bilinear_compute_blob op; + gridsample_2d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == Reflection) + else if (padding_mode == GridSample::Reflection) { if (align_corner == 0) { - gridsample_2d_bilinear_compute_blob op; + gridsample_2d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bilinear_compute_blob op; + gridsample_2d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } @@ -305,7 +305,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector op; + gridsample_2d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_nearest_compute_blob op; + gridsample_2d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, 
permute_fusion, opt); } } - else if (padding_mode == Border) + else if (padding_mode == GridSample::Border) { if (align_corner == 0) { - gridsample_2d_nearest_compute_blob op; + gridsample_2d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_nearest_compute_blob op; + gridsample_2d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == Reflection) + else if (padding_mode == GridSample::Reflection) { if (align_corner == 0) { - gridsample_2d_nearest_compute_blob op; + gridsample_2d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_nearest_compute_blob op; + gridsample_2d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } @@ -360,7 +360,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector op; + gridsample_2d_bicubic_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bicubic_compute_blob op; + gridsample_2d_bicubic_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == Border) + else if (padding_mode == GridSample::Border) { if (align_corner == 0) { - gridsample_2d_bicubic_compute_blob op; + gridsample_2d_bicubic_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bicubic_compute_blob op; + gridsample_2d_bicubic_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == Reflection) + else if (padding_mode == GridSample::Reflection) { if (align_corner == 0) { - gridsample_2d_bicubic_compute_blob op; + gridsample_2d_bicubic_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bicubic_compute_blob op; + gridsample_2d_bicubic_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } @@ -425,7 +425,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector op; + gridsample_3d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_bilinear_compute_blob op; + gridsample_3d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == Border) + else if (padding_mode == GridSample::Border) { if (align_corner == 0) { - gridsample_3d_bilinear_compute_blob op; + gridsample_3d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_bilinear_compute_blob op; + gridsample_3d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == Reflection) + else if (padding_mode == GridSample::Reflection) { if (align_corner == 0) { - gridsample_3d_bilinear_compute_blob op; + gridsample_3d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_bilinear_compute_blob 
op; + gridsample_3d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } @@ -481,7 +481,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector op; + gridsample_3d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_nearest_compute_blob op; + gridsample_3d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == Border) + else if (padding_mode == GridSample::Border) { if (align_corner == 0) { - gridsample_3d_nearest_compute_blob op; + gridsample_3d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_nearest_compute_blob op; + gridsample_3d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == Reflection) + else if (padding_mode == GridSample::Reflection) { if (align_corner == 0) { - gridsample_3d_nearest_compute_blob op; + gridsample_3d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_nearest_compute_blob op; + gridsample_3d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } @@ -550,26 +550,26 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Thu, 2 Mar 2023 17:15:37 +0800 Subject: [PATCH 081/127] fix nearest accuracy problem --- src/layer/x86/gridsample_x86.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 0b0f73e119a..025e3427e71 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -100,6 +100,14 @@ static NCNN_FORCEINLINE __m128 mask_gather_ps(const float* ptr, __m128i offset, #endif // __SSE2__ +#if _MSC_VER +#define OPT_2 +#elif __clang__ +#define OPT_2 __attribute__((optnone)) +#elif __GNUC__ +#define OPT_2 __attribute__((optimize("2"))) +#endif + template struct grid_sample_unormalize; @@ -107,6 +115,7 @@ template<> struct grid_sample_unormalize { #if __AVX__ + OPT_2 __m256 operator()(__m256 length, __m256 coord) { return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coord, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(length, *(__m256*)_ps256_1)); @@ -122,7 +131,8 @@ template<> struct grid_sample_unormalize { #if __AVX__ - __m256 operator()(__m256 length, __m256 coord) + OPT_2 + __m256 OPT_2 operator()(__m256 length, __m256 coord) { return _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(coord, *(__m256*)_ps256_1), length, *(__m256*)_ps256_1), *(__m256*)_ps256_2); } From 51580b016d6869d2bf72b7f30d5eec151b02ec51 Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Thu, 2 Mar 2023 09:17:43 +0000 Subject: [PATCH 082/127] apply code-format changes --- src/layer/x86/gridsample_x86.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 025e3427e71..17d2c57124d 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -106,7 +106,7 @@ static NCNN_FORCEINLINE __m128 mask_gather_ps(const float* ptr, __m128i offset, #define OPT_2 
__attribute__((optnone)) #elif __GNUC__ #define OPT_2 __attribute__((optimize("2"))) -#endif +#endif template struct grid_sample_unormalize; From 9b888b1aed3247ecb8456ab9045fff8085dec303 Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Thu, 2 Mar 2023 17:19:01 +0800 Subject: [PATCH 083/127] fix bug --- src/layer/x86/gridsample_x86.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 025e3427e71..db7e2c15864 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -132,7 +132,7 @@ struct grid_sample_unormalize { #if __AVX__ OPT_2 - __m256 OPT_2 operator()(__m256 length, __m256 coord) + __m256 operator()(__m256 length, __m256 coord) { return _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(coord, *(__m256*)_ps256_1), length, *(__m256*)_ps256_1), *(__m256*)_ps256_2); } From a2ad5d36961b6e16f32e3ba2393e47d8e794c735 Mon Sep 17 00:00:00 2001 From: Yoh Date: Sat, 11 Mar 2023 20:16:39 +0800 Subject: [PATCH 084/127] simplify unittest --- tests/test_gridsample.cpp | 56 +++------------------------------------ 1 file changed, 4 insertions(+), 52 deletions(-) diff --git a/tests/test_gridsample.cpp b/tests/test_gridsample.cpp index 51b040b363c..b397faa6d6d 100644 --- a/tests/test_gridsample.cpp +++ b/tests/test_gridsample.cpp @@ -51,16 +51,8 @@ static int test_gridsample_0() || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 3, 1, 0) || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 2, 1, 0, 0) || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 2, 1, 1, 0) - || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 2, 2, 0, 0) - || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 2, 2, 1, 0) - || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 2, 3, 0, 0) - || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 2, 3, 1, 0) || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 3, 1, 0, 0) || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 3, 1, 1, 0) - || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 3, 2, 0, 0) - || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 3, 2, 1, 0) - || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 3, 3, 0, 0) - || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 3, 3, 1, 0) || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 1, 1, 0, 1) || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 1, 1, 1, 1) || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 1, 2, 0, 1) @@ -69,16 +61,8 @@ static int test_gridsample_0() || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 1, 3, 1, 1) || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 2, 1, 0, 1) || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 2, 1, 1, 1) - || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 2, 2, 0, 1) - || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 2, 2, 1, 1) - || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 2, 3, 0, 1) - || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 2, 3, 1, 1) || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 3, 1, 0, 1) - || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 3, 1, 1, 1) - || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 3, 2, 0, 1) - || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 3, 2, 1, 1) - || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 
2), 3, 3, 0, 1) - || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 3, 3, 1, 1); + || test_gridsample(RandomMat(3, 7, 24), RandomMat(11, 13, 2), 3, 1, 1, 1); } static int test_gridsample_1() @@ -92,16 +76,8 @@ static int test_gridsample_1() || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 1, 3, 1, 0) || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 2, 1, 0, 0) || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 2, 1, 1, 0) - || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 2, 2, 0, 0) - || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 2, 2, 1, 0) - || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 2, 3, 0, 0) - || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 2, 3, 1, 0) || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 3, 1, 0, 0) || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 3, 1, 1, 0) - || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 3, 2, 0, 0) - || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 3, 2, 1, 0) - || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 3, 3, 0, 0) - || test_gridsample(RandomMat(8, 16, 32), RandomMat(2, 27, 21), 3, 3, 1, 0) || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 1, 1, 0, 1) || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 1, 1, 1, 1) || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 1, 2, 0, 1) @@ -110,16 +86,8 @@ static int test_gridsample_1() || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 1, 3, 1, 1) || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 2, 1, 0, 1) || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 2, 1, 1, 1) - || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 2, 2, 0, 1) - || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 2, 2, 1, 1) - || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 2, 3, 0, 1) - || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 2, 3, 1, 1) || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 3, 1, 0, 1) - || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 3, 1, 1, 1) - || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 3, 2, 0, 1) - || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 3, 2, 1, 1) - || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 3, 3, 0, 1) - || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 3, 3, 1, 1); + || test_gridsample(RandomMat(8, 16, 32), RandomMat(27, 21, 2), 3, 1, 1, 1); } static int test_gridsample_2() @@ -133,10 +101,6 @@ static int test_gridsample_2() || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 3, 1, 0) || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 1, 0, 0) || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 1, 1, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 2, 0, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 2, 1, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 3, 0, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 3, 1, 0) || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 1, 1, 0, 1) || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 1, 1, 1, 1) || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 1, 2, 0, 1) @@ -144,11 +108,7 @@ static 
int test_gridsample_2() || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 1, 3, 0, 1) || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 1, 3, 1, 1) || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 2, 1, 0, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 2, 1, 1, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 2, 2, 0, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 2, 2, 1, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 2, 3, 0, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 2, 3, 1, 1); + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 2, 1, 1, 1); } static int test_gridsample_3() @@ -162,10 +122,6 @@ static int test_gridsample_3() || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 3, 1, 0) || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 1, 0, 0) || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 1, 1, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 2, 0, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 2, 1, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 3, 0, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 3, 1, 0) || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 1, 1, 0, 1) || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 1, 1, 1, 1) || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 1, 2, 0, 1) @@ -173,11 +129,7 @@ static int test_gridsample_3() || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 1, 3, 0, 1) || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 1, 3, 1, 1) || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 2, 1, 0, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 2, 1, 1, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 2, 2, 0, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 2, 2, 1, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 2, 3, 0, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 2, 3, 1, 1); + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 2, 1, 1, 1); } int main() From c3891fa1149c6a5d2e05b6c5fb7ab07e21608038 Mon Sep 17 00:00:00 2001 From: Yoh Date: Sun, 12 Mar 2023 02:43:22 +0800 Subject: [PATCH 085/127] include the correct header --- src/layer/x86/gridsample_x86.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index ea3781ec051..e45822819a3 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -15,7 +15,7 @@ #include "gridsample_x86.h" #if __SSE2__ -#include +#include #include "sse_mathfun.h" #if __AVX__ #include From 34b78286da9390fd75ac810447e9efcbef5646f9 Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Mon, 13 Mar 2023 16:13:53 +0800 Subject: [PATCH 086/127] fix core dump (illegal instruction) --- src/layer/x86/gridsample_x86.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index ea3781ec051..6036b45ce90 100644 ---
a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -108,6 +108,8 @@ static NCNN_FORCEINLINE __m128 mask_gather_ps(const float* ptr, __m128i offset, #define OPT_2 __attribute__((optimize("2"))) #endif +namespace GridSample_x86_kernel { + template struct grid_sample_unormalize; @@ -228,8 +230,11 @@ struct compute_coord #include "gridsample_bicubic_compute_blob.h" #include "gridsample_nearest_compute_blob.h" +} //namespace GridSample_x86_kernel + int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { + using namespace GridSample_x86_kernel; const Mat& bottom_blob = bottom_blobs[0]; const Mat& grid = bottom_blobs[1]; Mat& top_blob = top_blobs[0]; From 6091955ac254e90fecf6e2f860aceaf14cd6cb0b Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Mon, 13 Mar 2023 08:18:00 +0000 Subject: [PATCH 087/127] apply code-format changes --- src/layer/x86/gridsample_x86.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index ec120610e3f..1e83cf1e20b 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -230,7 +230,7 @@ struct compute_coord #include "gridsample_bicubic_compute_blob.h" #include "gridsample_nearest_compute_blob.h" -} //namespace GridSample_x86_kernel +} //namespace GridSample_x86_kernel int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { From 531a881e21836cd9773983e6589ed51e3cfc4f11 Mon Sep 17 00:00:00 2001 From: Yoh Date: Tue, 14 Mar 2023 02:41:08 +0800 Subject: [PATCH 088/127] fix unittest --- tests/test_gridsample.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_gridsample.cpp b/tests/test_gridsample.cpp index b397faa6d6d..55cf5b75758 100644 --- a/tests/test_gridsample.cpp +++ b/tests/test_gridsample.cpp @@ -47,7 +47,7 @@ static int test_gridsample_0() || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 1, 1, 0) || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 2, 0, 0) || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 2, 1, 0) - || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 3, 0, 0) + //|| test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 3, 0, 0) // just cpp-03-simplestl no pass || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 3, 1, 0) || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 2, 1, 0, 0) || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 2, 1, 1, 0) From aa54cd242f927a5f2df01323d05bdb356d5553a9 Mon Sep 17 00:00:00 2001 From: Yoh Date: Tue, 14 Mar 2023 04:11:55 +0800 Subject: [PATCH 089/127] fix cpp-03-simplestl bug --- src/layer/gridsample.cpp | 4 ++-- src/layer/x86/gridsample_x86.cpp | 8 ++++---- tests/test_gridsample.cpp | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index 948fc31c0ff..2a5ff23adbc 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -66,8 +66,8 @@ static float border_coord(float x, float border) static float reflect_coord(float x, int high) { - x = abs(x); - x = high - abs(x - high); + x = fabs(x); + x = high - fabs(x - high); return x; } diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 1e83cf1e20b..1759164bb3a 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -185,8 +185,8 @@ struct compute_coord 
#endif // __AVX__ float operator()(int length, float coord) { - coord = abs(coord); - coord = (length - 1) - abs(coord - (length - 1)); + coord = fabs(coord); + coord = (length - 1) - fabs(coord - (length - 1)); return std::min(length - 1.0f, std::max(coord, 0.0f)); } @@ -219,8 +219,8 @@ struct compute_coord #endif // __AVX__ float operator()(int length, float coord) { - coord = abs(coord + 0.5f); - coord = length - abs(coord - length) - 0.5; + coord = fabs(coord + 0.5f); + coord = length - fabs(coord - length) - 0.5; return std::min(length - 1.0f, std::max(coord, 0.0f)); } diff --git a/tests/test_gridsample.cpp b/tests/test_gridsample.cpp index 55cf5b75758..b397faa6d6d 100644 --- a/tests/test_gridsample.cpp +++ b/tests/test_gridsample.cpp @@ -47,7 +47,7 @@ static int test_gridsample_0() || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 1, 1, 0) || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 2, 0, 0) || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 2, 1, 0) - //|| test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 3, 0, 0) // just cpp-03-simplestl no pass + || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 3, 0, 0) || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 1, 3, 1, 0) || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 2, 1, 0, 0) || test_gridsample(RandomMat(3, 7, 24), RandomMat(2, 11, 13), 2, 1, 1, 0) From 9e685f2d7081680b2221e103e890ebc9a837d59d Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Mon, 20 Mar 2023 17:00:04 +0800 Subject: [PATCH 090/127] fix some bugs --- src/layer/gridsample.cpp | 11 +---------- src/layer/x86/gridsample_x86.cpp | 16 +++++++++------- tools/pnnx/tests/ncnn/test_F_grid_sample.py | 4 ++-- 3 files changed, 12 insertions(+), 19 deletions(-) diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index 2a5ff23adbc..561f5ddccc3 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -110,7 +110,7 @@ static float get_value_bounded(const Mat& image, int x, int y) static float get_value_bounded(const Mat& image, int x, int y, int z) { - return in_bounds(image, x, y, z) ? image.channel(z).row(y)[x] : 0.f; + return in_bounds(image, x, y, z) ? 
image.depth(z).row(y)[x] : 0.f; } static float get_value_bounded(const Mat& image, int x, int y, int padding_mode, int align_corner) @@ -121,15 +121,6 @@ static float get_value_bounded(const Mat& image, int x, int y, int padding_mode, return get_value_bounded(image, x, y); } -static float get_value_bounded(const Mat& image, int x, int y, int z, int padding_mode, int align_corner) -{ - x = compute_coord(x, image.w, padding_mode, align_corner); - y = compute_coord(y, image.h, padding_mode, align_corner); - z = compute_coord(z, image.c, padding_mode, align_corner); - - return get_value_bounded(image, x, y, z); -} - static inline void interpolate_cubic(float fx, float* coeffs) { const float A = -0.75f; diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 1759164bb3a..1aa3702865d 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -247,18 +247,20 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector Date: Thu, 23 Mar 2023 02:09:03 +0800 Subject: [PATCH 091/127] fix unittest --- tools/pnnx/tests/ncnn/test_F_grid_sample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/pnnx/tests/ncnn/test_F_grid_sample.py b/tools/pnnx/tests/ncnn/test_F_grid_sample.py index a4594d4c9a6..e1704bbf096 100644 --- a/tools/pnnx/tests/ncnn/test_F_grid_sample.py +++ b/tools/pnnx/tests/ncnn/test_F_grid_sample.py @@ -103,7 +103,7 @@ def test(): import test_F_grid_sample_ncnn b0, b1, b2, b3 = test_F_grid_sample_ncnn.test_inference() - return torch.allclose(a0, b0, 1e-7, 1e-7) and torch.allclose(a1, b1, 1e-7, 1e-7) and torch.allclose(a2, b2, 1e-7, 1e-7) and torch.allclose(a3, b3, 1e-7, 1e-7) + return torch.allclose(a0, b0, 1e-6, 1e-6) and torch.allclose(a1, b1, 1e-6, 1e-6) and torch.allclose(a2, b2, 1e-6, 1e-6) and torch.allclose(a3, b3, 1e-6, 1e-6) if __name__ == "__main__": if test(): From 055b750fd81e32fea5277434556be5d95706835f Mon Sep 17 00:00:00 2001 From: Yoh Date: Sun, 26 Mar 2023 03:07:35 +0800 Subject: [PATCH 092/127] fix unittest --- tools/pnnx/tests/ncnn/test_F_grid_sample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/pnnx/tests/ncnn/test_F_grid_sample.py b/tools/pnnx/tests/ncnn/test_F_grid_sample.py index e1704bbf096..1329d69e70d 100644 --- a/tools/pnnx/tests/ncnn/test_F_grid_sample.py +++ b/tools/pnnx/tests/ncnn/test_F_grid_sample.py @@ -97,7 +97,7 @@ def test(): # torchscript to pnnx import os - os.system("../src/pnnx test_F_grid_sample.pt inputshape=[1,3,12,16],[1,21,27,2],[1,12,16,2],[1,2,21,27],[1,2,12,16],[1,5,10,12,16],[1,10,21,27,3],[1,10,12,16,3],[1,3,10,21,27],[1,3,10,12,16]") + os.system("../../src/pnnx test_F_grid_sample.pt inputshape=[1,3,12,16],[1,21,27,2],[1,12,16,2],[1,2,21,27],[1,2,12,16],[1,5,10,12,16],[1,10,21,27,3],[1,10,12,16,3],[1,3,10,21,27],[1,3,10,12,16]") # ncnn inference import test_F_grid_sample_ncnn From d3342a8037653f808f4b74ca0a91894565dbead9 Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Fri, 14 Apr 2023 19:31:26 +0800 Subject: [PATCH 093/127] fix torch permute in torch=1.8 --- tools/pnnx/tests/ncnn/test_F_grid_sample.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/pnnx/tests/ncnn/test_F_grid_sample.py b/tools/pnnx/tests/ncnn/test_F_grid_sample.py index 1329d69e70d..95ca812eb51 100644 --- a/tools/pnnx/tests/ncnn/test_F_grid_sample.py +++ b/tools/pnnx/tests/ncnn/test_F_grid_sample.py @@ -61,10 +61,10 @@ def forward(self, x, xg1, xg2, xgp1, xgp2, y, yg1, yg2, ygp1, ygp2): y0 = 
F.grid_sample(y0, yg1, mode='nearest', padding_mode='border', align_corners=True) y0 = F.grid_sample(y0, yg2, mode='nearest', padding_mode='reflection', align_corners=True) - xgp1 = torch.permute(xgp1, (0, 2, 3, 1)) - xgp2 = torch.permute(xgp2, (0, 2, 3, 1)) - ygp1 = torch.permute(ygp1, (0, 2, 3, 4, 1)) - ygp2 = torch.permute(ygp2, (0, 2, 3, 4, 1)) + xgp1 = xgp1.permute(0, 2, 3, 1) + xgp2 = xgp2.permute(0, 2, 3, 1) + ygp1 = ygp1.permute(0, 2, 3, 4, 1) + ygp2 = ygp2.permute(0, 2, 3, 4, 1) x1 = F.grid_sample(x, xgp1, mode='bilinear', padding_mode='zeros', align_corners=False) x1 = F.grid_sample(x1, xgp2, mode='bilinear', padding_mode='border', align_corners=False) From ca6c0451974930b51128c8ce42894c0e116b6838 Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Thu, 20 Apr 2023 15:18:44 +0800 Subject: [PATCH 094/127] optimize code and apply code-format --- src/layer/gridsample.cpp | 14 +- src/layer/gridsample.h | 12 +- src/layer/x86/avx512_mathfun.h | 1 - .../x86/gridsample_bicubic_compute_blob.h | 40 ++-- .../x86/gridsample_bilinear_compute_blob.h | 80 ++++---- .../x86/gridsample_nearest_compute_blob.h | 70 +++---- src/layer/x86/gridsample_x86.cpp | 181 +++++++++--------- 7 files changed, 184 insertions(+), 214 deletions(-) diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index 561f5ddccc3..baa3470e68d 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -159,7 +159,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& return -100; Mat offset_blob; - offset_blob.create(outw, outh, grid.c, elemsize, opt.blob_allocator); + offset_blob.create(outw, outh, grid.c, elemsize, opt.workspace_allocator); //pre-calculate all interpolation offsets for each x y, unpack grid on-the-fly if (permute_fusion == 0) @@ -215,7 +215,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& } } - if (sample_type == Bilinear) // bilinear + if (sample_type == Interpolation_BILINEAR) // bilinear { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) @@ -265,7 +265,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& } } } - else if (sample_type == Nearest) // nearest + else if (sample_type == Interpolation_NEAREST) // nearest { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) @@ -298,7 +298,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& } } } - else if (sample_type == Bicubic) // bicubic + else if (sample_type == Interpolation_BICUBIC) // bicubic { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) @@ -379,7 +379,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& return -100; Mat offset_blob; - offset_blob.create(outw, outh, outd, grid.c, elemsize, opt.blob_allocator); + offset_blob.create(outw, outh, outd, grid.c, elemsize, opt.workspace_allocator); //pre-calculate all interpolation offsets for each x y, unpack grid on-the-fly if (permute_fusion == 0) @@ -463,7 +463,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& } } - if (sample_type == Bilinear) // bilinear + if (sample_type == Interpolation_BILINEAR) // bilinear { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) @@ -529,7 +529,7 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& } } } - else if (sample_type == Nearest) // nearest + else if (sample_type == Interpolation_NEAREST) // nearest { #pragma omp 
parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) diff --git a/src/layer/gridsample.h b/src/layer/gridsample.h index 96b6b1aeb24..f6e17c9d2f4 100644 --- a/src/layer/gridsample.h +++ b/src/layer/gridsample.h @@ -30,16 +30,16 @@ class GridSample : public Layer enum InterpolationMode // 1=bilinear 2=nearest 3=bicubic { - Bilinear = 1, - Nearest = 2, - Bicubic = 3 + Interpolation_BILINEAR = 1, + Interpolation_NEAREST = 2, + Interpolation_BICUBIC = 3 }; enum PaddingMode // 1=zeros 2=border 3=reflection { - Zeros = 1, - Border = 2, - Reflection = 3 + Padding_ZEROS = 1, + Padding_BORDER = 2, + Padding_REFLECTION = 3 }; public: diff --git a/src/layer/x86/avx512_mathfun.h b/src/layer/x86/avx512_mathfun.h index 14a607c9815..2d84e388774 100644 --- a/src/layer/x86/avx512_mathfun.h +++ b/src/layer/x86/avx512_mathfun.h @@ -44,7 +44,6 @@ _PS512_CONST_TYPE(mant_mask, int, 0x7f800000); _PS512_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); _PS512_CONST_TYPE(sign_mask, int, (int)0x80000000); -_PS512_CONST_TYPE(inv_sign_mask, int, ~0x80000000); _PI32_CONST512(0, 0); _PI32_CONST512(1, 1); diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index 06028c8d181..dd71d622de9 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -50,9 +50,9 @@ struct gridsample_2d_bicubic_compute_blob for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); - int nn = grid_size; + int x = 0; #if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) + for (; x + 15 < grid_size; x += 16) { __m256 tmp_x = _mm256_loadu_ps(gridptr); __m256 gy = _mm256_loadu_ps(gridptr + 8); @@ -131,10 +131,9 @@ struct gridsample_2d_bicubic_compute_blob gridptr += 16; } - nn = grid_size & 15; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) + for (; x < grid_size; x += 2) { float sample_x = *gridptr; float sample_y = *(gridptr + 1); @@ -191,9 +190,9 @@ struct gridsample_2d_bicubic_compute_blob const float* gridptr_x = grid.channel(0); const float* gridptr_y = grid.channel(1); - int nn = grid_size; + int x = 0; #if __AVX__ - for (int x = 0; x + 7 < nn; x += 8) + for (; x + 7 < grid_size; x += 8) { __m256 gx = _mm256_loadu_ps(gridptr_x); __m256 gy = _mm256_loadu_ps(gridptr_y); @@ -266,10 +265,9 @@ struct gridsample_2d_bicubic_compute_blob gridptr_y += 8; } - nn = grid_size & 7; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x++) + for (; x < grid_size; x++) { float sample_x = *gridptr_x; float sample_y = *gridptr_y; @@ -325,7 +323,7 @@ struct gridsample_2d_bicubic_compute_blob }; template -struct gridsample_2d_bicubic_compute_blob +struct gridsample_2d_bicubic_compute_blob { void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) { @@ -363,9 +361,9 @@ struct gridsample_2d_bicubic_compute_blob for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); - int nn = grid_size; + int x = 0; #if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) + for (; x + 15 < grid_size; x += 16) { __m256 tmp_x = _mm256_loadu_ps(gridptr); __m256 gy = _mm256_loadu_ps(gridptr + 8); @@ -442,10 +440,9 @@ struct gridsample_2d_bicubic_compute_blob gridptr += 16; } - nn = grid_size & 15; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) + for (; x < grid_size; x += 2) { float sample_x = *gridptr; float sample_y = *(gridptr + 1); @@ -509,9 +506,9 @@ struct 
gridsample_2d_bicubic_compute_blob const float* gridptr_x = grid.channel(0); const float* gridptr_y = grid.channel(1); - int nn = grid_size; + int x = 0; #if __AVX__ - for (int x = 0; x + 7 < nn; x += 8) + for (; x + 7 < grid_size; x += 8) { __m256 gx = _mm256_loadu_ps(gridptr_x); __m256 gy = _mm256_loadu_ps(gridptr_y); @@ -582,10 +579,9 @@ struct gridsample_2d_bicubic_compute_blob gridptr_y += 8; } - nn = grid_size & 7; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x++) + for (; x < grid_size; x++) { float sample_x = *gridptr_x; float sample_y = *gridptr_y; @@ -994,14 +990,14 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds const float* value_x = value.channel(0); const float* value_y = value.channel(1); - int nn = grid_size; + int x = 0; #if __SSE2__ #if __AVX__ { __m256 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; __m256 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; __m256 value_f[4]; - for (int i = 0; i + 7 < grid_size; i += 8) + for (; x + 7 < grid_size; x += 8) { cubic_interp1d_p8(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm256_loadu_ps(value_x)); for (int ii = 0; ii < 4; ii++) @@ -1041,13 +1037,12 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds dstptr += 8; } } - nn = grid_size & 7; #endif // __AVX__ { __m128 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; __m128 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; __m128 value_f[4]; - for (int i = grid_size - nn; i + 3 < grid_size; i += 4) + for (; x + 3 < grid_size; x += 4) { cubic_interp1d_p4(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm_loadu_ps(value_x)); for (int ii = 0; ii < 4; ii++) @@ -1087,13 +1082,12 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds dstptr += 4; } } - nn = grid_size & 3; #endif // __SSE2__ float x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; float y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; float value_f[4]; - for (int i = grid_size - nn; i < grid_size; i++) + for (; x < grid_size; x++) { cubic_interp1d(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, *value_x); for (int ii = 0; ii < 4; ii++) diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index d876d15584e..38f1f824b90 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -44,9 +44,9 @@ struct gridsample_2d_bilinear_compute_blob for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); - int nn = grid_size; + int x = 0; #if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) + for (; x + 15 < grid_size; x += 16) { __m256 tmp_x = _mm256_loadu_ps(gridptr); __m256 gy = _mm256_loadu_ps(gridptr + 8); @@ -115,10 +115,9 @@ struct gridsample_2d_bilinear_compute_blob value_ptr_beta += 8; } - nn = grid_size & 15; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) + for (; x < grid_size; x += 2) { float sample_x = *gridptr; float sample_y = *(gridptr + 1); @@ -172,9 +171,9 @@ struct gridsample_2d_bilinear_compute_blob const float* gridptr_x = grid.channel(0); const float* gridptr_y = grid.channel(1); - int nn = grid_size; + int x = 0; #if __AVX__ - for (int x = 0; x + 7 < nn; x += 8) + for (; x + 7 < grid_size; x += 8) { __m256 gx = _mm256_loadu_ps(gridptr_x); __m256 gy = _mm256_loadu_ps(gridptr_y); @@ -237,10 +236,9 @@ struct gridsample_2d_bilinear_compute_blob value_ptr_beta += 8; } - nn = grid_size & 7; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x++) + for (; x < grid_size; 
x++) { float sample_x = *gridptr_x; float sample_y = *gridptr_y; @@ -293,7 +291,7 @@ struct gridsample_2d_bilinear_compute_blob }; template -struct gridsample_2d_bilinear_compute_blob +struct gridsample_2d_bilinear_compute_blob { void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) { @@ -324,9 +322,9 @@ struct gridsample_2d_bilinear_compute_blob for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); - int nn = grid_size; + int x = 0; #if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) + for (; x + 15 < grid_size; x += 16) { __m256 tmp_x = _mm256_loadu_ps(gridptr); __m256 gy = _mm256_loadu_ps(gridptr + 8); @@ -401,10 +399,9 @@ struct gridsample_2d_bilinear_compute_blob value_ptr_alpha += 8; value_ptr_beta += 8; } - nn = grid_size & 15; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) + for (; x < grid_size; x += 2) { float sample_x = *gridptr; float sample_y = *(gridptr + 1); @@ -460,9 +457,9 @@ struct gridsample_2d_bilinear_compute_blob const float* gridptr_x = grid.channel(0); const float* gridptr_y = grid.channel(1); - int nn = grid_size; + int x = 0; #if __AVX__ - for (int x = 0; x + 7 < nn; x += 8) + for (; x + 7 < grid_size; x += 8) { __m256 gx = _mm256_loadu_ps(gridptr_x); __m256 gy = _mm256_loadu_ps(gridptr_y); @@ -529,10 +526,9 @@ struct gridsample_2d_bilinear_compute_blob value_ptr_beta += 8; } - nn = grid_size & 7; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x++) + for (; x < grid_size; x++) { float sample_x = *gridptr_x; float sample_y = *gridptr_y; @@ -629,9 +625,9 @@ struct gridsample_3d_bilinear_compute_blob for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); - int nn = grid_size; + int x = 0; #if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) + for (; x + 23 < grid_size; x += 24) { __m256 tmp_x = _mm256_loadu_ps(gridptr); __m256 tmp_y = _mm256_loadu_ps(gridptr + 8); @@ -749,10 +745,9 @@ struct gridsample_3d_bilinear_compute_blob value_ptr_beta += 8; value_ptr_gamma += 8; } - nn = grid_size % 24; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) + for (; x < grid_size; x += 3) { float sample_x = *gridptr; float sample_y = *(gridptr + 1); @@ -838,9 +833,9 @@ struct gridsample_3d_bilinear_compute_blob const float* gridptr_y = grid.channel(1); const float* gridptr_z = grid.channel(2); - int nn = grid_size; + int x = 0; #if __AVX__ - for (int x = 0; x + 7 < nn; x += 8) + for (; x + 7 < grid_size; x += 8) { __m256 gx = _mm256_loadu_ps(gridptr_x); __m256 gy = _mm256_loadu_ps(gridptr_y); @@ -949,10 +944,9 @@ struct gridsample_3d_bilinear_compute_blob value_ptr_beta += 8; value_ptr_gamma += 8; } - nn = grid_size & 7; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x++) + for (; x < grid_size; x++) { float sample_x = *gridptr_x; float sample_y = *gridptr_y; @@ -1037,7 +1031,7 @@ struct gridsample_3d_bilinear_compute_blob }; template -struct gridsample_3d_bilinear_compute_blob +struct gridsample_3d_bilinear_compute_blob { void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) { @@ -1079,9 +1073,9 @@ struct gridsample_3d_bilinear_compute_blob for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); - int nn = grid_size; + int x = 0; #if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) + for (; x + 23 < grid_size; x += 24) { __m256 tmp_x = _mm256_loadu_ps(gridptr); __m256 tmp_y = _mm256_loadu_ps(gridptr 
+ 8); @@ -1204,10 +1198,9 @@ struct gridsample_3d_bilinear_compute_blob value_ptr_beta += 8; value_ptr_gamma += 8; } - nn = grid_size % 24; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) + for (; x < grid_size; x += 3) { float sample_x = *gridptr; float sample_y = *(gridptr + 1); @@ -1294,9 +1287,9 @@ struct gridsample_3d_bilinear_compute_blob const float* gridptr_y = grid.channel(1); const float* gridptr_z = grid.channel(2); - int nn = grid_size; + int x = 0; #if __AVX__ - for (int x = 0; x + 7 < nn; x += 8) + for (; x + 7 < grid_size; x += 8) { __m256 gx = _mm256_loadu_ps(gridptr_x); __m256 gy = _mm256_loadu_ps(gridptr_y); @@ -1410,10 +1403,9 @@ struct gridsample_3d_bilinear_compute_blob value_ptr_beta += 8; value_ptr_gamma += 8; } - nn = grid_size & 7; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x++) + for (; x < grid_size; x++) { float sample_x = *gridptr_x; float sample_y = *gridptr_y; @@ -2072,11 +2064,11 @@ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d const float* value_ptr_alpha = value.channel(0); const float* value_ptr_beta = value.channel(1); - int nn = grid_size; + int x = 0; #if __SSE2__ #if __AVX__ - for (int i = 0; i + 7 < grid_size; i += 8) + for (; x + 7 < grid_size; x += 8) { __m256i v00_offset = _mm256_set_epi32(*(offset_ptr_00 + 7), *(offset_ptr_00 + 6), *(offset_ptr_00 + 5), *(offset_ptr_00 + 4), *(offset_ptr_00 + 3), *(offset_ptr_00 + 2), *(offset_ptr_00 + 1), *offset_ptr_00); __m256i v01_offset = _mm256_set_epi32(*(offset_ptr_01 + 7), *(offset_ptr_01 + 6), *(offset_ptr_01 + 5), *(offset_ptr_01 + 4), *(offset_ptr_01 + 3), *(offset_ptr_01 + 2), *(offset_ptr_01 + 1), *offset_ptr_01); @@ -2117,9 +2109,8 @@ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d dstptr += 8; } - nn = grid_size & 7; #endif // __AVX__ - for (int i = grid_size - nn; i + 3 < grid_size; i += 4) + for (; x + 3 < grid_size; x += 4) { __m128i v00_offset = _mm_set_epi32(*(offset_ptr_00 + 3), *(offset_ptr_00 + 2), *(offset_ptr_00 + 1), *offset_ptr_00); __m128i v01_offset = _mm_set_epi32(*(offset_ptr_01 + 3), *(offset_ptr_01 + 2), *(offset_ptr_01 + 1), *offset_ptr_01); @@ -2160,9 +2151,8 @@ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d dstptr += 4; } - nn = grid_size & 3; #endif // __SSE2__ - for (int i = grid_size - nn; i < grid_size; i++) + for (; x < grid_size; x++) { float v00 = *in_bound_ptr_00 < 0 ? *(srcptr + static_cast(*offset_ptr_00)) : 0; float v01 = *in_bound_ptr_01 < 0 ? 
*(srcptr + static_cast(*offset_ptr_01)) : 0; @@ -2226,10 +2216,10 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d const float* value_ptr_beta = value.channel(1); const float* value_ptr_gamma = value.channel(2); - int nn = grid_size; + int x = 0; #if __SSE2__ #if __AVX__ - for (int i = 0; i + 7 < grid_size; i += 8) + for (; x + 7 < grid_size; x += 8) { __m256i v000_offset = _mm256_set_epi32(*(offset_ptr_000 + 7), *(offset_ptr_000 + 6), *(offset_ptr_000 + 5), *(offset_ptr_000 + 4), *(offset_ptr_000 + 3), *(offset_ptr_000 + 2), *(offset_ptr_000 + 1), *offset_ptr_000); __m256i v001_offset = _mm256_set_epi32(*(offset_ptr_001 + 7), *(offset_ptr_001 + 6), *(offset_ptr_001 + 5), *(offset_ptr_001 + 4), *(offset_ptr_001 + 3), *(offset_ptr_001 + 2), *(offset_ptr_001 + 1), *offset_ptr_001); @@ -2300,9 +2290,8 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d dstptr += 8; } - nn = grid_size & 7; #endif // __AVX__ - for (int i = grid_size - nn; i + 3 < grid_size; i += 4) + for (; x + 3 < grid_size; x += 4) { __m128i v000_offset = _mm_set_epi32(*(offset_ptr_000 + 3), *(offset_ptr_000 + 2), *(offset_ptr_000 + 1), *offset_ptr_000); __m128i v001_offset = _mm_set_epi32(*(offset_ptr_001 + 3), *(offset_ptr_001 + 2), *(offset_ptr_001 + 1), *offset_ptr_001); @@ -2372,9 +2361,8 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d dstptr += 4; } - nn = grid_size & 3; #endif // __SSE2__ - for (int i = grid_size - nn; i < grid_size; i++) + for (; x < grid_size; x++) { float v000 = *reinterpret_cast(in_bound_ptr_000) < 0 ? *(srcptr + static_cast(*offset_ptr_000)) : 0; float v001 = *reinterpret_cast(in_bound_ptr_001) < 0 ? *(srcptr + static_cast(*offset_ptr_001)) : 0; diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index 49e1f70fc2c..d43245da6ed 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -34,9 +34,9 @@ struct gridsample_2d_nearest_compute_blob for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); - int nn = grid_size; + int x = 0; #if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) + for (; x + 15 < grid_size; x += 16) { __m256 tmp_x = _mm256_loadu_ps(gridptr); __m256 gy = _mm256_loadu_ps(gridptr + 8); @@ -69,10 +69,9 @@ struct gridsample_2d_nearest_compute_blob offset_ptr += 8; } - nn = grid_size & 15; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) + for (; x < grid_size; x += 2) { float sample_x = *gridptr; float sample_y = *(gridptr + 1); @@ -99,9 +98,9 @@ struct gridsample_2d_nearest_compute_blob const float* gridptr_x = grid.channel(0); const float* gridptr_y = grid.channel(1); - int nn = grid_size; + int x = 0; #if __AVX__ - for (int x = 0; x + 7 < nn; x += 8) + for (; x + 7 < grid_size; x += 8) { __m256 gx = _mm256_loadu_ps(gridptr_x); __m256 gy = _mm256_loadu_ps(gridptr_y); @@ -128,10 +127,9 @@ struct gridsample_2d_nearest_compute_blob offset_ptr += 8; } - nn = grid_size & 7; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x++) + for (; x < grid_size; x++) { float sample_x = *gridptr_x; float sample_y = *gridptr_y; @@ -157,7 +155,7 @@ struct gridsample_2d_nearest_compute_blob }; template -struct gridsample_2d_nearest_compute_blob +struct gridsample_2d_nearest_compute_blob { void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) { @@ -179,9 
+177,9 @@ struct gridsample_2d_nearest_compute_blob for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); - int nn = grid_size; + int x = 0; #if __AVX__ - for (int x = 0; x + 15 < nn; x += 16) + for (; x + 15 < grid_size; x += 16) { __m256 tmp_x = _mm256_loadu_ps(gridptr); __m256 gy = _mm256_loadu_ps(gridptr + 8); @@ -215,10 +213,9 @@ struct gridsample_2d_nearest_compute_blob in_bound_ptr += 8; } - nn = grid_size & 15; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 2) + for (; x < grid_size; x += 2) { float sample_x = *gridptr; float sample_y = *(gridptr + 1); @@ -244,9 +241,9 @@ struct gridsample_2d_nearest_compute_blob const float* gridptr_x = grid.channel(0); const float* gridptr_y = grid.channel(1); - int nn = grid_size; + int x = 0; #if __AVX__ - for (int x = 0; x + 7 < nn; x += 8) + for (; x + 7 < grid_size; x += 8) { __m256 gx = _mm256_loadu_ps(gridptr_x); __m256 gy = _mm256_loadu_ps(gridptr_y); @@ -274,10 +271,9 @@ struct gridsample_2d_nearest_compute_blob in_bound_ptr += 8; } - nn = grid_size & 7; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x++) + for (; x < grid_size; x++) { float sample_x = *gridptr_x; float sample_y = *gridptr_y; @@ -326,9 +322,9 @@ struct gridsample_3d_nearest_compute_blob for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); - int nn = grid_size; + int x = 0; #if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) + for (; x + 23 < grid_size; x += 24) { __m256 tmp_x = _mm256_loadu_ps(gridptr); __m256 tmp_y = _mm256_loadu_ps(gridptr + 8); @@ -372,10 +368,9 @@ struct gridsample_3d_nearest_compute_blob offset_ptr += 8; } - nn = grid_size % 24; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) + for (; x < grid_size; x += 3) { float sample_x = *gridptr; float sample_y = *(gridptr + 1); @@ -408,9 +403,9 @@ struct gridsample_3d_nearest_compute_blob const float* gridptr_y = grid.channel(1); const float* gridptr_z = grid.channel(2); - int nn = grid_size; + int x = 0; #if __AVX__ - for (int x = 0; x + 7 < nn; x += 8) + for (; x + 7 < grid_size; x += 8) { __m256 gx = _mm256_loadu_ps(gridptr_x); __m256 gy = _mm256_loadu_ps(gridptr_y); @@ -445,10 +440,9 @@ struct gridsample_3d_nearest_compute_blob offset_ptr += 8; } - nn = grid_size & 7; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x++) + for (; x < grid_size; x++) { float sample_x = *gridptr_x; float sample_y = *gridptr_y; @@ -480,7 +474,7 @@ struct gridsample_3d_nearest_compute_blob }; template -struct gridsample_3d_nearest_compute_blob +struct gridsample_3d_nearest_compute_blob { void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) { @@ -503,9 +497,9 @@ struct gridsample_3d_nearest_compute_blob for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); - int nn = grid_size; + int x = 0; #if __AVX__ - for (int x = 0; x + 23 < nn; x += 24) + for (; x + 23 < grid_size; x += 24) { __m256 tmp_x = _mm256_loadu_ps(gridptr); __m256 tmp_y = _mm256_loadu_ps(gridptr + 8); @@ -549,10 +543,9 @@ struct gridsample_3d_nearest_compute_blob in_bound_ptr += 8; } - nn = grid_size % 24; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x += 3) + for (; x < grid_size; x += 3) { float sample_x = *gridptr; float sample_y = *(gridptr + 1); @@ -581,9 +574,9 @@ struct gridsample_3d_nearest_compute_blob const float* gridptr_y = grid.channel(1); const float* gridptr_z = grid.channel(2); - int nn = grid_size; + int x = 0; #if 
__AVX__ - for (int x = 0; x + 7 < nn; x += 8) + for (; x + 7 < grid_size; x += 8) { __m256 gx = _mm256_loadu_ps(gridptr_x); __m256 gy = _mm256_loadu_ps(gridptr_y); @@ -619,10 +612,9 @@ struct gridsample_3d_nearest_compute_blob in_bound_ptr += 8; } - nn = grid_size & 7; #endif // __AVX__ - for (int x = grid_size - nn; x < grid_size; x++) + for (; x < grid_size; x++) { float sample_x = *gridptr_x; float sample_y = *gridptr_y; @@ -773,10 +765,10 @@ static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, const float* in_bound_ptr = in_bound.channel(0); - int nn = grid_size; + int x = 0; #if __SSE2__ #if __AVX__ - for (int i = 0; i + 7 < grid_size; i += 8) + for (; x + 7 < grid_size; x += 8) { __m256 _v = mask_gather_ps256(srcptr, _mm256_set_epi32(*(offset_ptr + 7), *(offset_ptr + 6), *(offset_ptr + 5), *(offset_ptr + 4), *(offset_ptr + 3), *(offset_ptr + 2), *(offset_ptr + 1), *offset_ptr), _mm256_loadu_ps(in_bound_ptr)); @@ -786,9 +778,8 @@ static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, in_bound_ptr += 8; dstptr += 8; } - nn = grid_size & 7; #endif // __AVX__ - for (int i = grid_size - nn; i + 3 < grid_size; i += 4) + for (; x + 3 < grid_size; x += 4) { __m128 _v = mask_gather_ps(srcptr, _mm_set_epi32(*(offset_ptr + 3), *(offset_ptr + 2), *(offset_ptr + 1), *offset_ptr), _mm_loadu_ps(in_bound_ptr)); @@ -798,9 +789,8 @@ static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, in_bound_ptr += 4; dstptr += 4; } - nn = grid_size & 3; #endif // __SSE2__ - for (int i = grid_size - nn; i < grid_size; i++) + for (; x < grid_size; x++) { *dstptr = *reinterpret_cast(in_bound_ptr) < 0 ? *(srcptr + static_cast(*offset_ptr)) : 0; diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 1aa3702865d..fde46dc1b8f 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -39,6 +39,10 @@ GridSample_x86::GridSample_x86() #if __SSE2__ #if __AVX__ +#if __AVX512F__ +_PS512_CONST_TYPE(inv_sign_mask, int, ~0x80000000); +#endif // __AVX512F__ + _PS256_CONST(n1, -1.0f); _PS256_CONST(2, 2.0f); _PI32_CONST256(n1, -1); @@ -69,11 +73,6 @@ static NCNN_FORCEINLINE __m256 mask_gather_ps256(const float* ptr, __m256i offse #endif // __AVX__ -const __m128 v1fp4 = _mm_set1_ps(1.0f); -const __m128 vn1fp4 = _mm_set1_ps(-1.0f); -const __m128i v1ip4 = _mm_set1_epi32(1); -const __m128i vn1ip4 = _mm_set1_epi32(-1); - static NCNN_FORCEINLINE __m128 mask_gather_ps(const float* ptr, __m128i offset, __m128 mask) { #if __AVX2__ @@ -149,7 +148,7 @@ template struct compute_coord; template -struct compute_coord +struct compute_coord { #if __AVX__ __m256 operator()(__m256 length, __m256 coord) @@ -168,7 +167,7 @@ struct compute_coord }; template<> -struct compute_coord +struct compute_coord { #if __AVX__ __m256 operator()(__m256 length, __m256 coord) @@ -193,7 +192,7 @@ struct compute_coord }; template<> -struct compute_coord +struct compute_coord { #if __AVX__ __m256 operator()(__m256 length, __m256 coord) @@ -266,52 +265,52 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector op; + gridsample_2d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bilinear_compute_blob op; + gridsample_2d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == GridSample::Border) + else if (padding_mode == 
GridSample::Padding_BORDER) { if (align_corner == 0) { - gridsample_2d_bilinear_compute_blob op; + gridsample_2d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bilinear_compute_blob op; + gridsample_2d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == GridSample::Reflection) + else if (padding_mode == GridSample::Padding_REFLECTION) { if (align_corner == 0) { - gridsample_2d_bilinear_compute_blob op; + gridsample_2d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bilinear_compute_blob op; + gridsample_2d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } @@ -322,51 +321,51 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector op; + gridsample_2d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_nearest_compute_blob op; + gridsample_2d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == GridSample::Border) + else if (padding_mode == GridSample::Padding_BORDER) { if (align_corner == 0) { - gridsample_2d_nearest_compute_blob op; + gridsample_2d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_nearest_compute_blob op; + gridsample_2d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == GridSample::Reflection) + else if (padding_mode == GridSample::Padding_REFLECTION) { if (align_corner == 0) { - gridsample_2d_nearest_compute_blob op; + gridsample_2d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_nearest_compute_blob op; + gridsample_2d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } @@ -377,50 +376,50 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector op; + gridsample_2d_bicubic_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bicubic_compute_blob op; + gridsample_2d_bicubic_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == GridSample::Border) + else if (padding_mode == GridSample::Padding_BORDER) { if (align_corner == 0) { - gridsample_2d_bicubic_compute_blob op; + gridsample_2d_bicubic_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bicubic_compute_blob op; + gridsample_2d_bicubic_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == GridSample::Reflection) + else if (padding_mode == GridSample::Padding_REFLECTION) { if (align_corner == 0) { - gridsample_2d_bicubic_compute_blob op; + gridsample_2d_bicubic_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bicubic_compute_blob op; + 
gridsample_2d_bicubic_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } @@ -442,52 +441,52 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector op; + gridsample_3d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_bilinear_compute_blob op; + gridsample_3d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == GridSample::Border) + else if (padding_mode == GridSample::Padding_BORDER) { if (align_corner == 0) { - gridsample_3d_bilinear_compute_blob op; + gridsample_3d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_bilinear_compute_blob op; + gridsample_3d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == GridSample::Reflection) + else if (padding_mode == GridSample::Padding_REFLECTION) { if (align_corner == 0) { - gridsample_3d_bilinear_compute_blob op; + gridsample_3d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_bilinear_compute_blob op; + gridsample_3d_bilinear_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } @@ -498,51 +497,51 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector op; + gridsample_3d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_nearest_compute_blob op; + gridsample_3d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == GridSample::Border) + else if (padding_mode == GridSample::Padding_BORDER) { if (align_corner == 0) { - gridsample_3d_nearest_compute_blob op; + gridsample_3d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_nearest_compute_blob op; + gridsample_3d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } - else if (padding_mode == GridSample::Reflection) + else if (padding_mode == GridSample::Padding_REFLECTION) { if (align_corner == 0) { - gridsample_3d_nearest_compute_blob op; + gridsample_3d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_nearest_compute_blob op; + gridsample_3d_nearest_compute_blob op; op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); } } @@ -567,26 +566,26 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Fri, 21 Apr 2023 18:15:07 +0800 Subject: [PATCH 095/127] use abs avx_mathfun --- src/layer/x86/gridsample_x86.cpp | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index fde46dc1b8f..34c4832e3ee 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -38,11 +38,6 @@ GridSample_x86::GridSample_x86() #if __SSE2__ #if __AVX__ - -#if __AVX512F__ 
-_PS512_CONST_TYPE(inv_sign_mask, int, ~0x80000000); -#endif // __AVX512F__ - _PS256_CONST(n1, -1.0f); _PS256_CONST(2, 2.0f); _PI32_CONST256(n1, -1); @@ -174,9 +169,9 @@ struct compute_coord { const __m256 border_x = _mm256_sub_ps(length, *(__m256*)_ps256_1); - coord = _mm256_and_ps(coord, *(__m256*)_ps256_inv_sign_mask); + coord = abs256_ps(coord); - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(coord, border_x), *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx_v = abs256_ps(_mm256_sub_ps(coord, border_x)); coord = _mm256_sub_ps(border_x, reflectx_v); return coord; @@ -202,9 +197,9 @@ struct compute_coord __m256 v0p5fp8 = _mm256_set1_ps(0.5f); coord = _mm256_add_ps(coord, v0p5fp8); - coord = _mm256_and_ps(coord, *(__m256*)_ps256_inv_sign_mask); + coord = abs256_ps(coord); - __m256 reflectx_v = _mm256_and_ps(_mm256_sub_ps(coord, length), *(__m256*)_ps256_inv_sign_mask); + __m256 reflectx_v = abs256_ps(_mm256_sub_ps(coord, length)); coord = _mm256_sub_ps(length, reflectx_v); coord = _mm256_sub_ps(coord, v0p5fp8); From eca51672afda59abe12ca9d8b3bbef28f5f9931b Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Fri, 21 Apr 2023 19:36:37 +0800 Subject: [PATCH 096/127] fix vs2017 test failed --- src/layer/x86/gridsample_x86.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 34c4832e3ee..f8d5f5c4c5a 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -42,7 +42,7 @@ _PS256_CONST(n1, -1.0f); _PS256_CONST(2, 2.0f); _PI32_CONST256(n1, -1); -static NCNN_FORCEINLINE __m256 mask_gather_ps256(const float* ptr, __m256i offset, __m256 mask) +static __m256 mask_gather_ps256(const float* ptr, __m256i offset, __m256 mask) { #if __AVX2__ __m256 v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, offset, mask, sizeof(float)); @@ -68,7 +68,7 @@ static NCNN_FORCEINLINE __m256 mask_gather_ps256(const float* ptr, __m256i offse #endif // __AVX__ -static NCNN_FORCEINLINE __m128 mask_gather_ps(const float* ptr, __m128i offset, __m128 mask) +static __m128 mask_gather_ps(const float* ptr, __m128i offset, __m128 mask) { #if __AVX2__ __m128 v = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, offset, mask, sizeof(float)); From f59091a43122b9bbccc000a7b94a0ca39ebd5f2c Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Mon, 24 Apr 2023 17:08:10 +0800 Subject: [PATCH 097/127] [WIP]disable avx512, test vs2017 CI --- .github/workflows/android-armv7-cpu.yml | 56 - .github/workflows/android-armv7-gpu.yml | 45 - .github/workflows/android-armv8-cpu.yml | 41 - .github/workflows/android-armv8-gpu.yml | 75 - .github/workflows/android-x64-cpu.yml | 41 - .github/workflows/android-x64-gpu.yml | 45 - .github/workflows/android-x86-cpu.yml | 41 - .github/workflows/android-x86-gpu.yml | 45 - .github/workflows/code-format.yml | 61 - .github/workflows/codeql-analysis.yml | 84 - .github/workflows/elf-riscv32-cpu-gcc.yml | 126 -- .github/workflows/elf-riscv64-cpu-gcc.yml | 123 -- .github/workflows/ios-arm64-gpu.yml | 86 - .github/workflows/ios-cpu.yml | 82 - .github/workflows/ios-simulator.yml | 73 - .github/workflows/linux-aarch64-cpu-gcc.yml | 193 -- .github/workflows/linux-arm-cpu-gcc.yml | 206 -- .../workflows/linux-loongarch64-cpu-gcc.yml | 78 - .github/workflows/linux-mips-cpu-gcc.yml | 126 -- .github/workflows/linux-mips64-cpu-gcc.yml | 138 -- .github/workflows/linux-ppc64-cpu-gcc.yml | 75 - .github/workflows/linux-riscv64-cpu-gcc.yml | 186 -- 
.../workflows/linux-riscv64-cpu-gnu-clang.yml | 142 -- .../workflows/linux-x64-cpu-clang-python.yml | 68 - .github/workflows/linux-x64-cpu-clang.yml | 128 -- .github/workflows/linux-x64-cpu-gcc-musl.yml | 67 - .github/workflows/linux-x64-cpu-gcc-san.yml | 42 - .github/workflows/linux-x64-cpu-gcc-sde.yml | 57 - .github/workflows/linux-x64-cpu-gcc.yml | 134 -- .../workflows/linux-x64-gpu-clang-python.yml | 113 -- .github/workflows/linux-x64-gpu-clang.yml | 91 - .github/workflows/linux-x64-gpu-gcc.yml | 128 -- .github/workflows/linux-x86-cpu-clang.yml | 67 - .github/workflows/linux-x86-cpu-gcc.yml | 65 - .github/workflows/macos-arm64-cpu.yml | 76 - .github/workflows/macos-arm64-gpu.yml | 92 - .github/workflows/macos-x64-cpu-python.yml | 96 - .github/workflows/macos-x64-cpu.yml | 88 - .github/workflows/macos-x64-gpu.yml | 131 -- .github/workflows/release-python.yml | 165 -- .github/workflows/release.yml | 1802 ----------------- .github/workflows/sync-wiki.yml | 32 - .github/workflows/test-coverage.yml | 147 -- .github/workflows/web-assembly.yml | 76 - .github/workflows/windows-arm-cpu.yml | 57 - .github/workflows/windows-arm64-cpu.yml | 57 - .../windows-x64-cpu-vs2019-python.yml | 67 - .github/workflows/windows-x64-cpu.yml | 102 - .github/workflows/windows-x64-gpu.yml | 2 + .github/workflows/windows-x86-cpu.yml | 67 - src/layer/x86/gridsample_x86.cpp | 10 +- 51 files changed, 10 insertions(+), 6185 deletions(-) delete mode 100644 .github/workflows/android-armv7-cpu.yml delete mode 100644 .github/workflows/android-armv7-gpu.yml delete mode 100644 .github/workflows/android-armv8-cpu.yml delete mode 100644 .github/workflows/android-armv8-gpu.yml delete mode 100644 .github/workflows/android-x64-cpu.yml delete mode 100644 .github/workflows/android-x64-gpu.yml delete mode 100644 .github/workflows/android-x86-cpu.yml delete mode 100644 .github/workflows/android-x86-gpu.yml delete mode 100644 .github/workflows/code-format.yml delete mode 100644 .github/workflows/codeql-analysis.yml delete mode 100644 .github/workflows/elf-riscv32-cpu-gcc.yml delete mode 100644 .github/workflows/elf-riscv64-cpu-gcc.yml delete mode 100644 .github/workflows/ios-arm64-gpu.yml delete mode 100644 .github/workflows/ios-cpu.yml delete mode 100644 .github/workflows/ios-simulator.yml delete mode 100644 .github/workflows/linux-aarch64-cpu-gcc.yml delete mode 100644 .github/workflows/linux-arm-cpu-gcc.yml delete mode 100644 .github/workflows/linux-loongarch64-cpu-gcc.yml delete mode 100644 .github/workflows/linux-mips-cpu-gcc.yml delete mode 100644 .github/workflows/linux-mips64-cpu-gcc.yml delete mode 100644 .github/workflows/linux-ppc64-cpu-gcc.yml delete mode 100644 .github/workflows/linux-riscv64-cpu-gcc.yml delete mode 100644 .github/workflows/linux-riscv64-cpu-gnu-clang.yml delete mode 100644 .github/workflows/linux-x64-cpu-clang-python.yml delete mode 100644 .github/workflows/linux-x64-cpu-clang.yml delete mode 100644 .github/workflows/linux-x64-cpu-gcc-musl.yml delete mode 100644 .github/workflows/linux-x64-cpu-gcc-san.yml delete mode 100644 .github/workflows/linux-x64-cpu-gcc-sde.yml delete mode 100644 .github/workflows/linux-x64-cpu-gcc.yml delete mode 100644 .github/workflows/linux-x64-gpu-clang-python.yml delete mode 100644 .github/workflows/linux-x64-gpu-clang.yml delete mode 100644 .github/workflows/linux-x64-gpu-gcc.yml delete mode 100644 .github/workflows/linux-x86-cpu-clang.yml delete mode 100644 .github/workflows/linux-x86-cpu-gcc.yml delete mode 100644 .github/workflows/macos-arm64-cpu.yml delete mode 100644 
.github/workflows/macos-arm64-gpu.yml delete mode 100644 .github/workflows/macos-x64-cpu-python.yml delete mode 100644 .github/workflows/macos-x64-cpu.yml delete mode 100644 .github/workflows/macos-x64-gpu.yml delete mode 100644 .github/workflows/release-python.yml delete mode 100644 .github/workflows/release.yml delete mode 100644 .github/workflows/sync-wiki.yml delete mode 100644 .github/workflows/test-coverage.yml delete mode 100644 .github/workflows/web-assembly.yml delete mode 100644 .github/workflows/windows-arm-cpu.yml delete mode 100644 .github/workflows/windows-arm64-cpu.yml delete mode 100644 .github/workflows/windows-x64-cpu-vs2019-python.yml delete mode 100644 .github/workflows/windows-x64-cpu.yml delete mode 100644 .github/workflows/windows-x86-cpu.yml diff --git a/.github/workflows/android-armv7-cpu.yml b/.github/workflows/android-armv7-cpu.yml deleted file mode 100644 index e7e0a56e4fe..00000000000 --- a/.github/workflows/android-armv7-cpu.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: android-armv7-cpu -on: - push: - branches: [master] - paths: - - '.github/workflows/android-armv7-cpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/arm/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/android-armv7-cpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/arm/**' -concurrency: - group: android-armv7-cpu-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - android-armv7: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 .. - cmake --build . -j 2 - - name: build-shared - run: | - mkdir build-shared && cd build-shared - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 -DNCNN_SHARED_LIB=ON -DNCNN_ENABLE_LTO=OFF .. - cmake --build . -j 2 - - - name: ndk-r16b - run: | - wget https://dl.google.com/android/repository/android-ndk-r16b-linux-x86_64.zip -O $GITHUB_WORKSPACE/android-ndk-r16b-linux-x86_64.zip - cd $GITHUB_WORKSPACE && unzip -q android-ndk-r16b-linux-x86_64.zip - - name: build-noneon - run: | - mkdir build-noneon && cd build-noneon - cmake -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/android-ndk-r16b/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=OFF -DANDROID_PLATFORM=android-14 .. - cmake --build . -j 2 - - name: build-noneon-shared - run: | - mkdir build-noneon-shared && cd build-noneon-shared - cmake -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/android-ndk-r16b/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=OFF -DANDROID_PLATFORM=android-14 -DNCNN_SHARED_LIB=ON -DNCNN_ENABLE_LTO=OFF .. - cmake --build . 
-j 2 diff --git a/.github/workflows/android-armv7-gpu.yml b/.github/workflows/android-armv7-gpu.yml deleted file mode 100644 index f41ff60e5e8..00000000000 --- a/.github/workflows/android-armv7-gpu.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: android-armv7-gpu -on: - push: - branches: [master] - paths: - - '.github/workflows/android-armv7-gpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/arm/**' - - 'src/layer/vulkan/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/android-armv7-gpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/arm/**' - - 'src/layer/vulkan/**' -concurrency: - group: android-armv7-gpu-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - android-armv7-gpu: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON .. - cmake --build . -j 2 - - name: build-shared - run: | - mkdir build-shared && cd build-shared - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON -DNCNN_ENABLE_LTO=OFF .. - cmake --build . -j 2 diff --git a/.github/workflows/android-armv8-cpu.yml b/.github/workflows/android-armv8-cpu.yml deleted file mode 100644 index b32cbc43b97..00000000000 --- a/.github/workflows/android-armv8-cpu.yml +++ /dev/null @@ -1,41 +0,0 @@ -name: android-armv8-cpu -on: - push: - branches: [master] - paths: - - '.github/workflows/android-armv8-cpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/arm/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/android-armv8-cpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/arm/**' -concurrency: - group: android-armv8-cpu-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - android-aarch64: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 .. - cmake --build . -j 2 - - name: build-shared - run: | - mkdir build-shared && cd build-shared - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_SHARED_LIB=ON .. - cmake --build . 
-j 2 diff --git a/.github/workflows/android-armv8-gpu.yml b/.github/workflows/android-armv8-gpu.yml deleted file mode 100644 index ba54b268a51..00000000000 --- a/.github/workflows/android-armv8-gpu.yml +++ /dev/null @@ -1,75 +0,0 @@ -name: android-armv8-gpu -on: - push: - branches: [master] - paths: - - '.github/workflows/android-armv8-gpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/arm/**' - - 'src/layer/vulkan/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/android-armv8-gpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/arm/**' - - 'src/layer/vulkan/**' -concurrency: - group: android-armv8-gpu-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - android-aarch64-gpu: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON .. - cmake --build . -j 2 - - name: build-shared - run: | - mkdir build-shared && cd build-shared - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON .. - cmake --build . -j 2 - - name: build-termux - run: | - mkdir build-termux && cd build-termux - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_PLATFORM_API=OFF .. - cmake --build . -j 2 - - name: build-android-29 - run: | - mkdir build-android-29 && cd build-android-29 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-29 -DNCNN_VULKAN=ON .. - cmake --build . -j 2 - - name: build-android-29-shared - run: | - mkdir build-android-29-shared && cd build-android-29-shared - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-29 -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON .. - cmake --build . -j 2 - - android-aarch64-gpu-ndk-r16b: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: ndk-r16b - run: | - wget https://dl.google.com/android/repository/android-ndk-r16b-linux-x86_64.zip -O $GITHUB_WORKSPACE/android-ndk-r16b-linux-x86_64.zip - cd $GITHUB_WORKSPACE && unzip -q android-ndk-r16b-linux-x86_64.zip - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/android-ndk-r16b/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON .. 
- - name: build - run: cmake --build build -j 2 diff --git a/.github/workflows/android-x64-cpu.yml b/.github/workflows/android-x64-cpu.yml deleted file mode 100644 index 5b8b65b2bab..00000000000 --- a/.github/workflows/android-x64-cpu.yml +++ /dev/null @@ -1,41 +0,0 @@ -name: android-x64-cpu -on: - push: - branches: [master] - paths: - - '.github/workflows/android-x64-cpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/android-x64-cpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' -concurrency: - group: android-x64-cpu-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - android-x86_64: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 .. - cmake --build . -j 2 - - name: build-shared - run: | - mkdir build-shared && cd build-shared - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 -DNCNN_SHARED_LIB=ON .. - cmake --build . -j 2 diff --git a/.github/workflows/android-x64-gpu.yml b/.github/workflows/android-x64-gpu.yml deleted file mode 100644 index 65770bdc8e9..00000000000 --- a/.github/workflows/android-x64-gpu.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: android-x64-gpu -on: - push: - branches: [master] - paths: - - '.github/workflows/android-x64-gpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'src/layer/vulkan/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/android-x64-gpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'src/layer/vulkan/**' -concurrency: - group: android-x64-gpu-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - android-x86_64-gpu: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON .. - cmake --build . -j 2 - - name: build-shared - run: | - mkdir build-shared && cd build-shared - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON .. - cmake --build . 
-j 2 diff --git a/.github/workflows/android-x86-cpu.yml b/.github/workflows/android-x86-cpu.yml deleted file mode 100644 index 28938d6046f..00000000000 --- a/.github/workflows/android-x86-cpu.yml +++ /dev/null @@ -1,41 +0,0 @@ -name: android-x86-cpu -on: - push: - branches: [master] - paths: - - '.github/workflows/android-x86-cpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/android-x86-cpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' -concurrency: - group: android-x86-cpu-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - android-x86: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 .. - cmake --build . -j 2 - - name: build-shared - run: | - mkdir build-shared && cd build-shared - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 -DNCNN_SHARED_LIB=ON .. - cmake --build . -j 2 diff --git a/.github/workflows/android-x86-gpu.yml b/.github/workflows/android-x86-gpu.yml deleted file mode 100644 index ec46fa00c1c..00000000000 --- a/.github/workflows/android-x86-gpu.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: android-x86-gpu -on: - push: - branches: [master] - paths: - - '.github/workflows/android-x86-gpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'src/layer/vulkan/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/android-x86-gpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'src/layer/vulkan/**' -concurrency: - group: android-x86-gpu-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - android-x86-gpu: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON .. - cmake --build . -j 2 - - name: build-shared - run: | - mkdir build-shared && cd build-shared - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON .. - cmake --build . 
-j 2 diff --git a/.github/workflows/code-format.yml b/.github/workflows/code-format.yml deleted file mode 100644 index e65a7996994..00000000000 --- a/.github/workflows/code-format.yml +++ /dev/null @@ -1,61 +0,0 @@ -name: code-format - -on: [push, pull_request, pull_request_target] - -concurrency: - group: code-format-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - code-format: - permissions: - contents: write # for stefanzweifel/git-auto-commit-action to push code in repo - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: cache-clang-format - id: cache-clang-format - uses: actions/cache@v3 - with: - path: clang-format-install - key: clang-format-install-4 - - name: clang-format - if: steps.cache-clang-format.outputs.cache-hit != 'true' - run: | - wget https://github.com/llvm/llvm-project/releases/download/llvmorg-10.0.1/llvm-project-10.0.1.tar.xz - tar -xf llvm-project-10.0.1.tar.xz - cd llvm-project-10.0.1 - mkdir build - cd build - cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_TARGETS_TO_BUILD="" -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_DOCS=OFF ../llvm/ - make -j2 clang-format - mkdir $GITHUB_WORKSPACE/clang-format-install - cp -r bin/clang-format $GITHUB_WORKSPACE/clang-format-install - cd ../../ - rm -rf llvm-project-10.0.1 - rm llvm-project-10.0.1.tar.xz - - - name: astyle - run: | - sudo apt-get update - sudo apt-get install -y astyle - - - name: code-format - run: | - mkdir -p ~/bin - mv $GITHUB_WORKSPACE/clang-format-install/clang-format ~/bin/clang-format - rm -rf $GITHUB_WORKSPACE/clang-format-install - export PATH=~/bin:$PATH - sh codeformat.sh - - uses: stefanzweifel/git-auto-commit-action@v4 - with: - commit_message: apply code-format changes - - - name: restore-clang-format-cache - run: | - mkdir $GITHUB_WORKSPACE/clang-format-install - cp -r ~/bin/clang-format $GITHUB_WORKSPACE/clang-format-install diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml deleted file mode 100644 index fe43e581d32..00000000000 --- a/.github/workflows/codeql-analysis.yml +++ /dev/null @@ -1,84 +0,0 @@ -# For most projects, this workflow file will not need changing; you simply need -# to commit it to your repository. -# -# You may wish to alter this file to override the set of languages analyzed, -# or to provide custom queries or build logic. -name: "CodeQL" - -on: - push: - branches: [master] - paths-ignore: ['**.md'] - pull_request: - # The branches below must be a subset of the branches above - branches: [master] - paths-ignore: ['**.md'] - schedule: - - cron: '0 20 * * 4' - -concurrency: - group: CodeQL-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - analyze: - permissions: - actions: read # for github/codeql-action/init to get workflow details - contents: read # for actions/checkout to fetch code - security-events: write # for github/codeql-action/autobuild to send a status report - name: Analyze - runs-on: ubuntu-latest - - strategy: - fail-fast: false - matrix: - # Override automatic language detection by changing the below list - # Supported options are ['csharp', 'cpp', 'go', 'java', 'javascript', 'python'] - language: ['cpp'] - # Learn more... 
- # https://docs.github.com/en/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#overriding-automatic-language-detection - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - # We must fetch at least the immediate parents so that if this is - # a pull request then we can checkout the head. - fetch-depth: 2 - - # If this run was triggered by a pull request event, then checkout - # the head of the pull request instead of the merge commit. - - run: git checkout HEAD^2 - if: ${{ github.event_name == 'pull_request' }} - - # Initializes the CodeQL tools for scanning. - - name: Initialize CodeQL - uses: github/codeql-action/init@v2 - with: - languages: ${{ matrix.language }} - # If you wish to specify custom queries, you can do so here or in a config file. - # By default, queries listed here will override any specified in a config file. - # Prefix the list here with "+" to use these queries and those in the config file. - # queries: ./path/to/local/query, your-org/your-repo/queries@main - - # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). - # If this step fails, then you should remove it and run the build manually (see below) - - name: Autobuild - uses: github/codeql-action/autobuild@v2 - - # ℹ️ Command-line programs to run using the OS shell. - # 📚 https://git.io/JvXDl - - # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines - # and modify them (or add more) to build your code if your project - # uses a compiled language - - #- run: | - # make bootstrap - # make release - - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 diff --git a/.github/workflows/elf-riscv32-cpu-gcc.yml b/.github/workflows/elf-riscv32-cpu-gcc.yml deleted file mode 100644 index 87a9477449a..00000000000 --- a/.github/workflows/elf-riscv32-cpu-gcc.yml +++ /dev/null @@ -1,126 +0,0 @@ -name: elf-riscv32-cpu-gcc -on: - push: - branches: [master] - paths: - - '.github/workflows/elf-riscv32-cpu-gcc.yml' - - 'toolchains/riscv32-unknown-elf.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/riscv/**' - - 'tests/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/elf-riscv32-cpu-gcc.yml' - - 'toolchains/riscv32-unknown-elf.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/riscv/**' - - 'tests/**' -concurrency: - group: elf-riscv32-cpu-gcc-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - newlib-rv32imc-gcc: - runs-on: [self-hosted, linux, centos] - steps: - - uses: actions/checkout@v3 - - #- name: cache-riscv - #id: cache-riscv - #uses: actions/cache@v3 - #with: - #path: rv32imc-install - #key: rv32imc-newlib-install-20210425 - - #- name: install-riscv-build-deps - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #sudo apt-get update - #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc zlib1g-dev libexpat-dev device-tree-compiler - - #- name: checkout-riscv-gnu-toolchain - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #uses: actions/checkout@v3 - #with: - #repository: riscv/riscv-gnu-toolchain - #path: riscv-gnu-toolchain - #ref: b715e4f01b43efef487166f75d5d85d3c33fa7ef - #- name: checkout-riscv-gnu-toolchain-submodules - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #cd 
riscv-gnu-toolchain - #git submodule update --init --recursive --depth 1 riscv-binutils - #git submodule update --init --recursive --depth 1 riscv-gcc - #git submodule update --init --recursive --depth 1 riscv-glibc - #git submodule update --init --recursive --depth 1 riscv-dejagnu - #git submodule update --init --recursive --depth 1 riscv-newlib - #git submodule update --init --recursive --depth 1 riscv-gdb - #- name: riscv-gnu-toolchain - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #cd riscv-gnu-toolchain - #./configure --prefix=$GITHUB_WORKSPACE/rv32imc-install --with-arch=rv32imc - #make -j2 - - #- name: checkout-riscv-pk - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #uses: actions/checkout@v3 - #with: - #repository: riscv/riscv-pk - #path: riscv-pk - #ref: ef7bebaf9bf24d3e90bcaae96387ce418e136b6d - #- name: riscv-pk - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #cd riscv-pk - #mkdir build - #cd build - #export PATH=$GITHUB_WORKSPACE/rv32imc-install/bin:$PATH - #../configure --prefix=$GITHUB_WORKSPACE/rv32imc-install --with-arch=rv32imc --host=riscv32-unknown-elf - #make -j2 - #make install - - #- name: checkout-riscv-isa-sim - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #uses: actions/checkout@v3 - #with: - #repository: riscv/riscv-isa-sim - #path: riscv-isa-sim - #ref: 9d4f45c2ebf105503974fc80a42590ca1584c354 - #- name: riscv-isa-sim - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #cd riscv-isa-sim - #mkdir build - #cd build - #export PATH=$GITHUB_WORKSPACE/rv32imc-install/bin:$PATH - #../configure --prefix=$GITHUB_WORKSPACE/rv32imc-install --with-isa=rv32imc - #make -j2 - #make install - - #- name: riscv-strip-install - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: find $GITHUB_WORKSPACE/rv32imc-install -type f | xargs -i strip -g {} || true - - - name: configure - run: export RISCV_ROOT_PATH=/data/action/osd/rv32imc-install && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv32-unknown-elf.toolchain.cmake -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- - name: build - run: cmake --build build -j 4 - - # too slow for softfloat arch :( - #- name: test - #run: | - #sudo apt-get update - #sudo apt-get install device-tree-compiler - #export PATH=/data/action/osd/rv32imc-install/bin:$PATH - #cd build - #TESTS_EXECUTABLE_LOADER=spike TESTS_EXECUTABLE_LOADER_ARGUMENTS=/data/action/osd/rv32imc-install/riscv32-unknown-elf/bin/pk ctest --output-on-failure -j 2 diff --git a/.github/workflows/elf-riscv64-cpu-gcc.yml b/.github/workflows/elf-riscv64-cpu-gcc.yml deleted file mode 100644 index fbf2c0227a1..00000000000 --- a/.github/workflows/elf-riscv64-cpu-gcc.yml +++ /dev/null @@ -1,123 +0,0 @@ -name: elf-riscv64-cpu-gcc -on: - push: - branches: [master] - paths: - - '.github/workflows/elf-riscv64-cpu-gcc.yml' - - 'toolchains/riscv64-unknown-elf.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/riscv/**' - - 'tests/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/elf-riscv64-cpu-gcc.yml' - - 'toolchains/riscv64-unknown-elf.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/riscv/**' - - 'tests/**' -concurrency: - group: elf-riscv64-cpu-gcc-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - newlib-rv64gc-gcc: - runs-on: [self-hosted, linux, centos] - steps: - - uses: actions/checkout@v3 - - #- name: cache-riscv - #id: cache-riscv - #uses: actions/cache@v3 - #with: - #path: rv64gc-install - #key: rv64gc-newlib-install-20210425 - - #- name: install-riscv-build-deps - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #sudo apt-get update - #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc zlib1g-dev libexpat-dev device-tree-compiler - - #- name: checkout-riscv-gnu-toolchain - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #uses: actions/checkout@v3 - #with: - #repository: riscv/riscv-gnu-toolchain - #path: riscv-gnu-toolchain - #ref: b715e4f01b43efef487166f75d5d85d3c33fa7ef - #- name: checkout-riscv-gnu-toolchain-submodules - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #cd riscv-gnu-toolchain - #git submodule update --init --recursive --depth 1 riscv-binutils - #git submodule update --init --recursive --depth 1 riscv-gcc - #git submodule update --init --recursive --depth 1 riscv-glibc - #git submodule update --init --recursive --depth 1 riscv-dejagnu - #git submodule update --init --recursive --depth 1 riscv-newlib - #git submodule update --init --recursive --depth 1 riscv-gdb - #- name: riscv-gnu-toolchain - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #cd riscv-gnu-toolchain - #sed -i '/__OBSOLETE_MATH/d' riscv-newlib/newlib/libm/common/math_errf.c - #./configure --prefix=$GITHUB_WORKSPACE/rv64gc-install --with-arch=rv64gc - #make -j2 - - #- name: checkout-riscv-pk - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #uses: actions/checkout@v3 - #with: - #repository: riscv/riscv-pk - #path: riscv-pk - #ref: ef7bebaf9bf24d3e90bcaae96387ce418e136b6d - #- name: riscv-pk - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #cd riscv-pk - #mkdir build - #cd build - #export PATH=$GITHUB_WORKSPACE/rv64gc-install/bin:$PATH - #../configure --prefix=$GITHUB_WORKSPACE/rv64gc-install --with-arch=rv64gc --host=riscv64-unknown-elf - #make -j2 - #make install - - #- name: checkout-riscv-isa-sim - #if: 
steps.cache-riscv.outputs.cache-hit != 'true' - #uses: actions/checkout@v3 - #with: - #repository: riscv/riscv-isa-sim - #path: riscv-isa-sim - #ref: 9d4f45c2ebf105503974fc80a42590ca1584c354 - #- name: riscv-isa-sim - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #cd riscv-isa-sim - #mkdir build - #cd build - #export PATH=$GITHUB_WORKSPACE/rv64gc-install/bin:$PATH - #../configure --prefix=$GITHUB_WORKSPACE/rv64gc-install --with-isa=rv64gc - #make -j2 - #make install - - #- name: riscv-strip-install - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: find $GITHUB_WORKSPACE/rv64gc-install -type f | xargs -i strip -g {} || true - - - name: configure - run: export RISCV_ROOT_PATH=/data/action/osd/rv64gc-install && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-elf.toolchain.cmake -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - - name: build - run: cmake --build build -j 4 - - name: test - run: | - export PATH=/data/action/osd/rv64gc-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=spike TESTS_EXECUTABLE_LOADER_ARGUMENTS=/data/action/osd/rv64gc-install/riscv64-unknown-elf/bin/pk ctest --output-on-failure -j 4 diff --git a/.github/workflows/ios-arm64-gpu.yml b/.github/workflows/ios-arm64-gpu.yml deleted file mode 100644 index d22136d02b9..00000000000 --- a/.github/workflows/ios-arm64-gpu.yml +++ /dev/null @@ -1,86 +0,0 @@ -name: ios-arm64-gpu -on: - push: - branches: [master] - paths: - - '.github/workflows/ios-arm64-gpu.yml' - - 'toolchains/ios.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/arm/**' - - 'src/layer/vulkan/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/ios-arm64-gpu.yml' - - 'toolchains/ios.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/arm/**' - - 'src/layer/vulkan/**' -concurrency: - group: ios-arm64-gpu-${{ github.ref }} - cancel-in-progress: true -env: - DEVELOPER_DIR: /Applications/Xcode_14.0.1.app/Contents/Developer -permissions: - contents: read - -jobs: - ios-iphone-os-gpu: - runs-on: macos-latest - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: cache-openmp - id: cache-openmp - uses: actions/cache@v3 - with: - path: openmp-install - key: openmp-ios-install-20201213-4 - - name: openmp - if: steps.cache-openmp.outputs.cache-hit != 'true' - run: | - wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz - tar -xf openmp-11.0.0.src.tar.xz - cd openmp-11.0.0.src - sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S - sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S - mkdir -p build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \ - -DIOS_PLATFORM=OS -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="armv7;arm64;arm64e" \ - -DPERL_EXECUTABLE=/usr/local/bin/perl \ - -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. - cmake --build . -j 3 - cmake --build . 
--target install - mkdir $GITHUB_WORKSPACE/openmp-install - cp -r install/* $GITHUB_WORKSPACE/openmp-install - - name: install-openmp - run: | - sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include - sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib - - name: vulkansdk - run: | - wget https://sdk.lunarg.com/sdk/download/1.3.236.0/mac/vulkansdk-macos-1.3.236.0.dmg?Human=true -O vulkansdk-macos-1.3.236.0.dmg - hdiutil attach vulkansdk-macos-1.3.236.0.dmg - sudo /Volumes/vulkansdk-macos-1.3.236.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0 --accept-licenses --default-answer --confirm-command install - hdiutil detach /Volumes/vulkansdk-macos-1.3.236.0 - - name: configure - run: | - export VULKAN_SDK=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/macOS - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=OS64 -DIOS_ARCH="arm64;arm64e" \ - -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \ - -DVulkan_INCLUDE_DIR=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/include \ - -DVulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/dylib/iOS/libMoltenVK.dylib \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - - name: build - run: cmake --build build -j 3 diff --git a/.github/workflows/ios-cpu.yml b/.github/workflows/ios-cpu.yml deleted file mode 100644 index 515bc256df2..00000000000 --- a/.github/workflows/ios-cpu.yml +++ /dev/null @@ -1,82 +0,0 @@ -name: ios-cpu -on: - push: - branches: [master] - paths: - - '.github/workflows/ios-cpu.yml' - - 'toolchains/ios.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/arm/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/ios-cpu.yml' - - 'toolchains/ios.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/arm/**' -concurrency: - group: ios-cpu-${{ github.ref }} - cancel-in-progress: true -env: - DEVELOPER_DIR: /Applications/Xcode_14.0.1.app/Contents/Developer -permissions: - contents: read - -jobs: - ios-iphone-os: - runs-on: macos-latest - steps: - - uses: actions/checkout@v3 - - name: cache-openmp - id: cache-openmp - uses: actions/cache@v3 - with: - path: openmp-install - key: openmp-ios-install-20201213-4 - - name: openmp - if: steps.cache-openmp.outputs.cache-hit != 'true' - run: | - wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz - tar -xf openmp-11.0.0.src.tar.xz - cd openmp-11.0.0.src - sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S - sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S - mkdir -p build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \ - -DIOS_PLATFORM=OS -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="armv7;arm64;arm64e" \ - -DPERL_EXECUTABLE=/usr/local/bin/perl \ - -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF 
.. - cmake --build . -j 3 - cmake --build . --target install - mkdir $GITHUB_WORKSPACE/openmp-install - cp -r install/* $GITHUB_WORKSPACE/openmp-install - - name: install-openmp - run: | - sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include - sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib - - name: build-armv7 - run: | - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=OS -DIOS_ARCH="armv7" \ - -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \ - .. - cmake --build . -j 3 - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=OS -DIOS_ARCH="arm64;arm64e" \ - -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \ - .. - cmake --build . -j 3 diff --git a/.github/workflows/ios-simulator.yml b/.github/workflows/ios-simulator.yml deleted file mode 100644 index 0aa2de45305..00000000000 --- a/.github/workflows/ios-simulator.yml +++ /dev/null @@ -1,73 +0,0 @@ -name: ios-simulator -on: - push: - branches: [master] - paths: - - '.github/workflows/ios-simulator.yml' - - 'toolchains/ios.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/ios-simulator.yml' - - 'toolchains/ios.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' -concurrency: - group: ios-simulator-${{ github.ref }} - cancel-in-progress: true -env: - DEVELOPER_DIR: /Applications/Xcode_14.0.1.app/Contents/Developer -permissions: - contents: read - -jobs: - ios-iphone-simulator: - runs-on: macos-latest - steps: - - uses: actions/checkout@v3 - - name: cache-openmp - id: cache-openmp - uses: actions/cache@v3 - with: - path: openmp-install - key: openmp-ios-simulator-install-20201213 - - name: openmp - if: steps.cache-openmp.outputs.cache-hit != 'true' - run: | - wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz - tar -xf openmp-11.0.0.src.tar.xz - cd openmp-11.0.0.src - sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S - sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S - mkdir -p build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \ - -DIOS_PLATFORM=SIMULATOR -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="i386;x86_64" \ - -DPERL_EXECUTABLE=/usr/local/bin/perl \ - -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. - cmake --build . -j 3 - cmake --build . 
--target install - mkdir $GITHUB_WORKSPACE/openmp-install - cp -r install/* $GITHUB_WORKSPACE/openmp-install - - name: install-openmp - run: | - sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/include - sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib - - name: configure - run: | - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=SIMULATOR -DIOS_ARCH="i386;x86_64" \ - -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib/libomp.a" \ - .. - - name: build - run: cmake --build build -j 3 diff --git a/.github/workflows/linux-aarch64-cpu-gcc.yml b/.github/workflows/linux-aarch64-cpu-gcc.yml deleted file mode 100644 index 4940421d362..00000000000 --- a/.github/workflows/linux-aarch64-cpu-gcc.yml +++ /dev/null @@ -1,193 +0,0 @@ -name: linux-aarch64-cpu-gcc -on: - push: - branches: [master] - paths: - - '.github/workflows/linux-aarch64-cpu-gcc.yml' - - 'toolchains/aarch64-linux-gnu.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/arm/**' - - 'tests/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/linux-aarch64-cpu-gcc.yml' - - 'toolchains/aarch64-linux-gnu.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/arm/**' - - 'tests/**' -concurrency: - group: linux-aarch64-cpu-gcc-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - linux-gcc: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-aarch64-install-20220502-2 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system - make -j2 - make install - - - name: aarch64-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-aarch64-linux-gnu - - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=OFF -DNCNN_ARM82DOT=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . 
-j 2 - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2 - - - name: build-noint8 - run: | - mkdir build-noint8 && cd build-noint8 - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=OFF -DNCNN_ARM82DOT=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 2 - - name: test-noint8 - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build-noint8 - TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2 - - linux-gcc-arm82: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-aarch64-install-20220502-2 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system - make -j2 - make install - - - name: aarch64-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-aarch64-linux-gnu - - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=ON -DNCNN_ARM82DOT=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 2 - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2 - - - name: build-noint8 - run: | - mkdir build-noint8 && cd build-noint8 - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=ON -DNCNN_ARM82DOT=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF .. - cmake --build . 
-j 2 - - name: test-noint8 - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build-noint8 - TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2 - - linux-gcc-arm86: - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v3 - - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-aarch64-install-20220502-2 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system - make -j2 - make install - - - name: aarch64-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-aarch64-linux-gnu - - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=ON -DNCNN_ARM82DOT=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 2 - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2 diff --git a/.github/workflows/linux-arm-cpu-gcc.yml b/.github/workflows/linux-arm-cpu-gcc.yml deleted file mode 100644 index 72c075d4413..00000000000 --- a/.github/workflows/linux-arm-cpu-gcc.yml +++ /dev/null @@ -1,206 +0,0 @@ -name: linux-arm-cpu-gcc -on: - push: - branches: [master] - paths: - - '.github/workflows/linux-arm-cpu-gcc.yml' - - 'toolchains/arm-linux-gnueabi.toolchain.cmake' - - 'toolchains/arm-linux-gnueabihf.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/arm/**' - - 'tests/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/linux-arm-cpu-gcc.yml' - - 'toolchains/arm-linux-gnueabi.toolchain.cmake' - - 'toolchains/arm-linux-gnueabihf.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/arm/**' - - 'tests/**' -concurrency: - group: linux-arm-cpu-gcc-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - linux-gcc-arm: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-arm-install-20220502-2 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=arm-linux-user --disable-system - make -j2 - make install - - - name: arm-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-arm-linux-gnueabi - - - name: build - run: 
| - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 2 - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j 2 - - - name: build-noint8 - run: | - mkdir build-noint8 && cd build-noint8 - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 2 - - name: test-noint8 - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build-noint8 - TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j 2 - - linux-gcc-armhf: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-arm-install-20220502-2 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=arm-linux-user --disable-system - make -j2 - make install - - - name: arm-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-arm-linux-gnueabihf - - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 2 - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j 2 - - - name: build-noint8 - run: | - mkdir build-noint8 && cd build-noint8 - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF .. - cmake --build . 
-j 2 - - name: test-noint8 - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build-noint8 - TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j 2 - - linux-gcc-armhf-vfpv3-d16: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-arm-install-20220502-2 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=arm-linux-user --disable-system - make -j2 - make install - - - name: arm-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-arm-linux-gnueabihf - - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 2 - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j 2 - - - name: build-noint8 - run: | - mkdir build-noint8 && cd build-noint8 - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF .. - cmake --build . 
-j 2 - - name: test-noint8 - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build-noint8 - TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j 2 diff --git a/.github/workflows/linux-loongarch64-cpu-gcc.yml b/.github/workflows/linux-loongarch64-cpu-gcc.yml deleted file mode 100644 index fbdbee1dc21..00000000000 --- a/.github/workflows/linux-loongarch64-cpu-gcc.yml +++ /dev/null @@ -1,78 +0,0 @@ -name: linux-loongarch64-cpu-gcc -on: - push: - branches: [master] - paths: - - '.github/workflows/linux-loongarch64-cpu-gcc.yml' - - 'toolchains/loongarch64-linux-gnu.toolchain.cmake' - - 'toolchains/loongarch64-unknown-linux-gnu.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/loongarch/**' - - 'tests/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/linux-loongarch64-cpu-gcc.yml' - - 'toolchains/loongarch64-linux-gnu.toolchain.cmake' - - 'toolchains/loongarch64-unknown-linux-gnu.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/loongarch/**' - - 'tests/**' -concurrency: - group: linux-loongarch64-cpu-gcc-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - linux-gcc-loongarch64: - runs-on: [self-hosted, linux, centos] - - steps: - - uses: actions/checkout@v3 - - - name: loongarch64-toolchain - run: | - wget https://github.com/loongson/build-tools/releases/download/2022.05.29/loongarch64-clfs-5.0-cross-tools-gcc-full.tar.xz - tar -xf loongarch64-clfs-5.0-cross-tools-gcc-full.tar.xz - wget https://github.com/loongson/build-tools/releases/download/2022.05.29/qemu-loongarch64 - chmod +x qemu-loongarch64 - - - name: configure - run: | - export LOONGARCH64_ROOT_PATH=$GITHUB_WORKSPACE/cross-tools - export LD_LIBRARY_PATH=$GITHUB_WORKSPACE/cross-tools/target/usr/lib64:$LD_LIBRARY_PATH - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/loongarch64-unknown-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - - - name: build - run: cmake --build build -j 4 - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE:$PATH - export LD_LIBRARY_PATH=$GITHUB_WORKSPACE/cross-tools/target/usr/lib64:$LD_LIBRARY_PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-loongarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;$GITHUB_WORKSPACE/cross-tools/target/usr" ctest --output-on-failure -j 4 - - linux-gcc-loongarch64-lsx: - runs-on: [self-hosted, linux, centos] - - steps: - - uses: actions/checkout@v3 - - - name: configure - run: | - export LOONGARCH64_ROOT_PATH=/data/action/osd/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.1 - export LD_LIBRARY_PATH=/data/action/osd/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.1/sysroot/usr/lib64:$LD_LIBRARY_PATH - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/loongarch64-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- - - name: build - run: cmake --build build -j 4 diff --git a/.github/workflows/linux-mips-cpu-gcc.yml b/.github/workflows/linux-mips-cpu-gcc.yml deleted file mode 100644 index ba41d4b0293..00000000000 --- a/.github/workflows/linux-mips-cpu-gcc.yml +++ /dev/null @@ -1,126 +0,0 @@ -name: linux-mips-cpu-gcc -on: - push: - branches: [master] - paths: - - '.github/workflows/linux-mips-cpu-gcc.yml' - - 'toolchains/mipsel-linux-gnu.toolchain.cmake' - - 'toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/mips/**' - - 'tests/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/linux-mips-cpu-gcc.yml' - - 'toolchains/mipsel-linux-gnu.toolchain.cmake' - - 'toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/mips/**' - - 'tests/**' -concurrency: - group: linux-mips-cpu-gcc-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - linux-gcc-mipsel: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-mipsel-install-20220502-2 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mipsel-linux-user --disable-system - make -j2 - make install - - - name: mipsel-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-mipsel-linux-gnu - - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsel-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- - name: build - run: cmake --build build -j 2 - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-mipsel TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsel-linux-gnu" ctest --output-on-failure -j 2 - - linux-gcc-mipsisa32r6el: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-mipsel-install-20220502-2 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mipsel-linux-user --disable-system - make -j2 - make install - - - name: mipsisa32r6el-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-mipsisa32r6el-linux-gnu - - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - - name: build - run: cmake --build build -j 2 - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-mipsel TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa32r6el-linux-gnu" ctest --output-on-failure -j 2 diff --git a/.github/workflows/linux-mips64-cpu-gcc.yml b/.github/workflows/linux-mips64-cpu-gcc.yml deleted file mode 100644 index 3d0fe1229ca..00000000000 --- a/.github/workflows/linux-mips64-cpu-gcc.yml +++ /dev/null @@ -1,138 +0,0 @@ -name: linux-mips64-cpu-gcc -on: - push: - branches: [master] - paths: - - '.github/workflows/linux-mips64-cpu-gcc.yml' - - 'toolchains/mips64el-linux-gnuabi64.toolchain.cmake' - - 'toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/mips/**' - - 'tests/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/linux-mips64-cpu-gcc.yml' - - 'toolchains/mips64el-linux-gnuabi64.toolchain.cmake' - - 'toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/mips/**' - - 'tests/**' -concurrency: - group: linux-mips64-cpu-gcc-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - linux-gcc-mips64el: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-mips64el-install-20220502-2 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mips64el-linux-user --disable-system - make -j2 - make install - - - name: 
mips64el-gnuabi64-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-mips64el-linux-gnuabi64 - - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mips64el-linux-gnuabi64.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - - name: build - run: cmake --build build -j 2 - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-mips64el TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mips64el-linux-gnuabi64" ctest --output-on-failure -j 2 - - linux-gcc-mipsisa64r6el: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-mips64el-install-20220502-4 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0001-target-mips-Fix-SAT_S-trans-helper.patch - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0002-target-mips-Fix-df_extract_val-and-df_extract_df-dfe.patch - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0003-target-mips-Fix-msa-checking-condition-in-trans_msa_.patch - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0004-target-mips-Do-not-treat-msa-INSERT-as-NOP-when-wd-i.patch - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0005-target-mips-Fix-FTRUNC_S-and-FTRUNC_U-trans-helper.patch - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0006-target-mips-Fix-store-adress-of-high-64bit-in-helper.patch - patch -p1 -i 0001-target-mips-Fix-SAT_S-trans-helper.patch - patch -p1 -i 0002-target-mips-Fix-df_extract_val-and-df_extract_df-dfe.patch - patch -p1 -i 0003-target-mips-Fix-msa-checking-condition-in-trans_msa_.patch - patch -p1 -i 0004-target-mips-Do-not-treat-msa-INSERT-as-NOP-when-wd-i.patch - patch -p1 -i 0005-target-mips-Fix-FTRUNC_S-and-FTRUNC_U-trans-helper.patch - patch -p1 -i 0006-target-mips-Fix-store-adress-of-high-64bit-in-helper.patch - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mips64el-linux-user --disable-system - make -j2 - make install - - - name: mipsisa64r6el-gnuabi64-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-mipsisa64r6el-linux-gnuabi64 - - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- - name: build - run: cmake --build build -j 2 - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-mips64el TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa64r6el-linux-gnuabi64" ctest --output-on-failure -j 2 diff --git a/.github/workflows/linux-ppc64-cpu-gcc.yml b/.github/workflows/linux-ppc64-cpu-gcc.yml deleted file mode 100644 index a4ae93a6172..00000000000 --- a/.github/workflows/linux-ppc64-cpu-gcc.yml +++ /dev/null @@ -1,75 +0,0 @@ -name: linux-ppc64-cpu-gcc -on: - push: - branches: [master] - paths: - - '.github/workflows/linux-ppc64-cpu-gcc.yml' - - 'toolchains/powerpc64le-linux-gnu.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'tests/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/linux-ppc64-cpu-gcc.yml' - - 'toolchains/powerpc64le-linux-gnu.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'tests/**' -concurrency: - group: linux-ppc64-cpu-gcc-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - linux-gcc-ppc64le: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-ppc64le-install-20220502-2 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=ppc64le-linux-user --disable-system - make -j2 - make install - - - name: powerpc64le-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-powerpc64le-linux-gnu - - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/powerpc64le-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- - name: build - run: cmake --build build -j 2 - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu" ctest --output-on-failure -j 2 diff --git a/.github/workflows/linux-riscv64-cpu-gcc.yml b/.github/workflows/linux-riscv64-cpu-gcc.yml deleted file mode 100644 index 9c5393e4f49..00000000000 --- a/.github/workflows/linux-riscv64-cpu-gcc.yml +++ /dev/null @@ -1,186 +0,0 @@ -name: linux-riscv64-cpu-gcc -on: - push: - branches: [master] - paths: - - '.github/workflows/linux-riscv64-cpu-gcc.yml' - - 'toolchains/riscv64-linux-gnu.toolchain.cmake' - - 'toolchains/riscv64-unknown-linux-gnu.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/riscv/**' - - 'tests/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/linux-riscv64-cpu-gcc.yml' - - 'toolchains/riscv64-linux-gnu.toolchain.cmake' - - 'toolchains/riscv64-unknown-linux-gnu.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/riscv/**' - - 'tests/**' -concurrency: - group: linux-riscv64-cpu-gcc-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - linux-gcc-riscv64: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-riscv64-install-20220502-4 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system - make -j2 - make install - - - name: riscv64-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-riscv64-linux-gnu - - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - - name: build - run: cmake --build build -j 2 - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/riscv64-linux-gnu" ctest --output-on-failure -j 2 - - linux-gcc-riscv64-c906: - runs-on: [self-hosted, linux, centos] - steps: - - uses: actions/checkout@v3 - - - name: configure - run: | - export RISCV_ROOT_PATH=/data/action/osd/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1 - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/c906-v226.toolchain.cmake -DCMAKE_BUILD_TYPE=release -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=ON -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON .. 
- - name: build - run: cmake --build build -j 4 - - linux-gcc-riscv64-rvv: - runs-on: [self-hosted, linux, centos] - steps: - - uses: actions/checkout@v3 - - #- name: cache-qemu - #id: cache-qemu - #uses: actions/cache@v3 - #with: - #path: qemu-install - #key: qemu-riscv64-install-20220502-3 - #- name: install-qemu-build-deps - #if: steps.cache-qemu.outputs.cache-hit != 'true' - #run: | - #sudo apt-get update - #sudo apt-get install autoconf automake autotools-dev ninja-build - #- name: checkout-qemu - #if: steps.cache-qemu.outputs.cache-hit != 'true' - #uses: actions/checkout@v3 - #with: - #repository: qemu/qemu - #path: qemu - #ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - #- name: qemu - #if: steps.cache-qemu.outputs.cache-hit != 'true' - #run: | - #cd qemu - #wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - #patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - #./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system - #make -j2 - #make install - - #- name: cache-riscv - #id: cache-riscv - #uses: actions/cache@v3 - #with: - #path: rv64gcv-install-next - #key: rv64gcv-linux-install-20210504 - - #- name: install-riscv-build-deps - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #sudo apt-get update - #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc zlib1g-dev libexpat-dev device-tree-compiler - - #- name: checkout-riscv-gnu-toolchain - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #uses: actions/checkout@v3 - #with: - #repository: riscv/riscv-gnu-toolchain - #path: riscv-gnu-toolchain - #ref: da01ba455ce3802ffa84fdca3a089079996dbfc3 - #- name: checkout-riscv-gnu-toolchain-submodules - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #cd riscv-gnu-toolchain - #git submodule update --init --recursive --depth 1 glibc - #git submodule update --init --recursive --depth 1 newlib - #git submodule update --init --recursive --depth 1 riscv-binutils - #git submodule update --init --recursive --depth 1 riscv-gcc - #git submodule update --init --recursive --depth 1 riscv-dejagnu - #git submodule update --init --recursive --depth 1 riscv-gdb - #- name: riscv-gnu-toolchain - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #cd riscv-gnu-toolchain - #sed -i '/__OBSOLETE_MATH/d' newlib/newlib/libm/common/math_errf.c - #./configure --prefix=$GITHUB_WORKSPACE/rv64gcv-install-next --with-arch=rv64gcv_zfh - #make linux - - #- name: riscv-strip-install - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: find $GITHUB_WORKSPACE/rv64gcv-install-next -type f | xargs -i strip -g {} || true - - - name: configure - run: export RISCV_ROOT_PATH=/data/action/osd/rv64gcv-install-next && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- - name: build - run: cmake --build build -j 4 - - - name: test-vlen256 - run: | - export PATH=/data/action/osd/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 - - - name: test-vlen128 - run: | - export PATH=/data/action/osd/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 diff --git a/.github/workflows/linux-riscv64-cpu-gnu-clang.yml b/.github/workflows/linux-riscv64-cpu-gnu-clang.yml deleted file mode 100644 index 18ad114efa4..00000000000 --- a/.github/workflows/linux-riscv64-cpu-gnu-clang.yml +++ /dev/null @@ -1,142 +0,0 @@ -name: linux-riscv64-cpu-gnu-clang -on: - push: - branches: [master] - paths: - - '.github/workflows/linux-riscv64-cpu-gnu-clang.yml' - - 'toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/riscv/**' - - 'tests/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/linux-riscv64-cpu-gnu-clang.yml' - - 'toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/riscv/**' - - 'tests/**' -concurrency: - group: linux-riscv64-cpu-gnu-clang-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - linux-gcc-riscv64-rvv: - runs-on: [self-hosted, linux, centos] - steps: - - uses: actions/checkout@v3 - - #- name: cache-qemu - #id: cache-qemu - #uses: actions/cache@v3 - #with: - #path: qemu-install - #key: qemu-riscv64-install-20220502-3 - #- name: install-qemu-build-deps - #if: steps.cache-qemu.outputs.cache-hit != 'true' - #run: | - #sudo apt-get update - #sudo apt-get install autoconf automake autotools-dev ninja-build - #- name: checkout-qemu - #if: steps.cache-qemu.outputs.cache-hit != 'true' - #uses: actions/checkout@v3 - #with: - #repository: qemu/qemu - #path: qemu - #ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - #- name: qemu - #if: steps.cache-qemu.outputs.cache-hit != 'true' - #run: | - #cd qemu - #wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - #patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - #./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system - #make -j2 - #make install - - #- name: cache-riscv - #id: cache-riscv - #uses: actions/cache@v3 - #with: - #path: rv64gcv-install-next - #key: rv64gcv-linux-install-20210504 - - #- name: install-riscv-build-deps - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #sudo apt-get update - #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc zlib1g-dev libexpat-dev device-tree-compiler - - #- name: checkout-riscv-gnu-toolchain - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #uses: actions/checkout@v3 - #with: - #repository: riscv/riscv-gnu-toolchain - #path: riscv-gnu-toolchain - #ref: da01ba455ce3802ffa84fdca3a089079996dbfc3 - #- name: checkout-riscv-gnu-toolchain-submodules - #if: steps.cache-riscv.outputs.cache-hit 
!= 'true' - #run: | - #cd riscv-gnu-toolchain - #git submodule update --init --recursive --depth 1 glibc - #git submodule update --init --recursive --depth 1 newlib - #git submodule update --init --recursive --depth 1 riscv-binutils - #git submodule update --init --recursive --depth 1 riscv-gcc - #git submodule update --init --recursive --depth 1 riscv-dejagnu - #git submodule update --init --recursive --depth 1 riscv-gdb - #- name: riscv-gnu-toolchain - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #cd riscv-gnu-toolchain - #sed -i '/__OBSOLETE_MATH/d' newlib/newlib/libm/common/math_errf.c - #./configure --prefix=$GITHUB_WORKSPACE/rv64gcv-install-next --with-arch=rv64gcv_zfh - #make linux - - #- name: riscv-strip-install - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: find $GITHUB_WORKSPACE/rv64gcv-install-next -type f | xargs -i strip -g {} || true - - # - name: install-clang - # run: | - # wget https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.1/llvm-project-15.0.1.src.tar.xz - # tar -xf llvm-project-15.0.1.src.tar.xz - # cd llvm-project-15.0.1.src - # mkdir build - # cd build - # cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_TARGETS_TO_BUILD="RISCV" -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF ../llvm/ - # make -j16 - # make install - - - name: build - env: - LD_LIBRARY_PATH: /data/action/install/lib64 - run: | - export RISCV_ROOT_PATH=/data/action/osd/rv64gcv-install-next - export PATH=/data/action/osd/llvm-project-15.0.1.src/build/install/bin:$PATH - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . 
-j 4 - - - name: test-vlen256 - env: - LD_LIBRARY_PATH: /data/action/install/lib64 - run: | - export PATH=/data/action/osd/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 - - - name: test-vlen128 - env: - LD_LIBRARY_PATH: /data/action/install/lib64 - run: | - export PATH=/data/action/osd/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 diff --git a/.github/workflows/linux-x64-cpu-clang-python.yml b/.github/workflows/linux-x64-cpu-clang-python.yml deleted file mode 100644 index 91292d2113d..00000000000 --- a/.github/workflows/linux-x64-cpu-clang-python.yml +++ /dev/null @@ -1,68 +0,0 @@ -name: linux-x64-cpu-clang-python -on: - push: - branches: [master] - paths: - - '.github/workflows/linux-x64-cpu-clang-python.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'python/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/linux-x64-cpu-clang-python.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'python/**' -concurrency: - group: linux-x64-cpu-clang-python-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - linux-clang-python: - runs-on: ubuntu-20.04 - strategy: - matrix: - python-version: [3.6, 3.7, 3.8, 3.9] - - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: set up python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pytest setuptools wheel twine - - name: configure - env: - CC: clang - CXX: clang++ - run: mkdir build && cd build && cmake -DNCNN_PYTHON=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - - name: build - run: cmake --build build -j 2 - - name: install python - run: cd python && pip install . 
- - name: test - run: cd python && pytest tests - - name: build and publish - if: startsWith(github.ref, 'refs/tags') - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.TEST_PYPI_API_TOKEN }} - TWINE_REPOSITORY_URL: "https://test.pypi.org/legacy/" - run: | - cd python - python setup.py bdist_wheel - twine upload dist/* diff --git a/.github/workflows/linux-x64-cpu-clang.yml b/.github/workflows/linux-x64-cpu-clang.yml deleted file mode 100644 index 82e17655994..00000000000 --- a/.github/workflows/linux-x64-cpu-clang.yml +++ /dev/null @@ -1,128 +0,0 @@ -name: linux-x64-cpu-clang -on: - push: - branches: [master] - paths: - - '.github/workflows/linux-x64-cpu-clang.yml' - - 'toolchains/host-c.clang.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 'examples/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/linux-x64-cpu-clang.yml' - - 'toolchains/host-c.clang.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 'examples/**' -concurrency: - group: linux-x64-cpu-clang-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - linux-clang: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: update - run: sudo apt-get update - - name: protobuf - run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev - - name: build-sse2 - env: - CC: clang - CXX: clang++ - run: | - mkdir build-sse2 && cd build-sse2 - cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 2 - - name: test-sse2 - run: cd build-sse2 && ctest --output-on-failure -j 2 - - name: build-shared - env: - CC: clang - CXX: clang++ - run: | - mkdir build-shared && cd build-shared - cmake -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON .. - cmake --build . -j 2 - - name: build-avx2 - env: - CC: clang - CXX: clang++ - run: | - mkdir build-avx2 && cd build-avx2 - cmake -DNCNN_AVX2=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 2 - - name: test-avx2 - run: cd build-avx2 && ctest --output-on-failure -j 2 - - name: build-avx - env: - CC: clang - CXX: clang++ - run: | - mkdir build-avx && cd build-avx - cmake -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 2 - - name: test-avx - run: cd build-avx && ctest --output-on-failure -j 2 - - name: build-avx1-2 - env: - CC: clang - CXX: clang++ - run: | - mkdir build-avx1-2 && cd build-avx1-2 - cmake -DNCNN_AVX2=ON -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 2 - - name: test-avx1-2 - run: cd build-avx1-2 && ctest --output-on-failure -j 2 - - name: build-noint8 - env: - CC: clang - CXX: clang++ - run: | - mkdir build-noint8 && cd build-noint8 - cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 2 - - name: test-noint8 - run: cd build-noint8 && ctest --output-on-failure -j 2 - - linux-clang-simplestl: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: build-simplestl - env: - CC: clang - CXX: clang++ - run: | - mkdir build-simplestl && cd build-simplestl - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.clang.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . 
-j 2 - - name: test-simplestl - run: cd build-simplestl && ctest --output-on-failure -j 2 - - name: build-simplestl-simpleomp - env: - CC: clang - CXX: clang++ - run: | - mkdir build-simplestl-simpleomp && cd build-simplestl-simpleomp - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.clang.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j 2 - - name: test-simplestl-simpleomp - run: cd build-simplestl-simpleomp && ctest --output-on-failure -j 2 diff --git a/.github/workflows/linux-x64-cpu-gcc-musl.yml b/.github/workflows/linux-x64-cpu-gcc-musl.yml deleted file mode 100644 index 208ffc4525f..00000000000 --- a/.github/workflows/linux-x64-cpu-gcc-musl.yml +++ /dev/null @@ -1,67 +0,0 @@ -name: linux-x64-cpu-gcc-musl -on: - push: - branches: [master] - paths: - - '.github/workflows/linux-x64-cpu-gcc-musl.yml' - - 'toolchains/host-c.gcc.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 'examples/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/linux-x64-cpu-gcc-musl.yml' - - 'toolchains/host-c.gcc.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 'examples/**' -concurrency: - group: linux-x64-cpu-gcc-musl-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - linux-gcc-musl: - runs-on: ubuntu-latest - steps: - - uses: jirutka/setup-alpine@v1 - with: - packages: > - cmake - clang - clang-dev - make - gcc - g++ - libc-dev - linux-headers - - - uses: actions/checkout@v3 - - name: build - shell: alpine.sh {0} - run: | - mkdir build && cd build - cmake -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 2 - - name: test - shell: alpine.sh {0} - run: cd build && ctest --output-on-failure -j 2 - - name: build-shared - run: | - mkdir build-shared && cd build-shared - cmake -DNCNN_SHARED_LIB=ON .. - cmake --build . -j 2 diff --git a/.github/workflows/linux-x64-cpu-gcc-san.yml b/.github/workflows/linux-x64-cpu-gcc-san.yml deleted file mode 100644 index ae57b37bc54..00000000000 --- a/.github/workflows/linux-x64-cpu-gcc-san.yml +++ /dev/null @@ -1,42 +0,0 @@ -name: linux-x64-cpu-gcc-san -on: - push: - branches: [master] - paths: - - '.github/workflows/linux-x64-cpu-gcc-san.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/linux-x64-cpu-gcc-san.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' -concurrency: - group: linux-x64-cpu-gcc-san-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - linux-gcc-san: - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v3 - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_ASAN=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . 
-j 2 - - name: test - run: | - cd build - ctest --output-on-failure -j 2 diff --git a/.github/workflows/linux-x64-cpu-gcc-sde.yml b/.github/workflows/linux-x64-cpu-gcc-sde.yml deleted file mode 100644 index 51eb6861553..00000000000 --- a/.github/workflows/linux-x64-cpu-gcc-sde.yml +++ /dev/null @@ -1,57 +0,0 @@ -name: linux-x64-cpu-gcc-sde -on: - push: - branches: [master] - paths: - - '.github/workflows/linux-x64-cpu-gcc-sde.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 'examples/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/linux-x64-cpu-gcc-sde.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 'examples/**' -concurrency: - group: linux-x64-cpu-gcc-sde-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - linux-gcc-sde: - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v3 - - name: update - run: sudo apt-get update - - name: gcc12 - run: sudo apt-get install gcc-12 g++-12 - - name: Setup SDE binaries - uses: petarpetrovt/setup-sde@v2 - - name: build-avx512-spr - env: - CC: gcc-12 - CXX: g++-12 - run: | - mkdir build-avx512-spr && cd build-avx512-spr - cmake -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 2 - - name: test-avx512-spr - run: | - cd build-avx512-spr - TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-spr;--" ctest --output-on-failure -j 2 diff --git a/.github/workflows/linux-x64-cpu-gcc.yml b/.github/workflows/linux-x64-cpu-gcc.yml deleted file mode 100644 index 3b6b305e125..00000000000 --- a/.github/workflows/linux-x64-cpu-gcc.yml +++ /dev/null @@ -1,134 +0,0 @@ -name: linux-x64-cpu-gcc -on: - push: - branches: [master] - paths: - - '.github/workflows/linux-x64-cpu-gcc.yml' - - 'toolchains/host-c.gcc.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 'examples/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/linux-x64-cpu-gcc.yml' - - 'toolchains/host-c.gcc.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 'examples/**' -concurrency: - group: linux-x64-cpu-gcc-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - linux-gcc: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - name: update - run: sudo apt-get update - - name: protobuf - run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev - - name: build-sse2 - run: | - mkdir build-sse2 && cd build-sse2 - cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 2 - - name: test-sse2 - run: cd build-sse2 && ctest --output-on-failure -j 2 - - name: build-shared - run: | - mkdir build-shared && cd build-shared - cmake -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON .. - cmake --build . -j 2 - - name: build-avx2 - run: | - mkdir build-avx2 && cd build-avx2 - cmake -DNCNN_AVX2=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 2 - - name: test-avx2 - run: cd build-avx2 && ctest --output-on-failure -j 2 - - name: build-avx - run: | - mkdir build-avx && cd build-avx - cmake -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . 
-j 2 - - name: test-avx - run: cd build-avx && ctest --output-on-failure -j 2 - - name: build-avx1-2 - run: | - mkdir build-avx1-2 && cd build-avx1-2 - cmake -DNCNN_AVX2=ON -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 2 - - name: test-avx1-2 - run: cd build-avx1-2 && ctest --output-on-failure -j 2 - - name: build-noint8 - run: | - mkdir build-noint8 && cd build-noint8 - cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 2 - - name: test-noint8 - run: cd build-noint8 && ctest --output-on-failure -j 2 - - linux-gcc-cpp03-nostdio-nostring-simplestl: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - name: build-nostdio - run: | - mkdir build-nostdio && cd build-nostdio - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-c++03.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j 2 - - name: test-nostdio - run: cd build-nostdio && ctest --output-on-failure -j 2 - - name: build-nostdio-nostring - run: | - mkdir build-nostdio-nostring && cd build-nostdio-nostring - cmake -DNCNN_STDIO=OFF -DNCNN_STRING=OFF -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j 2 - - name: build-simplestl - run: | - mkdir build-simplestl && cd build-simplestl - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j 2 - - name: test-simplestl - run: cd build-simplestl && ctest --output-on-failure -j 2 - - name: build-simplestl-simpleomp - run: | - mkdir build-simplestl-simpleomp && cd build-simplestl-simpleomp - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j 2 - - name: test-simplestl-simpleomp - run: cd build-simplestl-simpleomp && ctest --output-on-failure -j 2 - - linux-gcc-avx512: - runs-on: [self-hosted, linux, t4] - steps: - - uses: actions/checkout@v3 - - name: build - env: - CC: gcc - CXX: g++ - LD_LIBRARY_PATH: /data/action/install/lib64 - run: | - mkdir build && cd build - cmake -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . 
-j 4 - - name: test - env: - LD_LIBRARY_PATH: /data/action/install/lib64 - run: cd build && ctest --output-on-failure -j 4 diff --git a/.github/workflows/linux-x64-gpu-clang-python.yml b/.github/workflows/linux-x64-gpu-clang-python.yml deleted file mode 100644 index 710fa101119..00000000000 --- a/.github/workflows/linux-x64-gpu-clang-python.yml +++ /dev/null @@ -1,113 +0,0 @@ -name: linux-x64-gpu-clang-python -on: - push: - branches: [master] - paths: - - '.github/workflows/linux-x64-gpu-clang-python.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'src/layer/vulkan/**' - - 'python/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/linux-x64-gpu-clang-python.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'src/layer/vulkan/**' - - 'python/**' -concurrency: - group: linux-x64-gpu-clang-python-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - linux-clang-gpu: - runs-on: ubuntu-20.04 - strategy: - matrix: - python-version: [3.6, 3.8] - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: cache-vulkansdk - id: cache-vulkansdk - uses: actions/cache@v3 - with: - path: "1.3.236.0" - key: vulkansdk-linux-x86_64-1.3.236.0 - - name: vulkansdk - if: steps.cache-vulkansdk.outputs.cache-hit != 'true' - run: | - wget https://sdk.lunarg.com/sdk/download/1.3.236.0/linux/vulkansdk-linux-x86_64-1.3.236.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.3.236.0.tar.gz - tar -xf vulkansdk-linux-x86_64-1.3.236.0.tar.gz - rm -rf 1.3.236.0/source 1.3.236.0/samples - find 1.3.236.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm - - name: cache-swiftshader - id: cache-swiftshader - uses: actions/cache@v3 - with: - path: swiftshader-install - key: swiftshader-linux-install-20230420 - - name: checkout-swiftshader - if: steps.cache-swiftshader.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: google/swiftshader - path: swiftshader - ref: dd55e592406dc0bae219df11adec6363840aff4a - - name: checkout-swiftshader-submodules - if: steps.cache-swiftshader.outputs.cache-hit != 'true' - run: | - cd swiftshader - git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive - - name: swiftshader - if: steps.cache-swiftshader.outputs.cache-hit != 'true' - run: | - cd swiftshader - mkdir -p build; cd build - cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. - cmake --build . -j 2 - mkdir $GITHUB_WORKSPACE/swiftshader-install - cp Linux/* $GITHUB_WORKSPACE/swiftshader-install - - name: set up python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pytest setuptools wheel twine - - name: build - env: - CC: clang - CXX: clang++ - run: | - export VULKAN_SDK=`pwd`/1.3.236.0/x86_64 - mkdir build && cd build - cmake -DNCNN_VULKAN=ON -DNCNN_PYTHON=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j 2 - - name: install python - run: cd python && pip install . 
- - name: test - run: | - export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" - cd python && pytest tests - - name: build and publish - if: startsWith(github.ref, 'refs/tags') - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.TEST_PYPI_API_TOKEN }} - TWINE_REPOSITORY_URL: "https://test.pypi.org/legacy/" - run: | - cd python - python setup.py bdist_wheel - twine upload dist/* diff --git a/.github/workflows/linux-x64-gpu-clang.yml b/.github/workflows/linux-x64-gpu-clang.yml deleted file mode 100644 index 14671e4337f..00000000000 --- a/.github/workflows/linux-x64-gpu-clang.yml +++ /dev/null @@ -1,91 +0,0 @@ -name: linux-x64-gpu-clang -on: - push: - branches: [master] - paths: - - '.github/workflows/linux-x64-gpu-clang.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'src/layer/vulkan/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 'examples/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/linux-x64-gpu-clang.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'src/layer/vulkan/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 'examples/**' -concurrency: - group: linux-x64-gpu-clang-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - linux-clang-gpu: - runs-on: [self-hosted, linux, cvm] - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: cache-swiftshader - id: cache-swiftshader - uses: actions/cache@v3 - with: - path: swiftshader-install - key: swiftshader-linux-install-20230420 - - name: checkout-swiftshader - if: steps.cache-swiftshader.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: google/swiftshader - path: swiftshader - ref: dd55e592406dc0bae219df11adec6363840aff4a - - name: checkout-swiftshader-submodules - if: steps.cache-swiftshader.outputs.cache-hit != 'true' - run: | - cd swiftshader - git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive - - name: swiftshader - if: steps.cache-swiftshader.outputs.cache-hit != 'true' - run: | - cd swiftshader - mkdir -p build; cd build - cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. - cmake --build . -j 4 - mkdir $GITHUB_WORKSPACE/swiftshader-install - cp Linux/* $GITHUB_WORKSPACE/swiftshader-install - - name: build - env: - CC: clang - CXX: clang++ - run: | - mkdir build && cd build - cmake -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 4 - - name: test - run: | - printf "[Processor]\nThreadCount=1\n" > build/tests/SwiftShader.ini - export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" - cd build && ctest --output-on-failure -j 4 - - name: build-shared - env: - CC: clang - CXX: clang++ - run: | - mkdir build-shared && cd build-shared - cmake -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON .. - cmake --build . 
-j 4 diff --git a/.github/workflows/linux-x64-gpu-gcc.yml b/.github/workflows/linux-x64-gpu-gcc.yml deleted file mode 100644 index f9d0cdc9a88..00000000000 --- a/.github/workflows/linux-x64-gpu-gcc.yml +++ /dev/null @@ -1,128 +0,0 @@ -name: linux-x64-gpu-gcc -on: - push: - branches: [master] - paths: - - '.github/workflows/linux-x64-gpu-gcc.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'src/layer/vulkan/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 'examples/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/linux-x64-gpu-gcc.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'src/layer/vulkan/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 'examples/**' -concurrency: - group: linux-x64-gpu-gcc-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - linux-gcc-gpu: - runs-on: [self-hosted, linux, cvm] - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: cache-swiftshader - id: cache-swiftshader - uses: actions/cache@v3 - with: - path: swiftshader-install - key: swiftshader-linux-install-20230420 - - name: checkout-swiftshader - if: steps.cache-swiftshader.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: google/swiftshader - path: swiftshader - ref: dd55e592406dc0bae219df11adec6363840aff4a - - name: checkout-swiftshader-submodules - if: steps.cache-swiftshader.outputs.cache-hit != 'true' - run: | - cd swiftshader - git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive - - name: swiftshader - if: steps.cache-swiftshader.outputs.cache-hit != 'true' - run: | - cd swiftshader - mkdir -p build; cd build - cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. - cmake --build . -j 4 - mkdir $GITHUB_WORKSPACE/swiftshader-install - cp Linux/* $GITHUB_WORKSPACE/swiftshader-install - - name: build - run: | - mkdir build && cd build - cmake -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 4 - - name: test - run: | - printf "[Processor]\nThreadCount=1\n" > build/tests/SwiftShader.ini - export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" - cd build && ctest --output-on-failure -j 4 - - name: build-shared - run: | - mkdir build-shared && cd build-shared - cmake -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON .. - cmake --build . -j 4 - - linux-gcc-gpu-system-glslang: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: install-deps - run: | - sudo apt-get update - sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev libvulkan-dev glslang-dev spirv-tools - - - name: build - run: | - mkdir build && cd build - cmake -DNCNN_VULKAN=ON -DNCNN_SYSTEM_GLSLANG=ON -DGLSLANG_TARGET_DIR=/usr/lib/x86_64-linux-gnu/cmake .. - cmake --build . -j 4 - - name: build-shared - run: | - mkdir build-shared && cd build-shared - cmake -DNCNN_VULKAN=ON -DNCNN_SYSTEM_GLSLANG=ON -DGLSLANG_TARGET_DIR=/usr/lib/x86_64-linux-gnu/cmake -DNCNN_SHARED_LIB=ON .. - cmake --build . 
-j 4 - - linux-gcc-gpu-t4: - runs-on: [self-hosted, linux, t4] - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: build - env: - CC: gcc - CXX: g++ - LD_LIBRARY_PATH: /data/action/install/lib64 - run: | - export VULKAN_SDK=/data/action/osd/1.2.189.0/x86_64 - mkdir build && cd build - cmake -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j 4 - - name: test - env: - LD_LIBRARY_PATH: /data/action/install/lib64 - run: | - cd build && ctest --output-on-failure -j 4 diff --git a/.github/workflows/linux-x86-cpu-clang.yml b/.github/workflows/linux-x86-cpu-clang.yml deleted file mode 100644 index 92544f4e474..00000000000 --- a/.github/workflows/linux-x86-cpu-clang.yml +++ /dev/null @@ -1,67 +0,0 @@ -name: linux-x86-cpu-clang -on: - push: - branches: [master] - paths: - - '.github/workflows/linux-x86-cpu-clang.yml' - - 'toolchains/host.clang-m32.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/linux-x86-cpu-clang.yml' - - 'toolchains/host.clang-m32.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' -concurrency: - group: linux-x86-cpu-clang-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - linux-clang: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: update - run: sudo apt-get update - - name: gcc-multilib - run: sudo apt-get install gcc-multilib g++-multilib - - name: build - env: - CC: clang - CXX: clang++ - run: | - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j 2 - - name: test - run: cd build && ctest --output-on-failure -j 2 - - name: build-shared - env: - CC: clang - CXX: clang++ - run: | - mkdir build-shared && cd build-shared - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j 2 - - name: build-noint8 - env: - CC: clang - CXX: clang++ - run: | - mkdir build-noint8 && cd build-noint8 - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_INT8=OFF .. - cmake --build . 
-j 2 - - name: test-noint8 - run: cd build-noint8 && ctest --output-on-failure -j 2 diff --git a/.github/workflows/linux-x86-cpu-gcc.yml b/.github/workflows/linux-x86-cpu-gcc.yml deleted file mode 100644 index c6385f0b011..00000000000 --- a/.github/workflows/linux-x86-cpu-gcc.yml +++ /dev/null @@ -1,65 +0,0 @@ -name: linux-x86-cpu-gcc -on: - push: - branches: [master] - paths: - - '.github/workflows/linux-x86-cpu-gcc.yml' - - 'toolchains/host.gcc-m32.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/linux-x86-cpu-gcc.yml' - - 'toolchains/host.gcc-m32.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' -concurrency: - group: linux-x86-cpu-gcc-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - linux-gcc: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: update - run: sudo apt-get update - - name: gcc-multilib - run: sudo apt-get install gcc-multilib g++-multilib - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j 2 - - name: test - run: cd build && ctest --output-on-failure -j 2 - - name: build-nosse - run: | - mkdir build-nosse && cd build-nosse - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j 2 - - name: test-nosse - run: cd build-nosse && ctest --output-on-failure -j 2 - - name: build-shared - run: | - mkdir build-shared && cd build-shared - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j 2 - - name: build-noint8 - run: | - mkdir build-noint8 && cd build-noint8 - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_INT8=OFF .. - cmake --build . 
-j 2 - - name: test-noint8 - run: cd build-noint8 && ctest --output-on-failure -j 2 diff --git a/.github/workflows/macos-arm64-cpu.yml b/.github/workflows/macos-arm64-cpu.yml deleted file mode 100644 index df85f05353f..00000000000 --- a/.github/workflows/macos-arm64-cpu.yml +++ /dev/null @@ -1,76 +0,0 @@ -name: macos-arm64-cpu -on: - push: - branches: [master] - paths: - - '.github/workflows/macos-arm64-cpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/arm/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/macos-arm64-cpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/arm/**' -concurrency: - group: macos-arm64-cpu-${{ github.ref }} - cancel-in-progress: true -env: - DEVELOPER_DIR: /Applications/Xcode_14.0.1.app/Contents/Developer -permissions: - contents: read - -jobs: - macos-clang: - runs-on: macos-latest - steps: - - uses: actions/checkout@v3 - - name: cache-openmp - id: cache-openmp - uses: actions/cache@v3 - with: - path: openmp-install - key: openmp-macos-install-20201213 - - name: openmp - if: steps.cache-openmp.outputs.cache-hit != 'true' - run: | - wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz - tar -xf openmp-11.0.0.src.tar.xz - cd openmp-11.0.0.src - sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S - sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S - mkdir -p build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DCMAKE_OSX_ARCHITECTURES="x86_64;arm64" \ - -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. - cmake --build . -j 3 - cmake --build . --target install - mkdir $GITHUB_WORKSPACE/openmp-install - cp -r install/* $GITHUB_WORKSPACE/openmp-install - - name: install-openmp - run: | - sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include - sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_CROSSCOMPILING=ON -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j 3 - - name: build-shared - run: | - mkdir build-shared && cd build-shared - cmake -DCMAKE_CROSSCOMPILING=ON -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . 
-j 3 diff --git a/.github/workflows/macos-arm64-gpu.yml b/.github/workflows/macos-arm64-gpu.yml deleted file mode 100644 index eac0da0bb73..00000000000 --- a/.github/workflows/macos-arm64-gpu.yml +++ /dev/null @@ -1,92 +0,0 @@ -name: macos-arm64-gpu -on: - push: - branches: [master] - paths: - - '.github/workflows/macos-arm64-gpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/arm/**' - - 'src/layer/vulkan/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/macos-arm64-gpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/arm/**' - - 'src/layer/vulkan/**' -concurrency: - group: macos-arm64-gpu-${{ github.ref }} - cancel-in-progress: true -env: - DEVELOPER_DIR: /Applications/Xcode_14.0.1.app/Contents/Developer -permissions: - contents: read - -jobs: - macos-clang-gpu: - runs-on: macos-latest - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: cache-openmp - id: cache-openmp - uses: actions/cache@v3 - with: - path: openmp-install - key: openmp-macos-install-20201213 - - name: openmp - if: steps.cache-openmp.outputs.cache-hit != 'true' - run: | - wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz - tar -xf openmp-11.0.0.src.tar.xz - cd openmp-11.0.0.src - sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S - sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S - mkdir -p build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DCMAKE_OSX_ARCHITECTURES="x86_64;arm64" \ - -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. - cmake --build . -j 3 - cmake --build . --target install - mkdir $GITHUB_WORKSPACE/openmp-install - cp -r install/* $GITHUB_WORKSPACE/openmp-install - - name: install-openmp - run: | - sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include - sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib - - name: vulkansdk - run: | - wget https://sdk.lunarg.com/sdk/download/1.3.236.0/mac/vulkansdk-macos-1.3.236.0.dmg?Human=true -O vulkansdk-macos-1.3.236.0.dmg - hdiutil attach vulkansdk-macos-1.3.236.0.dmg - sudo /Volumes/vulkansdk-macos-1.3.236.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0 --accept-licenses --default-answer --confirm-command install - hdiutil detach /Volumes/vulkansdk-macos-1.3.236.0 - - name: build - run: | - export VULKAN_SDK=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/macOS - mkdir build && cd build - cmake -DCMAKE_CROSSCOMPILING=ON -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ - -DVulkan_INCLUDE_DIR=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/include \ - -DVulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/dylib/macOS/libMoltenVK.dylib \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . 
-j 3 - - name: build-shared - run: | - export VULKAN_SDK=`pwd`/vulkansdk-macos-1.3.236.0/macOS - mkdir build-shared && cd build-shared - cmake -DCMAKE_CROSSCOMPILING=ON -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ - -DVulkan_INCLUDE_DIR=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/include \ - -DVulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/dylib/macOS/libMoltenVK.dylib \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j 3 diff --git a/.github/workflows/macos-x64-cpu-python.yml b/.github/workflows/macos-x64-cpu-python.yml deleted file mode 100644 index 64472dc4da9..00000000000 --- a/.github/workflows/macos-x64-cpu-python.yml +++ /dev/null @@ -1,96 +0,0 @@ -name: macos-x64-cpu-python -on: - push: - branches: [master] - paths: - - '.github/workflows/macos-x64-cpu-python.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'python/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/macos-x64-cpu-python.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'python/**' -concurrency: - group: macos-x64-cpu-python-${{ github.ref }} - cancel-in-progress: true -env: - DEVELOPER_DIR: /Applications/Xcode_14.0.1.app/Contents/Developer -permissions: - contents: read - -jobs: - macos-clang: - runs-on: macos-latest - strategy: - matrix: - python-version: [3.6, 3.7, 3.8, 3.9] - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: set up python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - name: install dependencies - run: | - python -m pip install --upgrade pip - pip install pytest setuptools wheel twine - - name: cache-openmp - id: cache-openmp - uses: actions/cache@v3 - with: - path: openmp-install - key: openmp-macos-install-20201213 - - name: openmp - if: steps.cache-openmp.outputs.cache-hit != 'true' - run: | - wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz - tar -xf openmp-11.0.0.src.tar.xz - cd openmp-11.0.0.src - sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S - sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S - mkdir -p build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DCMAKE_OSX_ARCHITECTURES="x86_64;arm64" \ - -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. - cmake --build . -j 3 - cmake --build . 
--target install - mkdir $GITHUB_WORKSPACE/openmp-install - cp -r install/* $GITHUB_WORKSPACE/openmp-install - - name: install-openmp - run: | - sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include - sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_OSX_ARCHITECTURES=x86_64 \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ - -DNCNN_PYTHON=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j 3 - - name: install python - run: cd python && pip install . - - name: test - run: cd python && pytest tests - - name: build and publish - if: startsWith(github.ref, 'refs/tags') - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.TEST_PYPI_API_TOKEN }} - TWINE_REPOSITORY_URL: "https://test.pypi.org/legacy/" - run: | - cd python - python setup.py bdist_wheel - twine upload dist/* diff --git a/.github/workflows/macos-x64-cpu.yml b/.github/workflows/macos-x64-cpu.yml deleted file mode 100644 index 1b1c0f75cc9..00000000000 --- a/.github/workflows/macos-x64-cpu.yml +++ /dev/null @@ -1,88 +0,0 @@ -name: macos-x64-cpu -on: - push: - branches: [master] - paths: - - '.github/workflows/macos-x64-cpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 'examples/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/macos-x64-cpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 'examples/**' -concurrency: - group: macos-x64-cpu-${{ github.ref }} - cancel-in-progress: true -env: - DEVELOPER_DIR: /Applications/Xcode_14.0.1.app/Contents/Developer -permissions: - contents: read - -jobs: - macos-clang: - runs-on: macos-latest - steps: - - uses: actions/checkout@v3 - - name: protobuf - run: brew install protobuf opencv3 - - name: cache-openmp - id: cache-openmp - uses: actions/cache@v3 - with: - path: openmp-install - key: openmp-macos-install-20201213 - - name: openmp - if: steps.cache-openmp.outputs.cache-hit != 'true' - run: | - wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz - tar -xf openmp-11.0.0.src.tar.xz - cd openmp-11.0.0.src - sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S - sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S - mkdir -p build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DCMAKE_OSX_ARCHITECTURES="x86_64;arm64" \ - -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. - cmake --build . -j 3 - cmake --build . 
--target install - mkdir $GITHUB_WORKSPACE/openmp-install - cp -r install/* $GITHUB_WORKSPACE/openmp-install - - name: install-openmp - run: | - sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include - sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_OSX_ARCHITECTURES=x86_64 \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ - -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 3 - - name: test - run: cd build && ctest --output-on-failure -j 3 - - name: build-shared - run: | - mkdir build-shared && cd build-shared - cmake -DCMAKE_OSX_ARCHITECTURES=x86_64 \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ - -DNCNN_SHARED_LIB=ON .. - cmake --build . -j 3 diff --git a/.github/workflows/macos-x64-gpu.yml b/.github/workflows/macos-x64-gpu.yml deleted file mode 100644 index 861c71e82d4..00000000000 --- a/.github/workflows/macos-x64-gpu.yml +++ /dev/null @@ -1,131 +0,0 @@ -name: macos-x64-gpu -on: - push: - branches: [master] - paths: - - '.github/workflows/macos-x64-gpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'src/layer/vulkan/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 'examples/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/macos-x64-gpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'src/layer/vulkan/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 'examples/**' -concurrency: - group: macos-x64-gpu-${{ github.ref }} - cancel-in-progress: true -env: - DEVELOPER_DIR: /Applications/Xcode_14.0.1.app/Contents/Developer -permissions: - contents: read - -jobs: - macos-clang-gpu: - runs-on: macos-latest - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: protobuf - run: brew install protobuf opencv3 - - name: cache-openmp - id: cache-openmp - uses: actions/cache@v3 - with: - path: openmp-install - key: openmp-macos-install-20201213 - - name: openmp - if: steps.cache-openmp.outputs.cache-hit != 'true' - run: | - wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz - tar -xf openmp-11.0.0.src.tar.xz - cd openmp-11.0.0.src - sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S - sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S - mkdir -p build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DCMAKE_OSX_ARCHITECTURES="x86_64;arm64" \ - -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. - cmake --build . -j 3 - cmake --build . 
--target install - mkdir $GITHUB_WORKSPACE/openmp-install - cp -r install/* $GITHUB_WORKSPACE/openmp-install - - name: install-openmp - run: | - sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include - sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib - - name: vulkansdk - run: | - wget https://sdk.lunarg.com/sdk/download/1.3.236.0/mac/vulkansdk-macos-1.3.236.0.dmg?Human=true -O vulkansdk-macos-1.3.236.0.dmg - hdiutil attach vulkansdk-macos-1.3.236.0.dmg - sudo /Volumes/vulkansdk-macos-1.3.236.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0 --accept-licenses --default-answer --confirm-command install - hdiutil detach /Volumes/vulkansdk-macos-1.3.236.0 - - name: cache-swiftshader - id: cache-swiftshader - uses: actions/cache@v3 - with: - path: swiftshader-install - key: swiftshader-macos-install-20230420 - - name: checkout-swiftshader - if: steps.cache-swiftshader.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: google/swiftshader - path: swiftshader - ref: dd55e592406dc0bae219df11adec6363840aff4a - - name: checkout-swiftshader-submodules - if: steps.cache-swiftshader.outputs.cache-hit != 'true' - run: | - cd swiftshader - git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive - - name: swiftshader - if: steps.cache-swiftshader.outputs.cache-hit != 'true' - run: | - cd swiftshader - mkdir -p build; cd build - cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. - cmake --build . -j 3 - mkdir $GITHUB_WORKSPACE/swiftshader-install - cp Darwin/* $GITHUB_WORKSPACE/swiftshader-install - - name: build - run: | - export VULKAN_SDK=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/macOS - mkdir build && cd build - cmake -DCMAKE_OSX_ARCHITECTURES=x86_64 \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 3 - - name: test - run: | - printf "[Processor]\nThreadCount=1\n" > build/tests/SwiftShader.ini - export DYLD_LIBRARY_PATH="$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/macOS/lib":$DYLD_LIBRARY_PATH - export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" - cd build && ctest --output-on-failure -j 3 - - name: build-shared - run: | - export VULKAN_SDK=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/macOS - mkdir build-shared && cd build-shared - cmake -DCMAKE_OSX_ARCHITECTURES=x86_64 \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ - -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON .. - cmake --build . 
-j 3 diff --git a/.github/workflows/release-python.yml b/.github/workflows/release-python.yml deleted file mode 100644 index d28cc54eb25..00000000000 --- a/.github/workflows/release-python.yml +++ /dev/null @@ -1,165 +0,0 @@ -name: release-python -# on: [push, pull_request] -on: - push: - tags: - - '*' - -jobs: - build_sdist: - name: Build SDist - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - - uses: actions/setup-python@v4 - with: - python-version: '3.x' - - - name: Install deps - run: python -m pip install twine build - - - name: Build SDist - run: python -m build -s - - - name: Check metadata - run: twine check dist/* - - - uses: actions/upload-artifact@v3 - with: - path: dist/*.tar.gz - - build_wheels: - name: ${{ matrix.arch }} ${{ matrix.build }} on ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - include: - - { os: ubuntu-20.04, arch: x86_64, build: 'cp*-manylinux*' } - - { os: ubuntu-20.04, arch: x86_64, build: 'cp*-musllinux*' } - - { os: ubuntu-20.04, arch: x86_64, build: 'pp*' } - - { os: ubuntu-20.04, arch: i686, build: 'cp*-manylinux*' } - - { os: ubuntu-20.04, arch: i686, build: 'cp*-musllinux*' } - - { os: ubuntu-20.04, arch: i686, build: 'pp*' } - - { os: windows-2019, arch: x86, build: 'cp*' } - - { os: windows-2019, arch: AMD64, build: 'cp*' } - - { os: windows-2019, arch: AMD64, build: 'pp*' } - - { os: windows-2019, arch: ARM64, build: 'cp*' } - - { os: macos-latest, arch: x86_64, build: 'cp*' } - - { os: macos-latest, arch: x86_64, build: 'pp*' } - - { os: macos-latest, arch: arm64, build: 'cp*' } - - { os: macos-latest, arch: universal2, build: 'cp*' } - - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - - uses: actions/setup-python@v4 - with: - python-version: '3.x' - - - name: brew uninstall libomp - if: matrix.os == 'macos-latest' - run: | - brew uninstall --ignore-dependencies libomp - - - name: Build wheels - uses: pypa/cibuildwheel@v2.12.3 - env: - CIBW_ARCHS_MACOS: ${{ matrix.arch }} - CIBW_ARCHS_LINUX: ${{ matrix.arch }} - CIBW_ARCHS_WINDOWS: ${{ matrix.arch }} - CIBW_BUILD: ${{ matrix.build }} - CIBW_BUILD_VERBOSITY: 1 - CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=2 - - - name: Show files - run: ls -lh wheelhouse - shell: bash - - - name: Verify clean directory - run: git diff --exit-code - shell: bash - - - name: Upload wheels - uses: actions/upload-artifact@v3 - with: - path: wheelhouse/*.whl - - build_wheels_qemu: - name: ${{ matrix.arch }} ${{ matrix.build }} - runs-on: ubuntu-20.04 - - strategy: - fail-fast: false - matrix: - arch: [aarch64, ppc64le, s390x] - build: ['cp36-*', 'cp37-*', 'cp38-*', 'cp39-*', 'cp310-*', 'cp311-*'] - include: - - arch: aarch64 - build: 'pp37-*' - - arch: aarch64 - build: 'pp38-*' - - arch: aarch64 - build: 'pp39-*' - - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - - uses: actions/setup-python@v4 - with: - python-version: '3.x' - - - name: Set up QEMU - uses: docker/setup-qemu-action@v2 - with: - platforms: all - - - name: Build wheels - uses: pypa/cibuildwheel@v2.12.3 - env: - CIBW_ARCHS_LINUX: ${{ matrix.arch }} - CIBW_BUILD: ${{ matrix.build }} - CIBW_BUILD_VERBOSITY: 1 - CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=2 - - - name: Show files - run: ls -lh wheelhouse - shell: bash - - - name: Verify clean directory - run: git diff --exit-code - shell: bash - - - name: Upload wheels - uses: actions/upload-artifact@v3 - with: - path: wheelhouse/*.whl - - upload_all: - permissions: - contents: none - 
name: Upload - needs: [build_wheels, build_wheels_qemu, build_sdist] - runs-on: ubuntu-latest - - steps: - - uses: actions/setup-python@v4 - with: - python-version: '3.x' - - - uses: actions/download-artifact@v3 - with: - name: artifact - path: dist - - - uses: pypa/gh-action-pypi-publish@release/v1 - with: - user: __token__ - password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml deleted file mode 100644 index 8b8fa49222d..00000000000 --- a/.github/workflows/release.yml +++ /dev/null @@ -1,1802 +0,0 @@ -name: release -on: - push: - tags: - - '*' - -env: - DEVELOPER_DIR: /Applications/Xcode_14.0.1.app/Contents/Developer - EMSCRIPTEN_VERSION: 3.1.28 - -permissions: - contents: read - -jobs: - - setup: - permissions: - contents: none - runs-on: ubuntu-latest - outputs: - VERSION: ${{ steps.get_version.outputs.VERSION }} - steps: - - name: get-version - id: get_version - run: echo "VERSION=${GITHUB_REF/refs\/tags\//}" >> $GITHUB_OUTPUT - - full-source: - needs: [setup] - runs-on: ubuntu-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-full-source - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: package - run: | - rm -rf .git - rm -f /tmp/${{ env.PACKAGENAME }}.zip - zip -9 -y -r /tmp/${{ env.PACKAGENAME }}.zip . - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: /tmp/${{ env.PACKAGENAME }}.zip - - ubuntu-2004: - needs: [setup] - runs-on: ubuntu-20.04 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2004 - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: apt - run: | - sudo apt-get install -y libvulkan-dev libprotobuf-dev protobuf-compiler - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build/install/* ${{ env.PACKAGENAME }} - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - ubuntu-2004-shared: - needs: [setup] - runs-on: ubuntu-20.04 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2004-shared - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: apt - run: | - sudo apt-get install -y libvulkan-dev libprotobuf-dev protobuf-compiler - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j 2 - cmake --build . 
--target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a -P build/install/* ${{ env.PACKAGENAME }} - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - ubuntu-2204: - needs: [setup] - runs-on: ubuntu-22.04 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2204 - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: apt - run: | - sudo apt-get install -y libvulkan-dev libprotobuf-dev protobuf-compiler - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build/install/* ${{ env.PACKAGENAME }} - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - ubuntu-2204-shared: - needs: [setup] - runs-on: ubuntu-22.04 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2204-shared - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: apt - run: | - sudo apt-get install -y libvulkan-dev libprotobuf-dev protobuf-compiler - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a -P build/install/* ${{ env.PACKAGENAME }} - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - openmp-macos: - runs-on: macos-latest - steps: - - name: cache-openmp - id: cache-openmp - uses: actions/cache@v3 - with: - path: openmp-install - key: openmp-macos-release-11.0.0 - - name: checkout - if: steps.cache-openmp.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - - name: openmp - if: steps.cache-openmp.outputs.cache-hit != 'true' - run: | - wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz - tar -xf openmp-11.0.0.src.tar.xz - cd openmp-11.0.0.src - sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S - sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S - - name: build-x86_64 - if: steps.cache-openmp.outputs.cache-hit != 'true' - run: | - cd openmp-11.0.0.src - mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DCMAKE_OSX_ARCHITECTURES="x86_64" \ - -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. - cmake --build . -j 3 - cmake --build . 
--target install/strip - - name: build-arm64 - if: steps.cache-openmp.outputs.cache-hit != 'true' - run: | - cd openmp-11.0.0.src - mkdir build-arm64 && cd build-arm64 - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DCMAKE_OSX_ARCHITECTURES="arm64" \ - -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. - cmake --build . -j 3 - cmake --build . --target install/strip - - name: merge-fat-library - if: steps.cache-openmp.outputs.cache-hit != 'true' - run: | - rm -rf $GITHUB_WORKSPACE/openmp-install - mkdir -p $GITHUB_WORKSPACE/openmp-install - cp -a openmp-11.0.0.src/build-x86_64/install/include $GITHUB_WORKSPACE/openmp-install - mkdir -p $GITHUB_WORKSPACE/openmp-install/lib - lipo -create openmp-11.0.0.src/build-x86_64/install/lib/libomp.a openmp-11.0.0.src/build-arm64/install/lib/libomp.a -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a - - name: upload - uses: actions/upload-artifact@v3 - with: - name: openmp-macos - path: openmp-install - - macos: - needs: [setup, openmp-macos] - runs-on: macos-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-macos - steps: - - uses: actions/checkout@v3 - - name: download-openmp-macos - uses: actions/download-artifact@v3 - with: - name: openmp-macos - path: openmp-macos - - name: install-openmp - run: | - sudo cp openmp-macos/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include - sudo cp openmp-macos/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DCMAKE_OSX_ARCHITECTURES="x86_64" \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 3 - cmake --build . --target install/strip - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DCMAKE_OSX_ARCHITECTURES="arm64" \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 3 - cmake --build . 
--target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-macos/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-macos/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/11.0/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create build-x86_64/install/lib/libncnn.a build-arm64/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - macos-gpu: - needs: [setup, openmp-macos] - runs-on: macos-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: download-openmp-macos - uses: actions/download-artifact@v3 - with: - name: openmp-macos - path: openmp-macos - - name: install-openmp - run: | - sudo cp openmp-macos/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include - sudo cp openmp-macos/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib - - name: vulkansdk - run: | - wget https://sdk.lunarg.com/sdk/download/1.3.236.0/mac/vulkansdk-macos-1.3.236.0.dmg?Human=true -O vulkansdk-macos-1.3.236.0.dmg - hdiutil attach vulkansdk-macos-1.3.236.0.dmg - sudo /Volumes/vulkansdk-macos-1.3.236.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0 --accept-licenses --default-answer --confirm-command install - hdiutil detach /Volumes/vulkansdk-macos-1.3.236.0 - - name: build-x86_64 - run: | - export VULKAN_SDK=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/macOS - mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DCMAKE_OSX_ARCHITECTURES="x86_64" \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ - -DVulkan_INCLUDE_DIR=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/include \ - -DVulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/dylib/macOS/libMoltenVK.dylib \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . 
-j 3 - cmake --build . --target install/strip - - name: build-arm64 - run: | - export VULKAN_SDK=`pwd`/vulkansdk-macos-1.3.236.0/macOS - mkdir build-arm64 && cd build-arm64 - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DCMAKE_OSX_ARCHITECTURES="arm64" \ - -DCMAKE_CROSSCOMPILING=ON -DCMAKE_SYSTEM_PROCESSOR=arm64 \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ - -DVulkan_INCLUDE_DIR=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/include \ - -DVulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/dylib/macOS/libMoltenVK.dylib \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 3 - cmake --build . --target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-macos/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-macos/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/11.0/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package-glslang - run: | - rm -rf glslang.framework - mkdir -p glslang.framework/Versions/A/Headers - mkdir -p glslang.framework/Versions/A/Resources - ln -s A glslang.framework/Versions/Current - ln -s Versions/Current/Headers glslang.framework/Headers - ln -s Versions/Current/Resources glslang.framework/Resources - ln -s Versions/Current/glslang glslang.framework/glslang - libtool -static build-x86_64/install/lib/libglslang.a build-x86_64/install/lib/libMachineIndependent.a build-x86_64/install/lib/libGenericCodeGen.a build-x86_64/install/lib/libSPIRV.a build-x86_64/install/lib/libOGLCompiler.a build-x86_64/install/lib/libOSDependent.a -o build-x86_64/install/lib/libglslang_combined.a - libtool -static build-arm64/install/lib/libglslang.a build-arm64/install/lib/libMachineIndependent.a build-arm64/install/lib/libGenericCodeGen.a build-arm64/install/lib/libSPIRV.a build-arm64/install/lib/libOGLCompiler.a build-arm64/install/lib/libOSDependent.a -o build-arm64/install/lib/libglslang_combined.a - lipo -create build-x86_64/install/lib/libglslang_combined.a build-arm64/install/lib/libglslang_combined.a -o glslang.framework/Versions/A/glslang - cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ - sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create build-x86_64/install/lib/libncnn.a build-arm64/install/lib/libncnn.a -o 
ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - openmp-ios: - runs-on: macos-latest - steps: - - name: cache-openmp - id: cache-openmp - uses: actions/cache@v3 - with: - path: openmp-install - key: openmp-ios-release-11.0.0 - - name: checkout - if: steps.cache-openmp.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - - name: openmp - if: steps.cache-openmp.outputs.cache-hit != 'true' - run: | - wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz - tar -xf openmp-11.0.0.src.tar.xz - cd openmp-11.0.0.src - sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S - sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S - - name: build - if: steps.cache-openmp.outputs.cache-hit != 'true' - run: | - cd openmp-11.0.0.src - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \ - -DIOS_PLATFORM=OS -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="armv7;arm64;arm64e" \ - -DPERL_EXECUTABLE=/usr/local/bin/perl \ - -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. - cmake --build . -j 3 - cmake --build . --target install - - name: build-simulator - if: steps.cache-openmp.outputs.cache-hit != 'true' - run: | - cd openmp-11.0.0.src - mkdir build-simulator && cd build-simulator - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \ - -DIOS_PLATFORM=SIMULATOR -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="i386;x86_64" \ - -DPERL_EXECUTABLE=/usr/local/bin/perl \ - -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. - cmake --build . -j 3 - cmake --build . 
--target install - - name: merge-fat-library - if: steps.cache-openmp.outputs.cache-hit != 'true' - run: | - rm -rf $GITHUB_WORKSPACE/openmp-install - mkdir -p $GITHUB_WORKSPACE/openmp-install - cp -a openmp-11.0.0.src/build/install/include $GITHUB_WORKSPACE/openmp-install - mkdir -p $GITHUB_WORKSPACE/openmp-install/lib - lipo -create openmp-11.0.0.src/build/install/lib/libomp.a openmp-11.0.0.src/build-simulator/install/lib/libomp.a -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a - - name: upload - uses: actions/upload-artifact@v3 - with: - name: openmp-ios - path: openmp-install - - ios: - needs: [setup, openmp-ios] - runs-on: macos-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios - steps: - - uses: actions/checkout@v3 - - name: download-openmp-ios - uses: actions/download-artifact@v3 - with: - name: openmp-ios - path: openmp-ios - - name: install-openmp - run: | - sudo cp openmp-ios/include/* $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include - sudo cp openmp-ios/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib - sudo cp openmp-ios/include/* $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/include - sudo cp openmp-ios/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib - - name: build-armv7 - run: | - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DIOS_PLATFORM=OS -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="armv7" \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \ - -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 3 - cmake --build . --target install - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DIOS_PLATFORM=OS -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="arm64;arm64e" \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \ - -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 3 - cmake --build . --target install - - name: build-simulator - run: | - mkdir build-simulator && cd build-simulator - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DIOS_PLATFORM=SIMULATOR -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="i386;x86_64" \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib/libomp.a" \ - -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 3 - cmake --build . 
--target install - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-ios/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-ios/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/11.0/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create build-armv7/install/lib/libncnn.a build-arm64/install/lib/libncnn.a build-simulator/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn - cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - ios-gpu: - needs: [setup, openmp-ios] - runs-on: macos-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: download-openmp-ios - uses: actions/download-artifact@v3 - with: - name: openmp-ios - path: openmp-ios - - name: install-openmp - run: | - sudo cp openmp-ios/include/* $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include - sudo cp openmp-ios/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib - sudo cp openmp-ios/include/* $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/include - sudo cp openmp-ios/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib - - name: vulkansdk - run: | - wget https://sdk.lunarg.com/sdk/download/1.3.236.0/mac/vulkansdk-macos-1.3.236.0.dmg?Human=true -O vulkansdk-macos-1.3.236.0.dmg - hdiutil attach vulkansdk-macos-1.3.236.0.dmg - sudo /Volumes/vulkansdk-macos-1.3.236.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0 --accept-licenses --default-answer --confirm-command install - hdiutil detach /Volumes/vulkansdk-macos-1.3.236.0 - - name: build - run: | - export VULKAN_SDK=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/macOS - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DIOS_PLATFORM=OS64 -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="arm64;arm64e" \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - 
-DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \ - -DVulkan_INCLUDE_DIR=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/include \ - -DVulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/iOS/dynamic/libMoltenVK.dylib \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 3 - cmake --build . --target install - - name: build-simulator - run: | - export VULKAN_SDK=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/macOS - mkdir build-simulator && cd build-simulator - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DIOS_PLATFORM=SIMULATOR64 -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="x86_64" \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib/libomp.a" \ - -DVulkan_INCLUDE_DIR=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/include \ - -DVulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/iOS/dynamic/libMoltenVK.dylib \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 3 - cmake --build . --target install - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-ios/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-ios/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/11.0/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package-glslang - run: | - rm -rf glslang.framework - mkdir -p glslang.framework/Versions/A/Headers - mkdir -p glslang.framework/Versions/A/Resources - ln -s A glslang.framework/Versions/Current - ln -s Versions/Current/Headers glslang.framework/Headers - ln -s Versions/Current/Resources glslang.framework/Resources - ln -s Versions/Current/glslang glslang.framework/glslang - libtool -static build/install/lib/libglslang.a build/install/lib/libMachineIndependent.a build/install/lib/libGenericCodeGen.a build/install/lib/libSPIRV.a build/install/lib/libOGLCompiler.a build/install/lib/libOSDependent.a -o build/install/lib/libglslang_combined.a - libtool -static build-simulator/install/lib/libglslang.a build-simulator/install/lib/libMachineIndependent.a build-simulator/install/lib/libGenericCodeGen.a build-simulator/install/lib/libSPIRV.a build-simulator/install/lib/libOGLCompiler.a build-simulator/install/lib/libOSDependent.a -o build-simulator/install/lib/libglslang_combined.a - lipo -create build/install/lib/libglslang_combined.a build-simulator/install/lib/libglslang_combined.a -o glslang.framework/Versions/A/glslang - cp -a build/install/include/glslang glslang.framework/Versions/A/Headers/ - sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p 
ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create build/install/lib/libncnn.a build-simulator/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn - cp -a build/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - openmp-ios-bitcode: - runs-on: macos-latest - steps: - - name: cache-openmp - id: cache-openmp - uses: actions/cache@v3 - with: - path: openmp-install - key: openmp-ios-bitcode-release-11.0.0 - - name: checkout - if: steps.cache-openmp.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - - name: openmp - if: steps.cache-openmp.outputs.cache-hit != 'true' - run: | - wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz - tar -xf openmp-11.0.0.src.tar.xz - cd openmp-11.0.0.src - sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S - sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S - - name: build - if: steps.cache-openmp.outputs.cache-hit != 'true' - run: | - cd openmp-11.0.0.src - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \ - -DIOS_PLATFORM=OS -DENABLE_BITCODE=1 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="armv7;arm64;arm64e" \ - -DPERL_EXECUTABLE=/usr/local/bin/perl \ - -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. - cmake --build . -j 3 - cmake --build . --target install - - name: build-simulator - if: steps.cache-openmp.outputs.cache-hit != 'true' - run: | - cd openmp-11.0.0.src - mkdir build-simulator && cd build-simulator - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \ - -DIOS_PLATFORM=SIMULATOR -DENABLE_BITCODE=1 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="i386;x86_64" \ - -DPERL_EXECUTABLE=/usr/local/bin/perl \ - -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. - cmake --build . -j 3 - cmake --build . 
--target install - - name: merge-fat-library - if: steps.cache-openmp.outputs.cache-hit != 'true' - run: | - rm -rf $GITHUB_WORKSPACE/openmp-install - mkdir -p $GITHUB_WORKSPACE/openmp-install - cp -a openmp-11.0.0.src/build/install/include $GITHUB_WORKSPACE/openmp-install - mkdir -p $GITHUB_WORKSPACE/openmp-install/lib - lipo -create openmp-11.0.0.src/build/install/lib/libomp.a openmp-11.0.0.src/build-simulator/install/lib/libomp.a -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a - - name: upload - uses: actions/upload-artifact@v3 - with: - name: openmp-ios-bitcode - path: openmp-install - - ios-bitcode: - needs: [setup, openmp-ios-bitcode] - runs-on: macos-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-bitcode - steps: - - uses: actions/checkout@v3 - - name: download-openmp-ios-bitcode - uses: actions/download-artifact@v3 - with: - name: openmp-ios-bitcode - path: openmp-ios-bitcode - - name: install-openmp - run: | - sudo cp openmp-ios-bitcode/include/* $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include - sudo cp openmp-ios-bitcode/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib - sudo cp openmp-ios-bitcode/include/* $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/include - sudo cp openmp-ios-bitcode/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib - - name: build-armv7 - run: | - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DIOS_PLATFORM=OS -DENABLE_BITCODE=1 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="armv7" \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \ - -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 3 - cmake --build . --target install - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DIOS_PLATFORM=OS -DENABLE_BITCODE=1 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="arm64;arm64e" \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \ - -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 3 - cmake --build . 
--target install - - name: build-simulator - run: | - mkdir build-simulator && cd build-simulator - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DIOS_PLATFORM=SIMULATOR -DENABLE_BITCODE=1 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="i386;x86_64" \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib/libomp.a" \ - -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 3 - cmake --build . --target install - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-ios-bitcode/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-ios-bitcode/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/11.0/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create build-armv7/install/lib/libncnn.a build-arm64/install/lib/libncnn.a build-simulator/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn - cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - ios-gpu-bitcode: - needs: [setup, openmp-ios-bitcode] - runs-on: macos-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan-bitcode - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: download-openmp-ios-bitcode - uses: actions/download-artifact@v3 - with: - name: openmp-ios-bitcode - path: openmp-ios-bitcode - - name: install-openmp - run: | - sudo cp openmp-ios-bitcode/include/* $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include - sudo cp openmp-ios-bitcode/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib - sudo cp openmp-ios-bitcode/include/* $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/include - sudo cp openmp-ios-bitcode/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib - - name: vulkansdk - run: | - wget https://sdk.lunarg.com/sdk/download/1.3.236.0/mac/vulkansdk-macos-1.3.236.0.dmg?Human=true -O vulkansdk-macos-1.3.236.0.dmg - hdiutil attach vulkansdk-macos-1.3.236.0.dmg - sudo 
/Volumes/vulkansdk-macos-1.3.236.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0 --accept-licenses --default-answer --confirm-command install - hdiutil detach /Volumes/vulkansdk-macos-1.3.236.0 - - name: build - run: | - export VULKAN_SDK=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/macOS - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DIOS_PLATFORM=OS64 -DENABLE_BITCODE=1 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="arm64;arm64e" \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \ - -DVulkan_INCLUDE_DIR=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/include \ - -DVulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/iOS/dynamic/libMoltenVK.dylib \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 3 - cmake --build . --target install - - name: build-simulator - run: | - export VULKAN_SDK=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/macOS - mkdir build-simulator && cd build-simulator - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DIOS_PLATFORM=SIMULATOR64 -DENABLE_BITCODE=1 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="x86_64" \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib/libomp.a" \ - -DVulkan_INCLUDE_DIR=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/include \ - -DVulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/iOS/dynamic/libMoltenVK.dylib \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 3 - cmake --build . 
--target install - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-ios-bitcode/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-ios-bitcode/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/11.0/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package-glslang - run: | - rm -rf glslang.framework - mkdir -p glslang.framework/Versions/A/Headers - mkdir -p glslang.framework/Versions/A/Resources - ln -s A glslang.framework/Versions/Current - ln -s Versions/Current/Headers glslang.framework/Headers - ln -s Versions/Current/Resources glslang.framework/Resources - ln -s Versions/Current/glslang glslang.framework/glslang - libtool -static build/install/lib/libglslang.a build/install/lib/libMachineIndependent.a build/install/lib/libGenericCodeGen.a build/install/lib/libSPIRV.a build/install/lib/libOGLCompiler.a build/install/lib/libOSDependent.a -o build/install/lib/libglslang_combined.a - libtool -static build-simulator/install/lib/libglslang.a build-simulator/install/lib/libMachineIndependent.a build-simulator/install/lib/libGenericCodeGen.a build-simulator/install/lib/libSPIRV.a build-simulator/install/lib/libOGLCompiler.a build-simulator/install/lib/libOSDependent.a -o build-simulator/install/lib/libglslang_combined.a - lipo -create build/install/lib/libglslang_combined.a build-simulator/install/lib/libglslang_combined.a -o glslang.framework/Versions/A/glslang - cp -a build/install/include/glslang glslang.framework/Versions/A/Headers/ - sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create build/install/lib/libncnn.a build-simulator/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn - cp -a build/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - android: - needs: [setup] - runs-on: ubuntu-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android - steps: - - uses: actions/checkout@v3 - - name: ndk-fix-debug - run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake - - name: build-armv7 - run: | - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release 
-DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \ - -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: build-aarch64 - run: | - mkdir build-aarch64 && cd build-aarch64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \ - -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: build-x86 - run: | - mkdir build-x86 && cd build-x86 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \ - -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \ - -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a - cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a - cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 - cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - android-shared: - needs: [setup] - runs-on: ubuntu-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android-shared - steps: - - uses: actions/checkout@v3 - - name: ndk-fix-debug - run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake - - name: build-armv7 - run: | - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \ - -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: build-aarch64 - run: | - mkdir build-aarch64 && cd build-aarch64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \ - -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . 
-j 2 - cmake --build . --target install/strip - - name: build-x86 - run: | - mkdir build-x86 && cd build-x86 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \ - -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \ - -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a - cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a - cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 - cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - android-gpu: - needs: [setup] - runs-on: ubuntu-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android-vulkan - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: cache-vulkansdk - id: cache-vulkansdk - uses: actions/cache@v3 - with: - path: "1.3.236.0" - key: vulkansdk-linux-x86_64-1.3.236.0 - - name: vulkansdk - if: steps.cache-vulkansdk.outputs.cache-hit != 'true' - run: | - wget https://sdk.lunarg.com/sdk/download/1.3.236.0/linux/vulkansdk-linux-x86_64-1.3.236.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.3.236.0.tar.gz - tar -xf vulkansdk-linux-x86_64-1.3.236.0.tar.gz - rm -rf 1.3.236.0/source 1.3.236.0/samples - find 1.3.236.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm - - name: ndk-fix-debug - run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake - - name: build-armv7 - run: | - export PATH=`pwd`/1.3.236.0/x86_64/bin:$PATH - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-24 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: build-aarch64 - run: | - export PATH=`pwd`/1.3.236.0/x86_64/bin:$PATH - mkdir build-aarch64 && cd build-aarch64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 2 - cmake --build . 
--target install/strip - - name: build-x86 - run: | - export PATH=`pwd`/1.3.236.0/x86_64/bin:$PATH - mkdir build-x86 && cd build-x86 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-24 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: build-x86_64 - run: | - export PATH=`pwd`/1.3.236.0/x86_64/bin:$PATH - mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-24 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a - cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a - cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 - cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - android-gpu-shared: - needs: [setup] - runs-on: ubuntu-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android-vulkan-shared - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: cache-vulkansdk - id: cache-vulkansdk - uses: actions/cache@v3 - with: - path: "1.3.236.0" - key: vulkansdk-linux-x86_64-1.3.236.0 - - name: vulkansdk - if: steps.cache-vulkansdk.outputs.cache-hit != 'true' - run: | - wget https://sdk.lunarg.com/sdk/download/1.3.236.0/linux/vulkansdk-linux-x86_64-1.3.236.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.3.236.0.tar.gz - tar -xf vulkansdk-linux-x86_64-1.3.236.0.tar.gz - rm -rf 1.3.236.0/source 1.3.236.0/samples - find 1.3.236.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm - - name: ndk-fix-debug - run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake - - name: build-armv7 - run: | - export PATH=`pwd`/1.3.236.0/x86_64/bin:$PATH - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-24 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j 2 - cmake --build . 
--target install/strip - - name: build-aarch64 - run: | - export PATH=`pwd`/1.3.236.0/x86_64/bin:$PATH - mkdir build-aarch64 && cd build-aarch64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: build-x86 - run: | - export PATH=`pwd`/1.3.236.0/x86_64/bin:$PATH - mkdir build-x86 && cd build-x86 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-24 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: build-x86_64 - run: | - export PATH=`pwd`/1.3.236.0/x86_64/bin:$PATH - mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-24 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a - cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a - cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 - cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - webassembly: - needs: [setup] - runs-on: ubuntu-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-webassembly - steps: - - uses: actions/checkout@v3 - - name: emsdk - run: | - git clone https://github.com/emscripten-core/emsdk.git - cd emsdk - ./emsdk install $EMSCRIPTEN_VERSION - ./emsdk activate $EMSCRIPTEN_VERSION - - name: build - run: | - source emsdk/emsdk_env.sh - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 2 - cmake --build . 
--target install/strip - - name: build-simd - run: | - source emsdk/emsdk_env.sh - mkdir build-simd && cd build-simd - cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: build-threads - run: | - source emsdk/emsdk_env.sh - mkdir build-threads && cd build-threads - cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: build-simd-threads - run: | - source emsdk/emsdk_env.sh - mkdir build-simd-threads && cd build-simd-threads - cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build/install ${{ env.PACKAGENAME }}/basic - cp -a build-simd/install ${{ env.PACKAGENAME }}/simd - cp -a build-threads/install ${{ env.PACKAGENAME }}/threads - cp -a build-simd-threads/install ${{ env.PACKAGENAME }}/simd-threads - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2015: - needs: [setup] - runs-on: windows-2019 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2015 - UseMultiToolTask: true - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v3 - with: - path: "protobuf-install" - key: protobuf-vs2015-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - cd .. 
- mkdir build-x64; cd build-x64; - cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: vulkansdk - run: | - Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.236.0/windows/VulkanSDK-1.3.236.0-Installer.exe?Human=true -OutFile VulkanSDK.exe - .\VulkanSDK.exe --accept-licenses --default-answer --confirm-command install com.lunarg.vulkan.32bit - - name: build-x86 - run: | - $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" - $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" - mkdir build-x86; cd build-x86 - cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x86\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x86\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: build-x64 - run: | - $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" - $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" - mkdir build-x64; cd build-x64 - cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x64\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x64\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2015-shared: - needs: [setup] - runs-on: windows-2019 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2015-shared - UseMultiToolTask: true - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v3 - with: - path: "protobuf-install" - key: protobuf-vs2015-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - cd .. 
- mkdir build-x64; cd build-x64; - cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: vulkansdk - run: | - Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.236.0/windows/VulkanSDK-1.3.236.0-Installer.exe?Human=true -OutFile VulkanSDK.exe - .\VulkanSDK.exe --accept-licenses --default-answer --confirm-command install com.lunarg.vulkan.32bit - - name: build-x86 - run: | - $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" - $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" - mkdir build-x86; cd build-x86 - cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x86\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x86\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: build-x64 - run: | - $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" - $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" - mkdir build-x64; cd build-x64 - cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x64\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x64\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2017: - needs: [setup] - runs-on: windows-2019 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2017 - UseMultiToolTask: true - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v3 - with: - path: "protobuf-install" - key: protobuf-vs2017-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - cd .. 
- mkdir build-x64; cd build-x64; - cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: vulkansdk - run: | - Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.236.0/windows/VulkanSDK-1.3.236.0-Installer.exe?Human=true -OutFile VulkanSDK.exe - .\VulkanSDK.exe --accept-licenses --default-answer --confirm-command install com.lunarg.vulkan.32bit - - name: build-x86 - run: | - $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" - $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" - mkdir build-x86; cd build-x86 - cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x86\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x86\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: build-x64 - run: | - $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" - $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" - mkdir build-x64; cd build-x64 - cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x64\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x64\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2017-shared: - needs: [setup] - runs-on: windows-2019 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2017-shared - UseMultiToolTask: true - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v3 - with: - path: "protobuf-install" - key: protobuf-vs2017-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - cd .. 
- mkdir build-x64; cd build-x64; - cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: vulkansdk - run: | - Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.236.0/windows/VulkanSDK-1.3.236.0-Installer.exe?Human=true -OutFile VulkanSDK.exe - .\VulkanSDK.exe --accept-licenses --default-answer --confirm-command install com.lunarg.vulkan.32bit - - name: build-x86 - run: | - $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" - $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" - mkdir build-x86; cd build-x86 - cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x86\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x86\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: build-x64 - run: | - $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" - $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" - mkdir build-x64; cd build-x64 - cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x64\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x64\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2019: - needs: [setup] - runs-on: windows-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2019 - UseMultiToolTask: true - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v3 - with: - path: "protobuf-install" - key: protobuf-vs2019-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - cd .. 
- mkdir build-x64; cd build-x64; - cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: vulkansdk - run: | - Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.236.0/windows/VulkanSDK-1.3.236.0-Installer.exe?Human=true -OutFile VulkanSDK.exe - .\VulkanSDK.exe --accept-licenses --default-answer --confirm-command install com.lunarg.vulkan.32bit - - name: build-x86 - run: | - $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" - $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" - mkdir build-x86; cd build-x86 - cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x86\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x86\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: build-x64 - run: | - $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" - $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" - mkdir build-x64; cd build-x64 - cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x64\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x64\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: build-arm - run: | - mkdir build-arm; cd build-arm - cmake -T v142,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: build-arm64 - run: | - mkdir build-arm64; cd build-arm64 - cmake -T v142,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 2 - cmake --build . 
--config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - mkdir ${{ env.PACKAGENAME }}/arm - mkdir ${{ env.PACKAGENAME }}/arm64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - Copy-Item -Verbose -Recurse -Path "build-arm\install\*" -Destination "${{ env.PACKAGENAME }}\arm" - Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2019-shared: - needs: [setup] - runs-on: windows-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2019-shared - UseMultiToolTask: true - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v3 - with: - path: "protobuf-install" - key: protobuf-vs2019-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: vulkansdk - run: | - Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.236.0/windows/VulkanSDK-1.3.236.0-Installer.exe?Human=true -OutFile VulkanSDK.exe - .\VulkanSDK.exe --accept-licenses --default-answer --confirm-command install com.lunarg.vulkan.32bit - - name: build-x86 - run: | - $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" - $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" - mkdir build-x86; cd build-x86 - cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x86\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x86\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 2 - cmake --build . 
--config Release --target install - - name: build-x64 - run: | - $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" - $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" - mkdir build-x64; cd build-x64 - cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x64\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x64\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: build-arm - run: | - mkdir build-arm; cd build-arm - cmake -T v142,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: build-arm64 - run: | - mkdir build-arm64; cd build-arm64 - cmake -T v142,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - mkdir ${{ env.PACKAGENAME }}/arm - mkdir ${{ env.PACKAGENAME }}/arm64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - Copy-Item -Verbose -Recurse -Path "build-arm\install\*" -Destination "${{ env.PACKAGENAME }}\arm" - Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2022: - needs: [setup] - runs-on: windows-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2022 - UseMultiToolTask: true - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v3 - with: - path: "protobuf-install" - key: protobuf-vs2022-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 2 - cmake --build . 
--config Release --target install - - name: vulkansdk - run: | - Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.236.0/windows/VulkanSDK-1.3.236.0-Installer.exe?Human=true -OutFile VulkanSDK.exe - .\VulkanSDK.exe --accept-licenses --default-answer --confirm-command install com.lunarg.vulkan.32bit - - name: build-x86 - run: | - $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" - $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" - mkdir build-x86; cd build-x86 - cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x86\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x86\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: build-x64 - run: | - $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" - $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" - mkdir build-x64; cd build-x64 - cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x64\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x64\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: build-arm - run: | - mkdir build-arm; cd build-arm - cmake -T v143,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: build-arm64 - run: | - mkdir build-arm64; cd build-arm64 - cmake -T v143,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 2 - cmake --build . 
--config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - mkdir ${{ env.PACKAGENAME }}/arm - mkdir ${{ env.PACKAGENAME }}/arm64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - Copy-Item -Verbose -Recurse -Path "build-arm\install\*" -Destination "${{ env.PACKAGENAME }}\arm" - Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2022-shared: - needs: [setup] - runs-on: windows-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2022-shared - UseMultiToolTask: true - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v3 - with: - path: "protobuf-install" - key: protobuf-vs2022-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: vulkansdk - run: | - Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.236.0/windows/VulkanSDK-1.3.236.0-Installer.exe?Human=true -OutFile VulkanSDK.exe - .\VulkanSDK.exe --accept-licenses --default-answer --confirm-command install com.lunarg.vulkan.32bit - - name: build-x86 - run: | - $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" - $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" - mkdir build-x86; cd build-x86 - cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x86\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x86\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 2 - cmake --build . 
--config Release --target install - - name: build-x64 - run: | - $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" - $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" - mkdir build-x64; cd build-x64 - cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x64\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x64\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: build-arm - run: | - mkdir build-arm; cd build-arm - cmake -T v143,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: build-arm64 - run: | - mkdir build-arm64; cd build-arm64 - cmake -T v143,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - mkdir ${{ env.PACKAGENAME }}/arm - mkdir ${{ env.PACKAGENAME }}/arm64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - Copy-Item -Verbose -Recurse -Path "build-arm\install\*" -Destination "${{ env.PACKAGENAME }}\arm" - Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v3 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - release: - permissions: - contents: write # for softprops/action-gh-release to create a release - needs: [setup, full-source, ubuntu-2004, ubuntu-2004-shared, ubuntu-2204, ubuntu-2204-shared, macos, macos-gpu, ios, ios-gpu, ios-bitcode, ios-gpu-bitcode, android, android-shared, android-gpu, android-gpu-shared, webassembly, windows-vs2015, windows-vs2015-shared, windows-vs2017, windows-vs2017-shared, windows-vs2019, windows-vs2019-shared, windows-vs2022, windows-vs2022-shared] - runs-on: ubuntu-latest - steps: - - name: download - uses: actions/download-artifact@v3 - with: - path: artifacts - - - name: create-release - uses: softprops/action-gh-release@v1 - with: - token: ${{ secrets.GITHUB_TOKEN }} - tag_name: ${{ needs.setup.outputs.VERSION }} - name: Release ${{ needs.setup.outputs.VERSION }} - files: artifacts/*/*.zip diff --git a/.github/workflows/sync-wiki.yml b/.github/workflows/sync-wiki.yml deleted file mode 100644 index 11dfa08058b..00000000000 --- a/.github/workflows/sync-wiki.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: sync-wiki -on: - push: - branches: [master] - paths: - - '.github/workflows/sync-wiki.yml' - - 'docs/**' -concurrency: - group: sync-wiki-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - 
-jobs:
-  sync-wiki:
-    permissions:
-      contents: write # for Git to git push
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v3
-    - name: sync
-      run: |
-        cp -r docs $GITHUB_WORKSPACE/ncnn.wiki
-        cd $GITHUB_WORKSPACE/ncnn.wiki
-        git config --global user.name "wiki-sync-bot"
-        git config --global user.email "wiki-sync-bot@qq.com"
-        git init
-        git add .
-        git commit -m "sync"
-        git remote add upstream https://${{ secrets.WIKI_SYNC_BOT_TOKEN }}@github.com/Tencent/ncnn.wiki.git
-        git push upstream master -f
diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml
deleted file mode 100644
index 87401acd00f..00000000000
--- a/.github/workflows/test-coverage.yml
+++ /dev/null
@@ -1,147 +0,0 @@
-name: test-coverage
-on:
-  push:
-    branches: [master]
-    paths:
-    - '.github/workflows/test-coverage.yml'
-    - 'CMakeLists.txt'
-    - 'cmake/**'
-    - 'src/**'
-    - 'tests/**'
-    - 'toolchains/**'
-  pull_request:
-    branches: [master]
-    paths:
-    - '.github/workflows/test-coverage.yml'
-    - 'CMakeLists.txt'
-    - 'cmake/**'
-    - 'src/**'
-    - 'tests/**'
-    - 'toolchains/**'
-concurrency:
-  group: test-coverage-${{ github.ref }}
-  cancel-in-progress: true
-permissions:
-  contents: read
-
-jobs:
-  linux-gcc-gpu-t4:
-    runs-on: [self-hosted, linux, t4]
-    steps:
-    - uses: actions/checkout@v3
-      with:
-        submodules: true
-    - name: build
-      env:
-        CC: gcc
-        CXX: g++
-        LD_LIBRARY_PATH: /data/action/install/lib64
-      run: |
-        export VULKAN_SDK=/data/action/osd/1.2.189.0/x86_64
-        mkdir build && cd build
-        cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_VULKAN=ON -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_XOP=OFF -DNCNN_AVXVNNI=OFF -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
-        cmake --build . -j 4
-    - name: test
-      env:
-        LD_LIBRARY_PATH: /data/action/install/lib64
-      run: cd build && ctest --output-on-failure -j 4
-    - name: lcov-collect
-      run: |
-        cd build
-        lcov -d ./src -c -o lcov.info
-        lcov -r lcov.info '/usr/*' -o lcov.info
-        lcov -r lcov.info '*/install/*' -o lcov.info
-        lcov -r lcov.info '*/build/*' -o lcov.info
-        lcov --list lcov.info
-    - name: codecov
-      id: codecov
-      continue-on-error: true
-      uses: codecov/codecov-action@v3
-      with:
-        token: ${{ secrets.CODECOV_TOKEN }}
-        file: build/lcov.info
-    - name: codecov-vlen256-retry-1
-      continue-on-error: true
-      id: codecov-vlen256-retry-1
-      if: steps.codecov.outcome=='failure'
-      uses: codecov/codecov-action@v3
-      with:
-        token: ${{ secrets.CODECOV_TOKEN }}
-        file: build/lcov.info
-    - name: codecov-vlen256-retry-2
-      continue-on-error: true
-      id: codecov-vlen256-retry-2
-      if: steps.codecov-vlen256-retry-1.outcome=='failure'
-      uses: codecov/codecov-action@v3
-      with:
-        token: ${{ secrets.CODECOV_TOKEN }}
-        file: build/lcov.info
-    - name: codecov-vlen256-retry-3
-      continue-on-error: true
-      id: codecov-vlen256-retry-3
-      if: steps.codecov-vlen256-retry-2.outcome=='failure'
-      uses: codecov/codecov-action@v3
-      with:
-        token: ${{ secrets.CODECOV_TOKEN }}
-        file: build/lcov.info
-    - name: codecov-vlen256-retry-4
-      continue-on-error: true
-      id: codecov-vlen256-retry-4
-      if: steps.codecov-vlen256-retry-3.outcome=='failure'
-      uses: codecov/codecov-action@v3
-      with:
-        token: ${{ secrets.CODECOV_TOKEN }}
-        file: build/lcov.info
-    - name: codecov-vlen256-retry-5
-      continue-on-error: true
-      id: codecov-vlen256-retry-5
-      if: steps.codecov-vlen256-retry-4.outcome=='failure'
-      uses: codecov/codecov-action@v3
-      with:
-        token: ${{ secrets.CODECOV_TOKEN }}
-        file: build/lcov.info
-    - name: set the status
-      if: always()
-      run: |
-        if ${{ steps.codecov.outcome=='success' || steps.codecov-vlen256-retry-1.outcome=='success' || steps.codecov-vlen256-retry-2.outcome=='success' || steps.codecov-vlen256-retry-3.outcome=='success' || steps.codecov-vlen256-retry-4.outcome=='success' || steps.codecov-vlen256-retry-5.outcome=='success' }}; then
-          echo fine
-        else
-          exit 1
-        fi
-
-  linux-gcc-x64-avx512-spr:
-    runs-on: ubuntu-22.04
-    steps:
-    - uses: actions/checkout@v3
-    - name: update
-      run: sudo apt-get update
-    - name: gcc12
-      run: sudo apt-get install gcc-12 g++-12
-    - name: lcov
-      run: sudo apt-get install lcov
-    - name: Setup SDE binaries
-      uses: petarpetrovt/setup-sde@v2
-    - name: build-avx512-spr
-      env:
-        CC: gcc-12
-        CXX: g++-12
-      run: |
-        mkdir build-avx512-spr && cd build-avx512-spr
-        cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_AVX512BF16=ON -DNCNN_AVX512FP16=ON -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
-        cmake --build . -j 2
-    - name: test-avx512-spr
-      run: |
-        cd build-avx512-spr
-        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-spr;--" ctest --output-on-failure -j 2
-    - name: lcov-collect
-      run: |
-        cd build-avx512-spr
-        lcov --gcov-tool gcov-12 -d ./src -c -o lcov.info
-        lcov -r lcov.info '/usr/*' -o lcov.info
-        lcov -r lcov.info '*/build-avx512-spr/*' -o lcov.info
-        lcov --list lcov.info
-    - name: codecov-avx512-spr
-      uses: codecov/codecov-action@v3
-      with:
-        token: ${{ secrets.CODECOV_TOKEN }}
-        file: build-avx512-spr/lcov.info
diff --git a/.github/workflows/web-assembly.yml b/.github/workflows/web-assembly.yml
deleted file mode 100644
index 61756a2d059..00000000000
--- a/.github/workflows/web-assembly.yml
+++ /dev/null
@@ -1,76 +0,0 @@
-name: web-assembly
-on:
-  push:
-    branches: [master]
-    paths:
-    - '.github/workflows/web-assembly.yml'
-    - 'CMakeLists.txt'
-    - 'cmake/**'
-    - 'src/*'
-    - 'src/layer/*'
-    - 'src/layer/x86/**'
-    - 'tests/**'
-  pull_request:
-    branches: [master]
-    paths:
-    - '.github/workflows/web-assembly.yml'
-    - 'CMakeLists.txt'
-    - 'cmake/**'
-    - 'src/*'
-    - 'src/layer/*'
-    - 'src/layer/x86/**'
-    - 'tests/**'
-
-env:
-  EMSCRIPTEN_VERSION: 3.1.28
-
-concurrency:
-  group: web-assembly-${{ github.ref }}
-  cancel-in-progress: true
-permissions:
-  contents: read
-
-jobs:
-  webassembly:
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v3
-    - name: emsdk
-      run: |
-        git clone https://github.com/emscripten-core/emsdk.git
-        cd emsdk
-        ./emsdk install $EMSCRIPTEN_VERSION
-        ./emsdk activate $EMSCRIPTEN_VERSION
-    - name: build-basic
-      run: |
-        source emsdk/emsdk_env.sh
-        export LDFLAGS="-sERROR_ON_WASM_CHANGES_AFTER_LINK -sWASM_BIGINT -O1"
-        mkdir build-basic && cd build-basic
-        cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON ..
-        cmake --build . -j 2
-    - name: test-basic
-      run: |
-        cd build-basic
-        TESTS_EXECUTABLE_LOADER=node ctest --output-on-failure -j 2
-    - name: build-simd
-      run: |
-        source emsdk/emsdk_env.sh
-        export LDFLAGS="-sERROR_ON_WASM_CHANGES_AFTER_LINK -sWASM_BIGINT -O1"
-        mkdir build-simd && cd build-simd
-        cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON ..
-        cmake --build . -j 2
-    - name: test-simd
-      run: |
-        cd build-simd
-        TESTS_EXECUTABLE_LOADER=node TESTS_EXECUTABLE_LOADER_ARGUMENTS="--experimental-wasm-simd" ctest --output-on-failure -j 2
-    - name: build-simd-omp
-      run: |
-        source emsdk/emsdk_env.sh
-        export LDFLAGS="-sERROR_ON_WASM_CHANGES_AFTER_LINK -sWASM_BIGINT -O1"
-        mkdir build-simd-omp && cd build-simd-omp
-        cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON ..
-        cmake --build . -j 2
-    - name: test-simd-omp
-      run: |
-        cd build-simd-omp
-        TESTS_EXECUTABLE_LOADER=node TESTS_EXECUTABLE_LOADER_ARGUMENTS="--experimental-wasm-simd;--experimental-wasm-threads" ctest --output-on-failure -j 2
diff --git a/.github/workflows/windows-arm-cpu.yml b/.github/workflows/windows-arm-cpu.yml
deleted file mode 100644
index d789482a595..00000000000
--- a/.github/workflows/windows-arm-cpu.yml
+++ /dev/null
@@ -1,57 +0,0 @@
-name: windows-arm-cpu
-on:
-  push:
-    branches: [master]
-    paths:
-    - '.github/workflows/windows-arm-cpu.yml'
-    - 'CMakeLists.txt'
-    - 'cmake/**'
-    - 'src/*'
-    - 'src/layer/*'
-    - 'src/layer/arm/**'
-    - 'tests/**'
-  pull_request:
-    branches: [master]
-    paths:
-    - '.github/workflows/windows-arm-cpu.yml'
-    - 'CMakeLists.txt'
-    - 'cmake/**'
-    - 'src/*'
-    - 'src/layer/*'
-    - 'src/layer/arm/**'
-    - 'tests/**'
-concurrency:
-  group: windows-arm-cpu-${{ github.ref }}
-  cancel-in-progress: true
-permissions:
-  contents: read
-
-jobs:
-  windows:
-    name: ${{ matrix.vs-version }}
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        include:
-          - vs-version: vs2019
-            toolset-version: v142
-            os: windows-2022
-
-          - vs-version: vs2022
-            toolset-version: v143
-            os: windows-2022
-
-    env:
-      UseMultiToolTask: true
-    steps:
-    - uses: actions/checkout@v3
-    - name: build
-      run: |
-        mkdir build; cd build
-        cmake -T ${{ matrix.toolset-version }},host=x64 -A arm -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
-        cmake --build . --config Release -j 2
-    - name: build-shared
-      run: |
-        mkdir build-shared; cd build-shared
-        cmake -T ${{ matrix.toolset-version }},host=x64 -A arm -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON ..
-        cmake --build . --config Release -j 2
diff --git a/.github/workflows/windows-arm64-cpu.yml b/.github/workflows/windows-arm64-cpu.yml
deleted file mode 100644
index a6bdbda01de..00000000000
--- a/.github/workflows/windows-arm64-cpu.yml
+++ /dev/null
@@ -1,57 +0,0 @@
-name: windows-arm64-cpu
-on:
-  push:
-    branches: [master]
-    paths:
-    - '.github/workflows/windows-arm64-cpu.yml'
-    - 'CMakeLists.txt'
-    - 'cmake/**'
-    - 'src/*'
-    - 'src/layer/*'
-    - 'src/layer/arm/**'
-    - 'tests/**'
-  pull_request:
-    branches: [master]
-    paths:
-    - '.github/workflows/windows-arm64-cpu.yml'
-    - 'CMakeLists.txt'
-    - 'cmake/**'
-    - 'src/*'
-    - 'src/layer/*'
-    - 'src/layer/arm/**'
-    - 'tests/**'
-concurrency:
-  group: windows-arm64-cpu-${{ github.ref }}
-  cancel-in-progress: true
-permissions:
-  contents: read
-
-jobs:
-  windows:
-    name: ${{ matrix.vs-version }}
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        include:
-          - vs-version: vs2019
-            toolset-version: v142
-            os: windows-2022
-
-          - vs-version: vs2022
-            toolset-version: v143
-            os: windows-2022
-
-    env:
-      UseMultiToolTask: true
-    steps:
-    - uses: actions/checkout@v3
-    - name: build
-      run: |
-        mkdir build; cd build
-        cmake -T ${{ matrix.toolset-version }},host=x64 -A arm64 -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
-        cmake --build . --config Release -j 2
-    - name: build-shared
-      run: |
-        mkdir build-shared; cd build-shared
-        cmake -T ${{ matrix.toolset-version }},host=x64 -A arm64 -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON ..
-        cmake --build .
--config Release -j 2 diff --git a/.github/workflows/windows-x64-cpu-vs2019-python.yml b/.github/workflows/windows-x64-cpu-vs2019-python.yml deleted file mode 100644 index 3d4e6583766..00000000000 --- a/.github/workflows/windows-x64-cpu-vs2019-python.yml +++ /dev/null @@ -1,67 +0,0 @@ -name: windows-x64-cpu-vs2019-python -on: - push: - branches: [master] - paths: - - '.github/workflows/windows-x64-cpu-vs2019-python.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'python/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/windows-x64-cpu-vs2019-python.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'python/**' -concurrency: - group: windows-x64-cpu-vs2019-python-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - windows-vs2019-python: - runs-on: windows-latest - strategy: - matrix: - python-version: [3.6, 3.7, 3.8, 3.9] - env: - UseMultiToolTask: true - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: set up python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - name: install dependencies - run: | - python -m pip install --upgrade pip - pip install pytest setuptools wheel twine - - name: build - run: | - mkdir build; cd build - cmake -T v142,host=x64 -A x64 -DNCNN_PYTHON=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=OFF .. - cmake --build . --config Release -j 2 - - name: install python - run: cd python && pip install . - - name: test - run: cd python && pytest tests - - name: build and publish - if: startsWith(github.ref, 'refs/tags') - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.TEST_PYPI_API_TOKEN }} - TWINE_REPOSITORY_URL: "https://test.pypi.org/legacy/" - run: | - cd python - python setup.py bdist_wheel - twine upload dist/* diff --git a/.github/workflows/windows-x64-cpu.yml b/.github/workflows/windows-x64-cpu.yml deleted file mode 100644 index 200185d1a56..00000000000 --- a/.github/workflows/windows-x64-cpu.yml +++ /dev/null @@ -1,102 +0,0 @@ -name: windows-x64-cpu -on: - push: - branches: [master] - paths: - - '.github/workflows/windows-x64-cpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 'examples/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/windows-x64-cpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 'examples/**' -concurrency: - group: windows-x64-cpu-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - windows: - name: ${{ matrix.vs-version }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - include: - - vs-version: vs2015 - toolset-version: v140 - os: windows-2019 - - - vs-version: vs2017 - toolset-version: v141 - os: windows-2019 - - - vs-version: vs2019 - toolset-version: v142 - os: windows-2022 - - - vs-version: vs2022 - toolset-version: v143 - os: windows-2022 - - env: - UseMultiToolTask: true - steps: - - uses: actions/checkout@v3 - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v3 - with: - path: "protobuf-install" - key: protobuf-${{ matrix.vs-version }}-x64-install-2 - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri 
https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-${{ matrix.vs-version }}; cd build-${{ matrix.vs-version }}; cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF -DNCNN_BUILD_TESTS=ON ../cmake - cmake --build . --config Release -j 2 - cmake --build . --config Release --target install - - name: build-sse2 - run: | - mkdir build-sse2; cd build-sse2 - cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\bin\protoc.exe" -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . --config Release -j 2 - - name: test-sse2 - run: cd build-sse2; ctest -C Release --output-on-failure -j 2 - - name: build-shared - run: | - mkdir build-shared; cd build-shared - cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\bin\protoc.exe" -DNCNN_RUNTIME_CPU=ON -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 2 - - name: build-avx2 - run: | - mkdir build-avx2; cd build-avx2 - cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\bin\protoc.exe" -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_AVXVNNI=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . --config Release -j 2 - - name: test-avx2 - run: cd build-avx2; ctest -C Release --output-on-failure -j 2 - - name: build-avx - run: | - mkdir build-avx; cd build-avx - cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\bin\protoc.exe" -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=OFF -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . 
--config Release -j 2 - - name: test-avx - run: cd build-avx; ctest -C Release --output-on-failure -j 2 diff --git a/.github/workflows/windows-x64-gpu.yml b/.github/workflows/windows-x64-gpu.yml index 118e408498c..53ec0ef71bc 100644 --- a/.github/workflows/windows-x64-gpu.yml +++ b/.github/workflows/windows-x64-gpu.yml @@ -60,6 +60,8 @@ jobs: env: UseMultiToolTask: true steps: + - name: Setup Debug Session + uses: csexton/debugger-action@master - uses: actions/checkout@v3 with: submodules: true diff --git a/.github/workflows/windows-x86-cpu.yml b/.github/workflows/windows-x86-cpu.yml deleted file mode 100644 index b48431a97ac..00000000000 --- a/.github/workflows/windows-x86-cpu.yml +++ /dev/null @@ -1,67 +0,0 @@ -name: windows-x86-cpu -on: - push: - branches: [master] - paths: - - '.github/workflows/windows-x86-cpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/windows-x86-cpu.yml' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' -concurrency: - group: windows-x86-cpu-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - windows-x86: - name: ${{ matrix.vs-version }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - include: - - vs-version: vs2015 - toolset-version: v140 - os: windows-2019 - - - vs-version: vs2017 - toolset-version: v141 - os: windows-2019 - - - vs-version: vs2019 - toolset-version: v142 - os: windows-2022 - - - vs-version: vs2022 - toolset-version: v143 - os: windows-2022 - - env: - UseMultiToolTask: true - steps: - - uses: actions/checkout@v3 - - name: build - run: | - mkdir build; cd build - cmake -T ${{ matrix.toolset-version }},host=x64 -A Win32 -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . --config Release -j 2 - - name: test - run: cd build; ctest -C Release --output-on-failure -j 2 - - name: build-shared - run: | - mkdir build-shared; cd build-shared - cmake -T ${{ matrix.toolset-version }},host=x64 -A Win32 -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . 
--config Release -j 2 diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index f8d5f5c4c5a..db65e10a33d 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -229,10 +229,10 @@ struct compute_coord int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { using namespace GridSample_x86_kernel; - const Mat& bottom_blob = bottom_blobs[0]; + Mat& bottom_blob = bottom_blobs[0].clone(); const Mat& grid = bottom_blobs[1]; Mat& top_blob = top_blobs[0]; - const int elempack = bottom_blob.elempack; + int elempack = bottom_blob.elempack; int channels = bottom_blob.c; int dims = bottom_blob.dims; @@ -557,6 +557,12 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector Date: Tue, 25 Apr 2023 02:28:47 +0800 Subject: [PATCH 098/127] use volatile to fix swiftshader bugs --- .github/workflows/android-armv7-cpu.yml | 56 + .github/workflows/android-armv7-gpu.yml | 45 + .github/workflows/android-armv8-cpu.yml | 41 + .github/workflows/android-armv8-gpu.yml | 75 + .github/workflows/android-x64-cpu.yml | 41 + .github/workflows/android-x64-gpu.yml | 45 + .github/workflows/android-x86-cpu.yml | 41 + .github/workflows/android-x86-gpu.yml | 45 + .github/workflows/code-format.yml | 61 + .github/workflows/codeql-analysis.yml | 84 + .github/workflows/elf-riscv32-cpu-gcc.yml | 126 ++ .github/workflows/elf-riscv64-cpu-gcc.yml | 123 ++ .github/workflows/ios-arm64-gpu.yml | 86 + .github/workflows/ios-cpu.yml | 82 + .github/workflows/ios-simulator.yml | 73 + .github/workflows/linux-aarch64-cpu-gcc.yml | 193 ++ .github/workflows/linux-arm-cpu-gcc.yml | 206 ++ .../workflows/linux-loongarch64-cpu-gcc.yml | 78 + .github/workflows/linux-mips-cpu-gcc.yml | 126 ++ .github/workflows/linux-mips64-cpu-gcc.yml | 138 ++ .github/workflows/linux-ppc64-cpu-gcc.yml | 75 + .github/workflows/linux-riscv64-cpu-gcc.yml | 186 ++ .../workflows/linux-riscv64-cpu-gnu-clang.yml | 142 ++ .../workflows/linux-x64-cpu-clang-python.yml | 68 + .github/workflows/linux-x64-cpu-clang.yml | 128 ++ .github/workflows/linux-x64-cpu-gcc-musl.yml | 67 + .github/workflows/linux-x64-cpu-gcc-san.yml | 42 + .github/workflows/linux-x64-cpu-gcc-sde.yml | 57 + .github/workflows/linux-x64-cpu-gcc.yml | 134 ++ .../workflows/linux-x64-gpu-clang-python.yml | 113 ++ .github/workflows/linux-x64-gpu-clang.yml | 91 + .github/workflows/linux-x64-gpu-gcc.yml | 128 ++ .github/workflows/linux-x86-cpu-clang.yml | 67 + .github/workflows/linux-x86-cpu-gcc.yml | 65 + .github/workflows/macos-arm64-cpu.yml | 76 + .github/workflows/macos-arm64-gpu.yml | 92 + .github/workflows/macos-x64-cpu-python.yml | 96 + .github/workflows/macos-x64-cpu.yml | 88 + .github/workflows/macos-x64-gpu.yml | 131 ++ .github/workflows/release-python.yml | 165 ++ .github/workflows/release.yml | 1802 +++++++++++++++++ .github/workflows/sync-wiki.yml | 32 + .github/workflows/test-coverage.yml | 147 ++ .github/workflows/web-assembly.yml | 76 + .github/workflows/windows-arm-cpu.yml | 57 + .github/workflows/windows-arm64-cpu.yml | 57 + .../windows-x64-cpu-vs2019-python.yml | 67 + .github/workflows/windows-x64-cpu.yml | 102 + .github/workflows/windows-x64-gpu.yml | 2 - .github/workflows/windows-x86-cpu.yml | 67 + .../x86/gridsample_bicubic_compute_blob.h | 156 +- .../x86/gridsample_bilinear_compute_blob.h | 320 ++- .../x86/gridsample_nearest_compute_blob.h | 150 +- src/layer/x86/gridsample_x86.cpp | 22 +- tests/test_gridsample.cpp | 80 +- 55 files changed, 6528 insertions(+), 385 
deletions(-) create mode 100644 .github/workflows/android-armv7-cpu.yml create mode 100644 .github/workflows/android-armv7-gpu.yml create mode 100644 .github/workflows/android-armv8-cpu.yml create mode 100644 .github/workflows/android-armv8-gpu.yml create mode 100644 .github/workflows/android-x64-cpu.yml create mode 100644 .github/workflows/android-x64-gpu.yml create mode 100644 .github/workflows/android-x86-cpu.yml create mode 100644 .github/workflows/android-x86-gpu.yml create mode 100644 .github/workflows/code-format.yml create mode 100644 .github/workflows/codeql-analysis.yml create mode 100644 .github/workflows/elf-riscv32-cpu-gcc.yml create mode 100644 .github/workflows/elf-riscv64-cpu-gcc.yml create mode 100644 .github/workflows/ios-arm64-gpu.yml create mode 100644 .github/workflows/ios-cpu.yml create mode 100644 .github/workflows/ios-simulator.yml create mode 100644 .github/workflows/linux-aarch64-cpu-gcc.yml create mode 100644 .github/workflows/linux-arm-cpu-gcc.yml create mode 100644 .github/workflows/linux-loongarch64-cpu-gcc.yml create mode 100644 .github/workflows/linux-mips-cpu-gcc.yml create mode 100644 .github/workflows/linux-mips64-cpu-gcc.yml create mode 100644 .github/workflows/linux-ppc64-cpu-gcc.yml create mode 100644 .github/workflows/linux-riscv64-cpu-gcc.yml create mode 100644 .github/workflows/linux-riscv64-cpu-gnu-clang.yml create mode 100644 .github/workflows/linux-x64-cpu-clang-python.yml create mode 100644 .github/workflows/linux-x64-cpu-clang.yml create mode 100644 .github/workflows/linux-x64-cpu-gcc-musl.yml create mode 100644 .github/workflows/linux-x64-cpu-gcc-san.yml create mode 100644 .github/workflows/linux-x64-cpu-gcc-sde.yml create mode 100644 .github/workflows/linux-x64-cpu-gcc.yml create mode 100644 .github/workflows/linux-x64-gpu-clang-python.yml create mode 100644 .github/workflows/linux-x64-gpu-clang.yml create mode 100644 .github/workflows/linux-x64-gpu-gcc.yml create mode 100644 .github/workflows/linux-x86-cpu-clang.yml create mode 100644 .github/workflows/linux-x86-cpu-gcc.yml create mode 100644 .github/workflows/macos-arm64-cpu.yml create mode 100644 .github/workflows/macos-arm64-gpu.yml create mode 100644 .github/workflows/macos-x64-cpu-python.yml create mode 100644 .github/workflows/macos-x64-cpu.yml create mode 100644 .github/workflows/macos-x64-gpu.yml create mode 100644 .github/workflows/release-python.yml create mode 100644 .github/workflows/release.yml create mode 100644 .github/workflows/sync-wiki.yml create mode 100644 .github/workflows/test-coverage.yml create mode 100644 .github/workflows/web-assembly.yml create mode 100644 .github/workflows/windows-arm-cpu.yml create mode 100644 .github/workflows/windows-arm64-cpu.yml create mode 100644 .github/workflows/windows-x64-cpu-vs2019-python.yml create mode 100644 .github/workflows/windows-x64-cpu.yml create mode 100644 .github/workflows/windows-x86-cpu.yml diff --git a/.github/workflows/android-armv7-cpu.yml b/.github/workflows/android-armv7-cpu.yml new file mode 100644 index 00000000000..e7e0a56e4fe --- /dev/null +++ b/.github/workflows/android-armv7-cpu.yml @@ -0,0 +1,56 @@ +name: android-armv7-cpu +on: + push: + branches: [master] + paths: + - '.github/workflows/android-armv7-cpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/android-armv7-cpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' +concurrency: + group: 
android-armv7-cpu-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + android-armv7: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 .. + cmake --build . -j 2 + - name: build-shared + run: | + mkdir build-shared && cd build-shared + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 -DNCNN_SHARED_LIB=ON -DNCNN_ENABLE_LTO=OFF .. + cmake --build . -j 2 + + - name: ndk-r16b + run: | + wget https://dl.google.com/android/repository/android-ndk-r16b-linux-x86_64.zip -O $GITHUB_WORKSPACE/android-ndk-r16b-linux-x86_64.zip + cd $GITHUB_WORKSPACE && unzip -q android-ndk-r16b-linux-x86_64.zip + - name: build-noneon + run: | + mkdir build-noneon && cd build-noneon + cmake -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/android-ndk-r16b/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=OFF -DANDROID_PLATFORM=android-14 .. + cmake --build . -j 2 + - name: build-noneon-shared + run: | + mkdir build-noneon-shared && cd build-noneon-shared + cmake -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/android-ndk-r16b/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=OFF -DANDROID_PLATFORM=android-14 -DNCNN_SHARED_LIB=ON -DNCNN_ENABLE_LTO=OFF .. + cmake --build . -j 2 diff --git a/.github/workflows/android-armv7-gpu.yml b/.github/workflows/android-armv7-gpu.yml new file mode 100644 index 00000000000..f41ff60e5e8 --- /dev/null +++ b/.github/workflows/android-armv7-gpu.yml @@ -0,0 +1,45 @@ +name: android-armv7-gpu +on: + push: + branches: [master] + paths: + - '.github/workflows/android-armv7-gpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' + - 'src/layer/vulkan/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/android-armv7-gpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' + - 'src/layer/vulkan/**' +concurrency: + group: android-armv7-gpu-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + android-armv7-gpu: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON .. + cmake --build . -j 2 + - name: build-shared + run: | + mkdir build-shared && cd build-shared + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON -DNCNN_ENABLE_LTO=OFF .. + cmake --build . 
-j 2 diff --git a/.github/workflows/android-armv8-cpu.yml b/.github/workflows/android-armv8-cpu.yml new file mode 100644 index 00000000000..b32cbc43b97 --- /dev/null +++ b/.github/workflows/android-armv8-cpu.yml @@ -0,0 +1,41 @@ +name: android-armv8-cpu +on: + push: + branches: [master] + paths: + - '.github/workflows/android-armv8-cpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/android-armv8-cpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' +concurrency: + group: android-armv8-cpu-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + android-aarch64: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 .. + cmake --build . -j 2 + - name: build-shared + run: | + mkdir build-shared && cd build-shared + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 2 diff --git a/.github/workflows/android-armv8-gpu.yml b/.github/workflows/android-armv8-gpu.yml new file mode 100644 index 00000000000..ba54b268a51 --- /dev/null +++ b/.github/workflows/android-armv8-gpu.yml @@ -0,0 +1,75 @@ +name: android-armv8-gpu +on: + push: + branches: [master] + paths: + - '.github/workflows/android-armv8-gpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' + - 'src/layer/vulkan/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/android-armv8-gpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' + - 'src/layer/vulkan/**' +concurrency: + group: android-armv8-gpu-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + android-aarch64-gpu: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON .. + cmake --build . -j 2 + - name: build-shared + run: | + mkdir build-shared && cd build-shared + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 2 + - name: build-termux + run: | + mkdir build-termux && cd build-termux + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_PLATFORM_API=OFF .. + cmake --build . -j 2 + - name: build-android-29 + run: | + mkdir build-android-29 && cd build-android-29 + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-29 -DNCNN_VULKAN=ON .. + cmake --build . 
-j 2 + - name: build-android-29-shared + run: | + mkdir build-android-29-shared && cd build-android-29-shared + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-29 -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 2 + + android-aarch64-gpu-ndk-r16b: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: ndk-r16b + run: | + wget https://dl.google.com/android/repository/android-ndk-r16b-linux-x86_64.zip -O $GITHUB_WORKSPACE/android-ndk-r16b-linux-x86_64.zip + cd $GITHUB_WORKSPACE && unzip -q android-ndk-r16b-linux-x86_64.zip + - name: configure + run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/android-ndk-r16b/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON .. + - name: build + run: cmake --build build -j 2 diff --git a/.github/workflows/android-x64-cpu.yml b/.github/workflows/android-x64-cpu.yml new file mode 100644 index 00000000000..5b8b65b2bab --- /dev/null +++ b/.github/workflows/android-x64-cpu.yml @@ -0,0 +1,41 @@ +name: android-x64-cpu +on: + push: + branches: [master] + paths: + - '.github/workflows/android-x64-cpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/android-x64-cpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' +concurrency: + group: android-x64-cpu-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + android-x86_64: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 .. + cmake --build . -j 2 + - name: build-shared + run: | + mkdir build-shared && cd build-shared + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 2 diff --git a/.github/workflows/android-x64-gpu.yml b/.github/workflows/android-x64-gpu.yml new file mode 100644 index 00000000000..65770bdc8e9 --- /dev/null +++ b/.github/workflows/android-x64-gpu.yml @@ -0,0 +1,45 @@ +name: android-x64-gpu +on: + push: + branches: [master] + paths: + - '.github/workflows/android-x64-gpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'src/layer/vulkan/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/android-x64-gpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'src/layer/vulkan/**' +concurrency: + group: android-x64-gpu-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + android-x86_64-gpu: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON .. + cmake --build . 
-j 2 + - name: build-shared + run: | + mkdir build-shared && cd build-shared + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 2 diff --git a/.github/workflows/android-x86-cpu.yml b/.github/workflows/android-x86-cpu.yml new file mode 100644 index 00000000000..28938d6046f --- /dev/null +++ b/.github/workflows/android-x86-cpu.yml @@ -0,0 +1,41 @@ +name: android-x86-cpu +on: + push: + branches: [master] + paths: + - '.github/workflows/android-x86-cpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/android-x86-cpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' +concurrency: + group: android-x86-cpu-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + android-x86: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 .. + cmake --build . -j 2 + - name: build-shared + run: | + mkdir build-shared && cd build-shared + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 2 diff --git a/.github/workflows/android-x86-gpu.yml b/.github/workflows/android-x86-gpu.yml new file mode 100644 index 00000000000..ec46fa00c1c --- /dev/null +++ b/.github/workflows/android-x86-gpu.yml @@ -0,0 +1,45 @@ +name: android-x86-gpu +on: + push: + branches: [master] + paths: + - '.github/workflows/android-x86-gpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'src/layer/vulkan/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/android-x86-gpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'src/layer/vulkan/**' +concurrency: + group: android-x86-gpu-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + android-x86-gpu: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON .. + cmake --build . -j 2 + - name: build-shared + run: | + mkdir build-shared && cd build-shared + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON .. + cmake --build . 
-j 2 diff --git a/.github/workflows/code-format.yml b/.github/workflows/code-format.yml new file mode 100644 index 00000000000..e65a7996994 --- /dev/null +++ b/.github/workflows/code-format.yml @@ -0,0 +1,61 @@ +name: code-format + +on: [push, pull_request, pull_request_target] + +concurrency: + group: code-format-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + code-format: + permissions: + contents: write # for stefanzweifel/git-auto-commit-action to push code in repo + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + + - name: cache-clang-format + id: cache-clang-format + uses: actions/cache@v3 + with: + path: clang-format-install + key: clang-format-install-4 + - name: clang-format + if: steps.cache-clang-format.outputs.cache-hit != 'true' + run: | + wget https://github.com/llvm/llvm-project/releases/download/llvmorg-10.0.1/llvm-project-10.0.1.tar.xz + tar -xf llvm-project-10.0.1.tar.xz + cd llvm-project-10.0.1 + mkdir build + cd build + cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_TARGETS_TO_BUILD="" -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_DOCS=OFF ../llvm/ + make -j2 clang-format + mkdir $GITHUB_WORKSPACE/clang-format-install + cp -r bin/clang-format $GITHUB_WORKSPACE/clang-format-install + cd ../../ + rm -rf llvm-project-10.0.1 + rm llvm-project-10.0.1.tar.xz + + - name: astyle + run: | + sudo apt-get update + sudo apt-get install -y astyle + + - name: code-format + run: | + mkdir -p ~/bin + mv $GITHUB_WORKSPACE/clang-format-install/clang-format ~/bin/clang-format + rm -rf $GITHUB_WORKSPACE/clang-format-install + export PATH=~/bin:$PATH + sh codeformat.sh + - uses: stefanzweifel/git-auto-commit-action@v4 + with: + commit_message: apply code-format changes + + - name: restore-clang-format-cache + run: | + mkdir $GITHUB_WORKSPACE/clang-format-install + cp -r ~/bin/clang-format $GITHUB_WORKSPACE/clang-format-install diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml new file mode 100644 index 00000000000..fe43e581d32 --- /dev/null +++ b/.github/workflows/codeql-analysis.yml @@ -0,0 +1,84 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +name: "CodeQL" + +on: + push: + branches: [master] + paths-ignore: ['**.md'] + pull_request: + # The branches below must be a subset of the branches above + branches: [master] + paths-ignore: ['**.md'] + schedule: + - cron: '0 20 * * 4' + +concurrency: + group: CodeQL-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + analyze: + permissions: + actions: read # for github/codeql-action/init to get workflow details + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/autobuild to send a status report + name: Analyze + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + # Override automatic language detection by changing the below list + # Supported options are ['csharp', 'cpp', 'go', 'java', 'javascript', 'python'] + language: ['cpp'] + # Learn more... 
+ # https://docs.github.com/en/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#overriding-automatic-language-detection + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + # We must fetch at least the immediate parents so that if this is + # a pull request then we can checkout the head. + fetch-depth: 2 + + # If this run was triggered by a pull request event, then checkout + # the head of the pull request instead of the merge commit. + - run: git checkout HEAD^2 + if: ${{ github.event_name == 'pull_request' }} + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + # queries: ./path/to/local/query, your-org/your-repo/queries@main + + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v2 + + # ℹ️ Command-line programs to run using the OS shell. + # 📚 https://git.io/JvXDl + + # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines + # and modify them (or add more) to build your code if your project + # uses a compiled language + + #- run: | + # make bootstrap + # make release + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 diff --git a/.github/workflows/elf-riscv32-cpu-gcc.yml b/.github/workflows/elf-riscv32-cpu-gcc.yml new file mode 100644 index 00000000000..87a9477449a --- /dev/null +++ b/.github/workflows/elf-riscv32-cpu-gcc.yml @@ -0,0 +1,126 @@ +name: elf-riscv32-cpu-gcc +on: + push: + branches: [master] + paths: + - '.github/workflows/elf-riscv32-cpu-gcc.yml' + - 'toolchains/riscv32-unknown-elf.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/riscv/**' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/elf-riscv32-cpu-gcc.yml' + - 'toolchains/riscv32-unknown-elf.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/riscv/**' + - 'tests/**' +concurrency: + group: elf-riscv32-cpu-gcc-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + newlib-rv32imc-gcc: + runs-on: [self-hosted, linux, centos] + steps: + - uses: actions/checkout@v3 + + #- name: cache-riscv + #id: cache-riscv + #uses: actions/cache@v3 + #with: + #path: rv32imc-install + #key: rv32imc-newlib-install-20210425 + + #- name: install-riscv-build-deps + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #sudo apt-get update + #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc zlib1g-dev libexpat-dev device-tree-compiler + + #- name: checkout-riscv-gnu-toolchain + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #uses: actions/checkout@v3 + #with: + #repository: riscv/riscv-gnu-toolchain + #path: riscv-gnu-toolchain + #ref: b715e4f01b43efef487166f75d5d85d3c33fa7ef + #- name: checkout-riscv-gnu-toolchain-submodules + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-gnu-toolchain 
+ #git submodule update --init --recursive --depth 1 riscv-binutils + #git submodule update --init --recursive --depth 1 riscv-gcc + #git submodule update --init --recursive --depth 1 riscv-glibc + #git submodule update --init --recursive --depth 1 riscv-dejagnu + #git submodule update --init --recursive --depth 1 riscv-newlib + #git submodule update --init --recursive --depth 1 riscv-gdb + #- name: riscv-gnu-toolchain + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-gnu-toolchain + #./configure --prefix=$GITHUB_WORKSPACE/rv32imc-install --with-arch=rv32imc + #make -j2 + + #- name: checkout-riscv-pk + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #uses: actions/checkout@v3 + #with: + #repository: riscv/riscv-pk + #path: riscv-pk + #ref: ef7bebaf9bf24d3e90bcaae96387ce418e136b6d + #- name: riscv-pk + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-pk + #mkdir build + #cd build + #export PATH=$GITHUB_WORKSPACE/rv32imc-install/bin:$PATH + #../configure --prefix=$GITHUB_WORKSPACE/rv32imc-install --with-arch=rv32imc --host=riscv32-unknown-elf + #make -j2 + #make install + + #- name: checkout-riscv-isa-sim + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #uses: actions/checkout@v3 + #with: + #repository: riscv/riscv-isa-sim + #path: riscv-isa-sim + #ref: 9d4f45c2ebf105503974fc80a42590ca1584c354 + #- name: riscv-isa-sim + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-isa-sim + #mkdir build + #cd build + #export PATH=$GITHUB_WORKSPACE/rv32imc-install/bin:$PATH + #../configure --prefix=$GITHUB_WORKSPACE/rv32imc-install --with-isa=rv32imc + #make -j2 + #make install + + #- name: riscv-strip-install + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: find $GITHUB_WORKSPACE/rv32imc-install -type f | xargs -i strip -g {} || true + + - name: configure + run: export RISCV_ROOT_PATH=/data/action/osd/rv32imc-install && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv32-unknown-elf.toolchain.cmake -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
+ - name: build + run: cmake --build build -j 4 + + # too slow for softfloat arch :( + #- name: test + #run: | + #sudo apt-get update + #sudo apt-get install device-tree-compiler + #export PATH=/data/action/osd/rv32imc-install/bin:$PATH + #cd build + #TESTS_EXECUTABLE_LOADER=spike TESTS_EXECUTABLE_LOADER_ARGUMENTS=/data/action/osd/rv32imc-install/riscv32-unknown-elf/bin/pk ctest --output-on-failure -j 2 diff --git a/.github/workflows/elf-riscv64-cpu-gcc.yml b/.github/workflows/elf-riscv64-cpu-gcc.yml new file mode 100644 index 00000000000..fbf2c0227a1 --- /dev/null +++ b/.github/workflows/elf-riscv64-cpu-gcc.yml @@ -0,0 +1,123 @@ +name: elf-riscv64-cpu-gcc +on: + push: + branches: [master] + paths: + - '.github/workflows/elf-riscv64-cpu-gcc.yml' + - 'toolchains/riscv64-unknown-elf.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/riscv/**' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/elf-riscv64-cpu-gcc.yml' + - 'toolchains/riscv64-unknown-elf.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/riscv/**' + - 'tests/**' +concurrency: + group: elf-riscv64-cpu-gcc-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + newlib-rv64gc-gcc: + runs-on: [self-hosted, linux, centos] + steps: + - uses: actions/checkout@v3 + + #- name: cache-riscv + #id: cache-riscv + #uses: actions/cache@v3 + #with: + #path: rv64gc-install + #key: rv64gc-newlib-install-20210425 + + #- name: install-riscv-build-deps + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #sudo apt-get update + #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc zlib1g-dev libexpat-dev device-tree-compiler + + #- name: checkout-riscv-gnu-toolchain + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #uses: actions/checkout@v3 + #with: + #repository: riscv/riscv-gnu-toolchain + #path: riscv-gnu-toolchain + #ref: b715e4f01b43efef487166f75d5d85d3c33fa7ef + #- name: checkout-riscv-gnu-toolchain-submodules + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-gnu-toolchain + #git submodule update --init --recursive --depth 1 riscv-binutils + #git submodule update --init --recursive --depth 1 riscv-gcc + #git submodule update --init --recursive --depth 1 riscv-glibc + #git submodule update --init --recursive --depth 1 riscv-dejagnu + #git submodule update --init --recursive --depth 1 riscv-newlib + #git submodule update --init --recursive --depth 1 riscv-gdb + #- name: riscv-gnu-toolchain + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-gnu-toolchain + #sed -i '/__OBSOLETE_MATH/d' riscv-newlib/newlib/libm/common/math_errf.c + #./configure --prefix=$GITHUB_WORKSPACE/rv64gc-install --with-arch=rv64gc + #make -j2 + + #- name: checkout-riscv-pk + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #uses: actions/checkout@v3 + #with: + #repository: riscv/riscv-pk + #path: riscv-pk + #ref: ef7bebaf9bf24d3e90bcaae96387ce418e136b6d + #- name: riscv-pk + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-pk + #mkdir build + #cd build + #export PATH=$GITHUB_WORKSPACE/rv64gc-install/bin:$PATH + #../configure --prefix=$GITHUB_WORKSPACE/rv64gc-install --with-arch=rv64gc --host=riscv64-unknown-elf + #make -j2 + #make install + + #- name: checkout-riscv-isa-sim + #if: 
steps.cache-riscv.outputs.cache-hit != 'true' + #uses: actions/checkout@v3 + #with: + #repository: riscv/riscv-isa-sim + #path: riscv-isa-sim + #ref: 9d4f45c2ebf105503974fc80a42590ca1584c354 + #- name: riscv-isa-sim + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-isa-sim + #mkdir build + #cd build + #export PATH=$GITHUB_WORKSPACE/rv64gc-install/bin:$PATH + #../configure --prefix=$GITHUB_WORKSPACE/rv64gc-install --with-isa=rv64gc + #make -j2 + #make install + + #- name: riscv-strip-install + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: find $GITHUB_WORKSPACE/rv64gc-install -type f | xargs -i strip -g {} || true + + - name: configure + run: export RISCV_ROOT_PATH=/data/action/osd/rv64gc-install && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-elf.toolchain.cmake -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + - name: build + run: cmake --build build -j 4 + - name: test + run: | + export PATH=/data/action/osd/rv64gc-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=spike TESTS_EXECUTABLE_LOADER_ARGUMENTS=/data/action/osd/rv64gc-install/riscv64-unknown-elf/bin/pk ctest --output-on-failure -j 4 diff --git a/.github/workflows/ios-arm64-gpu.yml b/.github/workflows/ios-arm64-gpu.yml new file mode 100644 index 00000000000..d22136d02b9 --- /dev/null +++ b/.github/workflows/ios-arm64-gpu.yml @@ -0,0 +1,86 @@ +name: ios-arm64-gpu +on: + push: + branches: [master] + paths: + - '.github/workflows/ios-arm64-gpu.yml' + - 'toolchains/ios.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' + - 'src/layer/vulkan/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/ios-arm64-gpu.yml' + - 'toolchains/ios.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' + - 'src/layer/vulkan/**' +concurrency: + group: ios-arm64-gpu-${{ github.ref }} + cancel-in-progress: true +env: + DEVELOPER_DIR: /Applications/Xcode_14.0.1.app/Contents/Developer +permissions: + contents: read + +jobs: + ios-iphone-os-gpu: + runs-on: macos-latest + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: cache-openmp + id: cache-openmp + uses: actions/cache@v3 + with: + path: openmp-install + key: openmp-ios-install-20201213-4 + - name: openmp + if: steps.cache-openmp.outputs.cache-hit != 'true' + run: | + wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz + tar -xf openmp-11.0.0.src.tar.xz + cd openmp-11.0.0.src + sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S + sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S + mkdir -p build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \ + -DIOS_PLATFORM=OS -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="armv7;arm64;arm64e" \ + -DPERL_EXECUTABLE=/usr/local/bin/perl \ + -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. + cmake --build . -j 3 + cmake --build . 
--target install + mkdir $GITHUB_WORKSPACE/openmp-install + cp -r install/* $GITHUB_WORKSPACE/openmp-install + - name: install-openmp + run: | + sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include + sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib + - name: vulkansdk + run: | + wget https://sdk.lunarg.com/sdk/download/1.3.236.0/mac/vulkansdk-macos-1.3.236.0.dmg?Human=true -O vulkansdk-macos-1.3.236.0.dmg + hdiutil attach vulkansdk-macos-1.3.236.0.dmg + sudo /Volumes/vulkansdk-macos-1.3.236.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0 --accept-licenses --default-answer --confirm-command install + hdiutil detach /Volumes/vulkansdk-macos-1.3.236.0 + - name: configure + run: | + export VULKAN_SDK=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/macOS + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=OS64 -DIOS_ARCH="arm64;arm64e" \ + -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \ + -DVulkan_INCLUDE_DIR=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/include \ + -DVulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/dylib/iOS/libMoltenVK.dylib \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. + - name: build + run: cmake --build build -j 3 diff --git a/.github/workflows/ios-cpu.yml b/.github/workflows/ios-cpu.yml new file mode 100644 index 00000000000..515bc256df2 --- /dev/null +++ b/.github/workflows/ios-cpu.yml @@ -0,0 +1,82 @@ +name: ios-cpu +on: + push: + branches: [master] + paths: + - '.github/workflows/ios-cpu.yml' + - 'toolchains/ios.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/ios-cpu.yml' + - 'toolchains/ios.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' +concurrency: + group: ios-cpu-${{ github.ref }} + cancel-in-progress: true +env: + DEVELOPER_DIR: /Applications/Xcode_14.0.1.app/Contents/Developer +permissions: + contents: read + +jobs: + ios-iphone-os: + runs-on: macos-latest + steps: + - uses: actions/checkout@v3 + - name: cache-openmp + id: cache-openmp + uses: actions/cache@v3 + with: + path: openmp-install + key: openmp-ios-install-20201213-4 + - name: openmp + if: steps.cache-openmp.outputs.cache-hit != 'true' + run: | + wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz + tar -xf openmp-11.0.0.src.tar.xz + cd openmp-11.0.0.src + sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S + sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S + mkdir -p build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \ + -DIOS_PLATFORM=OS -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="armv7;arm64;arm64e" \ + -DPERL_EXECUTABLE=/usr/local/bin/perl \ + -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. 
+ cmake --build . -j 3 + cmake --build . --target install + mkdir $GITHUB_WORKSPACE/openmp-install + cp -r install/* $GITHUB_WORKSPACE/openmp-install + - name: install-openmp + run: | + sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include + sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib + - name: build-armv7 + run: | + mkdir build-armv7 && cd build-armv7 + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=OS -DIOS_ARCH="armv7" \ + -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \ + .. + cmake --build . -j 3 + - name: build-arm64 + run: | + mkdir build-arm64 && cd build-arm64 + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=OS -DIOS_ARCH="arm64;arm64e" \ + -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \ + .. + cmake --build . -j 3 diff --git a/.github/workflows/ios-simulator.yml b/.github/workflows/ios-simulator.yml new file mode 100644 index 00000000000..0aa2de45305 --- /dev/null +++ b/.github/workflows/ios-simulator.yml @@ -0,0 +1,73 @@ +name: ios-simulator +on: + push: + branches: [master] + paths: + - '.github/workflows/ios-simulator.yml' + - 'toolchains/ios.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/ios-simulator.yml' + - 'toolchains/ios.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' +concurrency: + group: ios-simulator-${{ github.ref }} + cancel-in-progress: true +env: + DEVELOPER_DIR: /Applications/Xcode_14.0.1.app/Contents/Developer +permissions: + contents: read + +jobs: + ios-iphone-simulator: + runs-on: macos-latest + steps: + - uses: actions/checkout@v3 + - name: cache-openmp + id: cache-openmp + uses: actions/cache@v3 + with: + path: openmp-install + key: openmp-ios-simulator-install-20201213 + - name: openmp + if: steps.cache-openmp.outputs.cache-hit != 'true' + run: | + wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz + tar -xf openmp-11.0.0.src.tar.xz + cd openmp-11.0.0.src + sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S + sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S + mkdir -p build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \ + -DIOS_PLATFORM=SIMULATOR -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="i386;x86_64" \ + -DPERL_EXECUTABLE=/usr/local/bin/perl \ + -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. + cmake --build . -j 3 + cmake --build . 
--target install + mkdir $GITHUB_WORKSPACE/openmp-install + cp -r install/* $GITHUB_WORKSPACE/openmp-install + - name: install-openmp + run: | + sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/include + sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib + - name: configure + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=SIMULATOR -DIOS_ARCH="i386;x86_64" \ + -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib/libomp.a" \ + .. + - name: build + run: cmake --build build -j 3 diff --git a/.github/workflows/linux-aarch64-cpu-gcc.yml b/.github/workflows/linux-aarch64-cpu-gcc.yml new file mode 100644 index 00000000000..4940421d362 --- /dev/null +++ b/.github/workflows/linux-aarch64-cpu-gcc.yml @@ -0,0 +1,193 @@ +name: linux-aarch64-cpu-gcc +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-aarch64-cpu-gcc.yml' + - 'toolchains/aarch64-linux-gnu.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-aarch64-cpu-gcc.yml' + - 'toolchains/aarch64-linux-gnu.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' + - 'tests/**' +concurrency: + group: linux-aarch64-cpu-gcc-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-gcc: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + + - name: cache-qemu + id: cache-qemu + uses: actions/cache@v3 + with: + path: qemu-install + key: qemu-aarch64-install-20220502-2 + - name: install-qemu-build-deps + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build + - name: checkout-qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: qemu/qemu + path: qemu + ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 + - name: qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + cd qemu + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system + make -j2 + make install + + - name: aarch64-gnu-toolchain + run: | + sudo apt-get update + sudo apt-get install g++-aarch64-linux-gnu + + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=OFF -DNCNN_ARM82DOT=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j 2 + - name: test + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2 + + - name: build-noint8 + run: | + mkdir build-noint8 && cd build-noint8 + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=OFF -DNCNN_ARM82DOT=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 2 + - name: test-noint8 + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH + cd build-noint8 + TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2 + + linux-gcc-arm82: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + + - name: cache-qemu + id: cache-qemu + uses: actions/cache@v3 + with: + path: qemu-install + key: qemu-aarch64-install-20220502-2 + - name: install-qemu-build-deps + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build + - name: checkout-qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: qemu/qemu + path: qemu + ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 + - name: qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + cd qemu + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system + make -j2 + make install + + - name: aarch64-gnu-toolchain + run: | + sudo apt-get update + sudo apt-get install g++-aarch64-linux-gnu + + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=ON -DNCNN_ARM82DOT=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 2 + - name: test + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2 + + - name: build-noint8 + run: | + mkdir build-noint8 && cd build-noint8 + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=ON -DNCNN_ARM82DOT=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF .. + cmake --build . 
-j 2 + - name: test-noint8 + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH + cd build-noint8 + TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2 + + linux-gcc-arm86: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + + - name: cache-qemu + id: cache-qemu + uses: actions/cache@v3 + with: + path: qemu-install + key: qemu-aarch64-install-20220502-2 + - name: install-qemu-build-deps + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build + - name: checkout-qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: qemu/qemu + path: qemu + ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 + - name: qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + cd qemu + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system + make -j2 + make install + + - name: aarch64-gnu-toolchain + run: | + sudo apt-get update + sudo apt-get install g++-aarch64-linux-gnu + + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=ON -DNCNN_ARM82DOT=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 2 + - name: test + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2 diff --git a/.github/workflows/linux-arm-cpu-gcc.yml b/.github/workflows/linux-arm-cpu-gcc.yml new file mode 100644 index 00000000000..72c075d4413 --- /dev/null +++ b/.github/workflows/linux-arm-cpu-gcc.yml @@ -0,0 +1,206 @@ +name: linux-arm-cpu-gcc +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-arm-cpu-gcc.yml' + - 'toolchains/arm-linux-gnueabi.toolchain.cmake' + - 'toolchains/arm-linux-gnueabihf.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-arm-cpu-gcc.yml' + - 'toolchains/arm-linux-gnueabi.toolchain.cmake' + - 'toolchains/arm-linux-gnueabihf.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' + - 'tests/**' +concurrency: + group: linux-arm-cpu-gcc-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-gcc-arm: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + + - name: cache-qemu + id: cache-qemu + uses: actions/cache@v3 + with: + path: qemu-install + key: qemu-arm-install-20220502-2 + - name: install-qemu-build-deps + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build + - name: checkout-qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: qemu/qemu + path: qemu + ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 + - name: qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + cd qemu + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=arm-linux-user --disable-system + make -j2 + make install + + - name: arm-gnu-toolchain + run: | + sudo apt-get update + sudo apt-get install g++-arm-linux-gnueabi + + - name: build + run: | + 
mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 2 + - name: test + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j 2 + + - name: build-noint8 + run: | + mkdir build-noint8 && cd build-noint8 + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 2 + - name: test-noint8 + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH + cd build-noint8 + TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j 2 + + linux-gcc-armhf: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + + - name: cache-qemu + id: cache-qemu + uses: actions/cache@v3 + with: + path: qemu-install + key: qemu-arm-install-20220502-2 + - name: install-qemu-build-deps + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build + - name: checkout-qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: qemu/qemu + path: qemu + ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 + - name: qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + cd qemu + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=arm-linux-user --disable-system + make -j2 + make install + + - name: arm-gnu-toolchain + run: | + sudo apt-get update + sudo apt-get install g++-arm-linux-gnueabihf + + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 2 + - name: test + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j 2 + + - name: build-noint8 + run: | + mkdir build-noint8 && cd build-noint8 + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF .. + cmake --build . 
-j 2 + - name: test-noint8 + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH + cd build-noint8 + TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j 2 + + linux-gcc-armhf-vfpv3-d16: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + + - name: cache-qemu + id: cache-qemu + uses: actions/cache@v3 + with: + path: qemu-install + key: qemu-arm-install-20220502-2 + - name: install-qemu-build-deps + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build + - name: checkout-qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: qemu/qemu + path: qemu + ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 + - name: qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + cd qemu + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=arm-linux-user --disable-system + make -j2 + make install + + - name: arm-gnu-toolchain + run: | + sudo apt-get update + sudo apt-get install g++-arm-linux-gnueabihf + + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 2 + - name: test + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j 2 + + - name: build-noint8 + run: | + mkdir build-noint8 && cd build-noint8 + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF .. + cmake --build . 
-j 2 + - name: test-noint8 + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH + cd build-noint8 + TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j 2 diff --git a/.github/workflows/linux-loongarch64-cpu-gcc.yml b/.github/workflows/linux-loongarch64-cpu-gcc.yml new file mode 100644 index 00000000000..fbdbee1dc21 --- /dev/null +++ b/.github/workflows/linux-loongarch64-cpu-gcc.yml @@ -0,0 +1,78 @@ +name: linux-loongarch64-cpu-gcc +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-loongarch64-cpu-gcc.yml' + - 'toolchains/loongarch64-linux-gnu.toolchain.cmake' + - 'toolchains/loongarch64-unknown-linux-gnu.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/loongarch/**' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-loongarch64-cpu-gcc.yml' + - 'toolchains/loongarch64-linux-gnu.toolchain.cmake' + - 'toolchains/loongarch64-unknown-linux-gnu.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/loongarch/**' + - 'tests/**' +concurrency: + group: linux-loongarch64-cpu-gcc-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-gcc-loongarch64: + runs-on: [self-hosted, linux, centos] + + steps: + - uses: actions/checkout@v3 + + - name: loongarch64-toolchain + run: | + wget https://github.com/loongson/build-tools/releases/download/2022.05.29/loongarch64-clfs-5.0-cross-tools-gcc-full.tar.xz + tar -xf loongarch64-clfs-5.0-cross-tools-gcc-full.tar.xz + wget https://github.com/loongson/build-tools/releases/download/2022.05.29/qemu-loongarch64 + chmod +x qemu-loongarch64 + + - name: configure + run: | + export LOONGARCH64_ROOT_PATH=$GITHUB_WORKSPACE/cross-tools + export LD_LIBRARY_PATH=$GITHUB_WORKSPACE/cross-tools/target/usr/lib64:$LD_LIBRARY_PATH + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/loongarch64-unknown-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + + - name: build + run: cmake --build build -j 4 + + - name: test + run: | + export PATH=$GITHUB_WORKSPACE:$PATH + export LD_LIBRARY_PATH=$GITHUB_WORKSPACE/cross-tools/target/usr/lib64:$LD_LIBRARY_PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-loongarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;$GITHUB_WORKSPACE/cross-tools/target/usr" ctest --output-on-failure -j 4 + + linux-gcc-loongarch64-lsx: + runs-on: [self-hosted, linux, centos] + + steps: + - uses: actions/checkout@v3 + + - name: configure + run: | + export LOONGARCH64_ROOT_PATH=/data/action/osd/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.1 + export LD_LIBRARY_PATH=/data/action/osd/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.1/sysroot/usr/lib64:$LD_LIBRARY_PATH + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/loongarch64-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
+ + - name: build + run: cmake --build build -j 4 diff --git a/.github/workflows/linux-mips-cpu-gcc.yml b/.github/workflows/linux-mips-cpu-gcc.yml new file mode 100644 index 00000000000..ba41d4b0293 --- /dev/null +++ b/.github/workflows/linux-mips-cpu-gcc.yml @@ -0,0 +1,126 @@ +name: linux-mips-cpu-gcc +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-mips-cpu-gcc.yml' + - 'toolchains/mipsel-linux-gnu.toolchain.cmake' + - 'toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/mips/**' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-mips-cpu-gcc.yml' + - 'toolchains/mipsel-linux-gnu.toolchain.cmake' + - 'toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/mips/**' + - 'tests/**' +concurrency: + group: linux-mips-cpu-gcc-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-gcc-mipsel: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + + - name: cache-qemu + id: cache-qemu + uses: actions/cache@v3 + with: + path: qemu-install + key: qemu-mipsel-install-20220502-2 + - name: install-qemu-build-deps + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build + - name: checkout-qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: qemu/qemu + path: qemu + ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 + - name: qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + cd qemu + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mipsel-linux-user --disable-system + make -j2 + make install + + - name: mipsel-gnu-toolchain + run: | + sudo apt-get update + sudo apt-get install g++-mipsel-linux-gnu + + - name: configure + run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsel-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
+ - name: build + run: cmake --build build -j 2 + + - name: test + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-mipsel TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsel-linux-gnu" ctest --output-on-failure -j 2 + + linux-gcc-mipsisa32r6el: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + + - name: cache-qemu + id: cache-qemu + uses: actions/cache@v3 + with: + path: qemu-install + key: qemu-mipsel-install-20220502-2 + - name: install-qemu-build-deps + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build + - name: checkout-qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: qemu/qemu + path: qemu + ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 + - name: qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + cd qemu + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mipsel-linux-user --disable-system + make -j2 + make install + + - name: mipsisa32r6el-gnu-toolchain + run: | + sudo apt-get update + sudo apt-get install g++-mipsisa32r6el-linux-gnu + + - name: configure + run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + - name: build + run: cmake --build build -j 2 + + - name: test + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-mipsel TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa32r6el-linux-gnu" ctest --output-on-failure -j 2 diff --git a/.github/workflows/linux-mips64-cpu-gcc.yml b/.github/workflows/linux-mips64-cpu-gcc.yml new file mode 100644 index 00000000000..3d0fe1229ca --- /dev/null +++ b/.github/workflows/linux-mips64-cpu-gcc.yml @@ -0,0 +1,138 @@ +name: linux-mips64-cpu-gcc +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-mips64-cpu-gcc.yml' + - 'toolchains/mips64el-linux-gnuabi64.toolchain.cmake' + - 'toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/mips/**' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-mips64-cpu-gcc.yml' + - 'toolchains/mips64el-linux-gnuabi64.toolchain.cmake' + - 'toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/mips/**' + - 'tests/**' +concurrency: + group: linux-mips64-cpu-gcc-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-gcc-mips64el: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + + - name: cache-qemu + id: cache-qemu + uses: actions/cache@v3 + with: + path: qemu-install + key: qemu-mips64el-install-20220502-2 + - name: install-qemu-build-deps + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build + - name: checkout-qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: qemu/qemu + path: qemu + ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 + - name: qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + cd qemu + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mips64el-linux-user --disable-system + make -j2 + make install + + - name: 
mips64el-gnuabi64-toolchain + run: | + sudo apt-get update + sudo apt-get install g++-mips64el-linux-gnuabi64 + + - name: configure + run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mips64el-linux-gnuabi64.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + - name: build + run: cmake --build build -j 2 + + - name: test + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-mips64el TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mips64el-linux-gnuabi64" ctest --output-on-failure -j 2 + + linux-gcc-mipsisa64r6el: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + + - name: cache-qemu + id: cache-qemu + uses: actions/cache@v3 + with: + path: qemu-install + key: qemu-mips64el-install-20220502-4 + - name: install-qemu-build-deps + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build + - name: checkout-qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: qemu/qemu + path: qemu + ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 + - name: qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + cd qemu + wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0001-target-mips-Fix-SAT_S-trans-helper.patch + wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0002-target-mips-Fix-df_extract_val-and-df_extract_df-dfe.patch + wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0003-target-mips-Fix-msa-checking-condition-in-trans_msa_.patch + wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0004-target-mips-Do-not-treat-msa-INSERT-as-NOP-when-wd-i.patch + wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0005-target-mips-Fix-FTRUNC_S-and-FTRUNC_U-trans-helper.patch + wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0006-target-mips-Fix-store-adress-of-high-64bit-in-helper.patch + patch -p1 -i 0001-target-mips-Fix-SAT_S-trans-helper.patch + patch -p1 -i 0002-target-mips-Fix-df_extract_val-and-df_extract_df-dfe.patch + patch -p1 -i 0003-target-mips-Fix-msa-checking-condition-in-trans_msa_.patch + patch -p1 -i 0004-target-mips-Do-not-treat-msa-INSERT-as-NOP-when-wd-i.patch + patch -p1 -i 0005-target-mips-Fix-FTRUNC_S-and-FTRUNC_U-trans-helper.patch + patch -p1 -i 0006-target-mips-Fix-store-adress-of-high-64bit-in-helper.patch + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mips64el-linux-user --disable-system + make -j2 + make install + + - name: mipsisa64r6el-gnuabi64-toolchain + run: | + sudo apt-get update + sudo apt-get install g++-mipsisa64r6el-linux-gnuabi64 + + - name: configure + run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
+ - name: build + run: cmake --build build -j 2 + + - name: test + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-mips64el TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa64r6el-linux-gnuabi64" ctest --output-on-failure -j 2 diff --git a/.github/workflows/linux-ppc64-cpu-gcc.yml b/.github/workflows/linux-ppc64-cpu-gcc.yml new file mode 100644 index 00000000000..a4ae93a6172 --- /dev/null +++ b/.github/workflows/linux-ppc64-cpu-gcc.yml @@ -0,0 +1,75 @@ +name: linux-ppc64-cpu-gcc +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-ppc64-cpu-gcc.yml' + - 'toolchains/powerpc64le-linux-gnu.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-ppc64-cpu-gcc.yml' + - 'toolchains/powerpc64le-linux-gnu.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'tests/**' +concurrency: + group: linux-ppc64-cpu-gcc-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-gcc-ppc64le: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + + - name: cache-qemu + id: cache-qemu + uses: actions/cache@v3 + with: + path: qemu-install + key: qemu-ppc64le-install-20220502-2 + - name: install-qemu-build-deps + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build + - name: checkout-qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: qemu/qemu + path: qemu + ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 + - name: qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + cd qemu + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=ppc64le-linux-user --disable-system + make -j2 + make install + + - name: powerpc64le-gnu-toolchain + run: | + sudo apt-get update + sudo apt-get install g++-powerpc64le-linux-gnu + + - name: configure + run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/powerpc64le-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
+ - name: build + run: cmake --build build -j 2 + + - name: test + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu" ctest --output-on-failure -j 2 diff --git a/.github/workflows/linux-riscv64-cpu-gcc.yml b/.github/workflows/linux-riscv64-cpu-gcc.yml new file mode 100644 index 00000000000..9c5393e4f49 --- /dev/null +++ b/.github/workflows/linux-riscv64-cpu-gcc.yml @@ -0,0 +1,186 @@ +name: linux-riscv64-cpu-gcc +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-riscv64-cpu-gcc.yml' + - 'toolchains/riscv64-linux-gnu.toolchain.cmake' + - 'toolchains/riscv64-unknown-linux-gnu.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/riscv/**' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-riscv64-cpu-gcc.yml' + - 'toolchains/riscv64-linux-gnu.toolchain.cmake' + - 'toolchains/riscv64-unknown-linux-gnu.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/riscv/**' + - 'tests/**' +concurrency: + group: linux-riscv64-cpu-gcc-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-gcc-riscv64: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + + - name: cache-qemu + id: cache-qemu + uses: actions/cache@v3 + with: + path: qemu-install + key: qemu-riscv64-install-20220502-4 + - name: install-qemu-build-deps + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build + - name: checkout-qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: qemu/qemu + path: qemu + ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 + - name: qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + cd qemu + wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system + make -j2 + make install + + - name: riscv64-gnu-toolchain + run: | + sudo apt-get update + sudo apt-get install g++-riscv64-linux-gnu + + - name: configure + run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + - name: build + run: cmake --build build -j 2 + + - name: test + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/riscv64-linux-gnu" ctest --output-on-failure -j 2 + + linux-gcc-riscv64-c906: + runs-on: [self-hosted, linux, centos] + steps: + - uses: actions/checkout@v3 + + - name: configure + run: | + export RISCV_ROOT_PATH=/data/action/osd/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1 + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/c906-v226.toolchain.cmake -DCMAKE_BUILD_TYPE=release -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=ON -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON .. 
+ - name: build + run: cmake --build build -j 4 + + linux-gcc-riscv64-rvv: + runs-on: [self-hosted, linux, centos] + steps: + - uses: actions/checkout@v3 + + #- name: cache-qemu + #id: cache-qemu + #uses: actions/cache@v3 + #with: + #path: qemu-install + #key: qemu-riscv64-install-20220502-3 + #- name: install-qemu-build-deps + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #run: | + #sudo apt-get update + #sudo apt-get install autoconf automake autotools-dev ninja-build + #- name: checkout-qemu + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #uses: actions/checkout@v3 + #with: + #repository: qemu/qemu + #path: qemu + #ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 + #- name: qemu + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #run: | + #cd qemu + #wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + #patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + #./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system + #make -j2 + #make install + + #- name: cache-riscv + #id: cache-riscv + #uses: actions/cache@v3 + #with: + #path: rv64gcv-install-next + #key: rv64gcv-linux-install-20210504 + + #- name: install-riscv-build-deps + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #sudo apt-get update + #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc zlib1g-dev libexpat-dev device-tree-compiler + + #- name: checkout-riscv-gnu-toolchain + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #uses: actions/checkout@v3 + #with: + #repository: riscv/riscv-gnu-toolchain + #path: riscv-gnu-toolchain + #ref: da01ba455ce3802ffa84fdca3a089079996dbfc3 + #- name: checkout-riscv-gnu-toolchain-submodules + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-gnu-toolchain + #git submodule update --init --recursive --depth 1 glibc + #git submodule update --init --recursive --depth 1 newlib + #git submodule update --init --recursive --depth 1 riscv-binutils + #git submodule update --init --recursive --depth 1 riscv-gcc + #git submodule update --init --recursive --depth 1 riscv-dejagnu + #git submodule update --init --recursive --depth 1 riscv-gdb + #- name: riscv-gnu-toolchain + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-gnu-toolchain + #sed -i '/__OBSOLETE_MATH/d' newlib/newlib/libm/common/math_errf.c + #./configure --prefix=$GITHUB_WORKSPACE/rv64gcv-install-next --with-arch=rv64gcv_zfh + #make linux + + #- name: riscv-strip-install + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: find $GITHUB_WORKSPACE/rv64gcv-install-next -type f | xargs -i strip -g {} || true + + - name: configure + run: export RISCV_ROOT_PATH=/data/action/osd/rv64gcv-install-next && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
+ - name: build + run: cmake --build build -j 4 + + - name: test-vlen256 + run: | + export PATH=/data/action/osd/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 + + - name: test-vlen128 + run: | + export PATH=/data/action/osd/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 diff --git a/.github/workflows/linux-riscv64-cpu-gnu-clang.yml b/.github/workflows/linux-riscv64-cpu-gnu-clang.yml new file mode 100644 index 00000000000..18ad114efa4 --- /dev/null +++ b/.github/workflows/linux-riscv64-cpu-gnu-clang.yml @@ -0,0 +1,142 @@ +name: linux-riscv64-cpu-gnu-clang +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-riscv64-cpu-gnu-clang.yml' + - 'toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/riscv/**' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-riscv64-cpu-gnu-clang.yml' + - 'toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/riscv/**' + - 'tests/**' +concurrency: + group: linux-riscv64-cpu-gnu-clang-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-gcc-riscv64-rvv: + runs-on: [self-hosted, linux, centos] + steps: + - uses: actions/checkout@v3 + + #- name: cache-qemu + #id: cache-qemu + #uses: actions/cache@v3 + #with: + #path: qemu-install + #key: qemu-riscv64-install-20220502-3 + #- name: install-qemu-build-deps + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #run: | + #sudo apt-get update + #sudo apt-get install autoconf automake autotools-dev ninja-build + #- name: checkout-qemu + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #uses: actions/checkout@v3 + #with: + #repository: qemu/qemu + #path: qemu + #ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 + #- name: qemu + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #run: | + #cd qemu + #wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + #patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + #./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system + #make -j2 + #make install + + #- name: cache-riscv + #id: cache-riscv + #uses: actions/cache@v3 + #with: + #path: rv64gcv-install-next + #key: rv64gcv-linux-install-20210504 + + #- name: install-riscv-build-deps + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #sudo apt-get update + #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc zlib1g-dev libexpat-dev device-tree-compiler + + #- name: checkout-riscv-gnu-toolchain + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #uses: actions/checkout@v3 + #with: + #repository: riscv/riscv-gnu-toolchain + #path: riscv-gnu-toolchain + #ref: da01ba455ce3802ffa84fdca3a089079996dbfc3 + #- name: checkout-riscv-gnu-toolchain-submodules + #if: steps.cache-riscv.outputs.cache-hit != 
'true' + #run: | + #cd riscv-gnu-toolchain + #git submodule update --init --recursive --depth 1 glibc + #git submodule update --init --recursive --depth 1 newlib + #git submodule update --init --recursive --depth 1 riscv-binutils + #git submodule update --init --recursive --depth 1 riscv-gcc + #git submodule update --init --recursive --depth 1 riscv-dejagnu + #git submodule update --init --recursive --depth 1 riscv-gdb + #- name: riscv-gnu-toolchain + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-gnu-toolchain + #sed -i '/__OBSOLETE_MATH/d' newlib/newlib/libm/common/math_errf.c + #./configure --prefix=$GITHUB_WORKSPACE/rv64gcv-install-next --with-arch=rv64gcv_zfh + #make linux + + #- name: riscv-strip-install + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: find $GITHUB_WORKSPACE/rv64gcv-install-next -type f | xargs -i strip -g {} || true + + # - name: install-clang + # run: | + # wget https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.1/llvm-project-15.0.1.src.tar.xz + # tar -xf llvm-project-15.0.1.src.tar.xz + # cd llvm-project-15.0.1.src + # mkdir build + # cd build + # cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_TARGETS_TO_BUILD="RISCV" -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF ../llvm/ + # make -j16 + # make install + + - name: build + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + export RISCV_ROOT_PATH=/data/action/osd/rv64gcv-install-next + export PATH=/data/action/osd/llvm-project-15.0.1.src/build/install/bin:$PATH + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j 4 + + - name: test-vlen256 + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + export PATH=/data/action/osd/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 + + - name: test-vlen128 + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + export PATH=/data/action/osd/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 diff --git a/.github/workflows/linux-x64-cpu-clang-python.yml b/.github/workflows/linux-x64-cpu-clang-python.yml new file mode 100644 index 00000000000..91292d2113d --- /dev/null +++ b/.github/workflows/linux-x64-cpu-clang-python.yml @@ -0,0 +1,68 @@ +name: linux-x64-cpu-clang-python +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-x64-cpu-clang-python.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'python/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-x64-cpu-clang-python.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'python/**' +concurrency: + group: linux-x64-cpu-clang-python-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-clang-python: + runs-on: ubuntu-20.04 + strategy: + matrix: + python-version: [3.6, 3.7, 3.8, 3.9] + + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: set up python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest setuptools wheel twine + - name: configure + env: + CC: clang + CXX: clang++ + run: mkdir build && cd build && cmake -DNCNN_PYTHON=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + - name: build + run: cmake --build build -j 2 + - name: install python + run: cd python && pip install . 
+ - name: test + run: cd python && pytest tests + - name: build and publish + if: startsWith(github.ref, 'refs/tags') + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.TEST_PYPI_API_TOKEN }} + TWINE_REPOSITORY_URL: "https://test.pypi.org/legacy/" + run: | + cd python + python setup.py bdist_wheel + twine upload dist/* diff --git a/.github/workflows/linux-x64-cpu-clang.yml b/.github/workflows/linux-x64-cpu-clang.yml new file mode 100644 index 00000000000..82e17655994 --- /dev/null +++ b/.github/workflows/linux-x64-cpu-clang.yml @@ -0,0 +1,128 @@ +name: linux-x64-cpu-clang +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-x64-cpu-clang.yml' + - 'toolchains/host-c.clang.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-x64-cpu-clang.yml' + - 'toolchains/host-c.clang.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' +concurrency: + group: linux-x64-cpu-clang-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-clang: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: update + run: sudo apt-get update + - name: protobuf + run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev + - name: build-sse2 + env: + CC: clang + CXX: clang++ + run: | + mkdir build-sse2 && cd build-sse2 + cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 2 + - name: test-sse2 + run: cd build-sse2 && ctest --output-on-failure -j 2 + - name: build-shared + env: + CC: clang + CXX: clang++ + run: | + mkdir build-shared && cd build-shared + cmake -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 2 + - name: build-avx2 + env: + CC: clang + CXX: clang++ + run: | + mkdir build-avx2 && cd build-avx2 + cmake -DNCNN_AVX2=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 2 + - name: test-avx2 + run: cd build-avx2 && ctest --output-on-failure -j 2 + - name: build-avx + env: + CC: clang + CXX: clang++ + run: | + mkdir build-avx && cd build-avx + cmake -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 2 + - name: test-avx + run: cd build-avx && ctest --output-on-failure -j 2 + - name: build-avx1-2 + env: + CC: clang + CXX: clang++ + run: | + mkdir build-avx1-2 && cd build-avx1-2 + cmake -DNCNN_AVX2=ON -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 2 + - name: test-avx1-2 + run: cd build-avx1-2 && ctest --output-on-failure -j 2 + - name: build-noint8 + env: + CC: clang + CXX: clang++ + run: | + mkdir build-noint8 && cd build-noint8 + cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 2 + - name: test-noint8 + run: cd build-noint8 && ctest --output-on-failure -j 2 + + linux-clang-simplestl: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: build-simplestl + env: + CC: clang + CXX: clang++ + run: | + mkdir build-simplestl && cd build-simplestl + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.clang.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . 
-j 2 + - name: test-simplestl + run: cd build-simplestl && ctest --output-on-failure -j 2 + - name: build-simplestl-simpleomp + env: + CC: clang + CXX: clang++ + run: | + mkdir build-simplestl-simpleomp && cd build-simplestl-simpleomp + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.clang.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j 2 + - name: test-simplestl-simpleomp + run: cd build-simplestl-simpleomp && ctest --output-on-failure -j 2 diff --git a/.github/workflows/linux-x64-cpu-gcc-musl.yml b/.github/workflows/linux-x64-cpu-gcc-musl.yml new file mode 100644 index 00000000000..208ffc4525f --- /dev/null +++ b/.github/workflows/linux-x64-cpu-gcc-musl.yml @@ -0,0 +1,67 @@ +name: linux-x64-cpu-gcc-musl +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-x64-cpu-gcc-musl.yml' + - 'toolchains/host-c.gcc.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-x64-cpu-gcc-musl.yml' + - 'toolchains/host-c.gcc.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' +concurrency: + group: linux-x64-cpu-gcc-musl-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-gcc-musl: + runs-on: ubuntu-latest + steps: + - uses: jirutka/setup-alpine@v1 + with: + packages: > + cmake + clang + clang-dev + make + gcc + g++ + libc-dev + linux-headers + + - uses: actions/checkout@v3 + - name: build + shell: alpine.sh {0} + run: | + mkdir build && cd build + cmake -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 2 + - name: test + shell: alpine.sh {0} + run: cd build && ctest --output-on-failure -j 2 + - name: build-shared + run: | + mkdir build-shared && cd build-shared + cmake -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 2 diff --git a/.github/workflows/linux-x64-cpu-gcc-san.yml b/.github/workflows/linux-x64-cpu-gcc-san.yml new file mode 100644 index 00000000000..ae57b37bc54 --- /dev/null +++ b/.github/workflows/linux-x64-cpu-gcc-san.yml @@ -0,0 +1,42 @@ +name: linux-x64-cpu-gcc-san +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-x64-cpu-gcc-san.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-x64-cpu-gcc-san.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' +concurrency: + group: linux-x64-cpu-gcc-san-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-gcc-san: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_ASAN=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . 
-j 2 + - name: test + run: | + cd build + ctest --output-on-failure -j 2 diff --git a/.github/workflows/linux-x64-cpu-gcc-sde.yml b/.github/workflows/linux-x64-cpu-gcc-sde.yml new file mode 100644 index 00000000000..51eb6861553 --- /dev/null +++ b/.github/workflows/linux-x64-cpu-gcc-sde.yml @@ -0,0 +1,57 @@ +name: linux-x64-cpu-gcc-sde +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-x64-cpu-gcc-sde.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-x64-cpu-gcc-sde.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' +concurrency: + group: linux-x64-cpu-gcc-sde-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-gcc-sde: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + - name: update + run: sudo apt-get update + - name: gcc12 + run: sudo apt-get install gcc-12 g++-12 + - name: Setup SDE binaries + uses: petarpetrovt/setup-sde@v2 + - name: build-avx512-spr + env: + CC: gcc-12 + CXX: g++-12 + run: | + mkdir build-avx512-spr && cd build-avx512-spr + cmake -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 2 + - name: test-avx512-spr + run: | + cd build-avx512-spr + TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-spr;--" ctest --output-on-failure -j 2 diff --git a/.github/workflows/linux-x64-cpu-gcc.yml b/.github/workflows/linux-x64-cpu-gcc.yml new file mode 100644 index 00000000000..3b6b305e125 --- /dev/null +++ b/.github/workflows/linux-x64-cpu-gcc.yml @@ -0,0 +1,134 @@ +name: linux-x64-cpu-gcc +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-x64-cpu-gcc.yml' + - 'toolchains/host-c.gcc.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-x64-cpu-gcc.yml' + - 'toolchains/host-c.gcc.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' +concurrency: + group: linux-x64-cpu-gcc-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-gcc: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + - name: update + run: sudo apt-get update + - name: protobuf + run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev + - name: build-sse2 + run: | + mkdir build-sse2 && cd build-sse2 + cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 2 + - name: test-sse2 + run: cd build-sse2 && ctest --output-on-failure -j 2 + - name: build-shared + run: | + mkdir build-shared && cd build-shared + cmake -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 2 + - name: build-avx2 + run: | + mkdir build-avx2 && cd build-avx2 + cmake -DNCNN_AVX2=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 2 + - name: test-avx2 + run: cd build-avx2 && ctest --output-on-failure -j 2 + - name: build-avx + run: | + mkdir build-avx && cd build-avx + cmake -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j 2 + - name: test-avx + run: cd build-avx && ctest --output-on-failure -j 2 + - name: build-avx1-2 + run: | + mkdir build-avx1-2 && cd build-avx1-2 + cmake -DNCNN_AVX2=ON -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 2 + - name: test-avx1-2 + run: cd build-avx1-2 && ctest --output-on-failure -j 2 + - name: build-noint8 + run: | + mkdir build-noint8 && cd build-noint8 + cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 2 + - name: test-noint8 + run: cd build-noint8 && ctest --output-on-failure -j 2 + + linux-gcc-cpp03-nostdio-nostring-simplestl: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + - name: build-nostdio + run: | + mkdir build-nostdio && cd build-nostdio + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-c++03.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j 2 + - name: test-nostdio + run: cd build-nostdio && ctest --output-on-failure -j 2 + - name: build-nostdio-nostring + run: | + mkdir build-nostdio-nostring && cd build-nostdio-nostring + cmake -DNCNN_STDIO=OFF -DNCNN_STRING=OFF -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j 2 + - name: build-simplestl + run: | + mkdir build-simplestl && cd build-simplestl + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j 2 + - name: test-simplestl + run: cd build-simplestl && ctest --output-on-failure -j 2 + - name: build-simplestl-simpleomp + run: | + mkdir build-simplestl-simpleomp && cd build-simplestl-simpleomp + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j 2 + - name: test-simplestl-simpleomp + run: cd build-simplestl-simpleomp && ctest --output-on-failure -j 2 + + linux-gcc-avx512: + runs-on: [self-hosted, linux, t4] + steps: + - uses: actions/checkout@v3 + - name: build + env: + CC: gcc + CXX: g++ + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + mkdir build && cd build + cmake -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . 
-j 4 + - name: test + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: cd build && ctest --output-on-failure -j 4 diff --git a/.github/workflows/linux-x64-gpu-clang-python.yml b/.github/workflows/linux-x64-gpu-clang-python.yml new file mode 100644 index 00000000000..710fa101119 --- /dev/null +++ b/.github/workflows/linux-x64-gpu-clang-python.yml @@ -0,0 +1,113 @@ +name: linux-x64-gpu-clang-python +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-x64-gpu-clang-python.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'src/layer/vulkan/**' + - 'python/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-x64-gpu-clang-python.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'src/layer/vulkan/**' + - 'python/**' +concurrency: + group: linux-x64-gpu-clang-python-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-clang-gpu: + runs-on: ubuntu-20.04 + strategy: + matrix: + python-version: [3.6, 3.8] + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: cache-vulkansdk + id: cache-vulkansdk + uses: actions/cache@v3 + with: + path: "1.3.236.0" + key: vulkansdk-linux-x86_64-1.3.236.0 + - name: vulkansdk + if: steps.cache-vulkansdk.outputs.cache-hit != 'true' + run: | + wget https://sdk.lunarg.com/sdk/download/1.3.236.0/linux/vulkansdk-linux-x86_64-1.3.236.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.3.236.0.tar.gz + tar -xf vulkansdk-linux-x86_64-1.3.236.0.tar.gz + rm -rf 1.3.236.0/source 1.3.236.0/samples + find 1.3.236.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm + - name: cache-swiftshader + id: cache-swiftshader + uses: actions/cache@v3 + with: + path: swiftshader-install + key: swiftshader-linux-install-20230420 + - name: checkout-swiftshader + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: google/swiftshader + path: swiftshader + ref: dd55e592406dc0bae219df11adec6363840aff4a + - name: checkout-swiftshader-submodules + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + run: | + cd swiftshader + git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive + - name: swiftshader + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + run: | + cd swiftshader + mkdir -p build; cd build + cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. + cmake --build . -j 2 + mkdir $GITHUB_WORKSPACE/swiftshader-install + cp Linux/* $GITHUB_WORKSPACE/swiftshader-install + - name: set up python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest setuptools wheel twine + - name: build + env: + CC: clang + CXX: clang++ + run: | + export VULKAN_SDK=`pwd`/1.3.236.0/x86_64 + mkdir build && cd build + cmake -DNCNN_VULKAN=ON -DNCNN_PYTHON=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j 2 + - name: install python + run: cd python && pip install . 
+ - name: test + run: | + export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" + cd python && pytest tests + - name: build and publish + if: startsWith(github.ref, 'refs/tags') + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.TEST_PYPI_API_TOKEN }} + TWINE_REPOSITORY_URL: "https://test.pypi.org/legacy/" + run: | + cd python + python setup.py bdist_wheel + twine upload dist/* diff --git a/.github/workflows/linux-x64-gpu-clang.yml b/.github/workflows/linux-x64-gpu-clang.yml new file mode 100644 index 00000000000..14671e4337f --- /dev/null +++ b/.github/workflows/linux-x64-gpu-clang.yml @@ -0,0 +1,91 @@ +name: linux-x64-gpu-clang +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-x64-gpu-clang.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'src/layer/vulkan/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-x64-gpu-clang.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'src/layer/vulkan/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' +concurrency: + group: linux-x64-gpu-clang-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-clang-gpu: + runs-on: [self-hosted, linux, cvm] + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: cache-swiftshader + id: cache-swiftshader + uses: actions/cache@v3 + with: + path: swiftshader-install + key: swiftshader-linux-install-20230420 + - name: checkout-swiftshader + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: google/swiftshader + path: swiftshader + ref: dd55e592406dc0bae219df11adec6363840aff4a + - name: checkout-swiftshader-submodules + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + run: | + cd swiftshader + git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive + - name: swiftshader + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + run: | + cd swiftshader + mkdir -p build; cd build + cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. + cmake --build . -j 4 + mkdir $GITHUB_WORKSPACE/swiftshader-install + cp Linux/* $GITHUB_WORKSPACE/swiftshader-install + - name: build + env: + CC: clang + CXX: clang++ + run: | + mkdir build && cd build + cmake -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 4 + - name: test + run: | + printf "[Processor]\nThreadCount=1\n" > build/tests/SwiftShader.ini + export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" + cd build && ctest --output-on-failure -j 4 + - name: build-shared + env: + CC: clang + CXX: clang++ + run: | + mkdir build-shared && cd build-shared + cmake -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON .. + cmake --build . 
-j 4 diff --git a/.github/workflows/linux-x64-gpu-gcc.yml b/.github/workflows/linux-x64-gpu-gcc.yml new file mode 100644 index 00000000000..f9d0cdc9a88 --- /dev/null +++ b/.github/workflows/linux-x64-gpu-gcc.yml @@ -0,0 +1,128 @@ +name: linux-x64-gpu-gcc +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-x64-gpu-gcc.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'src/layer/vulkan/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-x64-gpu-gcc.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'src/layer/vulkan/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' +concurrency: + group: linux-x64-gpu-gcc-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-gcc-gpu: + runs-on: [self-hosted, linux, cvm] + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: cache-swiftshader + id: cache-swiftshader + uses: actions/cache@v3 + with: + path: swiftshader-install + key: swiftshader-linux-install-20230420 + - name: checkout-swiftshader + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: google/swiftshader + path: swiftshader + ref: dd55e592406dc0bae219df11adec6363840aff4a + - name: checkout-swiftshader-submodules + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + run: | + cd swiftshader + git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive + - name: swiftshader + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + run: | + cd swiftshader + mkdir -p build; cd build + cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. + cmake --build . -j 4 + mkdir $GITHUB_WORKSPACE/swiftshader-install + cp Linux/* $GITHUB_WORKSPACE/swiftshader-install + - name: build + run: | + mkdir build && cd build + cmake -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 4 + - name: test + run: | + printf "[Processor]\nThreadCount=1\n" > build/tests/SwiftShader.ini + export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" + cd build && ctest --output-on-failure -j 4 + - name: build-shared + run: | + mkdir build-shared && cd build-shared + cmake -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 4 + + linux-gcc-gpu-system-glslang: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + + - name: install-deps + run: | + sudo apt-get update + sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev libvulkan-dev glslang-dev spirv-tools + + - name: build + run: | + mkdir build && cd build + cmake -DNCNN_VULKAN=ON -DNCNN_SYSTEM_GLSLANG=ON -DGLSLANG_TARGET_DIR=/usr/lib/x86_64-linux-gnu/cmake .. + cmake --build . -j 4 + - name: build-shared + run: | + mkdir build-shared && cd build-shared + cmake -DNCNN_VULKAN=ON -DNCNN_SYSTEM_GLSLANG=ON -DGLSLANG_TARGET_DIR=/usr/lib/x86_64-linux-gnu/cmake -DNCNN_SHARED_LIB=ON .. + cmake --build . 
-j 4 + + linux-gcc-gpu-t4: + runs-on: [self-hosted, linux, t4] + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: build + env: + CC: gcc + CXX: g++ + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + export VULKAN_SDK=/data/action/osd/1.2.189.0/x86_64 + mkdir build && cd build + cmake -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j 4 + - name: test + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + cd build && ctest --output-on-failure -j 4 diff --git a/.github/workflows/linux-x86-cpu-clang.yml b/.github/workflows/linux-x86-cpu-clang.yml new file mode 100644 index 00000000000..92544f4e474 --- /dev/null +++ b/.github/workflows/linux-x86-cpu-clang.yml @@ -0,0 +1,67 @@ +name: linux-x86-cpu-clang +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-x86-cpu-clang.yml' + - 'toolchains/host.clang-m32.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-x86-cpu-clang.yml' + - 'toolchains/host.clang-m32.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' +concurrency: + group: linux-x86-cpu-clang-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-clang: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: update + run: sudo apt-get update + - name: gcc-multilib + run: sudo apt-get install gcc-multilib g++-multilib + - name: build + env: + CC: clang + CXX: clang++ + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j 2 + - name: test + run: cd build && ctest --output-on-failure -j 2 + - name: build-shared + env: + CC: clang + CXX: clang++ + run: | + mkdir build-shared && cd build-shared + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 2 + - name: build-noint8 + env: + CC: clang + CXX: clang++ + run: | + mkdir build-noint8 && cd build-noint8 + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_INT8=OFF .. + cmake --build . 
-j 2 + - name: test-noint8 + run: cd build-noint8 && ctest --output-on-failure -j 2 diff --git a/.github/workflows/linux-x86-cpu-gcc.yml b/.github/workflows/linux-x86-cpu-gcc.yml new file mode 100644 index 00000000000..c6385f0b011 --- /dev/null +++ b/.github/workflows/linux-x86-cpu-gcc.yml @@ -0,0 +1,65 @@ +name: linux-x86-cpu-gcc +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-x86-cpu-gcc.yml' + - 'toolchains/host.gcc-m32.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-x86-cpu-gcc.yml' + - 'toolchains/host.gcc-m32.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' +concurrency: + group: linux-x86-cpu-gcc-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-gcc: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: update + run: sudo apt-get update + - name: gcc-multilib + run: sudo apt-get install gcc-multilib g++-multilib + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j 2 + - name: test + run: cd build && ctest --output-on-failure -j 2 + - name: build-nosse + run: | + mkdir build-nosse && cd build-nosse + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j 2 + - name: test-nosse + run: cd build-nosse && ctest --output-on-failure -j 2 + - name: build-shared + run: | + mkdir build-shared && cd build-shared + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 2 + - name: build-noint8 + run: | + mkdir build-noint8 && cd build-noint8 + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_INT8=OFF .. + cmake --build . 
-j 2 + - name: test-noint8 + run: cd build-noint8 && ctest --output-on-failure -j 2 diff --git a/.github/workflows/macos-arm64-cpu.yml b/.github/workflows/macos-arm64-cpu.yml new file mode 100644 index 00000000000..df85f05353f --- /dev/null +++ b/.github/workflows/macos-arm64-cpu.yml @@ -0,0 +1,76 @@ +name: macos-arm64-cpu +on: + push: + branches: [master] + paths: + - '.github/workflows/macos-arm64-cpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/macos-arm64-cpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' +concurrency: + group: macos-arm64-cpu-${{ github.ref }} + cancel-in-progress: true +env: + DEVELOPER_DIR: /Applications/Xcode_14.0.1.app/Contents/Developer +permissions: + contents: read + +jobs: + macos-clang: + runs-on: macos-latest + steps: + - uses: actions/checkout@v3 + - name: cache-openmp + id: cache-openmp + uses: actions/cache@v3 + with: + path: openmp-install + key: openmp-macos-install-20201213 + - name: openmp + if: steps.cache-openmp.outputs.cache-hit != 'true' + run: | + wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz + tar -xf openmp-11.0.0.src.tar.xz + cd openmp-11.0.0.src + sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S + sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S + mkdir -p build && cd build + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DCMAKE_OSX_ARCHITECTURES="x86_64;arm64" \ + -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. + cmake --build . -j 3 + cmake --build . --target install + mkdir $GITHUB_WORKSPACE/openmp-install + cp -r install/* $GITHUB_WORKSPACE/openmp-install + - name: install-openmp + run: | + sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include + sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_CROSSCOMPILING=ON -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j 3 + - name: build-shared + run: | + mkdir build-shared && cd build-shared + cmake -DCMAKE_CROSSCOMPILING=ON -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . 
-j 3 diff --git a/.github/workflows/macos-arm64-gpu.yml b/.github/workflows/macos-arm64-gpu.yml new file mode 100644 index 00000000000..eac0da0bb73 --- /dev/null +++ b/.github/workflows/macos-arm64-gpu.yml @@ -0,0 +1,92 @@ +name: macos-arm64-gpu +on: + push: + branches: [master] + paths: + - '.github/workflows/macos-arm64-gpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' + - 'src/layer/vulkan/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/macos-arm64-gpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' + - 'src/layer/vulkan/**' +concurrency: + group: macos-arm64-gpu-${{ github.ref }} + cancel-in-progress: true +env: + DEVELOPER_DIR: /Applications/Xcode_14.0.1.app/Contents/Developer +permissions: + contents: read + +jobs: + macos-clang-gpu: + runs-on: macos-latest + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: cache-openmp + id: cache-openmp + uses: actions/cache@v3 + with: + path: openmp-install + key: openmp-macos-install-20201213 + - name: openmp + if: steps.cache-openmp.outputs.cache-hit != 'true' + run: | + wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz + tar -xf openmp-11.0.0.src.tar.xz + cd openmp-11.0.0.src + sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S + sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S + mkdir -p build && cd build + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DCMAKE_OSX_ARCHITECTURES="x86_64;arm64" \ + -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. + cmake --build . -j 3 + cmake --build . --target install + mkdir $GITHUB_WORKSPACE/openmp-install + cp -r install/* $GITHUB_WORKSPACE/openmp-install + - name: install-openmp + run: | + sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include + sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib + - name: vulkansdk + run: | + wget https://sdk.lunarg.com/sdk/download/1.3.236.0/mac/vulkansdk-macos-1.3.236.0.dmg?Human=true -O vulkansdk-macos-1.3.236.0.dmg + hdiutil attach vulkansdk-macos-1.3.236.0.dmg + sudo /Volumes/vulkansdk-macos-1.3.236.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0 --accept-licenses --default-answer --confirm-command install + hdiutil detach /Volumes/vulkansdk-macos-1.3.236.0 + - name: build + run: | + export VULKAN_SDK=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/macOS + mkdir build && cd build + cmake -DCMAKE_CROSSCOMPILING=ON -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ + -DVulkan_INCLUDE_DIR=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/include \ + -DVulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/dylib/macOS/libMoltenVK.dylib \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . 
-j 3 + - name: build-shared + run: | + export VULKAN_SDK=`pwd`/vulkansdk-macos-1.3.236.0/macOS + mkdir build-shared && cd build-shared + cmake -DCMAKE_CROSSCOMPILING=ON -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ + -DVulkan_INCLUDE_DIR=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/include \ + -DVulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/dylib/macOS/libMoltenVK.dylib \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 3 diff --git a/.github/workflows/macos-x64-cpu-python.yml b/.github/workflows/macos-x64-cpu-python.yml new file mode 100644 index 00000000000..64472dc4da9 --- /dev/null +++ b/.github/workflows/macos-x64-cpu-python.yml @@ -0,0 +1,96 @@ +name: macos-x64-cpu-python +on: + push: + branches: [master] + paths: + - '.github/workflows/macos-x64-cpu-python.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'python/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/macos-x64-cpu-python.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'python/**' +concurrency: + group: macos-x64-cpu-python-${{ github.ref }} + cancel-in-progress: true +env: + DEVELOPER_DIR: /Applications/Xcode_14.0.1.app/Contents/Developer +permissions: + contents: read + +jobs: + macos-clang: + runs-on: macos-latest + strategy: + matrix: + python-version: [3.6, 3.7, 3.8, 3.9] + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: set up python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest setuptools wheel twine + - name: cache-openmp + id: cache-openmp + uses: actions/cache@v3 + with: + path: openmp-install + key: openmp-macos-install-20201213 + - name: openmp + if: steps.cache-openmp.outputs.cache-hit != 'true' + run: | + wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz + tar -xf openmp-11.0.0.src.tar.xz + cd openmp-11.0.0.src + sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S + sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S + mkdir -p build && cd build + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DCMAKE_OSX_ARCHITECTURES="x86_64;arm64" \ + -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. + cmake --build . -j 3 + cmake --build . 
--target install + mkdir $GITHUB_WORKSPACE/openmp-install + cp -r install/* $GITHUB_WORKSPACE/openmp-install + - name: install-openmp + run: | + sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include + sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_OSX_ARCHITECTURES=x86_64 \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ + -DNCNN_PYTHON=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j 3 + - name: install python + run: cd python && pip install . + - name: test + run: cd python && pytest tests + - name: build and publish + if: startsWith(github.ref, 'refs/tags') + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.TEST_PYPI_API_TOKEN }} + TWINE_REPOSITORY_URL: "https://test.pypi.org/legacy/" + run: | + cd python + python setup.py bdist_wheel + twine upload dist/* diff --git a/.github/workflows/macos-x64-cpu.yml b/.github/workflows/macos-x64-cpu.yml new file mode 100644 index 00000000000..1b1c0f75cc9 --- /dev/null +++ b/.github/workflows/macos-x64-cpu.yml @@ -0,0 +1,88 @@ +name: macos-x64-cpu +on: + push: + branches: [master] + paths: + - '.github/workflows/macos-x64-cpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/macos-x64-cpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' +concurrency: + group: macos-x64-cpu-${{ github.ref }} + cancel-in-progress: true +env: + DEVELOPER_DIR: /Applications/Xcode_14.0.1.app/Contents/Developer +permissions: + contents: read + +jobs: + macos-clang: + runs-on: macos-latest + steps: + - uses: actions/checkout@v3 + - name: protobuf + run: brew install protobuf opencv3 + - name: cache-openmp + id: cache-openmp + uses: actions/cache@v3 + with: + path: openmp-install + key: openmp-macos-install-20201213 + - name: openmp + if: steps.cache-openmp.outputs.cache-hit != 'true' + run: | + wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz + tar -xf openmp-11.0.0.src.tar.xz + cd openmp-11.0.0.src + sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S + sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S + mkdir -p build && cd build + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DCMAKE_OSX_ARCHITECTURES="x86_64;arm64" \ + -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. + cmake --build . -j 3 + cmake --build . 
--target install + mkdir $GITHUB_WORKSPACE/openmp-install + cp -r install/* $GITHUB_WORKSPACE/openmp-install + - name: install-openmp + run: | + sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include + sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_OSX_ARCHITECTURES=x86_64 \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ + -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 3 + - name: test + run: cd build && ctest --output-on-failure -j 3 + - name: build-shared + run: | + mkdir build-shared && cd build-shared + cmake -DCMAKE_OSX_ARCHITECTURES=x86_64 \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ + -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 3 diff --git a/.github/workflows/macos-x64-gpu.yml b/.github/workflows/macos-x64-gpu.yml new file mode 100644 index 00000000000..861c71e82d4 --- /dev/null +++ b/.github/workflows/macos-x64-gpu.yml @@ -0,0 +1,131 @@ +name: macos-x64-gpu +on: + push: + branches: [master] + paths: + - '.github/workflows/macos-x64-gpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'src/layer/vulkan/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/macos-x64-gpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'src/layer/vulkan/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' +concurrency: + group: macos-x64-gpu-${{ github.ref }} + cancel-in-progress: true +env: + DEVELOPER_DIR: /Applications/Xcode_14.0.1.app/Contents/Developer +permissions: + contents: read + +jobs: + macos-clang-gpu: + runs-on: macos-latest + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: protobuf + run: brew install protobuf opencv3 + - name: cache-openmp + id: cache-openmp + uses: actions/cache@v3 + with: + path: openmp-install + key: openmp-macos-install-20201213 + - name: openmp + if: steps.cache-openmp.outputs.cache-hit != 'true' + run: | + wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz + tar -xf openmp-11.0.0.src.tar.xz + cd openmp-11.0.0.src + sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S + sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S + mkdir -p build && cd build + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DCMAKE_OSX_ARCHITECTURES="x86_64;arm64" \ + -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. + cmake --build . -j 3 + cmake --build . 
--target install + mkdir $GITHUB_WORKSPACE/openmp-install + cp -r install/* $GITHUB_WORKSPACE/openmp-install + - name: install-openmp + run: | + sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include + sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib + - name: vulkansdk + run: | + wget https://sdk.lunarg.com/sdk/download/1.3.236.0/mac/vulkansdk-macos-1.3.236.0.dmg?Human=true -O vulkansdk-macos-1.3.236.0.dmg + hdiutil attach vulkansdk-macos-1.3.236.0.dmg + sudo /Volumes/vulkansdk-macos-1.3.236.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0 --accept-licenses --default-answer --confirm-command install + hdiutil detach /Volumes/vulkansdk-macos-1.3.236.0 + - name: cache-swiftshader + id: cache-swiftshader + uses: actions/cache@v3 + with: + path: swiftshader-install + key: swiftshader-macos-install-20230420 + - name: checkout-swiftshader + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: google/swiftshader + path: swiftshader + ref: dd55e592406dc0bae219df11adec6363840aff4a + - name: checkout-swiftshader-submodules + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + run: | + cd swiftshader + git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive + - name: swiftshader + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + run: | + cd swiftshader + mkdir -p build; cd build + cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. + cmake --build . -j 3 + mkdir $GITHUB_WORKSPACE/swiftshader-install + cp Darwin/* $GITHUB_WORKSPACE/swiftshader-install + - name: build + run: | + export VULKAN_SDK=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/macOS + mkdir build && cd build + cmake -DCMAKE_OSX_ARCHITECTURES=x86_64 \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 3 + - name: test + run: | + printf "[Processor]\nThreadCount=1\n" > build/tests/SwiftShader.ini + export DYLD_LIBRARY_PATH="$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/macOS/lib":$DYLD_LIBRARY_PATH + export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" + cd build && ctest --output-on-failure -j 3 + - name: build-shared + run: | + export VULKAN_SDK=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/macOS + mkdir build-shared && cd build-shared + cmake -DCMAKE_OSX_ARCHITECTURES=x86_64 \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ + -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON .. + cmake --build . 
-j 3 diff --git a/.github/workflows/release-python.yml b/.github/workflows/release-python.yml new file mode 100644 index 00000000000..d28cc54eb25 --- /dev/null +++ b/.github/workflows/release-python.yml @@ -0,0 +1,165 @@ +name: release-python +# on: [push, pull_request] +on: + push: + tags: + - '*' + +jobs: + build_sdist: + name: Build SDist + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + submodules: true + + - uses: actions/setup-python@v4 + with: + python-version: '3.x' + + - name: Install deps + run: python -m pip install twine build + + - name: Build SDist + run: python -m build -s + + - name: Check metadata + run: twine check dist/* + + - uses: actions/upload-artifact@v3 + with: + path: dist/*.tar.gz + + build_wheels: + name: ${{ matrix.arch }} ${{ matrix.build }} on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + include: + - { os: ubuntu-20.04, arch: x86_64, build: 'cp*-manylinux*' } + - { os: ubuntu-20.04, arch: x86_64, build: 'cp*-musllinux*' } + - { os: ubuntu-20.04, arch: x86_64, build: 'pp*' } + - { os: ubuntu-20.04, arch: i686, build: 'cp*-manylinux*' } + - { os: ubuntu-20.04, arch: i686, build: 'cp*-musllinux*' } + - { os: ubuntu-20.04, arch: i686, build: 'pp*' } + - { os: windows-2019, arch: x86, build: 'cp*' } + - { os: windows-2019, arch: AMD64, build: 'cp*' } + - { os: windows-2019, arch: AMD64, build: 'pp*' } + - { os: windows-2019, arch: ARM64, build: 'cp*' } + - { os: macos-latest, arch: x86_64, build: 'cp*' } + - { os: macos-latest, arch: x86_64, build: 'pp*' } + - { os: macos-latest, arch: arm64, build: 'cp*' } + - { os: macos-latest, arch: universal2, build: 'cp*' } + + steps: + - uses: actions/checkout@v3 + with: + submodules: true + + - uses: actions/setup-python@v4 + with: + python-version: '3.x' + + - name: brew uninstall libomp + if: matrix.os == 'macos-latest' + run: | + brew uninstall --ignore-dependencies libomp + + - name: Build wheels + uses: pypa/cibuildwheel@v2.12.3 + env: + CIBW_ARCHS_MACOS: ${{ matrix.arch }} + CIBW_ARCHS_LINUX: ${{ matrix.arch }} + CIBW_ARCHS_WINDOWS: ${{ matrix.arch }} + CIBW_BUILD: ${{ matrix.build }} + CIBW_BUILD_VERBOSITY: 1 + CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=2 + + - name: Show files + run: ls -lh wheelhouse + shell: bash + + - name: Verify clean directory + run: git diff --exit-code + shell: bash + + - name: Upload wheels + uses: actions/upload-artifact@v3 + with: + path: wheelhouse/*.whl + + build_wheels_qemu: + name: ${{ matrix.arch }} ${{ matrix.build }} + runs-on: ubuntu-20.04 + + strategy: + fail-fast: false + matrix: + arch: [aarch64, ppc64le, s390x] + build: ['cp36-*', 'cp37-*', 'cp38-*', 'cp39-*', 'cp310-*', 'cp311-*'] + include: + - arch: aarch64 + build: 'pp37-*' + - arch: aarch64 + build: 'pp38-*' + - arch: aarch64 + build: 'pp39-*' + + steps: + - uses: actions/checkout@v3 + with: + submodules: true + + - uses: actions/setup-python@v4 + with: + python-version: '3.x' + + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + with: + platforms: all + + - name: Build wheels + uses: pypa/cibuildwheel@v2.12.3 + env: + CIBW_ARCHS_LINUX: ${{ matrix.arch }} + CIBW_BUILD: ${{ matrix.build }} + CIBW_BUILD_VERBOSITY: 1 + CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=2 + + - name: Show files + run: ls -lh wheelhouse + shell: bash + + - name: Verify clean directory + run: git diff --exit-code + shell: bash + + - name: Upload wheels + uses: actions/upload-artifact@v3 + with: + path: wheelhouse/*.whl + + upload_all: + permissions: + contents: none + name: 
Upload + needs: [build_wheels, build_wheels_qemu, build_sdist] + runs-on: ubuntu-latest + + steps: + - uses: actions/setup-python@v4 + with: + python-version: '3.x' + + - uses: actions/download-artifact@v3 + with: + name: artifact + path: dist + + - uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 00000000000..8b8fa49222d --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,1802 @@ +name: release +on: + push: + tags: + - '*' + +env: + DEVELOPER_DIR: /Applications/Xcode_14.0.1.app/Contents/Developer + EMSCRIPTEN_VERSION: 3.1.28 + +permissions: + contents: read + +jobs: + + setup: + permissions: + contents: none + runs-on: ubuntu-latest + outputs: + VERSION: ${{ steps.get_version.outputs.VERSION }} + steps: + - name: get-version + id: get_version + run: echo "VERSION=${GITHUB_REF/refs\/tags\//}" >> $GITHUB_OUTPUT + + full-source: + needs: [setup] + runs-on: ubuntu-latest + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-full-source + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: package + run: | + rm -rf .git + rm -f /tmp/${{ env.PACKAGENAME }}.zip + zip -9 -y -r /tmp/${{ env.PACKAGENAME }}.zip . + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: /tmp/${{ env.PACKAGENAME }}.zip + + ubuntu-2004: + needs: [setup] + runs-on: ubuntu-20.04 + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2004 + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: apt + run: | + sudo apt-get install -y libvulkan-dev libprotobuf-dev protobuf-compiler + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 2 + cmake --build . --target install/strip + - name: package + run: | + rm -rf ${{ env.PACKAGENAME }} + mkdir -p ${{ env.PACKAGENAME }} + cp -a build/install/* ${{ env.PACKAGENAME }} + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + ubuntu-2004-shared: + needs: [setup] + runs-on: ubuntu-20.04 + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2004-shared + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: apt + run: | + sudo apt-get install -y libvulkan-dev libprotobuf-dev protobuf-compiler + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 2 + cmake --build . 
--target install/strip + - name: package + run: | + rm -rf ${{ env.PACKAGENAME }} + mkdir -p ${{ env.PACKAGENAME }} + cp -a -P build/install/* ${{ env.PACKAGENAME }} + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + ubuntu-2204: + needs: [setup] + runs-on: ubuntu-22.04 + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2204 + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: apt + run: | + sudo apt-get install -y libvulkan-dev libprotobuf-dev protobuf-compiler + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 2 + cmake --build . --target install/strip + - name: package + run: | + rm -rf ${{ env.PACKAGENAME }} + mkdir -p ${{ env.PACKAGENAME }} + cp -a build/install/* ${{ env.PACKAGENAME }} + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + ubuntu-2204-shared: + needs: [setup] + runs-on: ubuntu-22.04 + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2204-shared + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: apt + run: | + sudo apt-get install -y libvulkan-dev libprotobuf-dev protobuf-compiler + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 2 + cmake --build . --target install/strip + - name: package + run: | + rm -rf ${{ env.PACKAGENAME }} + mkdir -p ${{ env.PACKAGENAME }} + cp -a -P build/install/* ${{ env.PACKAGENAME }} + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + openmp-macos: + runs-on: macos-latest + steps: + - name: cache-openmp + id: cache-openmp + uses: actions/cache@v3 + with: + path: openmp-install + key: openmp-macos-release-11.0.0 + - name: checkout + if: steps.cache-openmp.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + - name: openmp + if: steps.cache-openmp.outputs.cache-hit != 'true' + run: | + wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz + tar -xf openmp-11.0.0.src.tar.xz + cd openmp-11.0.0.src + sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S + sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S + - name: build-x86_64 + if: steps.cache-openmp.outputs.cache-hit != 'true' + run: | + cd openmp-11.0.0.src + mkdir build-x86_64 && cd build-x86_64 + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DCMAKE_OSX_ARCHITECTURES="x86_64" \ + -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. + cmake --build . -j 3 + cmake --build . 
--target install/strip + - name: build-arm64 + if: steps.cache-openmp.outputs.cache-hit != 'true' + run: | + cd openmp-11.0.0.src + mkdir build-arm64 && cd build-arm64 + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DCMAKE_OSX_ARCHITECTURES="arm64" \ + -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. + cmake --build . -j 3 + cmake --build . --target install/strip + - name: merge-fat-library + if: steps.cache-openmp.outputs.cache-hit != 'true' + run: | + rm -rf $GITHUB_WORKSPACE/openmp-install + mkdir -p $GITHUB_WORKSPACE/openmp-install + cp -a openmp-11.0.0.src/build-x86_64/install/include $GITHUB_WORKSPACE/openmp-install + mkdir -p $GITHUB_WORKSPACE/openmp-install/lib + lipo -create openmp-11.0.0.src/build-x86_64/install/lib/libomp.a openmp-11.0.0.src/build-arm64/install/lib/libomp.a -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a + - name: upload + uses: actions/upload-artifact@v3 + with: + name: openmp-macos + path: openmp-install + + macos: + needs: [setup, openmp-macos] + runs-on: macos-latest + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-macos + steps: + - uses: actions/checkout@v3 + - name: download-openmp-macos + uses: actions/download-artifact@v3 + with: + name: openmp-macos + path: openmp-macos + - name: install-openmp + run: | + sudo cp openmp-macos/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include + sudo cp openmp-macos/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib + - name: build-x86_64 + run: | + mkdir build-x86_64 && cd build-x86_64 + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DCMAKE_OSX_ARCHITECTURES="x86_64" \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 3 + cmake --build . --target install/strip + - name: build-arm64 + run: | + mkdir build-arm64 && cd build-arm64 + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DCMAKE_OSX_ARCHITECTURES="arm64" \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 3 + cmake --build . 
--target install/strip + - name: package-openmp + run: | + rm -rf openmp.framework + mkdir -p openmp.framework/Versions/A/Headers + mkdir -p openmp.framework/Versions/A/Resources + ln -s A openmp.framework/Versions/Current + ln -s Versions/Current/Headers openmp.framework/Headers + ln -s Versions/Current/Resources openmp.framework/Resources + ln -s Versions/Current/openmp openmp.framework/openmp + cp openmp-macos/lib/libomp.a openmp.framework/Versions/A/openmp + cp -a openmp-macos/include/* openmp.framework/Versions/A/Headers/ + sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/11.0/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist + - name: package + run: | + rm -rf ncnn.framework + mkdir -p ncnn.framework/Versions/A/Headers + mkdir -p ncnn.framework/Versions/A/Resources + ln -s A ncnn.framework/Versions/Current + ln -s Versions/Current/Headers ncnn.framework/Headers + ln -s Versions/Current/Resources ncnn.framework/Resources + ln -s Versions/Current/ncnn ncnn.framework/ncnn + lipo -create build-x86_64/install/lib/libncnn.a build-arm64/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn + cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ + sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + macos-gpu: + needs: [setup, openmp-macos] + runs-on: macos-latest + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: download-openmp-macos + uses: actions/download-artifact@v3 + with: + name: openmp-macos + path: openmp-macos + - name: install-openmp + run: | + sudo cp openmp-macos/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include + sudo cp openmp-macos/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib + - name: vulkansdk + run: | + wget https://sdk.lunarg.com/sdk/download/1.3.236.0/mac/vulkansdk-macos-1.3.236.0.dmg?Human=true -O vulkansdk-macos-1.3.236.0.dmg + hdiutil attach vulkansdk-macos-1.3.236.0.dmg + sudo /Volumes/vulkansdk-macos-1.3.236.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0 --accept-licenses --default-answer --confirm-command install + hdiutil detach /Volumes/vulkansdk-macos-1.3.236.0 + - name: build-x86_64 + run: | + export VULKAN_SDK=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/macOS + mkdir build-x86_64 && cd build-x86_64 + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DCMAKE_OSX_ARCHITECTURES="x86_64" \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ + -DVulkan_INCLUDE_DIR=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/include \ + -DVulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/dylib/macOS/libMoltenVK.dylib \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . 
-j 3 + cmake --build . --target install/strip + - name: build-arm64 + run: | + export VULKAN_SDK=`pwd`/vulkansdk-macos-1.3.236.0/macOS + mkdir build-arm64 && cd build-arm64 + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DCMAKE_OSX_ARCHITECTURES="arm64" \ + -DCMAKE_CROSSCOMPILING=ON -DCMAKE_SYSTEM_PROCESSOR=arm64 \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib/libomp.a" \ + -DVulkan_INCLUDE_DIR=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/include \ + -DVulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/dylib/macOS/libMoltenVK.dylib \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 3 + cmake --build . --target install/strip + - name: package-openmp + run: | + rm -rf openmp.framework + mkdir -p openmp.framework/Versions/A/Headers + mkdir -p openmp.framework/Versions/A/Resources + ln -s A openmp.framework/Versions/Current + ln -s Versions/Current/Headers openmp.framework/Headers + ln -s Versions/Current/Resources openmp.framework/Resources + ln -s Versions/Current/openmp openmp.framework/openmp + cp openmp-macos/lib/libomp.a openmp.framework/Versions/A/openmp + cp -a openmp-macos/include/* openmp.framework/Versions/A/Headers/ + sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/11.0/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist + - name: package-glslang + run: | + rm -rf glslang.framework + mkdir -p glslang.framework/Versions/A/Headers + mkdir -p glslang.framework/Versions/A/Resources + ln -s A glslang.framework/Versions/Current + ln -s Versions/Current/Headers glslang.framework/Headers + ln -s Versions/Current/Resources glslang.framework/Resources + ln -s Versions/Current/glslang glslang.framework/glslang + libtool -static build-x86_64/install/lib/libglslang.a build-x86_64/install/lib/libMachineIndependent.a build-x86_64/install/lib/libGenericCodeGen.a build-x86_64/install/lib/libSPIRV.a build-x86_64/install/lib/libOGLCompiler.a build-x86_64/install/lib/libOSDependent.a -o build-x86_64/install/lib/libglslang_combined.a + libtool -static build-arm64/install/lib/libglslang.a build-arm64/install/lib/libMachineIndependent.a build-arm64/install/lib/libGenericCodeGen.a build-arm64/install/lib/libSPIRV.a build-arm64/install/lib/libOGLCompiler.a build-arm64/install/lib/libOSDependent.a -o build-arm64/install/lib/libglslang_combined.a + lipo -create build-x86_64/install/lib/libglslang_combined.a build-arm64/install/lib/libglslang_combined.a -o glslang.framework/Versions/A/glslang + cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ + sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist + - name: package + run: | + rm -rf ncnn.framework + mkdir -p ncnn.framework/Versions/A/Headers + mkdir -p ncnn.framework/Versions/A/Resources + ln -s A ncnn.framework/Versions/Current + ln -s Versions/Current/Headers ncnn.framework/Headers + ln -s Versions/Current/Resources ncnn.framework/Resources + ln -s Versions/Current/ncnn ncnn.framework/ncnn + lipo -create build-x86_64/install/lib/libncnn.a build-arm64/install/lib/libncnn.a -o 
ncnn.framework/Versions/A/ncnn + cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ + sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + openmp-ios: + runs-on: macos-latest + steps: + - name: cache-openmp + id: cache-openmp + uses: actions/cache@v3 + with: + path: openmp-install + key: openmp-ios-release-11.0.0 + - name: checkout + if: steps.cache-openmp.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + - name: openmp + if: steps.cache-openmp.outputs.cache-hit != 'true' + run: | + wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz + tar -xf openmp-11.0.0.src.tar.xz + cd openmp-11.0.0.src + sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S + sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S + - name: build + if: steps.cache-openmp.outputs.cache-hit != 'true' + run: | + cd openmp-11.0.0.src + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \ + -DIOS_PLATFORM=OS -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="armv7;arm64;arm64e" \ + -DPERL_EXECUTABLE=/usr/local/bin/perl \ + -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. + cmake --build . -j 3 + cmake --build . --target install + - name: build-simulator + if: steps.cache-openmp.outputs.cache-hit != 'true' + run: | + cd openmp-11.0.0.src + mkdir build-simulator && cd build-simulator + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \ + -DIOS_PLATFORM=SIMULATOR -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="i386;x86_64" \ + -DPERL_EXECUTABLE=/usr/local/bin/perl \ + -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. + cmake --build . -j 3 + cmake --build . 
--target install + - name: merge-fat-library + if: steps.cache-openmp.outputs.cache-hit != 'true' + run: | + rm -rf $GITHUB_WORKSPACE/openmp-install + mkdir -p $GITHUB_WORKSPACE/openmp-install + cp -a openmp-11.0.0.src/build/install/include $GITHUB_WORKSPACE/openmp-install + mkdir -p $GITHUB_WORKSPACE/openmp-install/lib + lipo -create openmp-11.0.0.src/build/install/lib/libomp.a openmp-11.0.0.src/build-simulator/install/lib/libomp.a -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a + - name: upload + uses: actions/upload-artifact@v3 + with: + name: openmp-ios + path: openmp-install + + ios: + needs: [setup, openmp-ios] + runs-on: macos-latest + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios + steps: + - uses: actions/checkout@v3 + - name: download-openmp-ios + uses: actions/download-artifact@v3 + with: + name: openmp-ios + path: openmp-ios + - name: install-openmp + run: | + sudo cp openmp-ios/include/* $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include + sudo cp openmp-ios/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib + sudo cp openmp-ios/include/* $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/include + sudo cp openmp-ios/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib + - name: build-armv7 + run: | + mkdir build-armv7 && cd build-armv7 + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DIOS_PLATFORM=OS -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="armv7" \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \ + -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 3 + cmake --build . --target install + - name: build-arm64 + run: | + mkdir build-arm64 && cd build-arm64 + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DIOS_PLATFORM=OS -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="arm64;arm64e" \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \ + -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 3 + cmake --build . --target install + - name: build-simulator + run: | + mkdir build-simulator && cd build-simulator + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DIOS_PLATFORM=SIMULATOR -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="i386;x86_64" \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib/libomp.a" \ + -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 3 + cmake --build . 
--target install + - name: package-openmp + run: | + rm -rf openmp.framework + mkdir -p openmp.framework/Versions/A/Headers + mkdir -p openmp.framework/Versions/A/Resources + ln -s A openmp.framework/Versions/Current + ln -s Versions/Current/Headers openmp.framework/Headers + ln -s Versions/Current/Resources openmp.framework/Resources + ln -s Versions/Current/openmp openmp.framework/openmp + cp openmp-ios/lib/libomp.a openmp.framework/Versions/A/openmp + cp -a openmp-ios/include/* openmp.framework/Versions/A/Headers/ + sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/11.0/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist + - name: package + run: | + rm -rf ncnn.framework + mkdir -p ncnn.framework/Versions/A/Headers + mkdir -p ncnn.framework/Versions/A/Resources + ln -s A ncnn.framework/Versions/Current + ln -s Versions/Current/Headers ncnn.framework/Headers + ln -s Versions/Current/Resources ncnn.framework/Resources + ln -s Versions/Current/ncnn ncnn.framework/ncnn + lipo -create build-armv7/install/lib/libncnn.a build-arm64/install/lib/libncnn.a build-simulator/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn + cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ + sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + ios-gpu: + needs: [setup, openmp-ios] + runs-on: macos-latest + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: download-openmp-ios + uses: actions/download-artifact@v3 + with: + name: openmp-ios + path: openmp-ios + - name: install-openmp + run: | + sudo cp openmp-ios/include/* $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include + sudo cp openmp-ios/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib + sudo cp openmp-ios/include/* $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/include + sudo cp openmp-ios/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib + - name: vulkansdk + run: | + wget https://sdk.lunarg.com/sdk/download/1.3.236.0/mac/vulkansdk-macos-1.3.236.0.dmg?Human=true -O vulkansdk-macos-1.3.236.0.dmg + hdiutil attach vulkansdk-macos-1.3.236.0.dmg + sudo /Volumes/vulkansdk-macos-1.3.236.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0 --accept-licenses --default-answer --confirm-command install + hdiutil detach /Volumes/vulkansdk-macos-1.3.236.0 + - name: build + run: | + export VULKAN_SDK=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/macOS + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DIOS_PLATFORM=OS64 -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="arm64;arm64e" \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + 
-DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \ + -DVulkan_INCLUDE_DIR=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/include \ + -DVulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/iOS/dynamic/libMoltenVK.dylib \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 3 + cmake --build . --target install + - name: build-simulator + run: | + export VULKAN_SDK=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/macOS + mkdir build-simulator && cd build-simulator + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DIOS_PLATFORM=SIMULATOR64 -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="x86_64" \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib/libomp.a" \ + -DVulkan_INCLUDE_DIR=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/include \ + -DVulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/iOS/dynamic/libMoltenVK.dylib \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 3 + cmake --build . --target install + - name: package-openmp + run: | + rm -rf openmp.framework + mkdir -p openmp.framework/Versions/A/Headers + mkdir -p openmp.framework/Versions/A/Resources + ln -s A openmp.framework/Versions/Current + ln -s Versions/Current/Headers openmp.framework/Headers + ln -s Versions/Current/Resources openmp.framework/Resources + ln -s Versions/Current/openmp openmp.framework/openmp + cp openmp-ios/lib/libomp.a openmp.framework/Versions/A/openmp + cp -a openmp-ios/include/* openmp.framework/Versions/A/Headers/ + sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/11.0/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist + - name: package-glslang + run: | + rm -rf glslang.framework + mkdir -p glslang.framework/Versions/A/Headers + mkdir -p glslang.framework/Versions/A/Resources + ln -s A glslang.framework/Versions/Current + ln -s Versions/Current/Headers glslang.framework/Headers + ln -s Versions/Current/Resources glslang.framework/Resources + ln -s Versions/Current/glslang glslang.framework/glslang + libtool -static build/install/lib/libglslang.a build/install/lib/libMachineIndependent.a build/install/lib/libGenericCodeGen.a build/install/lib/libSPIRV.a build/install/lib/libOGLCompiler.a build/install/lib/libOSDependent.a -o build/install/lib/libglslang_combined.a + libtool -static build-simulator/install/lib/libglslang.a build-simulator/install/lib/libMachineIndependent.a build-simulator/install/lib/libGenericCodeGen.a build-simulator/install/lib/libSPIRV.a build-simulator/install/lib/libOGLCompiler.a build-simulator/install/lib/libOSDependent.a -o build-simulator/install/lib/libglslang_combined.a + lipo -create build/install/lib/libglslang_combined.a build-simulator/install/lib/libglslang_combined.a -o glslang.framework/Versions/A/glslang + cp -a build/install/include/glslang glslang.framework/Versions/A/Headers/ + sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist + - name: package + run: | + rm -rf ncnn.framework + mkdir -p 
ncnn.framework/Versions/A/Headers + mkdir -p ncnn.framework/Versions/A/Resources + ln -s A ncnn.framework/Versions/Current + ln -s Versions/Current/Headers ncnn.framework/Headers + ln -s Versions/Current/Resources ncnn.framework/Resources + ln -s Versions/Current/ncnn ncnn.framework/ncnn + lipo -create build/install/lib/libncnn.a build-simulator/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn + cp -a build/install/include/* ncnn.framework/Versions/A/Headers/ + sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + openmp-ios-bitcode: + runs-on: macos-latest + steps: + - name: cache-openmp + id: cache-openmp + uses: actions/cache@v3 + with: + path: openmp-install + key: openmp-ios-bitcode-release-11.0.0 + - name: checkout + if: steps.cache-openmp.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + - name: openmp + if: steps.cache-openmp.outputs.cache-hit != 'true' + run: | + wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz + tar -xf openmp-11.0.0.src.tar.xz + cd openmp-11.0.0.src + sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S + sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S + - name: build + if: steps.cache-openmp.outputs.cache-hit != 'true' + run: | + cd openmp-11.0.0.src + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \ + -DIOS_PLATFORM=OS -DENABLE_BITCODE=1 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="armv7;arm64;arm64e" \ + -DPERL_EXECUTABLE=/usr/local/bin/perl \ + -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. + cmake --build . -j 3 + cmake --build . --target install + - name: build-simulator + if: steps.cache-openmp.outputs.cache-hit != 'true' + run: | + cd openmp-11.0.0.src + mkdir build-simulator && cd build-simulator + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \ + -DIOS_PLATFORM=SIMULATOR -DENABLE_BITCODE=1 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="i386;x86_64" \ + -DPERL_EXECUTABLE=/usr/local/bin/perl \ + -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF .. + cmake --build . -j 3 + cmake --build . 
--target install + - name: merge-fat-library + if: steps.cache-openmp.outputs.cache-hit != 'true' + run: | + rm -rf $GITHUB_WORKSPACE/openmp-install + mkdir -p $GITHUB_WORKSPACE/openmp-install + cp -a openmp-11.0.0.src/build/install/include $GITHUB_WORKSPACE/openmp-install + mkdir -p $GITHUB_WORKSPACE/openmp-install/lib + lipo -create openmp-11.0.0.src/build/install/lib/libomp.a openmp-11.0.0.src/build-simulator/install/lib/libomp.a -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a + - name: upload + uses: actions/upload-artifact@v3 + with: + name: openmp-ios-bitcode + path: openmp-install + + ios-bitcode: + needs: [setup, openmp-ios-bitcode] + runs-on: macos-latest + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-bitcode + steps: + - uses: actions/checkout@v3 + - name: download-openmp-ios-bitcode + uses: actions/download-artifact@v3 + with: + name: openmp-ios-bitcode + path: openmp-ios-bitcode + - name: install-openmp + run: | + sudo cp openmp-ios-bitcode/include/* $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include + sudo cp openmp-ios-bitcode/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib + sudo cp openmp-ios-bitcode/include/* $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/include + sudo cp openmp-ios-bitcode/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib + - name: build-armv7 + run: | + mkdir build-armv7 && cd build-armv7 + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DIOS_PLATFORM=OS -DENABLE_BITCODE=1 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="armv7" \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \ + -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 3 + cmake --build . --target install + - name: build-arm64 + run: | + mkdir build-arm64 && cd build-arm64 + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DIOS_PLATFORM=OS -DENABLE_BITCODE=1 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="arm64;arm64e" \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \ + -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 3 + cmake --build . 
--target install + - name: build-simulator + run: | + mkdir build-simulator && cd build-simulator + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DIOS_PLATFORM=SIMULATOR -DENABLE_BITCODE=1 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="i386;x86_64" \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib/libomp.a" \ + -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 3 + cmake --build . --target install + - name: package-openmp + run: | + rm -rf openmp.framework + mkdir -p openmp.framework/Versions/A/Headers + mkdir -p openmp.framework/Versions/A/Resources + ln -s A openmp.framework/Versions/Current + ln -s Versions/Current/Headers openmp.framework/Headers + ln -s Versions/Current/Resources openmp.framework/Resources + ln -s Versions/Current/openmp openmp.framework/openmp + cp openmp-ios-bitcode/lib/libomp.a openmp.framework/Versions/A/openmp + cp -a openmp-ios-bitcode/include/* openmp.framework/Versions/A/Headers/ + sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/11.0/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist + - name: package + run: | + rm -rf ncnn.framework + mkdir -p ncnn.framework/Versions/A/Headers + mkdir -p ncnn.framework/Versions/A/Resources + ln -s A ncnn.framework/Versions/Current + ln -s Versions/Current/Headers ncnn.framework/Headers + ln -s Versions/Current/Resources ncnn.framework/Resources + ln -s Versions/Current/ncnn ncnn.framework/ncnn + lipo -create build-armv7/install/lib/libncnn.a build-arm64/install/lib/libncnn.a build-simulator/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn + cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ + sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + ios-gpu-bitcode: + needs: [setup, openmp-ios-bitcode] + runs-on: macos-latest + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan-bitcode + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: download-openmp-ios-bitcode + uses: actions/download-artifact@v3 + with: + name: openmp-ios-bitcode + path: openmp-ios-bitcode + - name: install-openmp + run: | + sudo cp openmp-ios-bitcode/include/* $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include + sudo cp openmp-ios-bitcode/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib + sudo cp openmp-ios-bitcode/include/* $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/include + sudo cp openmp-ios-bitcode/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib + - name: vulkansdk + run: | + wget https://sdk.lunarg.com/sdk/download/1.3.236.0/mac/vulkansdk-macos-1.3.236.0.dmg?Human=true -O vulkansdk-macos-1.3.236.0.dmg + hdiutil attach vulkansdk-macos-1.3.236.0.dmg + sudo 
/Volumes/vulkansdk-macos-1.3.236.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0 --accept-licenses --default-answer --confirm-command install + hdiutil detach /Volumes/vulkansdk-macos-1.3.236.0 + - name: build + run: | + export VULKAN_SDK=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/macOS + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DIOS_PLATFORM=OS64 -DENABLE_BITCODE=1 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="arm64;arm64e" \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \ + -DVulkan_INCLUDE_DIR=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/include \ + -DVulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/iOS/dynamic/libMoltenVK.dylib \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 3 + cmake --build . --target install + - name: build-simulator + run: | + export VULKAN_SDK=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/macOS + mkdir build-simulator && cd build-simulator + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DIOS_PLATFORM=SIMULATOR64 -DENABLE_BITCODE=1 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="x86_64" \ + -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ + -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ + -DOpenMP_libomp_LIBRARY="$DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib/libomp.a" \ + -DVulkan_INCLUDE_DIR=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/include \ + -DVulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/iOS/dynamic/libMoltenVK.dylib \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 3 + cmake --build . 
--target install + - name: package-openmp + run: | + rm -rf openmp.framework + mkdir -p openmp.framework/Versions/A/Headers + mkdir -p openmp.framework/Versions/A/Resources + ln -s A openmp.framework/Versions/Current + ln -s Versions/Current/Headers openmp.framework/Headers + ln -s Versions/Current/Resources openmp.framework/Resources + ln -s Versions/Current/openmp openmp.framework/openmp + cp openmp-ios-bitcode/lib/libomp.a openmp.framework/Versions/A/openmp + cp -a openmp-ios-bitcode/include/* openmp.framework/Versions/A/Headers/ + sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/11.0/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist + - name: package-glslang + run: | + rm -rf glslang.framework + mkdir -p glslang.framework/Versions/A/Headers + mkdir -p glslang.framework/Versions/A/Resources + ln -s A glslang.framework/Versions/Current + ln -s Versions/Current/Headers glslang.framework/Headers + ln -s Versions/Current/Resources glslang.framework/Resources + ln -s Versions/Current/glslang glslang.framework/glslang + libtool -static build/install/lib/libglslang.a build/install/lib/libMachineIndependent.a build/install/lib/libGenericCodeGen.a build/install/lib/libSPIRV.a build/install/lib/libOGLCompiler.a build/install/lib/libOSDependent.a -o build/install/lib/libglslang_combined.a + libtool -static build-simulator/install/lib/libglslang.a build-simulator/install/lib/libMachineIndependent.a build-simulator/install/lib/libGenericCodeGen.a build-simulator/install/lib/libSPIRV.a build-simulator/install/lib/libOGLCompiler.a build-simulator/install/lib/libOSDependent.a -o build-simulator/install/lib/libglslang_combined.a + lipo -create build/install/lib/libglslang_combined.a build-simulator/install/lib/libglslang_combined.a -o glslang.framework/Versions/A/glslang + cp -a build/install/include/glslang glslang.framework/Versions/A/Headers/ + sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist + - name: package + run: | + rm -rf ncnn.framework + mkdir -p ncnn.framework/Versions/A/Headers + mkdir -p ncnn.framework/Versions/A/Resources + ln -s A ncnn.framework/Versions/Current + ln -s Versions/Current/Headers ncnn.framework/Headers + ln -s Versions/Current/Resources ncnn.framework/Resources + ln -s Versions/Current/ncnn ncnn.framework/ncnn + lipo -create build/install/lib/libncnn.a build-simulator/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn + cp -a build/install/include/* ncnn.framework/Versions/A/Headers/ + sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + android: + needs: [setup] + runs-on: ubuntu-latest + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android + steps: + - uses: actions/checkout@v3 + - name: ndk-fix-debug + run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake + - name: build-armv7 + run: | + mkdir build-armv7 && cd build-armv7 + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release 
-DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \ + -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 2 + cmake --build . --target install/strip + - name: build-aarch64 + run: | + mkdir build-aarch64 && cd build-aarch64 + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \ + -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 2 + cmake --build . --target install/strip + - name: build-x86 + run: | + mkdir build-x86 && cd build-x86 + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \ + -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 2 + cmake --build . --target install/strip + - name: build-x86_64 + run: | + mkdir build-x86_64 && cd build-x86_64 + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \ + -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 2 + cmake --build . --target install/strip + - name: package + run: | + rm -rf ${{ env.PACKAGENAME }} + mkdir -p ${{ env.PACKAGENAME }} + cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a + cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a + cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 + cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + android-shared: + needs: [setup] + runs-on: ubuntu-latest + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android-shared + steps: + - uses: actions/checkout@v3 + - name: ndk-fix-debug + run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake + - name: build-armv7 + run: | + mkdir build-armv7 && cd build-armv7 + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \ + -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 2 + cmake --build . --target install/strip + - name: build-aarch64 + run: | + mkdir build-aarch64 && cd build-aarch64 + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \ + -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . 
-j 2 + cmake --build . --target install/strip + - name: build-x86 + run: | + mkdir build-x86 && cd build-x86 + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \ + -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 2 + cmake --build . --target install/strip + - name: build-x86_64 + run: | + mkdir build-x86_64 && cd build-x86_64 + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \ + -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 2 + cmake --build . --target install/strip + - name: package + run: | + rm -rf ${{ env.PACKAGENAME }} + mkdir -p ${{ env.PACKAGENAME }} + cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a + cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a + cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 + cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + android-gpu: + needs: [setup] + runs-on: ubuntu-latest + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android-vulkan + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: cache-vulkansdk + id: cache-vulkansdk + uses: actions/cache@v3 + with: + path: "1.3.236.0" + key: vulkansdk-linux-x86_64-1.3.236.0 + - name: vulkansdk + if: steps.cache-vulkansdk.outputs.cache-hit != 'true' + run: | + wget https://sdk.lunarg.com/sdk/download/1.3.236.0/linux/vulkansdk-linux-x86_64-1.3.236.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.3.236.0.tar.gz + tar -xf vulkansdk-linux-x86_64-1.3.236.0.tar.gz + rm -rf 1.3.236.0/source 1.3.236.0/samples + find 1.3.236.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm + - name: ndk-fix-debug + run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake + - name: build-armv7 + run: | + export PATH=`pwd`/1.3.236.0/x86_64/bin:$PATH + mkdir build-armv7 && cd build-armv7 + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-24 \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 2 + cmake --build . --target install/strip + - name: build-aarch64 + run: | + export PATH=`pwd`/1.3.236.0/x86_64/bin:$PATH + mkdir build-aarch64 && cd build-aarch64 + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 2 + cmake --build . 
--target install/strip + - name: build-x86 + run: | + export PATH=`pwd`/1.3.236.0/x86_64/bin:$PATH + mkdir build-x86 && cd build-x86 + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-24 \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 2 + cmake --build . --target install/strip + - name: build-x86_64 + run: | + export PATH=`pwd`/1.3.236.0/x86_64/bin:$PATH + mkdir build-x86_64 && cd build-x86_64 + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-24 \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 2 + cmake --build . --target install/strip + - name: package + run: | + rm -rf ${{ env.PACKAGENAME }} + mkdir -p ${{ env.PACKAGENAME }} + cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a + cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a + cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 + cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + android-gpu-shared: + needs: [setup] + runs-on: ubuntu-latest + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android-vulkan-shared + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: cache-vulkansdk + id: cache-vulkansdk + uses: actions/cache@v3 + with: + path: "1.3.236.0" + key: vulkansdk-linux-x86_64-1.3.236.0 + - name: vulkansdk + if: steps.cache-vulkansdk.outputs.cache-hit != 'true' + run: | + wget https://sdk.lunarg.com/sdk/download/1.3.236.0/linux/vulkansdk-linux-x86_64-1.3.236.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.3.236.0.tar.gz + tar -xf vulkansdk-linux-x86_64-1.3.236.0.tar.gz + rm -rf 1.3.236.0/source 1.3.236.0/samples + find 1.3.236.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm + - name: ndk-fix-debug + run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake + - name: build-armv7 + run: | + export PATH=`pwd`/1.3.236.0/x86_64/bin:$PATH + mkdir build-armv7 && cd build-armv7 + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-24 \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 2 + cmake --build . 
--target install/strip + - name: build-aarch64 + run: | + export PATH=`pwd`/1.3.236.0/x86_64/bin:$PATH + mkdir build-aarch64 && cd build-aarch64 + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 2 + cmake --build . --target install/strip + - name: build-x86 + run: | + export PATH=`pwd`/1.3.236.0/x86_64/bin:$PATH + mkdir build-x86 && cd build-x86 + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-24 \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 2 + cmake --build . --target install/strip + - name: build-x86_64 + run: | + export PATH=`pwd`/1.3.236.0/x86_64/bin:$PATH + mkdir build-x86_64 && cd build-x86_64 + cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-24 \ + -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . -j 2 + cmake --build . --target install/strip + - name: package + run: | + rm -rf ${{ env.PACKAGENAME }} + mkdir -p ${{ env.PACKAGENAME }} + cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a + cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a + cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 + cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + webassembly: + needs: [setup] + runs-on: ubuntu-latest + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-webassembly + steps: + - uses: actions/checkout@v3 + - name: emsdk + run: | + git clone https://github.com/emscripten-core/emsdk.git + cd emsdk + ./emsdk install $EMSCRIPTEN_VERSION + ./emsdk activate $EMSCRIPTEN_VERSION + - name: build + run: | + source emsdk/emsdk_env.sh + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 2 + cmake --build . 
--target install/strip + - name: build-simd + run: | + source emsdk/emsdk_env.sh + mkdir build-simd && cd build-simd + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 2 + cmake --build . --target install/strip + - name: build-threads + run: | + source emsdk/emsdk_env.sh + mkdir build-threads && cd build-threads + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 2 + cmake --build . --target install/strip + - name: build-simd-threads + run: | + source emsdk/emsdk_env.sh + mkdir build-simd-threads && cd build-simd-threads + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . -j 2 + cmake --build . --target install/strip + - name: package + run: | + rm -rf ${{ env.PACKAGENAME }} + mkdir -p ${{ env.PACKAGENAME }} + cp -a build/install ${{ env.PACKAGENAME }}/basic + cp -a build-simd/install ${{ env.PACKAGENAME }}/simd + cp -a build-threads/install ${{ env.PACKAGENAME }}/threads + cp -a build-simd-threads/install ${{ env.PACKAGENAME }}/simd-threads + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + windows-vs2015: + needs: [setup] + runs-on: windows-2019 + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2015 + UseMultiToolTask: true + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: cache-protobuf + id: cache-protobuf + uses: actions/cache@v3 + with: + path: "protobuf-install" + key: protobuf-vs2015-x86-x64-install + - name: protobuf + if: steps.cache-protobuf.outputs.cache-hit != 'true' + run: | + Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip + 7z x ./protobuf-3.11.2.zip + cd protobuf-3.11.2 + mkdir build-x86; cd build-x86; + cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + cd .. 
+ mkdir build-x64; cd build-x64; + cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: vulkansdk + run: | + Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.236.0/windows/VulkanSDK-1.3.236.0-Installer.exe?Human=true -OutFile VulkanSDK.exe + .\VulkanSDK.exe --accept-licenses --default-answer --confirm-command install com.lunarg.vulkan.32bit + - name: build-x86 + run: | + $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" + $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" + mkdir build-x86; cd build-x86 + cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x86\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x86\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: build-x64 + run: | + $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" + $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" + mkdir build-x64; cd build-x64 + cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x64\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x64\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: package + run: | + mkdir ${{ env.PACKAGENAME }} + mkdir ${{ env.PACKAGENAME }}/x86 + mkdir ${{ env.PACKAGENAME }}/x64 + Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" + Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" + 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + windows-vs2015-shared: + needs: [setup] + runs-on: windows-2019 + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2015-shared + UseMultiToolTask: true + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: cache-protobuf + id: cache-protobuf + uses: actions/cache@v3 + with: + path: "protobuf-install" + key: protobuf-vs2015-x86-x64-install + - name: protobuf + if: steps.cache-protobuf.outputs.cache-hit != 'true' + run: | + Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip + 7z x ./protobuf-3.11.2.zip + cd protobuf-3.11.2 + mkdir build-x86; cd build-x86; + cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + cd .. 
+ mkdir build-x64; cd build-x64; + cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: vulkansdk + run: | + Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.236.0/windows/VulkanSDK-1.3.236.0-Installer.exe?Human=true -OutFile VulkanSDK.exe + .\VulkanSDK.exe --accept-licenses --default-answer --confirm-command install com.lunarg.vulkan.32bit + - name: build-x86 + run: | + $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" + $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" + mkdir build-x86; cd build-x86 + cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x86\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x86\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: build-x64 + run: | + $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" + $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" + mkdir build-x64; cd build-x64 + cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x64\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x64\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: package + run: | + mkdir ${{ env.PACKAGENAME }} + mkdir ${{ env.PACKAGENAME }}/x86 + mkdir ${{ env.PACKAGENAME }}/x64 + Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" + Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" + 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + windows-vs2017: + needs: [setup] + runs-on: windows-2019 + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2017 + UseMultiToolTask: true + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: cache-protobuf + id: cache-protobuf + uses: actions/cache@v3 + with: + path: "protobuf-install" + key: protobuf-vs2017-x86-x64-install + - name: protobuf + if: steps.cache-protobuf.outputs.cache-hit != 'true' + run: | + Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip + 7z x ./protobuf-3.11.2.zip + cd protobuf-3.11.2 + mkdir build-x86; cd build-x86; + cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + cd .. 
+ mkdir build-x64; cd build-x64; + cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: vulkansdk + run: | + Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.236.0/windows/VulkanSDK-1.3.236.0-Installer.exe?Human=true -OutFile VulkanSDK.exe + .\VulkanSDK.exe --accept-licenses --default-answer --confirm-command install com.lunarg.vulkan.32bit + - name: build-x86 + run: | + $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" + $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" + mkdir build-x86; cd build-x86 + cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x86\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x86\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: build-x64 + run: | + $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" + $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" + mkdir build-x64; cd build-x64 + cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x64\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x64\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: package + run: | + mkdir ${{ env.PACKAGENAME }} + mkdir ${{ env.PACKAGENAME }}/x86 + mkdir ${{ env.PACKAGENAME }}/x64 + Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" + Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" + 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + windows-vs2017-shared: + needs: [setup] + runs-on: windows-2019 + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2017-shared + UseMultiToolTask: true + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: cache-protobuf + id: cache-protobuf + uses: actions/cache@v3 + with: + path: "protobuf-install" + key: protobuf-vs2017-x86-x64-install + - name: protobuf + if: steps.cache-protobuf.outputs.cache-hit != 'true' + run: | + Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip + 7z x ./protobuf-3.11.2.zip + cd protobuf-3.11.2 + mkdir build-x86; cd build-x86; + cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + cd .. 
+ mkdir build-x64; cd build-x64; + cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: vulkansdk + run: | + Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.236.0/windows/VulkanSDK-1.3.236.0-Installer.exe?Human=true -OutFile VulkanSDK.exe + .\VulkanSDK.exe --accept-licenses --default-answer --confirm-command install com.lunarg.vulkan.32bit + - name: build-x86 + run: | + $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" + $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" + mkdir build-x86; cd build-x86 + cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x86\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x86\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: build-x64 + run: | + $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" + $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" + mkdir build-x64; cd build-x64 + cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x64\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x64\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: package + run: | + mkdir ${{ env.PACKAGENAME }} + mkdir ${{ env.PACKAGENAME }}/x86 + mkdir ${{ env.PACKAGENAME }}/x64 + Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" + Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" + 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + windows-vs2019: + needs: [setup] + runs-on: windows-latest + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2019 + UseMultiToolTask: true + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: cache-protobuf + id: cache-protobuf + uses: actions/cache@v3 + with: + path: "protobuf-install" + key: protobuf-vs2019-x86-x64-install + - name: protobuf + if: steps.cache-protobuf.outputs.cache-hit != 'true' + run: | + Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip + 7z x ./protobuf-3.11.2.zip + cd protobuf-3.11.2 + mkdir build-x86; cd build-x86; + cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + cd .. 
+ mkdir build-x64; cd build-x64; + cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: vulkansdk + run: | + Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.236.0/windows/VulkanSDK-1.3.236.0-Installer.exe?Human=true -OutFile VulkanSDK.exe + .\VulkanSDK.exe --accept-licenses --default-answer --confirm-command install com.lunarg.vulkan.32bit + - name: build-x86 + run: | + $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" + $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" + mkdir build-x86; cd build-x86 + cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x86\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x86\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: build-x64 + run: | + $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" + $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" + mkdir build-x64; cd build-x64 + cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x64\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x64\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: build-arm + run: | + mkdir build-arm; cd build-arm + cmake -T v142,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: build-arm64 + run: | + mkdir build-arm64; cd build-arm64 + cmake -T v142,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . --config Release -j 2 + cmake --build . 
--config Release --target install + - name: package + run: | + mkdir ${{ env.PACKAGENAME }} + mkdir ${{ env.PACKAGENAME }}/x86 + mkdir ${{ env.PACKAGENAME }}/x64 + mkdir ${{ env.PACKAGENAME }}/arm + mkdir ${{ env.PACKAGENAME }}/arm64 + Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" + Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" + Copy-Item -Verbose -Recurse -Path "build-arm\install\*" -Destination "${{ env.PACKAGENAME }}\arm" + Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64" + 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + windows-vs2019-shared: + needs: [setup] + runs-on: windows-latest + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2019-shared + UseMultiToolTask: true + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: cache-protobuf + id: cache-protobuf + uses: actions/cache@v3 + with: + path: "protobuf-install" + key: protobuf-vs2019-x86-x64-install + - name: protobuf + if: steps.cache-protobuf.outputs.cache-hit != 'true' + run: | + Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip + 7z x ./protobuf-3.11.2.zip + cd protobuf-3.11.2 + mkdir build-x86; cd build-x86; + cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + cd .. + mkdir build-x64; cd build-x64; + cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: vulkansdk + run: | + Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.236.0/windows/VulkanSDK-1.3.236.0-Installer.exe?Human=true -OutFile VulkanSDK.exe + .\VulkanSDK.exe --accept-licenses --default-answer --confirm-command install com.lunarg.vulkan.32bit + - name: build-x86 + run: | + $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" + $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" + mkdir build-x86; cd build-x86 + cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x86\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x86\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . --config Release -j 2 + cmake --build . 
--config Release --target install + - name: build-x64 + run: | + $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" + $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" + mkdir build-x64; cd build-x64 + cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x64\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x64\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: build-arm + run: | + mkdir build-arm; cd build-arm + cmake -T v142,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: build-arm64 + run: | + mkdir build-arm64; cd build-arm64 + cmake -T v142,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: package + run: | + mkdir ${{ env.PACKAGENAME }} + mkdir ${{ env.PACKAGENAME }}/x86 + mkdir ${{ env.PACKAGENAME }}/x64 + mkdir ${{ env.PACKAGENAME }}/arm + mkdir ${{ env.PACKAGENAME }}/arm64 + Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" + Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" + Copy-Item -Verbose -Recurse -Path "build-arm\install\*" -Destination "${{ env.PACKAGENAME }}\arm" + Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64" + 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + windows-vs2022: + needs: [setup] + runs-on: windows-latest + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2022 + UseMultiToolTask: true + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: cache-protobuf + id: cache-protobuf + uses: actions/cache@v3 + with: + path: "protobuf-install" + key: protobuf-vs2022-x86-x64-install + - name: protobuf + if: steps.cache-protobuf.outputs.cache-hit != 'true' + run: | + Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip + 7z x ./protobuf-3.11.2.zip + cd protobuf-3.11.2 + mkdir build-x86; cd build-x86; + cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + cd .. + mkdir build-x64; cd build-x64; + cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 2 + cmake --build . 
--config Release --target install + - name: vulkansdk + run: | + Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.236.0/windows/VulkanSDK-1.3.236.0-Installer.exe?Human=true -OutFile VulkanSDK.exe + .\VulkanSDK.exe --accept-licenses --default-answer --confirm-command install com.lunarg.vulkan.32bit + - name: build-x86 + run: | + $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" + $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" + mkdir build-x86; cd build-x86 + cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x86\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x86\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: build-x64 + run: | + $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" + $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" + mkdir build-x64; cd build-x64 + cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x64\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x64\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: build-arm + run: | + mkdir build-arm; cd build-arm + cmake -T v143,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: build-arm64 + run: | + mkdir build-arm64; cd build-arm64 + cmake -T v143,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. + cmake --build . --config Release -j 2 + cmake --build . 
--config Release --target install + - name: package + run: | + mkdir ${{ env.PACKAGENAME }} + mkdir ${{ env.PACKAGENAME }}/x86 + mkdir ${{ env.PACKAGENAME }}/x64 + mkdir ${{ env.PACKAGENAME }}/arm + mkdir ${{ env.PACKAGENAME }}/arm64 + Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" + Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" + Copy-Item -Verbose -Recurse -Path "build-arm\install\*" -Destination "${{ env.PACKAGENAME }}\arm" + Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64" + 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + windows-vs2022-shared: + needs: [setup] + runs-on: windows-latest + env: + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2022-shared + UseMultiToolTask: true + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: cache-protobuf + id: cache-protobuf + uses: actions/cache@v3 + with: + path: "protobuf-install" + key: protobuf-vs2022-x86-x64-install + - name: protobuf + if: steps.cache-protobuf.outputs.cache-hit != 'true' + run: | + Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip + 7z x ./protobuf-3.11.2.zip + cd protobuf-3.11.2 + mkdir build-x86; cd build-x86; + cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + cd .. + mkdir build-x64; cd build-x64; + cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: vulkansdk + run: | + Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.236.0/windows/VulkanSDK-1.3.236.0-Installer.exe?Human=true -OutFile VulkanSDK.exe + .\VulkanSDK.exe --accept-licenses --default-answer --confirm-command install com.lunarg.vulkan.32bit + - name: build-x86 + run: | + $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" + $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" + mkdir build-x86; cd build-x86 + cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x86\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x86\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . --config Release -j 2 + cmake --build . 
--config Release --target install + - name: build-x64 + run: | + $env:VULKAN_SDK="C:/VulkanSDK/1.3.236.0" + $env:Path+=";C:/VulkanSDK/1.3.236.0/Bin" + mkdir build-x64; cd build-x64 + cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\x64\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\x64\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: build-arm + run: | + mkdir build-arm; cd build-arm + cmake -T v143,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: build-arm64 + run: | + mkdir build-arm64; cd build-arm64 + cmake -T v143,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: package + run: | + mkdir ${{ env.PACKAGENAME }} + mkdir ${{ env.PACKAGENAME }}/x86 + mkdir ${{ env.PACKAGENAME }}/x64 + mkdir ${{ env.PACKAGENAME }}/arm + mkdir ${{ env.PACKAGENAME }}/arm64 + Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" + Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" + Copy-Item -Verbose -Recurse -Path "build-arm\install\*" -Destination "${{ env.PACKAGENAME }}\arm" + Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64" + 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} + - name: upload-zip + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGENAME }} + path: ${{ env.PACKAGENAME }}.zip + + release: + permissions: + contents: write # for softprops/action-gh-release to create a release + needs: [setup, full-source, ubuntu-2004, ubuntu-2004-shared, ubuntu-2204, ubuntu-2204-shared, macos, macos-gpu, ios, ios-gpu, ios-bitcode, ios-gpu-bitcode, android, android-shared, android-gpu, android-gpu-shared, webassembly, windows-vs2015, windows-vs2015-shared, windows-vs2017, windows-vs2017-shared, windows-vs2019, windows-vs2019-shared, windows-vs2022, windows-vs2022-shared] + runs-on: ubuntu-latest + steps: + - name: download + uses: actions/download-artifact@v3 + with: + path: artifacts + + - name: create-release + uses: softprops/action-gh-release@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + tag_name: ${{ needs.setup.outputs.VERSION }} + name: Release ${{ needs.setup.outputs.VERSION }} + files: artifacts/*/*.zip diff --git a/.github/workflows/sync-wiki.yml b/.github/workflows/sync-wiki.yml new file mode 100644 index 00000000000..11dfa08058b --- /dev/null +++ b/.github/workflows/sync-wiki.yml @@ -0,0 +1,32 @@ +name: sync-wiki +on: + push: + branches: [master] + paths: + - '.github/workflows/sync-wiki.yml' + - 'docs/**' +concurrency: + group: sync-wiki-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + 
+jobs: + sync-wiki: + permissions: + contents: write # for Git to git push + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: sync + run: | + cp -r docs $GITHUB_WORKSPACE/ncnn.wiki + cd $GITHUB_WORKSPACE/ncnn.wiki + git config --global user.name "wiki-sync-bot" + git config --global user.email "wiki-sync-bot@qq.com" + git init + git add . + git commit -m "sync" + git remote add upstream https://${{ secrets.WIKI_SYNC_BOT_TOKEN }}@github.com/Tencent/ncnn.wiki.git + git push upstream master -f diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml new file mode 100644 index 00000000000..87401acd00f --- /dev/null +++ b/.github/workflows/test-coverage.yml @@ -0,0 +1,147 @@ +name: test-coverage +on: + push: + branches: [master] + paths: + - '.github/workflows/test-coverage.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/**' + - 'tests/**' + - 'toolchains/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/test-coverage.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/**' + - 'tests/**' + - 'toolchains/**' +concurrency: + group: test-coverage-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-gcc-gpu-t4: + runs-on: [self-hosted, linux, t4] + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: build + env: + CC: gcc + CXX: g++ + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + export VULKAN_SDK=/data/action/osd/1.2.189.0/x86_64 + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_VULKAN=ON -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_XOP=OFF -DNCNN_AVXVNNI=OFF -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j 4 + - name: test + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: cd build && ctest --output-on-failure -j 4 + - name: lcov-collect + run: | + cd build + lcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/install/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov --list lcov.info + - name: codecov + id: codecov + continue-on-error: true + uses: codecov/codecov-action@v3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: build/lcov.info + - name: codecov-vlen256-retry-1 + continue-on-error: true + id: codecov-vlen256-retry-1 + if: steps.codecov.outcome=='failure' + uses: codecov/codecov-action@v3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: build/lcov.info + - name: codecov-vlen256-retry-2 + continue-on-error: true + id: codecov-vlen256-retry-2 + if: steps.codecov-vlen256-retry-1.outcome=='failure' + uses: codecov/codecov-action@v3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: build/lcov.info + - name: codecov-vlen256-retry-3 + continue-on-error: true + id: codecov-vlen256-retry-3 + if: steps.codecov-vlen256-retry-2.outcome=='failure' + uses: codecov/codecov-action@v3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: build/lcov.info + - name: codecov-vlen256-retry-4 + continue-on-error: true + id: codecov-vlen256-retry-4 + if: steps.codecov-vlen256-retry-3.outcome=='failure' + uses: codecov/codecov-action@v3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: build/lcov.info + - name: codecov-vlen256-retry-5 + continue-on-error: true + id: codecov-vlen256-retry-5 + if: steps.codecov-vlen256-retry-4.outcome=='failure' + uses: codecov/codecov-action@v3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: build/lcov.info + - name: set the status + if: always() + run: | + if ${{ steps.codecov.outcome=='success' || steps.codecov-vlen256-retry-1.outcome=='success' || steps.codecov-vlen256-retry-2.outcome=='success' || steps.codecov-vlen256-retry-3.outcome=='success' || steps.codecov-vlen256-retry-4.outcome=='success' || steps.codecov-vlen256-retry-5.outcome=='success' }}; then + echo fine + else + exit 1 + fi + + linux-gcc-x64-avx512-spr: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + - name: update + run: sudo apt-get update + - name: gcc12 + run: sudo apt-get install gcc-12 g++-12 + - name: lcov + run: sudo apt-get install lcov + - name: Setup SDE binaries + uses: petarpetrovt/setup-sde@v2 + - name: build-avx512-spr + env: + CC: gcc-12 + CXX: g++-12 + run: | + mkdir build-avx512-spr && cd build-avx512-spr + cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_AVX512BF16=ON -DNCNN_AVX512FP16=ON -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j 2 + - name: test-avx512-spr + run: | + cd build-avx512-spr + TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-spr;--" ctest --output-on-failure -j 2 + - name: lcov-collect + run: | + cd build-avx512-spr + lcov --gcov-tool gcov-12 -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build-avx512-spr/*' -o lcov.info + lcov --list lcov.info + - name: codecov-avx512-spr + uses: codecov/codecov-action@v3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: build-avx512-spr/lcov.info diff --git a/.github/workflows/web-assembly.yml b/.github/workflows/web-assembly.yml new file mode 100644 index 00000000000..61756a2d059 --- /dev/null +++ b/.github/workflows/web-assembly.yml @@ -0,0 +1,76 @@ +name: web-assembly +on: + push: + branches: [master] + paths: + - '.github/workflows/web-assembly.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/web-assembly.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + +env: + EMSCRIPTEN_VERSION: 3.1.28 + +concurrency: + group: web-assembly-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + webassembly: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: emsdk + run: | + git clone https://github.com/emscripten-core/emsdk.git + cd emsdk + ./emsdk install $EMSCRIPTEN_VERSION + ./emsdk activate $EMSCRIPTEN_VERSION + - name: build-basic + run: | + source emsdk/emsdk_env.sh + export LDFLAGS="-sERROR_ON_WASM_CHANGES_AFTER_LINK -sWASM_BIGINT -O1" + mkdir build-basic && cd build-basic + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 2 + - name: test-basic + run: | + cd build-basic + TESTS_EXECUTABLE_LOADER=node ctest --output-on-failure -j 2 + - name: build-simd + run: | + source emsdk/emsdk_env.sh + export LDFLAGS="-sERROR_ON_WASM_CHANGES_AFTER_LINK -sWASM_BIGINT -O1" + mkdir build-simd && cd build-simd + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 2 + - name: test-simd + run: | + cd build-simd + TESTS_EXECUTABLE_LOADER=node TESTS_EXECUTABLE_LOADER_ARGUMENTS="--experimental-wasm-simd" ctest --output-on-failure -j 2 + - name: build-simd-omp + run: | + source emsdk/emsdk_env.sh + export LDFLAGS="-sERROR_ON_WASM_CHANGES_AFTER_LINK -sWASM_BIGINT -O1" + mkdir build-simd-omp && cd build-simd-omp + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j 2 + - name: test-simd-omp + run: | + cd build-simd-omp + TESTS_EXECUTABLE_LOADER=node TESTS_EXECUTABLE_LOADER_ARGUMENTS="--experimental-wasm-simd;--experimental-wasm-threads" ctest --output-on-failure -j 2 diff --git a/.github/workflows/windows-arm-cpu.yml b/.github/workflows/windows-arm-cpu.yml new file mode 100644 index 00000000000..d789482a595 --- /dev/null +++ b/.github/workflows/windows-arm-cpu.yml @@ -0,0 +1,57 @@ +name: windows-arm-cpu +on: + push: + branches: [master] + paths: + - '.github/workflows/windows-arm-cpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/windows-arm-cpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' + - 'tests/**' +concurrency: + group: windows-arm-cpu-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + windows: + name: ${{ matrix.vs-version }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + include: + - vs-version: vs2019 + toolset-version: v142 + os: windows-2022 + + - vs-version: vs2022 + toolset-version: v143 + os: windows-2022 + + env: + UseMultiToolTask: true + steps: + - uses: actions/checkout@v3 + - name: build + run: | + mkdir build; cd build + cmake -T ${{ matrix.toolset-version }},host=x64 -A arm -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . --config Release -j 2 + - name: build-shared + run: | + mkdir build-shared; cd build-shared + cmake -T ${{ matrix.toolset-version }},host=x64 -A arm -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . --config Release -j 2 diff --git a/.github/workflows/windows-arm64-cpu.yml b/.github/workflows/windows-arm64-cpu.yml new file mode 100644 index 00000000000..a6bdbda01de --- /dev/null +++ b/.github/workflows/windows-arm64-cpu.yml @@ -0,0 +1,57 @@ +name: windows-arm64-cpu +on: + push: + branches: [master] + paths: + - '.github/workflows/windows-arm64-cpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/windows-arm64-cpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/arm/**' + - 'tests/**' +concurrency: + group: windows-arm64-cpu-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + windows: + name: ${{ matrix.vs-version }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + include: + - vs-version: vs2019 + toolset-version: v142 + os: windows-2022 + + - vs-version: vs2022 + toolset-version: v143 + os: windows-2022 + + env: + UseMultiToolTask: true + steps: + - uses: actions/checkout@v3 + - name: build + run: | + mkdir build; cd build + cmake -T ${{ matrix.toolset-version }},host=x64 -A arm64 -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . --config Release -j 2 + - name: build-shared + run: | + mkdir build-shared; cd build-shared + cmake -T ${{ matrix.toolset-version }},host=x64 -A arm64 -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . 
--config Release -j 2 diff --git a/.github/workflows/windows-x64-cpu-vs2019-python.yml b/.github/workflows/windows-x64-cpu-vs2019-python.yml new file mode 100644 index 00000000000..3d4e6583766 --- /dev/null +++ b/.github/workflows/windows-x64-cpu-vs2019-python.yml @@ -0,0 +1,67 @@ +name: windows-x64-cpu-vs2019-python +on: + push: + branches: [master] + paths: + - '.github/workflows/windows-x64-cpu-vs2019-python.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'python/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/windows-x64-cpu-vs2019-python.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'python/**' +concurrency: + group: windows-x64-cpu-vs2019-python-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + windows-vs2019-python: + runs-on: windows-latest + strategy: + matrix: + python-version: [3.6, 3.7, 3.8, 3.9] + env: + UseMultiToolTask: true + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: set up python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest setuptools wheel twine + - name: build + run: | + mkdir build; cd build + cmake -T v142,host=x64 -A x64 -DNCNN_PYTHON=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=OFF .. + cmake --build . --config Release -j 2 + - name: install python + run: cd python && pip install . + - name: test + run: cd python && pytest tests + - name: build and publish + if: startsWith(github.ref, 'refs/tags') + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.TEST_PYPI_API_TOKEN }} + TWINE_REPOSITORY_URL: "https://test.pypi.org/legacy/" + run: | + cd python + python setup.py bdist_wheel + twine upload dist/* diff --git a/.github/workflows/windows-x64-cpu.yml b/.github/workflows/windows-x64-cpu.yml new file mode 100644 index 00000000000..200185d1a56 --- /dev/null +++ b/.github/workflows/windows-x64-cpu.yml @@ -0,0 +1,102 @@ +name: windows-x64-cpu +on: + push: + branches: [master] + paths: + - '.github/workflows/windows-x64-cpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/windows-x64-cpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' +concurrency: + group: windows-x64-cpu-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + windows: + name: ${{ matrix.vs-version }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + include: + - vs-version: vs2015 + toolset-version: v140 + os: windows-2019 + + - vs-version: vs2017 + toolset-version: v141 + os: windows-2019 + + - vs-version: vs2019 + toolset-version: v142 + os: windows-2022 + + - vs-version: vs2022 + toolset-version: v143 + os: windows-2022 + + env: + UseMultiToolTask: true + steps: + - uses: actions/checkout@v3 + - name: cache-protobuf + id: cache-protobuf + uses: actions/cache@v3 + with: + path: "protobuf-install" + key: protobuf-${{ matrix.vs-version }}-x64-install-2 + - name: protobuf + if: steps.cache-protobuf.outputs.cache-hit != 'true' + run: | + Invoke-WebRequest -Uri 
https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip + 7z x ./protobuf-3.11.2.zip + cd protobuf-3.11.2 + mkdir build-${{ matrix.vs-version }}; cd build-${{ matrix.vs-version }}; cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF -DNCNN_BUILD_TESTS=ON ../cmake + cmake --build . --config Release -j 2 + cmake --build . --config Release --target install + - name: build-sse2 + run: | + mkdir build-sse2; cd build-sse2 + cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\bin\protoc.exe" -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . --config Release -j 2 + - name: test-sse2 + run: cd build-sse2; ctest -C Release --output-on-failure -j 2 + - name: build-shared + run: | + mkdir build-shared; cd build-shared + cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\bin\protoc.exe" -DNCNN_RUNTIME_CPU=ON -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON .. + cmake --build . --config Release -j 2 + - name: build-avx2 + run: | + mkdir build-avx2; cd build-avx2 + cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\bin\protoc.exe" -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_AVXVNNI=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . --config Release -j 2 + - name: test-avx2 + run: cd build-avx2; ctest -C Release --output-on-failure -j 2 + - name: build-avx + run: | + mkdir build-avx; cd build-avx + cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\bin\protoc.exe" -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=OFF -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
--config Release -j 2 + - name: test-avx + run: cd build-avx; ctest -C Release --output-on-failure -j 2 diff --git a/.github/workflows/windows-x64-gpu.yml b/.github/workflows/windows-x64-gpu.yml index 53ec0ef71bc..118e408498c 100644 --- a/.github/workflows/windows-x64-gpu.yml +++ b/.github/workflows/windows-x64-gpu.yml @@ -60,8 +60,6 @@ jobs: env: UseMultiToolTask: true steps: - - name: Setup Debug Session - uses: csexton/debugger-action@master - uses: actions/checkout@v3 with: submodules: true diff --git a/.github/workflows/windows-x86-cpu.yml b/.github/workflows/windows-x86-cpu.yml new file mode 100644 index 00000000000..b48431a97ac --- /dev/null +++ b/.github/workflows/windows-x86-cpu.yml @@ -0,0 +1,67 @@ +name: windows-x86-cpu +on: + push: + branches: [master] + paths: + - '.github/workflows/windows-x86-cpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/windows-x86-cpu.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' +concurrency: + group: windows-x86-cpu-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + windows-x86: + name: ${{ matrix.vs-version }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + include: + - vs-version: vs2015 + toolset-version: v140 + os: windows-2019 + + - vs-version: vs2017 + toolset-version: v141 + os: windows-2019 + + - vs-version: vs2019 + toolset-version: v142 + os: windows-2022 + + - vs-version: vs2022 + toolset-version: v143 + os: windows-2022 + + env: + UseMultiToolTask: true + steps: + - uses: actions/checkout@v3 + - name: build + run: | + mkdir build; cd build + cmake -T ${{ matrix.toolset-version }},host=x64 -A Win32 -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . --config Release -j 2 + - name: test + run: cd build; ctest -C Release --output-on-failure -j 2 + - name: build-shared + run: | + mkdir build-shared; cd build-shared + cmake -T ${{ matrix.toolset-version }},host=x64 -A Win32 -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON .. + cmake --build . 
--config Release -j 2 diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index dd71d622de9..1b278dd35cd 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -18,11 +18,6 @@ struct gridsample_2d_bicubic_compute_blob void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) { const int grid_size = grid.w * grid.h; -#if __AVX__ - const __m256 vImgWf = _mm256_set1_ps(src.w); - const __m256 vImgHf = _mm256_set1_ps(src.h); - const __m256 vElempackf = _mm256_set1_ps(src.elempack); -#endif // __AVX__ float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; @@ -67,10 +62,10 @@ struct gridsample_2d_bicubic_compute_blob // compute coord { // x - gx = unormalize(vImgWf, gx); + gx = unormalize(_mm256_set1_ps(src.w), gx); // y - gy = unormalize(vImgHf, gy); + gy = unormalize(_mm256_set1_ps(src.h), gy); } __m256 gx_floor = _mm256_floor_ps(gx); @@ -79,37 +74,38 @@ struct gridsample_2d_bicubic_compute_blob const __m256 tx = _mm256_sub_ps(gx, gx_floor); const __m256 ty = _mm256_sub_ps(gy, gy_floor); - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); + __m256 gx0 = _mm256_add_ps(gx_floor, _mm256_set1_ps(-1)); __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx2, *(__m256*)_ps256_1); + __m256 gx2 = _mm256_add_ps(gx_floor, _mm256_set1_ps(1)); + __m256 gx3 = _mm256_add_ps(gx2, _mm256_set1_ps(1)); - gx0 = get_coord(vImgWf, gx0); - gx1 = get_coord(vImgWf, gx1); - gx2 = get_coord(vImgWf, gx2); - gx3 = get_coord(vImgWf, gx3); + gx0 = get_coord(_mm256_set1_ps(src.w), gx0); + gx1 = get_coord(_mm256_set1_ps(src.w), gx1); + gx2 = get_coord(_mm256_set1_ps(src.w), gx2); + gx3 = get_coord(_mm256_set1_ps(src.w), gx3); for (int i = 0; i < 4; i++) { gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - gy = get_coord(vImgHf, gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); - __m256 gy_offset = _mm256_mul_ps(gy, vImgWf); + __m256 gy_offset = _mm256_mul_ps(gy, _mm256_set1_ps(src.w)); - __m256 v0_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx0), vElempackf); - __m256 v1_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx1), vElempackf); - __m256 v2_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx2), vElempackf); - __m256 v3_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx3), vElempackf); + volatile float epack = src.elempack; + __m256 v0_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx0), _mm256_set1_ps(epack)); + __m256 v1_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx1), _mm256_set1_ps(epack)); + __m256 v2_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx2), _mm256_set1_ps(epack)); + __m256 v3_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx3), _mm256_set1_ps(epack)); _mm256_storeu_ps(v0_offset_ptr[i], v0_offset_f); _mm256_storeu_ps(v1_offset_ptr[i], v1_offset_f); _mm256_storeu_ps(v2_offset_ptr[i], v2_offset_f); _mm256_storeu_ps(v3_offset_ptr[i], v3_offset_f); - _mm256_storeu_ps(v0_in_bound_ptr[i], *(__m256*)_ps256_n1); - _mm256_storeu_ps(v1_in_bound_ptr[i], *(__m256*)_ps256_n1); - _mm256_storeu_ps(v2_in_bound_ptr[i], *(__m256*)_ps256_n1); - _mm256_storeu_ps(v3_in_bound_ptr[i], *(__m256*)_ps256_n1); + _mm256_storeu_ps(v0_in_bound_ptr[i], _mm256_set1_ps(-1)); + _mm256_storeu_ps(v1_in_bound_ptr[i], 
_mm256_set1_ps(-1)); + _mm256_storeu_ps(v2_in_bound_ptr[i], _mm256_set1_ps(-1)); + _mm256_storeu_ps(v3_in_bound_ptr[i], _mm256_set1_ps(-1)); v0_in_bound_ptr[i] += 8; v1_in_bound_ptr[i] += 8; @@ -200,10 +196,10 @@ struct gridsample_2d_bicubic_compute_blob // compute coord { // x - gx = unormalize(vImgWf, gx); + gx = unormalize(_mm256_set1_ps(src.w), gx); // y - gy = unormalize(vImgHf, gy); + gy = unormalize(_mm256_set1_ps(src.h), gy); } __m256 gx_floor = _mm256_floor_ps(gx); @@ -212,37 +208,38 @@ struct gridsample_2d_bicubic_compute_blob const __m256 tx = _mm256_sub_ps(gx, gx_floor); const __m256 ty = _mm256_sub_ps(gy, gy_floor); - __m256 gx0 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_n1); + __m256 gx0 = _mm256_add_ps(gx_floor, _mm256_set1_ps(-1)); __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, *(__m256*)_ps256_1); - __m256 gx3 = _mm256_add_ps(gx2, *(__m256*)_ps256_1); + __m256 gx2 = _mm256_add_ps(gx_floor, _mm256_set1_ps(1)); + __m256 gx3 = _mm256_add_ps(gx2, _mm256_set1_ps(1)); - gx0 = get_coord(vImgWf, gx0); - gx1 = get_coord(vImgWf, gx1); - gx2 = get_coord(vImgWf, gx2); - gx3 = get_coord(vImgWf, gx3); + gx0 = get_coord(_mm256_set1_ps(src.w), gx0); + gx1 = get_coord(_mm256_set1_ps(src.w), gx1); + gx2 = get_coord(_mm256_set1_ps(src.w), gx2); + gx3 = get_coord(_mm256_set1_ps(src.w), gx3); for (int i = 0; i < 4; i++) { gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - gy = get_coord(vImgHf, gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); - __m256 gy_offset = _mm256_mul_ps(gy, vImgWf); + __m256 gy_offset = _mm256_mul_ps(gy, _mm256_set1_ps(src.w)); - __m256 v0_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx0), vElempackf); - __m256 v1_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx1), vElempackf); - __m256 v2_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx2), vElempackf); - __m256 v3_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx3), vElempackf); + volatile float epack = src.elempack; + __m256 v0_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx0), _mm256_set1_ps(epack)); + __m256 v1_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx1), _mm256_set1_ps(epack)); + __m256 v2_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx2), _mm256_set1_ps(epack)); + __m256 v3_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx3), _mm256_set1_ps(epack)); _mm256_storeu_ps(v0_offset_ptr[i], v0_offset_f); _mm256_storeu_ps(v1_offset_ptr[i], v1_offset_f); _mm256_storeu_ps(v2_offset_ptr[i], v2_offset_f); _mm256_storeu_ps(v3_offset_ptr[i], v3_offset_f); - _mm256_storeu_ps(v0_in_bound_ptr[i], *(__m256*)_ps256_n1); - _mm256_storeu_ps(v1_in_bound_ptr[i], *(__m256*)_ps256_n1); - _mm256_storeu_ps(v2_in_bound_ptr[i], *(__m256*)_ps256_n1); - _mm256_storeu_ps(v3_in_bound_ptr[i], *(__m256*)_ps256_n1); + _mm256_storeu_ps(v0_in_bound_ptr[i], _mm256_set1_ps(-1)); + _mm256_storeu_ps(v1_in_bound_ptr[i], _mm256_set1_ps(-1)); + _mm256_storeu_ps(v2_in_bound_ptr[i], _mm256_set1_ps(-1)); + _mm256_storeu_ps(v3_in_bound_ptr[i], _mm256_set1_ps(-1)); v0_in_bound_ptr[i] += 8; v1_in_bound_ptr[i] += 8; @@ -328,11 +325,6 @@ struct gridsample_2d_bicubic_compute_blob OPT_2 __m256 operator()(__m256 length, __m256 coord) { - return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coord, *(__m256*)_ps256_1), *(__m256*)_ps256_2), _mm256_sub_ps(length, *(__m256*)_ps256_1)); + return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coord, _mm256_set1_ps(1)), _mm256_set1_ps(2)), _mm256_sub_ps(length, _mm256_set1_ps(1))); } #endif // __AVX__ float operator()(int length, float coord) @@ -130,7 +126,7 @@ struct 
grid_sample_unormalize OPT_2 __m256 operator()(__m256 length, __m256 coord) { - return _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(coord, *(__m256*)_ps256_1), length, *(__m256*)_ps256_1), *(__m256*)_ps256_2); + return _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(coord, _mm256_set1_ps(1)), length, _mm256_set1_ps(1)), _mm256_set1_ps(2)); } #endif // __AVX__ float operator()(int length, float coord) @@ -148,7 +144,7 @@ struct compute_coord #if __AVX__ __m256 operator()(__m256 length, __m256 coord) { - const __m256 border_x = _mm256_sub_ps(length, *(__m256*)_ps256_1); + const __m256 border_x = _mm256_sub_ps(length, _mm256_set1_ps(1)); coord = _mm256_min_ps(border_x, _mm256_max_ps(coord, _mm256_setzero_ps())); @@ -167,7 +163,7 @@ struct compute_coord #if __AVX__ __m256 operator()(__m256 length, __m256 coord) { - const __m256 border_x = _mm256_sub_ps(length, *(__m256*)_ps256_1); + const __m256 border_x = _mm256_sub_ps(length, _mm256_set1_ps(1)); coord = abs256_ps(coord); @@ -192,7 +188,7 @@ struct compute_coord #if __AVX__ __m256 operator()(__m256 length, __m256 coord) { - const __m256 border_x = _mm256_sub_ps(length, *(__m256*)_ps256_1); + const __m256 border_x = _mm256_sub_ps(length, _mm256_set1_ps(1)); __m256 v0p5fp8 = _mm256_set1_ps(0.5f); coord = _mm256_add_ps(coord, v0p5fp8); @@ -229,7 +225,7 @@ struct compute_coord int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { using namespace GridSample_x86_kernel; - Mat& bottom_blob = bottom_blobs[0].clone(); + const Mat& bottom_blob = bottom_blobs[0]; const Mat& grid = bottom_blobs[1]; Mat& top_blob = top_blobs[0]; int elempack = bottom_blob.elempack; @@ -557,12 +553,6 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector Date: Mon, 24 Apr 2023 18:30:36 +0000 Subject: [PATCH 099/127] apply code-format changes --- src/layer/x86/gridsample_nearest_compute_blob.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index aa4dd51693e..6cab27879b1 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -349,8 +349,8 @@ struct gridsample_3d_nearest_compute_blob volatile float sw = src.w; volatile float sh = src.h; __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(sw), _mm256_set1_ps(sh)), gz, - _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(sw), gx)), - _mm256_set1_ps(epack)); + _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(sw), gx)), + _mm256_set1_ps(epack)); _mm256_storeu_ps(offset_ptr, offset); @@ -422,8 +422,8 @@ struct gridsample_3d_nearest_compute_blob volatile float sw = src.w; volatile float sh = src.h; __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(sw), _mm256_set1_ps(sh)), gz, - _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(sw), gx)), - _mm256_set1_ps(epack)); + _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(sw), gx)), + _mm256_set1_ps(epack)); _mm256_storeu_ps(offset_ptr, offset); @@ -592,8 +592,8 @@ struct gridsample_3d_nearest_compute_blob Date: Fri, 28 Apr 2023 15:09:02 +0800 Subject: [PATCH 100/127] remove partial specialization and optimize memory usage --- .../x86/gridsample_bicubic_compute_blob.h | 798 ++----- .../x86/gridsample_bilinear_compute_blob.h | 1993 +++++------------ .../x86/gridsample_nearest_compute_blob.h | 673 ++---- src/layer/x86/gridsample_x86.cpp | 169 +- 4 files changed, 1015 insertions(+), 2618 deletions(-) 
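
Note on the change below, kept after the diffstat so it stays out of the commit message: these hunks remove the separate in_bound blob and encode validity in the offset blob itself. An out-of-bound tap now stores -1.0f as its offset, and the interpolation stage rebuilds the mask by checking the sign of the stored value (the *reinterpret_cast<int*>(ptr) >= 0 tests) before the masked gather, which yields zero for rejected lanes. A minimal scalar C++ sketch of that encode/decode idea follows; the image size, helper names and zero padding value are illustrative and not taken from ncnn.

// sketch.cpp: scalar illustration of the offset-encodes-validity scheme (not part of this patch)
#include <cstdio>
#include <vector>

int main()
{
    const int w = 4, h = 3, elempack = 1;
    std::vector<float> image(w * h);
    for (int i = 0; i < w * h; i++)
        image[i] = (float)i;

    // stage 1: resolve the sampling offset, folding the bound check into
    // the stored value; a negative offset marks an out-of-bound tap
    auto encode_offset = [&](int x, int y) -> float {
        const bool in_bound = x > -1 && x < w && y > -1 && y < h;
        return in_bound ? (float)((y * w + x) * elempack) : -1.0f;
    };

    // stage 2: gather, recovering the validity mask from the sign of the
    // offset instead of reading a second in_bound array
    auto sample = [&](float offset) -> float {
        return offset >= 0.f ? image[(int)offset] : 0.f; // rejected taps read as zero
    };

    printf("inside  (1,1): %.1f\n", sample(encode_offset(1, 1))); // prints 5.0
    printf("outside (9,9): %.1f\n", sample(encode_offset(9, 9))); // prints 0.0
    return 0;
}

Folding the mask into the offset drops one float of bookkeeping per sampled tap, which is the memory-usage reduction named in the subject line; the AVX/AVX512 paths in the hunks below apply the same convention through masked gathers.
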
diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index 1b278dd35cd..b10b7d0d2bc 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -13,191 +13,51 @@ // specific language governing permissions and limitations under the License. template -struct gridsample_2d_bicubic_compute_blob +void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& offset, Mat& value, int permute_fusion, const Option& opt) { - void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) - { - const int grid_size = grid.w * grid.h; - - float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; - float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; - for (int i = 0; i < 4; i++) - { - v0_offset_ptr[i] = offset.channel(i * 4 + 0); - v1_offset_ptr[i] = offset.channel(i * 4 + 1); - v2_offset_ptr[i] = offset.channel(i * 4 + 2); - v3_offset_ptr[i] = offset.channel(i * 4 + 3); - - v0_in_bound_ptr[i] = in_bound.channel(i * 4 + 0); - v1_in_bound_ptr[i] = in_bound.channel(i * 4 + 1); - v2_in_bound_ptr[i] = in_bound.channel(i * 4 + 2); - v3_in_bound_ptr[i] = in_bound.channel(i * 4 + 3); - } - - float* value_x = value.channel(0); - float* value_y = value.channel(1); + const int grid_size = grid.w * grid.h; - grid_sample_unormalize unormalize; - compute_coord get_coord; + float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; - if (permute_fusion == 0) - { - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int x = 0; -#if __AVX__ - for (; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr); - __m256 gy = _mm256_loadu_ps(gridptr + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = unormalize(_mm256_set1_ps(src.w), gx); - - // y - gy = unormalize(_mm256_set1_ps(src.h), gy); - } - - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); - - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); - - __m256 gx0 = _mm256_add_ps(gx_floor, _mm256_set1_ps(-1)); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, _mm256_set1_ps(1)); - __m256 gx3 = _mm256_add_ps(gx2, _mm256_set1_ps(1)); - - gx0 = get_coord(_mm256_set1_ps(src.w), gx0); - gx1 = get_coord(_mm256_set1_ps(src.w), gx1); - gx2 = get_coord(_mm256_set1_ps(src.w), gx2); - gx3 = get_coord(_mm256_set1_ps(src.w), gx3); - - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - gy = get_coord(_mm256_set1_ps(src.h), gy); - - __m256 gy_offset = _mm256_mul_ps(gy, _mm256_set1_ps(src.w)); - - volatile float epack = src.elempack; - __m256 v0_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx0), _mm256_set1_ps(epack)); - __m256 v1_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx1), _mm256_set1_ps(epack)); - __m256 v2_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx2), _mm256_set1_ps(epack)); - __m256 v3_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx3), _mm256_set1_ps(epack)); - - _mm256_storeu_ps(v0_offset_ptr[i], v0_offset_f); - _mm256_storeu_ps(v1_offset_ptr[i], 
v1_offset_f); - _mm256_storeu_ps(v2_offset_ptr[i], v2_offset_f); - _mm256_storeu_ps(v3_offset_ptr[i], v3_offset_f); - - _mm256_storeu_ps(v0_in_bound_ptr[i], _mm256_set1_ps(-1)); - _mm256_storeu_ps(v1_in_bound_ptr[i], _mm256_set1_ps(-1)); - _mm256_storeu_ps(v2_in_bound_ptr[i], _mm256_set1_ps(-1)); - _mm256_storeu_ps(v3_in_bound_ptr[i], _mm256_set1_ps(-1)); - - v0_in_bound_ptr[i] += 8; - v1_in_bound_ptr[i] += 8; - v2_in_bound_ptr[i] += 8; - v3_in_bound_ptr[i] += 8; - - v0_offset_ptr[i] += 8; - v1_offset_ptr[i] += 8; - v2_offset_ptr[i] += 8; - v3_offset_ptr[i] += 8; - } - - _mm256_storeu_ps(value_x, tx); - _mm256_storeu_ps(value_y, ty); - - value_x += 8; - value_y += 8; - - gridptr += 16; - } + float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; -#endif // __AVX__ + float* value_x = value.channel(0); + float* value_y = value.channel(1); - for (; x < grid_size; x += 2) - { - float sample_x = *gridptr; - float sample_y = *(gridptr + 1); + for (int i = 0; i < 4; i++) + { + v0_offset_ptr[i] = offset.channel(i * 4 + 0); + v1_offset_ptr[i] = offset.channel(i * 4 + 1); + v2_offset_ptr[i] = offset.channel(i * 4 + 2); + v3_offset_ptr[i] = offset.channel(i * 4 + 3); + } - // x - sample_x = unormalize(src.w, sample_x); + grid_sample_unormalize unormalize; + compute_coord get_coord; - // y - sample_y = unormalize(src.h, sample_y); - - int x1 = floor(sample_x); - int y1 = floor(sample_y); - int x0 = x1 - 1; - int x2 = x1 + 1; - int x3 = x1 + 2; - - *value_x = sample_x - static_cast(x1); - *value_y = sample_y - static_cast(y1); - - x1 = get_coord(src.w, x1); - x0 = get_coord(src.w, x0); - x2 = get_coord(src.w, x2); - x3 = get_coord(src.w, x3); - - for (int i = 0; i < 4; i++) - { - int offset_y = get_coord(src.h, y1 + i - 1) * src.w; - - *v0_offset_ptr[i] = (offset_y + x0) * src.elempack; - *v1_offset_ptr[i] = (offset_y + x1) * src.elempack; - *v2_offset_ptr[i] = (offset_y + x2) * src.elempack; - *v3_offset_ptr[i] = (offset_y + x3) * src.elempack; - - *v0_in_bound_ptr[i]++ = -1.0f; - *v1_in_bound_ptr[i]++ = -1.0f; - *v2_in_bound_ptr[i]++ = -1.0f; - *v3_in_bound_ptr[i]++ = -1.0f; - - v0_offset_ptr[i]++; - v1_offset_ptr[i]++; - v2_offset_ptr[i]++; - v3_offset_ptr[i]++; - } - - value_x++; - value_y++; - - gridptr += 2; - } - } - } - else + if (permute_fusion == 0) + { + for (int y = 0; y < grid.c; y++) { - const float* gridptr_x = grid.channel(0); - const float* gridptr_y = grid.channel(1); - + const float* gridptr = grid.channel(y); int x = 0; #if __AVX__ - for (; x + 7 < grid_size; x += 8) + for (; x + 15 < grid_size; x += 16) { - __m256 gx = _mm256_loadu_ps(gridptr_x); - __m256 gy = _mm256_loadu_ps(gridptr_y); + __m256 tmp_x = _mm256_loadu_ps(gridptr); + __m256 gy = _mm256_loadu_ps(gridptr + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); // compute coord { // x gx = unormalize(_mm256_set1_ps(src.w), gx); - // y gy = unormalize(_mm256_set1_ps(src.h), gy); } @@ -218,11 +78,18 @@ struct gridsample_2d_bicubic_compute_blob gx2 = get_coord(_mm256_set1_ps(src.w), gx2); gx3 = get_coord(_mm256_set1_ps(src.w), gx3); + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(gx0, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx0, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(gx1, _mm256_set1_ps(-1), _CMP_GT_OS), 
_mm256_cmp_ps(_mm256_set1_ps(src.w), gx1, _CMP_GT_OS)); + __m256 x2_in_range = _mm256_and_ps(_mm256_cmp_ps(gx2, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx2, _CMP_GT_OS)); + __m256 x3_in_range = _mm256_and_ps(_mm256_cmp_ps(gx3, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx3, _CMP_GT_OS)); + for (int i = 0; i < 4; i++) { gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); gy = get_coord(_mm256_set1_ps(src.h), gy); + __m256 y_in_range = _mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS)); + __m256 gy_offset = _mm256_mul_ps(gy, _mm256_set1_ps(src.w)); volatile float epack = src.elempack; @@ -231,21 +98,16 @@ struct gridsample_2d_bicubic_compute_blob __m256 v2_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx2), _mm256_set1_ps(epack)); __m256 v3_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx3), _mm256_set1_ps(epack)); + v0_offset_f = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v0_offset_f, _mm256_and_ps(x0_in_range, y_in_range)); + v1_offset_f = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v1_offset_f, _mm256_and_ps(x1_in_range, y_in_range)); + v2_offset_f = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v2_offset_f, _mm256_and_ps(x2_in_range, y_in_range)); + v3_offset_f = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v3_offset_f, _mm256_and_ps(x3_in_range, y_in_range)); + _mm256_storeu_ps(v0_offset_ptr[i], v0_offset_f); _mm256_storeu_ps(v1_offset_ptr[i], v1_offset_f); _mm256_storeu_ps(v2_offset_ptr[i], v2_offset_f); _mm256_storeu_ps(v3_offset_ptr[i], v3_offset_f); - _mm256_storeu_ps(v0_in_bound_ptr[i], _mm256_set1_ps(-1)); - _mm256_storeu_ps(v1_in_bound_ptr[i], _mm256_set1_ps(-1)); - _mm256_storeu_ps(v2_in_bound_ptr[i], _mm256_set1_ps(-1)); - _mm256_storeu_ps(v3_in_bound_ptr[i], _mm256_set1_ps(-1)); - - v0_in_bound_ptr[i] += 8; - v1_in_bound_ptr[i] += 8; - v2_in_bound_ptr[i] += 8; - v3_in_bound_ptr[i] += 8; - v0_offset_ptr[i] += 8; v1_offset_ptr[i] += 8; v2_offset_ptr[i] += 8; @@ -258,20 +120,18 @@ struct gridsample_2d_bicubic_compute_blob value_x += 8; value_y += 8; - gridptr_x += 8; - gridptr_y += 8; + gridptr += 16; } #endif // __AVX__ - for (; x < grid_size; x++) + for (; x < grid_size; x += 2) { - float sample_x = *gridptr_x; - float sample_y = *gridptr_y; + float sample_x = *gridptr; + float sample_y = *(gridptr + 1); // x sample_x = unormalize(src.w, sample_x); - // y sample_y = unormalize(src.h, sample_y); @@ -289,19 +149,28 @@ struct gridsample_2d_bicubic_compute_blob x2 = get_coord(src.w, x2); x3 = get_coord(src.w, x3); + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool x0_in_range = (x0 > -1) & (x0 < src.w); + bool x2_in_range = (x2 > -1) & (x2 < src.w); + bool x3_in_range = (x3 > -1) & (x3 < src.w); + for (int i = 0; i < 4; i++) { - int offset_y = static_cast(get_coord(src.h, y1 + i - 1)) * src.w; + int gy = y1 + i - 1; + gy = get_coord(src.h, gy); + int offset_y = gy * src.w; + + bool y_in_range = (gy > -1) & (gy < src.h); - *v0_offset_ptr[i] = (offset_y + x0) * src.elempack; - *v1_offset_ptr[i] = (offset_y + x1) * src.elempack; - *v2_offset_ptr[i] = (offset_y + x2) * src.elempack; - *v3_offset_ptr[i] = (offset_y + x3) * src.elempack; + bool v0_in_bound = (x0_in_range & y_in_range); + bool v1_in_bound = (x1_in_range & y_in_range); + bool v2_in_bound = (x2_in_range & y_in_range); + bool v3_in_bound = (x3_in_range & y_in_range); - *v0_in_bound_ptr[i]++ = -1.0f; - *v1_in_bound_ptr[i]++ = -1.0f; - *v2_in_bound_ptr[i]++ = -1.0f; - *v3_in_bound_ptr[i]++ = -1.0f; + 
*v0_offset_ptr[i] = v0_in_bound ? (offset_y + x0) * src.elempack : -1.0f; + *v1_offset_ptr[i] = v1_in_bound ? (offset_y + x1) * src.elempack : -1.0f; + *v2_offset_ptr[i] = v2_in_bound ? (offset_y + x2) * src.elempack : -1.0f; + *v3_offset_ptr[i] = v3_in_bound ? (offset_y + x3) * src.elempack : -1.0f; v0_offset_ptr[i]++; v1_offset_ptr[i]++; @@ -312,328 +181,155 @@ struct gridsample_2d_bicubic_compute_blob value_x++; value_y++; - gridptr_x++; - gridptr_y++; + gridptr += 2; } } } -}; - -template -struct gridsample_2d_bicubic_compute_blob -{ - void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) + else { - const int grid_size = grid.w * grid.h; + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); - float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; - - float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; - - float* value_x = value.channel(0); - float* value_y = value.channel(1); - - for (int i = 0; i < 4; i++) + int x = 0; +#if __AVX__ + for (; x + 7 < grid_size; x += 8) { - v0_offset_ptr[i] = offset.channel(i * 4 + 0); - v1_offset_ptr[i] = offset.channel(i * 4 + 1); - v2_offset_ptr[i] = offset.channel(i * 4 + 2); - v3_offset_ptr[i] = offset.channel(i * 4 + 3); - - v0_in_bound_ptr[i] = in_bound.channel(i * 4 + 0); - v1_in_bound_ptr[i] = in_bound.channel(i * 4 + 1); - v2_in_bound_ptr[i] = in_bound.channel(i * 4 + 2); - v3_in_bound_ptr[i] = in_bound.channel(i * 4 + 3); - } - - grid_sample_unormalize unormalize; + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); - if (permute_fusion == 0) - { - for (int y = 0; y < grid.c; y++) + // compute coord { - const float* gridptr = grid.channel(y); - int x = 0; -#if __AVX__ - for (; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr); - __m256 gy = _mm256_loadu_ps(gridptr + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = unormalize(_mm256_set1_ps(src.w), gx); - // y - gy = unormalize(_mm256_set1_ps(src.h), gy); - } - - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); - - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); - - __m256 gx0 = _mm256_add_ps(gx_floor, _mm256_set1_ps(-1)); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, _mm256_set1_ps(1)); - __m256 gx3 = _mm256_add_ps(gx2, _mm256_set1_ps(1)); - - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(gx0, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx0, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(gx1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx1, _CMP_GT_OS)); - __m256 x2_in_range = _mm256_and_ps(_mm256_cmp_ps(gx2, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx2, _CMP_GT_OS)); - __m256 x3_in_range = _mm256_and_ps(_mm256_cmp_ps(gx3, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx3, _CMP_GT_OS)); - - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); - - __m256 y_in_range = _mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), 
_mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS)); - - _mm256_storeu_ps(v0_in_bound_ptr[i], _mm256_and_ps(x0_in_range, y_in_range)); - _mm256_storeu_ps(v1_in_bound_ptr[i], _mm256_and_ps(x1_in_range, y_in_range)); - _mm256_storeu_ps(v2_in_bound_ptr[i], _mm256_and_ps(x2_in_range, y_in_range)); - _mm256_storeu_ps(v3_in_bound_ptr[i], _mm256_and_ps(x3_in_range, y_in_range)); - - __m256 v0_offset_f = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, _mm256_set1_ps(src.w)), gx0), _mm256_set1_ps(src.elempack)); - __m256 v1_offset_f = _mm256_add_ps(v0_offset_f, _mm256_set1_ps(src.elempack)); - __m256 v2_offset_f = _mm256_add_ps(v1_offset_f, _mm256_set1_ps(src.elempack)); - __m256 v3_offset_f = _mm256_add_ps(v2_offset_f, _mm256_set1_ps(src.elempack)); - - _mm256_storeu_ps(v0_offset_ptr[i], v0_offset_f); - _mm256_storeu_ps(v1_offset_ptr[i], v1_offset_f); - _mm256_storeu_ps(v2_offset_ptr[i], v2_offset_f); - _mm256_storeu_ps(v3_offset_ptr[i], v3_offset_f); - - v0_offset_ptr[i] += 8; - v1_offset_ptr[i] += 8; - v2_offset_ptr[i] += 8; - v3_offset_ptr[i] += 8; - - v0_in_bound_ptr[i] += 8; - v1_in_bound_ptr[i] += 8; - v2_in_bound_ptr[i] += 8; - v3_in_bound_ptr[i] += 8; - } - - _mm256_storeu_ps(value_x, tx); - _mm256_storeu_ps(value_y, ty); - - value_x += 8; - value_y += 8; - - gridptr += 16; - } - -#endif // __AVX__ - - for (; x < grid_size; x += 2) - { - float sample_x = *gridptr; - float sample_y = *(gridptr + 1); - - // x - sample_x = unormalize(src.w, sample_x); - // y - sample_y = unormalize(src.h, sample_y); - - int x1 = floor(sample_x); - int y1 = floor(sample_y); - int x0 = x1 - 1; - int x2 = x1 + 1; - int x3 = x1 + 2; - - bool x1_in_range = (x1 > -1) & (x1 < src.w); - bool x0_in_range = (x0 > -1) & (x0 < src.w); - bool x2_in_range = (x2 > -1) & (x2 < src.w); - bool x3_in_range = (x3 > -1) & (x3 < src.w); - - for (int i = 0; i < 4; i++) - { - int gy = y1 + i - 1; - int offset_y = gy * src.w; - - bool y_in_range = (gy > -1) & (gy < src.h); - - *v0_in_bound_ptr[i] = (x0_in_range & y_in_range) ? -1.0f : 0.0f; - *v1_in_bound_ptr[i] = (x1_in_range & y_in_range) ? -1.0f : 0.0f; - *v2_in_bound_ptr[i] = (x2_in_range & y_in_range) ? -1.0f : 0.0f; - *v3_in_bound_ptr[i] = (x3_in_range & y_in_range) ? 
-1.0f : 0.0f; - - *v0_offset_ptr[i] = (offset_y + x0) * src.elempack; - *v1_offset_ptr[i] = (offset_y + x1) * src.elempack; - *v2_offset_ptr[i] = (offset_y + x2) * src.elempack; - *v3_offset_ptr[i] = (offset_y + x3) * src.elempack; - - v0_offset_ptr[i]++; - v1_offset_ptr[i]++; - v2_offset_ptr[i]++; - v3_offset_ptr[i]++; - - v0_in_bound_ptr[i]++; - v1_in_bound_ptr[i]++; - v2_in_bound_ptr[i]++; - v3_in_bound_ptr[i]++; - } - - *value_x = sample_x - static_cast(x1); - *value_y = sample_y - static_cast(y1); - - value_x++; - value_y++; - - gridptr += 2; - } + // x + gx = unormalize(_mm256_set1_ps(src.w), gx); + // y + gy = unormalize(_mm256_set1_ps(src.h), gy); } - } - else - { - const float* gridptr_x = grid.channel(0); - const float* gridptr_y = grid.channel(1); - int x = 0; -#if __AVX__ - for (; x + 7 < grid_size; x += 8) - { - __m256 gx = _mm256_loadu_ps(gridptr_x); - __m256 gy = _mm256_loadu_ps(gridptr_y); + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); - // compute coord - { - // x - gx = unormalize(_mm256_set1_ps(src.w), gx); - // y - gy = unormalize(_mm256_set1_ps(src.h), gy); - } + const __m256 tx = _mm256_sub_ps(gx, gx_floor); + const __m256 ty = _mm256_sub_ps(gy, gy_floor); - __m256 gx_floor = _mm256_floor_ps(gx); - __m256 gy_floor = _mm256_floor_ps(gy); - - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); + __m256 gx0 = _mm256_add_ps(gx_floor, _mm256_set1_ps(-1)); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, _mm256_set1_ps(1)); + __m256 gx3 = _mm256_add_ps(gx2, _mm256_set1_ps(1)); - __m256 gx0 = _mm256_add_ps(gx_floor, _mm256_set1_ps(-1)); - __m256 gx1 = gx_floor; - __m256 gx2 = _mm256_add_ps(gx_floor, _mm256_set1_ps(1)); - __m256 gx3 = _mm256_add_ps(gx2, _mm256_set1_ps(1)); + gx0 = get_coord(_mm256_set1_ps(src.w), gx0); + gx1 = get_coord(_mm256_set1_ps(src.w), gx1); + gx2 = get_coord(_mm256_set1_ps(src.w), gx2); + gx3 = get_coord(_mm256_set1_ps(src.w), gx3); - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(gx0, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx0, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(gx1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx1, _CMP_GT_OS)); - __m256 x2_in_range = _mm256_and_ps(_mm256_cmp_ps(gx2, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx2, _CMP_GT_OS)); - __m256 x3_in_range = _mm256_and_ps(_mm256_cmp_ps(gx3, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx3, _CMP_GT_OS)); + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(gx0, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx0, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(gx1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx1, _CMP_GT_OS)); + __m256 x2_in_range = _mm256_and_ps(_mm256_cmp_ps(gx2, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx2, _CMP_GT_OS)); + __m256 x3_in_range = _mm256_and_ps(_mm256_cmp_ps(gx3, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx3, _CMP_GT_OS)); - for (int i = 0; i < 4; i++) - { - gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + gy = get_coord(_mm256_set1_ps(src.h), gy); - __m256 y_in_range = _mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS)); + __m256 y_in_range = 
_mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS)); - _mm256_storeu_ps(v0_in_bound_ptr[i], _mm256_and_ps(x0_in_range, y_in_range)); - _mm256_storeu_ps(v1_in_bound_ptr[i], _mm256_and_ps(x1_in_range, y_in_range)); - _mm256_storeu_ps(v2_in_bound_ptr[i], _mm256_and_ps(x2_in_range, y_in_range)); - _mm256_storeu_ps(v3_in_bound_ptr[i], _mm256_and_ps(x3_in_range, y_in_range)); + __m256 gy_offset = _mm256_mul_ps(gy, _mm256_set1_ps(src.w)); - __m256 v0_offset_f = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(gy, _mm256_set1_ps(src.w)), gx0), _mm256_set1_ps(src.elempack)); - __m256 v1_offset_f = _mm256_add_ps(v0_offset_f, _mm256_set1_ps(src.elempack)); - __m256 v2_offset_f = _mm256_add_ps(v1_offset_f, _mm256_set1_ps(src.elempack)); - __m256 v3_offset_f = _mm256_add_ps(v2_offset_f, _mm256_set1_ps(src.elempack)); + volatile float epack = src.elempack; + __m256 v0_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx0), _mm256_set1_ps(epack)); + __m256 v1_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx1), _mm256_set1_ps(epack)); + __m256 v2_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx2), _mm256_set1_ps(epack)); + __m256 v3_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx3), _mm256_set1_ps(epack)); - _mm256_storeu_ps(v0_offset_ptr[i], v0_offset_f); - _mm256_storeu_ps(v1_offset_ptr[i], v1_offset_f); - _mm256_storeu_ps(v2_offset_ptr[i], v2_offset_f); - _mm256_storeu_ps(v3_offset_ptr[i], v3_offset_f); + v0_offset_f = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v0_offset_f, _mm256_and_ps(x0_in_range, y_in_range)); + v1_offset_f = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v1_offset_f, _mm256_and_ps(x1_in_range, y_in_range)); + v2_offset_f = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v2_offset_f, _mm256_and_ps(x2_in_range, y_in_range)); + v3_offset_f = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v3_offset_f, _mm256_and_ps(x3_in_range, y_in_range)); - v0_offset_ptr[i] += 8; - v1_offset_ptr[i] += 8; - v2_offset_ptr[i] += 8; - v3_offset_ptr[i] += 8; + _mm256_storeu_ps(v0_offset_ptr[i], v0_offset_f); + _mm256_storeu_ps(v1_offset_ptr[i], v1_offset_f); + _mm256_storeu_ps(v2_offset_ptr[i], v2_offset_f); + _mm256_storeu_ps(v3_offset_ptr[i], v3_offset_f); - v0_in_bound_ptr[i] += 8; - v1_in_bound_ptr[i] += 8; - v2_in_bound_ptr[i] += 8; - v3_in_bound_ptr[i] += 8; - } + v0_offset_ptr[i] += 8; + v1_offset_ptr[i] += 8; + v2_offset_ptr[i] += 8; + v3_offset_ptr[i] += 8; + } - _mm256_storeu_ps(value_x, tx); - _mm256_storeu_ps(value_y, ty); + _mm256_storeu_ps(value_x, tx); + _mm256_storeu_ps(value_y, ty); - value_x += 8; - value_y += 8; + value_x += 8; + value_y += 8; - gridptr_x += 8; - gridptr_y += 8; - } + gridptr_x += 8; + gridptr_y += 8; + } #endif // __AVX__ - for (; x < grid_size; x++) + for (; x < grid_size; x++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; + + // x + sample_x = unormalize(src.w, sample_x); + // y + sample_y = unormalize(src.h, sample_y); + + int x1 = floor(sample_x); + int y1 = floor(sample_y); + int x0 = x1 - 1; + int x2 = x1 + 1; + int x3 = x1 + 2; + + *value_x = sample_x - static_cast(x1); + *value_y = sample_y - static_cast(y1); + + x1 = get_coord(src.w, x1); + x0 = get_coord(src.w, x0); + x2 = get_coord(src.w, x2); + x3 = get_coord(src.w, x3); + + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool x0_in_range = (x0 > -1) & (x0 < src.w); + bool x2_in_range = (x2 > -1) & (x2 < src.w); + bool x3_in_range = (x3 > -1) & (x3 < src.w); + + for (int i = 0; i < 4; i++) { - float sample_x = *gridptr_x; - float sample_y 
= *gridptr_y; - - // x - sample_x = unormalize(src.w, sample_x); - // y - sample_y = unormalize(src.h, sample_y); - - int x1 = floor(sample_x); - int y1 = floor(sample_y); - int x0 = x1 - 1; - int x2 = x1 + 1; - int x3 = x1 + 2; - - bool x1_in_range = (x1 > -1) & (x1 < src.w); - bool x0_in_range = (x0 > -1) & (x0 < src.w); - bool x2_in_range = (x2 > -1) & (x2 < src.w); - bool x3_in_range = (x3 > -1) & (x3 < src.w); - - for (int i = 0; i < 4; i++) - { - int gy = y1 + i - 1; - int offset_y = gy * src.w; - - bool y_in_range = (gy > -1) & (gy < src.h); - - *v0_in_bound_ptr[i] = (x0_in_range & y_in_range) ? -1.0f : 0.0f; - *v1_in_bound_ptr[i] = (x1_in_range & y_in_range) ? -1.0f : 0.0f; - *v2_in_bound_ptr[i] = (x2_in_range & y_in_range) ? -1.0f : 0.0f; - *v3_in_bound_ptr[i] = (x3_in_range & y_in_range) ? -1.0f : 0.0f; - - *v0_offset_ptr[i] = (offset_y + x0) * src.elempack; - *v1_offset_ptr[i] = (offset_y + x1) * src.elempack; - *v2_offset_ptr[i] = (offset_y + x2) * src.elempack; - *v3_offset_ptr[i] = (offset_y + x3) * src.elempack; - - v0_offset_ptr[i]++; - v1_offset_ptr[i]++; - v2_offset_ptr[i]++; - v3_offset_ptr[i]++; - - v0_in_bound_ptr[i]++; - v1_in_bound_ptr[i]++; - v2_in_bound_ptr[i]++; - v3_in_bound_ptr[i]++; - } - - *value_x = sample_x - static_cast(x1); - *value_y = sample_y - static_cast(y1); + int gy = y1 + i - 1; + gy = get_coord(src.h, gy); + int offset_y = gy * src.w; + + bool y_in_range = (gy > -1) & (gy < src.h); + + bool v0_in_bound = (x0_in_range & y_in_range); + bool v1_in_bound = (x1_in_range & y_in_range); + bool v2_in_bound = (x2_in_range & y_in_range); + bool v3_in_bound = (x3_in_range & y_in_range); + + *v0_offset_ptr[i] = v0_in_bound ? (offset_y + x0) * src.elempack : -1.0f; + *v1_offset_ptr[i] = v1_in_bound ? (offset_y + x1) * src.elempack : -1.0f; + *v2_offset_ptr[i] = v2_in_bound ? (offset_y + x2) * src.elempack : -1.0f; + *v3_offset_ptr[i] = v3_in_bound ? 
(offset_y + x3) * src.elempack : -1.0f; + + v0_offset_ptr[i]++; + v1_offset_ptr[i]++; + v2_offset_ptr[i]++; + v3_offset_ptr[i]++; + } - value_x++; - value_y++; + value_x++; + value_y++; - gridptr_x++; - gridptr_y++; - } + gridptr_x++; + gridptr_y++; } } -}; +} #if __SSE2__ #if __AVX__ @@ -653,7 +349,7 @@ static void cubic_interp1d_p16(__m512& coeffs0, __m512& coeffs1, __m512& coeffs2 coeffs3 = _mm512_sub_ps(_mm512_sub_ps(_mm512_sub_ps(*(__m512*)_ps512_1, coeffs0), coeffs1), coeffs2); } -static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& dst, Mat& offset, Mat& in_bound, const Mat& value, const Option& opt) +static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt) { const int channels = dst.c; const int outw = dst.w; @@ -672,19 +368,12 @@ static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& d float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; - float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; - for (int i = 0; i < 4; i++) { v0_offset_ptr[i] = offset.channel(i * 4 + 0); v1_offset_ptr[i] = offset.channel(i * 4 + 1); v2_offset_ptr[i] = offset.channel(i * 4 + 2); v3_offset_ptr[i] = offset.channel(i * 4 + 3); - - v0_in_bound_ptr[i] = in_bound.channel(i * 4 + 0); - v1_in_bound_ptr[i] = in_bound.channel(i * 4 + 1); - v2_in_bound_ptr[i] = in_bound.channel(i * 4 + 2); - v3_in_bound_ptr[i] = in_bound.channel(i * 4 + 3); } const float* value_x = value.channel(0); @@ -695,10 +384,10 @@ static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& d cubic_interp1d_p16(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm512_set1_ps(*value_x)); for (int ii = 0; ii < 4; ii++) { - __m512 x0_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v0_in_bound_ptr[ii]) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v0_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); - __m512 x1_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v1_in_bound_ptr[ii]) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v1_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); - __m512 x2_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v2_in_bound_ptr[ii]) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v2_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); - __m512 x3_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v3_in_bound_ptr[ii]) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v3_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + __m512 x0_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v0_offset_ptr[ii]) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v0_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + __m512 x1_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v1_offset_ptr[ii]) >= 0 ? 
static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v1_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + __m512 x2_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v2_offset_ptr[ii]) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v2_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + __m512 x3_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v3_offset_ptr[ii]) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v3_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); value_f[ii] = _mm512_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm512_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); @@ -709,11 +398,6 @@ static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& d v1_offset_ptr[ii]++; v2_offset_ptr[ii]++; v3_offset_ptr[ii]++; - - v0_in_bound_ptr[ii]++; - v1_in_bound_ptr[ii]++; - v2_in_bound_ptr[ii]++; - v3_in_bound_ptr[ii]++; } cubic_interp1d_p16(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm512_set1_ps(*value_y)); @@ -747,7 +431,7 @@ static void cubic_interp1d_p8(__m256& coeffs0, __m256& coeffs1, __m256& coeffs2, coeffs3 = _mm256_sub_ps(_mm256_sub_ps(_mm256_sub_ps(_mm256_set1_ps(1), coeffs0), coeffs1), coeffs2); } -static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& dst, Mat& offset, Mat& in_bound, const Mat& value, const Option& opt) +static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt) { const int channels = dst.c; const int outw = dst.w; @@ -766,19 +450,12 @@ static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& ds float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; - float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; - for (int i = 0; i < 4; i++) { v0_offset_ptr[i] = offset.channel(i * 4 + 0); v1_offset_ptr[i] = offset.channel(i * 4 + 1); v2_offset_ptr[i] = offset.channel(i * 4 + 2); v3_offset_ptr[i] = offset.channel(i * 4 + 3); - - v0_in_bound_ptr[i] = in_bound.channel(i * 4 + 0); - v1_in_bound_ptr[i] = in_bound.channel(i * 4 + 1); - v2_in_bound_ptr[i] = in_bound.channel(i * 4 + 2); - v3_in_bound_ptr[i] = in_bound.channel(i * 4 + 3); } const float* value_x = value.channel(0); @@ -789,6 +466,11 @@ static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& ds cubic_interp1d_p8(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm256_set1_ps(*value_x)); for (int ii = 0; ii < 4; ii++) { + float v0_in_bound = *reinterpret_cast(v0_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; + float v1_in_bound = *reinterpret_cast(v1_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; + float v2_in_bound = *reinterpret_cast(v2_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; + float v3_in_bound = *reinterpret_cast(v3_offset_ptr[ii]) >= 0 ? 
-1.0f : 0.0f; + #if __AVX2__ __m256i v0_offset = _mm256_add_epi32(_mm256_set1_epi32(*v0_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); __m256i v1_offset = _mm256_add_epi32(_mm256_set1_epi32(*v1_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); @@ -801,10 +483,10 @@ static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& ds __m256i v3_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*v3_offset_ptr[ii]), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); #endif // __AVX2__ - __m256 x0_val = mask_gather_ps256(srcptr, v0_offset, _mm256_set1_ps(*v0_in_bound_ptr[ii])); - __m256 x1_val = mask_gather_ps256(srcptr, v1_offset, _mm256_set1_ps(*v1_in_bound_ptr[ii])); - __m256 x2_val = mask_gather_ps256(srcptr, v2_offset, _mm256_set1_ps(*v2_in_bound_ptr[ii])); - __m256 x3_val = mask_gather_ps256(srcptr, v3_offset, _mm256_set1_ps(*v3_in_bound_ptr[ii])); + __m256 x0_val = mask_gather_ps256(srcptr, v0_offset, _mm256_set1_ps(v0_in_bound)); + __m256 x1_val = mask_gather_ps256(srcptr, v1_offset, _mm256_set1_ps(v1_in_bound)); + __m256 x2_val = mask_gather_ps256(srcptr, v2_offset, _mm256_set1_ps(v2_in_bound)); + __m256 x3_val = mask_gather_ps256(srcptr, v3_offset, _mm256_set1_ps(v3_in_bound)); value_f[ii] = _mm256_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); @@ -815,11 +497,6 @@ static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& ds v1_offset_ptr[ii]++; v2_offset_ptr[ii]++; v3_offset_ptr[ii]++; - - v0_in_bound_ptr[ii]++; - v1_in_bound_ptr[ii]++; - v2_in_bound_ptr[ii]++; - v3_in_bound_ptr[ii]++; } cubic_interp1d_p8(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm256_set1_ps(*value_y)); @@ -854,7 +531,7 @@ static void cubic_interp1d_p4(__m128& coeffs0, __m128& coeffs1, __m128& coeffs2, coeffs3 = _mm_sub_ps(_mm_sub_ps(_mm_sub_ps(*(__m128*)_ps_1, coeffs0), coeffs1), coeffs2); } -static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& dst, Mat& offset, Mat& in_bound, const Mat& value, const Option& opt) +static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt) { const int channels = dst.c; const int outw = dst.w; @@ -873,19 +550,12 @@ static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& ds float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; - float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; - for (int i = 0; i < 4; i++) { v0_offset_ptr[i] = offset.channel(i * 4 + 0); v1_offset_ptr[i] = offset.channel(i * 4 + 1); v2_offset_ptr[i] = offset.channel(i * 4 + 2); v3_offset_ptr[i] = offset.channel(i * 4 + 3); - - v0_in_bound_ptr[i] = in_bound.channel(i * 4 + 0); - v1_in_bound_ptr[i] = in_bound.channel(i * 4 + 1); - v2_in_bound_ptr[i] = in_bound.channel(i * 4 + 2); - v3_in_bound_ptr[i] = in_bound.channel(i * 4 + 3); } const float* value_x = value.channel(0); @@ -896,10 +566,15 @@ static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& ds cubic_interp1d_p4(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm_set1_ps(*value_x)); for (int ii = 0; ii < 4; ii++) { - __m128 x0_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v0_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(*v0_in_bound_ptr[ii])); - __m128 x1_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v1_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(*v1_in_bound_ptr[ii])); - __m128 x2_val = 
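// --- annotation (not part of the patch) ---------------------------------------------
// The p8/p4 apply paths rebuild a per-tap mask (-1.0f when the stored offset is
// non-negative, 0.0f otherwise) and hand it to mask_gather_ps256 / mask_gather_ps, which
// are defined elsewhere in the ncnn x86 headers. What the callers rely on here is
// "gather where the mask lane's sign bit is set, zero elsewhere"; a rough stand-in under
// that assumption:
#include <immintrin.h>
#include <cmath>
static inline __m256 masked_gather_sketch(const float* src, __m256i offsets, __m256 mask)
{
#if __AVX2__
    // hardware gather: lanes whose mask MSB is clear keep the zero source operand
    return _mm256_mask_i32gather_ps(_mm256_setzero_ps(), src, offsets, mask, sizeof(float));
#else
    float m[8];
    int idx[8];
    float out[8];
    _mm256_storeu_ps(m, mask);
    _mm256_storeu_si256((__m256i*)idx, offsets);
    for (int i = 0; i < 8; i++)
        out[i] = std::signbit(m[i]) ? src[idx[i]] : 0.f; // honour only the sign bit
    return _mm256_loadu_ps(out);
#endif
}
// -------------------------------------------------------------------------------------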
mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v2_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(*v2_in_bound_ptr[ii])); - __m128 x3_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v3_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(*v3_in_bound_ptr[ii])); + float v0_in_bound = *reinterpret_cast(v0_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; + float v1_in_bound = *reinterpret_cast(v1_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; + float v2_in_bound = *reinterpret_cast(v2_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; + float v3_in_bound = *reinterpret_cast(v3_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; + + __m128 x0_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v0_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(v0_in_bound)); + __m128 x1_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v1_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(v1_in_bound)); + __m128 x2_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v2_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(v2_in_bound)); + __m128 x3_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v3_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(v3_in_bound)); value_f[ii] = _mm_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); @@ -910,11 +585,6 @@ static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& ds v1_offset_ptr[ii]++; v2_offset_ptr[ii]++; v3_offset_ptr[ii]++; - - v0_in_bound_ptr[ii]++; - v1_in_bound_ptr[ii]++; - v2_in_bound_ptr[ii]++; - v3_in_bound_ptr[ii]++; } cubic_interp1d_p4(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm_set1_ps(*value_y)); @@ -949,7 +619,7 @@ static inline void cubic_interp1d(float& coeffs0, float& coeffs1, float& coeffs2 coeffs3 = 1.f - coeffs0 - coeffs1 - coeffs2; } -static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& dst, Mat& offset, Mat& in_bound, const Mat& value, const Option& opt) +static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt) { const int channels = dst.c; const int outw = dst.w; @@ -964,19 +634,12 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; - float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; - for (int i = 0; i < 4; i++) { v0_offset_ptr[i] = offset.channel(i * 4 + 0); v1_offset_ptr[i] = offset.channel(i * 4 + 1); v2_offset_ptr[i] = offset.channel(i * 4 + 2); v3_offset_ptr[i] = offset.channel(i * 4 + 3); - - v0_in_bound_ptr[i] = in_bound.channel(i * 4 + 0); - v1_in_bound_ptr[i] = in_bound.channel(i * 4 + 1); - v2_in_bound_ptr[i] = in_bound.channel(i * 4 + 2); - v3_in_bound_ptr[i] = in_bound.channel(i * 4 + 3); } const float* value_x = value.channel(0); @@ -994,10 +657,15 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds cubic_interp1d_p8(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm256_loadu_ps(value_x)); for (int ii = 0; ii < 4; ii++) { - __m256 x0_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v0_offset_ptr[ii] + 7), *(v0_offset_ptr[ii] + 6), *(v0_offset_ptr[ii] + 5), *(v0_offset_ptr[ii] + 4), *(v0_offset_ptr[ii] + 3), *(v0_offset_ptr[ii] + 2), *(v0_offset_ptr[ii] + 1), *v0_offset_ptr[ii]), _mm256_loadu_ps(v0_in_bound_ptr[ii])); - __m256 x1_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v1_offset_ptr[ii] + 7), *(v1_offset_ptr[ii] + 6), *(v1_offset_ptr[ii] 
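// --- annotation (not part of the patch) ---------------------------------------------
// All cubic_interp1d_pN variants evaluate the same 1-D cubic-convolution weights for a
// fractional position fx in [0,1); the fourth weight is derived as 1 - (w0+w1+w2), as in
// the scalar cubic_interp1d above. Scalar sketch, assuming the torch-compatible kernel
// with A = -0.75:
static inline void cubic_weights_sketch(float fx, float& w0, float& w1, float& w2, float& w3)
{
    const float A = -0.75f;
    w0 = ((A * (fx + 1.f) - 5.f * A) * (fx + 1.f) + 8.f * A) * (fx + 1.f) - 4.f * A;
    w1 = ((A + 2.f) * fx - (A + 3.f)) * fx * fx + 1.f;
    w2 = ((A + 2.f) * (1.f - fx) - (A + 3.f)) * (1.f - fx) * (1.f - fx) + 1.f;
    w3 = 1.f - w0 - w1 - w2; // the four weights sum to 1
}
// -------------------------------------------------------------------------------------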
+ 5), *(v1_offset_ptr[ii] + 4), *(v1_offset_ptr[ii] + 3), *(v1_offset_ptr[ii] + 2), *(v1_offset_ptr[ii] + 1), *v1_offset_ptr[ii]), _mm256_loadu_ps(v1_in_bound_ptr[ii])); - __m256 x2_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v2_offset_ptr[ii] + 7), *(v2_offset_ptr[ii] + 6), *(v2_offset_ptr[ii] + 5), *(v2_offset_ptr[ii] + 4), *(v2_offset_ptr[ii] + 3), *(v2_offset_ptr[ii] + 2), *(v2_offset_ptr[ii] + 1), *v2_offset_ptr[ii]), _mm256_loadu_ps(v2_in_bound_ptr[ii])); - __m256 x3_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v3_offset_ptr[ii] + 7), *(v3_offset_ptr[ii] + 6), *(v3_offset_ptr[ii] + 5), *(v3_offset_ptr[ii] + 4), *(v3_offset_ptr[ii] + 3), *(v3_offset_ptr[ii] + 2), *(v3_offset_ptr[ii] + 1), *v3_offset_ptr[ii]), _mm256_loadu_ps(v3_in_bound_ptr[ii])); + __m256 v0_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(v0_offset_ptr[ii]), _mm256_set1_ps(-1.0f)); + __m256 v1_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(v1_offset_ptr[ii]), _mm256_set1_ps(-1.0f)); + __m256 v2_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(v2_offset_ptr[ii]), _mm256_set1_ps(-1.0f)); + __m256 v3_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(v3_offset_ptr[ii]), _mm256_set1_ps(-1.0f)); + + __m256 x0_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v0_offset_ptr[ii] + 7), *(v0_offset_ptr[ii] + 6), *(v0_offset_ptr[ii] + 5), *(v0_offset_ptr[ii] + 4), *(v0_offset_ptr[ii] + 3), *(v0_offset_ptr[ii] + 2), *(v0_offset_ptr[ii] + 1), *v0_offset_ptr[ii]), v0_in_bound); + __m256 x1_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v1_offset_ptr[ii] + 7), *(v1_offset_ptr[ii] + 6), *(v1_offset_ptr[ii] + 5), *(v1_offset_ptr[ii] + 4), *(v1_offset_ptr[ii] + 3), *(v1_offset_ptr[ii] + 2), *(v1_offset_ptr[ii] + 1), *v1_offset_ptr[ii]), v1_in_bound); + __m256 x2_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v2_offset_ptr[ii] + 7), *(v2_offset_ptr[ii] + 6), *(v2_offset_ptr[ii] + 5), *(v2_offset_ptr[ii] + 4), *(v2_offset_ptr[ii] + 3), *(v2_offset_ptr[ii] + 2), *(v2_offset_ptr[ii] + 1), *v2_offset_ptr[ii]), v2_in_bound); + __m256 x3_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v3_offset_ptr[ii] + 7), *(v3_offset_ptr[ii] + 6), *(v3_offset_ptr[ii] + 5), *(v3_offset_ptr[ii] + 4), *(v3_offset_ptr[ii] + 3), *(v3_offset_ptr[ii] + 2), *(v3_offset_ptr[ii] + 1), *v3_offset_ptr[ii]), v3_in_bound); value_f[ii] = _mm256_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); @@ -1008,11 +676,6 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds v1_offset_ptr[ii] += 8; v2_offset_ptr[ii] += 8; v3_offset_ptr[ii] += 8; - - v0_in_bound_ptr[ii] += 8; - v1_in_bound_ptr[ii] += 8; - v2_in_bound_ptr[ii] += 8; - v3_in_bound_ptr[ii] += 8; } cubic_interp1d_p8(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm256_loadu_ps(value_y)); @@ -1039,10 +702,15 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds cubic_interp1d_p4(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm_loadu_ps(value_x)); for (int ii = 0; ii < 4; ii++) { - __m128 x0_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v0_offset_ptr[ii] + 3), *(v0_offset_ptr[ii] + 2), *(v0_offset_ptr[ii] + 1), *v0_offset_ptr[ii]), _mm_loadu_ps(v0_in_bound_ptr[ii])); - __m128 x1_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v1_offset_ptr[ii] + 3), *(v1_offset_ptr[ii] + 2), *(v1_offset_ptr[ii] + 1), *v1_offset_ptr[ii]), _mm_loadu_ps(v1_in_bound_ptr[ii])); - __m128 x2_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v2_offset_ptr[ii] + 3), *(v2_offset_ptr[ii] + 2), *(v2_offset_ptr[ii] + 1), *v2_offset_ptr[ii]), 
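// --- annotation (not part of the patch) ---------------------------------------------
// The pack-1 vector path derives the gather mask directly from the stored offsets:
// _mm256_andnot_ps(offset, -1.0f) computes (~offset_bits) & bits(-1.0f), which flips the
// sign bit, so an in-bound offset (sign bit 0) yields a lane with the sign bit set while
// the -1.0f sentinel yields one with it clear. The resulting lanes are not clean
// -1.0f/0.0f values, so this presumably relies on mask_gather_ps256 / mask_gather_ps
// inspecting only each lane's most significant bit, as the hardware gather does.
// -------------------------------------------------------------------------------------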
_mm_loadu_ps(v2_in_bound_ptr[ii])); - __m128 x3_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v3_offset_ptr[ii] + 3), *(v3_offset_ptr[ii] + 2), *(v3_offset_ptr[ii] + 1), *v3_offset_ptr[ii]), _mm_loadu_ps(v3_in_bound_ptr[ii])); + __m128 v0_in_bound = _mm_andnot_ps(_mm_loadu_ps(v0_offset_ptr[ii]), _mm_set1_ps(-1.0f)); + __m128 v1_in_bound = _mm_andnot_ps(_mm_loadu_ps(v1_offset_ptr[ii]), _mm_set1_ps(-1.0f)); + __m128 v2_in_bound = _mm_andnot_ps(_mm_loadu_ps(v2_offset_ptr[ii]), _mm_set1_ps(-1.0f)); + __m128 v3_in_bound = _mm_andnot_ps(_mm_loadu_ps(v3_offset_ptr[ii]), _mm_set1_ps(-1.0f)); + + __m128 x0_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v0_offset_ptr[ii] + 3), *(v0_offset_ptr[ii] + 2), *(v0_offset_ptr[ii] + 1), *v0_offset_ptr[ii]), v0_in_bound); + __m128 x1_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v1_offset_ptr[ii] + 3), *(v1_offset_ptr[ii] + 2), *(v1_offset_ptr[ii] + 1), *v1_offset_ptr[ii]), v1_in_bound); + __m128 x2_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v2_offset_ptr[ii] + 3), *(v2_offset_ptr[ii] + 2), *(v2_offset_ptr[ii] + 1), *v2_offset_ptr[ii]), v2_in_bound); + __m128 x3_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v3_offset_ptr[ii] + 3), *(v3_offset_ptr[ii] + 2), *(v3_offset_ptr[ii] + 1), *v3_offset_ptr[ii]), v3_in_bound); value_f[ii] = _mm_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); @@ -1053,11 +721,6 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds v1_offset_ptr[ii] += 4; v2_offset_ptr[ii] += 4; v3_offset_ptr[ii] += 4; - - v0_in_bound_ptr[ii] += 4; - v1_in_bound_ptr[ii] += 4; - v2_in_bound_ptr[ii] += 4; - v3_in_bound_ptr[ii] += 4; } cubic_interp1d_p4(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm_loadu_ps(value_y)); @@ -1084,10 +747,10 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds cubic_interp1d(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, *value_x); for (int ii = 0; ii < 4; ii++) { - float x0_val = *reinterpret_cast(v0_in_bound_ptr[ii]) < 0 ? *(srcptr + static_cast(*v0_offset_ptr[ii])) : 0; - float x1_val = *reinterpret_cast(v1_in_bound_ptr[ii]) < 0 ? *(srcptr + static_cast(*v1_offset_ptr[ii])) : 0; - float x2_val = *reinterpret_cast(v2_in_bound_ptr[ii]) < 0 ? *(srcptr + static_cast(*v2_offset_ptr[ii])) : 0; - float x3_val = *reinterpret_cast(v3_in_bound_ptr[ii]) < 0 ? *(srcptr + static_cast(*v3_offset_ptr[ii])) : 0; + float x0_val = *reinterpret_cast(v0_offset_ptr[ii]) >= 0 ? *(srcptr + static_cast(*v0_offset_ptr[ii])) : 0; + float x1_val = *reinterpret_cast(v1_offset_ptr[ii]) >= 0 ? *(srcptr + static_cast(*v1_offset_ptr[ii])) : 0; + float x2_val = *reinterpret_cast(v2_offset_ptr[ii]) >= 0 ? *(srcptr + static_cast(*v2_offset_ptr[ii])) : 0; + float x3_val = *reinterpret_cast(v3_offset_ptr[ii]) >= 0 ? 
*(srcptr + static_cast(*v3_offset_ptr[ii])) : 0; value_f[ii] = x_coeffs0 * x0_val; value_f[ii] = x_coeffs1 * x1_val + value_f[ii]; @@ -1098,11 +761,6 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds v1_offset_ptr[ii]++; v2_offset_ptr[ii]++; v3_offset_ptr[ii]++; - - v0_in_bound_ptr[ii]++; - v1_in_bound_ptr[ii]++; - v2_in_bound_ptr[ii]++; - v3_in_bound_ptr[ii]++; } cubic_interp1d(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, *value_y); diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 7b3d7a0f7c5..33e6f122696 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -13,174 +13,44 @@ // specific language governing permissions and limitations under the License. template -struct gridsample_2d_bilinear_compute_blob +void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& offset, Mat& value, int permute_fusion, const Option& opt) { - void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) - { - const int grid_size = grid.w * grid.h; - - float* offset_ptr_00 = offset.channel(0); - float* offset_ptr_01 = offset.channel(1); - float* offset_ptr_10 = offset.channel(2); - float* offset_ptr_11 = offset.channel(3); - - float* in_bound_ptr_01 = in_bound.channel(1); - float* in_bound_ptr_10 = in_bound.channel(2); - float* in_bound_ptr_11 = in_bound.channel(3); - - float* value_ptr_alpha = value.channel(0); - float* value_ptr_beta = value.channel(1); - - grid_sample_unormalize unormalize; - compute_coord get_coord; - - if (permute_fusion == 0) - { - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int x = 0; -#if __AVX__ - for (; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr); - __m256 gy = _mm256_loadu_ps(gridptr + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = unormalize(_mm256_set1_ps(src.w), gx); - gx = get_coord(_mm256_set1_ps(src.w), gx); - - // y - gy = unormalize(_mm256_set1_ps(src.h), gy); - gy = get_coord(_mm256_set1_ps(src.h), gy); - } + const int grid_size = grid.w * grid.h; - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); + float* offset_ptr_00 = offset.channel(0); + float* offset_ptr_01 = offset.channel(1); + float* offset_ptr_10 = offset.channel(2); + float* offset_ptr_11 = offset.channel(3); - __m256 x1 = _mm256_add_ps(x_w, _mm256_set1_ps(1)); - __m256 y1 = _mm256_add_ps(y_n, _mm256_set1_ps(1)); + float* value_ptr_alpha = value.channel(0); + float* value_ptr_beta = value.channel(1); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x1, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y1, _CMP_GT_OS)); - - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - volatile float epack = src.elempack; - __m256 nw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w), _mm256_set1_ps(epack)); - __m256 ne_offset = _mm256_add_ps(nw_offset, _mm256_set1_ps(epack)); - __m256 sw_offset = 
_mm256_comp_fmadd_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack), nw_offset); - __m256 se_offset = _mm256_add_ps(sw_offset, _mm256_set1_ps(epack)); - - _mm256_storeu_ps(in_bound_ptr_01, x1_in_range); - _mm256_storeu_ps(in_bound_ptr_10, y1_in_range); - _mm256_storeu_ps(in_bound_ptr_11, v11_in_range); - - _mm256_storeu_ps(offset_ptr_00, nw_offset); - _mm256_storeu_ps(offset_ptr_01, ne_offset); - _mm256_storeu_ps(offset_ptr_10, sw_offset); - _mm256_storeu_ps(offset_ptr_11, se_offset); - - __m256 alpha = _mm256_sub_ps(gx, x_w); - __m256 beta = _mm256_sub_ps(gy, y_n); - - _mm256_storeu_ps(value_ptr_alpha, alpha); - _mm256_storeu_ps(value_ptr_beta, beta); - - gridptr += 16; - - offset_ptr_00 += 8; - offset_ptr_01 += 8; - offset_ptr_10 += 8; - offset_ptr_11 += 8; - - in_bound_ptr_01 += 8; - in_bound_ptr_10 += 8; - in_bound_ptr_11 += 8; - - value_ptr_alpha += 8; - value_ptr_beta += 8; - } - -#endif // __AVX__ + grid_sample_unormalize unormalize; + compute_coord get_coord; - for (; x < grid_size; x += 2) - { - float sample_x = *gridptr; - float sample_y = *(gridptr + 1); - - // x - sample_x = unormalize(src.w, sample_x); - sample_x = get_coord(src.w, sample_x); - - // y - sample_y = unormalize(src.h, sample_y); - sample_y = get_coord(src.h, sample_y); - - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); - int x1 = x0 + 1; - int y1 = y0 + 1; - - bool x1_in_bound = (x1 > -1) & (x1 < src.w); - bool y1_in_bound = (y1 > -1) & (y1 < src.h); - - *in_bound_ptr_01 = x1_in_bound ? -1.0f : 0.0f; - *in_bound_ptr_10 = y1_in_bound ? -1.0f : 0.0f; - *in_bound_ptr_11 = (x1_in_bound & y1_in_bound) ? -1.0f : 0.0f; - - *offset_ptr_00 = (x0 + y0 * src.w) * src.elempack; - *offset_ptr_01 = (x1 + y0 * src.w) * src.elempack; - *offset_ptr_10 = (x0 + y1 * src.w) * src.elempack; - *offset_ptr_11 = (x1 + y1 * src.w) * src.elempack; - - *value_ptr_alpha = sample_x - x0; - *value_ptr_beta = sample_y - y0; - - gridptr += 2; - - offset_ptr_00++; - offset_ptr_01++; - offset_ptr_10++; - offset_ptr_11++; - - in_bound_ptr_01++; - in_bound_ptr_10++; - in_bound_ptr_11++; - - value_ptr_alpha++; - value_ptr_beta++; - } - } - } - else + if (permute_fusion == 0) + { + for (int y = 0; y < grid.c; y++) { - const float* gridptr_x = grid.channel(0); - const float* gridptr_y = grid.channel(1); - + const float* gridptr = grid.channel(y); int x = 0; #if __AVX__ - for (; x + 7 < grid_size; x += 8) + for (; x + 15 < grid_size; x += 16) { - __m256 gx = _mm256_loadu_ps(gridptr_x); - __m256 gy = _mm256_loadu_ps(gridptr_y); + __m256 tmp_x = _mm256_loadu_ps(gridptr); + __m256 gy = _mm256_loadu_ps(gridptr + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - // compute coord { - // x gx = unormalize(_mm256_set1_ps(src.w), gx); gx = get_coord(_mm256_set1_ps(src.w), gx); - // y gy = unormalize(_mm256_set1_ps(src.h), gy); gy = get_coord(_mm256_set1_ps(src.h), gy); } @@ -191,20 +61,26 @@ struct gridsample_2d_bilinear_compute_blob __m256 x1 = _mm256_add_ps(x_w, _mm256_set1_ps(1)); __m256 y1 = _mm256_add_ps(y_n, _mm256_set1_ps(1)); + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x_w, _CMP_GT_OS)); __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x1, _CMP_GT_OS)); + __m256 
y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y_n, _CMP_GT_OS)); __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y1, _CMP_GT_OS)); + __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v10_in_range = _mm256_and_ps(x0_in_range, y1_in_range); __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); volatile float epack = src.elempack; - __m256 nw_offset = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(y_n, _mm256_set1_ps(src.w)), x_w), _mm256_set1_ps(epack)); + __m256 nw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w), _mm256_set1_ps(epack)); __m256 ne_offset = _mm256_add_ps(nw_offset, _mm256_set1_ps(epack)); - __m256 sw_offset = _mm256_add_ps(nw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack))); + __m256 sw_offset = _mm256_comp_fmadd_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack), nw_offset); __m256 se_offset = _mm256_add_ps(sw_offset, _mm256_set1_ps(epack)); - _mm256_storeu_ps(in_bound_ptr_01, x1_in_range); - _mm256_storeu_ps(in_bound_ptr_10, y1_in_range); - _mm256_storeu_ps(in_bound_ptr_11, v11_in_range); + nw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), nw_offset, v00_in_range); + ne_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), ne_offset, v01_in_range); + sw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), sw_offset, v10_in_range); + se_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), se_offset, v11_in_range); _mm256_storeu_ps(offset_ptr_00, nw_offset); _mm256_storeu_ps(offset_ptr_01, ne_offset); @@ -217,34 +93,29 @@ struct gridsample_2d_bilinear_compute_blob _mm256_storeu_ps(value_ptr_alpha, alpha); _mm256_storeu_ps(value_ptr_beta, beta); - gridptr_x += 8; - gridptr_y += 8; + _mm256_storeu_ps(value_ptr_alpha, alpha); + _mm256_storeu_ps(value_ptr_beta, beta); + + gridptr += 16; offset_ptr_00 += 8; offset_ptr_01 += 8; offset_ptr_10 += 8; offset_ptr_11 += 8; - in_bound_ptr_01 += 8; - in_bound_ptr_10 += 8; - in_bound_ptr_11 += 8; - value_ptr_alpha += 8; value_ptr_beta += 8; } - #endif // __AVX__ - for (; x < grid_size; x++) + for (; x < grid_size; x += 2) { - float sample_x = *gridptr_x; - float sample_y = *gridptr_y; + float sample_x = *gridptr; + float sample_y = *(gridptr + 1); - // x sample_x = unormalize(src.w, sample_x); sample_x = get_coord(src.w, sample_x); - // y sample_y = unormalize(src.h, sample_y); sample_y = get_coord(src.h, sample_y); @@ -253,584 +124,202 @@ struct gridsample_2d_bilinear_compute_blob int x1 = x0 + 1; int y1 = y0 + 1; + bool x0_in_bound = (x0 > -1) & (x0 < src.w); bool x1_in_bound = (x1 > -1) & (x1 < src.w); + bool y0_in_bound = (y0 > -1) & (y0 < src.h); bool y1_in_bound = (y1 > -1) & (y1 < src.h); - *in_bound_ptr_01 = x1_in_bound ? -1.0f : 0.0f; - *in_bound_ptr_10 = y1_in_bound ? -1.0f : 0.0f; - *in_bound_ptr_11 = (x1_in_bound & y1_in_bound) ? -1.0f : 0.0f; + bool in_bound_00 = x0_in_bound & y0_in_bound; + bool in_bound_01 = x1_in_bound & y0_in_bound; + bool in_bound_10 = x0_in_bound & y1_in_bound; + bool in_bound_11 = x1_in_bound & y1_in_bound; - *offset_ptr_00 = (x0 + y0 * src.w) * src.elempack; - *offset_ptr_01 = (x1 + y0 * src.w) * src.elempack; - *offset_ptr_10 = (x0 + y1 * src.w) * src.elempack; - *offset_ptr_11 = (x1 + y1 * src.w) * src.elempack; + *offset_ptr_00 = in_bound_00 ? (x0 + y0 * src.w) * src.elempack : -1.0f; + *offset_ptr_01 = in_bound_01 ? 
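// --- annotation (not part of the patch) ---------------------------------------------
// In the vector blob paths the out-of-bound sentinel is applied with one blend per
// corner: _mm256_blendv_ps(a, b, mask) picks b in lanes whose mask sign bit is set, so
// lanes that passed the in-range test keep their computed offset and the rest become
// -1.0f. Minimal sketch:
#include <immintrin.h>
static inline __m256 apply_oob_sentinel(__m256 offset, __m256 in_range_mask)
{
    return _mm256_blendv_ps(_mm256_set1_ps(-1.0f), offset, in_range_mask);
}
// -------------------------------------------------------------------------------------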
(x1 + y0 * src.w) * src.elempack : -1.0f; + *offset_ptr_10 = in_bound_10 ? (x0 + y1 * src.w) * src.elempack : -1.0f; + *offset_ptr_11 = in_bound_11 ? (x1 + y1 * src.w) * src.elempack : -1.0f; *value_ptr_alpha = sample_x - x0; *value_ptr_beta = sample_y - y0; - gridptr_x++; - gridptr_y++; + gridptr += 2; offset_ptr_00++; offset_ptr_01++; offset_ptr_10++; offset_ptr_11++; - in_bound_ptr_01++; - in_bound_ptr_10++; - in_bound_ptr_11++; - value_ptr_alpha++; value_ptr_beta++; } } } -}; - -template -struct gridsample_2d_bilinear_compute_blob -{ - void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) + else { - const int grid_size = grid.w * grid.h; - - float* offset_ptr_00 = offset.channel(0); - float* offset_ptr_01 = offset.channel(1); - float* offset_ptr_10 = offset.channel(2); - float* offset_ptr_11 = offset.channel(3); - - float* in_bound_ptr_00 = in_bound.channel(0); - float* in_bound_ptr_01 = in_bound.channel(1); - float* in_bound_ptr_10 = in_bound.channel(2); - float* in_bound_ptr_11 = in_bound.channel(3); + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); - float* value_ptr_alpha = value.channel(0); - float* value_ptr_beta = value.channel(1); - - grid_sample_unormalize unormalize; - - if (permute_fusion == 0) - { - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int x = 0; + int x = 0; #if __AVX__ - for (; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr); - __m256 gy = _mm256_loadu_ps(gridptr + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - // x - gx = unormalize(_mm256_set1_ps(src.w), gx); - // y - gy = unormalize(_mm256_set1_ps(src.h), gy); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - - __m256 x1 = _mm256_add_ps(x_w, _mm256_set1_ps(1)); - __m256 y1 = _mm256_add_ps(y_n, _mm256_set1_ps(1)); - - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x_w, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x1, _CMP_GT_OS)); - __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y_n, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y1, _CMP_GT_OS)); - - __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); - __m256 v01_in_range = _mm256_and_ps(x1_in_range, y0_in_range); - __m256 v10_in_range = _mm256_and_ps(x0_in_range, y1_in_range); - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - volatile float epack = src.elempack; - __m256 nw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w), _mm256_set1_ps(epack)); - __m256 ne_offset = _mm256_add_ps(nw_offset, _mm256_set1_ps(epack)); - __m256 sw_offset = _mm256_comp_fmadd_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack), nw_offset); - __m256 se_offset = _mm256_add_ps(sw_offset, _mm256_set1_ps(epack)); - - _mm256_storeu_ps(in_bound_ptr_00, v00_in_range); - _mm256_storeu_ps(in_bound_ptr_01, v01_in_range); - _mm256_storeu_ps(in_bound_ptr_10, 
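// --- annotation (not part of the patch) ---------------------------------------------
// Per output sample the bilinear blob pass now emits four corner offsets (00/01/10/11,
// -1.0f when the corner falls outside the input) plus the fractional weights alpha and
// beta, which the apply stage later combines with the usual bilinear weights. Scalar
// sketch of one sample, mirroring the loop body above (hypothetical helper; sx/sy are
// the already unnormalized and, depending on padding mode, clamped coordinates):
#include <math.h>
static inline void bilinear_blob_one(int w, int h, int elempack, float sx, float sy,
                                     float offs[4], float* alpha, float* beta)
{
    int x0 = (int)floorf(sx), y0 = (int)floorf(sy);
    int x1 = x0 + 1, y1 = y0 + 1;
    bool x0v = x0 > -1 && x0 < w, x1v = x1 > -1 && x1 < w;
    bool y0v = y0 > -1 && y0 < h, y1v = y1 > -1 && y1 < h;
    offs[0] = (x0v && y0v) ? (float)((x0 + y0 * w) * elempack) : -1.0f; // 00
    offs[1] = (x1v && y0v) ? (float)((x1 + y0 * w) * elempack) : -1.0f; // 01
    offs[2] = (x0v && y1v) ? (float)((x0 + y1 * w) * elempack) : -1.0f; // 10
    offs[3] = (x1v && y1v) ? (float)((x1 + y1 * w) * elempack) : -1.0f; // 11
    *alpha = sx - x0; // x fraction
    *beta = sy - y0;  // y fraction
}
// -------------------------------------------------------------------------------------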
v10_in_range); - _mm256_storeu_ps(in_bound_ptr_11, v11_in_range); - - _mm256_storeu_ps(offset_ptr_00, nw_offset); - _mm256_storeu_ps(offset_ptr_01, ne_offset); - _mm256_storeu_ps(offset_ptr_10, sw_offset); - _mm256_storeu_ps(offset_ptr_11, se_offset); - - __m256 alpha = _mm256_sub_ps(gx, x_w); - __m256 beta = _mm256_sub_ps(gy, y_n); - - _mm256_storeu_ps(value_ptr_alpha, alpha); - _mm256_storeu_ps(value_ptr_beta, beta); - - _mm256_storeu_ps(value_ptr_alpha, alpha); - _mm256_storeu_ps(value_ptr_beta, beta); - - gridptr += 16; - - offset_ptr_00 += 8; - offset_ptr_01 += 8; - offset_ptr_10 += 8; - offset_ptr_11 += 8; - - in_bound_ptr_00 += 8; - in_bound_ptr_01 += 8; - in_bound_ptr_10 += 8; - in_bound_ptr_11 += 8; - - value_ptr_alpha += 8; - value_ptr_beta += 8; - } -#endif // __AVX__ - - for (; x < grid_size; x += 2) - { - float sample_x = *gridptr; - float sample_y = *(gridptr + 1); - - // x - sample_x = unormalize(src.w, sample_x); - - // y - sample_y = unormalize(src.h, sample_y); - - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); - int x1 = x0 + 1; - int y1 = y0 + 1; - - bool x0_in_bound = (x0 > -1) & (x0 < src.w); - bool x1_in_bound = (x1 > -1) & (x1 < src.w); - bool y0_in_bound = (y0 > -1) & (y0 < src.h); - bool y1_in_bound = (y1 > -1) & (y1 < src.h); - - *in_bound_ptr_00 = (x0_in_bound & y0_in_bound) ? -1.0f : 0.0f; - *in_bound_ptr_01 = (x1_in_bound & y0_in_bound) ? -1.0f : 0.0f; - *in_bound_ptr_10 = (x0_in_bound & y1_in_bound) ? -1.0f : 0.0f; - *in_bound_ptr_11 = (x1_in_bound & y1_in_bound) ? -1.0f : 0.0f; - - *offset_ptr_00 = (x0 + y0 * src.w) * src.elempack; - *offset_ptr_01 = (x1 + y0 * src.w) * src.elempack; - *offset_ptr_10 = (x0 + y1 * src.w) * src.elempack; - *offset_ptr_11 = (x1 + y1 * src.w) * src.elempack; - - *value_ptr_alpha = sample_x - x0; - *value_ptr_beta = sample_y - y0; - - gridptr += 2; - - offset_ptr_00++; - offset_ptr_01++; - offset_ptr_10++; - offset_ptr_11++; - - in_bound_ptr_00++; - in_bound_ptr_01++; - in_bound_ptr_10++; - in_bound_ptr_11++; - - value_ptr_alpha++; - value_ptr_beta++; - } - } - } - else + for (; x + 7 < grid_size; x += 8) { - const float* gridptr_x = grid.channel(0); - const float* gridptr_y = grid.channel(1); + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); - int x = 0; -#if __AVX__ - for (; x + 7 < grid_size; x += 8) { - __m256 gx = _mm256_loadu_ps(gridptr_x); - __m256 gy = _mm256_loadu_ps(gridptr_y); + gx = unormalize(_mm256_set1_ps(src.w), gx); + gx = get_coord(_mm256_set1_ps(src.w), gx); - // compute coord - { - // x - gx = unormalize(_mm256_set1_ps(src.w), gx); - // y - gy = unormalize(_mm256_set1_ps(src.h), gy); - } + gy = unormalize(_mm256_set1_ps(src.h), gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); + } - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); - __m256 x1 = _mm256_add_ps(x_w, _mm256_set1_ps(1)); - __m256 y1 = _mm256_add_ps(y_n, _mm256_set1_ps(1)); + __m256 x1 = _mm256_add_ps(x_w, _mm256_set1_ps(1)); + __m256 y1 = _mm256_add_ps(y_n, _mm256_set1_ps(1)); - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x_w, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x1, _CMP_GT_OS)); - __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y_n, _CMP_GT_OS)); - 
__m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y1, _CMP_GT_OS)); + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x_w, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x1, _CMP_GT_OS)); + __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y_n, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y1, _CMP_GT_OS)); - __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); - __m256 v01_in_range = _mm256_and_ps(x1_in_range, y0_in_range); - __m256 v10_in_range = _mm256_and_ps(x0_in_range, y1_in_range); - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - volatile float epack = src.elempack; - __m256 nw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w), _mm256_set1_ps(epack)); - __m256 ne_offset = _mm256_add_ps(nw_offset, _mm256_set1_ps(epack)); - __m256 sw_offset = _mm256_comp_fmadd_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack), nw_offset); - __m256 se_offset = _mm256_add_ps(sw_offset, _mm256_set1_ps(epack)); + __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v10_in_range = _mm256_and_ps(x0_in_range, y1_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - _mm256_storeu_ps(in_bound_ptr_00, v00_in_range); - _mm256_storeu_ps(in_bound_ptr_01, v01_in_range); - _mm256_storeu_ps(in_bound_ptr_10, v10_in_range); - _mm256_storeu_ps(in_bound_ptr_11, v11_in_range); + volatile float epack = src.elempack; + __m256 nw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w), _mm256_set1_ps(epack)); + __m256 ne_offset = _mm256_add_ps(nw_offset, _mm256_set1_ps(epack)); + __m256 sw_offset = _mm256_comp_fmadd_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack), nw_offset); + __m256 se_offset = _mm256_add_ps(sw_offset, _mm256_set1_ps(epack)); - _mm256_storeu_ps(offset_ptr_00, nw_offset); - _mm256_storeu_ps(offset_ptr_01, ne_offset); - _mm256_storeu_ps(offset_ptr_10, sw_offset); - _mm256_storeu_ps(offset_ptr_11, se_offset); + nw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), nw_offset, v00_in_range); + ne_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), ne_offset, v01_in_range); + sw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), sw_offset, v10_in_range); + se_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), se_offset, v11_in_range); - __m256 alpha = _mm256_sub_ps(gx, x_w); - __m256 beta = _mm256_sub_ps(gy, y_n); + _mm256_storeu_ps(offset_ptr_00, nw_offset); + _mm256_storeu_ps(offset_ptr_01, ne_offset); + _mm256_storeu_ps(offset_ptr_10, sw_offset); + _mm256_storeu_ps(offset_ptr_11, se_offset); - _mm256_storeu_ps(value_ptr_alpha, alpha); - _mm256_storeu_ps(value_ptr_beta, beta); + __m256 alpha = _mm256_sub_ps(gx, x_w); + __m256 beta = _mm256_sub_ps(gy, y_n); - gridptr_x += 8; - gridptr_y += 8; + _mm256_storeu_ps(value_ptr_alpha, alpha); + _mm256_storeu_ps(value_ptr_beta, beta); - offset_ptr_00 += 8; - offset_ptr_01 += 8; - offset_ptr_10 += 8; - offset_ptr_11 += 8; + gridptr_x += 8; + gridptr_y += 8; - in_bound_ptr_00 += 8; - in_bound_ptr_01 += 8; - in_bound_ptr_10 += 8; - in_bound_ptr_11 += 8; + offset_ptr_00 += 8; + offset_ptr_01 += 8; + 
offset_ptr_10 += 8; + offset_ptr_11 += 8; - value_ptr_alpha += 8; - value_ptr_beta += 8; - } + value_ptr_alpha += 8; + value_ptr_beta += 8; + } #endif // __AVX__ - for (; x < grid_size; x++) - { - float sample_x = *gridptr_x; - float sample_y = *gridptr_y; + for (; x < grid_size; x++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; - // x - sample_x = unormalize(src.w, sample_x); - // y - sample_y = unormalize(src.h, sample_y); + sample_x = unormalize(src.w, sample_x); + sample_x = get_coord(src.w, sample_x); - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); - int x1 = x0 + 1; - int y1 = y0 + 1; + sample_y = unormalize(src.h, sample_y); + sample_y = get_coord(src.h, sample_y); - bool x0_in_bound = (x0 > -1) & (x0 < src.w); - bool x1_in_bound = (x1 > -1) & (x1 < src.w); - bool y0_in_bound = (y0 > -1) & (y0 < src.h); - bool y1_in_bound = (y1 > -1) & (y1 < src.h); + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int x1 = x0 + 1; + int y1 = y0 + 1; - *in_bound_ptr_00 = (x0_in_bound & y0_in_bound) ? -1.0f : 0.0f; - *in_bound_ptr_01 = (x1_in_bound & y0_in_bound) ? -1.0f : 0.0f; - *in_bound_ptr_10 = (x0_in_bound & y1_in_bound) ? -1.0f : 0.0f; - *in_bound_ptr_11 = (x1_in_bound & y1_in_bound) ? -1.0f : 0.0f; + bool x0_in_bound = (x0 > -1) & (x0 < src.w); + bool x1_in_bound = (x1 > -1) & (x1 < src.w); + bool y0_in_bound = (y0 > -1) & (y0 < src.h); + bool y1_in_bound = (y1 > -1) & (y1 < src.h); - *offset_ptr_00 = (x0 + y0 * src.w) * src.elempack; - *offset_ptr_01 = (x1 + y0 * src.w) * src.elempack; - *offset_ptr_10 = (x0 + y1 * src.w) * src.elempack; - *offset_ptr_11 = (x1 + y1 * src.w) * src.elempack; + bool in_bound_00 = x0_in_bound & y0_in_bound; + bool in_bound_01 = x1_in_bound & y0_in_bound; + bool in_bound_10 = x0_in_bound & y1_in_bound; + bool in_bound_11 = x1_in_bound & y1_in_bound; - *value_ptr_alpha = sample_x - x0; - *value_ptr_beta = sample_y - y0; + *offset_ptr_00 = in_bound_00 ? (x0 + y0 * src.w) * src.elempack : -1.0f; + *offset_ptr_01 = in_bound_01 ? (x1 + y0 * src.w) * src.elempack : -1.0f; + *offset_ptr_10 = in_bound_10 ? (x0 + y1 * src.w) * src.elempack : -1.0f; + *offset_ptr_11 = in_bound_11 ? 
(x1 + y1 * src.w) * src.elempack : -1.0f; - gridptr_x++; - gridptr_y++; + *value_ptr_alpha = sample_x - x0; + *value_ptr_beta = sample_y - y0; - offset_ptr_00++; - offset_ptr_01++; - offset_ptr_10++; - offset_ptr_11++; + gridptr_x++; + gridptr_y++; - in_bound_ptr_00++; - in_bound_ptr_01++; - in_bound_ptr_10++; - in_bound_ptr_11++; + offset_ptr_00++; + offset_ptr_01++; + offset_ptr_10++; + offset_ptr_11++; - value_ptr_alpha++; - value_ptr_beta++; - } + value_ptr_alpha++; + value_ptr_beta++; } } -}; +} template -struct gridsample_3d_bilinear_compute_blob +void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& offset, Mat& value, int permute_fusion, const Option& opt) { - void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) - { - const int grid_size = grid.w * grid.h * grid.d; - - float* offset_ptr_000 = offset.channel(0); - float* offset_ptr_001 = offset.channel(1); - float* offset_ptr_010 = offset.channel(2); - float* offset_ptr_011 = offset.channel(3); - - float* offset_ptr_100 = offset.channel(4); - float* offset_ptr_101 = offset.channel(5); - float* offset_ptr_110 = offset.channel(6); - float* offset_ptr_111 = offset.channel(7); - - float* in_bound_ptr_000 = in_bound.channel(0); - float* in_bound_ptr_001 = in_bound.channel(1); - float* in_bound_ptr_010 = in_bound.channel(2); - float* in_bound_ptr_011 = in_bound.channel(3); - float* in_bound_ptr_100 = in_bound.channel(4); - float* in_bound_ptr_101 = in_bound.channel(5); - float* in_bound_ptr_110 = in_bound.channel(6); - float* in_bound_ptr_111 = in_bound.channel(7); - - float* value_ptr_alpha = value.channel(0); - float* value_ptr_beta = value.channel(1); - float* value_ptr_gamma = value.channel(2); - - grid_sample_unormalize unormalize; - compute_coord get_coord; - - if (permute_fusion == 0) - { - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int x = 0; -#if __AVX__ - for (; x + 23 < grid_size; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr); - __m256 tmp_y = _mm256_loadu_ps(gridptr + 8); - __m256 gz = _mm256_loadu_ps(gridptr + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - gx = unormalize(_mm256_set1_ps(src.w), gx); - gx = get_coord(_mm256_set1_ps(src.w), gx); - - gy = unormalize(_mm256_set1_ps(src.h), gy); - gy = get_coord(_mm256_set1_ps(src.h), gy); - - gz = unormalize(_mm256_set1_ps(src.d), gz); - gz = get_coord(_mm256_set1_ps(src.d), gz); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - __m256 z_t = _mm256_floor_ps(gz); - - __m256 x1 = _mm256_add_ps(x_w, _mm256_set1_ps(1)); - __m256 y1 = _mm256_add_ps(y_n, _mm256_set1_ps(1)); - __m256 z1 = _mm256_add_ps(z_t, _mm256_set1_ps(1)); - - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x_w, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x1, _CMP_GT_OS)); - __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, _mm256_set1_ps(-1), _CMP_GT_OS), 
_mm256_cmp_ps(_mm256_set1_ps(src.h), y_n, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y1, _CMP_GT_OS)); - __m256 z0_in_range = _mm256_and_ps(_mm256_cmp_ps(z_t, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), z_t, _CMP_GT_OS)); - __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), z1, _CMP_GT_OS)); - - __m256 v011_in_range, v110_in_range, v101_in_range, v111_in_range; - { - v011_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); - v110_in_range = _mm256_and_ps(y1_in_range, z1_in_range); - v111_in_range = _mm256_and_ps(v011_in_range, z1_in_range); - } - - volatile float epack = src.elempack; - __m256 tnw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), z_t, - _mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w)), - _mm256_set1_ps(epack)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, _mm256_set1_ps(epack)); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack))); - __m256 tse_offset = _mm256_add_ps(tsw_offset, _mm256_set1_ps(epack)); - - __m256 bnw_offset = _mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), _mm256_set1_ps(epack), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, _mm256_set1_ps(epack)); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack))); - __m256 bse_offset = _mm256_add_ps(bsw_offset, _mm256_set1_ps(epack)); - - _mm256_storeu_ps(in_bound_ptr_000, _mm256_set1_ps(-1)); - _mm256_storeu_ps(in_bound_ptr_001, x1_in_range); - _mm256_storeu_ps(in_bound_ptr_010, y1_in_range); - _mm256_storeu_ps(in_bound_ptr_011, v011_in_range); - - _mm256_storeu_ps(in_bound_ptr_100, z1_in_range); - _mm256_storeu_ps(in_bound_ptr_101, v101_in_range); - _mm256_storeu_ps(in_bound_ptr_110, v110_in_range); - _mm256_storeu_ps(in_bound_ptr_111, v111_in_range); - - _mm256_storeu_ps(offset_ptr_000, tnw_offset); - _mm256_storeu_ps(offset_ptr_001, tne_offset); - _mm256_storeu_ps(offset_ptr_010, tsw_offset); - _mm256_storeu_ps(offset_ptr_011, tse_offset); - - _mm256_storeu_ps(offset_ptr_100, bnw_offset); - _mm256_storeu_ps(offset_ptr_101, bne_offset); - _mm256_storeu_ps(offset_ptr_110, bsw_offset); - _mm256_storeu_ps(offset_ptr_111, bse_offset); - - __m256 alpha = _mm256_sub_ps(gx, x_w); - __m256 beta = _mm256_sub_ps(gy, y_n); - __m256 gamma = _mm256_sub_ps(gz, z_t); - - _mm256_storeu_ps(value_ptr_alpha, alpha); - _mm256_storeu_ps(value_ptr_beta, beta); - _mm256_storeu_ps(value_ptr_gamma, gamma); - - gridptr += 24; - - offset_ptr_000 += 8; - offset_ptr_001 += 8; - offset_ptr_010 += 8; - offset_ptr_011 += 8; - - offset_ptr_100 += 8; - offset_ptr_101 += 8; - offset_ptr_110 += 8; - offset_ptr_111 += 8; - - in_bound_ptr_000 += 8; - in_bound_ptr_001 += 8; - in_bound_ptr_010 += 8; - in_bound_ptr_011 += 8; - - in_bound_ptr_100 += 8; - in_bound_ptr_101 += 8; - in_bound_ptr_110 += 8; - in_bound_ptr_111 += 8; - - value_ptr_alpha += 8; - value_ptr_beta += 8; - value_ptr_gamma += 8; - } -#endif // __AVX__ + const int grid_size = grid.w * grid.h * grid.d; - for (; x < grid_size; x += 3) - { - float sample_x = *gridptr; - float sample_y = *(gridptr + 1); - float sample_z = *(gridptr + 2); - - sample_x = unormalize(src.w, sample_x); - sample_x = get_coord(src.w, sample_x); - - 
sample_y = unormalize(src.h, sample_y); - sample_y = get_coord(src.h, sample_y); - - sample_z = unormalize(src.d, sample_z); - sample_z = get_coord(src.d, sample_z); - - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); - int z0 = (int)floor(sample_z); - int x1 = x0 + 1; - int y1 = y0 + 1; - int z1 = z0 + 1; - - bool x1_in_range = (x1 > -1) & (x1 < src.w); - bool y1_in_range = (y1 > -1) & (y1 < src.h); - bool z1_in_range = (z1 > -1) & (z1 < src.d); - - bool v11_in_range = x1_in_range & y1_in_range; - - *in_bound_ptr_000 = -1.0f; - *in_bound_ptr_001 = x1_in_range ? -1.0f : 0.0f; - *in_bound_ptr_010 = y1_in_range ? -1.0f : 0.0f; - *in_bound_ptr_011 = v11_in_range ? -1.0f : 0.0f; - - *in_bound_ptr_100 = z1_in_range ? -1.0f : 0.0f; - *in_bound_ptr_101 = (x1_in_range & z1_in_range) ? -1.0f : 0.0f; - *in_bound_ptr_110 = (y1_in_range & z1_in_range) ? -1.0f : 0.0f; - *in_bound_ptr_111 = (v11_in_range & z1_in_range) ? -1.0f : 0.0f; - - *offset_ptr_000 = (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack; - *offset_ptr_001 = (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack; - *offset_ptr_010 = (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack; - *offset_ptr_011 = (x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack; - - *offset_ptr_100 = (x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack; - *offset_ptr_101 = (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack; - *offset_ptr_110 = (x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack; - *offset_ptr_111 = (x1 + y1 * src.w + z1 * src.w * src.h) * src.elempack; - - *value_ptr_alpha = sample_x - x0; - *value_ptr_beta = sample_y - y0; - *value_ptr_gamma = sample_z - z0; - - gridptr += 3; - - offset_ptr_000++; - offset_ptr_001++; - offset_ptr_010++; - offset_ptr_011++; - - offset_ptr_100++; - offset_ptr_101++; - offset_ptr_110++; - offset_ptr_111++; - - in_bound_ptr_000++; - in_bound_ptr_001++; - in_bound_ptr_010++; - in_bound_ptr_011++; - - in_bound_ptr_100++; - in_bound_ptr_101++; - in_bound_ptr_110++; - in_bound_ptr_111++; - - value_ptr_alpha++; - value_ptr_beta++; - value_ptr_gamma++; - } - } - } - else - { - const float* gridptr_x = grid.channel(0); - const float* gridptr_y = grid.channel(1); - const float* gridptr_z = grid.channel(2); + float* offset_ptr_000 = offset.channel(0); + float* offset_ptr_001 = offset.channel(1); + float* offset_ptr_010 = offset.channel(2); + float* offset_ptr_011 = offset.channel(3); + + float* offset_ptr_100 = offset.channel(4); + float* offset_ptr_101 = offset.channel(5); + float* offset_ptr_110 = offset.channel(6); + float* offset_ptr_111 = offset.channel(7); + float* value_ptr_alpha = value.channel(0); + float* value_ptr_beta = value.channel(1); + float* value_ptr_gamma = value.channel(2); + + grid_sample_unormalize unormalize; + compute_coord get_coord; + + if (permute_fusion == 0) + { + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); int x = 0; #if __AVX__ - for (; x + 7 < grid_size; x += 8) + for (; x + 23 < grid_size; x += 24) { - __m256 gx = _mm256_loadu_ps(gridptr_x); - __m256 gy = _mm256_loadu_ps(gridptr_y); - __m256 gz = _mm256_loadu_ps(gridptr_z); + __m256 tmp_x = _mm256_loadu_ps(gridptr); + __m256 tmp_y = _mm256_loadu_ps(gridptr + 8); + __m256 gz = _mm256_loadu_ps(gridptr + 16); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); + __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); + gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); + + tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); + tmp_y = _mm256_shuffle_ps(gy, gz, 
0b10011110); + + gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); + gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); + gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - // compute coord { gx = unormalize(_mm256_set1_ps(src.w), gx); gx = get_coord(_mm256_set1_ps(src.w), gx); @@ -857,17 +346,27 @@ struct gridsample_3d_bilinear_compute_blob __m256 z0_in_range = _mm256_and_ps(_mm256_cmp_ps(z_t, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), z_t, _CMP_GT_OS)); __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), z1, _CMP_GT_OS)); - __m256 v011_in_range, v110_in_range, v101_in_range, v111_in_range; + __m256 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; { - v011_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - v101_in_range = _mm256_and_ps(x1_in_range, z1_in_range); - v110_in_range = _mm256_and_ps(y1_in_range, z1_in_range); - v111_in_range = _mm256_and_ps(v011_in_range, z1_in_range); + __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v10_in_range = _mm256_and_ps(x0_in_range, y1_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + v000_in_range = _mm256_and_ps(v00_in_range, z0_in_range); + v001_in_range = _mm256_and_ps(v01_in_range, z0_in_range); + v010_in_range = _mm256_and_ps(v10_in_range, z0_in_range); + v011_in_range = _mm256_and_ps(v11_in_range, z0_in_range); + + v100_in_range = _mm256_and_ps(v00_in_range, z1_in_range); + v101_in_range = _mm256_and_ps(v01_in_range, z1_in_range); + v110_in_range = _mm256_and_ps(v10_in_range, z1_in_range); + v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); } volatile float epack = src.elempack; __m256 tnw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), z_t, - _mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w)), + _mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w)), _mm256_set1_ps(epack)); __m256 tne_offset = _mm256_add_ps(tnw_offset, _mm256_set1_ps(epack)); __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack))); @@ -878,15 +377,15 @@ struct gridsample_3d_bilinear_compute_blob __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack))); __m256 bse_offset = _mm256_add_ps(bsw_offset, _mm256_set1_ps(epack)); - _mm256_storeu_ps(in_bound_ptr_000, _mm256_set1_ps(-1)); - _mm256_storeu_ps(in_bound_ptr_001, x1_in_range); - _mm256_storeu_ps(in_bound_ptr_010, y1_in_range); - _mm256_storeu_ps(in_bound_ptr_011, v011_in_range); + tnw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), tnw_offset, v000_in_range); + tne_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), tne_offset, v001_in_range); + tsw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), tsw_offset, v010_in_range); + tse_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), tse_offset, v011_in_range); - _mm256_storeu_ps(in_bound_ptr_100, z1_in_range); - _mm256_storeu_ps(in_bound_ptr_101, v101_in_range); - _mm256_storeu_ps(in_bound_ptr_110, v110_in_range); - _mm256_storeu_ps(in_bound_ptr_111, v111_in_range); + bnw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bnw_offset, v100_in_range); + bne_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bne_offset, v101_in_range); + bsw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bsw_offset, v110_in_range); + bse_offset = 
_mm256_blendv_ps(_mm256_set1_ps(-1.0f), bse_offset, v111_in_range); _mm256_storeu_ps(offset_ptr_000, tnw_offset); _mm256_storeu_ps(offset_ptr_001, tne_offset); @@ -906,9 +405,7 @@ struct gridsample_3d_bilinear_compute_blob _mm256_storeu_ps(value_ptr_beta, beta); _mm256_storeu_ps(value_ptr_gamma, gamma); - gridptr_x += 8; - gridptr_y += 8; - gridptr_z += 8; + gridptr += 24; offset_ptr_000 += 8; offset_ptr_001 += 8; @@ -920,27 +417,17 @@ struct gridsample_3d_bilinear_compute_blob offset_ptr_110 += 8; offset_ptr_111 += 8; - in_bound_ptr_000 += 8; - in_bound_ptr_001 += 8; - in_bound_ptr_010 += 8; - in_bound_ptr_011 += 8; - - in_bound_ptr_100 += 8; - in_bound_ptr_101 += 8; - in_bound_ptr_110 += 8; - in_bound_ptr_111 += 8; - value_ptr_alpha += 8; value_ptr_beta += 8; value_ptr_gamma += 8; } #endif // __AVX__ - for (; x < grid_size; x++) + for (; x < grid_size; x += 3) { - float sample_x = *gridptr_x; - float sample_y = *gridptr_y; - float sample_z = *gridptr_z; + float sample_x = *gridptr; + float sample_y = *(gridptr + 1); + float sample_z = *(gridptr + 2); sample_x = unormalize(src.w, sample_x); sample_x = get_coord(src.w, sample_x); @@ -958,39 +445,43 @@ struct gridsample_3d_bilinear_compute_blob int y1 = y0 + 1; int z1 = z0 + 1; + bool x0_in_range = (x0 > -1) & (x0 < src.w); + bool y0_in_range = (y0 > -1) & (y0 < src.h); + bool z0_in_range = (z0 > -1) & (z0 < src.d); bool x1_in_range = (x1 > -1) & (x1 < src.w); bool y1_in_range = (y1 > -1) & (y1 < src.h); bool z1_in_range = (z1 > -1) & (z1 < src.d); + bool v00_in_range = x0_in_range & y0_in_range; + bool v01_in_range = x1_in_range & y0_in_range; + bool v10_in_range = x0_in_range & y1_in_range; bool v11_in_range = x1_in_range & y1_in_range; - *in_bound_ptr_000 = -1.0f; - *in_bound_ptr_001 = x1_in_range ? -1.0f : 0.0f; - *in_bound_ptr_010 = y1_in_range ? -1.0f : 0.0f; - *in_bound_ptr_011 = v11_in_range ? -1.0f : 0.0f; + bool in_bound_000 = v00_in_range & z0_in_range; + bool in_bound_001 = v01_in_range & z0_in_range; + bool in_bound_010 = v10_in_range & z0_in_range; + bool in_bound_011 = v11_in_range & z0_in_range; - *in_bound_ptr_100 = z1_in_range ? -1.0f : 0.0f; - *in_bound_ptr_101 = (x1_in_range & z1_in_range) ? -1.0f : 0.0f; - *in_bound_ptr_110 = (y1_in_range & z1_in_range) ? -1.0f : 0.0f; - *in_bound_ptr_111 = (v11_in_range & z1_in_range) ? -1.0f : 0.0f; + bool in_bound_100 = v00_in_range & z1_in_range; + bool in_bound_101 = v01_in_range & z1_in_range; + bool in_bound_110 = v10_in_range & z1_in_range; + bool in_bound_111 = v11_in_range & z1_in_range; - *offset_ptr_000 = (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack; - *offset_ptr_001 = (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack; - *offset_ptr_010 = (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack; - *offset_ptr_011 = (x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack; + *offset_ptr_000 = in_bound_000 ? (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; + *offset_ptr_001 = in_bound_001 ? (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; + *offset_ptr_010 = in_bound_010 ? (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; + *offset_ptr_011 = in_bound_011 ? 
(x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; - *offset_ptr_100 = (x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack; - *offset_ptr_101 = (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack; - *offset_ptr_110 = (x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack; - *offset_ptr_111 = (x1 + y1 * src.w + z1 * src.w * src.h) * src.elempack; + *offset_ptr_100 = in_bound_100 ? (x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; + *offset_ptr_101 = in_bound_101 ? (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; + *offset_ptr_110 = in_bound_110 ? (x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; + *offset_ptr_111 = in_bound_111 ? (x1 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; *value_ptr_alpha = sample_x - x0; *value_ptr_beta = sample_y - y0; *value_ptr_gamma = sample_z - z0; - gridptr_x++; - gridptr_y++; - gridptr_z++; + gridptr += 3; offset_ptr_000++; offset_ptr_001++; @@ -1002,484 +493,214 @@ struct gridsample_3d_bilinear_compute_blob offset_ptr_110++; offset_ptr_111++; - in_bound_ptr_000++; - in_bound_ptr_001++; - in_bound_ptr_010++; - in_bound_ptr_011++; - - in_bound_ptr_100++; - in_bound_ptr_101++; - in_bound_ptr_110++; - in_bound_ptr_111++; - value_ptr_alpha++; value_ptr_beta++; value_ptr_gamma++; } } } -}; - -template -struct gridsample_3d_bilinear_compute_blob -{ - void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) + else { - const int grid_size = grid.w * grid.h * grid.d; - - float* offset_ptr_000 = offset.channel(0); - float* offset_ptr_001 = offset.channel(1); - float* offset_ptr_010 = offset.channel(2); - float* offset_ptr_011 = offset.channel(3); - - float* offset_ptr_100 = offset.channel(4); - float* offset_ptr_101 = offset.channel(5); - float* offset_ptr_110 = offset.channel(6); - float* offset_ptr_111 = offset.channel(7); - - float* in_bound_ptr_000 = in_bound.channel(0); - float* in_bound_ptr_001 = in_bound.channel(1); - float* in_bound_ptr_010 = in_bound.channel(2); - float* in_bound_ptr_011 = in_bound.channel(3); - float* in_bound_ptr_100 = in_bound.channel(4); - float* in_bound_ptr_101 = in_bound.channel(5); - float* in_bound_ptr_110 = in_bound.channel(6); - float* in_bound_ptr_111 = in_bound.channel(7); - - float* value_ptr_alpha = value.channel(0); - float* value_ptr_beta = value.channel(1); - float* value_ptr_gamma = value.channel(2); - - grid_sample_unormalize unormalize; - - if (permute_fusion == 0) - { - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int x = 0; -#if __AVX__ - for (; x + 23 < grid_size; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr); - __m256 tmp_y = _mm256_loadu_ps(gridptr + 8); - __m256 gz = _mm256_loadu_ps(gridptr + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - gx = unormalize(_mm256_set1_ps(src.w), gx); - gy = unormalize(_mm256_set1_ps(src.h), gy); - gz = unormalize(_mm256_set1_ps(src.d), gz); - } - - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - __m256 z_t = _mm256_floor_ps(gz); - - __m256 x1 = _mm256_add_ps(x_w, 
_mm256_set1_ps(1)); - __m256 y1 = _mm256_add_ps(y_n, _mm256_set1_ps(1)); - __m256 z1 = _mm256_add_ps(z_t, _mm256_set1_ps(1)); - - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x_w, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x1, _CMP_GT_OS)); - __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y_n, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y1, _CMP_GT_OS)); - __m256 z0_in_range = _mm256_and_ps(_mm256_cmp_ps(z_t, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), z_t, _CMP_GT_OS)); - __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), z1, _CMP_GT_OS)); - - __m256 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); - __m256 v01_in_range = _mm256_and_ps(x1_in_range, y0_in_range); - __m256 v10_in_range = _mm256_and_ps(x0_in_range, y1_in_range); - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v000_in_range = _mm256_and_ps(v00_in_range, z0_in_range); - v001_in_range = _mm256_and_ps(v01_in_range, z0_in_range); - v010_in_range = _mm256_and_ps(v10_in_range, z0_in_range); - v011_in_range = _mm256_and_ps(v11_in_range, z0_in_range); - - v100_in_range = _mm256_and_ps(v00_in_range, z1_in_range); - v101_in_range = _mm256_and_ps(v01_in_range, z1_in_range); - v110_in_range = _mm256_and_ps(v10_in_range, z1_in_range); - v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); - } - - volatile float epack = src.elempack; - __m256 tnw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), z_t, - _mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w)), - _mm256_set1_ps(epack)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, _mm256_set1_ps(epack)); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack))); - __m256 tse_offset = _mm256_add_ps(tsw_offset, _mm256_set1_ps(epack)); - - __m256 bnw_offset = _mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), _mm256_set1_ps(epack), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, _mm256_set1_ps(epack)); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack))); - __m256 bse_offset = _mm256_add_ps(bsw_offset, _mm256_set1_ps(epack)); - - _mm256_storeu_ps(in_bound_ptr_000, v000_in_range); - _mm256_storeu_ps(in_bound_ptr_001, v001_in_range); - _mm256_storeu_ps(in_bound_ptr_010, v010_in_range); - _mm256_storeu_ps(in_bound_ptr_011, v011_in_range); - - _mm256_storeu_ps(in_bound_ptr_100, v100_in_range); - _mm256_storeu_ps(in_bound_ptr_101, v101_in_range); - _mm256_storeu_ps(in_bound_ptr_110, v110_in_range); - _mm256_storeu_ps(in_bound_ptr_111, v111_in_range); - - _mm256_storeu_ps(offset_ptr_000, tnw_offset); - _mm256_storeu_ps(offset_ptr_001, tne_offset); - _mm256_storeu_ps(offset_ptr_010, tsw_offset); - _mm256_storeu_ps(offset_ptr_011, tse_offset); - - _mm256_storeu_ps(offset_ptr_100, bnw_offset); - _mm256_storeu_ps(offset_ptr_101, bne_offset); - _mm256_storeu_ps(offset_ptr_110, bsw_offset); - 
_mm256_storeu_ps(offset_ptr_111, bse_offset); - - __m256 alpha = _mm256_sub_ps(gx, x_w); - __m256 beta = _mm256_sub_ps(gy, y_n); - __m256 gamma = _mm256_sub_ps(gz, z_t); - - _mm256_storeu_ps(value_ptr_alpha, alpha); - _mm256_storeu_ps(value_ptr_beta, beta); - _mm256_storeu_ps(value_ptr_gamma, gamma); - - gridptr += 24; - - offset_ptr_000 += 8; - offset_ptr_001 += 8; - offset_ptr_010 += 8; - offset_ptr_011 += 8; - - offset_ptr_100 += 8; - offset_ptr_101 += 8; - offset_ptr_110 += 8; - offset_ptr_111 += 8; - - in_bound_ptr_000 += 8; - in_bound_ptr_001 += 8; - in_bound_ptr_010 += 8; - in_bound_ptr_011 += 8; - - in_bound_ptr_100 += 8; - in_bound_ptr_101 += 8; - in_bound_ptr_110 += 8; - in_bound_ptr_111 += 8; - - value_ptr_alpha += 8; - value_ptr_beta += 8; - value_ptr_gamma += 8; - } -#endif // __AVX__ + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); + const float* gridptr_z = grid.channel(2); - for (; x < grid_size; x += 3) - { - float sample_x = *gridptr; - float sample_y = *(gridptr + 1); - float sample_z = *(gridptr + 2); - - sample_x = unormalize(src.w, sample_x); - sample_y = unormalize(src.h, sample_y); - sample_z = unormalize(src.d, sample_z); - - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); - int z0 = (int)floor(sample_z); - int x1 = x0 + 1; - int y1 = y0 + 1; - int z1 = z0 + 1; - - bool x0_in_range = (x0 > -1) & (x0 < src.w); - bool y0_in_range = (y0 > -1) & (y0 < src.h); - bool z0_in_range = (z0 > -1) & (z0 < src.d); - bool x1_in_range = (x1 > -1) & (x1 < src.w); - bool y1_in_range = (y1 > -1) & (y1 < src.h); - bool z1_in_range = (z1 > -1) & (z1 < src.d); - - bool v00_in_range = x0_in_range & y0_in_range; - bool v01_in_range = x1_in_range & y0_in_range; - bool v10_in_range = x0_in_range & y1_in_range; - bool v11_in_range = x1_in_range & y1_in_range; - - *in_bound_ptr_000 = (v00_in_range & z0_in_range) ? -1.0f : 0.0f; - *in_bound_ptr_001 = (v01_in_range & z0_in_range) ? -1.0f : 0.0f; - *in_bound_ptr_010 = (v10_in_range & z0_in_range) ? -1.0f : 0.0f; - *in_bound_ptr_011 = (v11_in_range & z0_in_range) ? -1.0f : 0.0f; - - *in_bound_ptr_100 = (v00_in_range & z1_in_range) ? -1.0f : 0.0f; - *in_bound_ptr_101 = (v01_in_range & z1_in_range) ? -1.0f : 0.0f; - *in_bound_ptr_110 = (v10_in_range & z1_in_range) ? -1.0f : 0.0f; - *in_bound_ptr_111 = (v11_in_range & z1_in_range) ? 
-1.0f : 0.0f; - - *offset_ptr_000 = (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack; - *offset_ptr_001 = (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack; - *offset_ptr_010 = (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack; - *offset_ptr_011 = (x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack; - - *offset_ptr_100 = (x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack; - *offset_ptr_101 = (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack; - *offset_ptr_110 = (x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack; - *offset_ptr_111 = (x1 + y1 * src.w + z1 * src.w * src.h) * src.elempack; - - *value_ptr_alpha = sample_x - x0; - *value_ptr_beta = sample_y - y0; - *value_ptr_gamma = sample_z - z0; - - gridptr += 3; - - offset_ptr_000++; - offset_ptr_001++; - offset_ptr_010++; - offset_ptr_011++; - - offset_ptr_100++; - offset_ptr_101++; - offset_ptr_110++; - offset_ptr_111++; - - in_bound_ptr_000++; - in_bound_ptr_001++; - in_bound_ptr_010++; - in_bound_ptr_011++; - - in_bound_ptr_100++; - in_bound_ptr_101++; - in_bound_ptr_110++; - in_bound_ptr_111++; - - value_ptr_alpha++; - value_ptr_beta++; - value_ptr_gamma++; - } - } - } - else + int x = 0; +#if __AVX__ + for (; x + 7 < grid_size; x += 8) { - const float* gridptr_x = grid.channel(0); - const float* gridptr_y = grid.channel(1); - const float* gridptr_z = grid.channel(2); + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); + __m256 gz = _mm256_loadu_ps(gridptr_z); - int x = 0; -#if __AVX__ - for (; x + 7 < grid_size; x += 8) { - __m256 gx = _mm256_loadu_ps(gridptr_x); - __m256 gy = _mm256_loadu_ps(gridptr_y); - __m256 gz = _mm256_loadu_ps(gridptr_z); - - // compute coord - { - gx = unormalize(_mm256_set1_ps(src.w), gx); - gy = unormalize(_mm256_set1_ps(src.h), gy); - gz = unormalize(_mm256_set1_ps(src.d), gz); - } + gx = unormalize(_mm256_set1_ps(src.w), gx); + gx = get_coord(_mm256_set1_ps(src.w), gx); - __m256 x_w = _mm256_floor_ps(gx); - __m256 y_n = _mm256_floor_ps(gy); - __m256 z_t = _mm256_floor_ps(gz); - - __m256 x1 = _mm256_add_ps(x_w, _mm256_set1_ps(1)); - __m256 y1 = _mm256_add_ps(y_n, _mm256_set1_ps(1)); - __m256 z1 = _mm256_add_ps(z_t, _mm256_set1_ps(1)); - - __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x_w, _CMP_GT_OS)); - __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x1, _CMP_GT_OS)); - __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y_n, _CMP_GT_OS)); - __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y1, _CMP_GT_OS)); - __m256 z0_in_range = _mm256_and_ps(_mm256_cmp_ps(z_t, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), z_t, _CMP_GT_OS)); - __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), z1, _CMP_GT_OS)); - - __m256 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; - { - __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); - __m256 v01_in_range = _mm256_and_ps(x1_in_range, y0_in_range); - __m256 v10_in_range = _mm256_and_ps(x0_in_range, y1_in_range); - __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - - v000_in_range = _mm256_and_ps(v00_in_range, z0_in_range); - v001_in_range = 
_mm256_and_ps(v01_in_range, z0_in_range); - v010_in_range = _mm256_and_ps(v10_in_range, z0_in_range); - v011_in_range = _mm256_and_ps(v11_in_range, z0_in_range); - - v100_in_range = _mm256_and_ps(v00_in_range, z1_in_range); - v101_in_range = _mm256_and_ps(v01_in_range, z1_in_range); - v110_in_range = _mm256_and_ps(v10_in_range, z1_in_range); - v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); - } - - volatile float epack = src.elempack; - __m256 tnw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), z_t, - _mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w)), - _mm256_set1_ps(epack)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, _mm256_set1_ps(epack)); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack))); - __m256 tse_offset = _mm256_add_ps(tsw_offset, _mm256_set1_ps(epack)); - - __m256 bnw_offset = _mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), _mm256_set1_ps(epack), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, _mm256_set1_ps(epack)); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack))); - __m256 bse_offset = _mm256_add_ps(bsw_offset, _mm256_set1_ps(epack)); - - _mm256_storeu_ps(in_bound_ptr_000, v000_in_range); - _mm256_storeu_ps(in_bound_ptr_001, v001_in_range); - _mm256_storeu_ps(in_bound_ptr_010, v010_in_range); - _mm256_storeu_ps(in_bound_ptr_011, v011_in_range); - - _mm256_storeu_ps(in_bound_ptr_100, v100_in_range); - _mm256_storeu_ps(in_bound_ptr_101, v101_in_range); - _mm256_storeu_ps(in_bound_ptr_110, v110_in_range); - _mm256_storeu_ps(in_bound_ptr_111, v111_in_range); - - _mm256_storeu_ps(offset_ptr_000, tnw_offset); - _mm256_storeu_ps(offset_ptr_001, tne_offset); - _mm256_storeu_ps(offset_ptr_010, tsw_offset); - _mm256_storeu_ps(offset_ptr_011, tse_offset); - - _mm256_storeu_ps(offset_ptr_100, bnw_offset); - _mm256_storeu_ps(offset_ptr_101, bne_offset); - _mm256_storeu_ps(offset_ptr_110, bsw_offset); - _mm256_storeu_ps(offset_ptr_111, bse_offset); - - __m256 alpha = _mm256_sub_ps(gx, x_w); - __m256 beta = _mm256_sub_ps(gy, y_n); - __m256 gamma = _mm256_sub_ps(gz, z_t); - - _mm256_storeu_ps(value_ptr_alpha, alpha); - _mm256_storeu_ps(value_ptr_beta, beta); - _mm256_storeu_ps(value_ptr_gamma, gamma); - - gridptr_x += 8; - gridptr_y += 8; - gridptr_z += 8; - - offset_ptr_000 += 8; - offset_ptr_001 += 8; - offset_ptr_010 += 8; - offset_ptr_011 += 8; + gy = unormalize(_mm256_set1_ps(src.h), gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); - offset_ptr_100 += 8; - offset_ptr_101 += 8; - offset_ptr_110 += 8; - offset_ptr_111 += 8; - - in_bound_ptr_000 += 8; - in_bound_ptr_001 += 8; - in_bound_ptr_010 += 8; - in_bound_ptr_011 += 8; - - in_bound_ptr_100 += 8; - in_bound_ptr_101 += 8; - in_bound_ptr_110 += 8; - in_bound_ptr_111 += 8; - - value_ptr_alpha += 8; - value_ptr_beta += 8; - value_ptr_gamma += 8; + gz = unormalize(_mm256_set1_ps(src.d), gz); + gz = get_coord(_mm256_set1_ps(src.d), gz); } -#endif // __AVX__ - for (; x < grid_size; x++) - { - float sample_x = *gridptr_x; - float sample_y = *gridptr_y; - float sample_z = *gridptr_z; + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + __m256 z_t = _mm256_floor_ps(gz); - sample_x = unormalize(src.w, sample_x); - sample_y = unormalize(src.h, sample_y); - sample_z = unormalize(src.d, sample_z); - - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); - int z0 = 
(int)floor(sample_z); - int x1 = x0 + 1; - int y1 = y0 + 1; - int z1 = z0 + 1; - - bool x0_in_range = (x0 > -1) & (x0 < src.w); - bool y0_in_range = (y0 > -1) & (y0 < src.h); - bool z0_in_range = (z0 > -1) & (z0 < src.d); - bool x1_in_range = (x1 > -1) & (x1 < src.w); - bool y1_in_range = (y1 > -1) & (y1 < src.h); - bool z1_in_range = (z1 > -1) & (z1 < src.d); + __m256 x1 = _mm256_add_ps(x_w, _mm256_set1_ps(1)); + __m256 y1 = _mm256_add_ps(y_n, _mm256_set1_ps(1)); + __m256 z1 = _mm256_add_ps(z_t, _mm256_set1_ps(1)); - bool v00_in_range = x0_in_range & y0_in_range; - bool v01_in_range = x1_in_range & y0_in_range; - bool v10_in_range = x0_in_range & y1_in_range; - bool v11_in_range = x1_in_range & y1_in_range; + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x_w, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x1, _CMP_GT_OS)); + __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y_n, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y1, _CMP_GT_OS)); + __m256 z0_in_range = _mm256_and_ps(_mm256_cmp_ps(z_t, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), z_t, _CMP_GT_OS)); + __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), z1, _CMP_GT_OS)); - *in_bound_ptr_000 = (v00_in_range & z0_in_range) ? -1.0f : 0.0f; - *in_bound_ptr_001 = (v01_in_range & z0_in_range) ? -1.0f : 0.0f; - *in_bound_ptr_010 = (v10_in_range & z0_in_range) ? -1.0f : 0.0f; - *in_bound_ptr_011 = (v11_in_range & z0_in_range) ? -1.0f : 0.0f; + __m256 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v10_in_range = _mm256_and_ps(x0_in_range, y1_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - *in_bound_ptr_100 = (v00_in_range & z1_in_range) ? -1.0f : 0.0f; - *in_bound_ptr_101 = (v01_in_range & z1_in_range) ? -1.0f : 0.0f; - *in_bound_ptr_110 = (v10_in_range & z1_in_range) ? -1.0f : 0.0f; - *in_bound_ptr_111 = (v11_in_range & z1_in_range) ? 
-1.0f : 0.0f; + v000_in_range = _mm256_and_ps(v00_in_range, z0_in_range); + v001_in_range = _mm256_and_ps(v01_in_range, z0_in_range); + v010_in_range = _mm256_and_ps(v10_in_range, z0_in_range); + v011_in_range = _mm256_and_ps(v11_in_range, z0_in_range); - *offset_ptr_000 = (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack; - *offset_ptr_001 = (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack; - *offset_ptr_010 = (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack; - *offset_ptr_011 = (x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack; + v100_in_range = _mm256_and_ps(v00_in_range, z1_in_range); + v101_in_range = _mm256_and_ps(v01_in_range, z1_in_range); + v110_in_range = _mm256_and_ps(v10_in_range, z1_in_range); + v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); + } - *offset_ptr_100 = (x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack; - *offset_ptr_101 = (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack; - *offset_ptr_110 = (x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack; - *offset_ptr_111 = (x1 + y1 * src.w + z1 * src.w * src.h) * src.elempack; + volatile float epack = src.elempack; + __m256 tnw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), z_t, + _mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w)), + _mm256_set1_ps(epack)); + __m256 tne_offset = _mm256_add_ps(tnw_offset, _mm256_set1_ps(epack)); + __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack))); + __m256 tse_offset = _mm256_add_ps(tsw_offset, _mm256_set1_ps(epack)); + + __m256 bnw_offset = _mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), _mm256_set1_ps(epack), tnw_offset); + __m256 bne_offset = _mm256_add_ps(bnw_offset, _mm256_set1_ps(epack)); + __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack))); + __m256 bse_offset = _mm256_add_ps(bsw_offset, _mm256_set1_ps(epack)); + + tnw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), tnw_offset, v000_in_range); + tne_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), tne_offset, v001_in_range); + tsw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), tsw_offset, v010_in_range); + tse_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), tse_offset, v011_in_range); + + bnw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bnw_offset, v100_in_range); + bne_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bne_offset, v101_in_range); + bsw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bsw_offset, v110_in_range); + bse_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bse_offset, v111_in_range); + + _mm256_storeu_ps(offset_ptr_000, tnw_offset); + _mm256_storeu_ps(offset_ptr_001, tne_offset); + _mm256_storeu_ps(offset_ptr_010, tsw_offset); + _mm256_storeu_ps(offset_ptr_011, tse_offset); + + _mm256_storeu_ps(offset_ptr_100, bnw_offset); + _mm256_storeu_ps(offset_ptr_101, bne_offset); + _mm256_storeu_ps(offset_ptr_110, bsw_offset); + _mm256_storeu_ps(offset_ptr_111, bse_offset); + + __m256 alpha = _mm256_sub_ps(gx, x_w); + __m256 beta = _mm256_sub_ps(gy, y_n); + __m256 gamma = _mm256_sub_ps(gz, z_t); + + _mm256_storeu_ps(value_ptr_alpha, alpha); + _mm256_storeu_ps(value_ptr_beta, beta); + _mm256_storeu_ps(value_ptr_gamma, gamma); + + gridptr_x += 8; + gridptr_y += 8; + gridptr_z += 8; - *value_ptr_alpha = sample_x - x0; - *value_ptr_beta = sample_y - y0; - *value_ptr_gamma = sample_z - z0; + offset_ptr_000 += 8; + offset_ptr_001 += 8; + offset_ptr_010 += 8; + offset_ptr_011 
+= 8; - gridptr_x++; - gridptr_y++; - gridptr_z++; + offset_ptr_100 += 8; + offset_ptr_101 += 8; + offset_ptr_110 += 8; + offset_ptr_111 += 8; - offset_ptr_000++; - offset_ptr_001++; - offset_ptr_010++; - offset_ptr_011++; + value_ptr_alpha += 8; + value_ptr_beta += 8; + value_ptr_gamma += 8; + } +#endif // __AVX__ - offset_ptr_100++; - offset_ptr_101++; - offset_ptr_110++; - offset_ptr_111++; + for (; x < grid_size; x++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; + float sample_z = *gridptr_z; + + sample_x = unormalize(src.w, sample_x); + sample_x = get_coord(src.w, sample_x); + + sample_y = unormalize(src.h, sample_y); + sample_y = get_coord(src.h, sample_y); + + sample_z = unormalize(src.d, sample_z); + sample_z = get_coord(src.d, sample_z); + + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int z0 = (int)floor(sample_z); + int x1 = x0 + 1; + int y1 = y0 + 1; + int z1 = z0 + 1; + + bool x0_in_range = (x0 > -1) & (x0 < src.w); + bool y0_in_range = (y0 > -1) & (y0 < src.h); + bool z0_in_range = (z0 > -1) & (z0 < src.d); + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool y1_in_range = (y1 > -1) & (y1 < src.h); + bool z1_in_range = (z1 > -1) & (z1 < src.d); + + bool v00_in_range = x0_in_range & y0_in_range; + bool v01_in_range = x1_in_range & y0_in_range; + bool v10_in_range = x0_in_range & y1_in_range; + bool v11_in_range = x1_in_range & y1_in_range; + + bool in_bound_000 = v00_in_range & z0_in_range; + bool in_bound_001 = v01_in_range & z0_in_range; + bool in_bound_010 = v10_in_range & z0_in_range; + bool in_bound_011 = v11_in_range & z0_in_range; + + bool in_bound_100 = v00_in_range & z1_in_range; + bool in_bound_101 = v01_in_range & z1_in_range; + bool in_bound_110 = v10_in_range & z1_in_range; + bool in_bound_111 = v11_in_range & z1_in_range; + + *offset_ptr_000 = in_bound_000 ? (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; + *offset_ptr_001 = in_bound_001 ? (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; + *offset_ptr_010 = in_bound_010 ? (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; + *offset_ptr_011 = in_bound_011 ? (x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; + + *offset_ptr_100 = in_bound_100 ? (x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; + *offset_ptr_101 = in_bound_101 ? (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; + *offset_ptr_110 = in_bound_110 ? (x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; + *offset_ptr_111 = in_bound_111 ? 
(x1 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; + + *value_ptr_alpha = sample_x - x0; + *value_ptr_beta = sample_y - y0; + *value_ptr_gamma = sample_z - z0; + + gridptr_x++; + gridptr_y++; + gridptr_z++; - in_bound_ptr_000++; - in_bound_ptr_001++; - in_bound_ptr_010++; - in_bound_ptr_011++; + offset_ptr_000++; + offset_ptr_001++; + offset_ptr_010++; + offset_ptr_011++; - in_bound_ptr_100++; - in_bound_ptr_101++; - in_bound_ptr_110++; - in_bound_ptr_111++; + offset_ptr_100++; + offset_ptr_101++; + offset_ptr_110++; + offset_ptr_111++; - value_ptr_alpha++; - value_ptr_beta++; - value_ptr_gamma++; - } + value_ptr_alpha++; + value_ptr_beta++; + value_ptr_gamma++; } } -}; +} #if __SSE2__ #if __AVX__ #if __AVX512F__ -static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Mat& value, const Option& opt) +static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) { const int channels = dst.c; const int outw = dst.w; @@ -1497,11 +718,6 @@ static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& const float* offset_ptr_10 = offset.channel(2); const float* offset_ptr_11 = offset.channel(3); - const float* in_bound_ptr_00 = in_bound.channel(0); - const float* in_bound_ptr_01 = in_bound.channel(1); - const float* in_bound_ptr_10 = in_bound.channel(2); - const float* in_bound_ptr_11 = in_bound.channel(3); - const float* value_ptr_alpha = value.channel(0); const float* value_ptr_beta = value.channel(1); @@ -1512,10 +728,10 @@ static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& __m512i v10_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_10), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); __m512i v11_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_11), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __mmask16 mask00 = *reinterpret_cast(in_bound_ptr_00) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); - __mmask16 mask01 = *reinterpret_cast(in_bound_ptr_01) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); - __mmask16 mask10 = *reinterpret_cast(in_bound_ptr_10) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); - __mmask16 mask11 = *reinterpret_cast(in_bound_ptr_11) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); + __mmask16 mask00 = *reinterpret_cast(offset_ptr_00) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); + __mmask16 mask01 = *reinterpret_cast(offset_ptr_01) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); + __mmask16 mask10 = *reinterpret_cast(offset_ptr_10) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); + __mmask16 mask11 = *reinterpret_cast(offset_ptr_11) >= 0 ? 
static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); __m512 v00_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask00, v00_offset, srcptr, sizeof(float)); __m512 v01_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask01, v01_offset, srcptr, sizeof(float)); @@ -1536,11 +752,6 @@ static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& offset_ptr_10++; offset_ptr_11++; - in_bound_ptr_00++; - in_bound_ptr_01++; - in_bound_ptr_10++; - in_bound_ptr_11++; - value_ptr_alpha++; value_ptr_beta++; @@ -1548,7 +759,7 @@ static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& } } } -static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Mat& value, const Option& opt) +static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) { const int channels = dst.c; const int outw = dst.w; @@ -1571,15 +782,6 @@ static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& const float* offset_ptr_110 = offset.channel(6); const float* offset_ptr_111 = offset.channel(7); - const float* in_bound_ptr_000 = in_bound.channel(0); - const float* in_bound_ptr_001 = in_bound.channel(1); - const float* in_bound_ptr_010 = in_bound.channel(2); - const float* in_bound_ptr_011 = in_bound.channel(3); - const float* in_bound_ptr_100 = in_bound.channel(4); - const float* in_bound_ptr_101 = in_bound.channel(5); - const float* in_bound_ptr_110 = in_bound.channel(6); - const float* in_bound_ptr_111 = in_bound.channel(7); - const float* value_ptr_alpha = value.channel(0); const float* value_ptr_beta = value.channel(1); const float* value_ptr_gamma = value.channel(2); @@ -1595,14 +797,14 @@ static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& __m512i v110_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_110), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); __m512i v111_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_111), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512 v000_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(in_bound_ptr_000) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v000_offset, srcptr, sizeof(float)); - __m512 v001_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(in_bound_ptr_001) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v001_offset, srcptr, sizeof(float)); - __m512 v010_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(in_bound_ptr_010) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v010_offset, srcptr, sizeof(float)); - __m512 v011_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(in_bound_ptr_011) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v011_offset, srcptr, sizeof(float)); - __m512 v100_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(in_bound_ptr_100) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v100_offset, srcptr, sizeof(float)); - __m512 v101_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(in_bound_ptr_101) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v101_offset, srcptr, sizeof(float)); - __m512 v110_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(in_bound_ptr_110) < 0 ? 
static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v110_offset, srcptr, sizeof(float)); - __m512 v111_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(in_bound_ptr_111) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v111_offset, srcptr, sizeof(float)); + __m512 v000_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_000) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v000_offset, srcptr, sizeof(float)); + __m512 v001_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_001) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v001_offset, srcptr, sizeof(float)); + __m512 v010_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_010) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v010_offset, srcptr, sizeof(float)); + __m512 v011_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_011) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v011_offset, srcptr, sizeof(float)); + __m512 v100_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_100) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v100_offset, srcptr, sizeof(float)); + __m512 v101_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_101) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v101_offset, srcptr, sizeof(float)); + __m512 v110_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_110) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v110_offset, srcptr, sizeof(float)); + __m512 v111_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_111) >= 0 ? 
static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v111_offset, srcptr, sizeof(float)); __m512 alpha = _mm512_set1_ps(*value_ptr_alpha); __m512 beta = _mm512_set1_ps(*value_ptr_beta); @@ -1629,16 +831,6 @@ static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& offset_ptr_110++; offset_ptr_111++; - in_bound_ptr_000++; - in_bound_ptr_001++; - in_bound_ptr_010++; - in_bound_ptr_011++; - - in_bound_ptr_100++; - in_bound_ptr_101++; - in_bound_ptr_110++; - in_bound_ptr_111++; - value_ptr_alpha++; value_ptr_beta++; value_ptr_gamma++; @@ -1649,7 +841,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& } #endif // __AVX512F__ -static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Mat& value, const Option& opt) +static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) { const int channels = dst.c; const int outw = dst.w; @@ -1667,11 +859,6 @@ static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d const float* offset_ptr_10 = offset.channel(2); const float* offset_ptr_11 = offset.channel(3); - const float* in_bound_ptr_00 = in_bound.channel(0); - const float* in_bound_ptr_01 = in_bound.channel(1); - const float* in_bound_ptr_10 = in_bound.channel(2); - const float* in_bound_ptr_11 = in_bound.channel(3); - const float* value_ptr_alpha = value.channel(0); const float* value_ptr_beta = value.channel(1); @@ -1689,15 +876,15 @@ static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d __m256i v11_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_11), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); #endif // __AVX2__ - __m256 v00_in_range = _mm256_set1_ps(*in_bound_ptr_00); - __m256 v01_in_range = _mm256_set1_ps(*in_bound_ptr_01); - __m256 v10_in_range = _mm256_set1_ps(*in_bound_ptr_10); - __m256 v11_in_range = _mm256_set1_ps(*in_bound_ptr_11); + float in_bound_00 = *reinterpret_cast(offset_ptr_00) >= 0 ? -1.0f : 0.0f; + float in_bound_01 = *reinterpret_cast(offset_ptr_01) >= 0 ? -1.0f : 0.0f; + float in_bound_10 = *reinterpret_cast(offset_ptr_10) >= 0 ? -1.0f : 0.0f; + float in_bound_11 = *reinterpret_cast(offset_ptr_11) >= 0 ? 
-1.0f : 0.0f; - __m256 v00_val = mask_gather_ps256(srcptr, v00_offset, v00_in_range); - __m256 v01_val = mask_gather_ps256(srcptr, v01_offset, v01_in_range); - __m256 v10_val = mask_gather_ps256(srcptr, v10_offset, v10_in_range); - __m256 v11_val = mask_gather_ps256(srcptr, v11_offset, v11_in_range); + __m256 v00_val = mask_gather_ps256(srcptr, v00_offset, _mm256_set1_ps(in_bound_00)); + __m256 v01_val = mask_gather_ps256(srcptr, v01_offset, _mm256_set1_ps(in_bound_01)); + __m256 v10_val = mask_gather_ps256(srcptr, v10_offset, _mm256_set1_ps(in_bound_10)); + __m256 v11_val = mask_gather_ps256(srcptr, v11_offset, _mm256_set1_ps(in_bound_11)); __m256 alpha = _mm256_set1_ps(*value_ptr_alpha); __m256 beta = _mm256_set1_ps(*value_ptr_beta); @@ -1713,11 +900,6 @@ static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d offset_ptr_10++; offset_ptr_11++; - in_bound_ptr_00++; - in_bound_ptr_01++; - in_bound_ptr_10++; - in_bound_ptr_11++; - value_ptr_alpha++; value_ptr_beta++; @@ -1725,7 +907,7 @@ static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d } } } -static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Mat& value, const Option& opt) +static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) { const int channels = dst.c; const int outw = dst.w; @@ -1748,15 +930,6 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d const float* offset_ptr_110 = offset.channel(6); const float* offset_ptr_111 = offset.channel(7); - const float* in_bound_ptr_000 = in_bound.channel(0); - const float* in_bound_ptr_001 = in_bound.channel(1); - const float* in_bound_ptr_010 = in_bound.channel(2); - const float* in_bound_ptr_011 = in_bound.channel(3); - const float* in_bound_ptr_100 = in_bound.channel(4); - const float* in_bound_ptr_101 = in_bound.channel(5); - const float* in_bound_ptr_110 = in_bound.channel(6); - const float* in_bound_ptr_111 = in_bound.channel(7); - const float* value_ptr_alpha = value.channel(0); const float* value_ptr_beta = value.channel(1); const float* value_ptr_gamma = value.channel(2); @@ -1783,23 +956,23 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d __m256i v111_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_111), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); #endif // __AVX2__ - __m256 v000_in_range = _mm256_set1_ps(*in_bound_ptr_000); - __m256 v001_in_range = _mm256_set1_ps(*in_bound_ptr_001); - __m256 v010_in_range = _mm256_set1_ps(*in_bound_ptr_010); - __m256 v011_in_range = _mm256_set1_ps(*in_bound_ptr_011); - __m256 v100_in_range = _mm256_set1_ps(*in_bound_ptr_100); - __m256 v101_in_range = _mm256_set1_ps(*in_bound_ptr_101); - __m256 v110_in_range = _mm256_set1_ps(*in_bound_ptr_110); - __m256 v111_in_range = _mm256_set1_ps(*in_bound_ptr_111); - - __m256 v000_val = mask_gather_ps256(srcptr, v000_offset, v000_in_range); - __m256 v001_val = mask_gather_ps256(srcptr, v001_offset, v001_in_range); - __m256 v010_val = mask_gather_ps256(srcptr, v010_offset, v010_in_range); - __m256 v011_val = mask_gather_ps256(srcptr, v011_offset, v011_in_range); - __m256 v100_val = mask_gather_ps256(srcptr, v100_offset, v100_in_range); - __m256 v101_val = mask_gather_ps256(srcptr, v101_offset, v101_in_range); - __m256 v110_val = mask_gather_ps256(srcptr, v110_offset, v110_in_range); - __m256 v111_val = 
mask_gather_ps256(srcptr, v111_offset, v111_in_range); + float in_bound_000 = *reinterpret_cast(offset_ptr_000) >= 0 ? -1.0f : 0.0f; + float in_bound_001 = *reinterpret_cast(offset_ptr_001) >= 0 ? -1.0f : 0.0f; + float in_bound_010 = *reinterpret_cast(offset_ptr_010) >= 0 ? -1.0f : 0.0f; + float in_bound_011 = *reinterpret_cast(offset_ptr_011) >= 0 ? -1.0f : 0.0f; + float in_bound_100 = *reinterpret_cast(offset_ptr_100) >= 0 ? -1.0f : 0.0f; + float in_bound_101 = *reinterpret_cast(offset_ptr_101) >= 0 ? -1.0f : 0.0f; + float in_bound_110 = *reinterpret_cast(offset_ptr_110) >= 0 ? -1.0f : 0.0f; + float in_bound_111 = *reinterpret_cast(offset_ptr_111) >= 0 ? -1.0f : 0.0f; + + __m256 v000_val = mask_gather_ps256(srcptr, v000_offset, _mm256_set1_ps(in_bound_000)); + __m256 v001_val = mask_gather_ps256(srcptr, v001_offset, _mm256_set1_ps(in_bound_001)); + __m256 v010_val = mask_gather_ps256(srcptr, v010_offset, _mm256_set1_ps(in_bound_010)); + __m256 v011_val = mask_gather_ps256(srcptr, v011_offset, _mm256_set1_ps(in_bound_011)); + __m256 v100_val = mask_gather_ps256(srcptr, v100_offset, _mm256_set1_ps(in_bound_100)); + __m256 v101_val = mask_gather_ps256(srcptr, v101_offset, _mm256_set1_ps(in_bound_101)); + __m256 v110_val = mask_gather_ps256(srcptr, v110_offset, _mm256_set1_ps(in_bound_110)); + __m256 v111_val = mask_gather_ps256(srcptr, v111_offset, _mm256_set1_ps(in_bound_111)); __m256 alpha = _mm256_set1_ps(*value_ptr_alpha); __m256 beta = _mm256_set1_ps(*value_ptr_beta); @@ -1826,16 +999,6 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d offset_ptr_110++; offset_ptr_111++; - in_bound_ptr_000++; - in_bound_ptr_001++; - in_bound_ptr_010++; - in_bound_ptr_011++; - - in_bound_ptr_100++; - in_bound_ptr_101++; - in_bound_ptr_110++; - in_bound_ptr_111++; - value_ptr_alpha++; value_ptr_beta++; value_ptr_gamma++; @@ -1845,7 +1008,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d } } #endif // __AVX__ -static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Mat& value, const Option& opt) +static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) { const int channels = dst.c; const int outw = dst.w; @@ -1863,11 +1026,6 @@ static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d const float* offset_ptr_10 = offset.channel(2); const float* offset_ptr_11 = offset.channel(3); - const float* in_bound_ptr_00 = in_bound.channel(0); - const float* in_bound_ptr_01 = in_bound.channel(1); - const float* in_bound_ptr_10 = in_bound.channel(2); - const float* in_bound_ptr_11 = in_bound.channel(3); - const float* value_ptr_alpha = value.channel(0); const float* value_ptr_beta = value.channel(1); @@ -1878,15 +1036,15 @@ static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d __m128i v10_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_10), _mm_set_epi32(3, 2, 1, 0)); __m128i v11_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_11), _mm_set_epi32(3, 2, 1, 0)); - __m128 v00_in_range = _mm_set1_ps(*in_bound_ptr_00); - __m128 v01_in_range = _mm_set1_ps(*in_bound_ptr_01); - __m128 v10_in_range = _mm_set1_ps(*in_bound_ptr_10); - __m128 v11_in_range = _mm_set1_ps(*in_bound_ptr_11); + float in_bound_00 = *reinterpret_cast(offset_ptr_00) >= 0 ? -1.0f : 0.0f; + float in_bound_01 = *reinterpret_cast(offset_ptr_01) >= 0 ? 
-1.0f : 0.0f; + float in_bound_10 = *reinterpret_cast(offset_ptr_10) >= 0 ? -1.0f : 0.0f; + float in_bound_11 = *reinterpret_cast(offset_ptr_11) >= 0 ? -1.0f : 0.0f; - __m128 v00_val = mask_gather_ps(srcptr, v00_offset, v00_in_range); - __m128 v01_val = mask_gather_ps(srcptr, v01_offset, v01_in_range); - __m128 v10_val = mask_gather_ps(srcptr, v10_offset, v10_in_range); - __m128 v11_val = mask_gather_ps(srcptr, v11_offset, v11_in_range); + __m128 v00_val = mask_gather_ps(srcptr, v00_offset, _mm_set1_ps(in_bound_00)); + __m128 v01_val = mask_gather_ps(srcptr, v01_offset, _mm_set1_ps(in_bound_01)); + __m128 v10_val = mask_gather_ps(srcptr, v10_offset, _mm_set1_ps(in_bound_10)); + __m128 v11_val = mask_gather_ps(srcptr, v11_offset, _mm_set1_ps(in_bound_11)); __m128 alpha = _mm_set1_ps(*value_ptr_alpha); __m128 beta = _mm_set1_ps(*value_ptr_beta); @@ -1902,11 +1060,6 @@ static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d offset_ptr_10++; offset_ptr_11++; - in_bound_ptr_00++; - in_bound_ptr_01++; - in_bound_ptr_10++; - in_bound_ptr_11++; - value_ptr_alpha++; value_ptr_beta++; @@ -1914,7 +1067,7 @@ static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d } } } -static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Mat& value, const Option& opt) +static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) { const int channels = dst.c; const int outw = dst.w; @@ -1937,15 +1090,6 @@ static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d const float* offset_ptr_110 = offset.channel(6); const float* offset_ptr_111 = offset.channel(7); - const float* in_bound_ptr_000 = in_bound.channel(0); - const float* in_bound_ptr_001 = in_bound.channel(1); - const float* in_bound_ptr_010 = in_bound.channel(2); - const float* in_bound_ptr_011 = in_bound.channel(3); - const float* in_bound_ptr_100 = in_bound.channel(4); - const float* in_bound_ptr_101 = in_bound.channel(5); - const float* in_bound_ptr_110 = in_bound.channel(6); - const float* in_bound_ptr_111 = in_bound.channel(7); - const float* value_ptr_alpha = value.channel(0); const float* value_ptr_beta = value.channel(1); const float* value_ptr_gamma = value.channel(2); @@ -1961,23 +1105,23 @@ static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d __m128i v110_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_110), _mm_set_epi32(3, 2, 1, 0)); __m128i v111_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_111), _mm_set_epi32(3, 2, 1, 0)); - __m128 v000_in_range = _mm_set1_ps(*in_bound_ptr_000); - __m128 v001_in_range = _mm_set1_ps(*in_bound_ptr_001); - __m128 v010_in_range = _mm_set1_ps(*in_bound_ptr_010); - __m128 v011_in_range = _mm_set1_ps(*in_bound_ptr_011); - __m128 v100_in_range = _mm_set1_ps(*in_bound_ptr_100); - __m128 v101_in_range = _mm_set1_ps(*in_bound_ptr_101); - __m128 v110_in_range = _mm_set1_ps(*in_bound_ptr_110); - __m128 v111_in_range = _mm_set1_ps(*in_bound_ptr_111); - - __m128 v000_val = mask_gather_ps(srcptr, v000_offset, v000_in_range); - __m128 v001_val = mask_gather_ps(srcptr, v001_offset, v001_in_range); - __m128 v010_val = mask_gather_ps(srcptr, v010_offset, v010_in_range); - __m128 v011_val = mask_gather_ps(srcptr, v011_offset, v011_in_range); - __m128 v100_val = mask_gather_ps(srcptr, v100_offset, v100_in_range); - __m128 v101_val = mask_gather_ps(srcptr, v101_offset, 
v101_in_range); - __m128 v110_val = mask_gather_ps(srcptr, v110_offset, v110_in_range); - __m128 v111_val = mask_gather_ps(srcptr, v111_offset, v111_in_range); + float in_bound_000 = *reinterpret_cast(offset_ptr_000) >= 0 ? -1.0f : 0.0f; + float in_bound_001 = *reinterpret_cast(offset_ptr_001) >= 0 ? -1.0f : 0.0f; + float in_bound_010 = *reinterpret_cast(offset_ptr_010) >= 0 ? -1.0f : 0.0f; + float in_bound_011 = *reinterpret_cast(offset_ptr_011) >= 0 ? -1.0f : 0.0f; + float in_bound_100 = *reinterpret_cast(offset_ptr_100) >= 0 ? -1.0f : 0.0f; + float in_bound_101 = *reinterpret_cast(offset_ptr_101) >= 0 ? -1.0f : 0.0f; + float in_bound_110 = *reinterpret_cast(offset_ptr_110) >= 0 ? -1.0f : 0.0f; + float in_bound_111 = *reinterpret_cast(offset_ptr_111) >= 0 ? -1.0f : 0.0f; + + __m128 v000_val = mask_gather_ps(srcptr, v000_offset, _mm_set1_ps(in_bound_000)); + __m128 v001_val = mask_gather_ps(srcptr, v001_offset, _mm_set1_ps(in_bound_001)); + __m128 v010_val = mask_gather_ps(srcptr, v010_offset, _mm_set1_ps(in_bound_010)); + __m128 v011_val = mask_gather_ps(srcptr, v011_offset, _mm_set1_ps(in_bound_011)); + __m128 v100_val = mask_gather_ps(srcptr, v100_offset, _mm_set1_ps(in_bound_100)); + __m128 v101_val = mask_gather_ps(srcptr, v101_offset, _mm_set1_ps(in_bound_101)); + __m128 v110_val = mask_gather_ps(srcptr, v110_offset, _mm_set1_ps(in_bound_110)); + __m128 v111_val = mask_gather_ps(srcptr, v111_offset, _mm_set1_ps(in_bound_111)); __m128 alpha = _mm_set1_ps(*value_ptr_alpha); __m128 beta = _mm_set1_ps(*value_ptr_beta); @@ -2004,16 +1148,6 @@ static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d offset_ptr_110++; offset_ptr_111++; - in_bound_ptr_000++; - in_bound_ptr_001++; - in_bound_ptr_010++; - in_bound_ptr_011++; - - in_bound_ptr_100++; - in_bound_ptr_101++; - in_bound_ptr_110++; - in_bound_ptr_111++; - value_ptr_alpha++; value_ptr_beta++; value_ptr_gamma++; @@ -2024,7 +1158,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d } #endif // __SSE2__ -static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Mat& value, const Option& opt) +static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) { const int channels = dst.c; const int outw = dst.w; @@ -2042,11 +1176,6 @@ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d const float* offset_ptr_10 = offset.channel(2); const float* offset_ptr_11 = offset.channel(3); - const float* in_bound_ptr_00 = in_bound.channel(0); - const float* in_bound_ptr_01 = in_bound.channel(1); - const float* in_bound_ptr_10 = in_bound.channel(2); - const float* in_bound_ptr_11 = in_bound.channel(3); - const float* value_ptr_alpha = value.channel(0); const float* value_ptr_beta = value.channel(1); @@ -2061,15 +1190,15 @@ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d __m256i v10_offset = _mm256_set_epi32(*(offset_ptr_10 + 7), *(offset_ptr_10 + 6), *(offset_ptr_10 + 5), *(offset_ptr_10 + 4), *(offset_ptr_10 + 3), *(offset_ptr_10 + 2), *(offset_ptr_10 + 1), *offset_ptr_10); __m256i v11_offset = _mm256_set_epi32(*(offset_ptr_11 + 7), *(offset_ptr_11 + 6), *(offset_ptr_11 + 5), *(offset_ptr_11 + 4), *(offset_ptr_11 + 3), *(offset_ptr_11 + 2), *(offset_ptr_11 + 1), *offset_ptr_11); - __m256 v00_in_range = _mm256_loadu_ps(in_bound_ptr_00); - __m256 v01_in_range = 
_mm256_loadu_ps(in_bound_ptr_01); - __m256 v10_in_range = _mm256_loadu_ps(in_bound_ptr_10); - __m256 v11_in_range = _mm256_loadu_ps(in_bound_ptr_11); + __m256 v00_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_00), _mm256_set1_ps(-1.0f)); + __m256 v01_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_01), _mm256_set1_ps(-1.0f)); + __m256 v10_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_10), _mm256_set1_ps(-1.0f)); + __m256 v11_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_11), _mm256_set1_ps(-1.0f)); - __m256 v00_val = mask_gather_ps256(srcptr, v00_offset, v00_in_range); - __m256 v01_val = mask_gather_ps256(srcptr, v01_offset, v01_in_range); - __m256 v10_val = mask_gather_ps256(srcptr, v10_offset, v10_in_range); - __m256 v11_val = mask_gather_ps256(srcptr, v11_offset, v11_in_range); + __m256 v00_val = mask_gather_ps256(srcptr, v00_offset, v00_in_bound); + __m256 v01_val = mask_gather_ps256(srcptr, v01_offset, v01_in_bound); + __m256 v10_val = mask_gather_ps256(srcptr, v10_offset, v10_in_bound); + __m256 v11_val = mask_gather_ps256(srcptr, v11_offset, v11_in_bound); __m256 alpha = _mm256_loadu_ps(value_ptr_alpha); __m256 beta = _mm256_loadu_ps(value_ptr_beta); @@ -2085,11 +1214,6 @@ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d offset_ptr_10 += 8; offset_ptr_11 += 8; - in_bound_ptr_00 += 8; - in_bound_ptr_01 += 8; - in_bound_ptr_10 += 8; - in_bound_ptr_11 += 8; - value_ptr_alpha += 8; value_ptr_beta += 8; @@ -2103,15 +1227,15 @@ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d __m128i v10_offset = _mm_set_epi32(*(offset_ptr_10 + 3), *(offset_ptr_10 + 2), *(offset_ptr_10 + 1), *offset_ptr_10); __m128i v11_offset = _mm_set_epi32(*(offset_ptr_11 + 3), *(offset_ptr_11 + 2), *(offset_ptr_11 + 1), *offset_ptr_11); - __m128 v00_in_range = _mm_loadu_ps(in_bound_ptr_00); - __m128 v01_in_range = _mm_loadu_ps(in_bound_ptr_01); - __m128 v10_in_range = _mm_loadu_ps(in_bound_ptr_10); - __m128 v11_in_range = _mm_loadu_ps(in_bound_ptr_11); + __m128 v00_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_00), _mm_set1_ps(-1.0f)); + __m128 v01_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_01), _mm_set1_ps(-1.0f)); + __m128 v10_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_10), _mm_set1_ps(-1.0f)); + __m128 v11_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_11), _mm_set1_ps(-1.0f)); - __m128 v00_val = mask_gather_ps(srcptr, v00_offset, v00_in_range); - __m128 v01_val = mask_gather_ps(srcptr, v01_offset, v01_in_range); - __m128 v10_val = mask_gather_ps(srcptr, v10_offset, v10_in_range); - __m128 v11_val = mask_gather_ps(srcptr, v11_offset, v11_in_range); + __m128 v00_val = mask_gather_ps(srcptr, v00_offset, v00_in_bound); + __m128 v01_val = mask_gather_ps(srcptr, v01_offset, v01_in_bound); + __m128 v10_val = mask_gather_ps(srcptr, v10_offset, v10_in_bound); + __m128 v11_val = mask_gather_ps(srcptr, v11_offset, v11_in_bound); __m128 alpha = _mm_loadu_ps(value_ptr_alpha); __m128 beta = _mm_loadu_ps(value_ptr_beta); @@ -2127,11 +1251,6 @@ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d offset_ptr_10 += 4; offset_ptr_11 += 4; - in_bound_ptr_00 += 4; - in_bound_ptr_01 += 4; - in_bound_ptr_10 += 4; - in_bound_ptr_11 += 4; - value_ptr_alpha += 4; value_ptr_beta += 4; @@ -2140,21 +1259,16 @@ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d #endif // __SSE2__ for (; x < grid_size; x++) { - float v00 = *in_bound_ptr_00 < 0 ? 
*(srcptr + static_cast(*offset_ptr_00)) : 0; - float v01 = *in_bound_ptr_01 < 0 ? *(srcptr + static_cast(*offset_ptr_01)) : 0; - float v10 = *in_bound_ptr_10 < 0 ? *(srcptr + static_cast(*offset_ptr_10)) : 0; - float v11 = *in_bound_ptr_11 < 0 ? *(srcptr + static_cast(*offset_ptr_11)) : 0; + float v00 = *offset_ptr_00 >= 0 ? *(srcptr + static_cast(*offset_ptr_00)) : 0; + float v01 = *offset_ptr_01 >= 0 ? *(srcptr + static_cast(*offset_ptr_01)) : 0; + float v10 = *offset_ptr_10 >= 0 ? *(srcptr + static_cast(*offset_ptr_10)) : 0; + float v11 = *offset_ptr_11 >= 0 ? *(srcptr + static_cast(*offset_ptr_11)) : 0; float v0 = v00 * (1 - *value_ptr_alpha) + v01 * *value_ptr_alpha; float v1 = v10 * (1 - *value_ptr_alpha) + v11 * *value_ptr_alpha; *dstptr = v0 * (1 - *value_ptr_beta) + v1 * *value_ptr_beta; - in_bound_ptr_00++; - in_bound_ptr_01++; - in_bound_ptr_10++; - in_bound_ptr_11++; - offset_ptr_00++; offset_ptr_01++; offset_ptr_10++; @@ -2166,7 +1280,7 @@ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d } } } -static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Mat& value, const Option& opt) +static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) { const int channels = dst.c; const int outw = dst.w; @@ -2189,15 +1303,6 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d const float* offset_ptr_110 = offset.channel(6); const float* offset_ptr_111 = offset.channel(7); - const float* in_bound_ptr_000 = in_bound.channel(0); - const float* in_bound_ptr_001 = in_bound.channel(1); - const float* in_bound_ptr_010 = in_bound.channel(2); - const float* in_bound_ptr_011 = in_bound.channel(3); - const float* in_bound_ptr_100 = in_bound.channel(4); - const float* in_bound_ptr_101 = in_bound.channel(5); - const float* in_bound_ptr_110 = in_bound.channel(6); - const float* in_bound_ptr_111 = in_bound.channel(7); - const float* value_ptr_alpha = value.channel(0); const float* value_ptr_beta = value.channel(1); const float* value_ptr_gamma = value.channel(2); @@ -2216,23 +1321,23 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d __m256i v110_offset = _mm256_set_epi32(*(offset_ptr_110 + 7), *(offset_ptr_110 + 6), *(offset_ptr_110 + 5), *(offset_ptr_110 + 4), *(offset_ptr_110 + 3), *(offset_ptr_110 + 2), *(offset_ptr_110 + 1), *offset_ptr_110); __m256i v111_offset = _mm256_set_epi32(*(offset_ptr_111 + 7), *(offset_ptr_111 + 6), *(offset_ptr_111 + 5), *(offset_ptr_111 + 4), *(offset_ptr_111 + 3), *(offset_ptr_111 + 2), *(offset_ptr_111 + 1), *offset_ptr_111); - __m256 v000_in_range = _mm256_loadu_ps(in_bound_ptr_000); - __m256 v001_in_range = _mm256_loadu_ps(in_bound_ptr_001); - __m256 v010_in_range = _mm256_loadu_ps(in_bound_ptr_010); - __m256 v011_in_range = _mm256_loadu_ps(in_bound_ptr_011); - __m256 v100_in_range = _mm256_loadu_ps(in_bound_ptr_100); - __m256 v101_in_range = _mm256_loadu_ps(in_bound_ptr_101); - __m256 v110_in_range = _mm256_loadu_ps(in_bound_ptr_110); - __m256 v111_in_range = _mm256_loadu_ps(in_bound_ptr_111); - - __m256 v000_val = mask_gather_ps256(srcptr, v000_offset, v000_in_range); - __m256 v001_val = mask_gather_ps256(srcptr, v001_offset, v001_in_range); - __m256 v010_val = mask_gather_ps256(srcptr, v010_offset, v010_in_range); - __m256 v011_val = mask_gather_ps256(srcptr, v011_offset, v011_in_range); - __m256 v100_val = 
mask_gather_ps256(srcptr, v100_offset, v100_in_range); - __m256 v101_val = mask_gather_ps256(srcptr, v101_offset, v101_in_range); - __m256 v110_val = mask_gather_ps256(srcptr, v110_offset, v110_in_range); - __m256 v111_val = mask_gather_ps256(srcptr, v111_offset, v111_in_range); + __m256 v000_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_000), _mm256_set1_ps(-1.0f)); + __m256 v001_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_001), _mm256_set1_ps(-1.0f)); + __m256 v010_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_010), _mm256_set1_ps(-1.0f)); + __m256 v011_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_011), _mm256_set1_ps(-1.0f)); + __m256 v100_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_100), _mm256_set1_ps(-1.0f)); + __m256 v101_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_101), _mm256_set1_ps(-1.0f)); + __m256 v110_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_110), _mm256_set1_ps(-1.0f)); + __m256 v111_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_111), _mm256_set1_ps(-1.0f)); + + __m256 v000_val = mask_gather_ps256(srcptr, v000_offset, v000_in_bound); + __m256 v001_val = mask_gather_ps256(srcptr, v001_offset, v001_in_bound); + __m256 v010_val = mask_gather_ps256(srcptr, v010_offset, v010_in_bound); + __m256 v011_val = mask_gather_ps256(srcptr, v011_offset, v011_in_bound); + __m256 v100_val = mask_gather_ps256(srcptr, v100_offset, v100_in_bound); + __m256 v101_val = mask_gather_ps256(srcptr, v101_offset, v101_in_bound); + __m256 v110_val = mask_gather_ps256(srcptr, v110_offset, v110_in_bound); + __m256 v111_val = mask_gather_ps256(srcptr, v111_offset, v111_in_bound); __m256 alpha = _mm256_loadu_ps(value_ptr_alpha); __m256 beta = _mm256_loadu_ps(value_ptr_beta); @@ -2259,16 +1364,6 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d offset_ptr_110 += 8; offset_ptr_111 += 8; - in_bound_ptr_000 += 8; - in_bound_ptr_001 += 8; - in_bound_ptr_010 += 8; - in_bound_ptr_011 += 8; - - in_bound_ptr_100 += 8; - in_bound_ptr_101 += 8; - in_bound_ptr_110 += 8; - in_bound_ptr_111 += 8; - value_ptr_alpha += 8; value_ptr_beta += 8; value_ptr_gamma += 8; @@ -2288,23 +1383,23 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d __m128i v110_offset = _mm_set_epi32(*(offset_ptr_110 + 3), *(offset_ptr_110 + 2), *(offset_ptr_110 + 1), *offset_ptr_110); __m128i v111_offset = _mm_set_epi32(*(offset_ptr_111 + 3), *(offset_ptr_111 + 2), *(offset_ptr_111 + 1), *offset_ptr_111); - __m128 v000_in_range = _mm_loadu_ps(in_bound_ptr_000); - __m128 v001_in_range = _mm_loadu_ps(in_bound_ptr_001); - __m128 v010_in_range = _mm_loadu_ps(in_bound_ptr_010); - __m128 v011_in_range = _mm_loadu_ps(in_bound_ptr_011); - __m128 v100_in_range = _mm_loadu_ps(in_bound_ptr_100); - __m128 v101_in_range = _mm_loadu_ps(in_bound_ptr_101); - __m128 v110_in_range = _mm_loadu_ps(in_bound_ptr_110); - __m128 v111_in_range = _mm_loadu_ps(in_bound_ptr_111); - - __m128 v000_val = mask_gather_ps(srcptr, v000_offset, v000_in_range); - __m128 v001_val = mask_gather_ps(srcptr, v001_offset, v001_in_range); - __m128 v010_val = mask_gather_ps(srcptr, v010_offset, v010_in_range); - __m128 v011_val = mask_gather_ps(srcptr, v011_offset, v011_in_range); - __m128 v100_val = mask_gather_ps(srcptr, v100_offset, v100_in_range); - __m128 v101_val = mask_gather_ps(srcptr, v101_offset, v101_in_range); - __m128 v110_val = mask_gather_ps(srcptr, v110_offset, v110_in_range); - __m128 v111_val = mask_gather_ps(srcptr, v111_offset, 
v111_in_range); + __m128 v000_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_000), _mm_set1_ps(-1.0f)); + __m128 v001_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_001), _mm_set1_ps(-1.0f)); + __m128 v010_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_010), _mm_set1_ps(-1.0f)); + __m128 v011_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_011), _mm_set1_ps(-1.0f)); + __m128 v100_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_100), _mm_set1_ps(-1.0f)); + __m128 v101_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_101), _mm_set1_ps(-1.0f)); + __m128 v110_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_110), _mm_set1_ps(-1.0f)); + __m128 v111_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_111), _mm_set1_ps(-1.0f)); + + __m128 v000_val = mask_gather_ps(srcptr, v000_offset, v000_in_bound); + __m128 v001_val = mask_gather_ps(srcptr, v001_offset, v001_in_bound); + __m128 v010_val = mask_gather_ps(srcptr, v010_offset, v010_in_bound); + __m128 v011_val = mask_gather_ps(srcptr, v011_offset, v011_in_bound); + __m128 v100_val = mask_gather_ps(srcptr, v100_offset, v100_in_bound); + __m128 v101_val = mask_gather_ps(srcptr, v101_offset, v101_in_bound); + __m128 v110_val = mask_gather_ps(srcptr, v110_offset, v110_in_bound); + __m128 v111_val = mask_gather_ps(srcptr, v111_offset, v111_in_bound); __m128 alpha = _mm_loadu_ps(value_ptr_alpha); __m128 beta = _mm_loadu_ps(value_ptr_beta); @@ -2331,16 +1426,6 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d offset_ptr_110 += 4; offset_ptr_111 += 4; - in_bound_ptr_000 += 4; - in_bound_ptr_001 += 4; - in_bound_ptr_010 += 4; - in_bound_ptr_011 += 4; - - in_bound_ptr_100 += 4; - in_bound_ptr_101 += 4; - in_bound_ptr_110 += 4; - in_bound_ptr_111 += 4; - value_ptr_alpha += 4; value_ptr_beta += 4; value_ptr_gamma += 4; @@ -2350,15 +1435,15 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d #endif // __SSE2__ for (; x < grid_size; x++) { - float v000 = *reinterpret_cast(in_bound_ptr_000) < 0 ? *(srcptr + static_cast(*offset_ptr_000)) : 0; - float v001 = *reinterpret_cast(in_bound_ptr_001) < 0 ? *(srcptr + static_cast(*offset_ptr_001)) : 0; - float v010 = *reinterpret_cast(in_bound_ptr_010) < 0 ? *(srcptr + static_cast(*offset_ptr_010)) : 0; - float v011 = *reinterpret_cast(in_bound_ptr_011) < 0 ? *(srcptr + static_cast(*offset_ptr_011)) : 0; - - float v100 = *reinterpret_cast(in_bound_ptr_100) < 0 ? *(srcptr + static_cast(*offset_ptr_100)) : 0; - float v101 = *reinterpret_cast(in_bound_ptr_101) < 0 ? *(srcptr + static_cast(*offset_ptr_101)) : 0; - float v110 = *reinterpret_cast(in_bound_ptr_110) < 0 ? *(srcptr + static_cast(*offset_ptr_110)) : 0; - float v111 = *reinterpret_cast(in_bound_ptr_111) < 0 ? *(srcptr + static_cast(*offset_ptr_111)) : 0; + float v000 = *reinterpret_cast(offset_ptr_000) >= 0 ? *(srcptr + static_cast(*offset_ptr_000)) : 0; + float v001 = *reinterpret_cast(offset_ptr_001) >= 0 ? *(srcptr + static_cast(*offset_ptr_001)) : 0; + float v010 = *reinterpret_cast(offset_ptr_010) >= 0 ? *(srcptr + static_cast(*offset_ptr_010)) : 0; + float v011 = *reinterpret_cast(offset_ptr_011) >= 0 ? *(srcptr + static_cast(*offset_ptr_011)) : 0; + + float v100 = *reinterpret_cast(offset_ptr_100) >= 0 ? *(srcptr + static_cast(*offset_ptr_100)) : 0; + float v101 = *reinterpret_cast(offset_ptr_101) >= 0 ? *(srcptr + static_cast(*offset_ptr_101)) : 0; + float v110 = *reinterpret_cast(offset_ptr_110) >= 0 ? 
*(srcptr + static_cast(*offset_ptr_110)) : 0; + float v111 = *reinterpret_cast(offset_ptr_111) >= 0 ? *(srcptr + static_cast(*offset_ptr_111)) : 0; float v00 = v000 * (1 - *value_ptr_alpha) + v001 * *value_ptr_alpha; float v01 = v010 * (1 - *value_ptr_alpha) + v011 * *value_ptr_alpha; @@ -2380,16 +1465,6 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d offset_ptr_110++; offset_ptr_111++; - in_bound_ptr_000++; - in_bound_ptr_001++; - in_bound_ptr_010++; - in_bound_ptr_011++; - - in_bound_ptr_100++; - in_bound_ptr_101++; - in_bound_ptr_110++; - in_bound_ptr_111++; - value_ptr_alpha++; value_ptr_beta++; value_ptr_gamma++; diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index 6cab27879b1..a2f6a9dc01f 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -13,93 +13,33 @@ // specific language governing permissions and limitations under the License. template -struct gridsample_2d_nearest_compute_blob +void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& offset, Mat& value, int permute_fusion, const Option& opt) { - void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) - { - const int grid_size = grid.w * grid.h; - - float* offset_ptr = offset.channel(0); - - grid_sample_unormalize unormalize; - compute_coord get_coord; - - if (permute_fusion == 0) - { - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int x = 0; -#if __AVX__ - for (; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr); - __m256 gy = _mm256_loadu_ps(gridptr + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - gx = unormalize(_mm256_set1_ps(src.w), gx); - gx = get_coord(_mm256_set1_ps(src.w), gx); - - gy = unormalize(_mm256_set1_ps(src.h), gy); - gy = get_coord(_mm256_set1_ps(src.h), gy); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + const int grid_size = grid.w * grid.h; - volatile float epack = src.elempack; - __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx), _mm256_set1_ps(epack)); + float* offset_ptr = offset.channel(0); - _mm256_storeu_ps(offset_ptr, offset); + grid_sample_unormalize unormalize; + compute_coord get_coord; - gridptr += 16; - - offset_ptr += 8; - } - -#endif // __AVX__ - - for (; x < grid_size; x += 2) - { - float sample_x = *gridptr; - float sample_y = *(gridptr + 1); - - sample_x = unormalize(src.w, sample_x); - sample_x = get_coord(src.w, sample_x); - - sample_y = unormalize(src.h, sample_y); - sample_y = get_coord(src.h, sample_y); - - int x0 = static_cast(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); - - *offset_ptr = (x0 + y0 * src.w) * src.elempack; - - gridptr += 2; - - offset_ptr++; - } - } - } - else + if (permute_fusion == 0) + { + for (int y = 0; y < grid.c; y++) { - const float* gridptr_x = grid.channel(0); - const float* gridptr_y = grid.channel(1); - + const float* gridptr = grid.channel(y); int x = 0; #if __AVX__ - for (; x + 7 < grid_size; x += 8) + for (; x + 15 < grid_size; x += 16) { - __m256 gx 
= _mm256_loadu_ps(gridptr_x); - __m256 gy = _mm256_loadu_ps(gridptr_y); + __m256 tmp_x = _mm256_loadu_ps(gridptr); + __m256 gy = _mm256_loadu_ps(gridptr + 8); + + __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); + gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); + tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + + gx = _mm256_shuffle_ps(gx, gy, 0b10001000); + gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); // compute coord { @@ -113,23 +53,26 @@ struct gridsample_2d_nearest_compute_blob gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx, _CMP_GT_OS)), + _mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS))); + volatile float epack = src.elempack; __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx), _mm256_set1_ps(epack)); - _mm256_storeu_ps(offset_ptr, offset); + offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), offset, v_in_range); - gridptr_x += 8; - gridptr_y += 8; + _mm256_storeu_ps(offset_ptr, offset); + gridptr += 16; offset_ptr += 8; } #endif // __AVX__ - for (; x < grid_size; x++) + for (; x < grid_size; x += 2) { - float sample_x = *gridptr_x; - float sample_y = *gridptr_y; + float sample_x = *gridptr; + float sample_y = *(gridptr + 1); sample_x = unormalize(src.w, sample_x); sample_x = get_coord(src.w, sample_x); @@ -140,267 +83,114 @@ struct gridsample_2d_nearest_compute_blob int x0 = static_cast(floor(sample_x + 0.5f)); int y0 = static_cast(floor(sample_y + 0.5f)); - *offset_ptr = (x0 + y0 * src.w) * src.elempack; - - gridptr_x++; - gridptr_y++; + bool in_bound = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h)); + *offset_ptr = in_bound ? 
(x0 + y0 * src.w) * src.elempack : -1.0f; + gridptr += 2; offset_ptr++; } } } -}; - -template -struct gridsample_2d_nearest_compute_blob -{ - void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) + else { - const int grid_size = grid.w * grid.h; - - float* offset_ptr = offset.channel(0); - - float* in_bound_ptr = in_bound.channel(0); + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); - grid_sample_unormalize unormalize; - - if (permute_fusion == 0) - { - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int x = 0; + int x = 0; #if __AVX__ - for (; x + 15 < grid_size; x += 16) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr); - __m256 gy = _mm256_loadu_ps(gridptr + 8); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - gx = unormalize(_mm256_set1_ps(src.w), gx); - gy = unormalize(_mm256_set1_ps(src.h), gy); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - - __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx, _CMP_GT_OS)), - _mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS))); - - volatile float epack = src.elempack; - __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx), _mm256_set1_ps(epack)); - - _mm256_storeu_ps(in_bound_ptr, v_in_range); - _mm256_storeu_ps(offset_ptr, offset); - - gridptr += 16; - offset_ptr += 8; - in_bound_ptr += 8; - } - -#endif // __AVX__ - - for (; x < grid_size; x += 2) - { - float sample_x = *gridptr; - float sample_y = *(gridptr + 1); - - sample_x = unormalize(src.w, sample_x); - - sample_y = unormalize(src.h, sample_y); - - int x0 = static_cast(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); - - *in_bound_ptr = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h)) ? 
-1.0f : 0.0f; - *offset_ptr = (x0 + y0 * src.w) * src.elempack; - - gridptr += 2; - offset_ptr++; - in_bound_ptr++; - } - } - } - else + for (; x + 7 < grid_size; x += 8) { - const float* gridptr_x = grid.channel(0); - const float* gridptr_y = grid.channel(1); + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); - int x = 0; -#if __AVX__ - for (; x + 7 < grid_size; x += 8) + // compute coord { - __m256 gx = _mm256_loadu_ps(gridptr_x); - __m256 gy = _mm256_loadu_ps(gridptr_y); + gx = unormalize(_mm256_set1_ps(src.w), gx); + gx = get_coord(_mm256_set1_ps(src.w), gx); - // compute coord - { - gx = unormalize(_mm256_set1_ps(src.w), gx); - gy = unormalize(_mm256_set1_ps(src.h), gy); - } + gy = unormalize(_mm256_set1_ps(src.h), gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); + } - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx, _CMP_GT_OS)), - _mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS))); + __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx, _CMP_GT_OS)), + _mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS))); - volatile float epack = src.elempack; - __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx), _mm256_set1_ps(epack)); + volatile float epack = src.elempack; + __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx), _mm256_set1_ps(epack)); - _mm256_storeu_ps(in_bound_ptr, v_in_range); - _mm256_storeu_ps(offset_ptr, offset); + offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), offset, v_in_range); - gridptr_x += 8; - gridptr_y += 8; - offset_ptr += 8; - in_bound_ptr += 8; - } + _mm256_storeu_ps(offset_ptr, offset); + + gridptr_x += 8; + gridptr_y += 8; + offset_ptr += 8; + } #endif // __AVX__ - for (; x < grid_size; x++) - { - float sample_x = *gridptr_x; - float sample_y = *gridptr_y; + for (; x < grid_size; x++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; - sample_x = unormalize(src.w, sample_x); - sample_y = unormalize(src.h, sample_y); + sample_x = unormalize(src.w, sample_x); + sample_x = get_coord(src.w, sample_x); - int x0 = static_cast(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); + sample_y = unormalize(src.h, sample_y); + sample_y = get_coord(src.h, sample_y); - *in_bound_ptr = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h)) ? -1.0f : 0.0f; + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); - *offset_ptr = (x0 + y0 * src.w) * src.elempack; + bool in_bound = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h)); - gridptr_x++; - gridptr_y++; + *offset_ptr = in_bound ? 
(x0 + y0 * src.w) * src.elempack : -1.0f; - offset_ptr++; + gridptr_x++; + gridptr_y++; - in_bound_ptr++; - } + offset_ptr++; } } -}; +} template -struct gridsample_3d_nearest_compute_blob +void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& offset, Mat& value, int permute_fusion, const Option& opt) { - void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) - { - const int grid_size = grid.w * grid.h * grid.d; + const int grid_size = grid.w * grid.h * grid.d; - float* offset_ptr = offset.channel(0); + float* offset_ptr = offset.channel(0); - grid_sample_unormalize unormalize; - compute_coord get_coord; + grid_sample_unormalize unormalize; + compute_coord get_coord; - if (permute_fusion == 0) + if (permute_fusion == 0) + { + for (int y = 0; y < grid.c; y++) { - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int x = 0; + const float* gridptr = grid.channel(y); + int x = 0; #if __AVX__ - for (; x + 23 < grid_size; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr); - __m256 tmp_y = _mm256_loadu_ps(gridptr + 8); - __m256 gz = _mm256_loadu_ps(gridptr + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - gx = unormalize(_mm256_set1_ps(src.w), gx); - gx = get_coord(_mm256_set1_ps(src.w), gx); - - gy = unormalize(_mm256_set1_ps(src.h), gy); - gy = get_coord(_mm256_set1_ps(src.h), gy); - - gz = unormalize(_mm256_set1_ps(src.d), gz); - gz = get_coord(_mm256_set1_ps(src.d), gz); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - - volatile float epack = src.elempack; - volatile float sw = src.w; - volatile float sh = src.h; - __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(sw), _mm256_set1_ps(sh)), gz, - _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(sw), gx)), - _mm256_set1_ps(epack)); - - _mm256_storeu_ps(offset_ptr, offset); - - gridptr += 24; - - offset_ptr += 8; - } - -#endif // __AVX__ - - for (; x < grid_size; x += 3) - { - float sample_x = *gridptr; - float sample_y = *(gridptr + 1); - float sample_z = *(gridptr + 2); - - sample_x = unormalize(src.w, sample_x); - sample_x = get_coord(src.w, sample_x); - - sample_y = unormalize(src.h, sample_y); - sample_y = get_coord(src.h, sample_y); - - sample_z = unormalize(src.d, sample_z); - sample_z = get_coord(src.d, sample_z); - - int x0 = static_cast(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); - int z0 = static_cast(floor(sample_z + 0.5f)); - - *offset_ptr = (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack; + for (; x + 23 < grid_size; x += 24) + { + __m256 tmp_x = _mm256_loadu_ps(gridptr); + __m256 tmp_y = _mm256_loadu_ps(gridptr + 8); + __m256 gz = _mm256_loadu_ps(gridptr + 16); - gridptr += 3; + __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); + __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); + gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - offset_ptr++; - } - } - } - 
else - { - const float* gridptr_x = grid.channel(0); - const float* gridptr_y = grid.channel(1); - const float* gridptr_z = grid.channel(2); + tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); + tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - int x = 0; -#if __AVX__ - for (; x + 7 < grid_size; x += 8) - { - __m256 gx = _mm256_loadu_ps(gridptr_x); - __m256 gy = _mm256_loadu_ps(gridptr_y); - __m256 gz = _mm256_loadu_ps(gridptr_z); + gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); + gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); + gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); // compute coord { @@ -418,29 +208,32 @@ struct gridsample_3d_nearest_compute_blob gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); + __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx, _CMP_GT_OS)), + _mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS))); + v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), gz, _CMP_GT_OS))); + volatile float epack = src.elempack; volatile float sw = src.w; volatile float sh = src.h; __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(sw), _mm256_set1_ps(sh)), gz, - _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(sw), gx)), + _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(sw), gx)), _mm256_set1_ps(epack)); - _mm256_storeu_ps(offset_ptr, offset); + offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), offset, v_in_range); - gridptr_x += 8; - gridptr_y += 8; - gridptr_z += 8; + _mm256_storeu_ps(offset_ptr, offset); + gridptr += 24; offset_ptr += 8; } #endif // __AVX__ - for (; x < grid_size; x++) + for (; x < grid_size; x += 3) { - float sample_x = *gridptr_x; - float sample_y = *gridptr_y; - float sample_z = *gridptr_z; + float sample_x = *gridptr; + float sample_y = *(gridptr + 1); + float sample_z = *(gridptr + 2); sample_x = unormalize(src.w, sample_x); sample_x = get_coord(src.w, sample_x); @@ -455,193 +248,105 @@ struct gridsample_3d_nearest_compute_blob int y0 = static_cast(floor(sample_y + 0.5f)); int z0 = static_cast(floor(sample_z + 0.5f)); - *offset_ptr = (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack; + bool in_bound = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h) & (z0 > -1) & (z0 < src.d)); - gridptr_x++; - gridptr_y++; - gridptr_z++; + *offset_ptr = in_bound ? 
(x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; + gridptr += 3; offset_ptr++; } } } -}; - -template -struct gridsample_3d_nearest_compute_blob -{ - void operator()(const Mat& src, const Mat& grid, Mat& offset, Mat& in_bound, Mat& value, int permute_fusion, const Option& opt) + else { - const int grid_size = grid.w * grid.h * grid.d; - - float* offset_ptr = offset.channel(0); - - float* in_bound_ptr = in_bound.channel(0); - - grid_sample_unormalize unormalize; + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); + const float* gridptr_z = grid.channel(2); - if (permute_fusion == 0) - { - for (int y = 0; y < grid.c; y++) - { - const float* gridptr = grid.channel(y); - int x = 0; + int x = 0; #if __AVX__ - for (; x + 23 < grid_size; x += 24) - { - __m256 tmp_x = _mm256_loadu_ps(gridptr); - __m256 tmp_y = _mm256_loadu_ps(gridptr + 8); - __m256 gz = _mm256_loadu_ps(gridptr + 16); - - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - - // compute coord - { - gx = unormalize(_mm256_set1_ps(src.w), gx); - gy = unormalize(_mm256_set1_ps(src.h), gy); - gz = unormalize(_mm256_set1_ps(src.d), gz); - } - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - - __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx, _CMP_GT_OS)), - _mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS))); - v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), gz, _CMP_GT_OS))); - - volatile float epack = src.elempack; - volatile float sw = src.w; - volatile float sh = src.h; - __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(sw), _mm256_set1_ps(sh)), gz, - _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(sw), gx)), - _mm256_set1_ps(epack)); - - _mm256_storeu_ps(in_bound_ptr, v_in_range); - _mm256_storeu_ps(offset_ptr, offset); - - gridptr += 24; - offset_ptr += 8; - in_bound_ptr += 8; - } - -#endif // __AVX__ - - for (; x < grid_size; x += 3) - { - float sample_x = *gridptr; - float sample_y = *(gridptr + 1); - float sample_z = *(gridptr + 2); - - sample_x = unormalize(src.w, sample_x); - sample_y = unormalize(src.h, sample_y); - sample_z = unormalize(src.d, sample_z); + for (; x + 7 < grid_size; x += 8) + { + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); + __m256 gz = _mm256_loadu_ps(gridptr_z); - int x0 = static_cast(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); - int z0 = static_cast(floor(sample_z + 0.5f)); + // compute coord= + { + gx = unormalize(_mm256_set1_ps(src.w), gx); + gx = get_coord(_mm256_set1_ps(src.w), gx); - *in_bound_ptr = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h) & (z0 > -1) & (z0 < src.d)) ? 
-1.0f : 0.0f; - *offset_ptr = (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack; + gy = unormalize(_mm256_set1_ps(src.h), gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); - gridptr += 3; - offset_ptr++; - in_bound_ptr++; - } + gz = unormalize(_mm256_set1_ps(src.d), gz); + gz = get_coord(_mm256_set1_ps(src.d), gz); } - } - else - { - const float* gridptr_x = grid.channel(0); - const float* gridptr_y = grid.channel(1); - const float* gridptr_z = grid.channel(2); - int x = 0; -#if __AVX__ - for (; x + 7 < grid_size; x += 8) - { - __m256 gx = _mm256_loadu_ps(gridptr_x); - __m256 gy = _mm256_loadu_ps(gridptr_y); - __m256 gz = _mm256_loadu_ps(gridptr_z); + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - // compute coord= - { - gx = unormalize(_mm256_set1_ps(src.w), gx); - gy = unormalize(_mm256_set1_ps(src.h), gy); - gz = unormalize(_mm256_set1_ps(src.d), gz); - - gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); - gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); - gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); - } + __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx, _CMP_GT_OS)), + _mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS))); + v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), gz, _CMP_GT_OS))); - __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx, _CMP_GT_OS)), - _mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS))); - v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), gz, _CMP_GT_OS))); + volatile float epack = src.elempack; + volatile float sw = src.w; + volatile float sh = src.h; + __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(sw), _mm256_set1_ps(sh)), gz, + _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(sw), gx)), + _mm256_set1_ps(epack)); - volatile float epack = src.elempack; - volatile float sw = src.w; - volatile float sh = src.h; - __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(sw), _mm256_set1_ps(sh)), gz, - _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(sw), gx)), - _mm256_set1_ps(epack)); + offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), offset, v_in_range); - _mm256_storeu_ps(in_bound_ptr, v_in_range); - _mm256_storeu_ps(offset_ptr, offset); + _mm256_storeu_ps(offset_ptr, offset); - gridptr_x += 8; - gridptr_y += 8; - gridptr_z += 8; + gridptr_x += 8; + gridptr_y += 8; + gridptr_z += 8; - offset_ptr += 8; - in_bound_ptr += 8; - } + offset_ptr += 8; + } #endif // __AVX__ - for (; x < grid_size; x++) - { - float sample_x = *gridptr_x; - float sample_y = *gridptr_y; - float sample_z = *gridptr_z; + for (; x < grid_size; x++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; + float sample_z = *gridptr_z; - sample_x = unormalize(src.w, sample_x); - sample_y = unormalize(src.h, sample_y); - sample_z = unormalize(src.d, sample_z); + sample_x = unormalize(src.w, sample_x); + sample_x = get_coord(src.w, sample_x); - int x0 = 
static_cast(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); - int z0 = static_cast(floor(sample_z + 0.5f)); + sample_y = unormalize(src.h, sample_y); + sample_y = get_coord(src.h, sample_y); - *in_bound_ptr = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h) & (z0 > -1) & (z0 < src.d)) ? -1.0f : 0.0f; + sample_z = unormalize(src.d, sample_z); + sample_z = get_coord(src.d, sample_z); - *offset_ptr = (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack; + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); + int z0 = static_cast(floor(sample_z + 0.5f)); - gridptr_x++; - gridptr_y++; - gridptr_z++; + bool in_bound = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h) & (z0 > -1) & (z0 < src.d)); - offset_ptr++; + *offset_ptr = in_bound ? (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; - in_bound_ptr++; - } + gridptr_x++; + gridptr_y++; + gridptr_z++; + + offset_ptr++; } } -}; +} #if __SSE2__ #if __AVX__ #if __AVX512F__ -static void gridsample_nearest_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Option& opt) +static void gridsample_nearest_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset, const Option& opt) { const int channels = dst.c; const int outw = dst.w; @@ -657,22 +362,19 @@ static void gridsample_nearest_apply_interpolation_p16(const Mat& src, Mat& dst, const float* offset_ptr = offset.channel(0); - const float* in_bound_ptr = in_bound.channel(0); - for (int i = 0; i < grid_size; i++) { - __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(in_bound_ptr) < 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); _mm512_storeu_ps(dstptr, _v); offset_ptr++; - in_bound_ptr++; dstptr += 16; } } } #endif // __AVX512F__ -static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Option& opt) +static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Option& opt) { const int channels = dst.c; const int outw = dst.w; @@ -688,27 +390,25 @@ static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, const float* offset_ptr = offset.channel(0); - const float* in_bound_ptr = in_bound.channel(0); - for (int i = 0; i < grid_size; i++) { + float in_bound = *reinterpret_cast(offset_ptr) >= 0 ? 
-1.0f : 0.0f; #if __AVX2__ __m256i _offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); #else __m256i _offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); #endif // __AVX2__ - __m256 _v = mask_gather_ps256(srcptr, _offset, _mm256_set1_ps(*in_bound_ptr)); + __m256 _v = mask_gather_ps256(srcptr, _offset, _mm256_set1_ps(in_bound)); _mm256_storeu_ps(dstptr, _v); offset_ptr++; - in_bound_ptr++; dstptr += 8; } } } #endif // __AVX__ -static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Option& opt) +static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset, const Option& opt) { const int channels = dst.c; const int outw = dst.w; @@ -724,16 +424,14 @@ static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, const float* offset_ptr = offset.channel(0); - const float* in_bound_ptr = in_bound.channel(0); - for (int i = 0; i < grid_size; i++) { - __m128 _v = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*offset_ptr), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(*in_bound_ptr)); + float in_bound = *reinterpret_cast(offset_ptr) >= 0 ? -1.0f : 0.0f; + __m128 _v = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*offset_ptr), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(in_bound)); _mm_storeu_ps(dstptr, _v); offset_ptr++; - in_bound_ptr++; dstptr += 4; } } @@ -741,7 +439,7 @@ static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, #endif // __SSE2__ -static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset, const Mat& in_bound, const Option& opt) +static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset, const Option& opt) { const int channels = dst.c; const int outw = dst.w; @@ -757,38 +455,35 @@ static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, const float* offset_ptr = offset.channel(0); - const float* in_bound_ptr = in_bound.channel(0); - int x = 0; #if __SSE2__ #if __AVX__ for (; x + 7 < grid_size; x += 8) { - __m256 _v = mask_gather_ps256(srcptr, _mm256_set_epi32(*(offset_ptr + 7), *(offset_ptr + 6), *(offset_ptr + 5), *(offset_ptr + 4), *(offset_ptr + 3), *(offset_ptr + 2), *(offset_ptr + 1), *offset_ptr), _mm256_loadu_ps(in_bound_ptr)); + __m256 in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr), _mm256_set1_ps(-1.0f)); + __m256 _v = mask_gather_ps256(srcptr, _mm256_set_epi32(*(offset_ptr + 7), *(offset_ptr + 6), *(offset_ptr + 5), *(offset_ptr + 4), *(offset_ptr + 3), *(offset_ptr + 2), *(offset_ptr + 1), *offset_ptr), in_bound); _mm256_storeu_ps(dstptr, _v); offset_ptr += 8; - in_bound_ptr += 8; dstptr += 8; } #endif // __AVX__ for (; x + 3 < grid_size; x += 4) { - __m128 _v = mask_gather_ps(srcptr, _mm_set_epi32(*(offset_ptr + 3), *(offset_ptr + 2), *(offset_ptr + 1), *offset_ptr), _mm_loadu_ps(in_bound_ptr)); + __m128 in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr), _mm_set1_ps(-1.0f)); + __m128 _v = mask_gather_ps(srcptr, _mm_set_epi32(*(offset_ptr + 3), *(offset_ptr + 2), *(offset_ptr + 1), *offset_ptr), in_bound); _mm_storeu_ps(dstptr, _v); offset_ptr += 4; - in_bound_ptr += 4; dstptr += 4; } #endif // __SSE2__ for (; x < grid_size; x++) { - *dstptr = *reinterpret_cast(in_bound_ptr) < 0 ? *(srcptr + static_cast(*offset_ptr)) : 0; + *dstptr = *reinterpret_cast(offset_ptr) >= 0 ? 
*(srcptr + static_cast(*offset_ptr)) : 0; - in_bound_ptr++; offset_ptr++; dstptr++; } diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index eb5671fe062..0501de69ca3 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -136,7 +136,19 @@ struct grid_sample_unormalize }; template -struct compute_coord; +struct compute_coord +{ +#if __AVX__ + __m256 operator()(__m256 length, __m256 coord) + { + return coord; + } +#endif // __AVX__ + float operator()(int length, float coord) + { + return coord; + } +}; template struct compute_coord @@ -235,7 +247,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bilinear_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } } else if (padding_mode == GridSample::Padding_BORDER) { if (align_corner == 0) { - gridsample_2d_bilinear_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bilinear_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } } else if (padding_mode == GridSample::Padding_REFLECTION) { if (align_corner == 0) { - gridsample_2d_bilinear_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bilinear_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } } else @@ -315,49 +318,40 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_nearest_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } } else if (padding_mode == GridSample::Padding_BORDER) { if (align_corner == 0) { - gridsample_2d_nearest_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_nearest_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } } else if (padding_mode == GridSample::Padding_REFLECTION) { if (align_corner == 0) { - gridsample_2d_nearest_compute_blob op; - 
op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_nearest_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } } else @@ -370,48 +364,41 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bicubic_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } } else if (padding_mode == GridSample::Padding_BORDER) { if (align_corner == 0) { - gridsample_2d_bicubic_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bicubic_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } } else if (padding_mode == GridSample::Padding_REFLECTION) { if (align_corner == 0) { - gridsample_2d_bicubic_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } else { - gridsample_2d_bicubic_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } } else @@ -435,50 +422,41 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_bilinear_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } } else if (padding_mode == GridSample::Padding_BORDER) { if (align_corner == 0) { - gridsample_3d_bilinear_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_bilinear_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } } else if (padding_mode == GridSample::Padding_REFLECTION) { if (align_corner == 0) { - gridsample_3d_bilinear_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, 
permute_fusion, opt); } else { - gridsample_3d_bilinear_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } } else @@ -491,49 +469,40 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_nearest_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } } else if (padding_mode == GridSample::Padding_BORDER) { if (align_corner == 0) { - gridsample_3d_nearest_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_nearest_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } } else if (padding_mode == GridSample::Padding_REFLECTION) { if (align_corner == 0) { - gridsample_3d_nearest_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } else { - gridsample_3d_nearest_compute_blob op; - op(bottom_blob, grid_p1, offset_blob, in_bound_blob, value_blob, permute_fusion, opt); + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); } } else @@ -559,26 +528,26 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Fri, 28 Apr 2023 07:55:46 +0000 Subject: [PATCH 101/127] apply code-format changes --- src/layer/x86/gridsample_bilinear_compute_blob.h | 6 +++--- src/layer/x86/gridsample_nearest_compute_blob.h | 4 ++-- src/layer/x86/gridsample_x86.cpp | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 33e6f122696..0c0fa91e4e4 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -366,7 +366,7 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o volatile float epack = src.elempack; __m256 tnw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), z_t, - _mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w)), + _mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w)), _mm256_set1_ps(epack)); __m256 tne_offset = _mm256_add_ps(tnw_offset, _mm256_set1_ps(epack)); __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack))); @@ -559,7 +559,7 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o volatile float epack = src.elempack; __m256 tnw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), z_t, - 
_mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w)), + _mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w)), _mm256_set1_ps(epack)); __m256 tne_offset = _mm256_add_ps(tnw_offset, _mm256_set1_ps(epack)); __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack))); @@ -1439,7 +1439,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d float v001 = *reinterpret_cast(offset_ptr_001) >= 0 ? *(srcptr + static_cast(*offset_ptr_001)) : 0; float v010 = *reinterpret_cast(offset_ptr_010) >= 0 ? *(srcptr + static_cast(*offset_ptr_010)) : 0; float v011 = *reinterpret_cast(offset_ptr_011) >= 0 ? *(srcptr + static_cast(*offset_ptr_011)) : 0; - + float v100 = *reinterpret_cast(offset_ptr_100) >= 0 ? *(srcptr + static_cast(*offset_ptr_100)) : 0; float v101 = *reinterpret_cast(offset_ptr_101) >= 0 ? *(srcptr + static_cast(*offset_ptr_101)) : 0; float v110 = *reinterpret_cast(offset_ptr_110) >= 0 ? *(srcptr + static_cast(*offset_ptr_110)) : 0; diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index a2f6a9dc01f..6e8cc65199d 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -216,7 +216,7 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of volatile float sw = src.w; volatile float sh = src.h; __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(sw), _mm256_set1_ps(sh)), gz, - _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(sw), gx)), + _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(sw), gx)), _mm256_set1_ps(epack)); offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), offset, v_in_range); @@ -295,7 +295,7 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of volatile float sw = src.w; volatile float sh = src.h; __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(sw), _mm256_set1_ps(sh)), gz, - _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(sw), gx)), + _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(sw), gx)), _mm256_set1_ps(epack)); offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), offset, v_in_range); diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 0501de69ca3..8eeab610244 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -423,7 +423,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector Date: Thu, 4 May 2023 20:42:06 +0800 Subject: [PATCH 102/127] apply avx2 --- .../x86/gridsample_apply_interpolation.h | 71 ++ .../gridsample_bicubic_apply_interpolation.h | 491 ++++++++++ .../x86/gridsample_bicubic_compute_blob.h | 448 --------- .../gridsample_bilinear_apply_interpolation.h | 848 ++++++++++++++++++ .../x86/gridsample_bilinear_compute_blob.h | 776 ---------------- src/layer/x86/gridsample_compute_blob.h | 153 ++++ .../gridsample_nearest_apply_interpolation.h | 191 ++++ .../x86/gridsample_nearest_compute_blob.h | 147 --- src/layer/x86/gridsample_x86.cpp | 208 +---- src/layer/x86/gridsample_x86_avx2.cpp | 75 ++ 10 files changed, 1836 insertions(+), 1572 deletions(-) create mode 100644 src/layer/x86/gridsample_apply_interpolation.h create mode 100644 src/layer/x86/gridsample_bicubic_apply_interpolation.h create mode 100644 src/layer/x86/gridsample_bilinear_apply_interpolation.h create mode 100644 src/layer/x86/gridsample_compute_blob.h create mode 100644 src/layer/x86/gridsample_nearest_apply_interpolation.h create mode 
100644 src/layer/x86/gridsample_x86_avx2.cpp diff --git a/src/layer/x86/gridsample_apply_interpolation.h b/src/layer/x86/gridsample_apply_interpolation.h new file mode 100644 index 00000000000..abb457b874d --- /dev/null +++ b/src/layer/x86/gridsample_apply_interpolation.h @@ -0,0 +1,71 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#if __SSE2__ +#if __AVX__ +static __m256 mask_gather_ps256(const float* ptr, __m256i offset, __m256 mask) +{ +#if __AVX2__ + __m256 v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, offset, mask, sizeof(float)); +#else + int offseti[8], maski[8]; + memcpy(offseti, &offset, 8 * sizeof(int)); + memcpy(maski, &mask, 8 * sizeof(int)); + + float data[8] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + for (int i = 0; i < 8; i++) + { + if (maski[i] & 0xF0000000) + { + data[i] = *(ptr + offseti[i]); + } + } + + __m256 v = _mm256_loadu_ps(data); +#endif // __AVX2__ + + return v; +} + +#endif // __AVX__ + +static __m128 mask_gather_ps(const float* ptr, __m128i offset, __m128 mask) +{ +#if __AVX2__ + __m128 v = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, offset, mask, sizeof(float)); +#else + int offseti[4], maski[4]; + memcpy(offseti, &offset, 4 * sizeof(int)); + memcpy(maski, &mask, 4 * sizeof(int)); + + float data[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + for (int i = 0; i < 4; i++) + { + if (maski[i] & 0xF0000000) + { + data[i] = *(ptr + offseti[i]); + } + } + + __m128 v = _mm_loadu_ps(data); +#endif // __AVX__ + + return v; +} + +#endif // __SSE2__ + +#include "gridsample_bilinear_apply_interpolation.h" +#include "gridsample_bicubic_apply_interpolation.h" +#include "gridsample_nearest_apply_interpolation.h" \ No newline at end of file diff --git a/src/layer/x86/gridsample_bicubic_apply_interpolation.h b/src/layer/x86/gridsample_bicubic_apply_interpolation.h new file mode 100644 index 00000000000..cc4cc837899 --- /dev/null +++ b/src/layer/x86/gridsample_bicubic_apply_interpolation.h @@ -0,0 +1,491 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
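[editor's sketch, not part of the patch] The header added below reflects two ideas used throughout this refactor: the separate in_bound blob is gone, with compute_blob now writing -1.0f into the offset itself for out-of-bound taps so the sign of the stored offset doubles as the validity mask (the AVX paths rebuild a per-lane mask from that sign, e.g. via _mm256_andnot_ps against -1.0f), and cubic_interp1d_p16/p8/p4 evaluate cubic convolution weights with A = -0.75. A minimal scalar reference of both, assuming the same -1.0f convention; load_or_zero and cubic_interp1d_scalar are illustrative names, not functions from this patch:

static inline float load_or_zero(const float* srcptr, float offset)
{
    // -1.0f marks an out-of-bound tap; valid offsets are non-negative element indices
    return offset >= 0.f ? srcptr[static_cast<int>(offset)] : 0.f;
}

static inline void cubic_interp1d_scalar(float& c0, float& c1, float& c2, float& c3, float tx)
{
    // cubic convolution (Keys) weights for a fractional position tx in [0, 1)
    const float A = -0.75f;
    const float x0 = tx + 1.f; // |x| in (1, 2): A|x|^3 - 5A|x|^2 + 8A|x| - 4A
    const float x1 = tx;       // |x| in [0, 1): (A+2)|x|^3 - (A+3)|x|^2 + 1
    const float x2 = 1.f - tx;

    c0 = ((A * x0 - 5.f * A) * x0 + 8.f * A) * x0 - 4.f * A;
    c1 = (((A + 2.f) * x1 - (A + 3.f)) * x1) * x1 + 1.f;
    c2 = (((A + 2.f) * x2 - (A + 3.f)) * x2) * x2 + 1.f;
    c3 = 1.f - c0 - c1 - c2; // the four weights sum to 1
}

Per output pixel, the bicubic path then accumulates c0..c3 times four loads along x for each of the four rows, and applies the same weighting once more along y, which is exactly what the vectorized code below does lane by lane.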
+ +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ +static void cubic_interp1d_p16(__m512& coeffs0, __m512& coeffs1, __m512& coeffs2, __m512& coeffs3, const __m512& tx) +{ + const __m512 A = _mm512_set1_ps(-0.75f); + + const __m512 x0 = _mm512_add_ps(tx, *(__m512*)_ps512_1); + const __m512& x1 = tx; + const __m512 x2 = _mm512_sub_ps(*(__m512*)_ps512_1, tx); + //const __m512 x3 = _mm512_add_ps(x2, *(__m512*)_ps512_1); + + coeffs0 = _mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(A, x0), _mm512_mul_ps(_mm512_set1_ps(5.0f), A)), x0), _mm512_mul_ps(_mm512_set1_ps(8.0f), A)), x0), _mm512_mul_ps(_mm512_set1_ps(4), A)); + coeffs1 = _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(A, _mm512_set1_ps(2.0f)), x1), _mm512_add_ps(A, _mm512_set1_ps(3.0f))), x1), x1), *(__m512*)_ps512_1); + coeffs2 = _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(A, _mm512_set1_ps(2.0f)), x2), _mm512_add_ps(A, _mm512_set1_ps(3.0f))), x2), x2), *(__m512*)_ps512_1); + coeffs3 = _mm512_sub_ps(_mm512_sub_ps(_mm512_sub_ps(*(__m512*)_ps512_1, coeffs0), coeffs1), coeffs2); +} + +static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + + __m512 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; + __m512 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; + __m512 value_f[4]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; + + for (int i = 0; i < 4; i++) + { + v0_offset_ptr[i] = offset.channel(i * 4 + 0); + v1_offset_ptr[i] = offset.channel(i * 4 + 1); + v2_offset_ptr[i] = offset.channel(i * 4 + 2); + v3_offset_ptr[i] = offset.channel(i * 4 + 3); + } + + const float* value_x = value.channel(0); + const float* value_y = value.channel(1); + + for (int i = 0; i < grid_size; i++) + { + cubic_interp1d_p16(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm512_set1_ps(*value_x)); + for (int ii = 0; ii < 4; ii++) + { + __m512 x0_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v0_offset_ptr[ii]) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v0_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + __m512 x1_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v1_offset_ptr[ii]) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v1_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + __m512 x2_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v2_offset_ptr[ii]) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v2_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + __m512 x3_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v3_offset_ptr[ii]) >= 0 ? 
static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v3_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + + value_f[ii] = _mm512_mul_ps(x_coeffs0, x0_val); + value_f[ii] = _mm512_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); + value_f[ii] = _mm512_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); + value_f[ii] = _mm512_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); + + v0_offset_ptr[ii]++; + v1_offset_ptr[ii]++; + v2_offset_ptr[ii]++; + v3_offset_ptr[ii]++; + } + + cubic_interp1d_p16(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm512_set1_ps(*value_y)); + + __m512 _v = _mm512_mul_ps(y_coeffs0, value_f[0]); + _v = _mm512_fmadd_ps(y_coeffs1, value_f[1], _v); + _v = _mm512_fmadd_ps(y_coeffs2, value_f[2], _v); + _v = _mm512_fmadd_ps(y_coeffs3, value_f[3], _v); + _mm512_storeu_ps(dstptr, _v); + + value_x++; + value_y++; + + dstptr += 16; + } + } +} +#endif // __AVX512F__ +#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ +void gridsample_2d_bicubic_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt); +void gridsample_2d_bicubic_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt); +void gridsample_2d_bicubic_apply_interpolation_p1_avx2(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt); +#endif + +static void cubic_interp1d_p8(__m256& coeffs0, __m256& coeffs1, __m256& coeffs2, __m256& coeffs3, const __m256& tx) +{ + const __m256 A = _mm256_set1_ps(-0.75f); + + const __m256 x0 = _mm256_add_ps(tx, _mm256_set1_ps(1)); + const __m256& x1 = tx; + const __m256 x2 = _mm256_sub_ps(_mm256_set1_ps(1), tx); + //const __m256 x3 = _mm256_add_ps(x2, _mm256_set1_ps(1)); + + coeffs0 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x0), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(4), A)); + coeffs1 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x1), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x1), x1), _mm256_set1_ps(1)); + coeffs2 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x2), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x2), x2), _mm256_set1_ps(1)); + coeffs3 = _mm256_sub_ps(_mm256_sub_ps(_mm256_sub_ps(_mm256_set1_ps(1), coeffs0), coeffs1), coeffs2); +} + +static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt) +{ +#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ + if (ncnn::cpu_support_x86_avx2()) + { + gridsample_2d_bicubic_apply_interpolation_p8_avx2(src, dst, offset, value, opt); + return; + } +#endif + + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + + __m256 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; + __m256 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; + __m256 value_f[4]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; + + for (int i = 0; i < 4; i++) + { + v0_offset_ptr[i] = offset.channel(i * 4 + 0); + v1_offset_ptr[i] = offset.channel(i * 4 + 1); + v2_offset_ptr[i] = 
offset.channel(i * 4 + 2); + v3_offset_ptr[i] = offset.channel(i * 4 + 3); + } + + const float* value_x = value.channel(0); + const float* value_y = value.channel(1); + + for (int i = 0; i < grid_size; i++) + { + cubic_interp1d_p8(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm256_set1_ps(*value_x)); + for (int ii = 0; ii < 4; ii++) + { + float v0_in_bound = *reinterpret_cast(v0_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; + float v1_in_bound = *reinterpret_cast(v1_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; + float v2_in_bound = *reinterpret_cast(v2_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; + float v3_in_bound = *reinterpret_cast(v3_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; + +#if __AVX2__ + __m256i v0_offset = _mm256_add_epi32(_mm256_set1_epi32(*v0_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v1_offset = _mm256_add_epi32(_mm256_set1_epi32(*v1_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v2_offset = _mm256_add_epi32(_mm256_set1_epi32(*v2_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v3_offset = _mm256_add_epi32(_mm256_set1_epi32(*v3_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#else + __m256i v0_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*v0_offset_ptr[ii]), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v1_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*v1_offset_ptr[ii]), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v2_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*v2_offset_ptr[ii]), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v3_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*v3_offset_ptr[ii]), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); +#endif // __AVX2__ + + __m256 x0_val = mask_gather_ps256(srcptr, v0_offset, _mm256_set1_ps(v0_in_bound)); + __m256 x1_val = mask_gather_ps256(srcptr, v1_offset, _mm256_set1_ps(v1_in_bound)); + __m256 x2_val = mask_gather_ps256(srcptr, v2_offset, _mm256_set1_ps(v2_in_bound)); + __m256 x3_val = mask_gather_ps256(srcptr, v3_offset, _mm256_set1_ps(v3_in_bound)); + + value_f[ii] = _mm256_mul_ps(x_coeffs0, x0_val); + value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); + value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); + value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); + + v0_offset_ptr[ii]++; + v1_offset_ptr[ii]++; + v2_offset_ptr[ii]++; + v3_offset_ptr[ii]++; + } + + cubic_interp1d_p8(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm256_set1_ps(*value_y)); + + __m256 _v = _mm256_mul_ps(y_coeffs0, value_f[0]); + _v = _mm256_comp_fmadd_ps(y_coeffs1, value_f[1], _v); + _v = _mm256_comp_fmadd_ps(y_coeffs2, value_f[2], _v); + _v = _mm256_comp_fmadd_ps(y_coeffs3, value_f[3], _v); + _mm256_storeu_ps(dstptr, _v); + + value_x++; + value_y++; + + dstptr += 8; + } + } +} + +#endif // __AVX__ +static void cubic_interp1d_p4(__m128& coeffs0, __m128& coeffs1, __m128& coeffs2, __m128& coeffs3, const __m128& tx) +{ + const __m128 A = _mm_set_ps1(-0.75f); + + const __m128 x0 = _mm_add_ps(tx, _mm_set_ps1(1.0f)); + const __m128& x1 = tx; + const __m128 x2 = _mm_sub_ps(_mm_set_ps1(1.0f), tx); + //const __m128 x3 = _mm_add_ps(x2, _mm_set_ps1(1.0f)); + + coeffs0 = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(A, x0), _mm_mul_ps(_mm_set_ps1(5.0f), A)), x0), _mm_mul_ps(_mm_set_ps1(8.0f), A)), x0), _mm_mul_ps(_mm_set_ps1(4), A)); + coeffs1 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set_ps1(2.0f)), x1), _mm_add_ps(A, _mm_set_ps1(3.0f))), x1), x1), 
_mm_set_ps1(1.0f)); + coeffs2 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set_ps1(2.0f)), x2), _mm_add_ps(A, _mm_set_ps1(3.0f))), x2), x2), _mm_set_ps1(1.0f)); + coeffs3 = _mm_sub_ps(_mm_sub_ps(_mm_sub_ps(_mm_set_ps1(1.0f), coeffs0), coeffs1), coeffs2); +} + +static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt) +{ +#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ + if (ncnn::cpu_support_x86_avx2()) + { + gridsample_2d_bicubic_apply_interpolation_p4_avx2(src, dst, offset, value, opt); + return; + } +#endif + + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + + __m128 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; + __m128 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; + __m128 value_f[4]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; + + for (int i = 0; i < 4; i++) + { + v0_offset_ptr[i] = offset.channel(i * 4 + 0); + v1_offset_ptr[i] = offset.channel(i * 4 + 1); + v2_offset_ptr[i] = offset.channel(i * 4 + 2); + v3_offset_ptr[i] = offset.channel(i * 4 + 3); + } + + const float* value_x = value.channel(0); + const float* value_y = value.channel(1); + + for (int i = 0; i < grid_size; i++) + { + cubic_interp1d_p4(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm_set_ps1(*value_x)); + for (int ii = 0; ii < 4; ii++) + { + float v0_in_bound = *reinterpret_cast(v0_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; + float v1_in_bound = *reinterpret_cast(v1_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; + float v2_in_bound = *reinterpret_cast(v2_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; + float v3_in_bound = *reinterpret_cast(v3_offset_ptr[ii]) >= 0 ? 
-1.0f : 0.0f; + + __m128 x0_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v0_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set_ps1(v0_in_bound)); + __m128 x1_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v1_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set_ps1(v1_in_bound)); + __m128 x2_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v2_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set_ps1(v2_in_bound)); + __m128 x3_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v3_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set_ps1(v3_in_bound)); + + value_f[ii] = _mm_mul_ps(x_coeffs0, x0_val); + value_f[ii] = _mm_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); + value_f[ii] = _mm_comp_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); + value_f[ii] = _mm_comp_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); + + v0_offset_ptr[ii]++; + v1_offset_ptr[ii]++; + v2_offset_ptr[ii]++; + v3_offset_ptr[ii]++; + } + + cubic_interp1d_p4(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm_set_ps1(*value_y)); + + __m128 _v = _mm_mul_ps(y_coeffs0, value_f[0]); + _v = _mm_comp_fmadd_ps(y_coeffs1, value_f[1], _v); + _v = _mm_comp_fmadd_ps(y_coeffs2, value_f[2], _v); + _v = _mm_comp_fmadd_ps(y_coeffs3, value_f[3], _v); + _mm_storeu_ps(dstptr, _v); + + value_x++; + value_y++; + + dstptr += 4; + } + } +} +#endif // __SSE2__ + +static inline void cubic_interp1d(float& coeffs0, float& coeffs1, float& coeffs2, float& coeffs3, float fx) +{ + const float A = -0.75f; + + float fx0 = fx + 1; + float fx1 = fx; + float fx2 = 1 - fx; + // float fx3 = 2 - fx; + + coeffs0 = A * fx0 * fx0 * fx0 - 5 * A * fx0 * fx0 + 8 * A * fx0 - 4 * A; + coeffs1 = (A + 2) * fx1 * fx1 * fx1 - (A + 3) * fx1 * fx1 + 1; + coeffs2 = (A + 2) * fx2 * fx2 * fx2 - (A + 3) * fx2 * fx2 + 1; + coeffs3 = 1.f - coeffs0 - coeffs1 - coeffs2; +} + +static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt) +{ +#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ + if (ncnn::cpu_support_x86_avx2()) + { + gridsample_2d_bicubic_apply_interpolation_p1_avx2(src, dst, offset, value, opt); + return; + } +#endif + + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; + + for (int i = 0; i < 4; i++) + { + v0_offset_ptr[i] = offset.channel(i * 4 + 0); + v1_offset_ptr[i] = offset.channel(i * 4 + 1); + v2_offset_ptr[i] = offset.channel(i * 4 + 2); + v3_offset_ptr[i] = offset.channel(i * 4 + 3); + } + + const float* value_x = value.channel(0); + const float* value_y = value.channel(1); + + int x = 0; +#if __SSE2__ +#if __AVX__ + { + __m256 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; + __m256 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; + __m256 value_f[4]; + for (; x + 7 < grid_size; x += 8) + { + cubic_interp1d_p8(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm256_loadu_ps(value_x)); + for (int ii = 0; ii < 4; ii++) + { + __m256 v0_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(v0_offset_ptr[ii]), _mm256_set1_ps(-1.0f)); + __m256 v1_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(v1_offset_ptr[ii]), _mm256_set1_ps(-1.0f)); + __m256 v2_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(v2_offset_ptr[ii]), _mm256_set1_ps(-1.0f)); + __m256 v3_in_bound = 
_mm256_andnot_ps(_mm256_loadu_ps(v3_offset_ptr[ii]), _mm256_set1_ps(-1.0f)); + + __m256 x0_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v0_offset_ptr[ii] + 7), *(v0_offset_ptr[ii] + 6), *(v0_offset_ptr[ii] + 5), *(v0_offset_ptr[ii] + 4), *(v0_offset_ptr[ii] + 3), *(v0_offset_ptr[ii] + 2), *(v0_offset_ptr[ii] + 1), *v0_offset_ptr[ii]), v0_in_bound); + __m256 x1_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v1_offset_ptr[ii] + 7), *(v1_offset_ptr[ii] + 6), *(v1_offset_ptr[ii] + 5), *(v1_offset_ptr[ii] + 4), *(v1_offset_ptr[ii] + 3), *(v1_offset_ptr[ii] + 2), *(v1_offset_ptr[ii] + 1), *v1_offset_ptr[ii]), v1_in_bound); + __m256 x2_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v2_offset_ptr[ii] + 7), *(v2_offset_ptr[ii] + 6), *(v2_offset_ptr[ii] + 5), *(v2_offset_ptr[ii] + 4), *(v2_offset_ptr[ii] + 3), *(v2_offset_ptr[ii] + 2), *(v2_offset_ptr[ii] + 1), *v2_offset_ptr[ii]), v2_in_bound); + __m256 x3_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v3_offset_ptr[ii] + 7), *(v3_offset_ptr[ii] + 6), *(v3_offset_ptr[ii] + 5), *(v3_offset_ptr[ii] + 4), *(v3_offset_ptr[ii] + 3), *(v3_offset_ptr[ii] + 2), *(v3_offset_ptr[ii] + 1), *v3_offset_ptr[ii]), v3_in_bound); + + value_f[ii] = _mm256_mul_ps(x_coeffs0, x0_val); + value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); + value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); + value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); + + v0_offset_ptr[ii] += 8; + v1_offset_ptr[ii] += 8; + v2_offset_ptr[ii] += 8; + v3_offset_ptr[ii] += 8; + } + + cubic_interp1d_p8(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm256_loadu_ps(value_y)); + + __m256 _v = _mm256_mul_ps(y_coeffs0, value_f[0]); + _v = _mm256_comp_fmadd_ps(y_coeffs1, value_f[1], _v); + _v = _mm256_comp_fmadd_ps(y_coeffs2, value_f[2], _v); + _v = _mm256_comp_fmadd_ps(y_coeffs3, value_f[3], _v); + _mm256_storeu_ps(dstptr, _v); + + value_x += 8; + value_y += 8; + + dstptr += 8; + } + } +#endif // __AVX__ + { + __m128 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; + __m128 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; + __m128 value_f[4]; + for (; x + 3 < grid_size; x += 4) + { + cubic_interp1d_p4(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm_loadu_ps(value_x)); + for (int ii = 0; ii < 4; ii++) + { + __m128 v0_in_bound = _mm_andnot_ps(_mm_loadu_ps(v0_offset_ptr[ii]), _mm_set_ps1(-1.0f)); + __m128 v1_in_bound = _mm_andnot_ps(_mm_loadu_ps(v1_offset_ptr[ii]), _mm_set_ps1(-1.0f)); + __m128 v2_in_bound = _mm_andnot_ps(_mm_loadu_ps(v2_offset_ptr[ii]), _mm_set_ps1(-1.0f)); + __m128 v3_in_bound = _mm_andnot_ps(_mm_loadu_ps(v3_offset_ptr[ii]), _mm_set_ps1(-1.0f)); + + __m128 x0_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v0_offset_ptr[ii] + 3), *(v0_offset_ptr[ii] + 2), *(v0_offset_ptr[ii] + 1), *v0_offset_ptr[ii]), v0_in_bound); + __m128 x1_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v1_offset_ptr[ii] + 3), *(v1_offset_ptr[ii] + 2), *(v1_offset_ptr[ii] + 1), *v1_offset_ptr[ii]), v1_in_bound); + __m128 x2_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v2_offset_ptr[ii] + 3), *(v2_offset_ptr[ii] + 2), *(v2_offset_ptr[ii] + 1), *v2_offset_ptr[ii]), v2_in_bound); + __m128 x3_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v3_offset_ptr[ii] + 3), *(v3_offset_ptr[ii] + 2), *(v3_offset_ptr[ii] + 1), *v3_offset_ptr[ii]), v3_in_bound); + + value_f[ii] = _mm_mul_ps(x_coeffs0, x0_val); + value_f[ii] = _mm_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); + value_f[ii] = _mm_comp_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); + value_f[ii] = _mm_comp_fmadd_ps(x_coeffs3, 
x3_val, value_f[ii]); + + v0_offset_ptr[ii] += 4; + v1_offset_ptr[ii] += 4; + v2_offset_ptr[ii] += 4; + v3_offset_ptr[ii] += 4; + } + + cubic_interp1d_p4(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm_loadu_ps(value_y)); + + __m128 _v = _mm_mul_ps(y_coeffs0, value_f[0]); + _v = _mm_comp_fmadd_ps(y_coeffs1, value_f[1], _v); + _v = _mm_comp_fmadd_ps(y_coeffs2, value_f[2], _v); + _v = _mm_comp_fmadd_ps(y_coeffs3, value_f[3], _v); + _mm_storeu_ps(dstptr, _v); + + value_x += 4; + value_y += 4; + + dstptr += 4; + } + } +#endif // __SSE2__ + float x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; + float y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; + float value_f[4]; + + for (; x < grid_size; x++) + { + cubic_interp1d(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, *value_x); + for (int ii = 0; ii < 4; ii++) + { + float x0_val = *reinterpret_cast(v0_offset_ptr[ii]) >= 0 ? *(srcptr + static_cast(*v0_offset_ptr[ii])) : 0; + float x1_val = *reinterpret_cast(v1_offset_ptr[ii]) >= 0 ? *(srcptr + static_cast(*v1_offset_ptr[ii])) : 0; + float x2_val = *reinterpret_cast(v2_offset_ptr[ii]) >= 0 ? *(srcptr + static_cast(*v2_offset_ptr[ii])) : 0; + float x3_val = *reinterpret_cast(v3_offset_ptr[ii]) >= 0 ? *(srcptr + static_cast(*v3_offset_ptr[ii])) : 0; + + value_f[ii] = x_coeffs0 * x0_val; + value_f[ii] = x_coeffs1 * x1_val + value_f[ii]; + value_f[ii] = x_coeffs2 * x2_val + value_f[ii]; + value_f[ii] = x_coeffs3 * x3_val + value_f[ii]; + + v0_offset_ptr[ii]++; + v1_offset_ptr[ii]++; + v2_offset_ptr[ii]++; + v3_offset_ptr[ii]++; + } + + cubic_interp1d(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, *value_y); + + float _v = y_coeffs0 * value_f[0]; + _v = y_coeffs1 * value_f[1] + _v; + _v = y_coeffs2 * value_f[2] + _v; + _v = y_coeffs3 * value_f[3] + _v; + *dstptr = _v; + + value_x++; + value_y++; + + dstptr++; + } + } +} \ No newline at end of file diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index b10b7d0d2bc..90198f47c73 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -330,451 +330,3 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of } } } - -#if __SSE2__ -#if __AVX__ -#if __AVX512F__ -static void cubic_interp1d_p16(__m512& coeffs0, __m512& coeffs1, __m512& coeffs2, __m512& coeffs3, const __m512& tx) -{ - const __m512 A = _mm512_set1_ps(-0.75f); - - const __m512 x0 = _mm512_add_ps(tx, *(__m512*)_ps512_1); - const __m512& x1 = tx; - const __m512 x2 = _mm512_sub_ps(*(__m512*)_ps512_1, tx); - //const __m512 x3 = _mm512_add_ps(x2, *(__m512*)_ps512_1); - - coeffs0 = _mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(A, x0), _mm512_mul_ps(_mm512_set1_ps(5.0f), A)), x0), _mm512_mul_ps(_mm512_set1_ps(8.0f), A)), x0), _mm512_mul_ps(_mm512_set1_ps(4), A)); - coeffs1 = _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(A, _mm512_set1_ps(2.0f)), x1), _mm512_add_ps(A, _mm512_set1_ps(3.0f))), x1), x1), *(__m512*)_ps512_1); - coeffs2 = _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(A, _mm512_set1_ps(2.0f)), x2), _mm512_add_ps(A, _mm512_set1_ps(3.0f))), x2), x2), *(__m512*)_ps512_1); - coeffs3 = _mm512_sub_ps(_mm512_sub_ps(_mm512_sub_ps(*(__m512*)_ps512_1, coeffs0), coeffs1), coeffs2); -} - -static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt) -{ - const int channels = dst.c; - const 
int outw = dst.w; - const int outh = dst.h; - const int grid_size = outw * outh; - - __m512 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; - __m512 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; - __m512 value_f[4]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* srcptr = src.channel(q); - float* dstptr = dst.channel(q); - - float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; - - for (int i = 0; i < 4; i++) - { - v0_offset_ptr[i] = offset.channel(i * 4 + 0); - v1_offset_ptr[i] = offset.channel(i * 4 + 1); - v2_offset_ptr[i] = offset.channel(i * 4 + 2); - v3_offset_ptr[i] = offset.channel(i * 4 + 3); - } - - const float* value_x = value.channel(0); - const float* value_y = value.channel(1); - - for (int i = 0; i < grid_size; i++) - { - cubic_interp1d_p16(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm512_set1_ps(*value_x)); - for (int ii = 0; ii < 4; ii++) - { - __m512 x0_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v0_offset_ptr[ii]) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v0_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); - __m512 x1_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v1_offset_ptr[ii]) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v1_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); - __m512 x2_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v2_offset_ptr[ii]) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v2_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); - __m512 x3_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v3_offset_ptr[ii]) >= 0 ? 
static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v3_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); - - value_f[ii] = _mm512_mul_ps(x_coeffs0, x0_val); - value_f[ii] = _mm512_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); - value_f[ii] = _mm512_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); - value_f[ii] = _mm512_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); - - v0_offset_ptr[ii]++; - v1_offset_ptr[ii]++; - v2_offset_ptr[ii]++; - v3_offset_ptr[ii]++; - } - - cubic_interp1d_p16(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm512_set1_ps(*value_y)); - - __m512 _v = _mm512_mul_ps(y_coeffs0, value_f[0]); - _v = _mm512_fmadd_ps(y_coeffs1, value_f[1], _v); - _v = _mm512_fmadd_ps(y_coeffs2, value_f[2], _v); - _v = _mm512_fmadd_ps(y_coeffs3, value_f[3], _v); - _mm512_storeu_ps(dstptr, _v); - - value_x++; - value_y++; - - dstptr += 16; - } - } -} -#endif // __AVX512F__ -static void cubic_interp1d_p8(__m256& coeffs0, __m256& coeffs1, __m256& coeffs2, __m256& coeffs3, const __m256& tx) -{ - const __m256 A = _mm256_set1_ps(-0.75f); - - const __m256 x0 = _mm256_add_ps(tx, _mm256_set1_ps(1)); - const __m256& x1 = tx; - const __m256 x2 = _mm256_sub_ps(_mm256_set1_ps(1), tx); - //const __m256 x3 = _mm256_add_ps(x2, _mm256_set1_ps(1)); - - coeffs0 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x0), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(4), A)); - coeffs1 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x1), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x1), x1), _mm256_set1_ps(1)); - coeffs2 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x2), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x2), x2), _mm256_set1_ps(1)); - coeffs3 = _mm256_sub_ps(_mm256_sub_ps(_mm256_sub_ps(_mm256_set1_ps(1), coeffs0), coeffs1), coeffs2); -} - -static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt) -{ - const int channels = dst.c; - const int outw = dst.w; - const int outh = dst.h; - const int grid_size = outw * outh; - - __m256 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; - __m256 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; - __m256 value_f[4]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* srcptr = src.channel(q); - float* dstptr = dst.channel(q); - - float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; - - for (int i = 0; i < 4; i++) - { - v0_offset_ptr[i] = offset.channel(i * 4 + 0); - v1_offset_ptr[i] = offset.channel(i * 4 + 1); - v2_offset_ptr[i] = offset.channel(i * 4 + 2); - v3_offset_ptr[i] = offset.channel(i * 4 + 3); - } - - const float* value_x = value.channel(0); - const float* value_y = value.channel(1); - - for (int i = 0; i < grid_size; i++) - { - cubic_interp1d_p8(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm256_set1_ps(*value_x)); - for (int ii = 0; ii < 4; ii++) - { - float v0_in_bound = *reinterpret_cast(v0_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; - float v1_in_bound = *reinterpret_cast(v1_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; - float v2_in_bound = *reinterpret_cast(v2_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; - float v3_in_bound = *reinterpret_cast(v3_offset_ptr[ii]) >= 0 ? 
-1.0f : 0.0f; - -#if __AVX2__ - __m256i v0_offset = _mm256_add_epi32(_mm256_set1_epi32(*v0_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v1_offset = _mm256_add_epi32(_mm256_set1_epi32(*v1_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v2_offset = _mm256_add_epi32(_mm256_set1_epi32(*v2_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v3_offset = _mm256_add_epi32(_mm256_set1_epi32(*v3_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); -#else - __m256i v0_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*v0_offset_ptr[ii]), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v1_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*v1_offset_ptr[ii]), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v2_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*v2_offset_ptr[ii]), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v3_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*v3_offset_ptr[ii]), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); -#endif // __AVX2__ - - __m256 x0_val = mask_gather_ps256(srcptr, v0_offset, _mm256_set1_ps(v0_in_bound)); - __m256 x1_val = mask_gather_ps256(srcptr, v1_offset, _mm256_set1_ps(v1_in_bound)); - __m256 x2_val = mask_gather_ps256(srcptr, v2_offset, _mm256_set1_ps(v2_in_bound)); - __m256 x3_val = mask_gather_ps256(srcptr, v3_offset, _mm256_set1_ps(v3_in_bound)); - - value_f[ii] = _mm256_mul_ps(x_coeffs0, x0_val); - value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); - value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); - value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); - - v0_offset_ptr[ii]++; - v1_offset_ptr[ii]++; - v2_offset_ptr[ii]++; - v3_offset_ptr[ii]++; - } - - cubic_interp1d_p8(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm256_set1_ps(*value_y)); - - __m256 _v = _mm256_mul_ps(y_coeffs0, value_f[0]); - _v = _mm256_comp_fmadd_ps(y_coeffs1, value_f[1], _v); - _v = _mm256_comp_fmadd_ps(y_coeffs2, value_f[2], _v); - _v = _mm256_comp_fmadd_ps(y_coeffs3, value_f[3], _v); - _mm256_storeu_ps(dstptr, _v); - - value_x++; - value_y++; - - dstptr += 8; - } - } -} - -#endif // __AVX__ -static void cubic_interp1d_p4(__m128& coeffs0, __m128& coeffs1, __m128& coeffs2, __m128& coeffs3, const __m128& tx) -{ - const __m128 A = _mm_set1_ps(-0.75f); - - const __m128 x0 = _mm_add_ps(tx, *(__m128*)_ps_1); - const __m128& x1 = tx; - const __m128 x2 = _mm_sub_ps(*(__m128*)_ps_1, tx); - //const __m128 x3 = _mm_add_ps(x2, *(__m128*)_ps_1); - - coeffs0 = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(A, x0), _mm_mul_ps(_mm_set1_ps(5.0f), A)), x0), _mm_mul_ps(_mm_set1_ps(8.0f), A)), x0), _mm_mul_ps(_mm_set1_ps(4), A)); - coeffs1 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set1_ps(2.0f)), x1), _mm_add_ps(A, _mm_set1_ps(3.0f))), x1), x1), *(__m128*)_ps_1); - coeffs2 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set1_ps(2.0f)), x2), _mm_add_ps(A, _mm_set1_ps(3.0f))), x2), x2), *(__m128*)_ps_1); - coeffs3 = _mm_sub_ps(_mm_sub_ps(_mm_sub_ps(*(__m128*)_ps_1, coeffs0), coeffs1), coeffs2); -} - -static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt) -{ - const int channels = dst.c; - const int outw = dst.w; - const int outh = dst.h; - const int grid_size = outw * outh; - - __m128 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; - __m128 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; - __m128 
value_f[4]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* srcptr = src.channel(q); - float* dstptr = dst.channel(q); - - float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; - - for (int i = 0; i < 4; i++) - { - v0_offset_ptr[i] = offset.channel(i * 4 + 0); - v1_offset_ptr[i] = offset.channel(i * 4 + 1); - v2_offset_ptr[i] = offset.channel(i * 4 + 2); - v3_offset_ptr[i] = offset.channel(i * 4 + 3); - } - - const float* value_x = value.channel(0); - const float* value_y = value.channel(1); - - for (int i = 0; i < grid_size; i++) - { - cubic_interp1d_p4(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm_set1_ps(*value_x)); - for (int ii = 0; ii < 4; ii++) - { - float v0_in_bound = *reinterpret_cast(v0_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; - float v1_in_bound = *reinterpret_cast(v1_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; - float v2_in_bound = *reinterpret_cast(v2_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; - float v3_in_bound = *reinterpret_cast(v3_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; - - __m128 x0_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v0_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(v0_in_bound)); - __m128 x1_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v1_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(v1_in_bound)); - __m128 x2_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v2_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(v2_in_bound)); - __m128 x3_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v3_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(v3_in_bound)); - - value_f[ii] = _mm_mul_ps(x_coeffs0, x0_val); - value_f[ii] = _mm_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); - value_f[ii] = _mm_comp_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); - value_f[ii] = _mm_comp_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); - - v0_offset_ptr[ii]++; - v1_offset_ptr[ii]++; - v2_offset_ptr[ii]++; - v3_offset_ptr[ii]++; - } - - cubic_interp1d_p4(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm_set1_ps(*value_y)); - - __m128 _v = _mm_mul_ps(y_coeffs0, value_f[0]); - _v = _mm_comp_fmadd_ps(y_coeffs1, value_f[1], _v); - _v = _mm_comp_fmadd_ps(y_coeffs2, value_f[2], _v); - _v = _mm_comp_fmadd_ps(y_coeffs3, value_f[3], _v); - _mm_storeu_ps(dstptr, _v); - - value_x++; - value_y++; - - dstptr += 4; - } - } -} -#endif // __SSE2__ - -static inline void cubic_interp1d(float& coeffs0, float& coeffs1, float& coeffs2, float& coeffs3, float fx) -{ - const float A = -0.75f; - - float fx0 = fx + 1; - float fx1 = fx; - float fx2 = 1 - fx; - // float fx3 = 2 - fx; - - coeffs0 = A * fx0 * fx0 * fx0 - 5 * A * fx0 * fx0 + 8 * A * fx0 - 4 * A; - coeffs1 = (A + 2) * fx1 * fx1 * fx1 - (A + 3) * fx1 * fx1 + 1; - coeffs2 = (A + 2) * fx2 * fx2 * fx2 - (A + 3) * fx2 * fx2 + 1; - coeffs3 = 1.f - coeffs0 - coeffs1 - coeffs2; -} - -static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt) -{ - const int channels = dst.c; - const int outw = dst.w; - const int outh = dst.h; - const int grid_size = outw * outh; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* srcptr = src.channel(q); - float* dstptr = dst.channel(q); - - float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; - - for (int i = 0; i < 4; i++) - { - v0_offset_ptr[i] = offset.channel(i * 4 + 0); - v1_offset_ptr[i] = offset.channel(i * 4 + 1); 
- v2_offset_ptr[i] = offset.channel(i * 4 + 2); - v3_offset_ptr[i] = offset.channel(i * 4 + 3); - } - - const float* value_x = value.channel(0); - const float* value_y = value.channel(1); - - int x = 0; -#if __SSE2__ -#if __AVX__ - { - __m256 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; - __m256 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; - __m256 value_f[4]; - for (; x + 7 < grid_size; x += 8) - { - cubic_interp1d_p8(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm256_loadu_ps(value_x)); - for (int ii = 0; ii < 4; ii++) - { - __m256 v0_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(v0_offset_ptr[ii]), _mm256_set1_ps(-1.0f)); - __m256 v1_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(v1_offset_ptr[ii]), _mm256_set1_ps(-1.0f)); - __m256 v2_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(v2_offset_ptr[ii]), _mm256_set1_ps(-1.0f)); - __m256 v3_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(v3_offset_ptr[ii]), _mm256_set1_ps(-1.0f)); - - __m256 x0_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v0_offset_ptr[ii] + 7), *(v0_offset_ptr[ii] + 6), *(v0_offset_ptr[ii] + 5), *(v0_offset_ptr[ii] + 4), *(v0_offset_ptr[ii] + 3), *(v0_offset_ptr[ii] + 2), *(v0_offset_ptr[ii] + 1), *v0_offset_ptr[ii]), v0_in_bound); - __m256 x1_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v1_offset_ptr[ii] + 7), *(v1_offset_ptr[ii] + 6), *(v1_offset_ptr[ii] + 5), *(v1_offset_ptr[ii] + 4), *(v1_offset_ptr[ii] + 3), *(v1_offset_ptr[ii] + 2), *(v1_offset_ptr[ii] + 1), *v1_offset_ptr[ii]), v1_in_bound); - __m256 x2_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v2_offset_ptr[ii] + 7), *(v2_offset_ptr[ii] + 6), *(v2_offset_ptr[ii] + 5), *(v2_offset_ptr[ii] + 4), *(v2_offset_ptr[ii] + 3), *(v2_offset_ptr[ii] + 2), *(v2_offset_ptr[ii] + 1), *v2_offset_ptr[ii]), v2_in_bound); - __m256 x3_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v3_offset_ptr[ii] + 7), *(v3_offset_ptr[ii] + 6), *(v3_offset_ptr[ii] + 5), *(v3_offset_ptr[ii] + 4), *(v3_offset_ptr[ii] + 3), *(v3_offset_ptr[ii] + 2), *(v3_offset_ptr[ii] + 1), *v3_offset_ptr[ii]), v3_in_bound); - - value_f[ii] = _mm256_mul_ps(x_coeffs0, x0_val); - value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); - value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); - value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); - - v0_offset_ptr[ii] += 8; - v1_offset_ptr[ii] += 8; - v2_offset_ptr[ii] += 8; - v3_offset_ptr[ii] += 8; - } - - cubic_interp1d_p8(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm256_loadu_ps(value_y)); - - __m256 _v = _mm256_mul_ps(y_coeffs0, value_f[0]); - _v = _mm256_comp_fmadd_ps(y_coeffs1, value_f[1], _v); - _v = _mm256_comp_fmadd_ps(y_coeffs2, value_f[2], _v); - _v = _mm256_comp_fmadd_ps(y_coeffs3, value_f[3], _v); - _mm256_storeu_ps(dstptr, _v); - - value_x += 8; - value_y += 8; - - dstptr += 8; - } - } -#endif // __AVX__ - { - __m128 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; - __m128 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; - __m128 value_f[4]; - for (; x + 3 < grid_size; x += 4) - { - cubic_interp1d_p4(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm_loadu_ps(value_x)); - for (int ii = 0; ii < 4; ii++) - { - __m128 v0_in_bound = _mm_andnot_ps(_mm_loadu_ps(v0_offset_ptr[ii]), _mm_set1_ps(-1.0f)); - __m128 v1_in_bound = _mm_andnot_ps(_mm_loadu_ps(v1_offset_ptr[ii]), _mm_set1_ps(-1.0f)); - __m128 v2_in_bound = _mm_andnot_ps(_mm_loadu_ps(v2_offset_ptr[ii]), _mm_set1_ps(-1.0f)); - __m128 v3_in_bound = _mm_andnot_ps(_mm_loadu_ps(v3_offset_ptr[ii]), _mm_set1_ps(-1.0f)); - - __m128 x0_val = mask_gather_ps(srcptr, 
_mm_set_epi32(*(v0_offset_ptr[ii] + 3), *(v0_offset_ptr[ii] + 2), *(v0_offset_ptr[ii] + 1), *v0_offset_ptr[ii]), v0_in_bound); - __m128 x1_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v1_offset_ptr[ii] + 3), *(v1_offset_ptr[ii] + 2), *(v1_offset_ptr[ii] + 1), *v1_offset_ptr[ii]), v1_in_bound); - __m128 x2_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v2_offset_ptr[ii] + 3), *(v2_offset_ptr[ii] + 2), *(v2_offset_ptr[ii] + 1), *v2_offset_ptr[ii]), v2_in_bound); - __m128 x3_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v3_offset_ptr[ii] + 3), *(v3_offset_ptr[ii] + 2), *(v3_offset_ptr[ii] + 1), *v3_offset_ptr[ii]), v3_in_bound); - - value_f[ii] = _mm_mul_ps(x_coeffs0, x0_val); - value_f[ii] = _mm_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); - value_f[ii] = _mm_comp_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); - value_f[ii] = _mm_comp_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); - - v0_offset_ptr[ii] += 4; - v1_offset_ptr[ii] += 4; - v2_offset_ptr[ii] += 4; - v3_offset_ptr[ii] += 4; - } - - cubic_interp1d_p4(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm_loadu_ps(value_y)); - - __m128 _v = _mm_mul_ps(y_coeffs0, value_f[0]); - _v = _mm_comp_fmadd_ps(y_coeffs1, value_f[1], _v); - _v = _mm_comp_fmadd_ps(y_coeffs2, value_f[2], _v); - _v = _mm_comp_fmadd_ps(y_coeffs3, value_f[3], _v); - _mm_storeu_ps(dstptr, _v); - - value_x += 4; - value_y += 4; - - dstptr += 4; - } - } -#endif // __SSE2__ - float x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; - float y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; - float value_f[4]; - - for (; x < grid_size; x++) - { - cubic_interp1d(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, *value_x); - for (int ii = 0; ii < 4; ii++) - { - float x0_val = *reinterpret_cast(v0_offset_ptr[ii]) >= 0 ? *(srcptr + static_cast(*v0_offset_ptr[ii])) : 0; - float x1_val = *reinterpret_cast(v1_offset_ptr[ii]) >= 0 ? *(srcptr + static_cast(*v1_offset_ptr[ii])) : 0; - float x2_val = *reinterpret_cast(v2_offset_ptr[ii]) >= 0 ? *(srcptr + static_cast(*v2_offset_ptr[ii])) : 0; - float x3_val = *reinterpret_cast(v3_offset_ptr[ii]) >= 0 ? *(srcptr + static_cast(*v3_offset_ptr[ii])) : 0; - - value_f[ii] = x_coeffs0 * x0_val; - value_f[ii] = x_coeffs1 * x1_val + value_f[ii]; - value_f[ii] = x_coeffs2 * x2_val + value_f[ii]; - value_f[ii] = x_coeffs3 * x3_val + value_f[ii]; - - v0_offset_ptr[ii]++; - v1_offset_ptr[ii]++; - v2_offset_ptr[ii]++; - v3_offset_ptr[ii]++; - } - - cubic_interp1d(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, *value_y); - - float _v = y_coeffs0 * value_f[0]; - _v = y_coeffs1 * value_f[1] + _v; - _v = y_coeffs2 * value_f[2] + _v; - _v = y_coeffs3 * value_f[3] + _v; - *dstptr = _v; - - value_x++; - value_y++; - - dstptr++; - } - } -} \ No newline at end of file diff --git a/src/layer/x86/gridsample_bilinear_apply_interpolation.h b/src/layer/x86/gridsample_bilinear_apply_interpolation.h new file mode 100644 index 00000000000..862fb35df11 --- /dev/null +++ b/src/layer/x86/gridsample_bilinear_apply_interpolation.h @@ -0,0 +1,848 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ +static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_ptr_00 = offset.channel(0); + const float* offset_ptr_01 = offset.channel(1); + const float* offset_ptr_10 = offset.channel(2); + const float* offset_ptr_11 = offset.channel(3); + + const float* value_ptr_alpha = value.channel(0); + const float* value_ptr_beta = value.channel(1); + + for (int i = 0; i < grid_size; i++) + { + __m512i v00_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_00), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i v01_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_01), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i v10_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_10), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i v11_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_11), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + + __mmask16 mask00 = *reinterpret_cast(offset_ptr_00) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); + __mmask16 mask01 = *reinterpret_cast(offset_ptr_01) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); + __mmask16 mask10 = *reinterpret_cast(offset_ptr_10) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); + __mmask16 mask11 = *reinterpret_cast(offset_ptr_11) >= 0 ? 
static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); + + __m512 v00_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask00, v00_offset, srcptr, sizeof(float)); + __m512 v01_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask01, v01_offset, srcptr, sizeof(float)); + __m512 v10_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask10, v10_offset, srcptr, sizeof(float)); + __m512 v11_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask11, v11_offset, srcptr, sizeof(float)); + + __m512 alpha = _mm512_set1_ps(*value_ptr_alpha); + __m512 beta = _mm512_set1_ps(*value_ptr_beta); + + __m512 v0 = _mm512_fmadd_ps(v01_val, alpha, _mm512_fnmadd_ps(v00_val, alpha, v00_val)); + __m512 v1 = _mm512_fmadd_ps(v11_val, alpha, _mm512_fnmadd_ps(v10_val, alpha, v10_val)); + + __m512 _v = _mm512_fmadd_ps(v1, beta, _mm512_fnmadd_ps(v0, beta, v0)); + _mm512_storeu_ps(dstptr, _v); + + offset_ptr_00++; + offset_ptr_01++; + offset_ptr_10++; + offset_ptr_11++; + + value_ptr_alpha++; + value_ptr_beta++; + + dstptr += 16; + } + } +} +static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_ptr_000 = offset.channel(0); + const float* offset_ptr_001 = offset.channel(1); + const float* offset_ptr_010 = offset.channel(2); + const float* offset_ptr_011 = offset.channel(3); + const float* offset_ptr_100 = offset.channel(4); + const float* offset_ptr_101 = offset.channel(5); + const float* offset_ptr_110 = offset.channel(6); + const float* offset_ptr_111 = offset.channel(7); + + const float* value_ptr_alpha = value.channel(0); + const float* value_ptr_beta = value.channel(1); + const float* value_ptr_gamma = value.channel(2); + + for (int i = 0; i < grid_size; i++) + { + __m512i v000_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_000), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i v001_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_001), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i v010_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_010), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i v011_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_011), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i v100_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_100), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i v101_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_101), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i v110_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_110), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i v111_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_111), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + + __m512 v000_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_000) >= 0 ? 
static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v000_offset, srcptr, sizeof(float)); + __m512 v001_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_001) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v001_offset, srcptr, sizeof(float)); + __m512 v010_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_010) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v010_offset, srcptr, sizeof(float)); + __m512 v011_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_011) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v011_offset, srcptr, sizeof(float)); + __m512 v100_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_100) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v100_offset, srcptr, sizeof(float)); + __m512 v101_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_101) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v101_offset, srcptr, sizeof(float)); + __m512 v110_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_110) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v110_offset, srcptr, sizeof(float)); + __m512 v111_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_111) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v111_offset, srcptr, sizeof(float)); + + __m512 alpha = _mm512_set1_ps(*value_ptr_alpha); + __m512 beta = _mm512_set1_ps(*value_ptr_beta); + __m512 gamma = _mm512_set1_ps(*value_ptr_gamma); + + __m512 v00 = _mm512_fmadd_ps(v001_val, alpha, _mm512_fnmadd_ps(v000_val, alpha, v000_val)); + __m512 v01 = _mm512_fmadd_ps(v011_val, alpha, _mm512_fnmadd_ps(v010_val, alpha, v010_val)); + __m512 v10 = _mm512_fmadd_ps(v101_val, alpha, _mm512_fnmadd_ps(v100_val, alpha, v100_val)); + __m512 v11 = _mm512_fmadd_ps(v111_val, alpha, _mm512_fnmadd_ps(v110_val, alpha, v110_val)); + + __m512 v0 = _mm512_fmadd_ps(v01, beta, _mm512_fnmadd_ps(v00, beta, v00)); + __m512 v1 = _mm512_fmadd_ps(v11, beta, _mm512_fnmadd_ps(v10, beta, v10)); + + __m512 _v = _mm512_fmadd_ps(v1, gamma, _mm512_fnmadd_ps(v0, gamma, v0)); + _mm512_storeu_ps(dstptr, _v); + + offset_ptr_000++; + offset_ptr_001++; + offset_ptr_010++; + offset_ptr_011++; + + offset_ptr_100++; + offset_ptr_101++; + offset_ptr_110++; + offset_ptr_111++; + + value_ptr_alpha++; + value_ptr_beta++; + value_ptr_gamma++; + + dstptr += 16; + } + } +} + +#endif // __AVX512F__ + +#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ +void gridsample_2d_bilinear_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt); +void gridsample_2d_bilinear_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt); +void gridsample_2d_bilinear_apply_interpolation_p1_avx2(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt); + +void gridsample_3d_bilinear_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt); +void gridsample_3d_bilinear_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt); +void gridsample_3d_bilinear_apply_interpolation_p1_avx2(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt); +#endif + +static void 
gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) +{ +#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ + if (ncnn::cpu_support_x86_avx2()) + { + gridsample_2d_bilinear_apply_interpolation_p8_avx2(src, dst, offset, value, opt); + return; + } +#endif + + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_ptr_00 = offset.channel(0); + const float* offset_ptr_01 = offset.channel(1); + const float* offset_ptr_10 = offset.channel(2); + const float* offset_ptr_11 = offset.channel(3); + + const float* value_ptr_alpha = value.channel(0); + const float* value_ptr_beta = value.channel(1); + + for (int i = 0; i < grid_size; i++) + { +#if __AVX2__ + __m256i v00_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_00), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v01_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_01), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v10_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_10), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v11_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_11), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#else + __m256i v00_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_00), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v01_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_01), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v10_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_10), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v11_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_11), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); +#endif // __AVX2__ + + float in_bound_00 = *reinterpret_cast(offset_ptr_00) >= 0 ? -1.0f : 0.0f; + float in_bound_01 = *reinterpret_cast(offset_ptr_01) >= 0 ? -1.0f : 0.0f; + float in_bound_10 = *reinterpret_cast(offset_ptr_10) >= 0 ? -1.0f : 0.0f; + float in_bound_11 = *reinterpret_cast(offset_ptr_11) >= 0 ? 
-1.0f : 0.0f; + + __m256 v00_val = mask_gather_ps256(srcptr, v00_offset, _mm256_set1_ps(in_bound_00)); + __m256 v01_val = mask_gather_ps256(srcptr, v01_offset, _mm256_set1_ps(in_bound_01)); + __m256 v10_val = mask_gather_ps256(srcptr, v10_offset, _mm256_set1_ps(in_bound_10)); + __m256 v11_val = mask_gather_ps256(srcptr, v11_offset, _mm256_set1_ps(in_bound_11)); + + __m256 alpha = _mm256_set1_ps(*value_ptr_alpha); + __m256 beta = _mm256_set1_ps(*value_ptr_beta); + + __m256 v0 = _mm256_comp_fmadd_ps(v01_val, alpha, _mm256_comp_fnmadd_ps(v00_val, alpha, v00_val)); + __m256 v1 = _mm256_comp_fmadd_ps(v11_val, alpha, _mm256_comp_fnmadd_ps(v10_val, alpha, v10_val)); + + __m256 _v = _mm256_comp_fmadd_ps(v1, beta, _mm256_comp_fnmadd_ps(v0, beta, v0)); + _mm256_storeu_ps(dstptr, _v); + + offset_ptr_00++; + offset_ptr_01++; + offset_ptr_10++; + offset_ptr_11++; + + value_ptr_alpha++; + value_ptr_beta++; + + dstptr += 8; + } + } +} +static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) +{ +#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ + if (ncnn::cpu_support_x86_avx2()) + { + gridsample_3d_bilinear_apply_interpolation_p8_avx2(src, dst, offset, value, opt); + return; + } +#endif + + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_ptr_000 = offset.channel(0); + const float* offset_ptr_001 = offset.channel(1); + const float* offset_ptr_010 = offset.channel(2); + const float* offset_ptr_011 = offset.channel(3); + const float* offset_ptr_100 = offset.channel(4); + const float* offset_ptr_101 = offset.channel(5); + const float* offset_ptr_110 = offset.channel(6); + const float* offset_ptr_111 = offset.channel(7); + + const float* value_ptr_alpha = value.channel(0); + const float* value_ptr_beta = value.channel(1); + const float* value_ptr_gamma = value.channel(2); + + for (int i = 0; i < grid_size; i++) + { +#if __AVX2__ + __m256i v000_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_000), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v001_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_001), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v010_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_010), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v011_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_011), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v100_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_100), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v101_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_101), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v110_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_110), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v111_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_111), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#else + __m256i v000_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_000), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v001_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_001), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v010_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_010), 
_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v011_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_011), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v100_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_100), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v101_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_101), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v110_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_110), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v111_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_111), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); +#endif // __AVX2__ + + float in_bound_000 = *reinterpret_cast(offset_ptr_000) >= 0 ? -1.0f : 0.0f; + float in_bound_001 = *reinterpret_cast(offset_ptr_001) >= 0 ? -1.0f : 0.0f; + float in_bound_010 = *reinterpret_cast(offset_ptr_010) >= 0 ? -1.0f : 0.0f; + float in_bound_011 = *reinterpret_cast(offset_ptr_011) >= 0 ? -1.0f : 0.0f; + float in_bound_100 = *reinterpret_cast(offset_ptr_100) >= 0 ? -1.0f : 0.0f; + float in_bound_101 = *reinterpret_cast(offset_ptr_101) >= 0 ? -1.0f : 0.0f; + float in_bound_110 = *reinterpret_cast(offset_ptr_110) >= 0 ? -1.0f : 0.0f; + float in_bound_111 = *reinterpret_cast(offset_ptr_111) >= 0 ? -1.0f : 0.0f; + + __m256 v000_val = mask_gather_ps256(srcptr, v000_offset, _mm256_set1_ps(in_bound_000)); + __m256 v001_val = mask_gather_ps256(srcptr, v001_offset, _mm256_set1_ps(in_bound_001)); + __m256 v010_val = mask_gather_ps256(srcptr, v010_offset, _mm256_set1_ps(in_bound_010)); + __m256 v011_val = mask_gather_ps256(srcptr, v011_offset, _mm256_set1_ps(in_bound_011)); + __m256 v100_val = mask_gather_ps256(srcptr, v100_offset, _mm256_set1_ps(in_bound_100)); + __m256 v101_val = mask_gather_ps256(srcptr, v101_offset, _mm256_set1_ps(in_bound_101)); + __m256 v110_val = mask_gather_ps256(srcptr, v110_offset, _mm256_set1_ps(in_bound_110)); + __m256 v111_val = mask_gather_ps256(srcptr, v111_offset, _mm256_set1_ps(in_bound_111)); + + __m256 alpha = _mm256_set1_ps(*value_ptr_alpha); + __m256 beta = _mm256_set1_ps(*value_ptr_beta); + __m256 gamma = _mm256_set1_ps(*value_ptr_gamma); + + __m256 v00 = _mm256_comp_fmadd_ps(v001_val, alpha, _mm256_comp_fnmadd_ps(v000_val, alpha, v000_val)); + __m256 v01 = _mm256_comp_fmadd_ps(v011_val, alpha, _mm256_comp_fnmadd_ps(v010_val, alpha, v010_val)); + __m256 v10 = _mm256_comp_fmadd_ps(v101_val, alpha, _mm256_comp_fnmadd_ps(v100_val, alpha, v100_val)); + __m256 v11 = _mm256_comp_fmadd_ps(v111_val, alpha, _mm256_comp_fnmadd_ps(v110_val, alpha, v110_val)); + + __m256 v0 = _mm256_comp_fmadd_ps(v01, beta, _mm256_comp_fnmadd_ps(v00, beta, v00)); + __m256 v1 = _mm256_comp_fmadd_ps(v11, beta, _mm256_comp_fnmadd_ps(v10, beta, v10)); + + __m256 _v = _mm256_comp_fmadd_ps(v1, gamma, _mm256_comp_fnmadd_ps(v0, gamma, v0)); + _mm256_storeu_ps(dstptr, _v); + + offset_ptr_000++; + offset_ptr_001++; + offset_ptr_010++; + offset_ptr_011++; + + offset_ptr_100++; + offset_ptr_101++; + offset_ptr_110++; + offset_ptr_111++; + + value_ptr_alpha++; + value_ptr_beta++; + value_ptr_gamma++; + + dstptr += 8; + } + } +} +#endif // __AVX__ +static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) +{ +#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ + if (ncnn::cpu_support_x86_avx2()) + { + gridsample_2d_bilinear_apply_interpolation_p4_avx2(src, dst, offset, value, opt); + return; + } +#endif + 
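+    // Pack-4 (SSE) bilinear path: each output pixel gathers its four corner
+    // taps through the precomputed offsets (a negative offset marks an
+    // out-of-bound tap and makes mask_gather_ps return 0), then blends them
+    // with the stored alpha/beta fractions via fmadd/fnmadd.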
+ const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_ptr_00 = offset.channel(0); + const float* offset_ptr_01 = offset.channel(1); + const float* offset_ptr_10 = offset.channel(2); + const float* offset_ptr_11 = offset.channel(3); + + const float* value_ptr_alpha = value.channel(0); + const float* value_ptr_beta = value.channel(1); + + for (int i = 0; i < grid_size; i++) + { + __m128i v00_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_00), _mm_set_epi32(3, 2, 1, 0)); + __m128i v01_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_01), _mm_set_epi32(3, 2, 1, 0)); + __m128i v10_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_10), _mm_set_epi32(3, 2, 1, 0)); + __m128i v11_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_11), _mm_set_epi32(3, 2, 1, 0)); + + float in_bound_00 = *reinterpret_cast(offset_ptr_00) >= 0 ? -1.0f : 0.0f; + float in_bound_01 = *reinterpret_cast(offset_ptr_01) >= 0 ? -1.0f : 0.0f; + float in_bound_10 = *reinterpret_cast(offset_ptr_10) >= 0 ? -1.0f : 0.0f; + float in_bound_11 = *reinterpret_cast(offset_ptr_11) >= 0 ? -1.0f : 0.0f; + + __m128 v00_val = mask_gather_ps(srcptr, v00_offset, _mm_set1_ps(in_bound_00)); + __m128 v01_val = mask_gather_ps(srcptr, v01_offset, _mm_set1_ps(in_bound_01)); + __m128 v10_val = mask_gather_ps(srcptr, v10_offset, _mm_set1_ps(in_bound_10)); + __m128 v11_val = mask_gather_ps(srcptr, v11_offset, _mm_set1_ps(in_bound_11)); + + __m128 alpha = _mm_set1_ps(*value_ptr_alpha); + __m128 beta = _mm_set1_ps(*value_ptr_beta); + + __m128 v0 = _mm_comp_fmadd_ps(v01_val, alpha, _mm_comp_fnmadd_ps(v00_val, alpha, v00_val)); + __m128 v1 = _mm_comp_fmadd_ps(v11_val, alpha, _mm_comp_fnmadd_ps(v10_val, alpha, v10_val)); + + __m128 _v = _mm_comp_fmadd_ps(v1, beta, _mm_comp_fnmadd_ps(v0, beta, v0)); + _mm_storeu_ps(dstptr, _v); + + offset_ptr_00++; + offset_ptr_01++; + offset_ptr_10++; + offset_ptr_11++; + + value_ptr_alpha++; + value_ptr_beta++; + + dstptr += 4; + } + } +} +static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) +{ +#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ + if (ncnn::cpu_support_x86_avx2()) + { + gridsample_3d_bilinear_apply_interpolation_p4_avx2(src, dst, offset, value, opt); + return; + } +#endif + + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_ptr_000 = offset.channel(0); + const float* offset_ptr_001 = offset.channel(1); + const float* offset_ptr_010 = offset.channel(2); + const float* offset_ptr_011 = offset.channel(3); + const float* offset_ptr_100 = offset.channel(4); + const float* offset_ptr_101 = offset.channel(5); + const float* offset_ptr_110 = offset.channel(6); + const float* offset_ptr_111 = offset.channel(7); + + const float* value_ptr_alpha = value.channel(0); + const float* value_ptr_beta = value.channel(1); + const float* value_ptr_gamma = value.channel(2); + + for (int i = 0; i < grid_size; i++) + { + __m128i v000_offset = 
_mm_add_epi32(_mm_set1_epi32(*offset_ptr_000), _mm_set_epi32(3, 2, 1, 0)); + __m128i v001_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_001), _mm_set_epi32(3, 2, 1, 0)); + __m128i v010_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_010), _mm_set_epi32(3, 2, 1, 0)); + __m128i v011_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_011), _mm_set_epi32(3, 2, 1, 0)); + __m128i v100_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_100), _mm_set_epi32(3, 2, 1, 0)); + __m128i v101_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_101), _mm_set_epi32(3, 2, 1, 0)); + __m128i v110_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_110), _mm_set_epi32(3, 2, 1, 0)); + __m128i v111_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_111), _mm_set_epi32(3, 2, 1, 0)); + + float in_bound_000 = *reinterpret_cast(offset_ptr_000) >= 0 ? -1.0f : 0.0f; + float in_bound_001 = *reinterpret_cast(offset_ptr_001) >= 0 ? -1.0f : 0.0f; + float in_bound_010 = *reinterpret_cast(offset_ptr_010) >= 0 ? -1.0f : 0.0f; + float in_bound_011 = *reinterpret_cast(offset_ptr_011) >= 0 ? -1.0f : 0.0f; + float in_bound_100 = *reinterpret_cast(offset_ptr_100) >= 0 ? -1.0f : 0.0f; + float in_bound_101 = *reinterpret_cast(offset_ptr_101) >= 0 ? -1.0f : 0.0f; + float in_bound_110 = *reinterpret_cast(offset_ptr_110) >= 0 ? -1.0f : 0.0f; + float in_bound_111 = *reinterpret_cast(offset_ptr_111) >= 0 ? -1.0f : 0.0f; + + __m128 v000_val = mask_gather_ps(srcptr, v000_offset, _mm_set1_ps(in_bound_000)); + __m128 v001_val = mask_gather_ps(srcptr, v001_offset, _mm_set1_ps(in_bound_001)); + __m128 v010_val = mask_gather_ps(srcptr, v010_offset, _mm_set1_ps(in_bound_010)); + __m128 v011_val = mask_gather_ps(srcptr, v011_offset, _mm_set1_ps(in_bound_011)); + __m128 v100_val = mask_gather_ps(srcptr, v100_offset, _mm_set1_ps(in_bound_100)); + __m128 v101_val = mask_gather_ps(srcptr, v101_offset, _mm_set1_ps(in_bound_101)); + __m128 v110_val = mask_gather_ps(srcptr, v110_offset, _mm_set1_ps(in_bound_110)); + __m128 v111_val = mask_gather_ps(srcptr, v111_offset, _mm_set1_ps(in_bound_111)); + + __m128 alpha = _mm_set1_ps(*value_ptr_alpha); + __m128 beta = _mm_set1_ps(*value_ptr_beta); + __m128 gamma = _mm_set1_ps(*value_ptr_gamma); + + __m128 v00 = _mm_comp_fmadd_ps(v001_val, alpha, _mm_comp_fnmadd_ps(v000_val, alpha, v000_val)); + __m128 v01 = _mm_comp_fmadd_ps(v011_val, alpha, _mm_comp_fnmadd_ps(v010_val, alpha, v010_val)); + __m128 v10 = _mm_comp_fmadd_ps(v101_val, alpha, _mm_comp_fnmadd_ps(v100_val, alpha, v100_val)); + __m128 v11 = _mm_comp_fmadd_ps(v111_val, alpha, _mm_comp_fnmadd_ps(v110_val, alpha, v110_val)); + + __m128 v0 = _mm_comp_fmadd_ps(v01, beta, _mm_comp_fnmadd_ps(v00, beta, v00)); + __m128 v1 = _mm_comp_fmadd_ps(v11, beta, _mm_comp_fnmadd_ps(v10, beta, v10)); + + __m128 _v = _mm_comp_fmadd_ps(v1, gamma, _mm_comp_fnmadd_ps(v0, gamma, v0)); + _mm_storeu_ps(dstptr, _v); + + offset_ptr_000++; + offset_ptr_001++; + offset_ptr_010++; + offset_ptr_011++; + + offset_ptr_100++; + offset_ptr_101++; + offset_ptr_110++; + offset_ptr_111++; + + value_ptr_alpha++; + value_ptr_beta++; + value_ptr_gamma++; + + dstptr += 4; + } + } +} +#endif // __SSE2__ + +static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) +{ +#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ + if (ncnn::cpu_support_x86_avx2()) + { + gridsample_2d_bilinear_apply_interpolation_p1_avx2(src, dst, offset, value, opt); + return; + } +#endif + + const int channels = dst.c; + const int outw 
= dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_ptr_00 = offset.channel(0); + const float* offset_ptr_01 = offset.channel(1); + const float* offset_ptr_10 = offset.channel(2); + const float* offset_ptr_11 = offset.channel(3); + + const float* value_ptr_alpha = value.channel(0); + const float* value_ptr_beta = value.channel(1); + + int x = 0; +#if __SSE2__ +#if __AVX__ + + for (; x + 7 < grid_size; x += 8) + { + __m256i v00_offset = _mm256_set_epi32(*(offset_ptr_00 + 7), *(offset_ptr_00 + 6), *(offset_ptr_00 + 5), *(offset_ptr_00 + 4), *(offset_ptr_00 + 3), *(offset_ptr_00 + 2), *(offset_ptr_00 + 1), *offset_ptr_00); + __m256i v01_offset = _mm256_set_epi32(*(offset_ptr_01 + 7), *(offset_ptr_01 + 6), *(offset_ptr_01 + 5), *(offset_ptr_01 + 4), *(offset_ptr_01 + 3), *(offset_ptr_01 + 2), *(offset_ptr_01 + 1), *offset_ptr_01); + __m256i v10_offset = _mm256_set_epi32(*(offset_ptr_10 + 7), *(offset_ptr_10 + 6), *(offset_ptr_10 + 5), *(offset_ptr_10 + 4), *(offset_ptr_10 + 3), *(offset_ptr_10 + 2), *(offset_ptr_10 + 1), *offset_ptr_10); + __m256i v11_offset = _mm256_set_epi32(*(offset_ptr_11 + 7), *(offset_ptr_11 + 6), *(offset_ptr_11 + 5), *(offset_ptr_11 + 4), *(offset_ptr_11 + 3), *(offset_ptr_11 + 2), *(offset_ptr_11 + 1), *offset_ptr_11); + + __m256 v00_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_00), _mm256_set1_ps(-1.0f)); + __m256 v01_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_01), _mm256_set1_ps(-1.0f)); + __m256 v10_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_10), _mm256_set1_ps(-1.0f)); + __m256 v11_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_11), _mm256_set1_ps(-1.0f)); + + __m256 v00_val = mask_gather_ps256(srcptr, v00_offset, v00_in_bound); + __m256 v01_val = mask_gather_ps256(srcptr, v01_offset, v01_in_bound); + __m256 v10_val = mask_gather_ps256(srcptr, v10_offset, v10_in_bound); + __m256 v11_val = mask_gather_ps256(srcptr, v11_offset, v11_in_bound); + + __m256 alpha = _mm256_loadu_ps(value_ptr_alpha); + __m256 beta = _mm256_loadu_ps(value_ptr_beta); + + __m256 v0 = _mm256_comp_fmadd_ps(v01_val, alpha, _mm256_comp_fnmadd_ps(v00_val, alpha, v00_val)); + __m256 v1 = _mm256_comp_fmadd_ps(v11_val, alpha, _mm256_comp_fnmadd_ps(v10_val, alpha, v10_val)); + + __m256 _v = _mm256_comp_fmadd_ps(v1, beta, _mm256_comp_fnmadd_ps(v0, beta, v0)); + _mm256_storeu_ps(dstptr, _v); + + offset_ptr_00 += 8; + offset_ptr_01 += 8; + offset_ptr_10 += 8; + offset_ptr_11 += 8; + + value_ptr_alpha += 8; + value_ptr_beta += 8; + + dstptr += 8; + } +#endif // __AVX__ + for (; x + 3 < grid_size; x += 4) + { + __m128i v00_offset = _mm_set_epi32(*(offset_ptr_00 + 3), *(offset_ptr_00 + 2), *(offset_ptr_00 + 1), *offset_ptr_00); + __m128i v01_offset = _mm_set_epi32(*(offset_ptr_01 + 3), *(offset_ptr_01 + 2), *(offset_ptr_01 + 1), *offset_ptr_01); + __m128i v10_offset = _mm_set_epi32(*(offset_ptr_10 + 3), *(offset_ptr_10 + 2), *(offset_ptr_10 + 1), *offset_ptr_10); + __m128i v11_offset = _mm_set_epi32(*(offset_ptr_11 + 3), *(offset_ptr_11 + 2), *(offset_ptr_11 + 1), *offset_ptr_11); + + __m128 v00_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_00), _mm_set1_ps(-1.0f)); + __m128 v01_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_01), _mm_set1_ps(-1.0f)); + __m128 v10_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_10), _mm_set1_ps(-1.0f)); + __m128 
v11_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_11), _mm_set1_ps(-1.0f)); + + __m128 v00_val = mask_gather_ps(srcptr, v00_offset, v00_in_bound); + __m128 v01_val = mask_gather_ps(srcptr, v01_offset, v01_in_bound); + __m128 v10_val = mask_gather_ps(srcptr, v10_offset, v10_in_bound); + __m128 v11_val = mask_gather_ps(srcptr, v11_offset, v11_in_bound); + + __m128 alpha = _mm_loadu_ps(value_ptr_alpha); + __m128 beta = _mm_loadu_ps(value_ptr_beta); + + __m128 v0 = _mm_comp_fmadd_ps(v01_val, alpha, _mm_comp_fnmadd_ps(v00_val, alpha, v00_val)); + __m128 v1 = _mm_comp_fmadd_ps(v11_val, alpha, _mm_comp_fnmadd_ps(v10_val, alpha, v10_val)); + + __m128 _v = _mm_comp_fmadd_ps(v1, beta, _mm_comp_fnmadd_ps(v0, beta, v0)); + _mm_storeu_ps(dstptr, _v); + + offset_ptr_00 += 4; + offset_ptr_01 += 4; + offset_ptr_10 += 4; + offset_ptr_11 += 4; + + value_ptr_alpha += 4; + value_ptr_beta += 4; + + dstptr += 4; + } +#endif // __SSE2__ + for (; x < grid_size; x++) + { + float v00 = *offset_ptr_00 >= 0 ? *(srcptr + static_cast(*offset_ptr_00)) : 0; + float v01 = *offset_ptr_01 >= 0 ? *(srcptr + static_cast(*offset_ptr_01)) : 0; + float v10 = *offset_ptr_10 >= 0 ? *(srcptr + static_cast(*offset_ptr_10)) : 0; + float v11 = *offset_ptr_11 >= 0 ? *(srcptr + static_cast(*offset_ptr_11)) : 0; + + float v0 = v00 * (1 - *value_ptr_alpha) + v01 * *value_ptr_alpha; + float v1 = v10 * (1 - *value_ptr_alpha) + v11 * *value_ptr_alpha; + + *dstptr = v0 * (1 - *value_ptr_beta) + v1 * *value_ptr_beta; + + offset_ptr_00++; + offset_ptr_01++; + offset_ptr_10++; + offset_ptr_11++; + + value_ptr_alpha++; + value_ptr_beta++; + dstptr++; + } + } +} +static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) +{ +#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ + if (ncnn::cpu_support_x86_avx2()) + { + gridsample_3d_bilinear_apply_interpolation_p1_avx2(src, dst, offset, value, opt); + return; + } +#endif + + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_ptr_000 = offset.channel(0); + const float* offset_ptr_001 = offset.channel(1); + const float* offset_ptr_010 = offset.channel(2); + const float* offset_ptr_011 = offset.channel(3); + const float* offset_ptr_100 = offset.channel(4); + const float* offset_ptr_101 = offset.channel(5); + const float* offset_ptr_110 = offset.channel(6); + const float* offset_ptr_111 = offset.channel(7); + + const float* value_ptr_alpha = value.channel(0); + const float* value_ptr_beta = value.channel(1); + const float* value_ptr_gamma = value.channel(2); + + int x = 0; +#if __SSE2__ +#if __AVX__ + for (; x + 7 < grid_size; x += 8) + { + __m256i v000_offset = _mm256_set_epi32(*(offset_ptr_000 + 7), *(offset_ptr_000 + 6), *(offset_ptr_000 + 5), *(offset_ptr_000 + 4), *(offset_ptr_000 + 3), *(offset_ptr_000 + 2), *(offset_ptr_000 + 1), *offset_ptr_000); + __m256i v001_offset = _mm256_set_epi32(*(offset_ptr_001 + 7), *(offset_ptr_001 + 6), *(offset_ptr_001 + 5), *(offset_ptr_001 + 4), *(offset_ptr_001 + 3), *(offset_ptr_001 + 2), *(offset_ptr_001 + 1), *offset_ptr_001); + __m256i v010_offset = _mm256_set_epi32(*(offset_ptr_010 + 7), *(offset_ptr_010 + 6), *(offset_ptr_010 + 5), *(offset_ptr_010 + 4), *(offset_ptr_010 + 3), 
*(offset_ptr_010 + 2), *(offset_ptr_010 + 1), *offset_ptr_010); + __m256i v011_offset = _mm256_set_epi32(*(offset_ptr_011 + 7), *(offset_ptr_011 + 6), *(offset_ptr_011 + 5), *(offset_ptr_011 + 4), *(offset_ptr_011 + 3), *(offset_ptr_011 + 2), *(offset_ptr_011 + 1), *offset_ptr_011); + __m256i v100_offset = _mm256_set_epi32(*(offset_ptr_100 + 7), *(offset_ptr_100 + 6), *(offset_ptr_100 + 5), *(offset_ptr_100 + 4), *(offset_ptr_100 + 3), *(offset_ptr_100 + 2), *(offset_ptr_100 + 1), *offset_ptr_100); + __m256i v101_offset = _mm256_set_epi32(*(offset_ptr_101 + 7), *(offset_ptr_101 + 6), *(offset_ptr_101 + 5), *(offset_ptr_101 + 4), *(offset_ptr_101 + 3), *(offset_ptr_101 + 2), *(offset_ptr_101 + 1), *offset_ptr_101); + __m256i v110_offset = _mm256_set_epi32(*(offset_ptr_110 + 7), *(offset_ptr_110 + 6), *(offset_ptr_110 + 5), *(offset_ptr_110 + 4), *(offset_ptr_110 + 3), *(offset_ptr_110 + 2), *(offset_ptr_110 + 1), *offset_ptr_110); + __m256i v111_offset = _mm256_set_epi32(*(offset_ptr_111 + 7), *(offset_ptr_111 + 6), *(offset_ptr_111 + 5), *(offset_ptr_111 + 4), *(offset_ptr_111 + 3), *(offset_ptr_111 + 2), *(offset_ptr_111 + 1), *offset_ptr_111); + + __m256 v000_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_000), _mm256_set1_ps(-1.0f)); + __m256 v001_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_001), _mm256_set1_ps(-1.0f)); + __m256 v010_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_010), _mm256_set1_ps(-1.0f)); + __m256 v011_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_011), _mm256_set1_ps(-1.0f)); + __m256 v100_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_100), _mm256_set1_ps(-1.0f)); + __m256 v101_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_101), _mm256_set1_ps(-1.0f)); + __m256 v110_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_110), _mm256_set1_ps(-1.0f)); + __m256 v111_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_111), _mm256_set1_ps(-1.0f)); + + __m256 v000_val = mask_gather_ps256(srcptr, v000_offset, v000_in_bound); + __m256 v001_val = mask_gather_ps256(srcptr, v001_offset, v001_in_bound); + __m256 v010_val = mask_gather_ps256(srcptr, v010_offset, v010_in_bound); + __m256 v011_val = mask_gather_ps256(srcptr, v011_offset, v011_in_bound); + __m256 v100_val = mask_gather_ps256(srcptr, v100_offset, v100_in_bound); + __m256 v101_val = mask_gather_ps256(srcptr, v101_offset, v101_in_bound); + __m256 v110_val = mask_gather_ps256(srcptr, v110_offset, v110_in_bound); + __m256 v111_val = mask_gather_ps256(srcptr, v111_offset, v111_in_bound); + + __m256 alpha = _mm256_loadu_ps(value_ptr_alpha); + __m256 beta = _mm256_loadu_ps(value_ptr_beta); + __m256 gamma = _mm256_loadu_ps(value_ptr_gamma); + + __m256 v00 = _mm256_comp_fmadd_ps(v001_val, alpha, _mm256_comp_fnmadd_ps(v000_val, alpha, v000_val)); + __m256 v01 = _mm256_comp_fmadd_ps(v011_val, alpha, _mm256_comp_fnmadd_ps(v010_val, alpha, v010_val)); + __m256 v10 = _mm256_comp_fmadd_ps(v101_val, alpha, _mm256_comp_fnmadd_ps(v100_val, alpha, v100_val)); + __m256 v11 = _mm256_comp_fmadd_ps(v111_val, alpha, _mm256_comp_fnmadd_ps(v110_val, alpha, v110_val)); + + __m256 v0 = _mm256_comp_fmadd_ps(v01, beta, _mm256_comp_fnmadd_ps(v00, beta, v00)); + __m256 v1 = _mm256_comp_fmadd_ps(v11, beta, _mm256_comp_fnmadd_ps(v10, beta, v10)); + + __m256 _v = _mm256_comp_fmadd_ps(v1, gamma, _mm256_comp_fnmadd_ps(v0, gamma, v0)); + _mm256_storeu_ps(dstptr, _v); + + offset_ptr_000 += 8; + offset_ptr_001 += 8; + offset_ptr_010 += 8; + offset_ptr_011 += 8; + + offset_ptr_100 += 8; + 
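+            // all eight corner-offset streams and the three weight streams advance in lock-step:
+            // this pack-1 AVX path consumes 8 grid samples per iteration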
offset_ptr_101 += 8; + offset_ptr_110 += 8; + offset_ptr_111 += 8; + + value_ptr_alpha += 8; + value_ptr_beta += 8; + value_ptr_gamma += 8; + + dstptr += 8; + } + +#endif // __AVX__ + for (; x + 3 < grid_size; x += 4) + { + __m128i v000_offset = _mm_set_epi32(*(offset_ptr_000 + 3), *(offset_ptr_000 + 2), *(offset_ptr_000 + 1), *offset_ptr_000); + __m128i v001_offset = _mm_set_epi32(*(offset_ptr_001 + 3), *(offset_ptr_001 + 2), *(offset_ptr_001 + 1), *offset_ptr_001); + __m128i v010_offset = _mm_set_epi32(*(offset_ptr_010 + 3), *(offset_ptr_010 + 2), *(offset_ptr_010 + 1), *offset_ptr_010); + __m128i v011_offset = _mm_set_epi32(*(offset_ptr_011 + 3), *(offset_ptr_011 + 2), *(offset_ptr_011 + 1), *offset_ptr_011); + __m128i v100_offset = _mm_set_epi32(*(offset_ptr_100 + 3), *(offset_ptr_100 + 2), *(offset_ptr_100 + 1), *offset_ptr_100); + __m128i v101_offset = _mm_set_epi32(*(offset_ptr_101 + 3), *(offset_ptr_101 + 2), *(offset_ptr_101 + 1), *offset_ptr_101); + __m128i v110_offset = _mm_set_epi32(*(offset_ptr_110 + 3), *(offset_ptr_110 + 2), *(offset_ptr_110 + 1), *offset_ptr_110); + __m128i v111_offset = _mm_set_epi32(*(offset_ptr_111 + 3), *(offset_ptr_111 + 2), *(offset_ptr_111 + 1), *offset_ptr_111); + + __m128 v000_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_000), _mm_set1_ps(-1.0f)); + __m128 v001_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_001), _mm_set1_ps(-1.0f)); + __m128 v010_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_010), _mm_set1_ps(-1.0f)); + __m128 v011_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_011), _mm_set1_ps(-1.0f)); + __m128 v100_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_100), _mm_set1_ps(-1.0f)); + __m128 v101_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_101), _mm_set1_ps(-1.0f)); + __m128 v110_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_110), _mm_set1_ps(-1.0f)); + __m128 v111_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_111), _mm_set1_ps(-1.0f)); + + __m128 v000_val = mask_gather_ps(srcptr, v000_offset, v000_in_bound); + __m128 v001_val = mask_gather_ps(srcptr, v001_offset, v001_in_bound); + __m128 v010_val = mask_gather_ps(srcptr, v010_offset, v010_in_bound); + __m128 v011_val = mask_gather_ps(srcptr, v011_offset, v011_in_bound); + __m128 v100_val = mask_gather_ps(srcptr, v100_offset, v100_in_bound); + __m128 v101_val = mask_gather_ps(srcptr, v101_offset, v101_in_bound); + __m128 v110_val = mask_gather_ps(srcptr, v110_offset, v110_in_bound); + __m128 v111_val = mask_gather_ps(srcptr, v111_offset, v111_in_bound); + + __m128 alpha = _mm_loadu_ps(value_ptr_alpha); + __m128 beta = _mm_loadu_ps(value_ptr_beta); + __m128 gamma = _mm_loadu_ps(value_ptr_gamma); + + __m128 v00 = _mm_comp_fmadd_ps(v001_val, alpha, _mm_comp_fnmadd_ps(v000_val, alpha, v000_val)); + __m128 v01 = _mm_comp_fmadd_ps(v011_val, alpha, _mm_comp_fnmadd_ps(v010_val, alpha, v010_val)); + __m128 v10 = _mm_comp_fmadd_ps(v101_val, alpha, _mm_comp_fnmadd_ps(v100_val, alpha, v100_val)); + __m128 v11 = _mm_comp_fmadd_ps(v111_val, alpha, _mm_comp_fnmadd_ps(v110_val, alpha, v110_val)); + + __m128 v0 = _mm_comp_fmadd_ps(v01, beta, _mm_comp_fnmadd_ps(v00, beta, v00)); + __m128 v1 = _mm_comp_fmadd_ps(v11, beta, _mm_comp_fnmadd_ps(v10, beta, v10)); + + __m128 _v = _mm_comp_fmadd_ps(v1, gamma, _mm_comp_fnmadd_ps(v0, gamma, v0)); + _mm_storeu_ps(dstptr, _v); + + offset_ptr_000 += 4; + offset_ptr_001 += 4; + offset_ptr_010 += 4; + offset_ptr_011 += 4; + + offset_ptr_100 += 4; + offset_ptr_101 += 4; + offset_ptr_110 += 4; + offset_ptr_111 += 4; + + value_ptr_alpha += 4; + 
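+            // alpha, beta and gamma are the per-sample lerp weights along x, y and z;
+            // the 4-wide SSE path consumes 4 of each per iteration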
value_ptr_beta += 4; + value_ptr_gamma += 4; + + dstptr += 4; + } +#endif // __SSE2__ + for (; x < grid_size; x++) + { + float v000 = *reinterpret_cast(offset_ptr_000) >= 0 ? *(srcptr + static_cast(*offset_ptr_000)) : 0; + float v001 = *reinterpret_cast(offset_ptr_001) >= 0 ? *(srcptr + static_cast(*offset_ptr_001)) : 0; + float v010 = *reinterpret_cast(offset_ptr_010) >= 0 ? *(srcptr + static_cast(*offset_ptr_010)) : 0; + float v011 = *reinterpret_cast(offset_ptr_011) >= 0 ? *(srcptr + static_cast(*offset_ptr_011)) : 0; + + float v100 = *reinterpret_cast(offset_ptr_100) >= 0 ? *(srcptr + static_cast(*offset_ptr_100)) : 0; + float v101 = *reinterpret_cast(offset_ptr_101) >= 0 ? *(srcptr + static_cast(*offset_ptr_101)) : 0; + float v110 = *reinterpret_cast(offset_ptr_110) >= 0 ? *(srcptr + static_cast(*offset_ptr_110)) : 0; + float v111 = *reinterpret_cast(offset_ptr_111) >= 0 ? *(srcptr + static_cast(*offset_ptr_111)) : 0; + + float v00 = v000 * (1 - *value_ptr_alpha) + v001 * *value_ptr_alpha; + float v01 = v010 * (1 - *value_ptr_alpha) + v011 * *value_ptr_alpha; + float v10 = v100 * (1 - *value_ptr_alpha) + v101 * *value_ptr_alpha; + float v11 = v110 * (1 - *value_ptr_alpha) + v111 * *value_ptr_alpha; + + float v0 = v00 * (1 - *value_ptr_beta) + v01 * *value_ptr_beta; + float v1 = v10 * (1 - *value_ptr_beta) + v11 * *value_ptr_beta; + + *dstptr = v0 * (1 - *value_ptr_gamma) + v1 * *value_ptr_gamma; + + offset_ptr_000++; + offset_ptr_001++; + offset_ptr_010++; + offset_ptr_011++; + + offset_ptr_100++; + offset_ptr_101++; + offset_ptr_110++; + offset_ptr_111++; + + value_ptr_alpha++; + value_ptr_beta++; + value_ptr_gamma++; + dstptr++; + } + } +} \ No newline at end of file diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 33e6f122696..ed5e5934b41 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -696,779 +696,3 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o } } } - -#if __SSE2__ -#if __AVX__ -#if __AVX512F__ -static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) -{ - const int channels = dst.c; - const int outw = dst.w; - const int outh = dst.h; - const int grid_size = outw * outh; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* srcptr = src.channel(q); - float* dstptr = dst.channel(q); - - const float* offset_ptr_00 = offset.channel(0); - const float* offset_ptr_01 = offset.channel(1); - const float* offset_ptr_10 = offset.channel(2); - const float* offset_ptr_11 = offset.channel(3); - - const float* value_ptr_alpha = value.channel(0); - const float* value_ptr_beta = value.channel(1); - - for (int i = 0; i < grid_size; i++) - { - __m512i v00_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_00), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i v01_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_01), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i v10_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_10), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i v11_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_11), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - - __mmask16 mask00 = *reinterpret_cast(offset_ptr_00) 
>= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); - __mmask16 mask01 = *reinterpret_cast(offset_ptr_01) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); - __mmask16 mask10 = *reinterpret_cast(offset_ptr_10) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); - __mmask16 mask11 = *reinterpret_cast(offset_ptr_11) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); - - __m512 v00_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask00, v00_offset, srcptr, sizeof(float)); - __m512 v01_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask01, v01_offset, srcptr, sizeof(float)); - __m512 v10_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask10, v10_offset, srcptr, sizeof(float)); - __m512 v11_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask11, v11_offset, srcptr, sizeof(float)); - - __m512 alpha = _mm512_set1_ps(*value_ptr_alpha); - __m512 beta = _mm512_set1_ps(*value_ptr_beta); - - __m512 v0 = _mm512_fmadd_ps(v01_val, alpha, _mm512_fnmadd_ps(v00_val, alpha, v00_val)); - __m512 v1 = _mm512_fmadd_ps(v11_val, alpha, _mm512_fnmadd_ps(v10_val, alpha, v10_val)); - - __m512 _v = _mm512_fmadd_ps(v1, beta, _mm512_fnmadd_ps(v0, beta, v0)); - _mm512_storeu_ps(dstptr, _v); - - offset_ptr_00++; - offset_ptr_01++; - offset_ptr_10++; - offset_ptr_11++; - - value_ptr_alpha++; - value_ptr_beta++; - - dstptr += 16; - } - } -} -static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) -{ - const int channels = dst.c; - const int outw = dst.w; - const int outh = dst.h; - const int outd = dst.d; - const int grid_size = outw * outh * outd; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* srcptr = src.channel(q); - float* dstptr = dst.channel(q); - - const float* offset_ptr_000 = offset.channel(0); - const float* offset_ptr_001 = offset.channel(1); - const float* offset_ptr_010 = offset.channel(2); - const float* offset_ptr_011 = offset.channel(3); - const float* offset_ptr_100 = offset.channel(4); - const float* offset_ptr_101 = offset.channel(5); - const float* offset_ptr_110 = offset.channel(6); - const float* offset_ptr_111 = offset.channel(7); - - const float* value_ptr_alpha = value.channel(0); - const float* value_ptr_beta = value.channel(1); - const float* value_ptr_gamma = value.channel(2); - - for (int i = 0; i < grid_size; i++) - { - __m512i v000_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_000), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i v001_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_001), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i v010_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_010), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i v011_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_011), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i v100_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_100), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i v101_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_101), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i v110_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_110), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i 
v111_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_111), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - - __m512 v000_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_000) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v000_offset, srcptr, sizeof(float)); - __m512 v001_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_001) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v001_offset, srcptr, sizeof(float)); - __m512 v010_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_010) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v010_offset, srcptr, sizeof(float)); - __m512 v011_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_011) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v011_offset, srcptr, sizeof(float)); - __m512 v100_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_100) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v100_offset, srcptr, sizeof(float)); - __m512 v101_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_101) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v101_offset, srcptr, sizeof(float)); - __m512 v110_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_110) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v110_offset, srcptr, sizeof(float)); - __m512 v111_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_111) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v111_offset, srcptr, sizeof(float)); - - __m512 alpha = _mm512_set1_ps(*value_ptr_alpha); - __m512 beta = _mm512_set1_ps(*value_ptr_beta); - __m512 gamma = _mm512_set1_ps(*value_ptr_gamma); - - __m512 v00 = _mm512_fmadd_ps(v001_val, alpha, _mm512_fnmadd_ps(v000_val, alpha, v000_val)); - __m512 v01 = _mm512_fmadd_ps(v011_val, alpha, _mm512_fnmadd_ps(v010_val, alpha, v010_val)); - __m512 v10 = _mm512_fmadd_ps(v101_val, alpha, _mm512_fnmadd_ps(v100_val, alpha, v100_val)); - __m512 v11 = _mm512_fmadd_ps(v111_val, alpha, _mm512_fnmadd_ps(v110_val, alpha, v110_val)); - - __m512 v0 = _mm512_fmadd_ps(v01, beta, _mm512_fnmadd_ps(v00, beta, v00)); - __m512 v1 = _mm512_fmadd_ps(v11, beta, _mm512_fnmadd_ps(v10, beta, v10)); - - __m512 _v = _mm512_fmadd_ps(v1, gamma, _mm512_fnmadd_ps(v0, gamma, v0)); - _mm512_storeu_ps(dstptr, _v); - - offset_ptr_000++; - offset_ptr_001++; - offset_ptr_010++; - offset_ptr_011++; - - offset_ptr_100++; - offset_ptr_101++; - offset_ptr_110++; - offset_ptr_111++; - - value_ptr_alpha++; - value_ptr_beta++; - value_ptr_gamma++; - - dstptr += 16; - } - } -} - -#endif // __AVX512F__ -static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) -{ - const int channels = dst.c; - const int outw = dst.w; - const int outh = dst.h; - const int grid_size = outw * outh; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* srcptr = src.channel(q); - float* dstptr = dst.channel(q); - - const float* offset_ptr_00 = offset.channel(0); - const float* offset_ptr_01 = offset.channel(1); - const float* offset_ptr_10 = offset.channel(2); - const float* offset_ptr_11 = offset.channel(3); - - const float* value_ptr_alpha = 
value.channel(0); - const float* value_ptr_beta = value.channel(1); - - for (int i = 0; i < grid_size; i++) - { -#if __AVX2__ - __m256i v00_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_00), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v01_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_01), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v10_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_10), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v11_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_11), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); -#else - __m256i v00_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_00), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v01_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_01), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v10_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_10), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v11_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_11), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); -#endif // __AVX2__ - - float in_bound_00 = *reinterpret_cast(offset_ptr_00) >= 0 ? -1.0f : 0.0f; - float in_bound_01 = *reinterpret_cast(offset_ptr_01) >= 0 ? -1.0f : 0.0f; - float in_bound_10 = *reinterpret_cast(offset_ptr_10) >= 0 ? -1.0f : 0.0f; - float in_bound_11 = *reinterpret_cast(offset_ptr_11) >= 0 ? -1.0f : 0.0f; - - __m256 v00_val = mask_gather_ps256(srcptr, v00_offset, _mm256_set1_ps(in_bound_00)); - __m256 v01_val = mask_gather_ps256(srcptr, v01_offset, _mm256_set1_ps(in_bound_01)); - __m256 v10_val = mask_gather_ps256(srcptr, v10_offset, _mm256_set1_ps(in_bound_10)); - __m256 v11_val = mask_gather_ps256(srcptr, v11_offset, _mm256_set1_ps(in_bound_11)); - - __m256 alpha = _mm256_set1_ps(*value_ptr_alpha); - __m256 beta = _mm256_set1_ps(*value_ptr_beta); - - __m256 v0 = _mm256_comp_fmadd_ps(v01_val, alpha, _mm256_comp_fnmadd_ps(v00_val, alpha, v00_val)); - __m256 v1 = _mm256_comp_fmadd_ps(v11_val, alpha, _mm256_comp_fnmadd_ps(v10_val, alpha, v10_val)); - - __m256 _v = _mm256_comp_fmadd_ps(v1, beta, _mm256_comp_fnmadd_ps(v0, beta, v0)); - _mm256_storeu_ps(dstptr, _v); - - offset_ptr_00++; - offset_ptr_01++; - offset_ptr_10++; - offset_ptr_11++; - - value_ptr_alpha++; - value_ptr_beta++; - - dstptr += 8; - } - } -} -static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) -{ - const int channels = dst.c; - const int outw = dst.w; - const int outh = dst.h; - const int outd = dst.d; - const int grid_size = outw * outh * outd; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* srcptr = src.channel(q); - float* dstptr = dst.channel(q); - - const float* offset_ptr_000 = offset.channel(0); - const float* offset_ptr_001 = offset.channel(1); - const float* offset_ptr_010 = offset.channel(2); - const float* offset_ptr_011 = offset.channel(3); - const float* offset_ptr_100 = offset.channel(4); - const float* offset_ptr_101 = offset.channel(5); - const float* offset_ptr_110 = offset.channel(6); - const float* offset_ptr_111 = offset.channel(7); - - const float* value_ptr_alpha = value.channel(0); - const float* value_ptr_beta = value.channel(1); - const float* value_ptr_gamma = value.channel(2); - - for (int i = 0; i < grid_size; i++) - { -#if __AVX2__ - __m256i v000_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_000), _mm256_set_epi32(7, 
6, 5, 4, 3, 2, 1, 0)); - __m256i v001_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_001), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v010_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_010), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v011_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_011), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v100_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_100), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v101_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_101), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v110_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_110), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v111_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_111), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); -#else - __m256i v000_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_000), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v001_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_001), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v010_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_010), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v011_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_011), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v100_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_100), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v101_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_101), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v110_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_110), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v111_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_111), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); -#endif // __AVX2__ - - float in_bound_000 = *reinterpret_cast(offset_ptr_000) >= 0 ? -1.0f : 0.0f; - float in_bound_001 = *reinterpret_cast(offset_ptr_001) >= 0 ? -1.0f : 0.0f; - float in_bound_010 = *reinterpret_cast(offset_ptr_010) >= 0 ? -1.0f : 0.0f; - float in_bound_011 = *reinterpret_cast(offset_ptr_011) >= 0 ? -1.0f : 0.0f; - float in_bound_100 = *reinterpret_cast(offset_ptr_100) >= 0 ? -1.0f : 0.0f; - float in_bound_101 = *reinterpret_cast(offset_ptr_101) >= 0 ? -1.0f : 0.0f; - float in_bound_110 = *reinterpret_cast(offset_ptr_110) >= 0 ? -1.0f : 0.0f; - float in_bound_111 = *reinterpret_cast(offset_ptr_111) >= 0 ? 
-1.0f : 0.0f; - - __m256 v000_val = mask_gather_ps256(srcptr, v000_offset, _mm256_set1_ps(in_bound_000)); - __m256 v001_val = mask_gather_ps256(srcptr, v001_offset, _mm256_set1_ps(in_bound_001)); - __m256 v010_val = mask_gather_ps256(srcptr, v010_offset, _mm256_set1_ps(in_bound_010)); - __m256 v011_val = mask_gather_ps256(srcptr, v011_offset, _mm256_set1_ps(in_bound_011)); - __m256 v100_val = mask_gather_ps256(srcptr, v100_offset, _mm256_set1_ps(in_bound_100)); - __m256 v101_val = mask_gather_ps256(srcptr, v101_offset, _mm256_set1_ps(in_bound_101)); - __m256 v110_val = mask_gather_ps256(srcptr, v110_offset, _mm256_set1_ps(in_bound_110)); - __m256 v111_val = mask_gather_ps256(srcptr, v111_offset, _mm256_set1_ps(in_bound_111)); - - __m256 alpha = _mm256_set1_ps(*value_ptr_alpha); - __m256 beta = _mm256_set1_ps(*value_ptr_beta); - __m256 gamma = _mm256_set1_ps(*value_ptr_gamma); - - __m256 v00 = _mm256_comp_fmadd_ps(v001_val, alpha, _mm256_comp_fnmadd_ps(v000_val, alpha, v000_val)); - __m256 v01 = _mm256_comp_fmadd_ps(v011_val, alpha, _mm256_comp_fnmadd_ps(v010_val, alpha, v010_val)); - __m256 v10 = _mm256_comp_fmadd_ps(v101_val, alpha, _mm256_comp_fnmadd_ps(v100_val, alpha, v100_val)); - __m256 v11 = _mm256_comp_fmadd_ps(v111_val, alpha, _mm256_comp_fnmadd_ps(v110_val, alpha, v110_val)); - - __m256 v0 = _mm256_comp_fmadd_ps(v01, beta, _mm256_comp_fnmadd_ps(v00, beta, v00)); - __m256 v1 = _mm256_comp_fmadd_ps(v11, beta, _mm256_comp_fnmadd_ps(v10, beta, v10)); - - __m256 _v = _mm256_comp_fmadd_ps(v1, gamma, _mm256_comp_fnmadd_ps(v0, gamma, v0)); - _mm256_storeu_ps(dstptr, _v); - - offset_ptr_000++; - offset_ptr_001++; - offset_ptr_010++; - offset_ptr_011++; - - offset_ptr_100++; - offset_ptr_101++; - offset_ptr_110++; - offset_ptr_111++; - - value_ptr_alpha++; - value_ptr_beta++; - value_ptr_gamma++; - - dstptr += 8; - } - } -} -#endif // __AVX__ -static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) -{ - const int channels = dst.c; - const int outw = dst.w; - const int outh = dst.h; - const int grid_size = outw * outh; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* srcptr = src.channel(q); - float* dstptr = dst.channel(q); - - const float* offset_ptr_00 = offset.channel(0); - const float* offset_ptr_01 = offset.channel(1); - const float* offset_ptr_10 = offset.channel(2); - const float* offset_ptr_11 = offset.channel(3); - - const float* value_ptr_alpha = value.channel(0); - const float* value_ptr_beta = value.channel(1); - - for (int i = 0; i < grid_size; i++) - { - __m128i v00_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_00), _mm_set_epi32(3, 2, 1, 0)); - __m128i v01_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_01), _mm_set_epi32(3, 2, 1, 0)); - __m128i v10_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_10), _mm_set_epi32(3, 2, 1, 0)); - __m128i v11_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_11), _mm_set_epi32(3, 2, 1, 0)); - - float in_bound_00 = *reinterpret_cast(offset_ptr_00) >= 0 ? -1.0f : 0.0f; - float in_bound_01 = *reinterpret_cast(offset_ptr_01) >= 0 ? -1.0f : 0.0f; - float in_bound_10 = *reinterpret_cast(offset_ptr_10) >= 0 ? -1.0f : 0.0f; - float in_bound_11 = *reinterpret_cast(offset_ptr_11) >= 0 ? 
-1.0f : 0.0f; - - __m128 v00_val = mask_gather_ps(srcptr, v00_offset, _mm_set1_ps(in_bound_00)); - __m128 v01_val = mask_gather_ps(srcptr, v01_offset, _mm_set1_ps(in_bound_01)); - __m128 v10_val = mask_gather_ps(srcptr, v10_offset, _mm_set1_ps(in_bound_10)); - __m128 v11_val = mask_gather_ps(srcptr, v11_offset, _mm_set1_ps(in_bound_11)); - - __m128 alpha = _mm_set1_ps(*value_ptr_alpha); - __m128 beta = _mm_set1_ps(*value_ptr_beta); - - __m128 v0 = _mm_comp_fmadd_ps(v01_val, alpha, _mm_comp_fnmadd_ps(v00_val, alpha, v00_val)); - __m128 v1 = _mm_comp_fmadd_ps(v11_val, alpha, _mm_comp_fnmadd_ps(v10_val, alpha, v10_val)); - - __m128 _v = _mm_comp_fmadd_ps(v1, beta, _mm_comp_fnmadd_ps(v0, beta, v0)); - _mm_storeu_ps(dstptr, _v); - - offset_ptr_00++; - offset_ptr_01++; - offset_ptr_10++; - offset_ptr_11++; - - value_ptr_alpha++; - value_ptr_beta++; - - dstptr += 4; - } - } -} -static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) -{ - const int channels = dst.c; - const int outw = dst.w; - const int outh = dst.h; - const int outd = dst.d; - const int grid_size = outw * outh * outd; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* srcptr = src.channel(q); - float* dstptr = dst.channel(q); - - const float* offset_ptr_000 = offset.channel(0); - const float* offset_ptr_001 = offset.channel(1); - const float* offset_ptr_010 = offset.channel(2); - const float* offset_ptr_011 = offset.channel(3); - const float* offset_ptr_100 = offset.channel(4); - const float* offset_ptr_101 = offset.channel(5); - const float* offset_ptr_110 = offset.channel(6); - const float* offset_ptr_111 = offset.channel(7); - - const float* value_ptr_alpha = value.channel(0); - const float* value_ptr_beta = value.channel(1); - const float* value_ptr_gamma = value.channel(2); - - for (int i = 0; i < grid_size; i++) - { - __m128i v000_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_000), _mm_set_epi32(3, 2, 1, 0)); - __m128i v001_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_001), _mm_set_epi32(3, 2, 1, 0)); - __m128i v010_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_010), _mm_set_epi32(3, 2, 1, 0)); - __m128i v011_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_011), _mm_set_epi32(3, 2, 1, 0)); - __m128i v100_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_100), _mm_set_epi32(3, 2, 1, 0)); - __m128i v101_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_101), _mm_set_epi32(3, 2, 1, 0)); - __m128i v110_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_110), _mm_set_epi32(3, 2, 1, 0)); - __m128i v111_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_111), _mm_set_epi32(3, 2, 1, 0)); - - float in_bound_000 = *reinterpret_cast(offset_ptr_000) >= 0 ? -1.0f : 0.0f; - float in_bound_001 = *reinterpret_cast(offset_ptr_001) >= 0 ? -1.0f : 0.0f; - float in_bound_010 = *reinterpret_cast(offset_ptr_010) >= 0 ? -1.0f : 0.0f; - float in_bound_011 = *reinterpret_cast(offset_ptr_011) >= 0 ? -1.0f : 0.0f; - float in_bound_100 = *reinterpret_cast(offset_ptr_100) >= 0 ? -1.0f : 0.0f; - float in_bound_101 = *reinterpret_cast(offset_ptr_101) >= 0 ? -1.0f : 0.0f; - float in_bound_110 = *reinterpret_cast(offset_ptr_110) >= 0 ? -1.0f : 0.0f; - float in_bound_111 = *reinterpret_cast(offset_ptr_111) >= 0 ? 
-1.0f : 0.0f; - - __m128 v000_val = mask_gather_ps(srcptr, v000_offset, _mm_set1_ps(in_bound_000)); - __m128 v001_val = mask_gather_ps(srcptr, v001_offset, _mm_set1_ps(in_bound_001)); - __m128 v010_val = mask_gather_ps(srcptr, v010_offset, _mm_set1_ps(in_bound_010)); - __m128 v011_val = mask_gather_ps(srcptr, v011_offset, _mm_set1_ps(in_bound_011)); - __m128 v100_val = mask_gather_ps(srcptr, v100_offset, _mm_set1_ps(in_bound_100)); - __m128 v101_val = mask_gather_ps(srcptr, v101_offset, _mm_set1_ps(in_bound_101)); - __m128 v110_val = mask_gather_ps(srcptr, v110_offset, _mm_set1_ps(in_bound_110)); - __m128 v111_val = mask_gather_ps(srcptr, v111_offset, _mm_set1_ps(in_bound_111)); - - __m128 alpha = _mm_set1_ps(*value_ptr_alpha); - __m128 beta = _mm_set1_ps(*value_ptr_beta); - __m128 gamma = _mm_set1_ps(*value_ptr_gamma); - - __m128 v00 = _mm_comp_fmadd_ps(v001_val, alpha, _mm_comp_fnmadd_ps(v000_val, alpha, v000_val)); - __m128 v01 = _mm_comp_fmadd_ps(v011_val, alpha, _mm_comp_fnmadd_ps(v010_val, alpha, v010_val)); - __m128 v10 = _mm_comp_fmadd_ps(v101_val, alpha, _mm_comp_fnmadd_ps(v100_val, alpha, v100_val)); - __m128 v11 = _mm_comp_fmadd_ps(v111_val, alpha, _mm_comp_fnmadd_ps(v110_val, alpha, v110_val)); - - __m128 v0 = _mm_comp_fmadd_ps(v01, beta, _mm_comp_fnmadd_ps(v00, beta, v00)); - __m128 v1 = _mm_comp_fmadd_ps(v11, beta, _mm_comp_fnmadd_ps(v10, beta, v10)); - - __m128 _v = _mm_comp_fmadd_ps(v1, gamma, _mm_comp_fnmadd_ps(v0, gamma, v0)); - _mm_storeu_ps(dstptr, _v); - - offset_ptr_000++; - offset_ptr_001++; - offset_ptr_010++; - offset_ptr_011++; - - offset_ptr_100++; - offset_ptr_101++; - offset_ptr_110++; - offset_ptr_111++; - - value_ptr_alpha++; - value_ptr_beta++; - value_ptr_gamma++; - - dstptr += 4; - } - } -} -#endif // __SSE2__ - -static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) -{ - const int channels = dst.c; - const int outw = dst.w; - const int outh = dst.h; - const int grid_size = outw * outh; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* srcptr = src.channel(q); - float* dstptr = dst.channel(q); - - const float* offset_ptr_00 = offset.channel(0); - const float* offset_ptr_01 = offset.channel(1); - const float* offset_ptr_10 = offset.channel(2); - const float* offset_ptr_11 = offset.channel(3); - - const float* value_ptr_alpha = value.channel(0); - const float* value_ptr_beta = value.channel(1); - - int x = 0; -#if __SSE2__ -#if __AVX__ - - for (; x + 7 < grid_size; x += 8) - { - __m256i v00_offset = _mm256_set_epi32(*(offset_ptr_00 + 7), *(offset_ptr_00 + 6), *(offset_ptr_00 + 5), *(offset_ptr_00 + 4), *(offset_ptr_00 + 3), *(offset_ptr_00 + 2), *(offset_ptr_00 + 1), *offset_ptr_00); - __m256i v01_offset = _mm256_set_epi32(*(offset_ptr_01 + 7), *(offset_ptr_01 + 6), *(offset_ptr_01 + 5), *(offset_ptr_01 + 4), *(offset_ptr_01 + 3), *(offset_ptr_01 + 2), *(offset_ptr_01 + 1), *offset_ptr_01); - __m256i v10_offset = _mm256_set_epi32(*(offset_ptr_10 + 7), *(offset_ptr_10 + 6), *(offset_ptr_10 + 5), *(offset_ptr_10 + 4), *(offset_ptr_10 + 3), *(offset_ptr_10 + 2), *(offset_ptr_10 + 1), *offset_ptr_10); - __m256i v11_offset = _mm256_set_epi32(*(offset_ptr_11 + 7), *(offset_ptr_11 + 6), *(offset_ptr_11 + 5), *(offset_ptr_11 + 4), *(offset_ptr_11 + 3), *(offset_ptr_11 + 2), *(offset_ptr_11 + 1), *offset_ptr_11); - - __m256 v00_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_00), _mm256_set1_ps(-1.0f)); - __m256 
v01_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_01), _mm256_set1_ps(-1.0f)); - __m256 v10_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_10), _mm256_set1_ps(-1.0f)); - __m256 v11_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_11), _mm256_set1_ps(-1.0f)); - - __m256 v00_val = mask_gather_ps256(srcptr, v00_offset, v00_in_bound); - __m256 v01_val = mask_gather_ps256(srcptr, v01_offset, v01_in_bound); - __m256 v10_val = mask_gather_ps256(srcptr, v10_offset, v10_in_bound); - __m256 v11_val = mask_gather_ps256(srcptr, v11_offset, v11_in_bound); - - __m256 alpha = _mm256_loadu_ps(value_ptr_alpha); - __m256 beta = _mm256_loadu_ps(value_ptr_beta); - - __m256 v0 = _mm256_comp_fmadd_ps(v01_val, alpha, _mm256_comp_fnmadd_ps(v00_val, alpha, v00_val)); - __m256 v1 = _mm256_comp_fmadd_ps(v11_val, alpha, _mm256_comp_fnmadd_ps(v10_val, alpha, v10_val)); - - __m256 _v = _mm256_comp_fmadd_ps(v1, beta, _mm256_comp_fnmadd_ps(v0, beta, v0)); - _mm256_storeu_ps(dstptr, _v); - - offset_ptr_00 += 8; - offset_ptr_01 += 8; - offset_ptr_10 += 8; - offset_ptr_11 += 8; - - value_ptr_alpha += 8; - value_ptr_beta += 8; - - dstptr += 8; - } -#endif // __AVX__ - for (; x + 3 < grid_size; x += 4) - { - __m128i v00_offset = _mm_set_epi32(*(offset_ptr_00 + 3), *(offset_ptr_00 + 2), *(offset_ptr_00 + 1), *offset_ptr_00); - __m128i v01_offset = _mm_set_epi32(*(offset_ptr_01 + 3), *(offset_ptr_01 + 2), *(offset_ptr_01 + 1), *offset_ptr_01); - __m128i v10_offset = _mm_set_epi32(*(offset_ptr_10 + 3), *(offset_ptr_10 + 2), *(offset_ptr_10 + 1), *offset_ptr_10); - __m128i v11_offset = _mm_set_epi32(*(offset_ptr_11 + 3), *(offset_ptr_11 + 2), *(offset_ptr_11 + 1), *offset_ptr_11); - - __m128 v00_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_00), _mm_set1_ps(-1.0f)); - __m128 v01_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_01), _mm_set1_ps(-1.0f)); - __m128 v10_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_10), _mm_set1_ps(-1.0f)); - __m128 v11_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_11), _mm_set1_ps(-1.0f)); - - __m128 v00_val = mask_gather_ps(srcptr, v00_offset, v00_in_bound); - __m128 v01_val = mask_gather_ps(srcptr, v01_offset, v01_in_bound); - __m128 v10_val = mask_gather_ps(srcptr, v10_offset, v10_in_bound); - __m128 v11_val = mask_gather_ps(srcptr, v11_offset, v11_in_bound); - - __m128 alpha = _mm_loadu_ps(value_ptr_alpha); - __m128 beta = _mm_loadu_ps(value_ptr_beta); - - __m128 v0 = _mm_comp_fmadd_ps(v01_val, alpha, _mm_comp_fnmadd_ps(v00_val, alpha, v00_val)); - __m128 v1 = _mm_comp_fmadd_ps(v11_val, alpha, _mm_comp_fnmadd_ps(v10_val, alpha, v10_val)); - - __m128 _v = _mm_comp_fmadd_ps(v1, beta, _mm_comp_fnmadd_ps(v0, beta, v0)); - _mm_storeu_ps(dstptr, _v); - - offset_ptr_00 += 4; - offset_ptr_01 += 4; - offset_ptr_10 += 4; - offset_ptr_11 += 4; - - value_ptr_alpha += 4; - value_ptr_beta += 4; - - dstptr += 4; - } -#endif // __SSE2__ - for (; x < grid_size; x++) - { - float v00 = *offset_ptr_00 >= 0 ? *(srcptr + static_cast(*offset_ptr_00)) : 0; - float v01 = *offset_ptr_01 >= 0 ? *(srcptr + static_cast(*offset_ptr_01)) : 0; - float v10 = *offset_ptr_10 >= 0 ? *(srcptr + static_cast(*offset_ptr_10)) : 0; - float v11 = *offset_ptr_11 >= 0 ? 
*(srcptr + static_cast(*offset_ptr_11)) : 0; - - float v0 = v00 * (1 - *value_ptr_alpha) + v01 * *value_ptr_alpha; - float v1 = v10 * (1 - *value_ptr_alpha) + v11 * *value_ptr_alpha; - - *dstptr = v0 * (1 - *value_ptr_beta) + v1 * *value_ptr_beta; - - offset_ptr_00++; - offset_ptr_01++; - offset_ptr_10++; - offset_ptr_11++; - - value_ptr_alpha++; - value_ptr_beta++; - dstptr++; - } - } -} -static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) -{ - const int channels = dst.c; - const int outw = dst.w; - const int outh = dst.h; - const int outd = dst.d; - const int grid_size = outw * outh * outd; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* srcptr = src.channel(q); - float* dstptr = dst.channel(q); - - const float* offset_ptr_000 = offset.channel(0); - const float* offset_ptr_001 = offset.channel(1); - const float* offset_ptr_010 = offset.channel(2); - const float* offset_ptr_011 = offset.channel(3); - const float* offset_ptr_100 = offset.channel(4); - const float* offset_ptr_101 = offset.channel(5); - const float* offset_ptr_110 = offset.channel(6); - const float* offset_ptr_111 = offset.channel(7); - - const float* value_ptr_alpha = value.channel(0); - const float* value_ptr_beta = value.channel(1); - const float* value_ptr_gamma = value.channel(2); - - int x = 0; -#if __SSE2__ -#if __AVX__ - for (; x + 7 < grid_size; x += 8) - { - __m256i v000_offset = _mm256_set_epi32(*(offset_ptr_000 + 7), *(offset_ptr_000 + 6), *(offset_ptr_000 + 5), *(offset_ptr_000 + 4), *(offset_ptr_000 + 3), *(offset_ptr_000 + 2), *(offset_ptr_000 + 1), *offset_ptr_000); - __m256i v001_offset = _mm256_set_epi32(*(offset_ptr_001 + 7), *(offset_ptr_001 + 6), *(offset_ptr_001 + 5), *(offset_ptr_001 + 4), *(offset_ptr_001 + 3), *(offset_ptr_001 + 2), *(offset_ptr_001 + 1), *offset_ptr_001); - __m256i v010_offset = _mm256_set_epi32(*(offset_ptr_010 + 7), *(offset_ptr_010 + 6), *(offset_ptr_010 + 5), *(offset_ptr_010 + 4), *(offset_ptr_010 + 3), *(offset_ptr_010 + 2), *(offset_ptr_010 + 1), *offset_ptr_010); - __m256i v011_offset = _mm256_set_epi32(*(offset_ptr_011 + 7), *(offset_ptr_011 + 6), *(offset_ptr_011 + 5), *(offset_ptr_011 + 4), *(offset_ptr_011 + 3), *(offset_ptr_011 + 2), *(offset_ptr_011 + 1), *offset_ptr_011); - __m256i v100_offset = _mm256_set_epi32(*(offset_ptr_100 + 7), *(offset_ptr_100 + 6), *(offset_ptr_100 + 5), *(offset_ptr_100 + 4), *(offset_ptr_100 + 3), *(offset_ptr_100 + 2), *(offset_ptr_100 + 1), *offset_ptr_100); - __m256i v101_offset = _mm256_set_epi32(*(offset_ptr_101 + 7), *(offset_ptr_101 + 6), *(offset_ptr_101 + 5), *(offset_ptr_101 + 4), *(offset_ptr_101 + 3), *(offset_ptr_101 + 2), *(offset_ptr_101 + 1), *offset_ptr_101); - __m256i v110_offset = _mm256_set_epi32(*(offset_ptr_110 + 7), *(offset_ptr_110 + 6), *(offset_ptr_110 + 5), *(offset_ptr_110 + 4), *(offset_ptr_110 + 3), *(offset_ptr_110 + 2), *(offset_ptr_110 + 1), *offset_ptr_110); - __m256i v111_offset = _mm256_set_epi32(*(offset_ptr_111 + 7), *(offset_ptr_111 + 6), *(offset_ptr_111 + 5), *(offset_ptr_111 + 4), *(offset_ptr_111 + 3), *(offset_ptr_111 + 2), *(offset_ptr_111 + 1), *offset_ptr_111); - - __m256 v000_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_000), _mm256_set1_ps(-1.0f)); - __m256 v001_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_001), _mm256_set1_ps(-1.0f)); - __m256 v010_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_010), 
_mm256_set1_ps(-1.0f)); - __m256 v011_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_011), _mm256_set1_ps(-1.0f)); - __m256 v100_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_100), _mm256_set1_ps(-1.0f)); - __m256 v101_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_101), _mm256_set1_ps(-1.0f)); - __m256 v110_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_110), _mm256_set1_ps(-1.0f)); - __m256 v111_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_111), _mm256_set1_ps(-1.0f)); - - __m256 v000_val = mask_gather_ps256(srcptr, v000_offset, v000_in_bound); - __m256 v001_val = mask_gather_ps256(srcptr, v001_offset, v001_in_bound); - __m256 v010_val = mask_gather_ps256(srcptr, v010_offset, v010_in_bound); - __m256 v011_val = mask_gather_ps256(srcptr, v011_offset, v011_in_bound); - __m256 v100_val = mask_gather_ps256(srcptr, v100_offset, v100_in_bound); - __m256 v101_val = mask_gather_ps256(srcptr, v101_offset, v101_in_bound); - __m256 v110_val = mask_gather_ps256(srcptr, v110_offset, v110_in_bound); - __m256 v111_val = mask_gather_ps256(srcptr, v111_offset, v111_in_bound); - - __m256 alpha = _mm256_loadu_ps(value_ptr_alpha); - __m256 beta = _mm256_loadu_ps(value_ptr_beta); - __m256 gamma = _mm256_loadu_ps(value_ptr_gamma); - - __m256 v00 = _mm256_comp_fmadd_ps(v001_val, alpha, _mm256_comp_fnmadd_ps(v000_val, alpha, v000_val)); - __m256 v01 = _mm256_comp_fmadd_ps(v011_val, alpha, _mm256_comp_fnmadd_ps(v010_val, alpha, v010_val)); - __m256 v10 = _mm256_comp_fmadd_ps(v101_val, alpha, _mm256_comp_fnmadd_ps(v100_val, alpha, v100_val)); - __m256 v11 = _mm256_comp_fmadd_ps(v111_val, alpha, _mm256_comp_fnmadd_ps(v110_val, alpha, v110_val)); - - __m256 v0 = _mm256_comp_fmadd_ps(v01, beta, _mm256_comp_fnmadd_ps(v00, beta, v00)); - __m256 v1 = _mm256_comp_fmadd_ps(v11, beta, _mm256_comp_fnmadd_ps(v10, beta, v10)); - - __m256 _v = _mm256_comp_fmadd_ps(v1, gamma, _mm256_comp_fnmadd_ps(v0, gamma, v0)); - _mm256_storeu_ps(dstptr, _v); - - offset_ptr_000 += 8; - offset_ptr_001 += 8; - offset_ptr_010 += 8; - offset_ptr_011 += 8; - - offset_ptr_100 += 8; - offset_ptr_101 += 8; - offset_ptr_110 += 8; - offset_ptr_111 += 8; - - value_ptr_alpha += 8; - value_ptr_beta += 8; - value_ptr_gamma += 8; - - dstptr += 8; - } - -#endif // __AVX__ - for (; x + 3 < grid_size; x += 4) - { - __m128i v000_offset = _mm_set_epi32(*(offset_ptr_000 + 3), *(offset_ptr_000 + 2), *(offset_ptr_000 + 1), *offset_ptr_000); - __m128i v001_offset = _mm_set_epi32(*(offset_ptr_001 + 3), *(offset_ptr_001 + 2), *(offset_ptr_001 + 1), *offset_ptr_001); - __m128i v010_offset = _mm_set_epi32(*(offset_ptr_010 + 3), *(offset_ptr_010 + 2), *(offset_ptr_010 + 1), *offset_ptr_010); - __m128i v011_offset = _mm_set_epi32(*(offset_ptr_011 + 3), *(offset_ptr_011 + 2), *(offset_ptr_011 + 1), *offset_ptr_011); - __m128i v100_offset = _mm_set_epi32(*(offset_ptr_100 + 3), *(offset_ptr_100 + 2), *(offset_ptr_100 + 1), *offset_ptr_100); - __m128i v101_offset = _mm_set_epi32(*(offset_ptr_101 + 3), *(offset_ptr_101 + 2), *(offset_ptr_101 + 1), *offset_ptr_101); - __m128i v110_offset = _mm_set_epi32(*(offset_ptr_110 + 3), *(offset_ptr_110 + 2), *(offset_ptr_110 + 1), *offset_ptr_110); - __m128i v111_offset = _mm_set_epi32(*(offset_ptr_111 + 3), *(offset_ptr_111 + 2), *(offset_ptr_111 + 1), *offset_ptr_111); - - __m128 v000_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_000), _mm_set1_ps(-1.0f)); - __m128 v001_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_001), _mm_set1_ps(-1.0f)); - __m128 v010_in_bound = 
_mm_andnot_ps(_mm_loadu_ps(offset_ptr_010), _mm_set1_ps(-1.0f)); - __m128 v011_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_011), _mm_set1_ps(-1.0f)); - __m128 v100_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_100), _mm_set1_ps(-1.0f)); - __m128 v101_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_101), _mm_set1_ps(-1.0f)); - __m128 v110_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_110), _mm_set1_ps(-1.0f)); - __m128 v111_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_111), _mm_set1_ps(-1.0f)); - - __m128 v000_val = mask_gather_ps(srcptr, v000_offset, v000_in_bound); - __m128 v001_val = mask_gather_ps(srcptr, v001_offset, v001_in_bound); - __m128 v010_val = mask_gather_ps(srcptr, v010_offset, v010_in_bound); - __m128 v011_val = mask_gather_ps(srcptr, v011_offset, v011_in_bound); - __m128 v100_val = mask_gather_ps(srcptr, v100_offset, v100_in_bound); - __m128 v101_val = mask_gather_ps(srcptr, v101_offset, v101_in_bound); - __m128 v110_val = mask_gather_ps(srcptr, v110_offset, v110_in_bound); - __m128 v111_val = mask_gather_ps(srcptr, v111_offset, v111_in_bound); - - __m128 alpha = _mm_loadu_ps(value_ptr_alpha); - __m128 beta = _mm_loadu_ps(value_ptr_beta); - __m128 gamma = _mm_loadu_ps(value_ptr_gamma); - - __m128 v00 = _mm_comp_fmadd_ps(v001_val, alpha, _mm_comp_fnmadd_ps(v000_val, alpha, v000_val)); - __m128 v01 = _mm_comp_fmadd_ps(v011_val, alpha, _mm_comp_fnmadd_ps(v010_val, alpha, v010_val)); - __m128 v10 = _mm_comp_fmadd_ps(v101_val, alpha, _mm_comp_fnmadd_ps(v100_val, alpha, v100_val)); - __m128 v11 = _mm_comp_fmadd_ps(v111_val, alpha, _mm_comp_fnmadd_ps(v110_val, alpha, v110_val)); - - __m128 v0 = _mm_comp_fmadd_ps(v01, beta, _mm_comp_fnmadd_ps(v00, beta, v00)); - __m128 v1 = _mm_comp_fmadd_ps(v11, beta, _mm_comp_fnmadd_ps(v10, beta, v10)); - - __m128 _v = _mm_comp_fmadd_ps(v1, gamma, _mm_comp_fnmadd_ps(v0, gamma, v0)); - _mm_storeu_ps(dstptr, _v); - - offset_ptr_000 += 4; - offset_ptr_001 += 4; - offset_ptr_010 += 4; - offset_ptr_011 += 4; - - offset_ptr_100 += 4; - offset_ptr_101 += 4; - offset_ptr_110 += 4; - offset_ptr_111 += 4; - - value_ptr_alpha += 4; - value_ptr_beta += 4; - value_ptr_gamma += 4; - - dstptr += 4; - } -#endif // __SSE2__ - for (; x < grid_size; x++) - { - float v000 = *reinterpret_cast(offset_ptr_000) >= 0 ? *(srcptr + static_cast(*offset_ptr_000)) : 0; - float v001 = *reinterpret_cast(offset_ptr_001) >= 0 ? *(srcptr + static_cast(*offset_ptr_001)) : 0; - float v010 = *reinterpret_cast(offset_ptr_010) >= 0 ? *(srcptr + static_cast(*offset_ptr_010)) : 0; - float v011 = *reinterpret_cast(offset_ptr_011) >= 0 ? *(srcptr + static_cast(*offset_ptr_011)) : 0; - - float v100 = *reinterpret_cast(offset_ptr_100) >= 0 ? *(srcptr + static_cast(*offset_ptr_100)) : 0; - float v101 = *reinterpret_cast(offset_ptr_101) >= 0 ? *(srcptr + static_cast(*offset_ptr_101)) : 0; - float v110 = *reinterpret_cast(offset_ptr_110) >= 0 ? *(srcptr + static_cast(*offset_ptr_110)) : 0; - float v111 = *reinterpret_cast(offset_ptr_111) >= 0 ? 
*(srcptr + static_cast(*offset_ptr_111)) : 0; - - float v00 = v000 * (1 - *value_ptr_alpha) + v001 * *value_ptr_alpha; - float v01 = v010 * (1 - *value_ptr_alpha) + v011 * *value_ptr_alpha; - float v10 = v100 * (1 - *value_ptr_alpha) + v101 * *value_ptr_alpha; - float v11 = v110 * (1 - *value_ptr_alpha) + v111 * *value_ptr_alpha; - - float v0 = v00 * (1 - *value_ptr_beta) + v01 * *value_ptr_beta; - float v1 = v10 * (1 - *value_ptr_beta) + v11 * *value_ptr_beta; - - *dstptr = v0 * (1 - *value_ptr_gamma) + v1 * *value_ptr_gamma; - - offset_ptr_000++; - offset_ptr_001++; - offset_ptr_010++; - offset_ptr_011++; - - offset_ptr_100++; - offset_ptr_101++; - offset_ptr_110++; - offset_ptr_111++; - - value_ptr_alpha++; - value_ptr_beta++; - value_ptr_gamma++; - dstptr++; - } - } -} \ No newline at end of file diff --git a/src/layer/x86/gridsample_compute_blob.h b/src/layer/x86/gridsample_compute_blob.h new file mode 100644 index 00000000000..dcfe18a43bd --- /dev/null +++ b/src/layer/x86/gridsample_compute_blob.h @@ -0,0 +1,153 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#if _MSC_VER +#define OPT_2 +#elif __clang__ +#define OPT_2 __attribute__((optnone)) +#elif __GNUC__ +#define OPT_2 __attribute__((optimize("2"))) +#endif + +template +struct grid_sample_unormalize; + +template<> +struct grid_sample_unormalize +{ +#if __AVX__ + OPT_2 + __m256 operator()(__m256 length, __m256 coord) + { + return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coord, _mm256_set1_ps(1)), _mm256_set1_ps(2)), _mm256_sub_ps(length, _mm256_set1_ps(1))); + } +#endif // __AVX__ + float operator()(int length, float coord) + { + return (coord + 1) / 2.f * (length - 1); + } +}; + +template<> +struct grid_sample_unormalize +{ +#if __AVX__ + OPT_2 + __m256 operator()(__m256 length, __m256 coord) + { + return _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(coord, _mm256_set1_ps(1)), length, _mm256_set1_ps(1)), _mm256_set1_ps(2)); + } +#endif // __AVX__ + float operator()(int length, float coord) + { + return ((coord + 1) * length - 1) / 2.f; + } +}; + +template +struct compute_coord +{ +#if __AVX__ + __m256 operator()(__m256 length, __m256 coord) + { + return coord; + } +#endif // __AVX__ + float operator()(int length, float coord) + { + return coord; + } +}; + +template +struct compute_coord +{ +#if __AVX__ + __m256 operator()(__m256 length, __m256 coord) + { + const __m256 border_x = _mm256_sub_ps(length, _mm256_set1_ps(1)); + + coord = _mm256_min_ps(border_x, _mm256_max_ps(coord, _mm256_setzero_ps())); + + return coord; + } +#endif // __AVX__ + float operator()(int length, float coord) + { + return std::min(length - 1.0f, std::max(coord, 0.0f)); + } +}; + +template<> +struct compute_coord +{ +#if __AVX__ + __m256 operator()(__m256 length, __m256 coord) + { + const __m256 border_x = _mm256_sub_ps(length, _mm256_set1_ps(1)); + + coord = abs256_ps(coord); + + 
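+        // reflection fold: with border_x = length - 1, map |coord| back into
+        // [0, length - 1] via coord' = border_x - |coord - border_x|, which is
+        // what the scalar path below computes with fabs()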
__m256 reflectx_v = abs256_ps(_mm256_sub_ps(coord, border_x)); + coord = _mm256_sub_ps(border_x, reflectx_v); + + return coord; + } +#endif // __AVX__ + float operator()(int length, float coord) + { + coord = fabs(coord); + coord = (length - 1) - fabs(coord - (length - 1)); + + return std::min(length - 1.0f, std::max(coord, 0.0f)); + } +}; + +template<> +struct compute_coord +{ +#if __AVX__ + __m256 operator()(__m256 length, __m256 coord) + { + const __m256 border_x = _mm256_sub_ps(length, _mm256_set1_ps(1)); + + __m256 v0p5fp8 = _mm256_set1_ps(0.5f); + coord = _mm256_add_ps(coord, v0p5fp8); + + coord = abs256_ps(coord); + + __m256 reflectx_v = abs256_ps(_mm256_sub_ps(coord, length)); + coord = _mm256_sub_ps(length, reflectx_v); + + coord = _mm256_sub_ps(coord, v0p5fp8); + + _mm256_sub_ps(coord, v0p5fp8); + + coord = _mm256_min_ps(border_x, _mm256_max_ps(coord, _mm256_setzero_ps())); + + return coord; + } +#endif // __AVX__ + float operator()(int length, float coord) + { + coord = fabs(coord + 0.5f); + coord = length - fabs(coord - length) - 0.5; + + return std::min(length - 1.0f, std::max(coord, 0.0f)); + } +}; + +#include "gridsample_bilinear_compute_blob.h" +#include "gridsample_bicubic_compute_blob.h" +#include "gridsample_nearest_compute_blob.h" \ No newline at end of file diff --git a/src/layer/x86/gridsample_nearest_apply_interpolation.h b/src/layer/x86/gridsample_nearest_apply_interpolation.h new file mode 100644 index 00000000000..c71086e5bea --- /dev/null +++ b/src/layer/x86/gridsample_nearest_apply_interpolation.h @@ -0,0 +1,191 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ +static void gridsample_nearest_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_ptr = offset.channel(0); + + for (int i = 0; i < grid_size; i++) + { + __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr) >= 0 ? 
static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + + _mm512_storeu_ps(dstptr, _v); + + offset_ptr++; + dstptr += 16; + } + } +} +#endif // __AVX512F__ + +#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ +void gridsample_nearest_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, const Mat& offset, const Option& opt); +void gridsample_nearest_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, const Mat& offset, const Option& opt); +void gridsample_nearest_apply_interpolation_p1_avx2(const Mat& src, Mat& dst, const Mat& offset, const Option& opt); +#endif + +static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Option& opt) +{ +#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ + if (ncnn::cpu_support_x86_avx2()) + { + gridsample_nearest_apply_interpolation_p8_avx2(src, dst, offset, opt); + return; + } +#endif + + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_ptr = offset.channel(0); + + for (int i = 0; i < grid_size; i++) + { + float in_bound = *reinterpret_cast(offset_ptr) >= 0 ? -1.0f : 0.0f; +#if __AVX2__ + __m256i _offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); +#else + __m256i _offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); +#endif // __AVX2__ + __m256 _v = mask_gather_ps256(srcptr, _offset, _mm256_set1_ps(in_bound)); + + _mm256_storeu_ps(dstptr, _v); + + offset_ptr++; + dstptr += 8; + } + } +} +#endif // __AVX__ +static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset, const Option& opt) +{ +#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ + if (ncnn::cpu_support_x86_avx2()) + { + gridsample_nearest_apply_interpolation_p4_avx2(src, dst, offset, opt); + return; + } +#endif + + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_ptr = offset.channel(0); + + for (int i = 0; i < grid_size; i++) + { + float in_bound = *reinterpret_cast(offset_ptr) >= 0 ? 
-1.0f : 0.0f; + __m128 _v = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*offset_ptr), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(in_bound)); + + _mm_storeu_ps(dstptr, _v); + + offset_ptr++; + dstptr += 4; + } + } +} + +#endif // __SSE2__ + +static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset, const Option& opt) +{ +#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ + if (ncnn::cpu_support_x86_avx2()) + { + gridsample_nearest_apply_interpolation_p1_avx2(src, dst, offset, opt); + return; + } +#endif + + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_ptr = offset.channel(0); + + int x = 0; +#if __SSE2__ +#if __AVX__ + for (; x + 7 < grid_size; x += 8) + { + __m256 in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr), _mm256_set1_ps(-1.0f)); + __m256 _v = mask_gather_ps256(srcptr, _mm256_set_epi32(*(offset_ptr + 7), *(offset_ptr + 6), *(offset_ptr + 5), *(offset_ptr + 4), *(offset_ptr + 3), *(offset_ptr + 2), *(offset_ptr + 1), *offset_ptr), in_bound); + + _mm256_storeu_ps(dstptr, _v); + + offset_ptr += 8; + dstptr += 8; + } +#endif // __AVX__ + for (; x + 3 < grid_size; x += 4) + { + __m128 in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr), _mm_set1_ps(-1.0f)); + __m128 _v = mask_gather_ps(srcptr, _mm_set_epi32(*(offset_ptr + 3), *(offset_ptr + 2), *(offset_ptr + 1), *offset_ptr), in_bound); + + _mm_storeu_ps(dstptr, _v); + + offset_ptr += 4; + dstptr += 4; + } +#endif // __SSE2__ + for (; x < grid_size; x++) + { + *dstptr = *reinterpret_cast(offset_ptr) >= 0 ? *(srcptr + static_cast(*offset_ptr)) : 0; + + offset_ptr++; + dstptr++; + } + } +} diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index a2f6a9dc01f..f2119463294 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -342,150 +342,3 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of } } } - -#if __SSE2__ -#if __AVX__ -#if __AVX512F__ -static void gridsample_nearest_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset, const Option& opt) -{ - const int channels = dst.c; - const int outw = dst.w; - const int outh = dst.h; - const int outd = dst.d; - const int grid_size = outw * outh * outd; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* srcptr = src.channel(q); - float* dstptr = dst.channel(q); - - const float* offset_ptr = offset.channel(0); - - for (int i = 0; i < grid_size; i++) - { - __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr) >= 0 ? 
static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); - - _mm512_storeu_ps(dstptr, _v); - - offset_ptr++; - dstptr += 16; - } - } -} -#endif // __AVX512F__ -static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Option& opt) -{ - const int channels = dst.c; - const int outw = dst.w; - const int outh = dst.h; - const int outd = dst.d; - const int grid_size = outw * outh * outd; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* srcptr = src.channel(q); - float* dstptr = dst.channel(q); - - const float* offset_ptr = offset.channel(0); - - for (int i = 0; i < grid_size; i++) - { - float in_bound = *reinterpret_cast(offset_ptr) >= 0 ? -1.0f : 0.0f; -#if __AVX2__ - __m256i _offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); -#else - __m256i _offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); -#endif // __AVX2__ - __m256 _v = mask_gather_ps256(srcptr, _offset, _mm256_set1_ps(in_bound)); - - _mm256_storeu_ps(dstptr, _v); - - offset_ptr++; - dstptr += 8; - } - } -} -#endif // __AVX__ -static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset, const Option& opt) -{ - const int channels = dst.c; - const int outw = dst.w; - const int outh = dst.h; - const int outd = dst.d; - const int grid_size = outw * outh * outd; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* srcptr = src.channel(q); - float* dstptr = dst.channel(q); - - const float* offset_ptr = offset.channel(0); - - for (int i = 0; i < grid_size; i++) - { - float in_bound = *reinterpret_cast(offset_ptr) >= 0 ? 
-1.0f : 0.0f; - __m128 _v = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*offset_ptr), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(in_bound)); - - _mm_storeu_ps(dstptr, _v); - - offset_ptr++; - dstptr += 4; - } - } -} - -#endif // __SSE2__ - -static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset, const Option& opt) -{ - const int channels = dst.c; - const int outw = dst.w; - const int outh = dst.h; - const int outd = dst.d; - const int grid_size = outw * outh * outd; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* srcptr = src.channel(q); - float* dstptr = dst.channel(q); - - const float* offset_ptr = offset.channel(0); - - int x = 0; -#if __SSE2__ -#if __AVX__ - for (; x + 7 < grid_size; x += 8) - { - __m256 in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr), _mm256_set1_ps(-1.0f)); - __m256 _v = mask_gather_ps256(srcptr, _mm256_set_epi32(*(offset_ptr + 7), *(offset_ptr + 6), *(offset_ptr + 5), *(offset_ptr + 4), *(offset_ptr + 3), *(offset_ptr + 2), *(offset_ptr + 1), *offset_ptr), in_bound); - - _mm256_storeu_ps(dstptr, _v); - - offset_ptr += 8; - dstptr += 8; - } -#endif // __AVX__ - for (; x + 3 < grid_size; x += 4) - { - __m128 in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr), _mm_set1_ps(-1.0f)); - __m128 _v = mask_gather_ps(srcptr, _mm_set_epi32(*(offset_ptr + 3), *(offset_ptr + 2), *(offset_ptr + 1), *offset_ptr), in_bound); - - _mm_storeu_ps(dstptr, _v); - - offset_ptr += 4; - dstptr += 4; - } -#endif // __SSE2__ - for (; x < grid_size; x++) - { - *dstptr = *reinterpret_cast(offset_ptr) >= 0 ? *(srcptr + static_cast(*offset_ptr)) : 0; - - offset_ptr++; - dstptr++; - } - } -} diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 0501de69ca3..0993e485678 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -26,9 +26,13 @@ #endif // __AVX__ #endif // __SSE2__ #include "x86_usability.h" +#include "cpu.h" namespace ncnn { +#include "gridsample_compute_blob.h" +#include "gridsample_apply_interpolation.h" + GridSample_x86::GridSample_x86() { #if __SSE2__ @@ -36,207 +40,9 @@ GridSample_x86::GridSample_x86() #endif // __SSE2__ } -#if __SSE2__ -#if __AVX__ -static __m256 mask_gather_ps256(const float* ptr, __m256i offset, __m256 mask) -{ -#if __AVX2__ - __m256 v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, offset, mask, sizeof(float)); -#else - int offseti[8], maski[8]; - memcpy(offseti, &offset, 8 * sizeof(int)); - memcpy(maski, &mask, 8 * sizeof(int)); - - float data[8] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; - for (int i = 0; i < 8; i++) - { - if (maski[i] & 0xF0000000) - { - data[i] = *(ptr + offseti[i]); - } - } - - __m256 v = _mm256_loadu_ps(data); -#endif // __AVX2__ - - return v; -} - -#endif // __AVX__ - -static __m128 mask_gather_ps(const float* ptr, __m128i offset, __m128 mask) -{ -#if __AVX2__ - __m128 v = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, offset, mask, sizeof(float)); -#else - int offseti[4], maski[4]; - memcpy(offseti, &offset, 4 * sizeof(int)); - memcpy(maski, &mask, 4 * sizeof(int)); - - float data[4] = {0.0f, 0.0f, 0.0f, 0.0f}; - for (int i = 0; i < 4; i++) - { - if (maski[i] & 0xF0000000) - { - data[i] = *(ptr + offseti[i]); - } - } - - __m128 v = _mm_loadu_ps(data); -#endif // __AVX__ - - return v; -} - -#endif // __SSE2__ - -#if _MSC_VER -#define OPT_2 -#elif __clang__ -#define OPT_2 __attribute__((optnone)) -#elif __GNUC__ -#define OPT_2 
__attribute__((optimize("2"))) -#endif - -namespace GridSample_x86_kernel { - -template -struct grid_sample_unormalize; - -template<> -struct grid_sample_unormalize -{ -#if __AVX__ - OPT_2 - __m256 operator()(__m256 length, __m256 coord) - { - return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coord, _mm256_set1_ps(1)), _mm256_set1_ps(2)), _mm256_sub_ps(length, _mm256_set1_ps(1))); - } -#endif // __AVX__ - float operator()(int length, float coord) - { - return (coord + 1) / 2.f * (length - 1); - } -}; - -template<> -struct grid_sample_unormalize -{ -#if __AVX__ - OPT_2 - __m256 operator()(__m256 length, __m256 coord) - { - return _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(coord, _mm256_set1_ps(1)), length, _mm256_set1_ps(1)), _mm256_set1_ps(2)); - } -#endif // __AVX__ - float operator()(int length, float coord) - { - return ((coord + 1) * length - 1) / 2.f; - } -}; - -template -struct compute_coord -{ -#if __AVX__ - __m256 operator()(__m256 length, __m256 coord) - { - return coord; - } -#endif // __AVX__ - float operator()(int length, float coord) - { - return coord; - } -}; - -template -struct compute_coord -{ -#if __AVX__ - __m256 operator()(__m256 length, __m256 coord) - { - const __m256 border_x = _mm256_sub_ps(length, _mm256_set1_ps(1)); - - coord = _mm256_min_ps(border_x, _mm256_max_ps(coord, _mm256_setzero_ps())); - - return coord; - } -#endif // __AVX__ - float operator()(int length, float coord) - { - return std::min(length - 1.0f, std::max(coord, 0.0f)); - } -}; - -template<> -struct compute_coord -{ -#if __AVX__ - __m256 operator()(__m256 length, __m256 coord) - { - const __m256 border_x = _mm256_sub_ps(length, _mm256_set1_ps(1)); - - coord = abs256_ps(coord); - - __m256 reflectx_v = abs256_ps(_mm256_sub_ps(coord, border_x)); - coord = _mm256_sub_ps(border_x, reflectx_v); - - return coord; - } -#endif // __AVX__ - float operator()(int length, float coord) - { - coord = fabs(coord); - coord = (length - 1) - fabs(coord - (length - 1)); - - return std::min(length - 1.0f, std::max(coord, 0.0f)); - } -}; - -template<> -struct compute_coord -{ -#if __AVX__ - __m256 operator()(__m256 length, __m256 coord) - { - const __m256 border_x = _mm256_sub_ps(length, _mm256_set1_ps(1)); - - __m256 v0p5fp8 = _mm256_set1_ps(0.5f); - coord = _mm256_add_ps(coord, v0p5fp8); - - coord = abs256_ps(coord); - - __m256 reflectx_v = abs256_ps(_mm256_sub_ps(coord, length)); - coord = _mm256_sub_ps(length, reflectx_v); - - coord = _mm256_sub_ps(coord, v0p5fp8); - - _mm256_sub_ps(coord, v0p5fp8); - - coord = _mm256_min_ps(border_x, _mm256_max_ps(coord, _mm256_setzero_ps())); - - return coord; - } -#endif // __AVX__ - float operator()(int length, float coord) - { - coord = fabs(coord + 0.5f); - coord = length - fabs(coord - length) - 0.5; - - return std::min(length - 1.0f, std::max(coord, 0.0f)); - } -}; - -#include "gridsample_bilinear_compute_blob.h" -#include "gridsample_bicubic_compute_blob.h" -#include "gridsample_nearest_compute_blob.h" - -} //namespace GridSample_x86_kernel - int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { - using namespace GridSample_x86_kernel; + /*using namespace GridSample_x86_kernel;*/ const Mat& bottom_blob = bottom_blobs[0]; const Mat& grid = bottom_blobs[1]; Mat& top_blob = top_blobs[0]; @@ -249,10 +55,10 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector Date: Thu, 4 May 2023 12:47:26 +0000 Subject: [PATCH 103/127] apply code-format changes --- 
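Note on the offset-blob convention shared by the apply_interpolation kernels
formatted here: offsets are precomputed as floats and an out-of-bounds tap is
stored as -1.0f, so its sign bit doubles as the gather mask. A minimal scalar
sketch of that convention (illustrative only; the helper name below is made up
and is not part of this patch):

    // one tap for elempack=1: out-of-bounds offsets (-1.0f) contribute zero
    static float sample_one_tap(const float* srcptr, const float* offset_ptr)
    {
        // reinterpret the float offset as int: a negative value means "skip"
        if (*reinterpret_cast<const int*>(offset_ptr) >= 0)
            return srcptr[static_cast<int>(*offset_ptr)];
        return 0.f;
    }
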
src/layer/x86/gridsample_bicubic_apply_interpolation.h | 2 +- src/layer/x86/gridsample_bilinear_apply_interpolation.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_apply_interpolation.h b/src/layer/x86/gridsample_bicubic_apply_interpolation.h index cc4cc837899..1a886eaa671 100644 --- a/src/layer/x86/gridsample_bicubic_apply_interpolation.h +++ b/src/layer/x86/gridsample_bicubic_apply_interpolation.h @@ -101,7 +101,7 @@ static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& d void gridsample_2d_bicubic_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt); void gridsample_2d_bicubic_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt); void gridsample_2d_bicubic_apply_interpolation_p1_avx2(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt); -#endif +#endif static void cubic_interp1d_p8(__m256& coeffs0, __m256& coeffs1, __m256& coeffs2, __m256& coeffs3, const __m256& tx) { diff --git a/src/layer/x86/gridsample_bilinear_apply_interpolation.h b/src/layer/x86/gridsample_bilinear_apply_interpolation.h index 862fb35df11..8bdfe45b036 100644 --- a/src/layer/x86/gridsample_bilinear_apply_interpolation.h +++ b/src/layer/x86/gridsample_bilinear_apply_interpolation.h @@ -813,7 +813,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d float v001 = *reinterpret_cast(offset_ptr_001) >= 0 ? *(srcptr + static_cast(*offset_ptr_001)) : 0; float v010 = *reinterpret_cast(offset_ptr_010) >= 0 ? *(srcptr + static_cast(*offset_ptr_010)) : 0; float v011 = *reinterpret_cast(offset_ptr_011) >= 0 ? *(srcptr + static_cast(*offset_ptr_011)) : 0; - + float v100 = *reinterpret_cast(offset_ptr_100) >= 0 ? *(srcptr + static_cast(*offset_ptr_100)) : 0; float v101 = *reinterpret_cast(offset_ptr_101) >= 0 ? *(srcptr + static_cast(*offset_ptr_101)) : 0; float v110 = *reinterpret_cast(offset_ptr_110) >= 0 ? 
*(srcptr + static_cast(*offset_ptr_110)) : 0; From 67300e1de23021f9dcb0b124df6a1c01db824fc0 Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Fri, 5 May 2023 15:58:02 +0800 Subject: [PATCH 104/127] remove unused code and fix vs2015 build-shared --- src/layer/x86/gridsample_bicubic_compute_blob.h | 2 -- src/layer/x86/gridsample_x86.cpp | 1 - src/layer/x86/gridsample_x86_avx2.cpp | 1 - 3 files changed, 4 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index 90198f47c73..802278e7c8a 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -19,8 +19,6 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; - float *v0_in_bound_ptr[4], *v1_in_bound_ptr[4], *v2_in_bound_ptr[4], *v3_in_bound_ptr[4]; - float* value_x = value.channel(0); float* value_y = value.channel(1); diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index e72121f3861..b03e9b0bb6b 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -42,7 +42,6 @@ GridSample_x86::GridSample_x86() int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { - /*using namespace GridSample_x86_kernel;*/ const Mat& bottom_blob = bottom_blobs[0]; const Mat& grid = bottom_blobs[1]; Mat& top_blob = top_blobs[0]; diff --git a/src/layer/x86/gridsample_x86_avx2.cpp b/src/layer/x86/gridsample_x86_avx2.cpp index 718053d38b6..70ee0005f91 100644 --- a/src/layer/x86/gridsample_x86_avx2.cpp +++ b/src/layer/x86/gridsample_x86_avx2.cpp @@ -17,7 +17,6 @@ #include "x86_usability.h" namespace ncnn { -#include "gridsample_x86.h" #include "gridsample_apply_interpolation.h" void gridsample_2d_bilinear_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) From 5c1d5ed972fc07d15a4c1293ec23a7c8f84fb77d Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Mon, 8 May 2023 15:09:30 +0800 Subject: [PATCH 105/127] fix avx512 build bug --- .../x86/gridsample_bicubic_apply_interpolation.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_apply_interpolation.h b/src/layer/x86/gridsample_bicubic_apply_interpolation.h index 1a886eaa671..49ec88ab1c0 100644 --- a/src/layer/x86/gridsample_bicubic_apply_interpolation.h +++ b/src/layer/x86/gridsample_bicubic_apply_interpolation.h @@ -19,15 +19,14 @@ static void cubic_interp1d_p16(__m512& coeffs0, __m512& coeffs1, __m512& coeffs2 { const __m512 A = _mm512_set1_ps(-0.75f); - const __m512 x0 = _mm512_add_ps(tx, *(__m512*)_ps512_1); + const __m512 x0 = _mm512_add_ps(tx, _mm512_set1_ps(1.0f)); const __m512& x1 = tx; - const __m512 x2 = _mm512_sub_ps(*(__m512*)_ps512_1, tx); - //const __m512 x3 = _mm512_add_ps(x2, *(__m512*)_ps512_1); + const __m512 x2 = _mm512_sub_ps(_mm512_set1_ps(1.0f), tx); coeffs0 = _mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(A, x0), _mm512_mul_ps(_mm512_set1_ps(5.0f), A)), x0), _mm512_mul_ps(_mm512_set1_ps(8.0f), A)), x0), _mm512_mul_ps(_mm512_set1_ps(4), A)); - coeffs1 = _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(A, _mm512_set1_ps(2.0f)), x1), _mm512_add_ps(A, _mm512_set1_ps(3.0f))), x1), x1), *(__m512*)_ps512_1); - coeffs2 = 
_mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(A, _mm512_set1_ps(2.0f)), x2), _mm512_add_ps(A, _mm512_set1_ps(3.0f))), x2), x2), *(__m512*)_ps512_1); - coeffs3 = _mm512_sub_ps(_mm512_sub_ps(_mm512_sub_ps(*(__m512*)_ps512_1, coeffs0), coeffs1), coeffs2); + coeffs1 = _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(A, _mm512_set1_ps(2.0f)), x1), _mm512_add_ps(A, _mm512_set1_ps(3.0f))), x1), x1), _mm512_set1_ps(1.0f)); + coeffs2 = _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(A, _mm512_set1_ps(2.0f)), x2), _mm512_add_ps(A, _mm512_set1_ps(3.0f))), x2), x2), _mm512_set1_ps(1.0f)); + coeffs3 = _mm512_sub_ps(_mm512_sub_ps(_mm512_sub_ps(_mm512_set1_ps(1.0f), coeffs0), coeffs1), coeffs2); } static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt) From 0633218a257bacda53b7dc390416870b553d3bda Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Mon, 15 May 2023 17:12:15 +0800 Subject: [PATCH 106/127] fix permute_fusion rule in gridsample and add permute_fusion parameter to document --- docs/developer-guide/operators.md | 1 + tools/modelwriter.h | 1 + tools/pnnx/src/pass_ncnn/F_grid_sample.cpp | 11 +++++++++++ 3 files changed, 13 insertions(+) diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md index a545372f02f..d775a92124a 100644 --- a/docs/developer-guide/operators.md +++ b/docs/developer-guide/operators.md @@ -887,6 +887,7 @@ This function is often used in conjunction with affine_grid() to build Spatial T | 0 | sample_type | int | 1 | | | 1 | padding_mode | int | 1 | | | 2 | align_corner | int | 0 | | +| 3 | permute_fusion| int | 0 | fuse with permute | Sample type: diff --git a/tools/modelwriter.h b/tools/modelwriter.h index 3d09ec1859d..fd5105e612f 100644 --- a/tools/modelwriter.h +++ b/tools/modelwriter.h @@ -1734,6 +1734,7 @@ int ModelWriter::save(const char* parampath, const char* binpath) fprintf_param_value(" 0=%d", sample_type) fprintf_param_value(" 1=%d", padding_mode) fprintf_param_value(" 2=%d", align_corner) + fprintf_param_value(" 3=%d", permute_fusion) } else if (layer->type == "GroupNorm") { diff --git a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp index 9527a20116d..b71e64dcbde 100644 --- a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp +++ b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp @@ -91,6 +91,17 @@ pnnx.Output output 1 0 out return "permutegridsample"; } + bool match(const std::map& captured_params) const + { + const std::vector& dims = captured_params.at("dims").ai; + + if ((dims == std::vector{1, 2, 0}) || (dims == std::vector{1, 2, 3, 0})) + return true; + if ((dims == std::vector{0, 2, 3, 1}) || (dims == std::vector{0, 2, 3, 4, 1})) + return true; + return false; + } + void write(Operator* op, const std::map& captured_params) const { const std::string& mode = captured_params.at("mode").s; From 10ce44e1a2c118e6bc092e65a447426586dd36f0 Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Tue, 23 May 2023 16:24:48 +0800 Subject: [PATCH 107/127] optimize code --- .../x86/gridsample_bicubic_compute_blob.h | 9 ++-- .../x86/gridsample_bilinear_compute_blob.h | 54 +++++++++---------- src/layer/x86/gridsample_compute_blob.h | 1 - .../x86/gridsample_nearest_compute_blob.h | 24 +++------ 4 files changed, 37 insertions(+), 51 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h 
b/src/layer/x86/gridsample_bicubic_compute_blob.h index 802278e7c8a..66ebec18976 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -90,11 +90,10 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of __m256 gy_offset = _mm256_mul_ps(gy, _mm256_set1_ps(src.w)); - volatile float epack = src.elempack; - __m256 v0_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx0), _mm256_set1_ps(epack)); - __m256 v1_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx1), _mm256_set1_ps(epack)); - __m256 v2_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx2), _mm256_set1_ps(epack)); - __m256 v3_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx3), _mm256_set1_ps(epack)); + __m256 v0_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx0), _mm256_set1_ps(src.elempack)); + __m256 v1_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx1), _mm256_set1_ps(src.elempack)); + __m256 v2_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx2), _mm256_set1_ps(src.elempack)); + __m256 v3_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx3), _mm256_set1_ps(src.elempack)); v0_offset_f = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v0_offset_f, _mm256_and_ps(x0_in_range, y_in_range)); v1_offset_f = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v1_offset_f, _mm256_and_ps(x1_in_range, y_in_range)); diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 0ec21d7a47a..326094debef 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -71,11 +71,10 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o __m256 v10_in_range = _mm256_and_ps(x0_in_range, y1_in_range); __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - volatile float epack = src.elempack; - __m256 nw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w), _mm256_set1_ps(epack)); - __m256 ne_offset = _mm256_add_ps(nw_offset, _mm256_set1_ps(epack)); - __m256 sw_offset = _mm256_comp_fmadd_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack), nw_offset); - __m256 se_offset = _mm256_add_ps(sw_offset, _mm256_set1_ps(epack)); + __m256 nw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w), _mm256_set1_ps(src.elempack)); + __m256 ne_offset = _mm256_add_ps(nw_offset, _mm256_set1_ps(src.elempack)); + __m256 sw_offset = _mm256_comp_fmadd_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.elempack), nw_offset); + __m256 se_offset = _mm256_add_ps(sw_offset, _mm256_set1_ps(src.elempack)); nw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), nw_offset, v00_in_range); ne_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), ne_offset, v01_in_range); @@ -190,11 +189,10 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o __m256 v10_in_range = _mm256_and_ps(x0_in_range, y1_in_range); __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); - volatile float epack = src.elempack; - __m256 nw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w), _mm256_set1_ps(epack)); - __m256 ne_offset = _mm256_add_ps(nw_offset, _mm256_set1_ps(epack)); - __m256 sw_offset = _mm256_comp_fmadd_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack), nw_offset); - __m256 se_offset = _mm256_add_ps(sw_offset, _mm256_set1_ps(epack)); + __m256 nw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w), _mm256_set1_ps(src.elempack)); + __m256 
ne_offset = _mm256_add_ps(nw_offset, _mm256_set1_ps(src.elempack)); + __m256 sw_offset = _mm256_comp_fmadd_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.elempack), nw_offset); + __m256 se_offset = _mm256_add_ps(sw_offset, _mm256_set1_ps(src.elempack)); nw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), nw_offset, v00_in_range); ne_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), ne_offset, v01_in_range); @@ -364,18 +362,17 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); } - volatile float epack = src.elempack; __m256 tnw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), z_t, _mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w)), - _mm256_set1_ps(epack)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, _mm256_set1_ps(epack)); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack))); - __m256 tse_offset = _mm256_add_ps(tsw_offset, _mm256_set1_ps(epack)); + _mm256_set1_ps(src.elempack)); + __m256 tne_offset = _mm256_add_ps(tnw_offset, _mm256_set1_ps(src.elempack)); + __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.elempack))); + __m256 tse_offset = _mm256_add_ps(tsw_offset, _mm256_set1_ps(src.elempack)); - __m256 bnw_offset = _mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), _mm256_set1_ps(epack), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, _mm256_set1_ps(epack)); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack))); - __m256 bse_offset = _mm256_add_ps(bsw_offset, _mm256_set1_ps(epack)); + __m256 bnw_offset = _mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), _mm256_set1_ps(src.elempack), tnw_offset); + __m256 bne_offset = _mm256_add_ps(bnw_offset, _mm256_set1_ps(src.elempack)); + __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.elempack))); + __m256 bse_offset = _mm256_add_ps(bsw_offset, _mm256_set1_ps(src.elempack)); tnw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), tnw_offset, v000_in_range); tne_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), tne_offset, v001_in_range); @@ -557,18 +554,17 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); } - volatile float epack = src.elempack; __m256 tnw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), z_t, _mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w)), - _mm256_set1_ps(epack)); - __m256 tne_offset = _mm256_add_ps(tnw_offset, _mm256_set1_ps(epack)); - __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack))); - __m256 tse_offset = _mm256_add_ps(tsw_offset, _mm256_set1_ps(epack)); - - __m256 bnw_offset = _mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), _mm256_set1_ps(epack), tnw_offset); - __m256 bne_offset = _mm256_add_ps(bnw_offset, _mm256_set1_ps(epack)); - __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(epack))); - __m256 bse_offset = _mm256_add_ps(bsw_offset, _mm256_set1_ps(epack)); + _mm256_set1_ps(src.elempack)); + __m256 tne_offset = _mm256_add_ps(tnw_offset, _mm256_set1_ps(src.elempack)); + __m256 tsw_offset = 
_mm256_add_ps(tnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.elempack))); + __m256 tse_offset = _mm256_add_ps(tsw_offset, _mm256_set1_ps(src.elempack)); + + __m256 bnw_offset = _mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), _mm256_set1_ps(src.elempack), tnw_offset); + __m256 bne_offset = _mm256_add_ps(bnw_offset, _mm256_set1_ps(src.elempack)); + __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.elempack))); + __m256 bse_offset = _mm256_add_ps(bsw_offset, _mm256_set1_ps(src.elempack)); tnw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), tnw_offset, v000_in_range); tne_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), tne_offset, v001_in_range); diff --git a/src/layer/x86/gridsample_compute_blob.h b/src/layer/x86/gridsample_compute_blob.h index dcfe18a43bd..259783c2470 100644 --- a/src/layer/x86/gridsample_compute_blob.h +++ b/src/layer/x86/gridsample_compute_blob.h @@ -27,7 +27,6 @@ template<> struct grid_sample_unormalize { #if __AVX__ - OPT_2 __m256 operator()(__m256 length, __m256 coord) { return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coord, _mm256_set1_ps(1)), _mm256_set1_ps(2)), _mm256_sub_ps(length, _mm256_set1_ps(1))); diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index 0f97a86ceaf..a70710c4ae9 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -56,8 +56,7 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx, _CMP_GT_OS)), _mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS))); - volatile float epack = src.elempack; - __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx), _mm256_set1_ps(epack)); + __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx), _mm256_set1_ps(src.elempack)); offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), offset, v_in_range); @@ -118,8 +117,7 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx, _CMP_GT_OS)), _mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS))); - volatile float epack = src.elempack; - __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx), _mm256_set1_ps(epack)); + __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx), _mm256_set1_ps(src.elempack)); offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), offset, v_in_range); @@ -212,12 +210,9 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of _mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS))); v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), gz, _CMP_GT_OS))); - volatile float epack = src.elempack; - volatile float sw = src.w; - volatile float sh = src.h; - __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(sw), _mm256_set1_ps(sh)), gz, - _mm256_comp_fmadd_ps(gy, 
_mm256_set1_ps(sw), gx)), - _mm256_set1_ps(epack)); + __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), gz, + _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx)), + _mm256_set1_ps(src.elempack)); offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), offset, v_in_range); @@ -291,12 +286,9 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of _mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS))); v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), gz, _CMP_GT_OS))); - volatile float epack = src.elempack; - volatile float sw = src.w; - volatile float sh = src.h; - __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(sw), _mm256_set1_ps(sh)), gz, - _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(sw), gx)), - _mm256_set1_ps(epack)); + __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), gz, + _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx)), + _mm256_set1_ps(src.elempack)); offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), offset, v_in_range); From 8a7da038b757a3e2a65bdf97999b6e3026655e2e Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Tue, 23 May 2023 08:27:29 +0000 Subject: [PATCH 108/127] apply code-format changes --- src/layer/x86/gridsample_nearest_compute_blob.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index a70710c4ae9..c1d3df1ca6c 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -211,7 +211,7 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), gz, _CMP_GT_OS))); __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), gz, - _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx)), + _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx)), _mm256_set1_ps(src.elempack)); offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), offset, v_in_range); @@ -287,7 +287,7 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), gz, _CMP_GT_OS))); __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), gz, - _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx)), + _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx)), _mm256_set1_ps(src.elempack)); offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), offset, v_in_range); From e2b88076c9767b126f60ac410388792c8cdba9d6 Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Tue, 23 May 2023 17:59:47 +0800 Subject: [PATCH 109/127] remove OPT_2 macro definition --- src/layer/x86/gridsample_compute_blob.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/layer/x86/gridsample_compute_blob.h b/src/layer/x86/gridsample_compute_blob.h index 259783c2470..d88251da3f7 100644 --- a/src/layer/x86/gridsample_compute_blob.h +++ b/src/layer/x86/gridsample_compute_blob.h @@ -12,14 +12,6 @@ // CONDITIONS OF ANY KIND, 
either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#if _MSC_VER -#define OPT_2 -#elif __clang__ -#define OPT_2 __attribute__((optnone)) -#elif __GNUC__ -#define OPT_2 __attribute__((optimize("2"))) -#endif - template struct grid_sample_unormalize; @@ -42,7 +34,6 @@ template<> struct grid_sample_unormalize { #if __AVX__ - OPT_2 __m256 operator()(__m256 length, __m256 coord) { return _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(coord, _mm256_set1_ps(1)), length, _mm256_set1_ps(1)), _mm256_set1_ps(2)); From 0ca97ed2ecc99703ad5a9c83c9719850f39e7313 Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Wed, 31 May 2023 15:03:23 +0800 Subject: [PATCH 110/127] optimize unittest and apply OMP in xxx_compute_blob --- .../x86/gridsample_bicubic_compute_blob.h | 1 + .../x86/gridsample_bilinear_compute_blob.h | 1 + .../x86/gridsample_nearest_compute_blob.h | 1 + tests/test_gridsample.cpp | 154 ++++++++++++------ 4 files changed, 104 insertions(+), 53 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index 66ebec18976..9af07f69e74 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -35,6 +35,7 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of if (permute_fusion == 0) { +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 326094debef..1cface7e505 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -30,6 +30,7 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o if (permute_fusion == 0) { +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index c1d3df1ca6c..4db18c28756 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -24,6 +24,7 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of if (permute_fusion == 0) { +#pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); diff --git a/tests/test_gridsample.cpp b/tests/test_gridsample.cpp index b2f1c750333..0e384115352 100644 --- a/tests/test_gridsample.cpp +++ b/tests/test_gridsample.cpp @@ -51,8 +51,16 @@ static int test_gridsample_0() || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 1, 3, 1, 0) || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 2, 1, 0, 0) || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 2, 1, 1, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 2, 2, 0, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 2, 2, 1, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 2, 3, 0, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 2, 3, 1, 0) || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 3, 1, 0, 0) || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 3, 1, 1, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 3, 2, 
0, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 3, 2, 1, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 3, 3, 0, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 3, 3, 1, 0) || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 1, 1, 0, 1) || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 1, 1, 1, 1) || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 1, 2, 0, 1) @@ -61,75 +69,115 @@ static int test_gridsample_0() || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 1, 3, 1, 1) || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 2, 1, 0, 1) || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 2, 1, 1, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 2, 2, 0, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 2, 2, 1, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 2, 3, 0, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 2, 3, 1, 1) || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 3, 1, 0, 1) - || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 3, 1, 1, 1); + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 3, 1, 1, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 3, 2, 0, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 3, 2, 1, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 3, 3, 0, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 3, 3, 1, 1); } static int test_gridsample_1() { return 0 - || test_gridsample(RandomMat(8, 16, 8), RandomMat(2, 27, 21), 1, 1, 0, 0) - || test_gridsample(RandomMat(8, 16, 8), RandomMat(2, 27, 21), 1, 1, 1, 0) - || test_gridsample(RandomMat(8, 16, 8), RandomMat(2, 27, 21), 1, 2, 0, 0) - || test_gridsample(RandomMat(8, 16, 8), RandomMat(2, 27, 21), 1, 2, 1, 0) - || test_gridsample(RandomMat(8, 16, 8), RandomMat(2, 27, 21), 1, 3, 0, 0) - || test_gridsample(RandomMat(8, 16, 8), RandomMat(2, 27, 21), 1, 3, 1, 0) - || test_gridsample(RandomMat(8, 16, 8), RandomMat(2, 27, 21), 2, 1, 0, 0) - || test_gridsample(RandomMat(8, 16, 8), RandomMat(2, 27, 21), 2, 1, 1, 0) - || test_gridsample(RandomMat(8, 16, 8), RandomMat(2, 27, 21), 3, 1, 0, 0) - || test_gridsample(RandomMat(8, 16, 8), RandomMat(2, 27, 21), 3, 1, 1, 0) - || test_gridsample(RandomMat(8, 16, 8), RandomMat(27, 21, 2), 1, 1, 0, 1) - || test_gridsample(RandomMat(8, 16, 8), RandomMat(27, 21, 2), 1, 1, 1, 1) - || test_gridsample(RandomMat(8, 16, 8), RandomMat(27, 21, 2), 1, 2, 0, 1) - || test_gridsample(RandomMat(8, 16, 8), RandomMat(27, 21, 2), 1, 2, 1, 1) - || test_gridsample(RandomMat(8, 16, 8), RandomMat(27, 21, 2), 1, 3, 0, 1) - || test_gridsample(RandomMat(8, 16, 8), RandomMat(27, 21, 2), 1, 3, 1, 1) - || test_gridsample(RandomMat(8, 16, 8), RandomMat(27, 21, 2), 2, 1, 0, 1) - || test_gridsample(RandomMat(8, 16, 8), RandomMat(27, 21, 2), 2, 1, 1, 1) - || test_gridsample(RandomMat(8, 16, 8), RandomMat(27, 21, 2), 3, 1, 0, 1) - || test_gridsample(RandomMat(8, 16, 8), RandomMat(27, 21, 2), 3, 1, 1, 1); + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 1, 1, 0, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 1, 1, 1, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 1, 2, 0, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 1, 2, 1, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 1, 3, 0, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 
24, 16), 1, 3, 1, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 2, 1, 0, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 2, 1, 1, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 2, 2, 0, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 2, 2, 1, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 2, 3, 0, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 2, 3, 1, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 3, 1, 0, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 3, 1, 1, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 3, 2, 0, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 3, 2, 1, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 3, 3, 0, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 3, 3, 1, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 1, 1, 0, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 1, 1, 1, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 1, 2, 0, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 1, 2, 1, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 1, 3, 0, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 1, 3, 1, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 2, 1, 0, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 2, 1, 1, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 2, 2, 0, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 2, 2, 1, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 2, 3, 0, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 2, 3, 1, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 3, 1, 0, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 3, 1, 1, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 3, 2, 0, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 3, 2, 1, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 3, 3, 0, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 3, 3, 1, 1); } static int test_gridsample_2() { return 0 - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 1, 0, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 1, 1, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 2, 0, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 2, 1, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 3, 0, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 1, 3, 1, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 1, 0, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 27, 21, 10), 2, 1, 1, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 1, 1, 0, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 1, 1, 1, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 1, 2, 0, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 1, 2, 1, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 1, 3, 0, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), 
RandomMat(27, 21, 10, 3), 1, 3, 1, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 2, 1, 0, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(27, 21, 10, 3), 2, 1, 1, 1); + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 1, 1, 0, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 1, 1, 1, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 1, 2, 0, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 1, 2, 1, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 1, 3, 0, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 1, 3, 1, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 2, 1, 0, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 2, 1, 1, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 2, 2, 0, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 2, 2, 1, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 2, 3, 0, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 2, 3, 1, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 1, 1, 0, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 1, 1, 1, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 1, 2, 0, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 1, 2, 1, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 1, 3, 0, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 1, 3, 1, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 2, 1, 0, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 2, 1, 1, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 2, 2, 0, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 2, 2, 1, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 2, 3, 0, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 2, 3, 1, 1); } static int test_gridsample_3() { return 0 - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 1, 0, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 1, 1, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 2, 0, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 2, 1, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 3, 0, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 1, 3, 1, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 1, 0, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 16, 12, 10), 2, 1, 1, 0) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 1, 1, 0, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 1, 1, 1, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 1, 2, 0, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 1, 2, 1, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 1, 3, 0, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 1, 3, 1, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 2, 
1, 0, 1) - || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(16, 12, 10, 3), 2, 1, 1, 1); + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 1, 1, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 1, 1, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 1, 2, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 1, 2, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 1, 3, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 1, 3, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 2, 1, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 2, 1, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 2, 2, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 2, 2, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 2, 3, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 2, 3, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 1, 1, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 1, 1, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 1, 2, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 1, 2, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 1, 3, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 1, 3, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 2, 1, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 2, 1, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 2, 2, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 2, 2, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 2, 3, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 2, 3, 1, 1); } int main() From fdc15ac411c074fca481d3d9f61baea1933b178f Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Wed, 31 May 2023 07:05:32 +0000 Subject: [PATCH 111/127] apply code-format changes --- src/layer/x86/gridsample_bicubic_compute_blob.h | 2 +- src/layer/x86/gridsample_bilinear_compute_blob.h | 2 +- src/layer/x86/gridsample_nearest_compute_blob.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index 9af07f69e74..742459d2a25 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -35,7 +35,7 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of if (permute_fusion == 0) { -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 1cface7e505..30a8fc076e4 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -30,7 +30,7 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o if (permute_fusion == 0) { -#pragma omp parallel for 
num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index 4db18c28756..2248fd2cfe8 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -24,7 +24,7 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of if (permute_fusion == 0) { -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); From 22e4fe4627970232f0a16ad6afbd572a3be5aba0 Mon Sep 17 00:00:00 2001 From: Yoh Date: Sun, 18 Jun 2023 05:45:11 +0800 Subject: [PATCH 112/127] fix mutithread data race bug --- .../x86/gridsample_bicubic_compute_blob.h | 3 +- .../x86/gridsample_bilinear_compute_blob.h | 5 +- .../x86/gridsample_nearest_compute_blob.h | 5 +- src/layer/x86/gridsample_x86.cpp | 60 +++++++++---------- 4 files changed, 35 insertions(+), 38 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index 742459d2a25..84ee0eb5114 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -13,7 +13,7 @@ // specific language governing permissions and limitations under the License. template -void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& offset, Mat& value, int permute_fusion, const Option& opt) +void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& offset, Mat& value, int permute_fusion) { const int grid_size = grid.w * grid.h; @@ -35,7 +35,6 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of if (permute_fusion == 0) { - #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 30a8fc076e4..fb2686ca419 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -13,7 +13,7 @@ // specific language governing permissions and limitations under the License. 
template -void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& offset, Mat& value, int permute_fusion, const Option& opt) +void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& offset, Mat& value, int permute_fusion) { const int grid_size = grid.w * grid.h; @@ -30,7 +30,6 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o if (permute_fusion == 0) { - #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); @@ -274,7 +273,7 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o } template -void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& offset, Mat& value, int permute_fusion, const Option& opt) +void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& offset, Mat& value, int permute_fusion) { const int grid_size = grid.w * grid.h * grid.d; diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index 2248fd2cfe8..ae6905753f6 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -13,7 +13,7 @@ // specific language governing permissions and limitations under the License. template -void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& offset, Mat& value, int permute_fusion, const Option& opt) +void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& offset, Mat& value, int permute_fusion) { const int grid_size = grid.w * grid.h; @@ -24,7 +24,6 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of if (permute_fusion == 0) { - #pragma omp parallel for num_threads(opt.num_threads) for (int y = 0; y < grid.c; y++) { const float* gridptr = grid.channel(y); @@ -158,7 +157,7 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of } template -void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& offset, Mat& value, int permute_fusion, const Option& opt) +void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& offset, Mat& value, int permute_fusion) { const int grid_size = grid.w * grid.h * grid.d; diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index b03e9b0bb6b..e53f6a53b08 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -84,33 +84,33 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } else { - gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } } else if (padding_mode == GridSample::Padding_BORDER) { if (align_corner == 0) { - gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } else { - gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } } else if (padding_mode == 
GridSample::Padding_REFLECTION) { if (align_corner == 0) { - gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } else { - gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } } else @@ -130,33 +130,33 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } else { - gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } } else if (padding_mode == GridSample::Padding_BORDER) { if (align_corner == 0) { - gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } else { - gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } } else if (padding_mode == GridSample::Padding_REFLECTION) { if (align_corner == 0) { - gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } else { - gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } } else @@ -177,33 +177,33 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } else { - gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } } else if (padding_mode == GridSample::Padding_BORDER) { if (align_corner == 0) { - gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } else { - gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } } else if (padding_mode == GridSample::Padding_REFLECTION) { if (align_corner == 0) { - gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } else { - gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } } else @@ -235,33 +235,33 @@ int 
GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } else { - gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } } else if (padding_mode == GridSample::Padding_BORDER) { if (align_corner == 0) { - gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } else { - gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } } else if (padding_mode == GridSample::Padding_REFLECTION) { if (align_corner == 0) { - gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } else { - gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } } else @@ -281,33 +281,33 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } else { - gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } } else if (padding_mode == GridSample::Padding_BORDER) { if (align_corner == 0) { - gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } else { - gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } } else if (padding_mode == GridSample::Padding_REFLECTION) { if (align_corner == 0) { - gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } else { - gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion, opt); + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); } } else From 875cb2a52f9e7eed7e06579835ac9c35444d870c Mon Sep 17 00:00:00 2001 From: Yoh Date: Mon, 17 Jul 2023 08:11:12 +0800 Subject: [PATCH 113/127] pack compute_value blob --- .../gridsample_bicubic_apply_interpolation.h | 314 ++----- .../x86/gridsample_bicubic_compute_blob.h | 197 ++--- .../gridsample_bilinear_apply_interpolation.h | 791 +++++------------- .../x86/gridsample_bilinear_compute_blob.h | 343 +++----- src/layer/x86/gridsample_compute_blob.h | 2 + .../gridsample_nearest_apply_interpolation.h | 28 +- 
.../x86/gridsample_nearest_compute_blob.h | 28 +- src/layer/x86/gridsample_x86.cpp | 125 ++- src/layer/x86/gridsample_x86_avx2.cpp | 48 +- src/layer/x86/x86_usability.h | 157 ++++ 10 files changed, 759 insertions(+), 1274 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_apply_interpolation.h b/src/layer/x86/gridsample_bicubic_apply_interpolation.h index 49ec88ab1c0..116d69537a0 100644 --- a/src/layer/x86/gridsample_bicubic_apply_interpolation.h +++ b/src/layer/x86/gridsample_bicubic_apply_interpolation.h @@ -29,77 +29,59 @@ static void cubic_interp1d_p16(__m512& coeffs0, __m512& coeffs1, __m512& coeffs2 coeffs3 = _mm512_sub_ps(_mm512_sub_ps(_mm512_sub_ps(_mm512_set1_ps(1.0f), coeffs0), coeffs1), coeffs2); } -static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt) +static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& dst, Mat& offset_value, const Option& opt) { const int channels = dst.c; const int outw = dst.w; const int outh = dst.h; const int grid_size = outw * outh; - __m512 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; - __m512 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; - __m512 value_f[4]; - #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; - - for (int i = 0; i < 4; i++) - { - v0_offset_ptr[i] = offset.channel(i * 4 + 0); - v1_offset_ptr[i] = offset.channel(i * 4 + 1); - v2_offset_ptr[i] = offset.channel(i * 4 + 2); - v3_offset_ptr[i] = offset.channel(i * 4 + 3); - } - - const float* value_x = value.channel(0); - const float* value_y = value.channel(1); + const float* offset_value_ptr = offset_value.channel(0); for (int i = 0; i < grid_size; i++) { - cubic_interp1d_p16(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm512_set1_ps(*value_x)); + __m512 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; + __m512 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; + __m512 value_f[4]; + cubic_interp1d_p16(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm512_set1_ps(*offset_value_ptr++)); + cubic_interp1d_p16(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm512_set1_ps(*offset_value_ptr++)); for (int ii = 0; ii < 4; ii++) { - __m512 x0_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v0_offset_ptr[ii]) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v0_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); - __m512 x1_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v1_offset_ptr[ii]) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v1_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); - __m512 x2_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v2_offset_ptr[ii]) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v2_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); - __m512 x3_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(v3_offset_ptr[ii]) >= 0 ? 
static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*v3_offset_ptr[ii]), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + __m512 x0_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + offset_value_ptr++; + __m512 x1_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + offset_value_ptr++; + __m512 x2_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + offset_value_ptr++; + __m512 x3_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + offset_value_ptr++; value_f[ii] = _mm512_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm512_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); value_f[ii] = _mm512_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); value_f[ii] = _mm512_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); - - v0_offset_ptr[ii]++; - v1_offset_ptr[ii]++; - v2_offset_ptr[ii]++; - v3_offset_ptr[ii]++; } - cubic_interp1d_p16(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm512_set1_ps(*value_y)); - __m512 _v = _mm512_mul_ps(y_coeffs0, value_f[0]); _v = _mm512_fmadd_ps(y_coeffs1, value_f[1], _v); _v = _mm512_fmadd_ps(y_coeffs2, value_f[2], _v); _v = _mm512_fmadd_ps(y_coeffs3, value_f[3], _v); _mm512_storeu_ps(dstptr, _v); - value_x++; - value_y++; - dstptr += 16; } } } #endif // __AVX512F__ #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ -void gridsample_2d_bicubic_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt); -void gridsample_2d_bicubic_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt); -void gridsample_2d_bicubic_apply_interpolation_p1_avx2(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt); +void gridsample_2d_bicubic_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, Mat& offset_value, const Option& opt); +void gridsample_2d_bicubic_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, Mat& offset_value, const Option& opt); #endif static void cubic_interp1d_p8(__m256& coeffs0, __m256& coeffs1, __m256& coeffs2, __m256& coeffs3, const __m256& tx) @@ -117,12 +99,12 @@ static void cubic_interp1d_p8(__m256& coeffs0, __m256& coeffs1, __m256& coeffs2, coeffs3 = _mm256_sub_ps(_mm256_sub_ps(_mm256_sub_ps(_mm256_set1_ps(1), coeffs0), coeffs1), coeffs2); } -static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt) +static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& 
dst, Mat& offset_value, const Option& opt) { #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ if (ncnn::cpu_support_x86_avx2()) { - gridsample_2d_bicubic_apply_interpolation_p8_avx2(src, dst, offset, value, opt); + gridsample_2d_bicubic_apply_interpolation_p8_avx2(src, dst, offset_value, opt); return; } #endif @@ -132,49 +114,39 @@ static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& ds const int outh = dst.h; const int grid_size = outw * outh; - __m256 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; - __m256 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; - __m256 value_f[4]; - #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; - - for (int i = 0; i < 4; i++) - { - v0_offset_ptr[i] = offset.channel(i * 4 + 0); - v1_offset_ptr[i] = offset.channel(i * 4 + 1); - v2_offset_ptr[i] = offset.channel(i * 4 + 2); - v3_offset_ptr[i] = offset.channel(i * 4 + 3); - } - - const float* value_x = value.channel(0); - const float* value_y = value.channel(1); + const float* offset_value_ptr = offset_value.channel(0); for (int i = 0; i < grid_size; i++) { - cubic_interp1d_p8(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm256_set1_ps(*value_x)); + __m256 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; + __m256 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; + __m256 value_f[4]; + cubic_interp1d_p8(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm256_set1_ps(*offset_value_ptr++)); + cubic_interp1d_p8(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm256_set1_ps(*offset_value_ptr++)); + for (int ii = 0; ii < 4; ii++) { - float v0_in_bound = *reinterpret_cast(v0_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; - float v1_in_bound = *reinterpret_cast(v1_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; - float v2_in_bound = *reinterpret_cast(v2_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; - float v3_in_bound = *reinterpret_cast(v3_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; + float v0_in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1.0f : 0.0f; + float v1_in_bound = *reinterpret_cast(offset_value_ptr + 1) >= 0 ? -1.0f : 0.0f; + float v2_in_bound = *reinterpret_cast(offset_value_ptr + 2) >= 0 ? -1.0f : 0.0f; + float v3_in_bound = *reinterpret_cast(offset_value_ptr + 3) >= 0 ? 
-1.0f : 0.0f; #if __AVX2__ - __m256i v0_offset = _mm256_add_epi32(_mm256_set1_epi32(*v0_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v1_offset = _mm256_add_epi32(_mm256_set1_epi32(*v1_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v2_offset = _mm256_add_epi32(_mm256_set1_epi32(*v2_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v3_offset = _mm256_add_epi32(_mm256_set1_epi32(*v3_offset_ptr[ii]), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v0_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_value_ptr++), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v1_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_value_ptr++), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v2_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_value_ptr++), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v3_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_value_ptr++), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); #else - __m256i v0_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*v0_offset_ptr[ii]), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v1_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*v1_offset_ptr[ii]), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v2_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*v2_offset_ptr[ii]), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v3_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*v3_offset_ptr[ii]), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v0_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_value_ptr++), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v1_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_value_ptr++), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v2_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_value_ptr++), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v3_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_value_ptr++), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); #endif // __AVX2__ __m256 x0_val = mask_gather_ps256(srcptr, v0_offset, _mm256_set1_ps(v0_in_bound)); @@ -186,24 +158,14 @@ static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& ds value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); - - v0_offset_ptr[ii]++; - v1_offset_ptr[ii]++; - v2_offset_ptr[ii]++; - v3_offset_ptr[ii]++; } - cubic_interp1d_p8(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm256_set1_ps(*value_y)); - __m256 _v = _mm256_mul_ps(y_coeffs0, value_f[0]); _v = _mm256_comp_fmadd_ps(y_coeffs1, value_f[1], _v); _v = _mm256_comp_fmadd_ps(y_coeffs2, value_f[2], _v); _v = _mm256_comp_fmadd_ps(y_coeffs3, value_f[3], _v); _mm256_storeu_ps(dstptr, _v); - value_x++; - value_y++; - dstptr += 8; } } @@ -225,12 +187,12 @@ static void cubic_interp1d_p4(__m128& coeffs0, __m128& coeffs1, __m128& coeffs2, coeffs3 = _mm_sub_ps(_mm_sub_ps(_mm_sub_ps(_mm_set_ps1(1.0f), coeffs0), coeffs1), coeffs2); } -static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt) +static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& dst, Mat& offset_value, const Option& opt) { #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ if (ncnn::cpu_support_x86_avx2()) { - 
gridsample_2d_bicubic_apply_interpolation_p4_avx2(src, dst, offset, value, opt); + gridsample_2d_bicubic_apply_interpolation_p4_avx2(src, dst, offset_value, opt); return; } #endif @@ -250,56 +212,37 @@ static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& ds const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; - - for (int i = 0; i < 4; i++) - { - v0_offset_ptr[i] = offset.channel(i * 4 + 0); - v1_offset_ptr[i] = offset.channel(i * 4 + 1); - v2_offset_ptr[i] = offset.channel(i * 4 + 2); - v3_offset_ptr[i] = offset.channel(i * 4 + 3); - } - - const float* value_x = value.channel(0); - const float* value_y = value.channel(1); + const float* offset_value_ptr = offset_value.channel(0); for (int i = 0; i < grid_size; i++) { - cubic_interp1d_p4(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm_set_ps1(*value_x)); + cubic_interp1d_p4(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm_set_ps1(*offset_value_ptr++)); + cubic_interp1d_p4(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm_set_ps1(*offset_value_ptr++)); + for (int ii = 0; ii < 4; ii++) { - float v0_in_bound = *reinterpret_cast(v0_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; - float v1_in_bound = *reinterpret_cast(v1_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; - float v2_in_bound = *reinterpret_cast(v2_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; - float v3_in_bound = *reinterpret_cast(v3_offset_ptr[ii]) >= 0 ? -1.0f : 0.0f; - __m128 x0_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v0_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set_ps1(v0_in_bound)); - __m128 x1_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v1_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set_ps1(v1_in_bound)); - __m128 x2_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v2_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set_ps1(v2_in_bound)); - __m128 x3_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*v3_offset_ptr[ii]), _mm_set_epi32(3, 2, 1, 0)), _mm_set_ps1(v3_in_bound)); + float v0_in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1.0f : 0.0f; + __m128 x0_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr++), _mm_set_epi32(3, 2, 1, 0)), _mm_set_ps1(v0_in_bound)); + float v1_in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1.0f : 0.0f; + __m128 x1_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr++), _mm_set_epi32(3, 2, 1, 0)), _mm_set_ps1(v1_in_bound)); + float v2_in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1.0f : 0.0f; + __m128 x2_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr++), _mm_set_epi32(3, 2, 1, 0)), _mm_set_ps1(v2_in_bound)); + float v3_in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 
-1.0f : 0.0f; + __m128 x3_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr++), _mm_set_epi32(3, 2, 1, 0)), _mm_set_ps1(v3_in_bound)); value_f[ii] = _mm_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); value_f[ii] = _mm_comp_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); value_f[ii] = _mm_comp_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); - - v0_offset_ptr[ii]++; - v1_offset_ptr[ii]++; - v2_offset_ptr[ii]++; - v3_offset_ptr[ii]++; } - cubic_interp1d_p4(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm_set_ps1(*value_y)); - __m128 _v = _mm_mul_ps(y_coeffs0, value_f[0]); _v = _mm_comp_fmadd_ps(y_coeffs1, value_f[1], _v); _v = _mm_comp_fmadd_ps(y_coeffs2, value_f[2], _v); _v = _mm_comp_fmadd_ps(y_coeffs3, value_f[3], _v); _mm_storeu_ps(dstptr, _v); - value_x++; - value_y++; - dstptr += 4; } } @@ -321,16 +264,8 @@ static inline void cubic_interp1d(float& coeffs0, float& coeffs1, float& coeffs2 coeffs3 = 1.f - coeffs0 - coeffs1 - coeffs2; } -static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& dst, Mat& offset, const Mat& value, const Option& opt) +static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& dst, Mat& offset_value, const Option& opt) { -#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ - if (ncnn::cpu_support_x86_avx2()) - { - gridsample_2d_bicubic_apply_interpolation_p1_avx2(src, dst, offset, value, opt); - return; - } -#endif - const int channels = dst.c; const int outw = dst.w; const int outh = dst.h; @@ -342,148 +277,43 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; - - for (int i = 0; i < 4; i++) - { - v0_offset_ptr[i] = offset.channel(i * 4 + 0); - v1_offset_ptr[i] = offset.channel(i * 4 + 1); - v2_offset_ptr[i] = offset.channel(i * 4 + 2); - v3_offset_ptr[i] = offset.channel(i * 4 + 3); - } - - const float* value_x = value.channel(0); - const float* value_y = value.channel(1); + const float* offset_value_ptr = offset_value.channel(0); - int x = 0; -#if __SSE2__ -#if __AVX__ - { - __m256 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; - __m256 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; - __m256 value_f[4]; - for (; x + 7 < grid_size; x += 8) - { - cubic_interp1d_p8(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm256_loadu_ps(value_x)); - for (int ii = 0; ii < 4; ii++) - { - __m256 v0_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(v0_offset_ptr[ii]), _mm256_set1_ps(-1.0f)); - __m256 v1_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(v1_offset_ptr[ii]), _mm256_set1_ps(-1.0f)); - __m256 v2_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(v2_offset_ptr[ii]), _mm256_set1_ps(-1.0f)); - __m256 v3_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(v3_offset_ptr[ii]), _mm256_set1_ps(-1.0f)); - - __m256 x0_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v0_offset_ptr[ii] + 7), *(v0_offset_ptr[ii] + 6), *(v0_offset_ptr[ii] + 5), *(v0_offset_ptr[ii] + 4), *(v0_offset_ptr[ii] + 3), *(v0_offset_ptr[ii] + 2), *(v0_offset_ptr[ii] + 1), *v0_offset_ptr[ii]), v0_in_bound); - __m256 x1_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v1_offset_ptr[ii] + 7), *(v1_offset_ptr[ii] + 6), *(v1_offset_ptr[ii] + 5), *(v1_offset_ptr[ii] + 4), *(v1_offset_ptr[ii] + 3), *(v1_offset_ptr[ii] + 2), *(v1_offset_ptr[ii] + 1), *v1_offset_ptr[ii]), v1_in_bound); - __m256 x2_val = mask_gather_ps256(srcptr, 
_mm256_set_epi32(*(v2_offset_ptr[ii] + 7), *(v2_offset_ptr[ii] + 6), *(v2_offset_ptr[ii] + 5), *(v2_offset_ptr[ii] + 4), *(v2_offset_ptr[ii] + 3), *(v2_offset_ptr[ii] + 2), *(v2_offset_ptr[ii] + 1), *v2_offset_ptr[ii]), v2_in_bound); - __m256 x3_val = mask_gather_ps256(srcptr, _mm256_set_epi32(*(v3_offset_ptr[ii] + 7), *(v3_offset_ptr[ii] + 6), *(v3_offset_ptr[ii] + 5), *(v3_offset_ptr[ii] + 4), *(v3_offset_ptr[ii] + 3), *(v3_offset_ptr[ii] + 2), *(v3_offset_ptr[ii] + 1), *v3_offset_ptr[ii]), v3_in_bound); - - value_f[ii] = _mm256_mul_ps(x_coeffs0, x0_val); - value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); - value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); - value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); - - v0_offset_ptr[ii] += 8; - v1_offset_ptr[ii] += 8; - v2_offset_ptr[ii] += 8; - v3_offset_ptr[ii] += 8; - } - - cubic_interp1d_p8(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm256_loadu_ps(value_y)); - - __m256 _v = _mm256_mul_ps(y_coeffs0, value_f[0]); - _v = _mm256_comp_fmadd_ps(y_coeffs1, value_f[1], _v); - _v = _mm256_comp_fmadd_ps(y_coeffs2, value_f[2], _v); - _v = _mm256_comp_fmadd_ps(y_coeffs3, value_f[3], _v); - _mm256_storeu_ps(dstptr, _v); - - value_x += 8; - value_y += 8; - - dstptr += 8; - } - } -#endif // __AVX__ + for (int x = 0; x < grid_size; x++) { - __m128 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; - __m128 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; - __m128 value_f[4]; - for (; x + 3 < grid_size; x += 4) - { - cubic_interp1d_p4(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm_loadu_ps(value_x)); - for (int ii = 0; ii < 4; ii++) - { - __m128 v0_in_bound = _mm_andnot_ps(_mm_loadu_ps(v0_offset_ptr[ii]), _mm_set_ps1(-1.0f)); - __m128 v1_in_bound = _mm_andnot_ps(_mm_loadu_ps(v1_offset_ptr[ii]), _mm_set_ps1(-1.0f)); - __m128 v2_in_bound = _mm_andnot_ps(_mm_loadu_ps(v2_offset_ptr[ii]), _mm_set_ps1(-1.0f)); - __m128 v3_in_bound = _mm_andnot_ps(_mm_loadu_ps(v3_offset_ptr[ii]), _mm_set_ps1(-1.0f)); - - __m128 x0_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v0_offset_ptr[ii] + 3), *(v0_offset_ptr[ii] + 2), *(v0_offset_ptr[ii] + 1), *v0_offset_ptr[ii]), v0_in_bound); - __m128 x1_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v1_offset_ptr[ii] + 3), *(v1_offset_ptr[ii] + 2), *(v1_offset_ptr[ii] + 1), *v1_offset_ptr[ii]), v1_in_bound); - __m128 x2_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v2_offset_ptr[ii] + 3), *(v2_offset_ptr[ii] + 2), *(v2_offset_ptr[ii] + 1), *v2_offset_ptr[ii]), v2_in_bound); - __m128 x3_val = mask_gather_ps(srcptr, _mm_set_epi32(*(v3_offset_ptr[ii] + 3), *(v3_offset_ptr[ii] + 2), *(v3_offset_ptr[ii] + 1), *v3_offset_ptr[ii]), v3_in_bound); - - value_f[ii] = _mm_mul_ps(x_coeffs0, x0_val); - value_f[ii] = _mm_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); - value_f[ii] = _mm_comp_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); - value_f[ii] = _mm_comp_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); - - v0_offset_ptr[ii] += 4; - v1_offset_ptr[ii] += 4; - v2_offset_ptr[ii] += 4; - v3_offset_ptr[ii] += 4; - } - - cubic_interp1d_p4(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm_loadu_ps(value_y)); - - __m128 _v = _mm_mul_ps(y_coeffs0, value_f[0]); - _v = _mm_comp_fmadd_ps(y_coeffs1, value_f[1], _v); - _v = _mm_comp_fmadd_ps(y_coeffs2, value_f[2], _v); - _v = _mm_comp_fmadd_ps(y_coeffs3, value_f[3], _v); - _mm_storeu_ps(dstptr, _v); - - value_x += 4; - value_y += 4; - - dstptr += 4; - } - } -#endif // __SSE2__ - float x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; - float y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; - 
float value_f[4]; + float x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; + float y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; + float value_f[4]; + cubic_interp1d(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, *offset_value_ptr++); + cubic_interp1d(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, *offset_value_ptr++); - for (; x < grid_size; x++) - { - cubic_interp1d(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, *value_x); for (int ii = 0; ii < 4; ii++) { - float x0_val = *reinterpret_cast(v0_offset_ptr[ii]) >= 0 ? *(srcptr + static_cast(*v0_offset_ptr[ii])) : 0; - float x1_val = *reinterpret_cast(v1_offset_ptr[ii]) >= 0 ? *(srcptr + static_cast(*v1_offset_ptr[ii])) : 0; - float x2_val = *reinterpret_cast(v2_offset_ptr[ii]) >= 0 ? *(srcptr + static_cast(*v2_offset_ptr[ii])) : 0; - float x3_val = *reinterpret_cast(v3_offset_ptr[ii]) >= 0 ? *(srcptr + static_cast(*v3_offset_ptr[ii])) : 0; + float x0_val = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; + offset_value_ptr++; + + float x1_val = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; + offset_value_ptr++; + + float x2_val = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; + offset_value_ptr++; + + float x3_val = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; + offset_value_ptr++; + value_f[ii] = x_coeffs0 * x0_val; value_f[ii] = x_coeffs1 * x1_val + value_f[ii]; value_f[ii] = x_coeffs2 * x2_val + value_f[ii]; value_f[ii] = x_coeffs3 * x3_val + value_f[ii]; - - v0_offset_ptr[ii]++; - v1_offset_ptr[ii]++; - v2_offset_ptr[ii]++; - v3_offset_ptr[ii]++; } - cubic_interp1d(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, *value_y); - float _v = y_coeffs0 * value_f[0]; _v = y_coeffs1 * value_f[1] + _v; _v = y_coeffs2 * value_f[2] + _v; _v = y_coeffs3 * value_f[3] + _v; *dstptr = _v; - value_x++; - value_y++; - dstptr++; } } diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index 84ee0eb5114..65634e1cf33 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -13,22 +13,11 @@ // specific language governing permissions and limitations under the License. 
template -void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& offset, Mat& value, int permute_fusion) +void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& offset_value, int permute_fusion) { const int grid_size = grid.w * grid.h; - float *v0_offset_ptr[4], *v1_offset_ptr[4], *v2_offset_ptr[4], *v3_offset_ptr[4]; - - float* value_x = value.channel(0); - float* value_y = value.channel(1); - - for (int i = 0; i < 4; i++) - { - v0_offset_ptr[i] = offset.channel(i * 4 + 0); - v1_offset_ptr[i] = offset.channel(i * 4 + 1); - v2_offset_ptr[i] = offset.channel(i * 4 + 2); - v3_offset_ptr[i] = offset.channel(i * 4 + 3); - } + float* offset_value_ptr = offset_value.channel(0); grid_sample_unormalize unormalize; compute_coord get_coord; @@ -52,19 +41,14 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of gx = _mm256_shuffle_ps(gx, gy, 0b10001000); gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - // compute coord - { - // x - gx = unormalize(_mm256_set1_ps(src.w), gx); - // y - gy = unormalize(_mm256_set1_ps(src.h), gy); - } + gx = unormalize(_mm256_set1_ps(src.w), gx); + gy = unormalize(_mm256_set1_ps(src.h), gy); __m256 gx_floor = _mm256_floor_ps(gx); __m256 gy_floor = _mm256_floor_ps(gy); - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); + __m256 tx = _mm256_sub_ps(gx, gx_floor); + __m256 ty = _mm256_sub_ps(gy, gy_floor); __m256 gx0 = _mm256_add_ps(gx_floor, _mm256_set1_ps(-1)); __m256 gx1 = gx_floor; @@ -80,7 +64,7 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(gx1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx1, _CMP_GT_OS)); __m256 x2_in_range = _mm256_and_ps(_mm256_cmp_ps(gx2, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx2, _CMP_GT_OS)); __m256 x3_in_range = _mm256_and_ps(_mm256_cmp_ps(gx3, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx3, _CMP_GT_OS)); - + __m256 v0_offset_f[4], v1_offset_f[4], v2_offset_f[4], v3_offset_f[4]; for (int i = 0; i < 4; i++) { gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); @@ -90,33 +74,34 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of __m256 gy_offset = _mm256_mul_ps(gy, _mm256_set1_ps(src.w)); - __m256 v0_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx0), _mm256_set1_ps(src.elempack)); - __m256 v1_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx1), _mm256_set1_ps(src.elempack)); - __m256 v2_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx2), _mm256_set1_ps(src.elempack)); - __m256 v3_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx3), _mm256_set1_ps(src.elempack)); - - v0_offset_f = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v0_offset_f, _mm256_and_ps(x0_in_range, y_in_range)); - v1_offset_f = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v1_offset_f, _mm256_and_ps(x1_in_range, y_in_range)); - v2_offset_f = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v2_offset_f, _mm256_and_ps(x2_in_range, y_in_range)); - v3_offset_f = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v3_offset_f, _mm256_and_ps(x3_in_range, y_in_range)); - - _mm256_storeu_ps(v0_offset_ptr[i], v0_offset_f); - _mm256_storeu_ps(v1_offset_ptr[i], v1_offset_f); - _mm256_storeu_ps(v2_offset_ptr[i], v2_offset_f); - _mm256_storeu_ps(v3_offset_ptr[i], v3_offset_f); - - v0_offset_ptr[i] += 8; - v1_offset_ptr[i] += 8; - v2_offset_ptr[i] += 8; - v3_offset_ptr[i] += 8; - } + 
v0_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx0), _mm256_set1_ps(src.elempack)); + v1_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx1), _mm256_set1_ps(src.elempack)); + v2_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx2), _mm256_set1_ps(src.elempack)); + v3_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx3), _mm256_set1_ps(src.elempack)); - _mm256_storeu_ps(value_x, tx); - _mm256_storeu_ps(value_y, ty); + v0_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v0_offset_f[i], _mm256_and_ps(x0_in_range, y_in_range)); + v1_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v1_offset_f[i], _mm256_and_ps(x1_in_range, y_in_range)); + v2_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v2_offset_f[i], _mm256_and_ps(x2_in_range, y_in_range)); + v3_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v3_offset_f[i], _mm256_and_ps(x3_in_range, y_in_range)); + } - value_x += 8; - value_y += 8; + transpose8x18_ps(tx, ty, v0_offset_f[0], v1_offset_f[0], v2_offset_f[0], v3_offset_f[0], v0_offset_f[1], v1_offset_f[1], v2_offset_f[1], v3_offset_f[1], v0_offset_f[2], v1_offset_f[2], v2_offset_f[2], v3_offset_f[2], v0_offset_f[3], v1_offset_f[3], v2_offset_f[3], v3_offset_f[3]); + _mm256_storeu_ps(offset_value_ptr, tx); + offset_value_ptr += 8; + _mm256_storeu_ps(offset_value_ptr, ty); + offset_value_ptr += 8; + for (int i = 0; i < 4; i++) + { + _mm256_storeu_ps(offset_value_ptr, v0_offset_f[i]); + offset_value_ptr += 8; + _mm256_storeu_ps(offset_value_ptr, v1_offset_f[i]); + offset_value_ptr += 8; + _mm256_storeu_ps(offset_value_ptr, v2_offset_f[i]); + offset_value_ptr += 8; + _mm256_storeu_ps(offset_value_ptr, v3_offset_f[i]); + offset_value_ptr += 8; + } gridptr += 16; } @@ -127,25 +112,25 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of float sample_x = *gridptr; float sample_y = *(gridptr + 1); - // x sample_x = unormalize(src.w, sample_x); - // y sample_y = unormalize(src.h, sample_y); - int x1 = floor(sample_x); - int y1 = floor(sample_y); + int x1 = floorf(sample_x); + int y1 = floorf(sample_y); int x0 = x1 - 1; int x2 = x1 + 1; int x3 = x1 + 2; - *value_x = sample_x - static_cast(x1); - *value_y = sample_y - static_cast(y1); + *offset_value_ptr++ = sample_x - static_cast(x1); + *offset_value_ptr++ = sample_y - static_cast(y1); x1 = get_coord(src.w, x1); x0 = get_coord(src.w, x0); x2 = get_coord(src.w, x2); x3 = get_coord(src.w, x3); + + bool x1_in_range = (x1 > -1) & (x1 < src.w); bool x0_in_range = (x0 > -1) & (x0 < src.w); bool x2_in_range = (x2 > -1) & (x2 < src.w); @@ -164,20 +149,12 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of bool v2_in_bound = (x2_in_range & y_in_range); bool v3_in_bound = (x3_in_range & y_in_range); - *v0_offset_ptr[i] = v0_in_bound ? (offset_y + x0) * src.elempack : -1.0f; - *v1_offset_ptr[i] = v1_in_bound ? (offset_y + x1) * src.elempack : -1.0f; - *v2_offset_ptr[i] = v2_in_bound ? (offset_y + x2) * src.elempack : -1.0f; - *v3_offset_ptr[i] = v3_in_bound ? (offset_y + x3) * src.elempack : -1.0f; - - v0_offset_ptr[i]++; - v1_offset_ptr[i]++; - v2_offset_ptr[i]++; - v3_offset_ptr[i]++; + *offset_value_ptr++ = v0_in_bound ? (offset_y + x0) * src.elempack : -1.0f; + *offset_value_ptr++ = v1_in_bound ? (offset_y + x1) * src.elempack : -1.0f; + *offset_value_ptr++ = v2_in_bound ? (offset_y + x2) * src.elempack : -1.0f; + *offset_value_ptr++ = v3_in_bound ? 
(offset_y + x3) * src.elempack : -1.0f; } - value_x++; - value_y++; - gridptr += 2; } } @@ -194,20 +171,15 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of __m256 gx = _mm256_loadu_ps(gridptr_x); __m256 gy = _mm256_loadu_ps(gridptr_y); - // compute coord - { - // x - gx = unormalize(_mm256_set1_ps(src.w), gx); - // y - gy = unormalize(_mm256_set1_ps(src.h), gy); - } + gx = unormalize(_mm256_set1_ps(src.w), gx); + gy = unormalize(_mm256_set1_ps(src.h), gy); __m256 gx_floor = _mm256_floor_ps(gx); __m256 gy_floor = _mm256_floor_ps(gy); - const __m256 tx = _mm256_sub_ps(gx, gx_floor); - const __m256 ty = _mm256_sub_ps(gy, gy_floor); - + __m256 tx = _mm256_sub_ps(gx, gx_floor); + __m256 ty = _mm256_sub_ps(gy, gy_floor); + __m256 gx0 = _mm256_add_ps(gx_floor, _mm256_set1_ps(-1)); __m256 gx1 = gx_floor; __m256 gx2 = _mm256_add_ps(gx_floor, _mm256_set1_ps(1)); @@ -223,6 +195,7 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of __m256 x2_in_range = _mm256_and_ps(_mm256_cmp_ps(gx2, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx2, _CMP_GT_OS)); __m256 x3_in_range = _mm256_and_ps(_mm256_cmp_ps(gx3, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx3, _CMP_GT_OS)); + __m256 v0_offset_f[4], v1_offset_f[4], v2_offset_f[4], v3_offset_f[4]; for (int i = 0; i < 4; i++) { gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); @@ -233,32 +206,34 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of __m256 gy_offset = _mm256_mul_ps(gy, _mm256_set1_ps(src.w)); volatile float epack = src.elempack; - __m256 v0_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx0), _mm256_set1_ps(epack)); - __m256 v1_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx1), _mm256_set1_ps(epack)); - __m256 v2_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx2), _mm256_set1_ps(epack)); - __m256 v3_offset_f = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx3), _mm256_set1_ps(epack)); - - v0_offset_f = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v0_offset_f, _mm256_and_ps(x0_in_range, y_in_range)); - v1_offset_f = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v1_offset_f, _mm256_and_ps(x1_in_range, y_in_range)); - v2_offset_f = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v2_offset_f, _mm256_and_ps(x2_in_range, y_in_range)); - v3_offset_f = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v3_offset_f, _mm256_and_ps(x3_in_range, y_in_range)); - - _mm256_storeu_ps(v0_offset_ptr[i], v0_offset_f); - _mm256_storeu_ps(v1_offset_ptr[i], v1_offset_f); - _mm256_storeu_ps(v2_offset_ptr[i], v2_offset_f); - _mm256_storeu_ps(v3_offset_ptr[i], v3_offset_f); - - v0_offset_ptr[i] += 8; - v1_offset_ptr[i] += 8; - v2_offset_ptr[i] += 8; - v3_offset_ptr[i] += 8; + v0_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx0), _mm256_set1_ps(epack)); + v1_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx1), _mm256_set1_ps(epack)); + v2_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx2), _mm256_set1_ps(epack)); + v3_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx3), _mm256_set1_ps(epack)); + + v0_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v0_offset_f[i], _mm256_and_ps(x0_in_range, y_in_range)); + v1_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v1_offset_f[i], _mm256_and_ps(x1_in_range, y_in_range)); + v2_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v2_offset_f[i], _mm256_and_ps(x2_in_range, y_in_range)); + v3_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v3_offset_f[i], 
_mm256_and_ps(x3_in_range, y_in_range)); } - _mm256_storeu_ps(value_x, tx); - _mm256_storeu_ps(value_y, ty); + transpose8x18_ps(tx, ty, v0_offset_f[0], v1_offset_f[0], v2_offset_f[0], v3_offset_f[0], v0_offset_f[1], v1_offset_f[1], v2_offset_f[1], v3_offset_f[1], v0_offset_f[2], v1_offset_f[2], v2_offset_f[2], v3_offset_f[2], v0_offset_f[3], v1_offset_f[3], v2_offset_f[3], v3_offset_f[3]); - value_x += 8; - value_y += 8; + _mm256_storeu_ps(offset_value_ptr, tx); + offset_value_ptr += 8; + _mm256_storeu_ps(offset_value_ptr, ty); + offset_value_ptr += 8; + for (int i = 0; i < 4; i++) + { + _mm256_storeu_ps(offset_value_ptr, v0_offset_f[i]); + offset_value_ptr += 8; + _mm256_storeu_ps(offset_value_ptr, v1_offset_f[i]); + offset_value_ptr += 8; + _mm256_storeu_ps(offset_value_ptr, v2_offset_f[i]); + offset_value_ptr += 8; + _mm256_storeu_ps(offset_value_ptr, v3_offset_f[i]); + offset_value_ptr += 8; + } gridptr_x += 8; gridptr_y += 8; @@ -271,19 +246,17 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of float sample_x = *gridptr_x; float sample_y = *gridptr_y; - // x sample_x = unormalize(src.w, sample_x); - // y sample_y = unormalize(src.h, sample_y); - int x1 = floor(sample_x); - int y1 = floor(sample_y); + int x1 = floorf(sample_x); + int y1 = floorf(sample_y); int x0 = x1 - 1; int x2 = x1 + 1; int x3 = x1 + 2; - *value_x = sample_x - static_cast(x1); - *value_y = sample_y - static_cast(y1); + *offset_value_ptr++ = sample_x - static_cast(x1); + *offset_value_ptr++ = sample_y - static_cast(y1); x1 = get_coord(src.w, x1); x0 = get_coord(src.w, x0); @@ -308,20 +281,12 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of bool v2_in_bound = (x2_in_range & y_in_range); bool v3_in_bound = (x3_in_range & y_in_range); - *v0_offset_ptr[i] = v0_in_bound ? (offset_y + x0) * src.elempack : -1.0f; - *v1_offset_ptr[i] = v1_in_bound ? (offset_y + x1) * src.elempack : -1.0f; - *v2_offset_ptr[i] = v2_in_bound ? (offset_y + x2) * src.elempack : -1.0f; - *v3_offset_ptr[i] = v3_in_bound ? (offset_y + x3) * src.elempack : -1.0f; - - v0_offset_ptr[i]++; - v1_offset_ptr[i]++; - v2_offset_ptr[i]++; - v3_offset_ptr[i]++; + *offset_value_ptr++ = v0_in_bound ? (offset_y + x0) * src.elempack : -1.0f; + *offset_value_ptr++ = v1_in_bound ? (offset_y + x1) * src.elempack : -1.0f; + *offset_value_ptr++ = v2_in_bound ? (offset_y + x2) * src.elempack : -1.0f; + *offset_value_ptr++ = v3_in_bound ? 
(offset_y + x3) * src.elempack : -1.0f; } - value_x++; - value_y++; - gridptr_x++; gridptr_y++; } diff --git a/src/layer/x86/gridsample_bilinear_apply_interpolation.h b/src/layer/x86/gridsample_bilinear_apply_interpolation.h index 8bdfe45b036..4145b9c7868 100644 --- a/src/layer/x86/gridsample_bilinear_apply_interpolation.h +++ b/src/layer/x86/gridsample_bilinear_apply_interpolation.h @@ -15,7 +15,7 @@ #if __SSE2__ #if __AVX__ #if __AVX512F__ -static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) +static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) { const int channels = dst.c; const int outw = dst.w; @@ -28,53 +28,40 @@ static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const float* offset_ptr_00 = offset.channel(0); - const float* offset_ptr_01 = offset.channel(1); - const float* offset_ptr_10 = offset.channel(2); - const float* offset_ptr_11 = offset.channel(3); - - const float* value_ptr_alpha = value.channel(0); - const float* value_ptr_beta = value.channel(1); + const float* offset_value_ptr = offset_value.channel(0); for (int i = 0; i < grid_size; i++) { - __m512i v00_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_00), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i v01_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_01), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i v10_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_10), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i v11_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_11), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512i v00_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __mmask16 mask00 = *reinterpret_cast(offset_value_ptr++) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); + + __m512i v01_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __mmask16 mask01 = *reinterpret_cast(offset_value_ptr++) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); - __mmask16 mask00 = *reinterpret_cast(offset_ptr_00) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); - __mmask16 mask01 = *reinterpret_cast(offset_ptr_01) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); - __mmask16 mask10 = *reinterpret_cast(offset_ptr_10) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); - __mmask16 mask11 = *reinterpret_cast(offset_ptr_11) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); + __m512i v10_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __mmask16 mask10 = *reinterpret_cast(offset_value_ptr++) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); + + __m512i v11_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __mmask16 mask11 = *reinterpret_cast(offset_value_ptr++) >= 0 ? 
static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); __m512 v00_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask00, v00_offset, srcptr, sizeof(float)); __m512 v01_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask01, v01_offset, srcptr, sizeof(float)); __m512 v10_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask10, v10_offset, srcptr, sizeof(float)); __m512 v11_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask11, v11_offset, srcptr, sizeof(float)); - __m512 alpha = _mm512_set1_ps(*value_ptr_alpha); - __m512 beta = _mm512_set1_ps(*value_ptr_beta); - - __m512 v0 = _mm512_fmadd_ps(v01_val, alpha, _mm512_fnmadd_ps(v00_val, alpha, v00_val)); - __m512 v1 = _mm512_fmadd_ps(v11_val, alpha, _mm512_fnmadd_ps(v10_val, alpha, v10_val)); + __m512 value = _mm512_set1_ps(*offset_value_ptr++); + __m512 v0 = _mm512_fmadd_ps(v01_val, value, _mm512_fnmadd_ps(v00_val, value, v00_val)); + __m512 v1 = _mm512_fmadd_ps(v11_val, value, _mm512_fnmadd_ps(v10_val, value, v10_val)); - __m512 _v = _mm512_fmadd_ps(v1, beta, _mm512_fnmadd_ps(v0, beta, v0)); + value = _mm512_set1_ps(*offset_value_ptr++); + __m512 _v = _mm512_fmadd_ps(v1, value, _mm512_fnmadd_ps(v0, value, v0)); _mm512_storeu_ps(dstptr, _v); - offset_ptr_00++; - offset_ptr_01++; - offset_ptr_10++; - offset_ptr_11++; - - value_ptr_alpha++; - value_ptr_beta++; - dstptr += 16; } } } -static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) +static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) { const int channels = dst.c; const int outw = dst.w; @@ -88,67 +75,47 @@ static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const float* offset_ptr_000 = offset.channel(0); - const float* offset_ptr_001 = offset.channel(1); - const float* offset_ptr_010 = offset.channel(2); - const float* offset_ptr_011 = offset.channel(3); - const float* offset_ptr_100 = offset.channel(4); - const float* offset_ptr_101 = offset.channel(5); - const float* offset_ptr_110 = offset.channel(6); - const float* offset_ptr_111 = offset.channel(7); - - const float* value_ptr_alpha = value.channel(0); - const float* value_ptr_beta = value.channel(1); - const float* value_ptr_gamma = value.channel(2); + const float* offset_value_ptr = offset_value.channel(0); for (int i = 0; i < grid_size; i++) { - __m512i v000_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_000), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i v001_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_001), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i v010_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_010), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i v011_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_011), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i v100_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_100), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i v101_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_101), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512i v110_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_110), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 
6, 5, 4, 3, 2, 1, 0)); - __m512i v111_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr_111), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - - __m512 v000_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_000) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v000_offset, srcptr, sizeof(float)); - __m512 v001_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_001) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v001_offset, srcptr, sizeof(float)); - __m512 v010_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_010) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v010_offset, srcptr, sizeof(float)); - __m512 v011_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_011) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v011_offset, srcptr, sizeof(float)); - __m512 v100_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_100) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v100_offset, srcptr, sizeof(float)); - __m512 v101_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_101) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v101_offset, srcptr, sizeof(float)); - __m512 v110_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_110) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v110_offset, srcptr, sizeof(float)); - __m512 v111_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr_111) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v111_offset, srcptr, sizeof(float)); - - __m512 alpha = _mm512_set1_ps(*value_ptr_alpha); - __m512 beta = _mm512_set1_ps(*value_ptr_beta); - __m512 gamma = _mm512_set1_ps(*value_ptr_gamma); - - __m512 v00 = _mm512_fmadd_ps(v001_val, alpha, _mm512_fnmadd_ps(v000_val, alpha, v000_val)); - __m512 v01 = _mm512_fmadd_ps(v011_val, alpha, _mm512_fnmadd_ps(v010_val, alpha, v010_val)); - __m512 v10 = _mm512_fmadd_ps(v101_val, alpha, _mm512_fnmadd_ps(v100_val, alpha, v100_val)); - __m512 v11 = _mm512_fmadd_ps(v111_val, alpha, _mm512_fnmadd_ps(v110_val, alpha, v110_val)); - - __m512 v0 = _mm512_fmadd_ps(v01, beta, _mm512_fnmadd_ps(v00, beta, v00)); - __m512 v1 = _mm512_fmadd_ps(v11, beta, _mm512_fnmadd_ps(v10, beta, v10)); - - __m512 _v = _mm512_fmadd_ps(v1, gamma, _mm512_fnmadd_ps(v0, gamma, v0)); - _mm512_storeu_ps(dstptr, _v); + __m512i v000_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512 v000_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr++) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v000_offset, srcptr, sizeof(float)); - offset_ptr_000++; - offset_ptr_001++; - offset_ptr_010++; - offset_ptr_011++; + __m512i v001_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512 v001_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr++) >= 0 ? 
static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v001_offset, srcptr, sizeof(float)); - offset_ptr_100++; - offset_ptr_101++; - offset_ptr_110++; - offset_ptr_111++; + __m512i v010_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512 v010_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr++) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v010_offset, srcptr, sizeof(float)); - value_ptr_alpha++; - value_ptr_beta++; - value_ptr_gamma++; + __m512i v011_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512 v011_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr++) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v011_offset, srcptr, sizeof(float)); + + __m512i v100_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512 v100_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr++) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v100_offset, srcptr, sizeof(float)); + + __m512i v101_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512 v101_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr++) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v101_offset, srcptr, sizeof(float)); + + __m512i v110_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512 v110_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr++) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v110_offset, srcptr, sizeof(float)); + + __m512i v111_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m512 v111_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr++) >= 0 ? 
static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v111_offset, srcptr, sizeof(float)); + + __m512 value = _mm512_set1_ps(*offset_value_ptr++); + __m512 v00 = _mm512_fmadd_ps(v001_val, value, _mm512_fnmadd_ps(v000_val, value, v000_val)); + __m512 v01 = _mm512_fmadd_ps(v011_val, value, _mm512_fnmadd_ps(v010_val, value, v010_val)); + __m512 v10 = _mm512_fmadd_ps(v101_val, value, _mm512_fnmadd_ps(v100_val, value, v100_val)); + __m512 v11 = _mm512_fmadd_ps(v111_val, value, _mm512_fnmadd_ps(v110_val, value, v110_val)); + + value = _mm512_set1_ps(*offset_value_ptr++); + __m512 v0 = _mm512_fmadd_ps(v01, value, _mm512_fnmadd_ps(v00, value, v00)); + __m512 v1 = _mm512_fmadd_ps(v11, value, _mm512_fnmadd_ps(v10, value, v10)); + + value = _mm512_set1_ps(*offset_value_ptr++); + __m512 _v = _mm512_fmadd_ps(v1, value, _mm512_fnmadd_ps(v0, value, v0)); + _mm512_storeu_ps(dstptr, _v); dstptr += 16; } @@ -158,21 +125,19 @@ static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& #endif // __AVX512F__ #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ -void gridsample_2d_bilinear_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt); -void gridsample_2d_bilinear_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt); -void gridsample_2d_bilinear_apply_interpolation_p1_avx2(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt); - -void gridsample_3d_bilinear_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt); -void gridsample_3d_bilinear_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt); -void gridsample_3d_bilinear_apply_interpolation_p1_avx2(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt); +void gridsample_2d_bilinear_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt); +void gridsample_2d_bilinear_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt); + +void gridsample_3d_bilinear_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt); +void gridsample_3d_bilinear_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt); #endif -static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) +static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) { #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ if (ncnn::cpu_support_x86_avx2()) { - gridsample_2d_bilinear_apply_interpolation_p8_avx2(src, dst, offset, value, opt); + gridsample_2d_bilinear_apply_interpolation_p8_avx2(src, dst, offset_value, opt); return; } #endif @@ -188,65 +153,49 @@ static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const float* offset_ptr_00 = offset.channel(0); - const float* offset_ptr_01 = offset.channel(1); - const float* offset_ptr_10 = offset.channel(2); - const float* offset_ptr_11 = offset.channel(3); - - const float* value_ptr_alpha = value.channel(0); - const float* value_ptr_beta = value.channel(1); + const float* offset_value_ptr = offset_value.channel(0); for (int i = 0; 
i < grid_size; i++) { + float in_bound_00 = *reinterpret_cast(offset_value_ptr) >= 0 ? -1.0f : 0.0f; + float in_bound_01 = *reinterpret_cast(offset_value_ptr + 1) >= 0 ? -1.0f : 0.0f; + float in_bound_10 = *reinterpret_cast(offset_value_ptr + 2) >= 0 ? -1.0f : 0.0f; + float in_bound_11 = *reinterpret_cast(offset_value_ptr + 3) >= 0 ? -1.0f : 0.0f; #if __AVX2__ - __m256i v00_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_00), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v01_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_01), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v10_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_10), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v11_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_11), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v00_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_value_ptr++), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v01_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_value_ptr++), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v10_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_value_ptr++), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v11_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_value_ptr++), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); #else - __m256i v00_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_00), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v01_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_01), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v10_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_10), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v11_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_11), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v00_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_value_ptr++), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v01_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_value_ptr++), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v10_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_value_ptr++), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v11_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_value_ptr++), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); #endif // __AVX2__ - float in_bound_00 = *reinterpret_cast(offset_ptr_00) >= 0 ? -1.0f : 0.0f; - float in_bound_01 = *reinterpret_cast(offset_ptr_01) >= 0 ? -1.0f : 0.0f; - float in_bound_10 = *reinterpret_cast(offset_ptr_10) >= 0 ? -1.0f : 0.0f; - float in_bound_11 = *reinterpret_cast(offset_ptr_11) >= 0 ? 
-1.0f : 0.0f; - __m256 v00_val = mask_gather_ps256(srcptr, v00_offset, _mm256_set1_ps(in_bound_00)); __m256 v01_val = mask_gather_ps256(srcptr, v01_offset, _mm256_set1_ps(in_bound_01)); __m256 v10_val = mask_gather_ps256(srcptr, v10_offset, _mm256_set1_ps(in_bound_10)); __m256 v11_val = mask_gather_ps256(srcptr, v11_offset, _mm256_set1_ps(in_bound_11)); - __m256 alpha = _mm256_set1_ps(*value_ptr_alpha); - __m256 beta = _mm256_set1_ps(*value_ptr_beta); + __m256 value = _mm256_set1_ps(*offset_value_ptr++); + __m256 v0 = _mm256_comp_fmadd_ps(v01_val, value, _mm256_comp_fnmadd_ps(v00_val, value, v00_val)); + __m256 v1 = _mm256_comp_fmadd_ps(v11_val, value, _mm256_comp_fnmadd_ps(v10_val, value, v10_val)); - __m256 v0 = _mm256_comp_fmadd_ps(v01_val, alpha, _mm256_comp_fnmadd_ps(v00_val, alpha, v00_val)); - __m256 v1 = _mm256_comp_fmadd_ps(v11_val, alpha, _mm256_comp_fnmadd_ps(v10_val, alpha, v10_val)); - - __m256 _v = _mm256_comp_fmadd_ps(v1, beta, _mm256_comp_fnmadd_ps(v0, beta, v0)); + value = _mm256_set1_ps(*offset_value_ptr++); + __m256 _v = _mm256_comp_fmadd_ps(v1, value, _mm256_comp_fnmadd_ps(v0, value, v0)); _mm256_storeu_ps(dstptr, _v); - offset_ptr_00++; - offset_ptr_01++; - offset_ptr_10++; - offset_ptr_11++; - - value_ptr_alpha++; - value_ptr_beta++; - dstptr += 8; } } } -static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) +static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) { #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ if (ncnn::cpu_support_x86_avx2()) { - gridsample_3d_bilinear_apply_interpolation_p8_avx2(src, dst, offset, value, opt); + gridsample_3d_bilinear_apply_interpolation_p8_avx2(src, dst, offset_value, opt); return; } #endif @@ -263,49 +212,38 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const float* offset_ptr_000 = offset.channel(0); - const float* offset_ptr_001 = offset.channel(1); - const float* offset_ptr_010 = offset.channel(2); - const float* offset_ptr_011 = offset.channel(3); - const float* offset_ptr_100 = offset.channel(4); - const float* offset_ptr_101 = offset.channel(5); - const float* offset_ptr_110 = offset.channel(6); - const float* offset_ptr_111 = offset.channel(7); - - const float* value_ptr_alpha = value.channel(0); - const float* value_ptr_beta = value.channel(1); - const float* value_ptr_gamma = value.channel(2); + const float* offset_value_ptr = offset_value.channel(0); for (int i = 0; i < grid_size; i++) { #if __AVX2__ - __m256i v000_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_000), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v001_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_001), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v010_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_010), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v011_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_011), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v100_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_100), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v101_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_101), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v110_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_110), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v111_offset 
= _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr_111), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v000_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_value_ptr), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v001_offset = _mm256_add_epi32(_mm256_set1_epi32(*(offset_value_ptr + 1)), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v010_offset = _mm256_add_epi32(_mm256_set1_epi32(*(offset_value_ptr + 2)), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v011_offset = _mm256_add_epi32(_mm256_set1_epi32(*(offset_value_ptr + 3)), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v100_offset = _mm256_add_epi32(_mm256_set1_epi32(*(offset_value_ptr + 4)), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v101_offset = _mm256_add_epi32(_mm256_set1_epi32(*(offset_value_ptr + 5)), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v110_offset = _mm256_add_epi32(_mm256_set1_epi32(*(offset_value_ptr + 6)), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + __m256i v111_offset = _mm256_add_epi32(_mm256_set1_epi32(*(offset_value_ptr + 7)), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); #else - __m256i v000_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_000), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v001_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_001), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v010_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_010), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v011_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_011), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v100_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_100), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v101_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_101), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v110_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_110), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v111_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr_111), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v000_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_value_ptr), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v001_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*(offset_value_ptr + 1)), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v010_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*(offset_value_ptr + 2)), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v011_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*(offset_value_ptr + 3)), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v100_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*(offset_value_ptr + 4)), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v101_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*(offset_value_ptr + 5)), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v110_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*(offset_value_ptr + 6)), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); + __m256i v111_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*(offset_value_ptr + 7)), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); #endif // __AVX2__ - float in_bound_000 = *reinterpret_cast(offset_ptr_000) >= 0 ? -1.0f : 0.0f; - float in_bound_001 = *reinterpret_cast(offset_ptr_001) >= 0 ? -1.0f : 0.0f; - float in_bound_010 = *reinterpret_cast(offset_ptr_010) >= 0 ? 
-1.0f : 0.0f; - float in_bound_011 = *reinterpret_cast(offset_ptr_011) >= 0 ? -1.0f : 0.0f; - float in_bound_100 = *reinterpret_cast(offset_ptr_100) >= 0 ? -1.0f : 0.0f; - float in_bound_101 = *reinterpret_cast(offset_ptr_101) >= 0 ? -1.0f : 0.0f; - float in_bound_110 = *reinterpret_cast(offset_ptr_110) >= 0 ? -1.0f : 0.0f; - float in_bound_111 = *reinterpret_cast(offset_ptr_111) >= 0 ? -1.0f : 0.0f; + float in_bound_000 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; + float in_bound_001 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; + float in_bound_010 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; + float in_bound_011 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; + float in_bound_100 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; + float in_bound_101 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; + float in_bound_110 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; + float in_bound_111 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; __m256 v000_val = mask_gather_ps256(srcptr, v000_offset, _mm256_set1_ps(in_bound_000)); __m256 v001_val = mask_gather_ps256(srcptr, v001_offset, _mm256_set1_ps(in_bound_001)); @@ -316,46 +254,31 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d __m256 v110_val = mask_gather_ps256(srcptr, v110_offset, _mm256_set1_ps(in_bound_110)); __m256 v111_val = mask_gather_ps256(srcptr, v111_offset, _mm256_set1_ps(in_bound_111)); - __m256 alpha = _mm256_set1_ps(*value_ptr_alpha); - __m256 beta = _mm256_set1_ps(*value_ptr_beta); - __m256 gamma = _mm256_set1_ps(*value_ptr_gamma); - - __m256 v00 = _mm256_comp_fmadd_ps(v001_val, alpha, _mm256_comp_fnmadd_ps(v000_val, alpha, v000_val)); - __m256 v01 = _mm256_comp_fmadd_ps(v011_val, alpha, _mm256_comp_fnmadd_ps(v010_val, alpha, v010_val)); - __m256 v10 = _mm256_comp_fmadd_ps(v101_val, alpha, _mm256_comp_fnmadd_ps(v100_val, alpha, v100_val)); - __m256 v11 = _mm256_comp_fmadd_ps(v111_val, alpha, _mm256_comp_fnmadd_ps(v110_val, alpha, v110_val)); + __m256 value = _mm256_set1_ps(*offset_value_ptr++); + __m256 v00 = _mm256_comp_fmadd_ps(v001_val, value, _mm256_comp_fnmadd_ps(v000_val, value, v000_val)); + __m256 v01 = _mm256_comp_fmadd_ps(v011_val, value, _mm256_comp_fnmadd_ps(v010_val, value, v010_val)); + __m256 v10 = _mm256_comp_fmadd_ps(v101_val, value, _mm256_comp_fnmadd_ps(v100_val, value, v100_val)); + __m256 v11 = _mm256_comp_fmadd_ps(v111_val, value, _mm256_comp_fnmadd_ps(v110_val, value, v110_val)); - __m256 v0 = _mm256_comp_fmadd_ps(v01, beta, _mm256_comp_fnmadd_ps(v00, beta, v00)); - __m256 v1 = _mm256_comp_fmadd_ps(v11, beta, _mm256_comp_fnmadd_ps(v10, beta, v10)); + value = _mm256_set1_ps(*offset_value_ptr++); + __m256 v0 = _mm256_comp_fmadd_ps(v01, value, _mm256_comp_fnmadd_ps(v00, value, v00)); + __m256 v1 = _mm256_comp_fmadd_ps(v11, value, _mm256_comp_fnmadd_ps(v10, value, v10)); - __m256 _v = _mm256_comp_fmadd_ps(v1, gamma, _mm256_comp_fnmadd_ps(v0, gamma, v0)); + value = _mm256_set1_ps(*offset_value_ptr++); + __m256 _v = _mm256_comp_fmadd_ps(v1, value, _mm256_comp_fnmadd_ps(v0, value, v0)); _mm256_storeu_ps(dstptr, _v); - offset_ptr_000++; - offset_ptr_001++; - offset_ptr_010++; - offset_ptr_011++; - - offset_ptr_100++; - offset_ptr_101++; - offset_ptr_110++; - offset_ptr_111++; - - value_ptr_alpha++; - value_ptr_beta++; - value_ptr_gamma++; - dstptr += 8; } } } #endif // __AVX__ -static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& dst, const 
Mat& offset, const Mat& value, const Option& opt) +static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) { #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ if (ncnn::cpu_support_x86_avx2()) { - gridsample_2d_bilinear_apply_interpolation_p4_avx2(src, dst, offset, value, opt); + gridsample_2d_bilinear_apply_interpolation_p4_avx2(src, dst, offset_value, opt); return; } #endif @@ -371,58 +294,45 @@ static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const float* offset_ptr_00 = offset.channel(0); - const float* offset_ptr_01 = offset.channel(1); - const float* offset_ptr_10 = offset.channel(2); - const float* offset_ptr_11 = offset.channel(3); - - const float* value_ptr_alpha = value.channel(0); - const float* value_ptr_beta = value.channel(1); + const float* offset_value_ptr = offset_value.channel(0); for (int i = 0; i < grid_size; i++) { - __m128i v00_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_00), _mm_set_epi32(3, 2, 1, 0)); - __m128i v01_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_01), _mm_set_epi32(3, 2, 1, 0)); - __m128i v10_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_10), _mm_set_epi32(3, 2, 1, 0)); - __m128i v11_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_11), _mm_set_epi32(3, 2, 1, 0)); + __m128i v00_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); + float in_bound_00 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; + + __m128i v01_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); + float in_bound_01 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; - float in_bound_00 = *reinterpret_cast(offset_ptr_00) >= 0 ? -1.0f : 0.0f; - float in_bound_01 = *reinterpret_cast(offset_ptr_01) >= 0 ? -1.0f : 0.0f; - float in_bound_10 = *reinterpret_cast(offset_ptr_10) >= 0 ? -1.0f : 0.0f; - float in_bound_11 = *reinterpret_cast(offset_ptr_11) >= 0 ? -1.0f : 0.0f; + __m128i v10_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); + float in_bound_10 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; + + __m128i v11_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); + float in_bound_11 = *reinterpret_cast(offset_value_ptr++) >= 0 ? 
-1.0f : 0.0f; __m128 v00_val = mask_gather_ps(srcptr, v00_offset, _mm_set1_ps(in_bound_00)); __m128 v01_val = mask_gather_ps(srcptr, v01_offset, _mm_set1_ps(in_bound_01)); __m128 v10_val = mask_gather_ps(srcptr, v10_offset, _mm_set1_ps(in_bound_10)); __m128 v11_val = mask_gather_ps(srcptr, v11_offset, _mm_set1_ps(in_bound_11)); - __m128 alpha = _mm_set1_ps(*value_ptr_alpha); - __m128 beta = _mm_set1_ps(*value_ptr_beta); - - __m128 v0 = _mm_comp_fmadd_ps(v01_val, alpha, _mm_comp_fnmadd_ps(v00_val, alpha, v00_val)); - __m128 v1 = _mm_comp_fmadd_ps(v11_val, alpha, _mm_comp_fnmadd_ps(v10_val, alpha, v10_val)); + __m128 value = _mm_set1_ps(*offset_value_ptr++); + __m128 v0 = _mm_comp_fmadd_ps(v01_val, value, _mm_comp_fnmadd_ps(v00_val, value, v00_val)); + __m128 v1 = _mm_comp_fmadd_ps(v11_val, value, _mm_comp_fnmadd_ps(v10_val, value, v10_val)); - __m128 _v = _mm_comp_fmadd_ps(v1, beta, _mm_comp_fnmadd_ps(v0, beta, v0)); + value = _mm_set1_ps(*offset_value_ptr++); + __m128 _v = _mm_comp_fmadd_ps(v1, value, _mm_comp_fnmadd_ps(v0, value, v0)); _mm_storeu_ps(dstptr, _v); - offset_ptr_00++; - offset_ptr_01++; - offset_ptr_10++; - offset_ptr_11++; - - value_ptr_alpha++; - value_ptr_beta++; - dstptr += 4; } } } -static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) +static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) { #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ if (ncnn::cpu_support_x86_avx2()) { - gridsample_3d_bilinear_apply_interpolation_p4_avx2(src, dst, offset, value, opt); + gridsample_3d_bilinear_apply_interpolation_p4_avx2(src, dst, offset_value, opt); return; } #endif @@ -439,38 +349,33 @@ static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const float* offset_ptr_000 = offset.channel(0); - const float* offset_ptr_001 = offset.channel(1); - const float* offset_ptr_010 = offset.channel(2); - const float* offset_ptr_011 = offset.channel(3); - const float* offset_ptr_100 = offset.channel(4); - const float* offset_ptr_101 = offset.channel(5); - const float* offset_ptr_110 = offset.channel(6); - const float* offset_ptr_111 = offset.channel(7); - - const float* value_ptr_alpha = value.channel(0); - const float* value_ptr_beta = value.channel(1); - const float* value_ptr_gamma = value.channel(2); + const float* offset_value_ptr = offset_value.channel(0); for (int i = 0; i < grid_size; i++) { - __m128i v000_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_000), _mm_set_epi32(3, 2, 1, 0)); - __m128i v001_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_001), _mm_set_epi32(3, 2, 1, 0)); - __m128i v010_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_010), _mm_set_epi32(3, 2, 1, 0)); - __m128i v011_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_011), _mm_set_epi32(3, 2, 1, 0)); - __m128i v100_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_100), _mm_set_epi32(3, 2, 1, 0)); - __m128i v101_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_101), _mm_set_epi32(3, 2, 1, 0)); - __m128i v110_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_110), _mm_set_epi32(3, 2, 1, 0)); - __m128i v111_offset = _mm_add_epi32(_mm_set1_epi32(*offset_ptr_111), _mm_set_epi32(3, 2, 1, 0)); - - float in_bound_000 = *reinterpret_cast(offset_ptr_000) >= 0 ? -1.0f : 0.0f; - float in_bound_001 = *reinterpret_cast(offset_ptr_001) >= 0 ? 
-1.0f : 0.0f; - float in_bound_010 = *reinterpret_cast(offset_ptr_010) >= 0 ? -1.0f : 0.0f; - float in_bound_011 = *reinterpret_cast(offset_ptr_011) >= 0 ? -1.0f : 0.0f; - float in_bound_100 = *reinterpret_cast(offset_ptr_100) >= 0 ? -1.0f : 0.0f; - float in_bound_101 = *reinterpret_cast(offset_ptr_101) >= 0 ? -1.0f : 0.0f; - float in_bound_110 = *reinterpret_cast(offset_ptr_110) >= 0 ? -1.0f : 0.0f; - float in_bound_111 = *reinterpret_cast(offset_ptr_111) >= 0 ? -1.0f : 0.0f; + __m128i v000_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); + float in_bound_000 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; + + __m128i v001_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); + float in_bound_001 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; + + __m128i v010_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); + float in_bound_010 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; + + __m128i v011_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); + float in_bound_011 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; + + __m128i v100_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); + float in_bound_100 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; + + __m128i v101_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); + float in_bound_101 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; + + __m128i v110_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); + float in_bound_110 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; + + __m128i v111_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); + float in_bound_111 = *reinterpret_cast(offset_value_ptr++) >= 0 ? 
-1.0f : 0.0f; __m128 v000_val = mask_gather_ps(srcptr, v000_offset, _mm_set1_ps(in_bound_000)); __m128 v001_val = mask_gather_ps(srcptr, v001_offset, _mm_set1_ps(in_bound_001)); @@ -481,51 +386,28 @@ static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d __m128 v110_val = mask_gather_ps(srcptr, v110_offset, _mm_set1_ps(in_bound_110)); __m128 v111_val = mask_gather_ps(srcptr, v111_offset, _mm_set1_ps(in_bound_111)); - __m128 alpha = _mm_set1_ps(*value_ptr_alpha); - __m128 beta = _mm_set1_ps(*value_ptr_beta); - __m128 gamma = _mm_set1_ps(*value_ptr_gamma); - - __m128 v00 = _mm_comp_fmadd_ps(v001_val, alpha, _mm_comp_fnmadd_ps(v000_val, alpha, v000_val)); - __m128 v01 = _mm_comp_fmadd_ps(v011_val, alpha, _mm_comp_fnmadd_ps(v010_val, alpha, v010_val)); - __m128 v10 = _mm_comp_fmadd_ps(v101_val, alpha, _mm_comp_fnmadd_ps(v100_val, alpha, v100_val)); - __m128 v11 = _mm_comp_fmadd_ps(v111_val, alpha, _mm_comp_fnmadd_ps(v110_val, alpha, v110_val)); + __m128 value = _mm_set1_ps(*offset_value_ptr++); + __m128 v00 = _mm_comp_fmadd_ps(v001_val, value, _mm_comp_fnmadd_ps(v000_val, value, v000_val)); + __m128 v01 = _mm_comp_fmadd_ps(v011_val, value, _mm_comp_fnmadd_ps(v010_val, value, v010_val)); + __m128 v10 = _mm_comp_fmadd_ps(v101_val, value, _mm_comp_fnmadd_ps(v100_val, value, v100_val)); + __m128 v11 = _mm_comp_fmadd_ps(v111_val, value, _mm_comp_fnmadd_ps(v110_val, value, v110_val)); - __m128 v0 = _mm_comp_fmadd_ps(v01, beta, _mm_comp_fnmadd_ps(v00, beta, v00)); - __m128 v1 = _mm_comp_fmadd_ps(v11, beta, _mm_comp_fnmadd_ps(v10, beta, v10)); + value = _mm_set1_ps(*offset_value_ptr++); + __m128 v0 = _mm_comp_fmadd_ps(v01, value, _mm_comp_fnmadd_ps(v00, value, v00)); + __m128 v1 = _mm_comp_fmadd_ps(v11, value, _mm_comp_fnmadd_ps(v10, value, v10)); - __m128 _v = _mm_comp_fmadd_ps(v1, gamma, _mm_comp_fnmadd_ps(v0, gamma, v0)); + value = _mm_set1_ps(*offset_value_ptr++); + __m128 _v = _mm_comp_fmadd_ps(v1, value, _mm_comp_fnmadd_ps(v0, value, v0)); _mm_storeu_ps(dstptr, _v); - offset_ptr_000++; - offset_ptr_001++; - offset_ptr_010++; - offset_ptr_011++; - - offset_ptr_100++; - offset_ptr_101++; - offset_ptr_110++; - offset_ptr_111++; - - value_ptr_alpha++; - value_ptr_beta++; - value_ptr_gamma++; - dstptr += 4; } } } #endif // __SSE2__ -static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) +static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) { -#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ - if (ncnn::cpu_support_x86_avx2()) - { - gridsample_2d_bilinear_apply_interpolation_p1_avx2(src, dst, offset, value, opt); - return; - } -#endif - const int channels = dst.c; const int outw = dst.w; const int outh = dst.h; @@ -537,125 +419,33 @@ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const float* offset_ptr_00 = offset.channel(0); - const float* offset_ptr_01 = offset.channel(1); - const float* offset_ptr_10 = offset.channel(2); - const float* offset_ptr_11 = offset.channel(3); - - const float* value_ptr_alpha = value.channel(0); - const float* value_ptr_beta = value.channel(1); - - int x = 0; -#if __SSE2__ -#if __AVX__ - - for (; x + 7 < grid_size; x += 8) - { - __m256i v00_offset = _mm256_set_epi32(*(offset_ptr_00 + 7), *(offset_ptr_00 + 6), *(offset_ptr_00 + 5), *(offset_ptr_00 + 4), *(offset_ptr_00 + 3), 
*(offset_ptr_00 + 2), *(offset_ptr_00 + 1), *offset_ptr_00); - __m256i v01_offset = _mm256_set_epi32(*(offset_ptr_01 + 7), *(offset_ptr_01 + 6), *(offset_ptr_01 + 5), *(offset_ptr_01 + 4), *(offset_ptr_01 + 3), *(offset_ptr_01 + 2), *(offset_ptr_01 + 1), *offset_ptr_01); - __m256i v10_offset = _mm256_set_epi32(*(offset_ptr_10 + 7), *(offset_ptr_10 + 6), *(offset_ptr_10 + 5), *(offset_ptr_10 + 4), *(offset_ptr_10 + 3), *(offset_ptr_10 + 2), *(offset_ptr_10 + 1), *offset_ptr_10); - __m256i v11_offset = _mm256_set_epi32(*(offset_ptr_11 + 7), *(offset_ptr_11 + 6), *(offset_ptr_11 + 5), *(offset_ptr_11 + 4), *(offset_ptr_11 + 3), *(offset_ptr_11 + 2), *(offset_ptr_11 + 1), *offset_ptr_11); - - __m256 v00_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_00), _mm256_set1_ps(-1.0f)); - __m256 v01_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_01), _mm256_set1_ps(-1.0f)); - __m256 v10_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_10), _mm256_set1_ps(-1.0f)); - __m256 v11_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_11), _mm256_set1_ps(-1.0f)); - - __m256 v00_val = mask_gather_ps256(srcptr, v00_offset, v00_in_bound); - __m256 v01_val = mask_gather_ps256(srcptr, v01_offset, v01_in_bound); - __m256 v10_val = mask_gather_ps256(srcptr, v10_offset, v10_in_bound); - __m256 v11_val = mask_gather_ps256(srcptr, v11_offset, v11_in_bound); - - __m256 alpha = _mm256_loadu_ps(value_ptr_alpha); - __m256 beta = _mm256_loadu_ps(value_ptr_beta); - - __m256 v0 = _mm256_comp_fmadd_ps(v01_val, alpha, _mm256_comp_fnmadd_ps(v00_val, alpha, v00_val)); - __m256 v1 = _mm256_comp_fmadd_ps(v11_val, alpha, _mm256_comp_fnmadd_ps(v10_val, alpha, v10_val)); - - __m256 _v = _mm256_comp_fmadd_ps(v1, beta, _mm256_comp_fnmadd_ps(v0, beta, v0)); - _mm256_storeu_ps(dstptr, _v); - - offset_ptr_00 += 8; - offset_ptr_01 += 8; - offset_ptr_10 += 8; - offset_ptr_11 += 8; + const float* offset_value_ptr = offset_value.channel(0); - value_ptr_alpha += 8; - value_ptr_beta += 8; - - dstptr += 8; - } -#endif // __AVX__ - for (; x + 3 < grid_size; x += 4) + for (int x = 0 ; x < grid_size; x++) { - __m128i v00_offset = _mm_set_epi32(*(offset_ptr_00 + 3), *(offset_ptr_00 + 2), *(offset_ptr_00 + 1), *offset_ptr_00); - __m128i v01_offset = _mm_set_epi32(*(offset_ptr_01 + 3), *(offset_ptr_01 + 2), *(offset_ptr_01 + 1), *offset_ptr_01); - __m128i v10_offset = _mm_set_epi32(*(offset_ptr_10 + 3), *(offset_ptr_10 + 2), *(offset_ptr_10 + 1), *offset_ptr_10); - __m128i v11_offset = _mm_set_epi32(*(offset_ptr_11 + 3), *(offset_ptr_11 + 2), *(offset_ptr_11 + 1), *offset_ptr_11); - - __m128 v00_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_00), _mm_set1_ps(-1.0f)); - __m128 v01_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_01), _mm_set1_ps(-1.0f)); - __m128 v10_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_10), _mm_set1_ps(-1.0f)); - __m128 v11_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_11), _mm_set1_ps(-1.0f)); - - __m128 v00_val = mask_gather_ps(srcptr, v00_offset, v00_in_bound); - __m128 v01_val = mask_gather_ps(srcptr, v01_offset, v01_in_bound); - __m128 v10_val = mask_gather_ps(srcptr, v10_offset, v10_in_bound); - __m128 v11_val = mask_gather_ps(srcptr, v11_offset, v11_in_bound); - - __m128 alpha = _mm_loadu_ps(value_ptr_alpha); - __m128 beta = _mm_loadu_ps(value_ptr_beta); + float v00 = *offset_value_ptr >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; + offset_value_ptr++; + float v01 = *offset_value_ptr >= 0 ? 
*(srcptr + static_cast(*offset_value_ptr)) : 0; + offset_value_ptr++; + float v10 = *offset_value_ptr >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; + offset_value_ptr++; + float v11 = *offset_value_ptr >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; + offset_value_ptr++; + + float v0 = v00 * (1 - *offset_value_ptr) + v01 * *offset_value_ptr; + float v1 = v10 * (1 - *offset_value_ptr) + v11 * *offset_value_ptr; + offset_value_ptr++; + + *dstptr = v0 * (1 - *offset_value_ptr) + v1 * *offset_value_ptr; + offset_value_ptr++; - __m128 v0 = _mm_comp_fmadd_ps(v01_val, alpha, _mm_comp_fnmadd_ps(v00_val, alpha, v00_val)); - __m128 v1 = _mm_comp_fmadd_ps(v11_val, alpha, _mm_comp_fnmadd_ps(v10_val, alpha, v10_val)); - - __m128 _v = _mm_comp_fmadd_ps(v1, beta, _mm_comp_fnmadd_ps(v0, beta, v0)); - _mm_storeu_ps(dstptr, _v); - - offset_ptr_00 += 4; - offset_ptr_01 += 4; - offset_ptr_10 += 4; - offset_ptr_11 += 4; - - value_ptr_alpha += 4; - value_ptr_beta += 4; - - dstptr += 4; - } -#endif // __SSE2__ - for (; x < grid_size; x++) - { - float v00 = *offset_ptr_00 >= 0 ? *(srcptr + static_cast(*offset_ptr_00)) : 0; - float v01 = *offset_ptr_01 >= 0 ? *(srcptr + static_cast(*offset_ptr_01)) : 0; - float v10 = *offset_ptr_10 >= 0 ? *(srcptr + static_cast(*offset_ptr_10)) : 0; - float v11 = *offset_ptr_11 >= 0 ? *(srcptr + static_cast(*offset_ptr_11)) : 0; - - float v0 = v00 * (1 - *value_ptr_alpha) + v01 * *value_ptr_alpha; - float v1 = v10 * (1 - *value_ptr_alpha) + v11 * *value_ptr_alpha; - - *dstptr = v0 * (1 - *value_ptr_beta) + v1 * *value_ptr_beta; - - offset_ptr_00++; - offset_ptr_01++; - offset_ptr_10++; - offset_ptr_11++; - - value_ptr_alpha++; - value_ptr_beta++; dstptr++; } } } -static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset, const Mat& value, const Option& opt) -{ -#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ - if (ncnn::cpu_support_x86_avx2()) - { - gridsample_3d_bilinear_apply_interpolation_p1_avx2(src, dst, offset, value, opt); - return; - } -#endif +static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) +{ const int channels = dst.c; const int outw = dst.w; const int outh = dst.h; @@ -668,180 +458,41 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const float* offset_ptr_000 = offset.channel(0); - const float* offset_ptr_001 = offset.channel(1); - const float* offset_ptr_010 = offset.channel(2); - const float* offset_ptr_011 = offset.channel(3); - const float* offset_ptr_100 = offset.channel(4); - const float* offset_ptr_101 = offset.channel(5); - const float* offset_ptr_110 = offset.channel(6); - const float* offset_ptr_111 = offset.channel(7); - - const float* value_ptr_alpha = value.channel(0); - const float* value_ptr_beta = value.channel(1); - const float* value_ptr_gamma = value.channel(2); - - int x = 0; -#if __SSE2__ -#if __AVX__ - for (; x + 7 < grid_size; x += 8) - { - __m256i v000_offset = _mm256_set_epi32(*(offset_ptr_000 + 7), *(offset_ptr_000 + 6), *(offset_ptr_000 + 5), *(offset_ptr_000 + 4), *(offset_ptr_000 + 3), *(offset_ptr_000 + 2), *(offset_ptr_000 + 1), *offset_ptr_000); - __m256i v001_offset = _mm256_set_epi32(*(offset_ptr_001 + 7), *(offset_ptr_001 + 6), *(offset_ptr_001 + 5), *(offset_ptr_001 + 4), *(offset_ptr_001 + 3), *(offset_ptr_001 + 2), *(offset_ptr_001 + 1), *offset_ptr_001); - __m256i v010_offset = 
_mm256_set_epi32(*(offset_ptr_010 + 7), *(offset_ptr_010 + 6), *(offset_ptr_010 + 5), *(offset_ptr_010 + 4), *(offset_ptr_010 + 3), *(offset_ptr_010 + 2), *(offset_ptr_010 + 1), *offset_ptr_010); - __m256i v011_offset = _mm256_set_epi32(*(offset_ptr_011 + 7), *(offset_ptr_011 + 6), *(offset_ptr_011 + 5), *(offset_ptr_011 + 4), *(offset_ptr_011 + 3), *(offset_ptr_011 + 2), *(offset_ptr_011 + 1), *offset_ptr_011); - __m256i v100_offset = _mm256_set_epi32(*(offset_ptr_100 + 7), *(offset_ptr_100 + 6), *(offset_ptr_100 + 5), *(offset_ptr_100 + 4), *(offset_ptr_100 + 3), *(offset_ptr_100 + 2), *(offset_ptr_100 + 1), *offset_ptr_100); - __m256i v101_offset = _mm256_set_epi32(*(offset_ptr_101 + 7), *(offset_ptr_101 + 6), *(offset_ptr_101 + 5), *(offset_ptr_101 + 4), *(offset_ptr_101 + 3), *(offset_ptr_101 + 2), *(offset_ptr_101 + 1), *offset_ptr_101); - __m256i v110_offset = _mm256_set_epi32(*(offset_ptr_110 + 7), *(offset_ptr_110 + 6), *(offset_ptr_110 + 5), *(offset_ptr_110 + 4), *(offset_ptr_110 + 3), *(offset_ptr_110 + 2), *(offset_ptr_110 + 1), *offset_ptr_110); - __m256i v111_offset = _mm256_set_epi32(*(offset_ptr_111 + 7), *(offset_ptr_111 + 6), *(offset_ptr_111 + 5), *(offset_ptr_111 + 4), *(offset_ptr_111 + 3), *(offset_ptr_111 + 2), *(offset_ptr_111 + 1), *offset_ptr_111); - - __m256 v000_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_000), _mm256_set1_ps(-1.0f)); - __m256 v001_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_001), _mm256_set1_ps(-1.0f)); - __m256 v010_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_010), _mm256_set1_ps(-1.0f)); - __m256 v011_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_011), _mm256_set1_ps(-1.0f)); - __m256 v100_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_100), _mm256_set1_ps(-1.0f)); - __m256 v101_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_101), _mm256_set1_ps(-1.0f)); - __m256 v110_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_110), _mm256_set1_ps(-1.0f)); - __m256 v111_in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr_111), _mm256_set1_ps(-1.0f)); - - __m256 v000_val = mask_gather_ps256(srcptr, v000_offset, v000_in_bound); - __m256 v001_val = mask_gather_ps256(srcptr, v001_offset, v001_in_bound); - __m256 v010_val = mask_gather_ps256(srcptr, v010_offset, v010_in_bound); - __m256 v011_val = mask_gather_ps256(srcptr, v011_offset, v011_in_bound); - __m256 v100_val = mask_gather_ps256(srcptr, v100_offset, v100_in_bound); - __m256 v101_val = mask_gather_ps256(srcptr, v101_offset, v101_in_bound); - __m256 v110_val = mask_gather_ps256(srcptr, v110_offset, v110_in_bound); - __m256 v111_val = mask_gather_ps256(srcptr, v111_offset, v111_in_bound); - - __m256 alpha = _mm256_loadu_ps(value_ptr_alpha); - __m256 beta = _mm256_loadu_ps(value_ptr_beta); - __m256 gamma = _mm256_loadu_ps(value_ptr_gamma); - - __m256 v00 = _mm256_comp_fmadd_ps(v001_val, alpha, _mm256_comp_fnmadd_ps(v000_val, alpha, v000_val)); - __m256 v01 = _mm256_comp_fmadd_ps(v011_val, alpha, _mm256_comp_fnmadd_ps(v010_val, alpha, v010_val)); - __m256 v10 = _mm256_comp_fmadd_ps(v101_val, alpha, _mm256_comp_fnmadd_ps(v100_val, alpha, v100_val)); - __m256 v11 = _mm256_comp_fmadd_ps(v111_val, alpha, _mm256_comp_fnmadd_ps(v110_val, alpha, v110_val)); - - __m256 v0 = _mm256_comp_fmadd_ps(v01, beta, _mm256_comp_fnmadd_ps(v00, beta, v00)); - __m256 v1 = _mm256_comp_fmadd_ps(v11, beta, _mm256_comp_fnmadd_ps(v10, beta, v10)); - - __m256 _v = _mm256_comp_fmadd_ps(v1, gamma, _mm256_comp_fnmadd_ps(v0, gamma, v0)); - _mm256_storeu_ps(dstptr, 
_v); - - offset_ptr_000 += 8; - offset_ptr_001 += 8; - offset_ptr_010 += 8; - offset_ptr_011 += 8; + const float* offset_value_ptr = offset_value.channel(0); - offset_ptr_100 += 8; - offset_ptr_101 += 8; - offset_ptr_110 += 8; - offset_ptr_111 += 8; - - value_ptr_alpha += 8; - value_ptr_beta += 8; - value_ptr_gamma += 8; - - dstptr += 8; - } - -#endif // __AVX__ - for (; x + 3 < grid_size; x += 4) + for (int x = 0; x < grid_size; x++) { - __m128i v000_offset = _mm_set_epi32(*(offset_ptr_000 + 3), *(offset_ptr_000 + 2), *(offset_ptr_000 + 1), *offset_ptr_000); - __m128i v001_offset = _mm_set_epi32(*(offset_ptr_001 + 3), *(offset_ptr_001 + 2), *(offset_ptr_001 + 1), *offset_ptr_001); - __m128i v010_offset = _mm_set_epi32(*(offset_ptr_010 + 3), *(offset_ptr_010 + 2), *(offset_ptr_010 + 1), *offset_ptr_010); - __m128i v011_offset = _mm_set_epi32(*(offset_ptr_011 + 3), *(offset_ptr_011 + 2), *(offset_ptr_011 + 1), *offset_ptr_011); - __m128i v100_offset = _mm_set_epi32(*(offset_ptr_100 + 3), *(offset_ptr_100 + 2), *(offset_ptr_100 + 1), *offset_ptr_100); - __m128i v101_offset = _mm_set_epi32(*(offset_ptr_101 + 3), *(offset_ptr_101 + 2), *(offset_ptr_101 + 1), *offset_ptr_101); - __m128i v110_offset = _mm_set_epi32(*(offset_ptr_110 + 3), *(offset_ptr_110 + 2), *(offset_ptr_110 + 1), *offset_ptr_110); - __m128i v111_offset = _mm_set_epi32(*(offset_ptr_111 + 3), *(offset_ptr_111 + 2), *(offset_ptr_111 + 1), *offset_ptr_111); - - __m128 v000_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_000), _mm_set1_ps(-1.0f)); - __m128 v001_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_001), _mm_set1_ps(-1.0f)); - __m128 v010_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_010), _mm_set1_ps(-1.0f)); - __m128 v011_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_011), _mm_set1_ps(-1.0f)); - __m128 v100_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_100), _mm_set1_ps(-1.0f)); - __m128 v101_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_101), _mm_set1_ps(-1.0f)); - __m128 v110_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_110), _mm_set1_ps(-1.0f)); - __m128 v111_in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr_111), _mm_set1_ps(-1.0f)); - - __m128 v000_val = mask_gather_ps(srcptr, v000_offset, v000_in_bound); - __m128 v001_val = mask_gather_ps(srcptr, v001_offset, v001_in_bound); - __m128 v010_val = mask_gather_ps(srcptr, v010_offset, v010_in_bound); - __m128 v011_val = mask_gather_ps(srcptr, v011_offset, v011_in_bound); - __m128 v100_val = mask_gather_ps(srcptr, v100_offset, v100_in_bound); - __m128 v101_val = mask_gather_ps(srcptr, v101_offset, v101_in_bound); - __m128 v110_val = mask_gather_ps(srcptr, v110_offset, v110_in_bound); - __m128 v111_val = mask_gather_ps(srcptr, v111_offset, v111_in_bound); - - __m128 alpha = _mm_loadu_ps(value_ptr_alpha); - __m128 beta = _mm_loadu_ps(value_ptr_beta); - __m128 gamma = _mm_loadu_ps(value_ptr_gamma); - - __m128 v00 = _mm_comp_fmadd_ps(v001_val, alpha, _mm_comp_fnmadd_ps(v000_val, alpha, v000_val)); - __m128 v01 = _mm_comp_fmadd_ps(v011_val, alpha, _mm_comp_fnmadd_ps(v010_val, alpha, v010_val)); - __m128 v10 = _mm_comp_fmadd_ps(v101_val, alpha, _mm_comp_fnmadd_ps(v100_val, alpha, v100_val)); - __m128 v11 = _mm_comp_fmadd_ps(v111_val, alpha, _mm_comp_fnmadd_ps(v110_val, alpha, v110_val)); - - __m128 v0 = _mm_comp_fmadd_ps(v01, beta, _mm_comp_fnmadd_ps(v00, beta, v00)); - __m128 v1 = _mm_comp_fmadd_ps(v11, beta, _mm_comp_fnmadd_ps(v10, beta, v10)); - - __m128 _v = _mm_comp_fmadd_ps(v1, gamma, _mm_comp_fnmadd_ps(v0, gamma, v0)); - _mm_storeu_ps(dstptr, _v); + 
float v000 = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; + offset_value_ptr++; + float v001 = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; + offset_value_ptr++; + float v010 = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; + offset_value_ptr++; + float v011 = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; + offset_value_ptr++; + + float v100 = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; + offset_value_ptr++; + float v101 = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; + offset_value_ptr++; + float v110 = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; + offset_value_ptr++; + float v111 = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; + offset_value_ptr++; + + float v00 = v000 * (1 - *offset_value_ptr) + v001 * *offset_value_ptr; + float v01 = v010 * (1 - *offset_value_ptr) + v011 * *offset_value_ptr; + float v10 = v100 * (1 - *offset_value_ptr) + v101 * *offset_value_ptr; + float v11 = v110 * (1 - *offset_value_ptr) + v111 * *offset_value_ptr; + offset_value_ptr++; + + float v0 = v00 * (1 - *offset_value_ptr) + v01 * *offset_value_ptr; + float v1 = v10 * (1 - *offset_value_ptr) + v11 * *offset_value_ptr; + offset_value_ptr++; + + *dstptr = v0 * (1 - *offset_value_ptr) + v1 * *offset_value_ptr; + offset_value_ptr++; - offset_ptr_000 += 4; - offset_ptr_001 += 4; - offset_ptr_010 += 4; - offset_ptr_011 += 4; - - offset_ptr_100 += 4; - offset_ptr_101 += 4; - offset_ptr_110 += 4; - offset_ptr_111 += 4; - - value_ptr_alpha += 4; - value_ptr_beta += 4; - value_ptr_gamma += 4; - - dstptr += 4; - } -#endif // __SSE2__ - for (; x < grid_size; x++) - { - float v000 = *reinterpret_cast(offset_ptr_000) >= 0 ? *(srcptr + static_cast(*offset_ptr_000)) : 0; - float v001 = *reinterpret_cast(offset_ptr_001) >= 0 ? *(srcptr + static_cast(*offset_ptr_001)) : 0; - float v010 = *reinterpret_cast(offset_ptr_010) >= 0 ? *(srcptr + static_cast(*offset_ptr_010)) : 0; - float v011 = *reinterpret_cast(offset_ptr_011) >= 0 ? *(srcptr + static_cast(*offset_ptr_011)) : 0; - - float v100 = *reinterpret_cast(offset_ptr_100) >= 0 ? *(srcptr + static_cast(*offset_ptr_100)) : 0; - float v101 = *reinterpret_cast(offset_ptr_101) >= 0 ? *(srcptr + static_cast(*offset_ptr_101)) : 0; - float v110 = *reinterpret_cast(offset_ptr_110) >= 0 ? *(srcptr + static_cast(*offset_ptr_110)) : 0; - float v111 = *reinterpret_cast(offset_ptr_111) >= 0 ? 
*(srcptr + static_cast(*offset_ptr_111)) : 0; - - float v00 = v000 * (1 - *value_ptr_alpha) + v001 * *value_ptr_alpha; - float v01 = v010 * (1 - *value_ptr_alpha) + v011 * *value_ptr_alpha; - float v10 = v100 * (1 - *value_ptr_alpha) + v101 * *value_ptr_alpha; - float v11 = v110 * (1 - *value_ptr_alpha) + v111 * *value_ptr_alpha; - - float v0 = v00 * (1 - *value_ptr_beta) + v01 * *value_ptr_beta; - float v1 = v10 * (1 - *value_ptr_beta) + v11 * *value_ptr_beta; - - *dstptr = v0 * (1 - *value_ptr_gamma) + v1 * *value_ptr_gamma; - - offset_ptr_000++; - offset_ptr_001++; - offset_ptr_010++; - offset_ptr_011++; - - offset_ptr_100++; - offset_ptr_101++; - offset_ptr_110++; - offset_ptr_111++; - - value_ptr_alpha++; - value_ptr_beta++; - value_ptr_gamma++; dstptr++; } } diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index fb2686ca419..8db03092332 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -13,17 +13,11 @@ // specific language governing permissions and limitations under the License. template -void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& offset, Mat& value, int permute_fusion) +void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& offset_value, int permute_fusion) { const int grid_size = grid.w * grid.h; - float* offset_ptr_00 = offset.channel(0); - float* offset_ptr_01 = offset.channel(1); - float* offset_ptr_10 = offset.channel(2); - float* offset_ptr_11 = offset.channel(3); - - float* value_ptr_alpha = value.channel(0); - float* value_ptr_beta = value.channel(1); + float* offset_value_ptr = offset_value.channel(0); grid_sample_unormalize unormalize; compute_coord get_coord; @@ -47,13 +41,11 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o gx = _mm256_shuffle_ps(gx, gy, 0b10001000); gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - { - gx = unormalize(_mm256_set1_ps(src.w), gx); - gx = get_coord(_mm256_set1_ps(src.w), gx); + gx = unormalize(_mm256_set1_ps(src.w), gx); + gx = get_coord(_mm256_set1_ps(src.w), gx); - gy = unormalize(_mm256_set1_ps(src.h), gy); - gy = get_coord(_mm256_set1_ps(src.h), gy); - } + gy = unormalize(_mm256_set1_ps(src.h), gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); __m256 x_w = _mm256_floor_ps(gx); __m256 y_n = _mm256_floor_ps(gy); @@ -80,30 +72,21 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o ne_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), ne_offset, v01_in_range); sw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), sw_offset, v10_in_range); se_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), se_offset, v11_in_range); - - _mm256_storeu_ps(offset_ptr_00, nw_offset); - _mm256_storeu_ps(offset_ptr_01, ne_offset); - _mm256_storeu_ps(offset_ptr_10, sw_offset); - _mm256_storeu_ps(offset_ptr_11, se_offset); - __m256 alpha = _mm256_sub_ps(gx, x_w); __m256 beta = _mm256_sub_ps(gy, y_n); - _mm256_storeu_ps(value_ptr_alpha, alpha); - _mm256_storeu_ps(value_ptr_beta, beta); + transpose8x6_ps(nw_offset, ne_offset, sw_offset, se_offset, alpha, beta); - _mm256_storeu_ps(value_ptr_alpha, alpha); - _mm256_storeu_ps(value_ptr_beta, beta); + _mm256_storeu_ps(offset_value_ptr, nw_offset); + _mm256_storeu_ps(offset_value_ptr + 8, ne_offset); + _mm256_storeu_ps(offset_value_ptr + 16, sw_offset); + _mm256_storeu_ps(offset_value_ptr + 24, se_offset); - gridptr += 16; - - offset_ptr_00 += 8; - offset_ptr_01 += 8; - 
offset_ptr_10 += 8; - offset_ptr_11 += 8; + _mm256_storeu_ps(offset_value_ptr + 32, alpha); + _mm256_storeu_ps(offset_value_ptr + 40, beta); - value_ptr_alpha += 8; - value_ptr_beta += 8; + gridptr += 16; + offset_value_ptr += 48; } #endif // __AVX__ @@ -118,8 +101,8 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o sample_y = unormalize(src.h, sample_y); sample_y = get_coord(src.h, sample_y); - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); + int x0 = (int)floorf(sample_x); + int y0 = (int)floorf(sample_y); int x1 = x0 + 1; int y1 = y0 + 1; @@ -133,23 +116,15 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o bool in_bound_10 = x0_in_bound & y1_in_bound; bool in_bound_11 = x1_in_bound & y1_in_bound; - *offset_ptr_00 = in_bound_00 ? (x0 + y0 * src.w) * src.elempack : -1.0f; - *offset_ptr_01 = in_bound_01 ? (x1 + y0 * src.w) * src.elempack : -1.0f; - *offset_ptr_10 = in_bound_10 ? (x0 + y1 * src.w) * src.elempack : -1.0f; - *offset_ptr_11 = in_bound_11 ? (x1 + y1 * src.w) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_00 ? (x0 + y0 * src.w) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_01 ? (x1 + y0 * src.w) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_10 ? (x0 + y1 * src.w) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_11 ? (x1 + y1 * src.w) * src.elempack : -1.0f; - *value_ptr_alpha = sample_x - x0; - *value_ptr_beta = sample_y - y0; + *offset_value_ptr++ = sample_x - x0; + *offset_value_ptr++ = sample_y - y0; gridptr += 2; - - offset_ptr_00++; - offset_ptr_01++; - offset_ptr_10++; - offset_ptr_11++; - - value_ptr_alpha++; - value_ptr_beta++; } } } @@ -165,13 +140,11 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o __m256 gx = _mm256_loadu_ps(gridptr_x); __m256 gy = _mm256_loadu_ps(gridptr_y); - { - gx = unormalize(_mm256_set1_ps(src.w), gx); - gx = get_coord(_mm256_set1_ps(src.w), gx); + gx = unormalize(_mm256_set1_ps(src.w), gx); + gx = get_coord(_mm256_set1_ps(src.w), gx); - gy = unormalize(_mm256_set1_ps(src.h), gy); - gy = get_coord(_mm256_set1_ps(src.h), gy); - } + gy = unormalize(_mm256_set1_ps(src.h), gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); __m256 x_w = _mm256_floor_ps(gx); __m256 y_n = _mm256_floor_ps(gy); @@ -198,28 +171,22 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o ne_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), ne_offset, v01_in_range); sw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), sw_offset, v10_in_range); se_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), se_offset, v11_in_range); - - _mm256_storeu_ps(offset_ptr_00, nw_offset); - _mm256_storeu_ps(offset_ptr_01, ne_offset); - _mm256_storeu_ps(offset_ptr_10, sw_offset); - _mm256_storeu_ps(offset_ptr_11, se_offset); - __m256 alpha = _mm256_sub_ps(gx, x_w); __m256 beta = _mm256_sub_ps(gy, y_n); - _mm256_storeu_ps(value_ptr_alpha, alpha); - _mm256_storeu_ps(value_ptr_beta, beta); + transpose8x6_ps(nw_offset, ne_offset, sw_offset, se_offset, alpha, beta); - gridptr_x += 8; - gridptr_y += 8; + _mm256_storeu_ps(offset_value_ptr, nw_offset); + _mm256_storeu_ps(offset_value_ptr + 8, ne_offset); + _mm256_storeu_ps(offset_value_ptr + 16, sw_offset); + _mm256_storeu_ps(offset_value_ptr + 24, se_offset); - offset_ptr_00 += 8; - offset_ptr_01 += 8; - offset_ptr_10 += 8; - offset_ptr_11 += 8; + _mm256_storeu_ps(offset_value_ptr + 32, alpha); + _mm256_storeu_ps(offset_value_ptr + 40, beta); - value_ptr_alpha += 8; - 
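// ---- editor's note: illustrative sketch, not part of the patch ----
// The rewritten compute_blob packs everything the apply pass needs into a
// single channel, six floats per sampled pixel in the 2D bilinear case. The
// struct below only documents that layout (the real code writes raw floats);
// transpose8x6_ps -- assumed to be an 8x6 register transpose provided by
// x86_usability.h -- interleaves eight pixels so the AVX path emits the same
// per-pixel order as the scalar tail.
struct BilinearRecord2D
{
    float offset00; // top-left source offset * elempack, or -1.f if out of bounds
    float offset01; // top-right
    float offset10; // bottom-left
    float offset11; // bottom-right
    float alpha;    // x fraction: sample_x - x0
    float beta;     // y fraction: sample_y - y0
}; // 6 floats per pixel, hence "offset_value_ptr += 48" for a batch of 8
// ---- end editor's note ----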
value_ptr_beta += 8; + gridptr_x += 8; + gridptr_y += 8; + offset_value_ptr += 48; } #endif // __AVX__ @@ -235,8 +202,8 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o sample_y = unormalize(src.h, sample_y); sample_y = get_coord(src.h, sample_y); - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); + int x0 = (int)floorf(sample_x); + int y0 = (int)floorf(sample_y); int x1 = x0 + 1; int y1 = y0 + 1; @@ -250,46 +217,26 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o bool in_bound_10 = x0_in_bound & y1_in_bound; bool in_bound_11 = x1_in_bound & y1_in_bound; - *offset_ptr_00 = in_bound_00 ? (x0 + y0 * src.w) * src.elempack : -1.0f; - *offset_ptr_01 = in_bound_01 ? (x1 + y0 * src.w) * src.elempack : -1.0f; - *offset_ptr_10 = in_bound_10 ? (x0 + y1 * src.w) * src.elempack : -1.0f; - *offset_ptr_11 = in_bound_11 ? (x1 + y1 * src.w) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_00 ? (x0 + y0 * src.w) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_01 ? (x1 + y0 * src.w) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_10 ? (x0 + y1 * src.w) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_11 ? (x1 + y1 * src.w) * src.elempack : -1.0f; - *value_ptr_alpha = sample_x - x0; - *value_ptr_beta = sample_y - y0; + *offset_value_ptr++ = sample_x - x0; + *offset_value_ptr++ = sample_y - y0; gridptr_x++; gridptr_y++; - - offset_ptr_00++; - offset_ptr_01++; - offset_ptr_10++; - offset_ptr_11++; - - value_ptr_alpha++; - value_ptr_beta++; } } } template -void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& offset, Mat& value, int permute_fusion) +void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& offset_value, int permute_fusion) { const int grid_size = grid.w * grid.h * grid.d; - float* offset_ptr_000 = offset.channel(0); - float* offset_ptr_001 = offset.channel(1); - float* offset_ptr_010 = offset.channel(2); - float* offset_ptr_011 = offset.channel(3); - - float* offset_ptr_100 = offset.channel(4); - float* offset_ptr_101 = offset.channel(5); - float* offset_ptr_110 = offset.channel(6); - float* offset_ptr_111 = offset.channel(7); - - float* value_ptr_alpha = value.channel(0); - float* value_ptr_beta = value.channel(1); - float* value_ptr_gamma = value.channel(2); + float* offset_value_ptr = offset_value.channel(0); grid_sample_unormalize unormalize; compute_coord get_coord; @@ -318,16 +265,14 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); - { - gx = unormalize(_mm256_set1_ps(src.w), gx); - gx = get_coord(_mm256_set1_ps(src.w), gx); + gx = unormalize(_mm256_set1_ps(src.w), gx); + gx = get_coord(_mm256_set1_ps(src.w), gx); - gy = unormalize(_mm256_set1_ps(src.h), gy); - gy = get_coord(_mm256_set1_ps(src.h), gy); + gy = unormalize(_mm256_set1_ps(src.h), gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); - gz = unormalize(_mm256_set1_ps(src.d), gz); - gz = get_coord(_mm256_set1_ps(src.d), gz); - } + gz = unormalize(_mm256_set1_ps(src.d), gz); + gz = get_coord(_mm256_set1_ps(src.d), gz); __m256 x_w = _mm256_floor_ps(gx); __m256 y_n = _mm256_floor_ps(gy); @@ -384,39 +329,29 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o bsw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bsw_offset, v110_in_range); bse_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bse_offset, 
v111_in_range); - _mm256_storeu_ps(offset_ptr_000, tnw_offset); - _mm256_storeu_ps(offset_ptr_001, tne_offset); - _mm256_storeu_ps(offset_ptr_010, tsw_offset); - _mm256_storeu_ps(offset_ptr_011, tse_offset); - - _mm256_storeu_ps(offset_ptr_100, bnw_offset); - _mm256_storeu_ps(offset_ptr_101, bne_offset); - _mm256_storeu_ps(offset_ptr_110, bsw_offset); - _mm256_storeu_ps(offset_ptr_111, bse_offset); - __m256 alpha = _mm256_sub_ps(gx, x_w); __m256 beta = _mm256_sub_ps(gy, y_n); __m256 gamma = _mm256_sub_ps(gz, z_t); - _mm256_storeu_ps(value_ptr_alpha, alpha); - _mm256_storeu_ps(value_ptr_beta, beta); - _mm256_storeu_ps(value_ptr_gamma, gamma); + transpose8x11_ps(tnw_offset, tne_offset, tsw_offset, tse_offset, bnw_offset, bne_offset, bsw_offset, bse_offset, alpha, beta, gamma); - gridptr += 24; + _mm256_storeu_ps(offset_value_ptr, tnw_offset); + _mm256_storeu_ps(offset_value_ptr + 8, tne_offset); + _mm256_storeu_ps(offset_value_ptr + 16, tsw_offset); + _mm256_storeu_ps(offset_value_ptr + 24, tse_offset); - offset_ptr_000 += 8; - offset_ptr_001 += 8; - offset_ptr_010 += 8; - offset_ptr_011 += 8; + _mm256_storeu_ps(offset_value_ptr + 32, bnw_offset); + _mm256_storeu_ps(offset_value_ptr + 40, bne_offset); + _mm256_storeu_ps(offset_value_ptr + 48, bsw_offset); + _mm256_storeu_ps(offset_value_ptr + 56, bse_offset); - offset_ptr_100 += 8; - offset_ptr_101 += 8; - offset_ptr_110 += 8; - offset_ptr_111 += 8; + _mm256_storeu_ps(offset_value_ptr + 64, alpha); + _mm256_storeu_ps(offset_value_ptr + 72, beta); + _mm256_storeu_ps(offset_value_ptr + 80, gamma); - value_ptr_alpha += 8; - value_ptr_beta += 8; - value_ptr_gamma += 8; + gridptr += 24; + + offset_value_ptr += 88; } #endif // __AVX__ @@ -435,9 +370,9 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o sample_z = unormalize(src.d, sample_z); sample_z = get_coord(src.d, sample_z); - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); - int z0 = (int)floor(sample_z); + int x0 = (int)floorf(sample_x); + int y0 = (int)floorf(sample_y); + int z0 = (int)floorf(sample_z); int x1 = x0 + 1; int y1 = y0 + 1; int z1 = z0 + 1; @@ -464,35 +399,21 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o bool in_bound_110 = v10_in_range & z1_in_range; bool in_bound_111 = v11_in_range & z1_in_range; - *offset_ptr_000 = in_bound_000 ? (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; - *offset_ptr_001 = in_bound_001 ? (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; - *offset_ptr_010 = in_bound_010 ? (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; - *offset_ptr_011 = in_bound_011 ? (x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_000 ? (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_001 ? (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_010 ? (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_011 ? (x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; - *offset_ptr_100 = in_bound_100 ? (x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; - *offset_ptr_101 = in_bound_101 ? (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; - *offset_ptr_110 = in_bound_110 ? (x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; - *offset_ptr_111 = in_bound_111 ? 
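// ---- editor's note: illustrative sketch, not part of the patch ----
// For the 3D case each sample stores eleven floats: the eight corner offsets
// followed by alpha/beta/gamma (hence "offset_value_ptr += 88" for a batch of
// eight). A corner offset is the flattened source index pre-scaled by
// elempack, with -1.f as the out-of-bounds sentinel, as in this hunk:
static inline float corner_offset_3d(int x, int y, int z, int w, int h, int d, int elempack)
{
    bool in_bound = (x > -1) && (x < w) && (y > -1) && (y < h) && (z > -1) && (z < d);
    return in_bound ? (float)((x + y * w + z * w * h) * elempack) : -1.0f;
}
// ---- end editor's note ----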
(x1 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_100 ? (x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_101 ? (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_110 ? (x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_111 ? (x1 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; - *value_ptr_alpha = sample_x - x0; - *value_ptr_beta = sample_y - y0; - *value_ptr_gamma = sample_z - z0; + *offset_value_ptr++ = sample_x - x0; + *offset_value_ptr++ = sample_y - y0; + *offset_value_ptr++ = sample_z - z0; gridptr += 3; - - offset_ptr_000++; - offset_ptr_001++; - offset_ptr_010++; - offset_ptr_011++; - - offset_ptr_100++; - offset_ptr_101++; - offset_ptr_110++; - offset_ptr_111++; - - value_ptr_alpha++; - value_ptr_beta++; - value_ptr_gamma++; } } } @@ -510,16 +431,14 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o __m256 gy = _mm256_loadu_ps(gridptr_y); __m256 gz = _mm256_loadu_ps(gridptr_z); - { - gx = unormalize(_mm256_set1_ps(src.w), gx); - gx = get_coord(_mm256_set1_ps(src.w), gx); + gx = unormalize(_mm256_set1_ps(src.w), gx); + gx = get_coord(_mm256_set1_ps(src.w), gx); - gy = unormalize(_mm256_set1_ps(src.h), gy); - gy = get_coord(_mm256_set1_ps(src.h), gy); + gy = unormalize(_mm256_set1_ps(src.h), gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); - gz = unormalize(_mm256_set1_ps(src.d), gz); - gz = get_coord(_mm256_set1_ps(src.d), gz); - } + gz = unormalize(_mm256_set1_ps(src.d), gz); + gz = get_coord(_mm256_set1_ps(src.d), gz); __m256 x_w = _mm256_floor_ps(gx); __m256 y_n = _mm256_floor_ps(gy); @@ -576,41 +495,31 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o bsw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bsw_offset, v110_in_range); bse_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bse_offset, v111_in_range); - _mm256_storeu_ps(offset_ptr_000, tnw_offset); - _mm256_storeu_ps(offset_ptr_001, tne_offset); - _mm256_storeu_ps(offset_ptr_010, tsw_offset); - _mm256_storeu_ps(offset_ptr_011, tse_offset); - - _mm256_storeu_ps(offset_ptr_100, bnw_offset); - _mm256_storeu_ps(offset_ptr_101, bne_offset); - _mm256_storeu_ps(offset_ptr_110, bsw_offset); - _mm256_storeu_ps(offset_ptr_111, bse_offset); - __m256 alpha = _mm256_sub_ps(gx, x_w); __m256 beta = _mm256_sub_ps(gy, y_n); __m256 gamma = _mm256_sub_ps(gz, z_t); - _mm256_storeu_ps(value_ptr_alpha, alpha); - _mm256_storeu_ps(value_ptr_beta, beta); - _mm256_storeu_ps(value_ptr_gamma, gamma); + transpose8x11_ps(tnw_offset, tne_offset, tsw_offset, tse_offset, bnw_offset, bne_offset, bsw_offset, bse_offset, alpha, beta, gamma); + + _mm256_storeu_ps(offset_value_ptr, tnw_offset); + _mm256_storeu_ps(offset_value_ptr + 8, tne_offset); + _mm256_storeu_ps(offset_value_ptr + 16, tsw_offset); + _mm256_storeu_ps(offset_value_ptr + 24, tse_offset); + + _mm256_storeu_ps(offset_value_ptr + 32, bnw_offset); + _mm256_storeu_ps(offset_value_ptr + 40, bne_offset); + _mm256_storeu_ps(offset_value_ptr + 48, bsw_offset); + _mm256_storeu_ps(offset_value_ptr + 56, bse_offset); + + _mm256_storeu_ps(offset_value_ptr + 64, alpha); + _mm256_storeu_ps(offset_value_ptr + 72, beta); + _mm256_storeu_ps(offset_value_ptr + 80, gamma); gridptr_x += 8; gridptr_y += 8; gridptr_z += 8; - offset_ptr_000 += 8; - offset_ptr_001 += 8; - offset_ptr_010 += 8; - offset_ptr_011 += 8; - - offset_ptr_100 += 8; - offset_ptr_101 
+= 8; - offset_ptr_110 += 8; - offset_ptr_111 += 8; - - value_ptr_alpha += 8; - value_ptr_beta += 8; - value_ptr_gamma += 8; + offset_value_ptr += 88; } #endif // __AVX__ @@ -629,9 +538,9 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o sample_z = unormalize(src.d, sample_z); sample_z = get_coord(src.d, sample_z); - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); - int z0 = (int)floor(sample_z); + int x0 = (int)floorf(sample_x); + int y0 = (int)floorf(sample_y); + int z0 = (int)floorf(sample_z); int x1 = x0 + 1; int y1 = y0 + 1; int z1 = z0 + 1; @@ -658,37 +567,23 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o bool in_bound_110 = v10_in_range & z1_in_range; bool in_bound_111 = v11_in_range & z1_in_range; - *offset_ptr_000 = in_bound_000 ? (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; - *offset_ptr_001 = in_bound_001 ? (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; - *offset_ptr_010 = in_bound_010 ? (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; - *offset_ptr_011 = in_bound_011 ? (x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_000 ? (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_001 ? (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_010 ? (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_011 ? (x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; + + *offset_value_ptr++ = in_bound_100 ? (x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_101 ? (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_110 ? (x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; + *offset_value_ptr++ = in_bound_111 ? (x1 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; - *offset_ptr_100 = in_bound_100 ? (x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; - *offset_ptr_101 = in_bound_101 ? (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; - *offset_ptr_110 = in_bound_110 ? (x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; - *offset_ptr_111 = in_bound_111 ? (x1 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; - - *value_ptr_alpha = sample_x - x0; - *value_ptr_beta = sample_y - y0; - *value_ptr_gamma = sample_z - z0; + *offset_value_ptr++ = sample_x - x0; + *offset_value_ptr++ = sample_y - y0; + *offset_value_ptr++ = sample_z - z0; gridptr_x++; gridptr_y++; gridptr_z++; - - offset_ptr_000++; - offset_ptr_001++; - offset_ptr_010++; - offset_ptr_011++; - - offset_ptr_100++; - offset_ptr_101++; - offset_ptr_110++; - offset_ptr_111++; - - value_ptr_alpha++; - value_ptr_beta++; - value_ptr_gamma++; } } } diff --git a/src/layer/x86/gridsample_compute_blob.h b/src/layer/x86/gridsample_compute_blob.h index d88251da3f7..4fb41f2cc24 100644 --- a/src/layer/x86/gridsample_compute_blob.h +++ b/src/layer/x86/gridsample_compute_blob.h @@ -12,6 +12,8 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
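// ---- editor's note: illustrative sketch, not part of the patch ----
// grid_sample_unormalize (declared just below) maps a normalized grid
// coordinate in [-1, 1] to a pixel coordinate. The scalar restatement below
// follows the usual torch.nn.functional.grid_sample convention and is given
// only for orientation; the patch implements it as template specializations
// selected by align_corner.
static inline float unnormalize_coord(int length, float coord, bool align_corner)
{
    if (align_corner)
        return (coord + 1.f) / 2.f * (length - 1); // -1 -> 0, +1 -> length - 1
    return ((coord + 1.f) * length - 1.f) / 2.f;   // pixel centers sit at half-integers
}
// ---- end editor's note ----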
+#include "x86_usability.h" + template struct grid_sample_unormalize; diff --git a/src/layer/x86/gridsample_nearest_apply_interpolation.h b/src/layer/x86/gridsample_nearest_apply_interpolation.h index c71086e5bea..e08eefd526f 100644 --- a/src/layer/x86/gridsample_nearest_apply_interpolation.h +++ b/src/layer/x86/gridsample_nearest_apply_interpolation.h @@ -15,7 +15,7 @@ #if __SSE2__ #if __AVX__ #if __AVX512F__ -static void gridsample_nearest_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset, const Option& opt) +static void gridsample_nearest_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) { const int channels = dst.c; const int outw = dst.w; @@ -29,7 +29,7 @@ static void gridsample_nearest_apply_interpolation_p16(const Mat& src, Mat& dst, const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const float* offset_ptr = offset.channel(0); + const float* offset_ptr = offset_value.channel(0); for (int i = 0; i < grid_size; i++) { @@ -45,17 +45,17 @@ static void gridsample_nearest_apply_interpolation_p16(const Mat& src, Mat& dst, #endif // __AVX512F__ #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ -void gridsample_nearest_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, const Mat& offset, const Option& opt); -void gridsample_nearest_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, const Mat& offset, const Option& opt); -void gridsample_nearest_apply_interpolation_p1_avx2(const Mat& src, Mat& dst, const Mat& offset, const Option& opt); +void gridsample_nearest_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt); +void gridsample_nearest_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt); +void gridsample_nearest_apply_interpolation_p1_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt); #endif -static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset, const Option& opt) +static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) { #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ if (ncnn::cpu_support_x86_avx2()) { - gridsample_nearest_apply_interpolation_p8_avx2(src, dst, offset, opt); + gridsample_nearest_apply_interpolation_p8_avx2(src, dst, offset_value, opt); return; } #endif @@ -72,7 +72,7 @@ static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const float* offset_ptr = offset.channel(0); + const float* offset_ptr = offset_value.channel(0); for (int i = 0; i < grid_size; i++) { @@ -92,12 +92,12 @@ static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, } } #endif // __AVX__ -static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset, const Option& opt) +static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) { #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ if (ncnn::cpu_support_x86_avx2()) { - gridsample_nearest_apply_interpolation_p4_avx2(src, dst, offset, opt); + gridsample_nearest_apply_interpolation_p4_avx2(src, dst, offset_value, opt); return; } #endif @@ -114,7 +114,7 @@ static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - 
const float* offset_ptr = offset.channel(0); + const float* offset_ptr = offset_value.channel(0); for (int i = 0; i < grid_size; i++) { @@ -131,12 +131,12 @@ static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, #endif // __SSE2__ -static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset, const Option& opt) +static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) { #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ if (ncnn::cpu_support_x86_avx2()) { - gridsample_nearest_apply_interpolation_p1_avx2(src, dst, offset, opt); + gridsample_nearest_apply_interpolation_p1_avx2(src, dst, offset_value, opt); return; } #endif @@ -153,7 +153,7 @@ static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const float* offset_ptr = offset.channel(0); + const float* offset_ptr = offset_value.channel(0); int x = 0; #if __SSE2__ diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index ae6905753f6..f3e59ce33d5 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -13,11 +13,11 @@ // specific language governing permissions and limitations under the License. template -void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& offset, Mat& value, int permute_fusion) +void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& offset_value, int permute_fusion) { const int grid_size = grid.w * grid.h; - float* offset_ptr = offset.channel(0); + float* offset_ptr = offset_value.channel(0); grid_sample_unormalize unormalize; compute_coord get_coord; @@ -79,8 +79,8 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of sample_y = unormalize(src.h, sample_y); sample_y = get_coord(src.h, sample_y); - int x0 = static_cast(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); + int x0 = static_cast(floorf(sample_x + 0.5f)); + int y0 = static_cast(floorf(sample_y + 0.5f)); bool in_bound = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h)); *offset_ptr = in_bound ? 
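// ---- editor's note: illustrative sketch, not part of the patch ----
// The nearest path stores a single float per sampled pixel: the flattened
// source offset (pre-scaled by elempack), or -1.f as the out-of-bounds
// sentinel, matching the scalar code in this hunk.
#include <math.h> // floorf
static inline float nearest_offset_2d(int w, int h, int elempack, float sample_x, float sample_y)
{
    int x0 = (int)floorf(sample_x + 0.5f); // round to the nearest source column
    int y0 = (int)floorf(sample_y + 0.5f); // round to the nearest source row
    bool in_bound = (x0 > -1) && (x0 < w) && (y0 > -1) && (y0 < h);
    return in_bound ? (float)((x0 + y0 * w) * elempack) : -1.0f;
}
// ---- end editor's note ----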
(x0 + y0 * src.w) * src.elempack : -1.0f; @@ -141,8 +141,8 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of sample_y = unormalize(src.h, sample_y); sample_y = get_coord(src.h, sample_y); - int x0 = static_cast(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); + int x0 = static_cast(floorf(sample_x + 0.5f)); + int y0 = static_cast(floorf(sample_y + 0.5f)); bool in_bound = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h)); @@ -157,11 +157,11 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of } template -void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& offset, Mat& value, int permute_fusion) +void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& offset_value, int permute_fusion) { const int grid_size = grid.w * grid.h * grid.d; - float* offset_ptr = offset.channel(0); + float* offset_ptr = offset_value.channel(0); grid_sample_unormalize unormalize; compute_coord get_coord; @@ -239,9 +239,9 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of sample_z = unormalize(src.d, sample_z); sample_z = get_coord(src.d, sample_z); - int x0 = static_cast(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); - int z0 = static_cast(floor(sample_z + 0.5f)); + int x0 = static_cast(floorf(sample_x + 0.5f)); + int y0 = static_cast(floorf(sample_y + 0.5f)); + int z0 = static_cast(floorf(sample_z + 0.5f)); bool in_bound = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h) & (z0 > -1) & (z0 < src.d)); @@ -318,9 +318,9 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of sample_z = unormalize(src.d, sample_z); sample_z = get_coord(src.d, sample_z); - int x0 = static_cast(floor(sample_x + 0.5f)); - int y0 = static_cast(floor(sample_y + 0.5f)); - int z0 = static_cast(floor(sample_z + 0.5f)); + int x0 = static_cast(floorf(sample_x + 0.5f)); + int y0 = static_cast(floorf(sample_y + 0.5f)); + int z0 = static_cast(floorf(sample_z + 0.5f)); bool in_bound = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h) & (z0 > -1) & (z0 < src.d)); diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index e53f6a53b08..db9efc85287 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -52,7 +52,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } else { - gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } } else if (padding_mode == GridSample::Padding_BORDER) { if (align_corner == 0) { - gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } else { - gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } } else if (padding_mode == GridSample::Padding_REFLECTION) { if (align_corner == 0) { - gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + 
gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } else { - gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } } else @@ -122,41 +121,41 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } else { - gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } } else if (padding_mode == GridSample::Padding_BORDER) { if (align_corner == 0) { - gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } else { - gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } } else if (padding_mode == GridSample::Padding_REFLECTION) { if (align_corner == 0) { - gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } else { - gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } } else @@ -168,42 +167,41 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } else { - gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } } else if (padding_mode == GridSample::Padding_BORDER) { if (align_corner == 0) { - gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } else { - gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } } else if (padding_mode == GridSample::Padding_REFLECTION) { if (align_corner == 0) { - gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } else { - gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } } else @@ -226,42 +224,41 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } else { - gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, 
permute_fusion); + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } } else if (padding_mode == GridSample::Padding_BORDER) { if (align_corner == 0) { - gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } else { - gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } } else if (padding_mode == GridSample::Padding_REFLECTION) { if (align_corner == 0) { - gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } else { - gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } } else @@ -273,41 +270,41 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } else { - gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } } else if (padding_mode == GridSample::Padding_BORDER) { if (align_corner == 0) { - gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } else { - gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } } else if (padding_mode == GridSample::Padding_REFLECTION) { if (align_corner == 0) { - gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } else { - gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_blob, value_blob, permute_fusion); + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); } } else @@ -333,26 +330,26 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector Date: Mon, 17 Jul 2023 00:13:39 +0000 Subject: [PATCH 114/127] apply code-format changes --- src/layer/x86/gridsample_bicubic_apply_interpolation.h | 2 -- src/layer/x86/gridsample_bicubic_compute_blob.h | 4 +--- src/layer/x86/gridsample_bilinear_apply_interpolation.h | 4 ++-- src/layer/x86/gridsample_bilinear_compute_blob.h | 2 +- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_apply_interpolation.h b/src/layer/x86/gridsample_bicubic_apply_interpolation.h index 116d69537a0..85fbf9239fe 100644 --- a/src/layer/x86/gridsample_bicubic_apply_interpolation.h +++ b/src/layer/x86/gridsample_bicubic_apply_interpolation.h @@ -221,7 +221,6 @@ static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& ds for (int ii = 0; ii < 4; ii++) { - float v0_in_bound = 
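// ---- editor's note: descriptive comment, not part of the patch ----
// Overall flow of GridSample_x86::forward() after this refactor: a single
// compute_blob pass walks the grid once and writes the packed offset/weight
// records into offset_value_blob, then the matching apply_interpolation_pN
// variant (chosen by elempack) loops over the output channels and only reads
// those records, so the coordinate math (unnormalize, clamping, bounds
// checks) is not repeated for every channel.
// ---- end editor's note ----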
*reinterpret_cast(offset_value_ptr) >= 0 ? -1.0f : 0.0f; __m128 x0_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr++), _mm_set_epi32(3, 2, 1, 0)), _mm_set_ps1(v0_in_bound)); float v1_in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1.0f : 0.0f; @@ -301,7 +300,6 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds float x3_val = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; offset_value_ptr++; - value_f[ii] = x_coeffs0 * x0_val; value_f[ii] = x_coeffs1 * x1_val + value_f[ii]; value_f[ii] = x_coeffs2 * x2_val + value_f[ii]; diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index 65634e1cf33..bd46e0d961d 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -129,8 +129,6 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of x2 = get_coord(src.w, x2); x3 = get_coord(src.w, x3); - - bool x1_in_range = (x1 > -1) & (x1 < src.w); bool x0_in_range = (x0 > -1) & (x0 < src.w); bool x2_in_range = (x2 > -1) & (x2 < src.w); @@ -179,7 +177,7 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of __m256 tx = _mm256_sub_ps(gx, gx_floor); __m256 ty = _mm256_sub_ps(gy, gy_floor); - + __m256 gx0 = _mm256_add_ps(gx_floor, _mm256_set1_ps(-1)); __m256 gx1 = gx_floor; __m256 gx2 = _mm256_add_ps(gx_floor, _mm256_set1_ps(1)); diff --git a/src/layer/x86/gridsample_bilinear_apply_interpolation.h b/src/layer/x86/gridsample_bilinear_apply_interpolation.h index 4145b9c7868..9b42ceaac3f 100644 --- a/src/layer/x86/gridsample_bilinear_apply_interpolation.h +++ b/src/layer/x86/gridsample_bilinear_apply_interpolation.h @@ -127,7 +127,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& #if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ void gridsample_2d_bilinear_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt); void gridsample_2d_bilinear_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt); - + void gridsample_3d_bilinear_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt); void gridsample_3d_bilinear_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt); #endif @@ -421,7 +421,7 @@ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d const float* offset_value_ptr = offset_value.channel(0); - for (int x = 0 ; x < grid_size; x++) + for (int x = 0; x < grid_size; x++) { float v00 = *offset_value_ptr >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; offset_value_ptr++; diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 8db03092332..33fb0f120aa 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -571,7 +571,7 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o *offset_value_ptr++ = in_bound_001 ? (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; *offset_value_ptr++ = in_bound_010 ? (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; *offset_value_ptr++ = in_bound_011 ? (x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; - + *offset_value_ptr++ = in_bound_100 ? 
(x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; *offset_value_ptr++ = in_bound_101 ? (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; *offset_value_ptr++ = in_bound_110 ? (x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; From 1a2471997ab28ca93429cd5ff5c672b692479b24 Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Fri, 21 Jul 2023 16:34:00 +0800 Subject: [PATCH 115/127] optimize gather --- .../x86/gridsample_apply_interpolation.h | 54 ---- .../gridsample_bicubic_apply_interpolation.h | 78 ++--- .../gridsample_bilinear_apply_interpolation.h | 267 ++++++------------ .../gridsample_nearest_apply_interpolation.h | 80 +----- src/layer/x86/gridsample_x86_avx2.cpp | 62 ---- tests/test_gridsample.cpp | 7 +- 6 files changed, 124 insertions(+), 424 deletions(-) delete mode 100644 src/layer/x86/gridsample_x86_avx2.cpp diff --git a/src/layer/x86/gridsample_apply_interpolation.h b/src/layer/x86/gridsample_apply_interpolation.h index abb457b874d..9e9599c34be 100644 --- a/src/layer/x86/gridsample_apply_interpolation.h +++ b/src/layer/x86/gridsample_apply_interpolation.h @@ -12,60 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#if __SSE2__ -#if __AVX__ -static __m256 mask_gather_ps256(const float* ptr, __m256i offset, __m256 mask) -{ -#if __AVX2__ - __m256 v = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), ptr, offset, mask, sizeof(float)); -#else - int offseti[8], maski[8]; - memcpy(offseti, &offset, 8 * sizeof(int)); - memcpy(maski, &mask, 8 * sizeof(int)); - - float data[8] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; - for (int i = 0; i < 8; i++) - { - if (maski[i] & 0xF0000000) - { - data[i] = *(ptr + offseti[i]); - } - } - - __m256 v = _mm256_loadu_ps(data); -#endif // __AVX2__ - - return v; -} - -#endif // __AVX__ - -static __m128 mask_gather_ps(const float* ptr, __m128i offset, __m128 mask) -{ -#if __AVX2__ - __m128 v = _mm_mask_i32gather_ps(_mm_setzero_ps(), ptr, offset, mask, sizeof(float)); -#else - int offseti[4], maski[4]; - memcpy(offseti, &offset, 4 * sizeof(int)); - memcpy(maski, &mask, 4 * sizeof(int)); - - float data[4] = {0.0f, 0.0f, 0.0f, 0.0f}; - for (int i = 0; i < 4; i++) - { - if (maski[i] & 0xF0000000) - { - data[i] = *(ptr + offseti[i]); - } - } - - __m128 v = _mm_loadu_ps(data); -#endif // __AVX__ - - return v; -} - -#endif // __SSE2__ - #include "gridsample_bilinear_apply_interpolation.h" #include "gridsample_bicubic_apply_interpolation.h" #include "gridsample_nearest_apply_interpolation.h" \ No newline at end of file diff --git a/src/layer/x86/gridsample_bicubic_apply_interpolation.h b/src/layer/x86/gridsample_bicubic_apply_interpolation.h index 85fbf9239fe..a4febb945bd 100644 --- a/src/layer/x86/gridsample_bicubic_apply_interpolation.h +++ b/src/layer/x86/gridsample_bicubic_apply_interpolation.h @@ -53,13 +53,17 @@ static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& d cubic_interp1d_p16(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm512_set1_ps(*offset_value_ptr++)); for (int ii = 0; ii < 4; ii++) { - __m512 x0_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr) >= 0 ? 
static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + __mmask16 in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; + __m512 x0_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); offset_value_ptr++; - __m512 x1_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; + __m512 x1_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); offset_value_ptr++; - __m512 x2_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; + __m512 x2_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); offset_value_ptr++; - __m512 x3_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; + __m512 x3_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); offset_value_ptr++; value_f[ii] = _mm512_mul_ps(x_coeffs0, x0_val); @@ -78,11 +82,8 @@ static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& d } } } + #endif // __AVX512F__ -#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ -void gridsample_2d_bicubic_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, Mat& offset_value, const Option& opt); -void gridsample_2d_bicubic_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, Mat& offset_value, const Option& opt); -#endif static void cubic_interp1d_p8(__m256& coeffs0, __m256& coeffs1, __m256& coeffs2, __m256& coeffs3, const __m256& tx) { @@ -101,14 +102,6 @@ static void cubic_interp1d_p8(__m256& coeffs0, __m256& coeffs1, __m256& coeffs2, static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& dst, Mat& offset_value, const Option& opt) { -#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ - if (ncnn::cpu_support_x86_avx2()) - { - gridsample_2d_bicubic_apply_interpolation_p8_avx2(src, dst, offset_value, opt); - return; - } -#endif - const int channels = dst.c; const int outw = dst.w; const int outh = dst.h; @@ -132,27 +125,14 @@ static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& ds for (int ii = 0; ii < 4; ii++) { - float v0_in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1.0f : 0.0f; - float v1_in_bound = *reinterpret_cast(offset_value_ptr + 1) >= 0 ? -1.0f : 0.0f; - float v2_in_bound = *reinterpret_cast(offset_value_ptr + 2) >= 0 ? -1.0f : 0.0f; - float v3_in_bound = *reinterpret_cast(offset_value_ptr + 3) >= 0 ? 
-1.0f : 0.0f; - -#if __AVX2__ - __m256i v0_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_value_ptr++), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v1_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_value_ptr++), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v2_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_value_ptr++), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v3_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_value_ptr++), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); -#else - __m256i v0_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_value_ptr++), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v1_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_value_ptr++), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v2_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_value_ptr++), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v3_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_value_ptr++), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); -#endif // __AVX2__ - - __m256 x0_val = mask_gather_ps256(srcptr, v0_offset, _mm256_set1_ps(v0_in_bound)); - __m256 x1_val = mask_gather_ps256(srcptr, v1_offset, _mm256_set1_ps(v1_in_bound)); - __m256 x2_val = mask_gather_ps256(srcptr, v2_offset, _mm256_set1_ps(v2_in_bound)); - __m256 x3_val = mask_gather_ps256(srcptr, v3_offset, _mm256_set1_ps(v3_in_bound)); + int in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; + __m256 x0_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; + __m256 x1_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; + __m256 x2_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; + __m256 x3_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); value_f[ii] = _mm256_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); @@ -189,14 +169,6 @@ static void cubic_interp1d_p4(__m128& coeffs0, __m128& coeffs1, __m128& coeffs2, static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& dst, Mat& offset_value, const Option& opt) { -#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ - if (ncnn::cpu_support_x86_avx2()) - { - gridsample_2d_bicubic_apply_interpolation_p4_avx2(src, dst, offset_value, opt); - return; - } -#endif - const int channels = dst.c; const int outw = dst.w; const int outh = dst.h; @@ -221,14 +193,14 @@ static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& ds for (int ii = 0; ii < 4; ii++) { - float v0_in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1.0f : 0.0f; - __m128 x0_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr++), _mm_set_epi32(3, 2, 1, 0)), _mm_set_ps1(v0_in_bound)); - float v1_in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1.0f : 0.0f; - __m128 x1_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr++), _mm_set_epi32(3, 2, 1, 0)), _mm_set_ps1(v1_in_bound)); - float v2_in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 
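// ---- editor's note: illustrative sketch, not part of the patch ----
// Bicubic sampling blends a 4x4 neighbourhood: cubic_interp1d_* turns the
// fractional coordinate into four weights, the inner loop combines each row
// of four taps with the x weights, and the four rows are then combined with
// the y weights. The scalar weight function below uses the Keys cubic
// convolution kernel with A = -0.75 (the convention PyTorch's bicubic uses);
// treat it as an assumption about what cubic_interp1d_* computes, since its
// body is not shown in this hunk.
static inline void cubic_weights(float t, float coeffs[4])
{
    const float A = -0.75f;
    coeffs[0] = ((A * (t + 1) - 5 * A) * (t + 1) + 8 * A) * (t + 1) - 4 * A;
    coeffs[1] = ((A + 2) * t - (A + 3)) * t * t + 1;
    coeffs[2] = ((A + 2) * (1 - t) - (A + 3)) * (1 - t) * (1 - t) + 1;
    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}
// ---- end editor's note ----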
-1.0f : 0.0f; - __m128 x2_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr++), _mm_set_epi32(3, 2, 1, 0)), _mm_set_ps1(v2_in_bound)); - float v3_in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1.0f : 0.0f; - __m128 x3_val = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr++), _mm_set_epi32(3, 2, 1, 0)), _mm_set_ps1(v3_in_bound)); + __m128 x0_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); + offset_value_ptr++; + __m128 x1_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); + offset_value_ptr++; + __m128 x2_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); + offset_value_ptr++; + __m128 x3_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); + offset_value_ptr++; value_f[ii] = _mm_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); diff --git a/src/layer/x86/gridsample_bilinear_apply_interpolation.h b/src/layer/x86/gridsample_bilinear_apply_interpolation.h index 9b42ceaac3f..5117e13ccea 100644 --- a/src/layer/x86/gridsample_bilinear_apply_interpolation.h +++ b/src/layer/x86/gridsample_bilinear_apply_interpolation.h @@ -32,22 +32,18 @@ static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& for (int i = 0; i < grid_size; i++) { - __m512i v00_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __mmask16 mask00 = *reinterpret_cast(offset_value_ptr++) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); - - __m512i v01_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __mmask16 mask01 = *reinterpret_cast(offset_value_ptr++) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); - - __m512i v10_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __mmask16 mask10 = *reinterpret_cast(offset_value_ptr++) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); - - __m512i v11_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __mmask16 mask11 = *reinterpret_cast(offset_value_ptr++) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0); - - __m512 v00_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask00, v00_offset, srcptr, sizeof(float)); - __m512 v01_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask01, v01_offset, srcptr, sizeof(float)); - __m512 v10_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask10, v10_offset, srcptr, sizeof(float)); - __m512 v11_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask11, v11_offset, srcptr, sizeof(float)); + __mmask16 in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; + __m512 v00_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); + offset_value_ptr++; + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; + __m512 v01_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); + offset_value_ptr++; + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 
0xFFFF : 0; + __m512 v10_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); + offset_value_ptr++; + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; + __m512 v11_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); + offset_value_ptr++; __m512 value = _mm512_set1_ps(*offset_value_ptr++); __m512 v0 = _mm512_fmadd_ps(v01_val, value, _mm512_fnmadd_ps(v00_val, value, v00_val)); @@ -79,29 +75,31 @@ static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& for (int i = 0; i < grid_size; i++) { - __m512i v000_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512 v000_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr++) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v000_offset, srcptr, sizeof(float)); - - __m512i v001_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512 v001_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr++) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v001_offset, srcptr, sizeof(float)); - - __m512i v010_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512 v010_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr++) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v010_offset, srcptr, sizeof(float)); - - __m512i v011_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512 v011_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr++) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v011_offset, srcptr, sizeof(float)); - - __m512i v100_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512 v100_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr++) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v100_offset, srcptr, sizeof(float)); - - __m512i v101_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512 v101_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr++) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v101_offset, srcptr, sizeof(float)); - - __m512i v110_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512 v110_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr++) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v110_offset, srcptr, sizeof(float)); + __mmask16 in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; + __m512 v000_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); + offset_value_ptr++; + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 
0xFFFF : 0; + __m512 v001_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); + offset_value_ptr++; + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; + __m512 v010_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); + offset_value_ptr++; + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; + __m512 v011_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); + offset_value_ptr++; - __m512i v111_offset = _mm512_add_epi32(_mm512_set1_epi32(*offset_value_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); - __m512 v111_val = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_value_ptr++) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), v111_offset, srcptr, sizeof(float)); + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; + __m512 v100_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); + offset_value_ptr++; + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; + __m512 v101_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); + offset_value_ptr++; + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; + __m512 v110_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); + offset_value_ptr++; + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; + __m512 v111_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); + offset_value_ptr++; __m512 value = _mm512_set1_ps(*offset_value_ptr++); __m512 v00 = _mm512_fmadd_ps(v001_val, value, _mm512_fnmadd_ps(v000_val, value, v000_val)); @@ -124,24 +122,8 @@ static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& #endif // __AVX512F__ -#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ -void gridsample_2d_bilinear_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt); -void gridsample_2d_bilinear_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt); - -void gridsample_3d_bilinear_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt); -void gridsample_3d_bilinear_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt); -#endif - static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) { -#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ - if (ncnn::cpu_support_x86_avx2()) - { - gridsample_2d_bilinear_apply_interpolation_p8_avx2(src, dst, offset_value, opt); - return; - } -#endif - const int channels = dst.c; const int outw = dst.w; const int outh = dst.h; @@ -157,26 +139,14 @@ static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d for (int i = 0; i < grid_size; i++) { - float in_bound_00 = *reinterpret_cast(offset_value_ptr) >= 0 ? -1.0f : 0.0f; - float in_bound_01 = *reinterpret_cast(offset_value_ptr + 1) >= 0 ? -1.0f : 0.0f; - float in_bound_10 = *reinterpret_cast(offset_value_ptr + 2) >= 0 ? -1.0f : 0.0f; - float in_bound_11 = *reinterpret_cast(offset_value_ptr + 3) >= 0 ? 
-1.0f : 0.0f; -#if __AVX2__ - __m256i v00_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_value_ptr++), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v01_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_value_ptr++), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v10_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_value_ptr++), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v11_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_value_ptr++), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); -#else - __m256i v00_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_value_ptr++), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v01_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_value_ptr++), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v10_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_value_ptr++), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v11_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_value_ptr++), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); -#endif // __AVX2__ - - __m256 v00_val = mask_gather_ps256(srcptr, v00_offset, _mm256_set1_ps(in_bound_00)); - __m256 v01_val = mask_gather_ps256(srcptr, v01_offset, _mm256_set1_ps(in_bound_01)); - __m256 v10_val = mask_gather_ps256(srcptr, v10_offset, _mm256_set1_ps(in_bound_10)); - __m256 v11_val = mask_gather_ps256(srcptr, v11_offset, _mm256_set1_ps(in_bound_11)); + int in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; + __m256 v00_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; + __m256 v01_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; + __m256 v10_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 
-1 : 0; + __m256 v11_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); __m256 value = _mm256_set1_ps(*offset_value_ptr++); __m256 v0 = _mm256_comp_fmadd_ps(v01_val, value, _mm256_comp_fnmadd_ps(v00_val, value, v00_val)); @@ -192,14 +162,6 @@ static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d } static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) { -#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ - if (ncnn::cpu_support_x86_avx2()) - { - gridsample_3d_bilinear_apply_interpolation_p8_avx2(src, dst, offset_value, opt); - return; - } -#endif - const int channels = dst.c; const int outw = dst.w; const int outh = dst.h; @@ -216,43 +178,23 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d for (int i = 0; i < grid_size; i++) { -#if __AVX2__ - __m256i v000_offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_value_ptr), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v001_offset = _mm256_add_epi32(_mm256_set1_epi32(*(offset_value_ptr + 1)), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v010_offset = _mm256_add_epi32(_mm256_set1_epi32(*(offset_value_ptr + 2)), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v011_offset = _mm256_add_epi32(_mm256_set1_epi32(*(offset_value_ptr + 3)), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v100_offset = _mm256_add_epi32(_mm256_set1_epi32(*(offset_value_ptr + 4)), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v101_offset = _mm256_add_epi32(_mm256_set1_epi32(*(offset_value_ptr + 5)), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v110_offset = _mm256_add_epi32(_mm256_set1_epi32(*(offset_value_ptr + 6)), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); - __m256i v111_offset = _mm256_add_epi32(_mm256_set1_epi32(*(offset_value_ptr + 7)), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); -#else - __m256i v000_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_value_ptr), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v001_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*(offset_value_ptr + 1)), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v010_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*(offset_value_ptr + 2)), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v011_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*(offset_value_ptr + 3)), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v100_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*(offset_value_ptr + 4)), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v101_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*(offset_value_ptr + 5)), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v110_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*(offset_value_ptr + 6)), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); - __m256i v111_offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*(offset_value_ptr + 7)), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); -#endif // __AVX2__ - - float in_bound_000 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; - float in_bound_001 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; - float in_bound_010 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; - float in_bound_011 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; - float in_bound_100 = *reinterpret_cast(offset_value_ptr++) >= 0 ? 
-1.0f : 0.0f; - float in_bound_101 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; - float in_bound_110 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; - float in_bound_111 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; - - __m256 v000_val = mask_gather_ps256(srcptr, v000_offset, _mm256_set1_ps(in_bound_000)); - __m256 v001_val = mask_gather_ps256(srcptr, v001_offset, _mm256_set1_ps(in_bound_001)); - __m256 v010_val = mask_gather_ps256(srcptr, v010_offset, _mm256_set1_ps(in_bound_010)); - __m256 v011_val = mask_gather_ps256(srcptr, v011_offset, _mm256_set1_ps(in_bound_011)); - __m256 v100_val = mask_gather_ps256(srcptr, v100_offset, _mm256_set1_ps(in_bound_100)); - __m256 v101_val = mask_gather_ps256(srcptr, v101_offset, _mm256_set1_ps(in_bound_101)); - __m256 v110_val = mask_gather_ps256(srcptr, v110_offset, _mm256_set1_ps(in_bound_110)); - __m256 v111_val = mask_gather_ps256(srcptr, v111_offset, _mm256_set1_ps(in_bound_111)); + int in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; + __m256 v000_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; + __m256 v001_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; + __m256 v010_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; + __m256 v011_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; + __m256 v100_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; + __m256 v101_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; + __m256 v110_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; + __m256 v111_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); __m256 value = _mm256_set1_ps(*offset_value_ptr++); __m256 v00 = _mm256_comp_fmadd_ps(v001_val, value, _mm256_comp_fnmadd_ps(v000_val, value, v000_val)); @@ -275,14 +217,6 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d #endif // __AVX__ static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) { -#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ - if (ncnn::cpu_support_x86_avx2()) - { - gridsample_2d_bilinear_apply_interpolation_p4_avx2(src, dst, offset_value, opt); - return; - } -#endif - const int channels = dst.c; const int outw = dst.w; const int outh = dst.h; @@ -298,22 +232,14 @@ static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d for (int i = 0; i < grid_size; i++) { - __m128i v00_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); - float in_bound_00 = *reinterpret_cast(offset_value_ptr++) >= 0 ? 
-1.0f : 0.0f; - - __m128i v01_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); - float in_bound_01 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; - - __m128i v10_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); - float in_bound_10 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; - - __m128i v11_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); - float in_bound_11 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; - - __m128 v00_val = mask_gather_ps(srcptr, v00_offset, _mm_set1_ps(in_bound_00)); - __m128 v01_val = mask_gather_ps(srcptr, v01_offset, _mm_set1_ps(in_bound_01)); - __m128 v10_val = mask_gather_ps(srcptr, v10_offset, _mm_set1_ps(in_bound_10)); - __m128 v11_val = mask_gather_ps(srcptr, v11_offset, _mm_set1_ps(in_bound_11)); + __m128 v00_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); + offset_value_ptr++; + __m128 v01_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); + offset_value_ptr++; + __m128 v10_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); + offset_value_ptr++; + __m128 v11_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); + offset_value_ptr++; __m128 value = _mm_set1_ps(*offset_value_ptr++); __m128 v0 = _mm_comp_fmadd_ps(v01_val, value, _mm_comp_fnmadd_ps(v00_val, value, v00_val)); @@ -329,14 +255,6 @@ static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d } static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) { -#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ - if (ncnn::cpu_support_x86_avx2()) - { - gridsample_3d_bilinear_apply_interpolation_p4_avx2(src, dst, offset_value, opt); - return; - } -#endif - const int channels = dst.c; const int outw = dst.w; const int outh = dst.h; @@ -353,38 +271,23 @@ static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d for (int i = 0; i < grid_size; i++) { - __m128i v000_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); - float in_bound_000 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; - - __m128i v001_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); - float in_bound_001 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; - - __m128i v010_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); - float in_bound_010 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; - - __m128i v011_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); - float in_bound_011 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; - - __m128i v100_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); - float in_bound_100 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; - - __m128i v101_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); - float in_bound_101 = *reinterpret_cast(offset_value_ptr++) >= 0 ? 
-1.0f : 0.0f; - - __m128i v110_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); - float in_bound_110 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; - - __m128i v111_offset = _mm_add_epi32(_mm_set1_epi32(*offset_value_ptr), _mm_set_epi32(3, 2, 1, 0)); - float in_bound_111 = *reinterpret_cast(offset_value_ptr++) >= 0 ? -1.0f : 0.0f; + __m128 v000_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); + offset_value_ptr++; + __m128 v001_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); + offset_value_ptr++; + __m128 v010_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); + offset_value_ptr++; + __m128 v011_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); + offset_value_ptr++; - __m128 v000_val = mask_gather_ps(srcptr, v000_offset, _mm_set1_ps(in_bound_000)); - __m128 v001_val = mask_gather_ps(srcptr, v001_offset, _mm_set1_ps(in_bound_001)); - __m128 v010_val = mask_gather_ps(srcptr, v010_offset, _mm_set1_ps(in_bound_010)); - __m128 v011_val = mask_gather_ps(srcptr, v011_offset, _mm_set1_ps(in_bound_011)); - __m128 v100_val = mask_gather_ps(srcptr, v100_offset, _mm_set1_ps(in_bound_100)); - __m128 v101_val = mask_gather_ps(srcptr, v101_offset, _mm_set1_ps(in_bound_101)); - __m128 v110_val = mask_gather_ps(srcptr, v110_offset, _mm_set1_ps(in_bound_110)); - __m128 v111_val = mask_gather_ps(srcptr, v111_offset, _mm_set1_ps(in_bound_111)); + __m128 v100_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); + offset_value_ptr++; + __m128 v101_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); + offset_value_ptr++; + __m128 v110_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); + offset_value_ptr++; + __m128 v111_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); + offset_value_ptr++; __m128 value = _mm_set1_ps(*offset_value_ptr++); __m128 v00 = _mm_comp_fmadd_ps(v001_val, value, _mm_comp_fnmadd_ps(v000_val, value, v000_val)); diff --git a/src/layer/x86/gridsample_nearest_apply_interpolation.h b/src/layer/x86/gridsample_nearest_apply_interpolation.h index e08eefd526f..2daeb11cee9 100644 --- a/src/layer/x86/gridsample_nearest_apply_interpolation.h +++ b/src/layer/x86/gridsample_nearest_apply_interpolation.h @@ -33,33 +33,19 @@ static void gridsample_nearest_apply_interpolation_p16(const Mat& src, Mat& dst, for (int i = 0; i < grid_size; i++) { - __m512 _v = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), *reinterpret_cast(offset_ptr) >= 0 ? static_cast<__mmask16>(0xFFFF) : static_cast<__mmask16>(0x0), _mm512_add_epi32(_mm512_set1_epi32(*offset_ptr), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)), srcptr, sizeof(float)); + __mmask16 in_bound = *reinterpret_cast(offset_ptr) >= 0 ? 
0xFFFF : 0; + __m512 _v = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_ptr)); + offset_ptr++; _mm512_storeu_ps(dstptr, _v); - - offset_ptr++; dstptr += 16; } } } #endif // __AVX512F__ -#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ -void gridsample_nearest_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt); -void gridsample_nearest_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt); -void gridsample_nearest_apply_interpolation_p1_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt); -#endif - static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) { -#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ - if (ncnn::cpu_support_x86_avx2()) - { - gridsample_nearest_apply_interpolation_p8_avx2(src, dst, offset_value, opt); - return; - } -#endif - const int channels = dst.c; const int outw = dst.w; const int outh = dst.h; @@ -76,17 +62,10 @@ static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, for (int i = 0; i < grid_size; i++) { - float in_bound = *reinterpret_cast(offset_ptr) >= 0 ? -1.0f : 0.0f; -#if __AVX2__ - __m256i _offset = _mm256_add_epi32(_mm256_set1_epi32(*offset_ptr), _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); -#else - __m256i _offset = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(*offset_ptr), _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0))); -#endif // __AVX2__ - __m256 _v = mask_gather_ps256(srcptr, _offset, _mm256_set1_ps(in_bound)); + int in_bound = *reinterpret_cast(offset_ptr) >= 0 ? -1 : 0; + __m256 _v = _mm256_maskload_ps(srcptr + static_cast(*offset_ptr++), _mm256_set1_epi32(in_bound)); _mm256_storeu_ps(dstptr, _v); - - offset_ptr++; dstptr += 8; } } @@ -94,14 +73,6 @@ static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, #endif // __AVX__ static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) { -#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ - if (ncnn::cpu_support_x86_avx2()) - { - gridsample_nearest_apply_interpolation_p4_avx2(src, dst, offset_value, opt); - return; - } -#endif - const int channels = dst.c; const int outw = dst.w; const int outh = dst.h; @@ -118,12 +89,10 @@ static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, for (int i = 0; i < grid_size; i++) { - float in_bound = *reinterpret_cast(offset_ptr) >= 0 ? -1.0f : 0.0f; - __m128 _v = mask_gather_ps(srcptr, _mm_add_epi32(_mm_set1_epi32(*offset_ptr), _mm_set_epi32(3, 2, 1, 0)), _mm_set1_ps(in_bound)); + __m128 _v = *reinterpret_cast(offset_ptr) >= 0 ? 
_mm_load_ps(srcptr + static_cast(*offset_ptr)) : _mm_set1_ps(0); + offset_ptr++; _mm_storeu_ps(dstptr, _v); - - offset_ptr++; dstptr += 4; } } @@ -133,14 +102,6 @@ static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) { -#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ - if (ncnn::cpu_support_x86_avx2()) - { - gridsample_nearest_apply_interpolation_p1_avx2(src, dst, offset_value, opt); - return; - } -#endif - const int channels = dst.c; const int outw = dst.w; const int outh = dst.h; @@ -155,32 +116,7 @@ static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, const float* offset_ptr = offset_value.channel(0); - int x = 0; -#if __SSE2__ -#if __AVX__ - for (; x + 7 < grid_size; x += 8) - { - __m256 in_bound = _mm256_andnot_ps(_mm256_loadu_ps(offset_ptr), _mm256_set1_ps(-1.0f)); - __m256 _v = mask_gather_ps256(srcptr, _mm256_set_epi32(*(offset_ptr + 7), *(offset_ptr + 6), *(offset_ptr + 5), *(offset_ptr + 4), *(offset_ptr + 3), *(offset_ptr + 2), *(offset_ptr + 1), *offset_ptr), in_bound); - - _mm256_storeu_ps(dstptr, _v); - - offset_ptr += 8; - dstptr += 8; - } -#endif // __AVX__ - for (; x + 3 < grid_size; x += 4) - { - __m128 in_bound = _mm_andnot_ps(_mm_loadu_ps(offset_ptr), _mm_set1_ps(-1.0f)); - __m128 _v = mask_gather_ps(srcptr, _mm_set_epi32(*(offset_ptr + 3), *(offset_ptr + 2), *(offset_ptr + 1), *offset_ptr), in_bound); - - _mm_storeu_ps(dstptr, _v); - - offset_ptr += 4; - dstptr += 4; - } -#endif // __SSE2__ - for (; x < grid_size; x++) + for (int x = 0; x < grid_size; x++) { *dstptr = *reinterpret_cast(offset_ptr) >= 0 ? *(srcptr + static_cast(*offset_ptr)) : 0; diff --git a/src/layer/x86/gridsample_x86_avx2.cpp b/src/layer/x86/gridsample_x86_avx2.cpp deleted file mode 100644 index 270d0a01f3f..00000000000 --- a/src/layer/x86/gridsample_x86_avx2.cpp +++ /dev/null @@ -1,62 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
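This AVX2-only translation unit goes away because the packed kernels no longer lean on AVX2 gathers: each packed element shares a single precomputed offset, so a plain AVX _mm256_maskload_ps from srcptr + offset with a broadcast all-ones/all-zeros mask does the job that mask_gather_ps256 used to do, and the runtime cpu_support_x86_avx2() dispatch above it becomes dead weight. A minimal sketch of that load pattern, assuming offsets are stored as reinterpreted ints and -1 marks an out-of-bound sample (the helper name load_pack8_or_zero is illustrative only, not an ncnn symbol):

#include <immintrin.h>

// Load one elempack=8 element, or all zeros when the precomputed offset
// says the sample fell outside the source image (offset < 0).
static inline __m256 load_pack8_or_zero(const float* srcptr, int offset)
{
    __m256i mask = _mm256_set1_epi32(offset >= 0 ? -1 : 0);
    // Masked-off lanes are not loaded, so no gather is needed; clamping the
    // pointer here is a defensive variant of what the committed code does
    // with the raw offset.
    return _mm256_maskload_ps(srcptr + (offset >= 0 ? offset : 0), mask);
}

Because VMASKMOVPS is an AVX instruction, the same compiled path now serves both AVX and AVX2 machines, which is why the wrappers below and their declarations in the interpolation headers are deleted instead of being kept behind NCNN_RUNTIME_CPU.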
- -#include "cpu.h" -#include "mat.h" -#include "x86_usability.h" - -namespace ncnn { -#include "gridsample_apply_interpolation.h" - -void gridsample_2d_bilinear_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) -{ - gridsample_2d_bilinear_apply_interpolation_p8(src, dst, offset_value, opt); -} -void gridsample_2d_bilinear_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) -{ - gridsample_2d_bilinear_apply_interpolation_p4(src, dst, offset_value, opt); -} - -void gridsample_3d_bilinear_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) -{ - gridsample_3d_bilinear_apply_interpolation_p8(src, dst, offset_value, opt); -} -void gridsample_3d_bilinear_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) -{ - gridsample_3d_bilinear_apply_interpolation_p4(src, dst, offset_value, opt); -} - -void gridsample_nearest_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) -{ - gridsample_nearest_apply_interpolation_p8(src, dst, offset_value, opt); -} -void gridsample_nearest_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) -{ - gridsample_nearest_apply_interpolation_p4(src, dst, offset_value, opt); -} -void gridsample_nearest_apply_interpolation_p1_avx2(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) -{ - gridsample_nearest_apply_interpolation_p1(src, dst, offset_value, opt); -} - -void gridsample_2d_bicubic_apply_interpolation_p8_avx2(const Mat& src, Mat& dst, Mat& offset_value, const Option& opt) -{ - gridsample_2d_bicubic_apply_interpolation_p8(src, dst, offset_value, opt); -} -void gridsample_2d_bicubic_apply_interpolation_p4_avx2(const Mat& src, Mat& dst, Mat& offset_value, const Option& opt) -{ - gridsample_2d_bicubic_apply_interpolation_p4(src, dst, offset_value, opt); -} - -} // namespace ncnn \ No newline at end of file diff --git a/tests/test_gridsample.cpp b/tests/test_gridsample.cpp index 0e384115352..fb580df3d22 100644 --- a/tests/test_gridsample.cpp +++ b/tests/test_gridsample.cpp @@ -184,9 +184,14 @@ int main() { SRAND(7767517); - return 0 + int ret = 0 || test_gridsample_0() || test_gridsample_1() || test_gridsample_2() || test_gridsample_3(); + + getchar(); + getchar(); + + return ret; } From bf224d36b3dc1aa87c111afd0dbd7b46b4fb2bb1 Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Fri, 21 Jul 2023 08:35:56 +0000 Subject: [PATCH 116/127] apply code-format changes --- tests/test_gridsample.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_gridsample.cpp b/tests/test_gridsample.cpp index fb580df3d22..161b13eb4be 100644 --- a/tests/test_gridsample.cpp +++ b/tests/test_gridsample.cpp @@ -185,10 +185,10 @@ int main() SRAND(7767517); int ret = 0 - || test_gridsample_0() - || test_gridsample_1() - || test_gridsample_2() - || test_gridsample_3(); + || test_gridsample_0() + || test_gridsample_1() + || test_gridsample_2() + || test_gridsample_3(); getchar(); getchar(); From 64868ef2351ae78bcabf335e6c3ad0517c76a993 Mon Sep 17 00:00:00 2001 From: Yoh-Z <550856122@qq.com> Date: Fri, 21 Jul 2023 21:07:40 +0800 Subject: [PATCH 117/127] fix bug and optimize code --- src/layer/gridsample.cpp | 10 +++--- .../gridsample_bicubic_apply_interpolation.h | 12 ++++--- .../gridsample_bilinear_apply_interpolation.h | 36 ++++++++++++------- .../gridsample_nearest_apply_interpolation.h 
| 3 +- tests/test_gridsample.cpp | 15 +++----- 5 files changed, 45 insertions(+), 31 deletions(-) diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index 4321a1c05e9..e8579cf4aef 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -155,12 +155,13 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& int outh = permute_fusion == 0 ? grid.c : grid.h; top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator); - if (top_blob.empty()) - return -100; Mat offset_blob; offset_blob.create(outw, outh, grid.c, elemsize, opt.workspace_allocator); + if (top_blob.empty() || offset_blob.empty()) + return -100; + //pre-calculate all interpolation offsets for each x y, unpack grid on-the-fly if (permute_fusion == 0) { @@ -375,12 +376,13 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& int outd = permute_fusion == 0 ? grid.c : grid.d; top_blob.create(outw, outh, outd, channels, elemsize, opt.blob_allocator); - if (top_blob.empty()) - return -100; Mat offset_blob; offset_blob.create(outw, outh, outd, grid.c, elemsize, opt.workspace_allocator); + if (top_blob.empty() || offset_blob.empty()) + return -100; + //pre-calculate all interpolation offsets for each x y, unpack grid on-the-fly if (permute_fusion == 0) { diff --git a/src/layer/x86/gridsample_bicubic_apply_interpolation.h b/src/layer/x86/gridsample_bicubic_apply_interpolation.h index a4febb945bd..e07fce2cf9d 100644 --- a/src/layer/x86/gridsample_bicubic_apply_interpolation.h +++ b/src/layer/x86/gridsample_bicubic_apply_interpolation.h @@ -126,13 +126,17 @@ static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& ds for (int ii = 0; ii < 4; ii++) { int in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 x0_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + __m256 x0_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); + offset_value_ptr++; in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 x1_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + __m256 x1_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); + offset_value_ptr++; in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 x2_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + __m256 x2_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); + offset_value_ptr++; in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 x3_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + __m256 x3_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); + offset_value_ptr++; value_f[ii] = _mm256_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); diff --git a/src/layer/x86/gridsample_bilinear_apply_interpolation.h b/src/layer/x86/gridsample_bilinear_apply_interpolation.h index 5117e13ccea..55f97f4500e 100644 --- a/src/layer/x86/gridsample_bilinear_apply_interpolation.h +++ b/src/layer/x86/gridsample_bilinear_apply_interpolation.h @@ -140,13 +140,17 @@ static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d for (int i = 0; i < grid_size; i++) { int in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 
-1 : 0; - __m256 v00_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + __m256 v00_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); + offset_value_ptr++; in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 v01_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + __m256 v01_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); + offset_value_ptr++; in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 v10_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + __m256 v10_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); + offset_value_ptr++; in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 v11_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + __m256 v11_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); + offset_value_ptr++; __m256 value = _mm256_set1_ps(*offset_value_ptr++); __m256 v0 = _mm256_comp_fmadd_ps(v01_val, value, _mm256_comp_fnmadd_ps(v00_val, value, v00_val)); @@ -179,22 +183,30 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d for (int i = 0; i < grid_size; i++) { int in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 v000_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + __m256 v000_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); + offset_value_ptr++; in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 v001_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + __m256 v001_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); + offset_value_ptr++; in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 v010_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + __m256 v010_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); + offset_value_ptr++; in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 v011_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + __m256 v011_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); + offset_value_ptr++; in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 v100_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + __m256 v100_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); + offset_value_ptr++; in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 v101_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + __m256 v101_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); + offset_value_ptr++; in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 
-1 : 0; - __m256 v110_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + __m256 v110_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); + offset_value_ptr++; in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 v111_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr++), _mm256_set1_epi32(in_bound)); + __m256 v111_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); + offset_value_ptr++; __m256 value = _mm256_set1_ps(*offset_value_ptr++); __m256 v00 = _mm256_comp_fmadd_ps(v001_val, value, _mm256_comp_fnmadd_ps(v000_val, value, v000_val)); diff --git a/src/layer/x86/gridsample_nearest_apply_interpolation.h b/src/layer/x86/gridsample_nearest_apply_interpolation.h index 2daeb11cee9..f7b92cc6bd3 100644 --- a/src/layer/x86/gridsample_nearest_apply_interpolation.h +++ b/src/layer/x86/gridsample_nearest_apply_interpolation.h @@ -63,7 +63,8 @@ static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, for (int i = 0; i < grid_size; i++) { int in_bound = *reinterpret_cast(offset_ptr) >= 0 ? -1 : 0; - __m256 _v = _mm256_maskload_ps(srcptr + static_cast(*offset_ptr++), _mm256_set1_epi32(in_bound)); + __m256 _v = _mm256_maskload_ps(srcptr + static_cast(*offset_ptr), _mm256_set1_epi32(in_bound)); + offset_ptr++; _mm256_storeu_ps(dstptr, _v); dstptr += 8; diff --git a/tests/test_gridsample.cpp b/tests/test_gridsample.cpp index 161b13eb4be..0e384115352 100644 --- a/tests/test_gridsample.cpp +++ b/tests/test_gridsample.cpp @@ -184,14 +184,9 @@ int main() { SRAND(7767517); - int ret = 0 - || test_gridsample_0() - || test_gridsample_1() - || test_gridsample_2() - || test_gridsample_3(); - - getchar(); - getchar(); - - return ret; + return 0 + || test_gridsample_0() + || test_gridsample_1() + || test_gridsample_2() + || test_gridsample_3(); } From 429d5b83765f6ec1983f37b07a2503c3bda429f9 Mon Sep 17 00:00:00 2001 From: Yoh Date: Sun, 6 Aug 2023 08:04:42 +0800 Subject: [PATCH 118/127] fix vs2017 bug and optimize code --- .../x86/gridsample_apply_interpolation.h | 17 - .../gridsample_bicubic_apply_interpolation.h | 112 +++--- .../x86/gridsample_bicubic_compute_blob.h | 95 ++--- .../gridsample_bilinear_apply_interpolation.h | 344 +++++++++--------- .../x86/gridsample_bilinear_compute_blob.h | 158 ++++---- .../gridsample_nearest_apply_interpolation.h | 20 +- .../x86/gridsample_nearest_compute_blob.h | 97 ++--- src/layer/x86/gridsample_x86.cpp | 6 +- 8 files changed, 426 insertions(+), 423 deletions(-) delete mode 100644 src/layer/x86/gridsample_apply_interpolation.h diff --git a/src/layer/x86/gridsample_apply_interpolation.h b/src/layer/x86/gridsample_apply_interpolation.h deleted file mode 100644 index 9e9599c34be..00000000000 --- a/src/layer/x86/gridsample_apply_interpolation.h +++ /dev/null @@ -1,17 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations under the License. - -#include "gridsample_bilinear_apply_interpolation.h" -#include "gridsample_bicubic_apply_interpolation.h" -#include "gridsample_nearest_apply_interpolation.h" \ No newline at end of file diff --git a/src/layer/x86/gridsample_bicubic_apply_interpolation.h b/src/layer/x86/gridsample_bicubic_apply_interpolation.h index e07fce2cf9d..a596cf36060 100644 --- a/src/layer/x86/gridsample_bicubic_apply_interpolation.h +++ b/src/layer/x86/gridsample_bicubic_apply_interpolation.h @@ -29,6 +29,9 @@ static void cubic_interp1d_p16(__m512& coeffs0, __m512& coeffs1, __m512& coeffs2 coeffs3 = _mm512_sub_ps(_mm512_sub_ps(_mm512_sub_ps(_mm512_set1_ps(1.0f), coeffs0), coeffs1), coeffs2); } +#if _MSC_VER >= 1910 && _MSC_VER < 1920 +#pragma optimize("", off) +#endif // _MSC_VER >= 1910 && _MSC_VER < 1920 static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& dst, Mat& offset_value, const Option& opt) { const int channels = dst.c; @@ -49,27 +52,28 @@ static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& d __m512 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; __m512 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; __m512 value_f[4]; - cubic_interp1d_p16(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm512_set1_ps(*offset_value_ptr++)); - cubic_interp1d_p16(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm512_set1_ps(*offset_value_ptr++)); + cubic_interp1d_p16(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm512_set1_ps(offset_value_ptr[0])); + cubic_interp1d_p16(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm512_set1_ps(offset_value_ptr[1])); + + const int* offset_ptr = (int*)offset_value_ptr + 2; + for (int ii = 0; ii < 4; ii++) { - __mmask16 in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; - __m512 x0_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; - __m512 x1_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; - __m512 x2_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; - __m512 x3_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); - offset_value_ptr++; + __mmask16 in_bound = offset_ptr[0] >= 0 ? 0xFFFF : 0; + __m512 x0_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[0]); + in_bound = offset_ptr[1] >= 0 ? 0xFFFF : 0; + __m512 x1_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[1]); + in_bound = offset_ptr[2] >= 0 ? 0xFFFF : 0; + __m512 x2_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[2]); + in_bound = offset_ptr[3] >= 0 ? 
0xFFFF : 0; + __m512 x3_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[3]); value_f[ii] = _mm512_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm512_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); value_f[ii] = _mm512_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); value_f[ii] = _mm512_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); + + offset_ptr += 4; } __m512 _v = _mm512_mul_ps(y_coeffs0, value_f[0]); @@ -79,12 +83,15 @@ static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& d _mm512_storeu_ps(dstptr, _v); dstptr += 16; + offset_value_ptr += 18; } } } +#if _MSC_VER >= 1910 && _MSC_VER < 1920 +#pragma optimize("", on) +#endif // _MSC_VER >= 1910 && _MSC_VER < 1920 #endif // __AVX512F__ - static void cubic_interp1d_p8(__m256& coeffs0, __m256& coeffs1, __m256& coeffs2, __m256& coeffs3, const __m256& tx) { const __m256 A = _mm256_set1_ps(-0.75f); @@ -120,28 +127,28 @@ static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& ds __m256 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; __m256 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; __m256 value_f[4]; - cubic_interp1d_p8(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm256_set1_ps(*offset_value_ptr++)); - cubic_interp1d_p8(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm256_set1_ps(*offset_value_ptr++)); + cubic_interp1d_p8(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm256_set1_ps(offset_value_ptr[0])); + cubic_interp1d_p8(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm256_set1_ps(offset_value_ptr[1])); + + const int* offset_ptr = (int*)offset_value_ptr + 2; for (int ii = 0; ii < 4; ii++) { - int in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 x0_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 x1_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 x2_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 x3_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); - offset_value_ptr++; + int in_bound = offset_ptr[0] >= 0 ? -1 : 0; + __m256 x0_val = _mm256_maskload_ps(srcptr + offset_ptr[0], _mm256_set1_epi32(in_bound)); + in_bound = offset_ptr[1] >= 0 ? -1 : 0; + __m256 x1_val = _mm256_maskload_ps(srcptr + offset_ptr[1], _mm256_set1_epi32(in_bound)); + in_bound = offset_ptr[2] >= 0 ? -1 : 0; + __m256 x2_val = _mm256_maskload_ps(srcptr + offset_ptr[2], _mm256_set1_epi32(in_bound)); + in_bound = offset_ptr[3] >= 0 ? 
-1 : 0; + __m256 x3_val = _mm256_maskload_ps(srcptr + offset_ptr[3], _mm256_set1_epi32(in_bound)); value_f[ii] = _mm256_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); + + offset_ptr += 4; } __m256 _v = _mm256_mul_ps(y_coeffs0, value_f[0]); @@ -151,6 +158,7 @@ static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& ds _mm256_storeu_ps(dstptr, _v); dstptr += 8; + offset_value_ptr += 18; } } } @@ -192,24 +200,24 @@ static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& ds for (int i = 0; i < grid_size; i++) { - cubic_interp1d_p4(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm_set_ps1(*offset_value_ptr++)); - cubic_interp1d_p4(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm_set_ps1(*offset_value_ptr++)); + cubic_interp1d_p4(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm_set_ps1(offset_value_ptr[0])); + cubic_interp1d_p4(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm_set_ps1(offset_value_ptr[1])); + + const int* offset_ptr = (int*)offset_value_ptr + 2; for (int ii = 0; ii < 4; ii++) { - __m128 x0_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); - offset_value_ptr++; - __m128 x1_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); - offset_value_ptr++; - __m128 x2_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); - offset_value_ptr++; - __m128 x3_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); - offset_value_ptr++; + __m128 x0_val = offset_ptr[0] >= 0 ? _mm_load_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); + __m128 x1_val = offset_ptr[1] >= 0 ? _mm_load_ps(srcptr + offset_ptr[1]) : _mm_set1_ps(0); + __m128 x2_val = offset_ptr[2] >= 0 ? _mm_load_ps(srcptr + offset_ptr[2]) : _mm_set1_ps(0); + __m128 x3_val = offset_ptr[3] >= 0 ? _mm_load_ps(srcptr + offset_ptr[3]) : _mm_set1_ps(0); value_f[ii] = _mm_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); value_f[ii] = _mm_comp_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); value_f[ii] = _mm_comp_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); + + offset_ptr += 4; } __m128 _v = _mm_mul_ps(y_coeffs0, value_f[0]); @@ -219,9 +227,11 @@ static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& ds _mm_storeu_ps(dstptr, _v); dstptr += 4; + offset_value_ptr += 18; } } } + #endif // __SSE2__ static inline void cubic_interp1d(float& coeffs0, float& coeffs1, float& coeffs2, float& coeffs3, float fx) @@ -259,27 +269,24 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds float x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; float y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; float value_f[4]; - cubic_interp1d(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, *offset_value_ptr++); - cubic_interp1d(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, *offset_value_ptr++); + cubic_interp1d(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, offset_value_ptr[0]); + cubic_interp1d(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, offset_value_ptr[1]); + + const int* offset_ptr = (int*)offset_value_ptr + 2; for (int ii = 0; ii < 4; ii++) { - float x0_val = *reinterpret_cast(offset_value_ptr) >= 0 ? 
*(srcptr + static_cast(*offset_value_ptr)) : 0; - offset_value_ptr++; - - float x1_val = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; - offset_value_ptr++; - - float x2_val = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; - offset_value_ptr++; - - float x3_val = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; - offset_value_ptr++; + float x0_val = offset_ptr[0] >= 0 ? *(srcptr + offset_ptr[0]) : 0; + float x1_val = offset_ptr[1] >= 0 ? *(srcptr + offset_ptr[1]) : 0; + float x2_val = offset_ptr[2] >= 0 ? *(srcptr + offset_ptr[2]) : 0; + float x3_val = offset_ptr[3] >= 0 ? *(srcptr + offset_ptr[3]) : 0; value_f[ii] = x_coeffs0 * x0_val; value_f[ii] = x_coeffs1 * x1_val + value_f[ii]; value_f[ii] = x_coeffs2 * x2_val + value_f[ii]; value_f[ii] = x_coeffs3 * x3_val + value_f[ii]; + + offset_ptr += 4; } float _v = y_coeffs0 * value_f[0]; @@ -289,6 +296,7 @@ static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& ds *dstptr = _v; dstptr++; + offset_value_ptr += 18; } } } \ No newline at end of file diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index bd46e0d961d..9006153d9d2 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -31,15 +31,10 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of #if __AVX__ for (; x + 15 < grid_size; x += 16) { - __m256 tmp_x = _mm256_loadu_ps(gridptr); + __m256 gx = _mm256_loadu_ps(gridptr); __m256 gy = _mm256_loadu_ps(gridptr + 8); - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + transpose2x8_ps(gx, gy); gx = unormalize(_mm256_set1_ps(src.w), gx); gy = unormalize(_mm256_set1_ps(src.h), gy); @@ -83,24 +78,25 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of v1_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v1_offset_f[i], _mm256_and_ps(x1_in_range, y_in_range)); v2_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v2_offset_f[i], _mm256_and_ps(x2_in_range, y_in_range)); v3_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v3_offset_f[i], _mm256_and_ps(x3_in_range, y_in_range)); + + v0_offset_f[i] = _mm256_castsi256_ps(_mm256_cvtps_epi32(v0_offset_f[i])); + v1_offset_f[i] = _mm256_castsi256_ps(_mm256_cvtps_epi32(v1_offset_f[i])); + v2_offset_f[i] = _mm256_castsi256_ps(_mm256_cvtps_epi32(v2_offset_f[i])); + v3_offset_f[i] = _mm256_castsi256_ps(_mm256_cvtps_epi32(v3_offset_f[i])); } transpose8x18_ps(tx, ty, v0_offset_f[0], v1_offset_f[0], v2_offset_f[0], v3_offset_f[0], v0_offset_f[1], v1_offset_f[1], v2_offset_f[1], v3_offset_f[1], v0_offset_f[2], v1_offset_f[2], v2_offset_f[2], v3_offset_f[2], v0_offset_f[3], v1_offset_f[3], v2_offset_f[3], v3_offset_f[3]); _mm256_storeu_ps(offset_value_ptr, tx); - offset_value_ptr += 8; - _mm256_storeu_ps(offset_value_ptr, ty); - offset_value_ptr += 8; + _mm256_storeu_ps(offset_value_ptr + 8, ty); + offset_value_ptr += 16; for (int i = 0; i < 4; i++) { _mm256_storeu_ps(offset_value_ptr, v0_offset_f[i]); - offset_value_ptr += 8; - _mm256_storeu_ps(offset_value_ptr, v1_offset_f[i]); - offset_value_ptr += 8; - _mm256_storeu_ps(offset_value_ptr, v2_offset_f[i]); - 
offset_value_ptr += 8; - _mm256_storeu_ps(offset_value_ptr, v3_offset_f[i]); - offset_value_ptr += 8; + _mm256_storeu_ps(offset_value_ptr + 8, v1_offset_f[i]); + _mm256_storeu_ps(offset_value_ptr + 16, v2_offset_f[i]); + _mm256_storeu_ps(offset_value_ptr + 24, v3_offset_f[i]); + offset_value_ptr += 32; } gridptr += 16; } @@ -121,8 +117,8 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of int x2 = x1 + 1; int x3 = x1 + 2; - *offset_value_ptr++ = sample_x - static_cast(x1); - *offset_value_ptr++ = sample_y - static_cast(y1); + offset_value_ptr[0] = sample_x - static_cast(x1); + offset_value_ptr[1] = sample_y - static_cast(y1); x1 = get_coord(src.w, x1); x0 = get_coord(src.w, x0); @@ -134,6 +130,8 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of bool x2_in_range = (x2 > -1) & (x2 < src.w); bool x3_in_range = (x3 > -1) & (x3 < src.w); + int* offset_ptr = (int*)offset_value_ptr + 2; + for (int i = 0; i < 4; i++) { int gy = y1 + i - 1; @@ -147,13 +145,16 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of bool v2_in_bound = (x2_in_range & y_in_range); bool v3_in_bound = (x3_in_range & y_in_range); - *offset_value_ptr++ = v0_in_bound ? (offset_y + x0) * src.elempack : -1.0f; - *offset_value_ptr++ = v1_in_bound ? (offset_y + x1) * src.elempack : -1.0f; - *offset_value_ptr++ = v2_in_bound ? (offset_y + x2) * src.elempack : -1.0f; - *offset_value_ptr++ = v3_in_bound ? (offset_y + x3) * src.elempack : -1.0f; + offset_ptr[0] = v0_in_bound ? (offset_y + x0) * src.elempack : -1.0; + offset_ptr[1] = v1_in_bound ? (offset_y + x1) * src.elempack : -1.0; + offset_ptr[2] = v2_in_bound ? (offset_y + x2) * src.elempack : -1.0; + offset_ptr[3] = v3_in_bound ? (offset_y + x3) * src.elempack : -1.0; + + offset_ptr += 4; } gridptr += 2; + offset_value_ptr += 18; } } } @@ -203,34 +204,34 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of __m256 gy_offset = _mm256_mul_ps(gy, _mm256_set1_ps(src.w)); - volatile float epack = src.elempack; - v0_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx0), _mm256_set1_ps(epack)); - v1_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx1), _mm256_set1_ps(epack)); - v2_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx2), _mm256_set1_ps(epack)); - v3_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx3), _mm256_set1_ps(epack)); + v0_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx0), _mm256_set1_ps(src.elempack)); + v1_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx1), _mm256_set1_ps(src.elempack)); + v2_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx2), _mm256_set1_ps(src.elempack)); + v3_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx3), _mm256_set1_ps(src.elempack)); v0_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v0_offset_f[i], _mm256_and_ps(x0_in_range, y_in_range)); v1_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v1_offset_f[i], _mm256_and_ps(x1_in_range, y_in_range)); v2_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v2_offset_f[i], _mm256_and_ps(x2_in_range, y_in_range)); v3_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v3_offset_f[i], _mm256_and_ps(x3_in_range, y_in_range)); + + v0_offset_f[i] = _mm256_castsi256_ps(_mm256_cvtps_epi32(v0_offset_f[i])); + v1_offset_f[i] = _mm256_castsi256_ps(_mm256_cvtps_epi32(v1_offset_f[i])); + v2_offset_f[i] = _mm256_castsi256_ps(_mm256_cvtps_epi32(v2_offset_f[i])); + v3_offset_f[i] = 
_mm256_castsi256_ps(_mm256_cvtps_epi32(v3_offset_f[i])); } transpose8x18_ps(tx, ty, v0_offset_f[0], v1_offset_f[0], v2_offset_f[0], v3_offset_f[0], v0_offset_f[1], v1_offset_f[1], v2_offset_f[1], v3_offset_f[1], v0_offset_f[2], v1_offset_f[2], v2_offset_f[2], v3_offset_f[2], v0_offset_f[3], v1_offset_f[3], v2_offset_f[3], v3_offset_f[3]); _mm256_storeu_ps(offset_value_ptr, tx); - offset_value_ptr += 8; - _mm256_storeu_ps(offset_value_ptr, ty); - offset_value_ptr += 8; + _mm256_storeu_ps(offset_value_ptr + 8, ty); + offset_value_ptr += 16; for (int i = 0; i < 4; i++) { _mm256_storeu_ps(offset_value_ptr, v0_offset_f[i]); - offset_value_ptr += 8; - _mm256_storeu_ps(offset_value_ptr, v1_offset_f[i]); - offset_value_ptr += 8; - _mm256_storeu_ps(offset_value_ptr, v2_offset_f[i]); - offset_value_ptr += 8; - _mm256_storeu_ps(offset_value_ptr, v3_offset_f[i]); - offset_value_ptr += 8; + _mm256_storeu_ps(offset_value_ptr + 8, v1_offset_f[i]); + _mm256_storeu_ps(offset_value_ptr + 16, v2_offset_f[i]); + _mm256_storeu_ps(offset_value_ptr + 24, v3_offset_f[i]); + offset_value_ptr += 32; } gridptr_x += 8; @@ -253,8 +254,8 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of int x2 = x1 + 1; int x3 = x1 + 2; - *offset_value_ptr++ = sample_x - static_cast(x1); - *offset_value_ptr++ = sample_y - static_cast(y1); + offset_value_ptr[0] = sample_x - static_cast(x1); + offset_value_ptr[1] = sample_y - static_cast(y1); x1 = get_coord(src.w, x1); x0 = get_coord(src.w, x0); @@ -266,6 +267,8 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of bool x2_in_range = (x2 > -1) & (x2 < src.w); bool x3_in_range = (x3 > -1) & (x3 < src.w); + int* offset_ptr = (int*)offset_value_ptr + 2; + for (int i = 0; i < 4; i++) { int gy = y1 + i - 1; @@ -279,14 +282,18 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of bool v2_in_bound = (x2_in_range & y_in_range); bool v3_in_bound = (x3_in_range & y_in_range); - *offset_value_ptr++ = v0_in_bound ? (offset_y + x0) * src.elempack : -1.0f; - *offset_value_ptr++ = v1_in_bound ? (offset_y + x1) * src.elempack : -1.0f; - *offset_value_ptr++ = v2_in_bound ? (offset_y + x2) * src.elempack : -1.0f; - *offset_value_ptr++ = v3_in_bound ? (offset_y + x3) * src.elempack : -1.0f; + offset_ptr[0] = v0_in_bound ? (offset_y + x0) * src.elempack : -1.0; + offset_ptr[1] = v1_in_bound ? (offset_y + x1) * src.elempack : -1.0; + offset_ptr[2] = v2_in_bound ? (offset_y + x2) * src.elempack : -1.0; + offset_ptr[3] = v3_in_bound ? (offset_y + x3) * src.elempack : -1.0; + + offset_ptr += 4; } gridptr_x++; gridptr_y++; + + offset_value_ptr += 18; } } } diff --git a/src/layer/x86/gridsample_bilinear_apply_interpolation.h b/src/layer/x86/gridsample_bilinear_apply_interpolation.h index 55f97f4500e..7a78338c988 100644 --- a/src/layer/x86/gridsample_bilinear_apply_interpolation.h +++ b/src/layer/x86/gridsample_bilinear_apply_interpolation.h @@ -15,6 +15,9 @@ #if __SSE2__ #if __AVX__ #if __AVX512F__ +#if _MSC_VER >= 1910 && _MSC_VER < 1920 +#pragma optimize("", off) +#endif // _MSC_VER >= 1910 && _MSC_VER < 1920 static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) { const int channels = dst.c; @@ -32,31 +35,32 @@ static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& for (int i = 0; i < grid_size; i++) { - __mmask16 in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 
0xFFFF : 0; - __m512 v00_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; - __m512 v01_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; - __m512 v10_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; - __m512 v11_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); - offset_value_ptr++; - - __m512 value = _mm512_set1_ps(*offset_value_ptr++); - __m512 v0 = _mm512_fmadd_ps(v01_val, value, _mm512_fnmadd_ps(v00_val, value, v00_val)); - __m512 v1 = _mm512_fmadd_ps(v11_val, value, _mm512_fnmadd_ps(v10_val, value, v10_val)); - - value = _mm512_set1_ps(*offset_value_ptr++); - __m512 _v = _mm512_fmadd_ps(v1, value, _mm512_fnmadd_ps(v0, value, v0)); + const int* offset_ptr = (int*)offset_value_ptr; + const float* value_ptr = offset_value_ptr + 4; + + __mmask16 in_bound = offset_ptr[0] >= 0 ? 0xFFFF : 0; + __m512 v00_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[0]); + in_bound = offset_ptr[1] >= 0 ? 0xFFFF : 0; + __m512 v01_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[1]); + in_bound = offset_ptr[2] >= 0 ? 0xFFFF : 0; + __m512 v10_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[2]); + in_bound = offset_ptr[3] >= 0 ? 0xFFFF : 0; + __m512 v11_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[3]); + + __m512 value1 = _mm512_set1_ps(value_ptr[0]); + __m512 v0 = _mm512_fmadd_ps(v01_val, value1, _mm512_fnmadd_ps(v00_val, value1, v00_val)); + __m512 v1 = _mm512_fmadd_ps(v11_val, value1, _mm512_fnmadd_ps(v10_val, value1, v10_val)); + + __m512 value2 = _mm512_set1_ps(value_ptr[1]); + __m512 _v = _mm512_fmadd_ps(v1, value2, _mm512_fnmadd_ps(v0, value2, v0)); _mm512_storeu_ps(dstptr, _v); dstptr += 16; + offset_value_ptr += 6; } } } + static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) { const int channels = dst.c; @@ -75,50 +79,50 @@ static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& for (int i = 0; i < grid_size; i++) { - __mmask16 in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; - __m512 v000_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; - __m512 v001_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; - __m512 v010_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; - __m512 v011_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); - offset_value_ptr++; - - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; - __m512 v100_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; - __m512 v101_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 
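// The fmadd/fnmadd pairs above are plain linear interpolation rewritten so it
// maps onto two fused ops: lerp(a, b, t) = b*t + (a - a*t). A scalar and an
// AVX-512 sketch of the same identity (assumes immintrin.h):
#include <immintrin.h>
static inline float lerp1(float a, float b, float t)
{
    return b * t + (a - a * t); // == (1 - t) * a + t * b
}
static inline __m512 lerp16(__m512 a, __m512 b, __m512 t)
{
    return _mm512_fmadd_ps(b, t, _mm512_fnmadd_ps(a, t, a));
}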
0xFFFF : 0; - __m512 v110_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 0xFFFF : 0; - __m512 v111_val = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_value_ptr)); - offset_value_ptr++; - - __m512 value = _mm512_set1_ps(*offset_value_ptr++); + const int* offset_ptr = (int*)offset_value_ptr; + const float* value_ptr = offset_value_ptr + 8; + + __mmask16 in_bound = offset_ptr[0] >= 0 ? 0xFFFF : 0; + __m512 v000_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[0]); + in_bound = offset_ptr[1] >= 0 ? 0xFFFF : 0; + __m512 v001_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[1]); + in_bound = offset_ptr[2] >= 0 ? 0xFFFF : 0; + __m512 v010_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[2]); + in_bound = offset_ptr[3] >= 0 ? 0xFFFF : 0; + __m512 v011_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[3]); + + in_bound = offset_ptr[4] >= 0 ? 0xFFFF : 0; + __m512 v100_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[4]); + in_bound = offset_ptr[5] >= 0 ? 0xFFFF : 0; + __m512 v101_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[5]); + in_bound = offset_ptr[6] >= 0 ? 0xFFFF : 0; + __m512 v110_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[6]); + in_bound = offset_ptr[7] >= 0 ? 0xFFFF : 0; + __m512 v111_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[7]); + + + __m512 value = _mm512_set1_ps(value_ptr[0]); __m512 v00 = _mm512_fmadd_ps(v001_val, value, _mm512_fnmadd_ps(v000_val, value, v000_val)); __m512 v01 = _mm512_fmadd_ps(v011_val, value, _mm512_fnmadd_ps(v010_val, value, v010_val)); __m512 v10 = _mm512_fmadd_ps(v101_val, value, _mm512_fnmadd_ps(v100_val, value, v100_val)); __m512 v11 = _mm512_fmadd_ps(v111_val, value, _mm512_fnmadd_ps(v110_val, value, v110_val)); - value = _mm512_set1_ps(*offset_value_ptr++); + value = _mm512_set1_ps(value_ptr[1]); __m512 v0 = _mm512_fmadd_ps(v01, value, _mm512_fnmadd_ps(v00, value, v00)); __m512 v1 = _mm512_fmadd_ps(v11, value, _mm512_fnmadd_ps(v10, value, v10)); - value = _mm512_set1_ps(*offset_value_ptr++); + value = _mm512_set1_ps(value_ptr[2]); __m512 _v = _mm512_fmadd_ps(v1, value, _mm512_fnmadd_ps(v0, value, v0)); _mm512_storeu_ps(dstptr, _v); dstptr += 16; + offset_value_ptr += 11; } } } +#if _MSC_VER >= 1910 && _MSC_VER < 1920 +#pragma optimize("", on) +#endif // _MSC_VER >= 1910 && _MSC_VER < 1920 #endif // __AVX512F__ @@ -139,28 +143,28 @@ static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d for (int i = 0; i < grid_size; i++) { - int in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 v00_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 v01_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 v10_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? 
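// Sketch of the per-point record the trilinear kernels above consume: eight
// corner offsets (stored as int bits) followed by three fractions, hence the
// offset_value_ptr += 11 stride. Field names are illustrative.
struct bilinear3d_record // 11 * sizeof(float) bytes per sampling point
{
    int   offset[8]; // the 8 cube corners, -1 marks an out-of-bound corner
    float alpha;     // x fraction (sample_x - x0)
    float beta;      // y fraction (sample_y - y0)
    float gamma;     // z fraction (sample_z - z0)
};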
-1 : 0; - __m256 v11_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); - offset_value_ptr++; - - __m256 value = _mm256_set1_ps(*offset_value_ptr++); - __m256 v0 = _mm256_comp_fmadd_ps(v01_val, value, _mm256_comp_fnmadd_ps(v00_val, value, v00_val)); - __m256 v1 = _mm256_comp_fmadd_ps(v11_val, value, _mm256_comp_fnmadd_ps(v10_val, value, v10_val)); - - value = _mm256_set1_ps(*offset_value_ptr++); - __m256 _v = _mm256_comp_fmadd_ps(v1, value, _mm256_comp_fnmadd_ps(v0, value, v0)); + const int* offset_ptr = (int*)offset_value_ptr; + const float* value_ptr = offset_value_ptr + 4; + + int in_bound = offset_ptr[0] >= 0 ? -1 : 0; + __m256 v00_val = _mm256_maskload_ps(srcptr + offset_ptr[0], _mm256_set1_epi32(in_bound)); + in_bound = offset_ptr[1] >= 0 ? -1 : 0; + __m256 v01_val = _mm256_maskload_ps(srcptr + offset_ptr[1], _mm256_set1_epi32(in_bound)); + in_bound = offset_ptr[2] >= 0 ? -1 : 0; + __m256 v10_val = _mm256_maskload_ps(srcptr + offset_ptr[2], _mm256_set1_epi32(in_bound)); + in_bound = offset_ptr[3] >= 0 ? -1 : 0; + __m256 v11_val = _mm256_maskload_ps(srcptr + offset_ptr[3], _mm256_set1_epi32(in_bound)); + + __m256 value1 = _mm256_set1_ps(value_ptr[0]); + __m256 v0 = _mm256_comp_fmadd_ps(v01_val, value1, _mm256_comp_fnmadd_ps(v00_val, value1, v00_val)); + __m256 v1 = _mm256_comp_fmadd_ps(v11_val, value1, _mm256_comp_fnmadd_ps(v10_val, value1, v10_val)); + + __m256 value2 = _mm256_set1_ps(value_ptr[1]); + __m256 _v = _mm256_comp_fmadd_ps(v1, value2, _mm256_comp_fnmadd_ps(v0, value2, v0)); _mm256_storeu_ps(dstptr, _v); dstptr += 8; + offset_value_ptr += 6; } } } @@ -182,47 +186,43 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d for (int i = 0; i < grid_size; i++) { - int in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 v000_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 v001_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 v010_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 v011_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); - offset_value_ptr++; - - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 v100_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 v101_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 v110_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); - offset_value_ptr++; - in_bound = *reinterpret_cast(offset_value_ptr) >= 0 ? -1 : 0; - __m256 v111_val = _mm256_maskload_ps(srcptr + static_cast(*offset_value_ptr), _mm256_set1_epi32(in_bound)); - offset_value_ptr++; - - __m256 value = _mm256_set1_ps(*offset_value_ptr++); + const int* offset_ptr = (int*)offset_value_ptr; + const float* value_ptr = offset_value_ptr + 8; + + int in_bound = offset_ptr[0] >= 0 ? 
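// The pack-8 loads above gate each tap with a lane mask built from a sign
// test: an in-bound offset gives an all-ones mask (load all 8 lanes), an
// out-of-bound offset gives an all-zero mask, and masked-off lanes are not
// read and come back as zero. Sketch of the pattern (assumes immintrin.h):
#include <immintrin.h>
static inline __m256 masked_load_or_zero_p8(const float* srcptr, int offset)
{
    __m256i mask = _mm256_set1_epi32(offset >= 0 ? -1 : 0);
    return _mm256_maskload_ps(srcptr + offset, mask);
}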
-1 : 0; + __m256 v000_val = _mm256_maskload_ps(srcptr + offset_ptr[0], _mm256_set1_epi32(in_bound)); + in_bound = offset_ptr[1] >= 0 ? -1 : 0; + __m256 v001_val = _mm256_maskload_ps(srcptr + offset_ptr[1], _mm256_set1_epi32(in_bound)); + in_bound = offset_ptr[2] >= 0 ? -1 : 0; + __m256 v010_val = _mm256_maskload_ps(srcptr + offset_ptr[2], _mm256_set1_epi32(in_bound)); + in_bound = offset_ptr[3] >= 0 ? -1 : 0; + __m256 v011_val = _mm256_maskload_ps(srcptr + offset_ptr[3], _mm256_set1_epi32(in_bound)); + + in_bound = offset_ptr[4] >= 0 ? -1 : 0; + __m256 v100_val = _mm256_maskload_ps(srcptr + offset_ptr[4], _mm256_set1_epi32(in_bound)); + in_bound = offset_ptr[5] >= 0 ? -1 : 0; + __m256 v101_val = _mm256_maskload_ps(srcptr + offset_ptr[5], _mm256_set1_epi32(in_bound)); + in_bound = offset_ptr[6] >= 0 ? -1 : 0; + __m256 v110_val = _mm256_maskload_ps(srcptr + offset_ptr[6], _mm256_set1_epi32(in_bound)); + in_bound = offset_ptr[7] >= 0 ? -1 : 0; + __m256 v111_val = _mm256_maskload_ps(srcptr + offset_ptr[7], _mm256_set1_epi32(in_bound)); + + __m256 value = _mm256_set1_ps(value_ptr[0]); __m256 v00 = _mm256_comp_fmadd_ps(v001_val, value, _mm256_comp_fnmadd_ps(v000_val, value, v000_val)); __m256 v01 = _mm256_comp_fmadd_ps(v011_val, value, _mm256_comp_fnmadd_ps(v010_val, value, v010_val)); __m256 v10 = _mm256_comp_fmadd_ps(v101_val, value, _mm256_comp_fnmadd_ps(v100_val, value, v100_val)); __m256 v11 = _mm256_comp_fmadd_ps(v111_val, value, _mm256_comp_fnmadd_ps(v110_val, value, v110_val)); - value = _mm256_set1_ps(*offset_value_ptr++); + value = _mm256_set1_ps(value_ptr[1]); __m256 v0 = _mm256_comp_fmadd_ps(v01, value, _mm256_comp_fnmadd_ps(v00, value, v00)); __m256 v1 = _mm256_comp_fmadd_ps(v11, value, _mm256_comp_fnmadd_ps(v10, value, v10)); - value = _mm256_set1_ps(*offset_value_ptr++); + value = _mm256_set1_ps(value_ptr[2]); __m256 _v = _mm256_comp_fmadd_ps(v1, value, _mm256_comp_fnmadd_ps(v0, value, v0)); _mm256_storeu_ps(dstptr, _v); dstptr += 8; + offset_value_ptr += 11; } } } @@ -244,24 +244,24 @@ static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d for (int i = 0; i < grid_size; i++) { - __m128 v00_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); - offset_value_ptr++; - __m128 v01_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); - offset_value_ptr++; - __m128 v10_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); - offset_value_ptr++; - __m128 v11_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); - offset_value_ptr++; - - __m128 value = _mm_set1_ps(*offset_value_ptr++); - __m128 v0 = _mm_comp_fmadd_ps(v01_val, value, _mm_comp_fnmadd_ps(v00_val, value, v00_val)); - __m128 v1 = _mm_comp_fmadd_ps(v11_val, value, _mm_comp_fnmadd_ps(v10_val, value, v10_val)); - - value = _mm_set1_ps(*offset_value_ptr++); - __m128 _v = _mm_comp_fmadd_ps(v1, value, _mm_comp_fnmadd_ps(v0, value, v0)); + const int* offset_ptr = (int*)offset_value_ptr; + const float* value_ptr = offset_value_ptr + 4; + + __m128 v00_val = offset_ptr[0] >= 0 ? _mm_load_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); + __m128 v01_val = offset_ptr[1] >= 0 ? _mm_load_ps(srcptr + offset_ptr[1]) : _mm_set1_ps(0); + __m128 v10_val = offset_ptr[2] >= 0 ? 
_mm_load_ps(srcptr + offset_ptr[2]) : _mm_set1_ps(0); + __m128 v11_val = offset_ptr[3] >= 0 ? _mm_load_ps(srcptr + offset_ptr[3]) : _mm_set1_ps(0); + + __m128 value1 = _mm_set1_ps(value_ptr[0]); + __m128 v0 = _mm_comp_fmadd_ps(v01_val, value1, _mm_comp_fnmadd_ps(v00_val, value1, v00_val)); + __m128 v1 = _mm_comp_fmadd_ps(v11_val, value1, _mm_comp_fnmadd_ps(v10_val, value1, v10_val)); + + __m128 value2 = _mm_set1_ps(value_ptr[1]); + __m128 _v = _mm_comp_fmadd_ps(v1, value2, _mm_comp_fnmadd_ps(v0, value2, v0)); _mm_storeu_ps(dstptr, _v); dstptr += 4; + offset_value_ptr += 6; } } } @@ -283,42 +283,41 @@ static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d for (int i = 0; i < grid_size; i++) { - __m128 v000_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); - offset_value_ptr++; - __m128 v001_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); - offset_value_ptr++; - __m128 v010_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); - offset_value_ptr++; - __m128 v011_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); - offset_value_ptr++; - - __m128 v100_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); - offset_value_ptr++; - __m128 v101_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); - offset_value_ptr++; - __m128 v110_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); - offset_value_ptr++; - __m128 v111_val = *reinterpret_cast(offset_value_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_value_ptr)) : _mm_set1_ps(0); - offset_value_ptr++; - - __m128 value = _mm_set1_ps(*offset_value_ptr++); + const int* offset_ptr = (int*)offset_value_ptr; + const float* value_ptr = offset_value_ptr + 8; + + __m128 v000_val = offset_ptr[0] >= 0 ? _mm_load_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); + __m128 v001_val = offset_ptr[1] >= 0 ? _mm_load_ps(srcptr + offset_ptr[1]) : _mm_set1_ps(0); + __m128 v010_val = offset_ptr[2] >= 0 ? _mm_load_ps(srcptr + offset_ptr[2]) : _mm_set1_ps(0); + __m128 v011_val = offset_ptr[3] >= 0 ? _mm_load_ps(srcptr + offset_ptr[3]) : _mm_set1_ps(0); + + __m128 v100_val = offset_ptr[4] >= 0 ? _mm_load_ps(srcptr + offset_ptr[4]) : _mm_set1_ps(0); + __m128 v101_val = offset_ptr[5] >= 0 ? _mm_load_ps(srcptr + offset_ptr[5]) : _mm_set1_ps(0); + __m128 v110_val = offset_ptr[6] >= 0 ? _mm_load_ps(srcptr + offset_ptr[6]) : _mm_set1_ps(0); + __m128 v111_val = offset_ptr[7] >= 0 ? 
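// The pack-1 path above is textbook bilinear interpolation; a compact scalar
// sketch with illustrative names (alpha = x fraction, beta = y fraction):
static inline float bilinear1(float v00, float v01, float v10, float v11,
                              float alpha, float beta)
{
    float v0 = v00 * (1 - alpha) + v01 * alpha; // top row
    float v1 = v10 * (1 - alpha) + v11 * alpha; // bottom row
    return v0 * (1 - beta) + v1 * beta;
}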
_mm_load_ps(srcptr + offset_ptr[7]) : _mm_set1_ps(0); + + __m128 value = _mm_set1_ps(value_ptr[0]); __m128 v00 = _mm_comp_fmadd_ps(v001_val, value, _mm_comp_fnmadd_ps(v000_val, value, v000_val)); __m128 v01 = _mm_comp_fmadd_ps(v011_val, value, _mm_comp_fnmadd_ps(v010_val, value, v010_val)); __m128 v10 = _mm_comp_fmadd_ps(v101_val, value, _mm_comp_fnmadd_ps(v100_val, value, v100_val)); __m128 v11 = _mm_comp_fmadd_ps(v111_val, value, _mm_comp_fnmadd_ps(v110_val, value, v110_val)); - value = _mm_set1_ps(*offset_value_ptr++); + value = _mm_set1_ps(value_ptr[1]); __m128 v0 = _mm_comp_fmadd_ps(v01, value, _mm_comp_fnmadd_ps(v00, value, v00)); __m128 v1 = _mm_comp_fmadd_ps(v11, value, _mm_comp_fnmadd_ps(v10, value, v10)); - value = _mm_set1_ps(*offset_value_ptr++); + value = _mm_set1_ps(value_ptr[2]); __m128 _v = _mm_comp_fmadd_ps(v1, value, _mm_comp_fnmadd_ps(v0, value, v0)); _mm_storeu_ps(dstptr, _v); dstptr += 4; + offset_value_ptr += 11; } } } +#pragma fenv_access(off) + +#pragma float_control(precise, off) #endif // __SSE2__ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) @@ -338,23 +337,21 @@ static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d for (int x = 0; x < grid_size; x++) { - float v00 = *offset_value_ptr >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; - offset_value_ptr++; - float v01 = *offset_value_ptr >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; - offset_value_ptr++; - float v10 = *offset_value_ptr >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; - offset_value_ptr++; - float v11 = *offset_value_ptr >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; - offset_value_ptr++; - - float v0 = v00 * (1 - *offset_value_ptr) + v01 * *offset_value_ptr; - float v1 = v10 * (1 - *offset_value_ptr) + v11 * *offset_value_ptr; - offset_value_ptr++; - - *dstptr = v0 * (1 - *offset_value_ptr) + v1 * *offset_value_ptr; - offset_value_ptr++; + const int* offset_ptr = (int*)offset_value_ptr; + const float* value_ptr = offset_value_ptr + 4; + + float v00 = offset_ptr[0] >= 0 ? *(srcptr + offset_ptr[0]) : 0; + float v01 = offset_ptr[1] >= 0 ? *(srcptr + offset_ptr[1]) : 0; + float v10 = offset_ptr[2] >= 0 ? *(srcptr + offset_ptr[2]) : 0; + float v11 = offset_ptr[3] >= 0 ? *(srcptr + offset_ptr[3]) : 0; + + float v0 = v00 * (1 - value_ptr[0]) + v01 * value_ptr[0]; + float v1 = v10 * (1 - value_ptr[0]) + v11 * value_ptr[0]; + + *dstptr = v0 * (1 - value_ptr[1]) + v1 * value_ptr[1]; dstptr++; + offset_value_ptr += 6; } } } @@ -377,38 +374,31 @@ static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& d for (int x = 0; x < grid_size; x++) { - float v000 = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; - offset_value_ptr++; - float v001 = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; - offset_value_ptr++; - float v010 = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; - offset_value_ptr++; - float v011 = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; - offset_value_ptr++; - - float v100 = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; - offset_value_ptr++; - float v101 = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; - offset_value_ptr++; - float v110 = *reinterpret_cast(offset_value_ptr) >= 0 ? 
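// transpose2x8_ps(gx, gy) above deinterleaves eight packed (x, y) grid pairs
// into one register of x and one of y, replacing the hand-written
// permute/shuffle sequence; its definition lives outside this hunk. The scalar
// equivalent of what it computes:
static void deinterleave_xy8(const float* grid, float gx[8], float gy[8])
{
    for (int i = 0; i < 8; i++)
    {
        gx[i] = grid[2 * i + 0];
        gy[i] = grid[2 * i + 1];
    }
}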
*(srcptr + static_cast(*offset_value_ptr)) : 0; - offset_value_ptr++; - float v111 = *reinterpret_cast(offset_value_ptr) >= 0 ? *(srcptr + static_cast(*offset_value_ptr)) : 0; - offset_value_ptr++; - - float v00 = v000 * (1 - *offset_value_ptr) + v001 * *offset_value_ptr; - float v01 = v010 * (1 - *offset_value_ptr) + v011 * *offset_value_ptr; - float v10 = v100 * (1 - *offset_value_ptr) + v101 * *offset_value_ptr; - float v11 = v110 * (1 - *offset_value_ptr) + v111 * *offset_value_ptr; - offset_value_ptr++; - - float v0 = v00 * (1 - *offset_value_ptr) + v01 * *offset_value_ptr; - float v1 = v10 * (1 - *offset_value_ptr) + v11 * *offset_value_ptr; - offset_value_ptr++; - - *dstptr = v0 * (1 - *offset_value_ptr) + v1 * *offset_value_ptr; - offset_value_ptr++; + const int* offset_ptr = (int*)offset_value_ptr; + const float* value_ptr = offset_value_ptr + 8; + + float v000 = offset_ptr[0] >= 0 ? *(srcptr + offset_ptr[0]) : 0; + float v001 = offset_ptr[1] >= 0 ? *(srcptr + offset_ptr[1]) : 0; + float v010 = offset_ptr[2] >= 0 ? *(srcptr + offset_ptr[2]) : 0; + float v011 = offset_ptr[3] >= 0 ? *(srcptr + offset_ptr[3]) : 0; + + float v100 = offset_ptr[4] >= 0 ? *(srcptr + offset_ptr[4]) : 0; + float v101 = offset_ptr[5] >= 0 ? *(srcptr + offset_ptr[5]) : 0; + float v110 = offset_ptr[6] >= 0 ? *(srcptr + offset_ptr[6]) : 0; + float v111 = offset_ptr[7] >= 0 ? *(srcptr + offset_ptr[7]) : 0; + + float v00 = v000 * (1 - value_ptr[0]) + v001 * value_ptr[0]; + float v01 = v010 * (1 - value_ptr[0]) + v011 * value_ptr[0]; + float v10 = v100 * (1 - value_ptr[0]) + v101 * value_ptr[0]; + float v11 = v110 * (1 - value_ptr[0]) + v111 * value_ptr[0]; + + float v0 = v00 * (1 - value_ptr[1]) + v01 * value_ptr[1]; + float v1 = v10 * (1 - value_ptr[1]) + v11 * value_ptr[1]; + + *dstptr = v0 * (1 - value_ptr[2]) + v1 * value_ptr[2]; dstptr++; + offset_value_ptr += 11; } } } \ No newline at end of file diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 33fb0f120aa..bf2d80f4ec1 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -31,15 +31,10 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o #if __AVX__ for (; x + 15 < grid_size; x += 16) { - __m256 tmp_x = _mm256_loadu_ps(gridptr); + __m256 gx = _mm256_loadu_ps(gridptr); __m256 gy = _mm256_loadu_ps(gridptr + 8); - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); - - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); + transpose2x8_ps(gx, gy); gx = unormalize(_mm256_set1_ps(src.w), gx); gx = get_coord(_mm256_set1_ps(src.w), gx); @@ -72,13 +67,19 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o ne_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), ne_offset, v01_in_range); sw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), sw_offset, v10_in_range); se_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), se_offset, v11_in_range); + + nw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(nw_offset)); + ne_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(ne_offset)); + sw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(sw_offset)); + se_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(se_offset)); + __m256 alpha = _mm256_sub_ps(gx, x_w); __m256 beta = _mm256_sub_ps(gy, y_n); transpose8x6_ps(nw_offset, ne_offset, sw_offset, 
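// Sketch of the 6-element record the scalar 2D bilinear tail above writes:
// four flattened neighbour offsets (as int bits, -1 for out-of-bound) plus the
// two fractions reused by the apply step. Names are illustrative.
struct bilinear2d_record // 6 * sizeof(float) bytes per sampling point
{
    int   offset[4]; // nw, ne, sw, se; e.g. nw = (x0 + y0 * w) * elempack
    float alpha;     // sample_x - x0
    float beta;      // sample_y - y0
};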
se_offset, alpha, beta); _mm256_storeu_ps(offset_value_ptr, nw_offset); - _mm256_storeu_ps(offset_value_ptr + 8, ne_offset); + _mm256_storeu_ps(offset_value_ptr + 8, ne_offset); _mm256_storeu_ps(offset_value_ptr + 16, sw_offset); _mm256_storeu_ps(offset_value_ptr + 24, se_offset); @@ -116,15 +117,19 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o bool in_bound_10 = x0_in_bound & y1_in_bound; bool in_bound_11 = x1_in_bound & y1_in_bound; - *offset_value_ptr++ = in_bound_00 ? (x0 + y0 * src.w) * src.elempack : -1.0f; - *offset_value_ptr++ = in_bound_01 ? (x1 + y0 * src.w) * src.elempack : -1.0f; - *offset_value_ptr++ = in_bound_10 ? (x0 + y1 * src.w) * src.elempack : -1.0f; - *offset_value_ptr++ = in_bound_11 ? (x1 + y1 * src.w) * src.elempack : -1.0f; + int* offset_ptr = (int*)offset_value_ptr; + float* value_ptr = offset_value_ptr + 4; + + offset_ptr[0] = in_bound_00 ? (x0 + y0 * src.w) * src.elempack : -1.0; + offset_ptr[1] = in_bound_01 ? (x1 + y0 * src.w) * src.elempack : -1.0; + offset_ptr[2] = in_bound_10 ? (x0 + y1 * src.w) * src.elempack : -1.0; + offset_ptr[3] = in_bound_11 ? (x1 + y1 * src.w) * src.elempack : -1.0; - *offset_value_ptr++ = sample_x - x0; - *offset_value_ptr++ = sample_y - y0; + value_ptr[0] = sample_x - x0; + value_ptr[1] = sample_y - y0; gridptr += 2; + offset_value_ptr += 6; } } } @@ -171,6 +176,12 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o ne_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), ne_offset, v01_in_range); sw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), sw_offset, v10_in_range); se_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), se_offset, v11_in_range); + + nw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(nw_offset)); + ne_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(ne_offset)); + sw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(sw_offset)); + se_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(se_offset)); + __m256 alpha = _mm256_sub_ps(gx, x_w); __m256 beta = _mm256_sub_ps(gy, y_n); @@ -217,16 +228,20 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o bool in_bound_10 = x0_in_bound & y1_in_bound; bool in_bound_11 = x1_in_bound & y1_in_bound; - *offset_value_ptr++ = in_bound_00 ? (x0 + y0 * src.w) * src.elempack : -1.0f; - *offset_value_ptr++ = in_bound_01 ? (x1 + y0 * src.w) * src.elempack : -1.0f; - *offset_value_ptr++ = in_bound_10 ? (x0 + y1 * src.w) * src.elempack : -1.0f; - *offset_value_ptr++ = in_bound_11 ? (x1 + y1 * src.w) * src.elempack : -1.0f; + int* offset_ptr = (int*)offset_value_ptr; + float* value_ptr = offset_value_ptr + 4; + + offset_ptr[0] = in_bound_00 ? (x0 + y0 * src.w) * src.elempack : -1.0; + offset_ptr[1] = in_bound_01 ? (x1 + y0 * src.w) * src.elempack : -1.0; + offset_ptr[2] = in_bound_10 ? (x0 + y1 * src.w) * src.elempack : -1.0; + offset_ptr[3] = in_bound_11 ? 
(x1 + y1 * src.w) * src.elempack : -1.0; - *offset_value_ptr++ = sample_x - x0; - *offset_value_ptr++ = sample_y - y0; + value_ptr[0] = sample_x - x0; + value_ptr[1] = sample_y - y0; gridptr_x++; gridptr_y++; + offset_value_ptr += 6; } } } @@ -250,20 +265,11 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o #if __AVX__ for (; x + 23 < grid_size; x += 24) { - __m256 tmp_x = _mm256_loadu_ps(gridptr); - __m256 tmp_y = _mm256_loadu_ps(gridptr + 8); + __m256 gx = _mm256_loadu_ps(gridptr); + __m256 gy = _mm256_loadu_ps(gridptr + 8); __m256 gz = _mm256_loadu_ps(gridptr + 16); - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); + transpose3x8_ps(gx, gy, gz); gx = unormalize(_mm256_set1_ps(src.w), gx); gx = get_coord(_mm256_set1_ps(src.w), gx); @@ -329,6 +335,16 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o bsw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bsw_offset, v110_in_range); bse_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bse_offset, v111_in_range); + tnw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(tnw_offset)); + tne_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(tne_offset)); + tsw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(tsw_offset)); + tse_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(tse_offset)); + + bnw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(bnw_offset)); + bne_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(bne_offset)); + bsw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(bsw_offset)); + bse_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(bse_offset)); + __m256 alpha = _mm256_sub_ps(gx, x_w); __m256 beta = _mm256_sub_ps(gy, y_n); __m256 gamma = _mm256_sub_ps(gz, z_t); @@ -340,10 +356,10 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o _mm256_storeu_ps(offset_value_ptr + 16, tsw_offset); _mm256_storeu_ps(offset_value_ptr + 24, tse_offset); - _mm256_storeu_ps(offset_value_ptr + 32, bnw_offset); - _mm256_storeu_ps(offset_value_ptr + 40, bne_offset); - _mm256_storeu_ps(offset_value_ptr + 48, bsw_offset); - _mm256_storeu_ps(offset_value_ptr + 56, bse_offset); + _mm256_storeu_ps(offset_value_ptr, bnw_offset); + _mm256_storeu_ps(offset_value_ptr + 8, bne_offset); + _mm256_storeu_ps(offset_value_ptr + 16, bsw_offset); + _mm256_storeu_ps(offset_value_ptr + 24, bse_offset); _mm256_storeu_ps(offset_value_ptr + 64, alpha); _mm256_storeu_ps(offset_value_ptr + 72, beta); @@ -399,21 +415,25 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o bool in_bound_110 = v10_in_range & z1_in_range; bool in_bound_111 = v11_in_range & z1_in_range; - *offset_value_ptr++ = in_bound_000 ? (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; - *offset_value_ptr++ = in_bound_001 ? (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; - *offset_value_ptr++ = in_bound_010 ? (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; - *offset_value_ptr++ = in_bound_011 ? 
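// The 3D offsets above are the same flattening with one extra depth term.
// Scalar sketch, assuming w and h are the source width and height:
static inline int voxel_offset(int x, int y, int z, int w, int h, int elempack)
{
    return (x + y * w + z * w * h) * elempack; // caller stores -1 when out of bound
}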
(x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; + int* offset_ptr = (int*)offset_value_ptr; + float* value_ptr = offset_value_ptr + 8; - *offset_value_ptr++ = in_bound_100 ? (x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; - *offset_value_ptr++ = in_bound_101 ? (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; - *offset_value_ptr++ = in_bound_110 ? (x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; - *offset_value_ptr++ = in_bound_111 ? (x1 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; + offset_ptr[0] = in_bound_000 ? (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[1] = in_bound_001 ? (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[2] = in_bound_010 ? (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[3] = in_bound_011 ? (x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; + + offset_ptr[4] = in_bound_100 ? (x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[5] = in_bound_101 ? (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[6] = in_bound_110 ? (x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[7] = in_bound_111 ? (x1 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0; - *offset_value_ptr++ = sample_x - x0; - *offset_value_ptr++ = sample_y - y0; - *offset_value_ptr++ = sample_z - z0; + value_ptr[0] = sample_x - x0; + value_ptr[1] = sample_y - y0; + value_ptr[2] = sample_z - z0; gridptr += 3; + offset_value_ptr += 11; } } } @@ -495,6 +515,16 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o bsw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bsw_offset, v110_in_range); bse_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bse_offset, v111_in_range); + tnw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(tnw_offset)); + tne_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(tne_offset)); + tsw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(tsw_offset)); + tse_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(tse_offset)); + + bnw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(bnw_offset)); + bne_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(bne_offset)); + bsw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(bsw_offset)); + bse_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(bse_offset)); + __m256 alpha = _mm256_sub_ps(gx, x_w); __m256 beta = _mm256_sub_ps(gy, y_n); __m256 gamma = _mm256_sub_ps(gz, z_t); @@ -506,10 +536,10 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o _mm256_storeu_ps(offset_value_ptr + 16, tsw_offset); _mm256_storeu_ps(offset_value_ptr + 24, tse_offset); - _mm256_storeu_ps(offset_value_ptr + 32, bnw_offset); - _mm256_storeu_ps(offset_value_ptr + 40, bne_offset); - _mm256_storeu_ps(offset_value_ptr + 48, bsw_offset); - _mm256_storeu_ps(offset_value_ptr + 56, bse_offset); + _mm256_storeu_ps(offset_value_ptr, bnw_offset); + _mm256_storeu_ps(offset_value_ptr + 8, bne_offset); + _mm256_storeu_ps(offset_value_ptr + 16, bsw_offset); + _mm256_storeu_ps(offset_value_ptr + 24, bse_offset); _mm256_storeu_ps(offset_value_ptr + 64, alpha); _mm256_storeu_ps(offset_value_ptr + 72, beta); @@ -567,23 +597,27 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o bool in_bound_110 = v10_in_range & z1_in_range; bool in_bound_111 = v11_in_range & z1_in_range; - *offset_value_ptr++ = in_bound_000 ? 
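// Nearest sampling needs no weights, so its blob above is just one int offset
// per sampling point and the apply step degenerates to gather-or-zero. The
// pack-1 form of that loop, with illustrative names:
static void apply_nearest_p1_sketch(const float* srcptr, float* dstptr,
                                    const int* offset_ptr, int grid_size)
{
    for (int i = 0; i < grid_size; i++)
    {
        dstptr[i] = offset_ptr[i] >= 0 ? srcptr[offset_ptr[i]] : 0.f;
    }
}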
(x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; - *offset_value_ptr++ = in_bound_001 ? (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; - *offset_value_ptr++ = in_bound_010 ? (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; - *offset_value_ptr++ = in_bound_011 ? (x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; + int* offset_ptr = (int*)offset_value_ptr; + float* value_ptr = offset_value_ptr + 8; + + offset_ptr[0] = in_bound_000 ? (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[1] = in_bound_001 ? (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[2] = in_bound_010 ? (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[3] = in_bound_011 ? (x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; - *offset_value_ptr++ = in_bound_100 ? (x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; - *offset_value_ptr++ = in_bound_101 ? (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; - *offset_value_ptr++ = in_bound_110 ? (x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; - *offset_value_ptr++ = in_bound_111 ? (x1 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0f; + offset_ptr[4] = in_bound_100 ? (x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[5] = in_bound_101 ? (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[6] = in_bound_110 ? (x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[7] = in_bound_111 ? (x1 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0; - *offset_value_ptr++ = sample_x - x0; - *offset_value_ptr++ = sample_y - y0; - *offset_value_ptr++ = sample_z - z0; + value_ptr[0] = sample_x - x0; + value_ptr[1] = sample_y - y0; + value_ptr[2] = sample_z - z0; gridptr_x++; gridptr_y++; gridptr_z++; + offset_value_ptr += 11; } } } diff --git a/src/layer/x86/gridsample_nearest_apply_interpolation.h b/src/layer/x86/gridsample_nearest_apply_interpolation.h index f7b92cc6bd3..fa4c9a505f7 100644 --- a/src/layer/x86/gridsample_nearest_apply_interpolation.h +++ b/src/layer/x86/gridsample_nearest_apply_interpolation.h @@ -29,12 +29,12 @@ static void gridsample_nearest_apply_interpolation_p16(const Mat& src, Mat& dst, const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const float* offset_ptr = offset_value.channel(0); + const int* offset_ptr = offset_value.channel(0); for (int i = 0; i < grid_size; i++) { - __mmask16 in_bound = *reinterpret_cast(offset_ptr) >= 0 ? 0xFFFF : 0; - __m512 _v = _mm512_maskz_load_ps(in_bound, srcptr + static_cast(*offset_ptr)); + __mmask16 in_bound = offset_ptr[0] >= 0 ? 0xFFFF : 0; + __m512 _v = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[0]); offset_ptr++; _mm512_storeu_ps(dstptr, _v); @@ -58,12 +58,12 @@ static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const float* offset_ptr = offset_value.channel(0); + const int* offset_ptr = offset_value.channel(0); for (int i = 0; i < grid_size; i++) { - int in_bound = *reinterpret_cast(offset_ptr) >= 0 ? -1 : 0; - __m256 _v = _mm256_maskload_ps(srcptr + static_cast(*offset_ptr), _mm256_set1_epi32(in_bound)); + int in_bound = offset_ptr[0] >= 0 ? 
-1 : 0; + __m256 _v = _mm256_maskload_ps(srcptr + offset_ptr[0], _mm256_set1_epi32(in_bound)); offset_ptr++; _mm256_storeu_ps(dstptr, _v); @@ -86,11 +86,11 @@ static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const float* offset_ptr = offset_value.channel(0); + const int* offset_ptr = offset_value.channel(0); for (int i = 0; i < grid_size; i++) { - __m128 _v = *reinterpret_cast(offset_ptr) >= 0 ? _mm_load_ps(srcptr + static_cast(*offset_ptr)) : _mm_set1_ps(0); + __m128 _v = offset_ptr[0] >= 0 ? _mm_load_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); offset_ptr++; _mm_storeu_ps(dstptr, _v); @@ -115,11 +115,11 @@ static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, const float* srcptr = src.channel(q); float* dstptr = dst.channel(q); - const float* offset_ptr = offset_value.channel(0); + const int* offset_ptr = offset_value.channel(0); for (int x = 0; x < grid_size; x++) { - *dstptr = *reinterpret_cast(offset_ptr) >= 0 ? *(srcptr + static_cast(*offset_ptr)) : 0; + *dstptr = offset_ptr[0] >= 0 ? *(srcptr + offset_ptr[0]) : 0; offset_ptr++; dstptr++; diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index f3e59ce33d5..a7a12066d21 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -31,24 +31,16 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of #if __AVX__ for (; x + 15 < grid_size; x += 16) { - __m256 tmp_x = _mm256_loadu_ps(gridptr); + __m256 gx = _mm256_loadu_ps(gridptr); __m256 gy = _mm256_loadu_ps(gridptr + 8); - __m256 gx = _mm256_permute2f128_ps(tmp_x, gy, 0b00100000); - gy = _mm256_permute2f128_ps(tmp_x, gy, 0b00110001); - tmp_x = _mm256_or_ps(gx, _mm256_setzero_ps()); + transpose2x8_ps(gx, gy); - gx = _mm256_shuffle_ps(gx, gy, 0b10001000); - gy = _mm256_shuffle_ps(tmp_x, gy, 0b11011101); - - // compute coord - { - gx = unormalize(_mm256_set1_ps(src.w), gx); - gx = get_coord(_mm256_set1_ps(src.w), gx); + gx = unormalize(_mm256_set1_ps(src.w), gx); + gx = get_coord(_mm256_set1_ps(src.w), gx); - gy = unormalize(_mm256_set1_ps(src.h), gy); - gy = get_coord(_mm256_set1_ps(src.h), gy); - } + gy = unormalize(_mm256_set1_ps(src.h), gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); @@ -58,7 +50,7 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx), _mm256_set1_ps(src.elempack)); - offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), offset, v_in_range); + offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), _mm256_castsi256_ps(_mm256_cvtps_epi32(offset)), v_in_range); _mm256_storeu_ps(offset_ptr, offset); @@ -83,7 +75,9 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of int y0 = static_cast(floorf(sample_y + 0.5f)); bool in_bound = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h)); - *offset_ptr = in_bound ? (x0 + y0 * src.w) * src.elempack : -1.0f; + + int* iptr = (int*)offset_ptr; + *iptr = in_bound ? 
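// The nearest blob rounds to the closest texel and stores either the flattened
// offset or the -1 sentinel checked above. Scalar sketch of one 2D sample:
#include <math.h>
static inline int nearest_offset_2d(float sample_x, float sample_y,
                                    int w, int h, int elempack)
{
    int x0 = (int)floorf(sample_x + 0.5f);
    int y0 = (int)floorf(sample_y + 0.5f);
    bool in_bound = (x0 > -1) && (x0 < w) && (y0 > -1) && (y0 < h);
    return in_bound ? (x0 + y0 * w) * elempack : -1;
}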
(x0 + y0 * src.w) * src.elempack : -1.0; gridptr += 2; offset_ptr++; @@ -102,14 +96,11 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of __m256 gx = _mm256_loadu_ps(gridptr_x); __m256 gy = _mm256_loadu_ps(gridptr_y); - // compute coord - { - gx = unormalize(_mm256_set1_ps(src.w), gx); - gx = get_coord(_mm256_set1_ps(src.w), gx); + gx = unormalize(_mm256_set1_ps(src.w), gx); + gx = get_coord(_mm256_set1_ps(src.w), gx); - gy = unormalize(_mm256_set1_ps(src.h), gy); - gy = get_coord(_mm256_set1_ps(src.h), gy); - } + gy = unormalize(_mm256_set1_ps(src.h), gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); @@ -119,7 +110,7 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx), _mm256_set1_ps(src.elempack)); - offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), offset, v_in_range); + offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), _mm256_castsi256_ps(_mm256_cvtps_epi32(offset)), v_in_range); _mm256_storeu_ps(offset_ptr, offset); @@ -146,7 +137,8 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of bool in_bound = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h)); - *offset_ptr = in_bound ? (x0 + y0 * src.w) * src.elempack : -1.0f; + int* iptr = (int*)offset_ptr; + *iptr = in_bound ? (x0 + y0 * src.w) * src.elempack : -1.0; gridptr_x++; gridptr_y++; @@ -175,32 +167,20 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of #if __AVX__ for (; x + 23 < grid_size; x += 24) { - __m256 tmp_x = _mm256_loadu_ps(gridptr); - __m256 tmp_y = _mm256_loadu_ps(gridptr + 8); + __m256 gx = _mm256_loadu_ps(gridptr); + __m256 gy = _mm256_loadu_ps(gridptr + 8); __m256 gz = _mm256_loadu_ps(gridptr + 16); - __m256 gx = _mm256_permute2f128_ps(tmp_x, tmp_y, 0b00110000); - __m256 gy = _mm256_permute2f128_ps(tmp_x, gz, 0b00100001); - gz = _mm256_permute2f128_ps(tmp_y, gz, 0b00110000); - - tmp_x = _mm256_shuffle_ps(gx, gy, 0b01001001); - tmp_y = _mm256_shuffle_ps(gy, gz, 0b10011110); - - gy = _mm256_shuffle_ps(tmp_x, tmp_y, 0b11011000); - gx = _mm256_shuffle_ps(gx, tmp_y, 0b10001100); - gz = _mm256_shuffle_ps(tmp_x, gz, 0b11001101); + transpose3x8_ps(gx, gy, gz); - // compute coord - { - gx = unormalize(_mm256_set1_ps(src.w), gx); - gx = get_coord(_mm256_set1_ps(src.w), gx); + gx = unormalize(_mm256_set1_ps(src.w), gx); + gx = get_coord(_mm256_set1_ps(src.w), gx); - gy = unormalize(_mm256_set1_ps(src.h), gy); - gy = get_coord(_mm256_set1_ps(src.h), gy); + gy = unormalize(_mm256_set1_ps(src.h), gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); - gz = unormalize(_mm256_set1_ps(src.d), gz); - gz = get_coord(_mm256_set1_ps(src.d), gz); - } + gz = unormalize(_mm256_set1_ps(src.d), gz); + gz = get_coord(_mm256_set1_ps(src.d), gz); gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); @@ -214,7 +194,7 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx)), _mm256_set1_ps(src.elempack)); - offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), offset, v_in_range); + offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), _mm256_castsi256_ps(_mm256_cvtps_epi32(offset)), v_in_range); _mm256_storeu_ps(offset_ptr, offset); @@ -245,7 +225,8 @@ void 
gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of bool in_bound = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h) & (z0 > -1) & (z0 < src.d)); - *offset_ptr = in_bound ? (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; + int* iptr = (int*)offset_ptr; + *iptr = in_bound ? (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; gridptr += 3; offset_ptr++; @@ -266,17 +247,14 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of __m256 gy = _mm256_loadu_ps(gridptr_y); __m256 gz = _mm256_loadu_ps(gridptr_z); - // compute coord= - { - gx = unormalize(_mm256_set1_ps(src.w), gx); - gx = get_coord(_mm256_set1_ps(src.w), gx); + gx = unormalize(_mm256_set1_ps(src.w), gx); + gx = get_coord(_mm256_set1_ps(src.w), gx); - gy = unormalize(_mm256_set1_ps(src.h), gy); - gy = get_coord(_mm256_set1_ps(src.h), gy); + gy = unormalize(_mm256_set1_ps(src.h), gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); - gz = unormalize(_mm256_set1_ps(src.d), gz); - gz = get_coord(_mm256_set1_ps(src.d), gz); - } + gz = unormalize(_mm256_set1_ps(src.d), gz); + gz = get_coord(_mm256_set1_ps(src.d), gz); gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); @@ -290,7 +268,7 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx)), _mm256_set1_ps(src.elempack)); - offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), offset, v_in_range); + offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), _mm256_castsi256_ps(_mm256_cvtps_epi32(offset)), v_in_range); _mm256_storeu_ps(offset_ptr, offset); @@ -324,7 +302,8 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of bool in_bound = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h) & (z0 > -1) & (z0 < src.d)); - *offset_ptr = in_bound ? (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0f; + int* iptr = (int*)offset_ptr; + *iptr = in_bound ? (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; gridptr_x++; gridptr_y++; diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index db9efc85287..312c5e17d25 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -31,7 +31,9 @@ namespace ncnn { #include "gridsample_compute_blob.h" -#include "gridsample_apply_interpolation.h" +#include "gridsample_bilinear_apply_interpolation.h" +#include "gridsample_bicubic_apply_interpolation.h" +#include "gridsample_nearest_apply_interpolation.h" GridSample_x86::GridSample_x86() { @@ -53,7 +55,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector Date: Sun, 6 Aug 2023 00:25:21 +0000 Subject: [PATCH 119/127] apply code-format changes --- src/layer/x86/gridsample_bilinear_apply_interpolation.h | 3 +-- src/layer/x86/gridsample_bilinear_compute_blob.h | 6 +++--- src/layer/x86/gridsample_x86.cpp | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/layer/x86/gridsample_bilinear_apply_interpolation.h b/src/layer/x86/gridsample_bilinear_apply_interpolation.h index 7a78338c988..23ae4253dea 100644 --- a/src/layer/x86/gridsample_bilinear_apply_interpolation.h +++ b/src/layer/x86/gridsample_bilinear_apply_interpolation.h @@ -100,7 +100,6 @@ static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& in_bound = offset_ptr[7] >= 0 ? 
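// Shape of the split these headers now follow (a rough outline, not the
// literal forward() body): the grid is resolved once into an offset/value
// blob, then every channel only gathers and blends from it.
//
//   Mat offset_value;                                          // one record per output sample
//   gridsample_xxx_compute_blob(src, grid, offset_value, ...); // runs once per input
//   gridsample_xxx_apply_interpolation_pN(src, dst, offset_value, opt);
//                                                              // loops over channels internally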
0xFFFF : 0; __m512 v111_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[7]); - __m512 value = _mm512_set1_ps(value_ptr[0]); __m512 v00 = _mm512_fmadd_ps(v001_val, value, _mm512_fnmadd_ps(v000_val, value, v000_val)); __m512 v01 = _mm512_fmadd_ps(v011_val, value, _mm512_fnmadd_ps(v010_val, value, v010_val)); @@ -285,7 +284,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d { const int* offset_ptr = (int*)offset_value_ptr; const float* value_ptr = offset_value_ptr + 8; - + __m128 v000_val = offset_ptr[0] >= 0 ? _mm_load_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); __m128 v001_val = offset_ptr[1] >= 0 ? _mm_load_ps(srcptr + offset_ptr[1]) : _mm_set1_ps(0); __m128 v010_val = offset_ptr[2] >= 0 ? _mm_load_ps(srcptr + offset_ptr[2]) : _mm_set1_ps(0); diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index bf2d80f4ec1..caf9208bdde 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -79,7 +79,7 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o transpose8x6_ps(nw_offset, ne_offset, sw_offset, se_offset, alpha, beta); _mm256_storeu_ps(offset_value_ptr, nw_offset); - _mm256_storeu_ps(offset_value_ptr + 8, ne_offset); + _mm256_storeu_ps(offset_value_ptr + 8, ne_offset); _mm256_storeu_ps(offset_value_ptr + 16, sw_offset); _mm256_storeu_ps(offset_value_ptr + 24, se_offset); @@ -339,7 +339,7 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o tne_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(tne_offset)); tsw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(tsw_offset)); tse_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(tse_offset)); - + bnw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(bnw_offset)); bne_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(bne_offset)); bsw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(bsw_offset)); @@ -422,7 +422,7 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o offset_ptr[1] = in_bound_001 ? (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; offset_ptr[2] = in_bound_010 ? (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; offset_ptr[3] = in_bound_011 ? (x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; - + offset_ptr[4] = in_bound_100 ? (x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0; offset_ptr[5] = in_bound_101 ? (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0; offset_ptr[6] = in_bound_110 ? 
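// This commit drops the v141-only optimizer pragma (_MSC_VER 1910..1919 is the
// VS2017 toolset) and replaces the masked AVX-512 loads with a plain branch, so
// _mm512_maskz_load_ps is no longer emitted at all. The pattern, as an
// illustrative helper (assumes immintrin.h and that srcptr + offset is 64-byte
// aligned for in-bound offsets):
#include <immintrin.h>
static inline __m512 load_or_zero_p16(const float* srcptr, int offset)
{
    return offset >= 0 ? _mm512_load_ps(srcptr + offset) : _mm512_set1_ps(0);
}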
(x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0; diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp index 312c5e17d25..004bc4d0895 100644 --- a/src/layer/x86/gridsample_x86.cpp +++ b/src/layer/x86/gridsample_x86.cpp @@ -55,7 +55,7 @@ int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector Date: Sun, 13 Aug 2023 22:39:27 +0800 Subject: [PATCH 120/127] fix msvc-v141-toolkits mm512_maskz_load bug --- .../gridsample_bicubic_apply_interpolation.h | 18 ++----- .../gridsample_bilinear_apply_interpolation.h | 47 ++++++------------- 2 files changed, 19 insertions(+), 46 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_apply_interpolation.h b/src/layer/x86/gridsample_bicubic_apply_interpolation.h index a596cf36060..f7e702f7476 100644 --- a/src/layer/x86/gridsample_bicubic_apply_interpolation.h +++ b/src/layer/x86/gridsample_bicubic_apply_interpolation.h @@ -29,9 +29,6 @@ static void cubic_interp1d_p16(__m512& coeffs0, __m512& coeffs1, __m512& coeffs2 coeffs3 = _mm512_sub_ps(_mm512_sub_ps(_mm512_sub_ps(_mm512_set1_ps(1.0f), coeffs0), coeffs1), coeffs2); } -#if _MSC_VER >= 1910 && _MSC_VER < 1920 -#pragma optimize("", off) -#endif // _MSC_VER >= 1910 && _MSC_VER < 1920 static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& dst, Mat& offset_value, const Option& opt) { const int channels = dst.c; @@ -59,14 +56,10 @@ static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& d for (int ii = 0; ii < 4; ii++) { - __mmask16 in_bound = offset_ptr[0] >= 0 ? 0xFFFF : 0; - __m512 x0_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[0]); - in_bound = offset_ptr[1] >= 0 ? 0xFFFF : 0; - __m512 x1_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[1]); - in_bound = offset_ptr[2] >= 0 ? 0xFFFF : 0; - __m512 x2_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[2]); - in_bound = offset_ptr[3] >= 0 ? 0xFFFF : 0; - __m512 x3_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[3]); + __m512 x0_val = offset_ptr[0] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[0]) : _mm512_set1_ps(0); + __m512 x1_val = offset_ptr[1] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[1]) : _mm512_set1_ps(0); + __m512 x2_val = offset_ptr[2] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[2]) : _mm512_set1_ps(0); + __m512 x3_val = offset_ptr[3] >= 0 ? 
_mm512_load_ps(srcptr + offset_ptr[3]) : _mm512_set1_ps(0); value_f[ii] = _mm512_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm512_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); @@ -87,9 +80,6 @@ static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& d } } } -#if _MSC_VER >= 1910 && _MSC_VER < 1920 -#pragma optimize("", on) -#endif // _MSC_VER >= 1910 && _MSC_VER < 1920 #endif // __AVX512F__ static void cubic_interp1d_p8(__m256& coeffs0, __m256& coeffs1, __m256& coeffs2, __m256& coeffs3, const __m256& tx) diff --git a/src/layer/x86/gridsample_bilinear_apply_interpolation.h b/src/layer/x86/gridsample_bilinear_apply_interpolation.h index 23ae4253dea..d272df01042 100644 --- a/src/layer/x86/gridsample_bilinear_apply_interpolation.h +++ b/src/layer/x86/gridsample_bilinear_apply_interpolation.h @@ -15,9 +15,6 @@ #if __SSE2__ #if __AVX__ #if __AVX512F__ -#if _MSC_VER >= 1910 && _MSC_VER < 1920 -#pragma optimize("", off) -#endif // _MSC_VER >= 1910 && _MSC_VER < 1920 static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) { const int channels = dst.c; @@ -38,14 +35,10 @@ static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& const int* offset_ptr = (int*)offset_value_ptr; const float* value_ptr = offset_value_ptr + 4; - __mmask16 in_bound = offset_ptr[0] >= 0 ? 0xFFFF : 0; - __m512 v00_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[0]); - in_bound = offset_ptr[1] >= 0 ? 0xFFFF : 0; - __m512 v01_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[1]); - in_bound = offset_ptr[2] >= 0 ? 0xFFFF : 0; - __m512 v10_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[2]); - in_bound = offset_ptr[3] >= 0 ? 0xFFFF : 0; - __m512 v11_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[3]); + __m512 v00_val = offset_ptr[0] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[0]) : _mm512_set1_ps(0); + __m512 v01_val = offset_ptr[1] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[1]) : _mm512_set1_ps(0); + __m512 v10_val = offset_ptr[2] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[2]) : _mm512_set1_ps(0); + __m512 v11_val = offset_ptr[3] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[3]) : _mm512_set1_ps(0); __m512 value1 = _mm512_set1_ps(value_ptr[0]); __m512 v0 = _mm512_fmadd_ps(v01_val, value1, _mm512_fnmadd_ps(v00_val, value1, v00_val)); @@ -82,23 +75,16 @@ static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& const int* offset_ptr = (int*)offset_value_ptr; const float* value_ptr = offset_value_ptr + 8; - __mmask16 in_bound = offset_ptr[0] >= 0 ? 0xFFFF : 0; - __m512 v000_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[0]); - in_bound = offset_ptr[1] >= 0 ? 0xFFFF : 0; - __m512 v001_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[1]); - in_bound = offset_ptr[2] >= 0 ? 0xFFFF : 0; - __m512 v010_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[2]); - in_bound = offset_ptr[3] >= 0 ? 0xFFFF : 0; - __m512 v011_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[3]); - - in_bound = offset_ptr[4] >= 0 ? 0xFFFF : 0; - __m512 v100_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[4]); - in_bound = offset_ptr[5] >= 0 ? 0xFFFF : 0; - __m512 v101_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[5]); - in_bound = offset_ptr[6] >= 0 ? 0xFFFF : 0; - __m512 v110_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[6]); - in_bound = offset_ptr[7] >= 0 ? 
0xFFFF : 0; - __m512 v111_val = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[7]); + __m512 v000_val = offset_ptr[0] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[0]) : _mm512_set1_ps(0); + __m512 v001_val = offset_ptr[1] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[1]) : _mm512_set1_ps(0); + __m512 v010_val = offset_ptr[2] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[2]) : _mm512_set1_ps(0); + __m512 v011_val = offset_ptr[3] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[3]) : _mm512_set1_ps(0); + + __m512 v100_val = offset_ptr[4] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[4]) : _mm512_set1_ps(0); + __m512 v101_val = offset_ptr[5] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[5]) : _mm512_set1_ps(0); + __m512 v110_val = offset_ptr[6] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[6]) : _mm512_set1_ps(0); + __m512 v111_val = offset_ptr[7] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[7]) : _mm512_set1_ps(0); + __m512 value = _mm512_set1_ps(value_ptr[0]); __m512 v00 = _mm512_fmadd_ps(v001_val, value, _mm512_fnmadd_ps(v000_val, value, v000_val)); @@ -119,9 +105,6 @@ static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& } } } -#if _MSC_VER >= 1910 && _MSC_VER < 1920 -#pragma optimize("", on) -#endif // _MSC_VER >= 1910 && _MSC_VER < 1920 #endif // __AVX512F__ @@ -284,7 +267,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d { const int* offset_ptr = (int*)offset_value_ptr; const float* value_ptr = offset_value_ptr + 8; - + __m128 v000_val = offset_ptr[0] >= 0 ? _mm_load_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); __m128 v001_val = offset_ptr[1] >= 0 ? _mm_load_ps(srcptr + offset_ptr[1]) : _mm_set1_ps(0); __m128 v010_val = offset_ptr[2] >= 0 ? _mm_load_ps(srcptr + offset_ptr[2]) : _mm_set1_ps(0); From 8eb034a9e858ca92b1f0be2d7f6d4df5d2497428 Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Sun, 13 Aug 2023 15:08:49 +0000 Subject: [PATCH 121/127] apply code-format changes --- src/layer/x86/gridsample_bilinear_apply_interpolation.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/layer/x86/gridsample_bilinear_apply_interpolation.h b/src/layer/x86/gridsample_bilinear_apply_interpolation.h index d272df01042..4ce77cf1724 100644 --- a/src/layer/x86/gridsample_bilinear_apply_interpolation.h +++ b/src/layer/x86/gridsample_bilinear_apply_interpolation.h @@ -85,7 +85,6 @@ static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& __m512 v110_val = offset_ptr[6] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[6]) : _mm512_set1_ps(0); __m512 v111_val = offset_ptr[7] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[7]) : _mm512_set1_ps(0); - __m512 value = _mm512_set1_ps(value_ptr[0]); __m512 v00 = _mm512_fmadd_ps(v001_val, value, _mm512_fnmadd_ps(v000_val, value, v000_val)); __m512 v01 = _mm512_fmadd_ps(v011_val, value, _mm512_fnmadd_ps(v010_val, value, v010_val)); @@ -267,7 +266,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d { const int* offset_ptr = (int*)offset_value_ptr; const float* value_ptr = offset_value_ptr + 8; - + __m128 v000_val = offset_ptr[0] >= 0 ? _mm_load_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); __m128 v001_val = offset_ptr[1] >= 0 ? _mm_load_ps(srcptr + offset_ptr[1]) : _mm_set1_ps(0); __m128 v010_val = offset_ptr[2] >= 0 ? 
_mm_load_ps(srcptr + offset_ptr[2]) : _mm_set1_ps(0); From 6708bb3d70d7bf2117ff522634e28504aa757566 Mon Sep 17 00:00:00 2001 From: Yoh Date: Sun, 13 Aug 2023 23:20:24 +0800 Subject: [PATCH 122/127] fix Segmentation fault bug --- .../gridsample_bicubic_apply_interpolation.h | 12 ++---- .../gridsample_bilinear_apply_interpolation.h | 41 +++++++------------ .../gridsample_nearest_apply_interpolation.h | 6 +-- 3 files changed, 21 insertions(+), 38 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_apply_interpolation.h b/src/layer/x86/gridsample_bicubic_apply_interpolation.h index f7e702f7476..5647bf8928e 100644 --- a/src/layer/x86/gridsample_bicubic_apply_interpolation.h +++ b/src/layer/x86/gridsample_bicubic_apply_interpolation.h @@ -124,14 +124,10 @@ static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& ds for (int ii = 0; ii < 4; ii++) { - int in_bound = offset_ptr[0] >= 0 ? -1 : 0; - __m256 x0_val = _mm256_maskload_ps(srcptr + offset_ptr[0], _mm256_set1_epi32(in_bound)); - in_bound = offset_ptr[1] >= 0 ? -1 : 0; - __m256 x1_val = _mm256_maskload_ps(srcptr + offset_ptr[1], _mm256_set1_epi32(in_bound)); - in_bound = offset_ptr[2] >= 0 ? -1 : 0; - __m256 x2_val = _mm256_maskload_ps(srcptr + offset_ptr[2], _mm256_set1_epi32(in_bound)); - in_bound = offset_ptr[3] >= 0 ? -1 : 0; - __m256 x3_val = _mm256_maskload_ps(srcptr + offset_ptr[3], _mm256_set1_epi32(in_bound)); + __m256 x0_val = offset_ptr[0] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[0]) : _mm256_set1_ps(0); + __m256 x1_val = offset_ptr[1] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[1]) : _mm256_set1_ps(0); + __m256 x2_val = offset_ptr[2] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[2]) : _mm256_set1_ps(0); + __m256 x3_val = offset_ptr[3] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[3]) : _mm256_set1_ps(0); value_f[ii] = _mm256_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); diff --git a/src/layer/x86/gridsample_bilinear_apply_interpolation.h b/src/layer/x86/gridsample_bilinear_apply_interpolation.h index 4ce77cf1724..8484e8e28f8 100644 --- a/src/layer/x86/gridsample_bilinear_apply_interpolation.h +++ b/src/layer/x86/gridsample_bilinear_apply_interpolation.h @@ -85,6 +85,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& __m512 v110_val = offset_ptr[6] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[6]) : _mm512_set1_ps(0); __m512 v111_val = offset_ptr[7] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[7]) : _mm512_set1_ps(0); + __m512 value = _mm512_set1_ps(value_ptr[0]); __m512 v00 = _mm512_fmadd_ps(v001_val, value, _mm512_fnmadd_ps(v000_val, value, v000_val)); __m512 v01 = _mm512_fmadd_ps(v011_val, value, _mm512_fnmadd_ps(v010_val, value, v010_val)); @@ -127,14 +128,10 @@ static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d const int* offset_ptr = (int*)offset_value_ptr; const float* value_ptr = offset_value_ptr + 4; - int in_bound = offset_ptr[0] >= 0 ? -1 : 0; - __m256 v00_val = _mm256_maskload_ps(srcptr + offset_ptr[0], _mm256_set1_epi32(in_bound)); - in_bound = offset_ptr[1] >= 0 ? -1 : 0; - __m256 v01_val = _mm256_maskload_ps(srcptr + offset_ptr[1], _mm256_set1_epi32(in_bound)); - in_bound = offset_ptr[2] >= 0 ? -1 : 0; - __m256 v10_val = _mm256_maskload_ps(srcptr + offset_ptr[2], _mm256_set1_epi32(in_bound)); - in_bound = offset_ptr[3] >= 0 ? -1 : 0; - __m256 v11_val = _mm256_maskload_ps(srcptr + offset_ptr[3], _mm256_set1_epi32(in_bound)); + __m256 v00_val = offset_ptr[0] >= 0 ? 
_mm256_load_ps(srcptr + offset_ptr[0]) : _mm256_set1_ps(0); + __m256 v01_val = offset_ptr[1] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[1]) : _mm256_set1_ps(0); + __m256 v10_val = offset_ptr[2] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[2]) : _mm256_set1_ps(0); + __m256 v11_val = offset_ptr[3] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[3]) : _mm256_set1_ps(0); __m256 value1 = _mm256_set1_ps(value_ptr[0]); __m256 v0 = _mm256_comp_fmadd_ps(v01_val, value1, _mm256_comp_fnmadd_ps(v00_val, value1, v00_val)); @@ -170,23 +167,15 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d const int* offset_ptr = (int*)offset_value_ptr; const float* value_ptr = offset_value_ptr + 8; - int in_bound = offset_ptr[0] >= 0 ? -1 : 0; - __m256 v000_val = _mm256_maskload_ps(srcptr + offset_ptr[0], _mm256_set1_epi32(in_bound)); - in_bound = offset_ptr[1] >= 0 ? -1 : 0; - __m256 v001_val = _mm256_maskload_ps(srcptr + offset_ptr[1], _mm256_set1_epi32(in_bound)); - in_bound = offset_ptr[2] >= 0 ? -1 : 0; - __m256 v010_val = _mm256_maskload_ps(srcptr + offset_ptr[2], _mm256_set1_epi32(in_bound)); - in_bound = offset_ptr[3] >= 0 ? -1 : 0; - __m256 v011_val = _mm256_maskload_ps(srcptr + offset_ptr[3], _mm256_set1_epi32(in_bound)); - - in_bound = offset_ptr[4] >= 0 ? -1 : 0; - __m256 v100_val = _mm256_maskload_ps(srcptr + offset_ptr[4], _mm256_set1_epi32(in_bound)); - in_bound = offset_ptr[5] >= 0 ? -1 : 0; - __m256 v101_val = _mm256_maskload_ps(srcptr + offset_ptr[5], _mm256_set1_epi32(in_bound)); - in_bound = offset_ptr[6] >= 0 ? -1 : 0; - __m256 v110_val = _mm256_maskload_ps(srcptr + offset_ptr[6], _mm256_set1_epi32(in_bound)); - in_bound = offset_ptr[7] >= 0 ? -1 : 0; - __m256 v111_val = _mm256_maskload_ps(srcptr + offset_ptr[7], _mm256_set1_epi32(in_bound)); + __m256 v000_val = offset_ptr[0] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[0]) : _mm256_set1_ps(0); + __m256 v001_val = offset_ptr[1] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[1]) : _mm256_set1_ps(0); + __m256 v010_val = offset_ptr[2] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[2]) : _mm256_set1_ps(0); + __m256 v011_val = offset_ptr[3] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[3]) : _mm256_set1_ps(0); + + __m256 v100_val = offset_ptr[4] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[4]) : _mm256_set1_ps(0); + __m256 v101_val = offset_ptr[5] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[5]) : _mm256_set1_ps(0); + __m256 v110_val = offset_ptr[6] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[6]) : _mm256_set1_ps(0); + __m256 v111_val = offset_ptr[7] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[7]) : _mm256_set1_ps(0); __m256 value = _mm256_set1_ps(value_ptr[0]); __m256 v00 = _mm256_comp_fmadd_ps(v001_val, value, _mm256_comp_fnmadd_ps(v000_val, value, v000_val)); @@ -266,7 +255,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d { const int* offset_ptr = (int*)offset_value_ptr; const float* value_ptr = offset_value_ptr + 8; - + __m128 v000_val = offset_ptr[0] >= 0 ? _mm_load_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); __m128 v001_val = offset_ptr[1] >= 0 ? _mm_load_ps(srcptr + offset_ptr[1]) : _mm_set1_ps(0); __m128 v010_val = offset_ptr[2] >= 0 ? 
_mm_load_ps(srcptr + offset_ptr[2]) : _mm_set1_ps(0); diff --git a/src/layer/x86/gridsample_nearest_apply_interpolation.h b/src/layer/x86/gridsample_nearest_apply_interpolation.h index fa4c9a505f7..eee5fd32316 100644 --- a/src/layer/x86/gridsample_nearest_apply_interpolation.h +++ b/src/layer/x86/gridsample_nearest_apply_interpolation.h @@ -33,8 +33,7 @@ static void gridsample_nearest_apply_interpolation_p16(const Mat& src, Mat& dst, for (int i = 0; i < grid_size; i++) { - __mmask16 in_bound = offset_ptr[0] >= 0 ? 0xFFFF : 0; - __m512 _v = _mm512_maskz_load_ps(in_bound, srcptr + offset_ptr[0]); + __m512 _v = offset_ptr[0] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[0]) : _mm512_set1_ps(0); offset_ptr++; _mm512_storeu_ps(dstptr, _v); @@ -62,8 +61,7 @@ static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, for (int i = 0; i < grid_size; i++) { - int in_bound = offset_ptr[0] >= 0 ? -1 : 0; - __m256 _v = _mm256_maskload_ps(srcptr + offset_ptr[0], _mm256_set1_epi32(in_bound)); + __m256 _v = offset_ptr[0] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[0]) : _mm256_set1_ps(0); offset_ptr++; _mm256_storeu_ps(dstptr, _v); From 34802ee7436134ac4ceb204f6d9545aa8febecd1 Mon Sep 17 00:00:00 2001 From: Yoh-Z Date: Sun, 13 Aug 2023 15:22:10 +0000 Subject: [PATCH 123/127] apply code-format changes --- src/layer/x86/gridsample_bilinear_apply_interpolation.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/layer/x86/gridsample_bilinear_apply_interpolation.h b/src/layer/x86/gridsample_bilinear_apply_interpolation.h index 8484e8e28f8..0f94b4e0a60 100644 --- a/src/layer/x86/gridsample_bilinear_apply_interpolation.h +++ b/src/layer/x86/gridsample_bilinear_apply_interpolation.h @@ -85,7 +85,6 @@ static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& __m512 v110_val = offset_ptr[6] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[6]) : _mm512_set1_ps(0); __m512 v111_val = offset_ptr[7] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[7]) : _mm512_set1_ps(0); - __m512 value = _mm512_set1_ps(value_ptr[0]); __m512 v00 = _mm512_fmadd_ps(v001_val, value, _mm512_fnmadd_ps(v000_val, value, v000_val)); __m512 v01 = _mm512_fmadd_ps(v011_val, value, _mm512_fnmadd_ps(v010_val, value, v010_val)); @@ -171,7 +170,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d __m256 v001_val = offset_ptr[1] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[1]) : _mm256_set1_ps(0); __m256 v010_val = offset_ptr[2] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[2]) : _mm256_set1_ps(0); __m256 v011_val = offset_ptr[3] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[3]) : _mm256_set1_ps(0); - + __m256 v100_val = offset_ptr[4] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[4]) : _mm256_set1_ps(0); __m256 v101_val = offset_ptr[5] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[5]) : _mm256_set1_ps(0); __m256 v110_val = offset_ptr[6] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[6]) : _mm256_set1_ps(0); @@ -255,7 +254,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d { const int* offset_ptr = (int*)offset_value_ptr; const float* value_ptr = offset_value_ptr + 8; - + __m128 v000_val = offset_ptr[0] >= 0 ? _mm_load_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); __m128 v001_val = offset_ptr[1] >= 0 ? _mm_load_ps(srcptr + offset_ptr[1]) : _mm_set1_ps(0); __m128 v010_val = offset_ptr[2] >= 0 ? 
_mm_load_ps(srcptr + offset_ptr[2]) : _mm_set1_ps(0); From ba1c24040f363eb6da268909d1213aec4986b934 Mon Sep 17 00:00:00 2001 From: Yoh Date: Mon, 14 Aug 2023 00:17:43 +0800 Subject: [PATCH 124/127] fix data unaligned --- .../x86/gridsample_bicubic_compute_blob.h | 8 ++++---- .../x86/gridsample_bilinear_compute_blob.h | 20 +++++++++---------- .../x86/gridsample_nearest_compute_blob.h | 20 +++++++++---------- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index 9006153d9d2..d6ceeed2e51 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -31,8 +31,8 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of #if __AVX__ for (; x + 15 < grid_size; x += 16) { - __m256 gx = _mm256_loadu_ps(gridptr); - __m256 gy = _mm256_loadu_ps(gridptr + 8); + __m256 gx = _mm256_load_ps(gridptr); + __m256 gy = _mm256_load_ps(gridptr + 8); transpose2x8_ps(gx, gy); @@ -167,8 +167,8 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of #if __AVX__ for (; x + 7 < grid_size; x += 8) { - __m256 gx = _mm256_loadu_ps(gridptr_x); - __m256 gy = _mm256_loadu_ps(gridptr_y); + __m256 gx = _mm256_load_ps(gridptr_x); + __m256 gy = _mm256_load_ps(gridptr_y); gx = unormalize(_mm256_set1_ps(src.w), gx); gy = unormalize(_mm256_set1_ps(src.h), gy); diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index caf9208bdde..84458f9d3ee 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -31,8 +31,8 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o #if __AVX__ for (; x + 15 < grid_size; x += 16) { - __m256 gx = _mm256_loadu_ps(gridptr); - __m256 gy = _mm256_loadu_ps(gridptr + 8); + __m256 gx = _mm256_load_ps(gridptr); + __m256 gy = _mm256_load_ps(gridptr + 8); transpose2x8_ps(gx, gy); @@ -142,8 +142,8 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o #if __AVX__ for (; x + 7 < grid_size; x += 8) { - __m256 gx = _mm256_loadu_ps(gridptr_x); - __m256 gy = _mm256_loadu_ps(gridptr_y); + __m256 gx = _mm256_load_ps(gridptr_x); + __m256 gy = _mm256_load_ps(gridptr_y); gx = unormalize(_mm256_set1_ps(src.w), gx); gx = get_coord(_mm256_set1_ps(src.w), gx); @@ -265,9 +265,9 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o #if __AVX__ for (; x + 23 < grid_size; x += 24) { - __m256 gx = _mm256_loadu_ps(gridptr); - __m256 gy = _mm256_loadu_ps(gridptr + 8); - __m256 gz = _mm256_loadu_ps(gridptr + 16); + __m256 gx = _mm256_load_ps(gridptr); + __m256 gy = _mm256_load_ps(gridptr + 8); + __m256 gz = _mm256_load_ps(gridptr + 16); transpose3x8_ps(gx, gy, gz); @@ -447,9 +447,9 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o #if __AVX__ for (; x + 7 < grid_size; x += 8) { - __m256 gx = _mm256_loadu_ps(gridptr_x); - __m256 gy = _mm256_loadu_ps(gridptr_y); - __m256 gz = _mm256_loadu_ps(gridptr_z); + __m256 gx = _mm256_load_ps(gridptr_x); + __m256 gy = _mm256_load_ps(gridptr_y); + __m256 gz = _mm256_load_ps(gridptr_z); gx = unormalize(_mm256_set1_ps(src.w), gx); gx = get_coord(_mm256_set1_ps(src.w), gx); diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index a7a12066d21..f66ca7437be 100644 --- 
a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -31,8 +31,8 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of #if __AVX__ for (; x + 15 < grid_size; x += 16) { - __m256 gx = _mm256_loadu_ps(gridptr); - __m256 gy = _mm256_loadu_ps(gridptr + 8); + __m256 gx = _mm256_load_ps(gridptr); + __m256 gy = _mm256_load_ps(gridptr + 8); transpose2x8_ps(gx, gy); @@ -93,8 +93,8 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of #if __AVX__ for (; x + 7 < grid_size; x += 8) { - __m256 gx = _mm256_loadu_ps(gridptr_x); - __m256 gy = _mm256_loadu_ps(gridptr_y); + __m256 gx = _mm256_load_ps(gridptr_x); + __m256 gy = _mm256_load_ps(gridptr_y); gx = unormalize(_mm256_set1_ps(src.w), gx); gx = get_coord(_mm256_set1_ps(src.w), gx); @@ -167,9 +167,9 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of #if __AVX__ for (; x + 23 < grid_size; x += 24) { - __m256 gx = _mm256_loadu_ps(gridptr); - __m256 gy = _mm256_loadu_ps(gridptr + 8); - __m256 gz = _mm256_loadu_ps(gridptr + 16); + __m256 gx = _mm256_load_ps(gridptr); + __m256 gy = _mm256_load_ps(gridptr + 8); + __m256 gz = _mm256_load_ps(gridptr + 16); transpose3x8_ps(gx, gy, gz); @@ -243,9 +243,9 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of #if __AVX__ for (; x + 7 < grid_size; x += 8) { - __m256 gx = _mm256_loadu_ps(gridptr_x); - __m256 gy = _mm256_loadu_ps(gridptr_y); - __m256 gz = _mm256_loadu_ps(gridptr_z); + __m256 gx = _mm256_load_ps(gridptr_x); + __m256 gy = _mm256_load_ps(gridptr_y); + __m256 gz = _mm256_load_ps(gridptr_z); gx = unormalize(_mm256_set1_ps(src.w), gx); gx = get_coord(_mm256_set1_ps(src.w), gx); From 2ca7c7d4c758fc7f47de08a51faebdc007f065c4 Mon Sep 17 00:00:00 2001 From: Yoh Date: Mon, 14 Aug 2023 02:07:58 +0800 Subject: [PATCH 125/127] fix data store bug --- .../gridsample_bicubic_apply_interpolation.h | 6 +- .../x86/gridsample_bicubic_compute_blob.h | 24 +++---- .../gridsample_bilinear_apply_interpolation.h | 12 ++-- .../x86/gridsample_bilinear_compute_blob.h | 68 +++++++++---------- .../gridsample_nearest_apply_interpolation.h | 6 +- .../x86/gridsample_nearest_compute_blob.h | 8 +-- 6 files changed, 62 insertions(+), 62 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_apply_interpolation.h b/src/layer/x86/gridsample_bicubic_apply_interpolation.h index 5647bf8928e..bb2e69f01a4 100644 --- a/src/layer/x86/gridsample_bicubic_apply_interpolation.h +++ b/src/layer/x86/gridsample_bicubic_apply_interpolation.h @@ -73,7 +73,7 @@ static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& d _v = _mm512_fmadd_ps(y_coeffs1, value_f[1], _v); _v = _mm512_fmadd_ps(y_coeffs2, value_f[2], _v); _v = _mm512_fmadd_ps(y_coeffs3, value_f[3], _v); - _mm512_storeu_ps(dstptr, _v); + _mm512_store_ps(dstptr, _v); dstptr += 16; offset_value_ptr += 18; @@ -141,7 +141,7 @@ static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& ds _v = _mm256_comp_fmadd_ps(y_coeffs1, value_f[1], _v); _v = _mm256_comp_fmadd_ps(y_coeffs2, value_f[2], _v); _v = _mm256_comp_fmadd_ps(y_coeffs3, value_f[3], _v); - _mm256_storeu_ps(dstptr, _v); + _mm256_store_ps(dstptr, _v); dstptr += 8; offset_value_ptr += 18; @@ -210,7 +210,7 @@ static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& ds _v = _mm_comp_fmadd_ps(y_coeffs1, value_f[1], _v); _v = _mm_comp_fmadd_ps(y_coeffs2, value_f[2], _v); _v = _mm_comp_fmadd_ps(y_coeffs3, 
value_f[3], _v); - _mm_storeu_ps(dstptr, _v); + _mm_store_ps(dstptr, _v); dstptr += 4; offset_value_ptr += 18; diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index d6ceeed2e51..f1248ba923e 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -87,15 +87,15 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of transpose8x18_ps(tx, ty, v0_offset_f[0], v1_offset_f[0], v2_offset_f[0], v3_offset_f[0], v0_offset_f[1], v1_offset_f[1], v2_offset_f[1], v3_offset_f[1], v0_offset_f[2], v1_offset_f[2], v2_offset_f[2], v3_offset_f[2], v0_offset_f[3], v1_offset_f[3], v2_offset_f[3], v3_offset_f[3]); - _mm256_storeu_ps(offset_value_ptr, tx); - _mm256_storeu_ps(offset_value_ptr + 8, ty); + _mm256_store_ps(offset_value_ptr, tx); + _mm256_store_ps(offset_value_ptr + 8, ty); offset_value_ptr += 16; for (int i = 0; i < 4; i++) { - _mm256_storeu_ps(offset_value_ptr, v0_offset_f[i]); - _mm256_storeu_ps(offset_value_ptr + 8, v1_offset_f[i]); - _mm256_storeu_ps(offset_value_ptr + 16, v2_offset_f[i]); - _mm256_storeu_ps(offset_value_ptr + 24, v3_offset_f[i]); + _mm256_store_ps(offset_value_ptr, v0_offset_f[i]); + _mm256_store_ps(offset_value_ptr + 8, v1_offset_f[i]); + _mm256_store_ps(offset_value_ptr + 16, v2_offset_f[i]); + _mm256_store_ps(offset_value_ptr + 24, v3_offset_f[i]); offset_value_ptr += 32; } gridptr += 16; @@ -222,15 +222,15 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of transpose8x18_ps(tx, ty, v0_offset_f[0], v1_offset_f[0], v2_offset_f[0], v3_offset_f[0], v0_offset_f[1], v1_offset_f[1], v2_offset_f[1], v3_offset_f[1], v0_offset_f[2], v1_offset_f[2], v2_offset_f[2], v3_offset_f[2], v0_offset_f[3], v1_offset_f[3], v2_offset_f[3], v3_offset_f[3]); - _mm256_storeu_ps(offset_value_ptr, tx); - _mm256_storeu_ps(offset_value_ptr + 8, ty); + _mm256_store_ps(offset_value_ptr, tx); + _mm256_store_ps(offset_value_ptr + 8, ty); offset_value_ptr += 16; for (int i = 0; i < 4; i++) { - _mm256_storeu_ps(offset_value_ptr, v0_offset_f[i]); - _mm256_storeu_ps(offset_value_ptr + 8, v1_offset_f[i]); - _mm256_storeu_ps(offset_value_ptr + 16, v2_offset_f[i]); - _mm256_storeu_ps(offset_value_ptr + 24, v3_offset_f[i]); + _mm256_store_ps(offset_value_ptr, v0_offset_f[i]); + _mm256_store_ps(offset_value_ptr + 8, v1_offset_f[i]); + _mm256_store_ps(offset_value_ptr + 16, v2_offset_f[i]); + _mm256_store_ps(offset_value_ptr + 24, v3_offset_f[i]); offset_value_ptr += 32; } diff --git a/src/layer/x86/gridsample_bilinear_apply_interpolation.h b/src/layer/x86/gridsample_bilinear_apply_interpolation.h index 0f94b4e0a60..ad11d5832f5 100644 --- a/src/layer/x86/gridsample_bilinear_apply_interpolation.h +++ b/src/layer/x86/gridsample_bilinear_apply_interpolation.h @@ -46,7 +46,7 @@ static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& __m512 value2 = _mm512_set1_ps(value_ptr[1]); __m512 _v = _mm512_fmadd_ps(v1, value2, _mm512_fnmadd_ps(v0, value2, v0)); - _mm512_storeu_ps(dstptr, _v); + _mm512_store_ps(dstptr, _v); dstptr += 16; offset_value_ptr += 6; @@ -97,7 +97,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& value = _mm512_set1_ps(value_ptr[2]); __m512 _v = _mm512_fmadd_ps(v1, value, _mm512_fnmadd_ps(v0, value, v0)); - _mm512_storeu_ps(dstptr, _v); + _mm512_store_ps(dstptr, _v); dstptr += 16; offset_value_ptr += 11; @@ -138,7 +138,7 @@ static void 
gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d __m256 value2 = _mm256_set1_ps(value_ptr[1]); __m256 _v = _mm256_comp_fmadd_ps(v1, value2, _mm256_comp_fnmadd_ps(v0, value2, v0)); - _mm256_storeu_ps(dstptr, _v); + _mm256_store_ps(dstptr, _v); dstptr += 8; offset_value_ptr += 6; @@ -188,7 +188,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d value = _mm256_set1_ps(value_ptr[2]); __m256 _v = _mm256_comp_fmadd_ps(v1, value, _mm256_comp_fnmadd_ps(v0, value, v0)); - _mm256_storeu_ps(dstptr, _v); + _mm256_store_ps(dstptr, _v); dstptr += 8; offset_value_ptr += 11; @@ -227,7 +227,7 @@ static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d __m128 value2 = _mm_set1_ps(value_ptr[1]); __m128 _v = _mm_comp_fmadd_ps(v1, value2, _mm_comp_fnmadd_ps(v0, value2, v0)); - _mm_storeu_ps(dstptr, _v); + _mm_store_ps(dstptr, _v); dstptr += 4; offset_value_ptr += 6; @@ -277,7 +277,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d value = _mm_set1_ps(value_ptr[2]); __m128 _v = _mm_comp_fmadd_ps(v1, value, _mm_comp_fnmadd_ps(v0, value, v0)); - _mm_storeu_ps(dstptr, _v); + _mm_store_ps(dstptr, _v); dstptr += 4; offset_value_ptr += 11; diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index 84458f9d3ee..cc7bcd80ae4 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -78,13 +78,13 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o transpose8x6_ps(nw_offset, ne_offset, sw_offset, se_offset, alpha, beta); - _mm256_storeu_ps(offset_value_ptr, nw_offset); - _mm256_storeu_ps(offset_value_ptr + 8, ne_offset); - _mm256_storeu_ps(offset_value_ptr + 16, sw_offset); - _mm256_storeu_ps(offset_value_ptr + 24, se_offset); + _mm256_store_ps(offset_value_ptr, nw_offset); + _mm256_store_ps(offset_value_ptr + 8, ne_offset); + _mm256_store_ps(offset_value_ptr + 16, sw_offset); + _mm256_store_ps(offset_value_ptr + 24, se_offset); - _mm256_storeu_ps(offset_value_ptr + 32, alpha); - _mm256_storeu_ps(offset_value_ptr + 40, beta); + _mm256_store_ps(offset_value_ptr + 32, alpha); + _mm256_store_ps(offset_value_ptr + 40, beta); gridptr += 16; offset_value_ptr += 48; @@ -187,13 +187,13 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o transpose8x6_ps(nw_offset, ne_offset, sw_offset, se_offset, alpha, beta); - _mm256_storeu_ps(offset_value_ptr, nw_offset); - _mm256_storeu_ps(offset_value_ptr + 8, ne_offset); - _mm256_storeu_ps(offset_value_ptr + 16, sw_offset); - _mm256_storeu_ps(offset_value_ptr + 24, se_offset); + _mm256_store_ps(offset_value_ptr, nw_offset); + _mm256_store_ps(offset_value_ptr + 8, ne_offset); + _mm256_store_ps(offset_value_ptr + 16, sw_offset); + _mm256_store_ps(offset_value_ptr + 24, se_offset); - _mm256_storeu_ps(offset_value_ptr + 32, alpha); - _mm256_storeu_ps(offset_value_ptr + 40, beta); + _mm256_store_ps(offset_value_ptr + 32, alpha); + _mm256_store_ps(offset_value_ptr + 40, beta); gridptr_x += 8; gridptr_y += 8; @@ -351,19 +351,19 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o transpose8x11_ps(tnw_offset, tne_offset, tsw_offset, tse_offset, bnw_offset, bne_offset, bsw_offset, bse_offset, alpha, beta, gamma); - _mm256_storeu_ps(offset_value_ptr, tnw_offset); - _mm256_storeu_ps(offset_value_ptr + 8, tne_offset); - _mm256_storeu_ps(offset_value_ptr + 16, tsw_offset); - 
_mm256_storeu_ps(offset_value_ptr + 24, tse_offset); + _mm256_store_ps(offset_value_ptr, tnw_offset); + _mm256_store_ps(offset_value_ptr + 8, tne_offset); + _mm256_store_ps(offset_value_ptr + 16, tsw_offset); + _mm256_store_ps(offset_value_ptr + 24, tse_offset); - _mm256_storeu_ps(offset_value_ptr, bnw_offset); - _mm256_storeu_ps(offset_value_ptr + 8, bne_offset); - _mm256_storeu_ps(offset_value_ptr + 16, bsw_offset); - _mm256_storeu_ps(offset_value_ptr + 24, bse_offset); + _mm256_store_ps(offset_value_ptr + 32, bnw_offset); + _mm256_store_ps(offset_value_ptr + 40, bne_offset); + _mm256_store_ps(offset_value_ptr + 48, bsw_offset); + _mm256_store_ps(offset_value_ptr + 56, bse_offset); - _mm256_storeu_ps(offset_value_ptr + 64, alpha); - _mm256_storeu_ps(offset_value_ptr + 72, beta); - _mm256_storeu_ps(offset_value_ptr + 80, gamma); + _mm256_store_ps(offset_value_ptr + 64, alpha); + _mm256_store_ps(offset_value_ptr + 72, beta); + _mm256_store_ps(offset_value_ptr + 80, gamma); gridptr += 24; @@ -531,19 +531,19 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o transpose8x11_ps(tnw_offset, tne_offset, tsw_offset, tse_offset, bnw_offset, bne_offset, bsw_offset, bse_offset, alpha, beta, gamma); - _mm256_storeu_ps(offset_value_ptr, tnw_offset); - _mm256_storeu_ps(offset_value_ptr + 8, tne_offset); - _mm256_storeu_ps(offset_value_ptr + 16, tsw_offset); - _mm256_storeu_ps(offset_value_ptr + 24, tse_offset); + _mm256_store_ps(offset_value_ptr, tnw_offset); + _mm256_store_ps(offset_value_ptr + 8, tne_offset); + _mm256_store_ps(offset_value_ptr + 16, tsw_offset); + _mm256_store_ps(offset_value_ptr + 24, tse_offset); - _mm256_storeu_ps(offset_value_ptr, bnw_offset); - _mm256_storeu_ps(offset_value_ptr + 8, bne_offset); - _mm256_storeu_ps(offset_value_ptr + 16, bsw_offset); - _mm256_storeu_ps(offset_value_ptr + 24, bse_offset); + _mm256_store_ps(offset_value_ptr + 32, bnw_offset); + _mm256_store_ps(offset_value_ptr + 40, bne_offset); + _mm256_store_ps(offset_value_ptr + 48, bsw_offset); + _mm256_store_ps(offset_value_ptr + 56, bse_offset); - _mm256_storeu_ps(offset_value_ptr + 64, alpha); - _mm256_storeu_ps(offset_value_ptr + 72, beta); - _mm256_storeu_ps(offset_value_ptr + 80, gamma); + _mm256_store_ps(offset_value_ptr + 64, alpha); + _mm256_store_ps(offset_value_ptr + 72, beta); + _mm256_store_ps(offset_value_ptr + 80, gamma); gridptr_x += 8; gridptr_y += 8; diff --git a/src/layer/x86/gridsample_nearest_apply_interpolation.h b/src/layer/x86/gridsample_nearest_apply_interpolation.h index eee5fd32316..b17f5786528 100644 --- a/src/layer/x86/gridsample_nearest_apply_interpolation.h +++ b/src/layer/x86/gridsample_nearest_apply_interpolation.h @@ -36,7 +36,7 @@ static void gridsample_nearest_apply_interpolation_p16(const Mat& src, Mat& dst, __m512 _v = offset_ptr[0] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[0]) : _mm512_set1_ps(0); offset_ptr++; - _mm512_storeu_ps(dstptr, _v); + _mm512_store_ps(dstptr, _v); dstptr += 16; } } @@ -64,7 +64,7 @@ static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, __m256 _v = offset_ptr[0] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[0]) : _mm256_set1_ps(0); offset_ptr++; - _mm256_storeu_ps(dstptr, _v); + _mm256_store_ps(dstptr, _v); dstptr += 8; } } @@ -91,7 +91,7 @@ static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, __m128 _v = offset_ptr[0] >= 0 ? 
_mm_load_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); offset_ptr++; - _mm_storeu_ps(dstptr, _v); + _mm_store_ps(dstptr, _v); dstptr += 4; } } diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index f66ca7437be..fb203323100 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -52,7 +52,7 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), _mm256_castsi256_ps(_mm256_cvtps_epi32(offset)), v_in_range); - _mm256_storeu_ps(offset_ptr, offset); + _mm256_store_ps(offset_ptr, offset); gridptr += 16; offset_ptr += 8; @@ -112,7 +112,7 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), _mm256_castsi256_ps(_mm256_cvtps_epi32(offset)), v_in_range); - _mm256_storeu_ps(offset_ptr, offset); + _mm256_store_ps(offset_ptr, offset); gridptr_x += 8; gridptr_y += 8; @@ -196,7 +196,7 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), _mm256_castsi256_ps(_mm256_cvtps_epi32(offset)), v_in_range); - _mm256_storeu_ps(offset_ptr, offset); + _mm256_store_ps(offset_ptr, offset); gridptr += 24; offset_ptr += 8; @@ -270,7 +270,7 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), _mm256_castsi256_ps(_mm256_cvtps_epi32(offset)), v_in_range); - _mm256_storeu_ps(offset_ptr, offset); + _mm256_store_ps(offset_ptr, offset); gridptr_x += 8; gridptr_y += 8; From 73bd9a736d6850757cd2fca64109f0655fe59e8e Mon Sep 17 00:00:00 2001 From: Yoh Date: Mon, 14 Aug 2023 02:52:19 +0800 Subject: [PATCH 126/127] fix unaligned data load and store --- .../gridsample_bicubic_apply_interpolation.h | 30 +++---- .../x86/gridsample_bicubic_compute_blob.h | 32 +++---- .../gridsample_bilinear_apply_interpolation.h | 84 +++++++++--------- .../x86/gridsample_bilinear_compute_blob.h | 88 +++++++++---------- .../gridsample_nearest_apply_interpolation.h | 12 +-- .../x86/gridsample_nearest_compute_blob.h | 28 +++--- 6 files changed, 137 insertions(+), 137 deletions(-) diff --git a/src/layer/x86/gridsample_bicubic_apply_interpolation.h b/src/layer/x86/gridsample_bicubic_apply_interpolation.h index bb2e69f01a4..0b7be771d3b 100644 --- a/src/layer/x86/gridsample_bicubic_apply_interpolation.h +++ b/src/layer/x86/gridsample_bicubic_apply_interpolation.h @@ -56,10 +56,10 @@ static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& d for (int ii = 0; ii < 4; ii++) { - __m512 x0_val = offset_ptr[0] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[0]) : _mm512_set1_ps(0); - __m512 x1_val = offset_ptr[1] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[1]) : _mm512_set1_ps(0); - __m512 x2_val = offset_ptr[2] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[2]) : _mm512_set1_ps(0); - __m512 x3_val = offset_ptr[3] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[3]) : _mm512_set1_ps(0); + __m512 x0_val = offset_ptr[0] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[0]) : _mm512_set1_ps(0); + __m512 x1_val = offset_ptr[1] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[1]) : _mm512_set1_ps(0); + __m512 x2_val = offset_ptr[2] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[2]) : _mm512_set1_ps(0); + __m512 x3_val = offset_ptr[3] >= 0 ? 
_mm512_loadu_ps(srcptr + offset_ptr[3]) : _mm512_set1_ps(0); value_f[ii] = _mm512_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm512_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); @@ -73,7 +73,7 @@ static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& d _v = _mm512_fmadd_ps(y_coeffs1, value_f[1], _v); _v = _mm512_fmadd_ps(y_coeffs2, value_f[2], _v); _v = _mm512_fmadd_ps(y_coeffs3, value_f[3], _v); - _mm512_store_ps(dstptr, _v); + _mm512_storeu_ps(dstptr, _v); dstptr += 16; offset_value_ptr += 18; @@ -124,10 +124,10 @@ static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& ds for (int ii = 0; ii < 4; ii++) { - __m256 x0_val = offset_ptr[0] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[0]) : _mm256_set1_ps(0); - __m256 x1_val = offset_ptr[1] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[1]) : _mm256_set1_ps(0); - __m256 x2_val = offset_ptr[2] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[2]) : _mm256_set1_ps(0); - __m256 x3_val = offset_ptr[3] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[3]) : _mm256_set1_ps(0); + __m256 x0_val = offset_ptr[0] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[0]) : _mm256_set1_ps(0); + __m256 x1_val = offset_ptr[1] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[1]) : _mm256_set1_ps(0); + __m256 x2_val = offset_ptr[2] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[2]) : _mm256_set1_ps(0); + __m256 x3_val = offset_ptr[3] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[3]) : _mm256_set1_ps(0); value_f[ii] = _mm256_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); @@ -141,7 +141,7 @@ static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& ds _v = _mm256_comp_fmadd_ps(y_coeffs1, value_f[1], _v); _v = _mm256_comp_fmadd_ps(y_coeffs2, value_f[2], _v); _v = _mm256_comp_fmadd_ps(y_coeffs3, value_f[3], _v); - _mm256_store_ps(dstptr, _v); + _mm256_storeu_ps(dstptr, _v); dstptr += 8; offset_value_ptr += 18; @@ -193,10 +193,10 @@ static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& ds for (int ii = 0; ii < 4; ii++) { - __m128 x0_val = offset_ptr[0] >= 0 ? _mm_load_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); - __m128 x1_val = offset_ptr[1] >= 0 ? _mm_load_ps(srcptr + offset_ptr[1]) : _mm_set1_ps(0); - __m128 x2_val = offset_ptr[2] >= 0 ? _mm_load_ps(srcptr + offset_ptr[2]) : _mm_set1_ps(0); - __m128 x3_val = offset_ptr[3] >= 0 ? _mm_load_ps(srcptr + offset_ptr[3]) : _mm_set1_ps(0); + __m128 x0_val = offset_ptr[0] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); + __m128 x1_val = offset_ptr[1] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[1]) : _mm_set1_ps(0); + __m128 x2_val = offset_ptr[2] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[2]) : _mm_set1_ps(0); + __m128 x3_val = offset_ptr[3] >= 0 ? 
_mm_loadu_ps(srcptr + offset_ptr[3]) : _mm_set1_ps(0); value_f[ii] = _mm_mul_ps(x_coeffs0, x0_val); value_f[ii] = _mm_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); @@ -210,7 +210,7 @@ static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& ds _v = _mm_comp_fmadd_ps(y_coeffs1, value_f[1], _v); _v = _mm_comp_fmadd_ps(y_coeffs2, value_f[2], _v); _v = _mm_comp_fmadd_ps(y_coeffs3, value_f[3], _v); - _mm_store_ps(dstptr, _v); + _mm_storeu_ps(dstptr, _v); dstptr += 4; offset_value_ptr += 18; diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h index f1248ba923e..9006153d9d2 100644 --- a/src/layer/x86/gridsample_bicubic_compute_blob.h +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -31,8 +31,8 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of #if __AVX__ for (; x + 15 < grid_size; x += 16) { - __m256 gx = _mm256_load_ps(gridptr); - __m256 gy = _mm256_load_ps(gridptr + 8); + __m256 gx = _mm256_loadu_ps(gridptr); + __m256 gy = _mm256_loadu_ps(gridptr + 8); transpose2x8_ps(gx, gy); @@ -87,15 +87,15 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of transpose8x18_ps(tx, ty, v0_offset_f[0], v1_offset_f[0], v2_offset_f[0], v3_offset_f[0], v0_offset_f[1], v1_offset_f[1], v2_offset_f[1], v3_offset_f[1], v0_offset_f[2], v1_offset_f[2], v2_offset_f[2], v3_offset_f[2], v0_offset_f[3], v1_offset_f[3], v2_offset_f[3], v3_offset_f[3]); - _mm256_store_ps(offset_value_ptr, tx); - _mm256_store_ps(offset_value_ptr + 8, ty); + _mm256_storeu_ps(offset_value_ptr, tx); + _mm256_storeu_ps(offset_value_ptr + 8, ty); offset_value_ptr += 16; for (int i = 0; i < 4; i++) { - _mm256_store_ps(offset_value_ptr, v0_offset_f[i]); - _mm256_store_ps(offset_value_ptr + 8, v1_offset_f[i]); - _mm256_store_ps(offset_value_ptr + 16, v2_offset_f[i]); - _mm256_store_ps(offset_value_ptr + 24, v3_offset_f[i]); + _mm256_storeu_ps(offset_value_ptr, v0_offset_f[i]); + _mm256_storeu_ps(offset_value_ptr + 8, v1_offset_f[i]); + _mm256_storeu_ps(offset_value_ptr + 16, v2_offset_f[i]); + _mm256_storeu_ps(offset_value_ptr + 24, v3_offset_f[i]); offset_value_ptr += 32; } gridptr += 16; @@ -167,8 +167,8 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of #if __AVX__ for (; x + 7 < grid_size; x += 8) { - __m256 gx = _mm256_load_ps(gridptr_x); - __m256 gy = _mm256_load_ps(gridptr_y); + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); gx = unormalize(_mm256_set1_ps(src.w), gx); gy = unormalize(_mm256_set1_ps(src.h), gy); @@ -222,15 +222,15 @@ void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& of transpose8x18_ps(tx, ty, v0_offset_f[0], v1_offset_f[0], v2_offset_f[0], v3_offset_f[0], v0_offset_f[1], v1_offset_f[1], v2_offset_f[1], v3_offset_f[1], v0_offset_f[2], v1_offset_f[2], v2_offset_f[2], v3_offset_f[2], v0_offset_f[3], v1_offset_f[3], v2_offset_f[3], v3_offset_f[3]); - _mm256_store_ps(offset_value_ptr, tx); - _mm256_store_ps(offset_value_ptr + 8, ty); + _mm256_storeu_ps(offset_value_ptr, tx); + _mm256_storeu_ps(offset_value_ptr + 8, ty); offset_value_ptr += 16; for (int i = 0; i < 4; i++) { - _mm256_store_ps(offset_value_ptr, v0_offset_f[i]); - _mm256_store_ps(offset_value_ptr + 8, v1_offset_f[i]); - _mm256_store_ps(offset_value_ptr + 16, v2_offset_f[i]); - _mm256_store_ps(offset_value_ptr + 24, v3_offset_f[i]); + _mm256_storeu_ps(offset_value_ptr, v0_offset_f[i]); + _mm256_storeu_ps(offset_value_ptr + 8, 
v1_offset_f[i]); + _mm256_storeu_ps(offset_value_ptr + 16, v2_offset_f[i]); + _mm256_storeu_ps(offset_value_ptr + 24, v3_offset_f[i]); offset_value_ptr += 32; } diff --git a/src/layer/x86/gridsample_bilinear_apply_interpolation.h b/src/layer/x86/gridsample_bilinear_apply_interpolation.h index ad11d5832f5..0af661b6ca0 100644 --- a/src/layer/x86/gridsample_bilinear_apply_interpolation.h +++ b/src/layer/x86/gridsample_bilinear_apply_interpolation.h @@ -35,10 +35,10 @@ static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& const int* offset_ptr = (int*)offset_value_ptr; const float* value_ptr = offset_value_ptr + 4; - __m512 v00_val = offset_ptr[0] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[0]) : _mm512_set1_ps(0); - __m512 v01_val = offset_ptr[1] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[1]) : _mm512_set1_ps(0); - __m512 v10_val = offset_ptr[2] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[2]) : _mm512_set1_ps(0); - __m512 v11_val = offset_ptr[3] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[3]) : _mm512_set1_ps(0); + __m512 v00_val = offset_ptr[0] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[0]) : _mm512_set1_ps(0); + __m512 v01_val = offset_ptr[1] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[1]) : _mm512_set1_ps(0); + __m512 v10_val = offset_ptr[2] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[2]) : _mm512_set1_ps(0); + __m512 v11_val = offset_ptr[3] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[3]) : _mm512_set1_ps(0); __m512 value1 = _mm512_set1_ps(value_ptr[0]); __m512 v0 = _mm512_fmadd_ps(v01_val, value1, _mm512_fnmadd_ps(v00_val, value1, v00_val)); @@ -46,7 +46,7 @@ static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& __m512 value2 = _mm512_set1_ps(value_ptr[1]); __m512 _v = _mm512_fmadd_ps(v1, value2, _mm512_fnmadd_ps(v0, value2, v0)); - _mm512_store_ps(dstptr, _v); + _mm512_storeu_ps(dstptr, _v); dstptr += 16; offset_value_ptr += 6; @@ -75,15 +75,15 @@ static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& const int* offset_ptr = (int*)offset_value_ptr; const float* value_ptr = offset_value_ptr + 8; - __m512 v000_val = offset_ptr[0] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[0]) : _mm512_set1_ps(0); - __m512 v001_val = offset_ptr[1] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[1]) : _mm512_set1_ps(0); - __m512 v010_val = offset_ptr[2] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[2]) : _mm512_set1_ps(0); - __m512 v011_val = offset_ptr[3] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[3]) : _mm512_set1_ps(0); + __m512 v000_val = offset_ptr[0] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[0]) : _mm512_set1_ps(0); + __m512 v001_val = offset_ptr[1] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[1]) : _mm512_set1_ps(0); + __m512 v010_val = offset_ptr[2] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[2]) : _mm512_set1_ps(0); + __m512 v011_val = offset_ptr[3] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[3]) : _mm512_set1_ps(0); - __m512 v100_val = offset_ptr[4] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[4]) : _mm512_set1_ps(0); - __m512 v101_val = offset_ptr[5] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[5]) : _mm512_set1_ps(0); - __m512 v110_val = offset_ptr[6] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[6]) : _mm512_set1_ps(0); - __m512 v111_val = offset_ptr[7] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[7]) : _mm512_set1_ps(0); + __m512 v100_val = offset_ptr[4] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[4]) : _mm512_set1_ps(0); + __m512 v101_val = offset_ptr[5] >= 0 ? 
_mm512_loadu_ps(srcptr + offset_ptr[5]) : _mm512_set1_ps(0); + __m512 v110_val = offset_ptr[6] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[6]) : _mm512_set1_ps(0); + __m512 v111_val = offset_ptr[7] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[7]) : _mm512_set1_ps(0); __m512 value = _mm512_set1_ps(value_ptr[0]); __m512 v00 = _mm512_fmadd_ps(v001_val, value, _mm512_fnmadd_ps(v000_val, value, v000_val)); @@ -97,7 +97,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& value = _mm512_set1_ps(value_ptr[2]); __m512 _v = _mm512_fmadd_ps(v1, value, _mm512_fnmadd_ps(v0, value, v0)); - _mm512_store_ps(dstptr, _v); + _mm512_storeu_ps(dstptr, _v); dstptr += 16; offset_value_ptr += 11; @@ -127,10 +127,10 @@ static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d const int* offset_ptr = (int*)offset_value_ptr; const float* value_ptr = offset_value_ptr + 4; - __m256 v00_val = offset_ptr[0] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[0]) : _mm256_set1_ps(0); - __m256 v01_val = offset_ptr[1] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[1]) : _mm256_set1_ps(0); - __m256 v10_val = offset_ptr[2] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[2]) : _mm256_set1_ps(0); - __m256 v11_val = offset_ptr[3] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[3]) : _mm256_set1_ps(0); + __m256 v00_val = offset_ptr[0] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[0]) : _mm256_set1_ps(0); + __m256 v01_val = offset_ptr[1] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[1]) : _mm256_set1_ps(0); + __m256 v10_val = offset_ptr[2] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[2]) : _mm256_set1_ps(0); + __m256 v11_val = offset_ptr[3] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[3]) : _mm256_set1_ps(0); __m256 value1 = _mm256_set1_ps(value_ptr[0]); __m256 v0 = _mm256_comp_fmadd_ps(v01_val, value1, _mm256_comp_fnmadd_ps(v00_val, value1, v00_val)); @@ -138,7 +138,7 @@ static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d __m256 value2 = _mm256_set1_ps(value_ptr[1]); __m256 _v = _mm256_comp_fmadd_ps(v1, value2, _mm256_comp_fnmadd_ps(v0, value2, v0)); - _mm256_store_ps(dstptr, _v); + _mm256_storeu_ps(dstptr, _v); dstptr += 8; offset_value_ptr += 6; @@ -166,15 +166,15 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d const int* offset_ptr = (int*)offset_value_ptr; const float* value_ptr = offset_value_ptr + 8; - __m256 v000_val = offset_ptr[0] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[0]) : _mm256_set1_ps(0); - __m256 v001_val = offset_ptr[1] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[1]) : _mm256_set1_ps(0); - __m256 v010_val = offset_ptr[2] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[2]) : _mm256_set1_ps(0); - __m256 v011_val = offset_ptr[3] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[3]) : _mm256_set1_ps(0); + __m256 v000_val = offset_ptr[0] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[0]) : _mm256_set1_ps(0); + __m256 v001_val = offset_ptr[1] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[1]) : _mm256_set1_ps(0); + __m256 v010_val = offset_ptr[2] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[2]) : _mm256_set1_ps(0); + __m256 v011_val = offset_ptr[3] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[3]) : _mm256_set1_ps(0); - __m256 v100_val = offset_ptr[4] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[4]) : _mm256_set1_ps(0); - __m256 v101_val = offset_ptr[5] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[5]) : _mm256_set1_ps(0); - __m256 v110_val = offset_ptr[6] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[6]) : _mm256_set1_ps(0); - __m256 v111_val = offset_ptr[7] >= 0 ? 
_mm256_load_ps(srcptr + offset_ptr[7]) : _mm256_set1_ps(0); + __m256 v100_val = offset_ptr[4] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[4]) : _mm256_set1_ps(0); + __m256 v101_val = offset_ptr[5] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[5]) : _mm256_set1_ps(0); + __m256 v110_val = offset_ptr[6] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[6]) : _mm256_set1_ps(0); + __m256 v111_val = offset_ptr[7] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[7]) : _mm256_set1_ps(0); __m256 value = _mm256_set1_ps(value_ptr[0]); __m256 v00 = _mm256_comp_fmadd_ps(v001_val, value, _mm256_comp_fnmadd_ps(v000_val, value, v000_val)); @@ -188,7 +188,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& d value = _mm256_set1_ps(value_ptr[2]); __m256 _v = _mm256_comp_fmadd_ps(v1, value, _mm256_comp_fnmadd_ps(v0, value, v0)); - _mm256_store_ps(dstptr, _v); + _mm256_storeu_ps(dstptr, _v); dstptr += 8; offset_value_ptr += 11; @@ -216,10 +216,10 @@ static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d const int* offset_ptr = (int*)offset_value_ptr; const float* value_ptr = offset_value_ptr + 4; - __m128 v00_val = offset_ptr[0] >= 0 ? _mm_load_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); - __m128 v01_val = offset_ptr[1] >= 0 ? _mm_load_ps(srcptr + offset_ptr[1]) : _mm_set1_ps(0); - __m128 v10_val = offset_ptr[2] >= 0 ? _mm_load_ps(srcptr + offset_ptr[2]) : _mm_set1_ps(0); - __m128 v11_val = offset_ptr[3] >= 0 ? _mm_load_ps(srcptr + offset_ptr[3]) : _mm_set1_ps(0); + __m128 v00_val = offset_ptr[0] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); + __m128 v01_val = offset_ptr[1] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[1]) : _mm_set1_ps(0); + __m128 v10_val = offset_ptr[2] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[2]) : _mm_set1_ps(0); + __m128 v11_val = offset_ptr[3] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[3]) : _mm_set1_ps(0); __m128 value1 = _mm_set1_ps(value_ptr[0]); __m128 v0 = _mm_comp_fmadd_ps(v01_val, value1, _mm_comp_fnmadd_ps(v00_val, value1, v00_val)); @@ -227,7 +227,7 @@ static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d __m128 value2 = _mm_set1_ps(value_ptr[1]); __m128 _v = _mm_comp_fmadd_ps(v1, value2, _mm_comp_fnmadd_ps(v0, value2, v0)); - _mm_store_ps(dstptr, _v); + _mm_storeu_ps(dstptr, _v); dstptr += 4; offset_value_ptr += 6; @@ -255,15 +255,15 @@ static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d const int* offset_ptr = (int*)offset_value_ptr; const float* value_ptr = offset_value_ptr + 8; - __m128 v000_val = offset_ptr[0] >= 0 ? _mm_load_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); - __m128 v001_val = offset_ptr[1] >= 0 ? _mm_load_ps(srcptr + offset_ptr[1]) : _mm_set1_ps(0); - __m128 v010_val = offset_ptr[2] >= 0 ? _mm_load_ps(srcptr + offset_ptr[2]) : _mm_set1_ps(0); - __m128 v011_val = offset_ptr[3] >= 0 ? _mm_load_ps(srcptr + offset_ptr[3]) : _mm_set1_ps(0); + __m128 v000_val = offset_ptr[0] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); + __m128 v001_val = offset_ptr[1] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[1]) : _mm_set1_ps(0); + __m128 v010_val = offset_ptr[2] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[2]) : _mm_set1_ps(0); + __m128 v011_val = offset_ptr[3] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[3]) : _mm_set1_ps(0); - __m128 v100_val = offset_ptr[4] >= 0 ? _mm_load_ps(srcptr + offset_ptr[4]) : _mm_set1_ps(0); - __m128 v101_val = offset_ptr[5] >= 0 ? _mm_load_ps(srcptr + offset_ptr[5]) : _mm_set1_ps(0); - __m128 v110_val = offset_ptr[6] >= 0 ? 
_mm_load_ps(srcptr + offset_ptr[6]) : _mm_set1_ps(0); - __m128 v111_val = offset_ptr[7] >= 0 ? _mm_load_ps(srcptr + offset_ptr[7]) : _mm_set1_ps(0); + __m128 v100_val = offset_ptr[4] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[4]) : _mm_set1_ps(0); + __m128 v101_val = offset_ptr[5] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[5]) : _mm_set1_ps(0); + __m128 v110_val = offset_ptr[6] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[6]) : _mm_set1_ps(0); + __m128 v111_val = offset_ptr[7] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[7]) : _mm_set1_ps(0); __m128 value = _mm_set1_ps(value_ptr[0]); __m128 v00 = _mm_comp_fmadd_ps(v001_val, value, _mm_comp_fnmadd_ps(v000_val, value, v000_val)); @@ -277,7 +277,7 @@ static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& d value = _mm_set1_ps(value_ptr[2]); __m128 _v = _mm_comp_fmadd_ps(v1, value, _mm_comp_fnmadd_ps(v0, value, v0)); - _mm_store_ps(dstptr, _v); + _mm_storeu_ps(dstptr, _v); dstptr += 4; offset_value_ptr += 11; diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h index cc7bcd80ae4..f78e017c1c7 100644 --- a/src/layer/x86/gridsample_bilinear_compute_blob.h +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -31,8 +31,8 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o #if __AVX__ for (; x + 15 < grid_size; x += 16) { - __m256 gx = _mm256_load_ps(gridptr); - __m256 gy = _mm256_load_ps(gridptr + 8); + __m256 gx = _mm256_loadu_ps(gridptr); + __m256 gy = _mm256_loadu_ps(gridptr + 8); transpose2x8_ps(gx, gy); @@ -78,13 +78,13 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o transpose8x6_ps(nw_offset, ne_offset, sw_offset, se_offset, alpha, beta); - _mm256_store_ps(offset_value_ptr, nw_offset); - _mm256_store_ps(offset_value_ptr + 8, ne_offset); - _mm256_store_ps(offset_value_ptr + 16, sw_offset); - _mm256_store_ps(offset_value_ptr + 24, se_offset); + _mm256_storeu_ps(offset_value_ptr, nw_offset); + _mm256_storeu_ps(offset_value_ptr + 8, ne_offset); + _mm256_storeu_ps(offset_value_ptr + 16, sw_offset); + _mm256_storeu_ps(offset_value_ptr + 24, se_offset); - _mm256_store_ps(offset_value_ptr + 32, alpha); - _mm256_store_ps(offset_value_ptr + 40, beta); + _mm256_storeu_ps(offset_value_ptr + 32, alpha); + _mm256_storeu_ps(offset_value_ptr + 40, beta); gridptr += 16; offset_value_ptr += 48; @@ -142,8 +142,8 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o #if __AVX__ for (; x + 7 < grid_size; x += 8) { - __m256 gx = _mm256_load_ps(gridptr_x); - __m256 gy = _mm256_load_ps(gridptr_y); + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); gx = unormalize(_mm256_set1_ps(src.w), gx); gx = get_coord(_mm256_set1_ps(src.w), gx); @@ -187,13 +187,13 @@ void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o transpose8x6_ps(nw_offset, ne_offset, sw_offset, se_offset, alpha, beta); - _mm256_store_ps(offset_value_ptr, nw_offset); - _mm256_store_ps(offset_value_ptr + 8, ne_offset); - _mm256_store_ps(offset_value_ptr + 16, sw_offset); - _mm256_store_ps(offset_value_ptr + 24, se_offset); + _mm256_storeu_ps(offset_value_ptr, nw_offset); + _mm256_storeu_ps(offset_value_ptr + 8, ne_offset); + _mm256_storeu_ps(offset_value_ptr + 16, sw_offset); + _mm256_storeu_ps(offset_value_ptr + 24, se_offset); - _mm256_store_ps(offset_value_ptr + 32, alpha); - _mm256_store_ps(offset_value_ptr + 40, beta); + _mm256_storeu_ps(offset_value_ptr + 32, alpha); + 
_mm256_storeu_ps(offset_value_ptr + 40, beta); gridptr_x += 8; gridptr_y += 8; @@ -265,9 +265,9 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o #if __AVX__ for (; x + 23 < grid_size; x += 24) { - __m256 gx = _mm256_load_ps(gridptr); - __m256 gy = _mm256_load_ps(gridptr + 8); - __m256 gz = _mm256_load_ps(gridptr + 16); + __m256 gx = _mm256_loadu_ps(gridptr); + __m256 gy = _mm256_loadu_ps(gridptr + 8); + __m256 gz = _mm256_loadu_ps(gridptr + 16); transpose3x8_ps(gx, gy, gz); @@ -351,19 +351,19 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o transpose8x11_ps(tnw_offset, tne_offset, tsw_offset, tse_offset, bnw_offset, bne_offset, bsw_offset, bse_offset, alpha, beta, gamma); - _mm256_store_ps(offset_value_ptr, tnw_offset); - _mm256_store_ps(offset_value_ptr + 8, tne_offset); - _mm256_store_ps(offset_value_ptr + 16, tsw_offset); - _mm256_store_ps(offset_value_ptr + 24, tse_offset); + _mm256_storeu_ps(offset_value_ptr, tnw_offset); + _mm256_storeu_ps(offset_value_ptr + 8, tne_offset); + _mm256_storeu_ps(offset_value_ptr + 16, tsw_offset); + _mm256_storeu_ps(offset_value_ptr + 24, tse_offset); - _mm256_store_ps(offset_value_ptr + 32, bnw_offset); - _mm256_store_ps(offset_value_ptr + 40, bne_offset); - _mm256_store_ps(offset_value_ptr + 48, bsw_offset); - _mm256_store_ps(offset_value_ptr + 56, bse_offset); + _mm256_storeu_ps(offset_value_ptr + 32, bnw_offset); + _mm256_storeu_ps(offset_value_ptr + 40, bne_offset); + _mm256_storeu_ps(offset_value_ptr + 48, bsw_offset); + _mm256_storeu_ps(offset_value_ptr + 56, bse_offset); - _mm256_store_ps(offset_value_ptr + 64, alpha); - _mm256_store_ps(offset_value_ptr + 72, beta); - _mm256_store_ps(offset_value_ptr + 80, gamma); + _mm256_storeu_ps(offset_value_ptr + 64, alpha); + _mm256_storeu_ps(offset_value_ptr + 72, beta); + _mm256_storeu_ps(offset_value_ptr + 80, gamma); gridptr += 24; @@ -447,9 +447,9 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o #if __AVX__ for (; x + 7 < grid_size; x += 8) { - __m256 gx = _mm256_load_ps(gridptr_x); - __m256 gy = _mm256_load_ps(gridptr_y); - __m256 gz = _mm256_load_ps(gridptr_z); + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); + __m256 gz = _mm256_loadu_ps(gridptr_z); gx = unormalize(_mm256_set1_ps(src.w), gx); gx = get_coord(_mm256_set1_ps(src.w), gx); @@ -531,19 +531,19 @@ void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& o transpose8x11_ps(tnw_offset, tne_offset, tsw_offset, tse_offset, bnw_offset, bne_offset, bsw_offset, bse_offset, alpha, beta, gamma); - _mm256_store_ps(offset_value_ptr, tnw_offset); - _mm256_store_ps(offset_value_ptr + 8, tne_offset); - _mm256_store_ps(offset_value_ptr + 16, tsw_offset); - _mm256_store_ps(offset_value_ptr + 24, tse_offset); + _mm256_storeu_ps(offset_value_ptr, tnw_offset); + _mm256_storeu_ps(offset_value_ptr + 8, tne_offset); + _mm256_storeu_ps(offset_value_ptr + 16, tsw_offset); + _mm256_storeu_ps(offset_value_ptr + 24, tse_offset); - _mm256_store_ps(offset_value_ptr + 32, bnw_offset); - _mm256_store_ps(offset_value_ptr + 40, bne_offset); - _mm256_store_ps(offset_value_ptr + 48, bsw_offset); - _mm256_store_ps(offset_value_ptr + 56, bse_offset); + _mm256_storeu_ps(offset_value_ptr + 32, bnw_offset); + _mm256_storeu_ps(offset_value_ptr + 40, bne_offset); + _mm256_storeu_ps(offset_value_ptr + 48, bsw_offset); + _mm256_storeu_ps(offset_value_ptr + 56, bse_offset); - _mm256_store_ps(offset_value_ptr + 64, alpha); - 
_mm256_store_ps(offset_value_ptr + 72, beta); - _mm256_store_ps(offset_value_ptr + 80, gamma); + _mm256_storeu_ps(offset_value_ptr + 64, alpha); + _mm256_storeu_ps(offset_value_ptr + 72, beta); + _mm256_storeu_ps(offset_value_ptr + 80, gamma); gridptr_x += 8; gridptr_y += 8; diff --git a/src/layer/x86/gridsample_nearest_apply_interpolation.h b/src/layer/x86/gridsample_nearest_apply_interpolation.h index b17f5786528..e84cdc7de25 100644 --- a/src/layer/x86/gridsample_nearest_apply_interpolation.h +++ b/src/layer/x86/gridsample_nearest_apply_interpolation.h @@ -33,10 +33,10 @@ static void gridsample_nearest_apply_interpolation_p16(const Mat& src, Mat& dst, for (int i = 0; i < grid_size; i++) { - __m512 _v = offset_ptr[0] >= 0 ? _mm512_load_ps(srcptr + offset_ptr[0]) : _mm512_set1_ps(0); + __m512 _v = offset_ptr[0] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[0]) : _mm512_set1_ps(0); offset_ptr++; - _mm512_store_ps(dstptr, _v); + _mm512_storeu_ps(dstptr, _v); dstptr += 16; } } @@ -61,10 +61,10 @@ static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, for (int i = 0; i < grid_size; i++) { - __m256 _v = offset_ptr[0] >= 0 ? _mm256_load_ps(srcptr + offset_ptr[0]) : _mm256_set1_ps(0); + __m256 _v = offset_ptr[0] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[0]) : _mm256_set1_ps(0); offset_ptr++; - _mm256_store_ps(dstptr, _v); + _mm256_storeu_ps(dstptr, _v); dstptr += 8; } } @@ -88,10 +88,10 @@ static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, for (int i = 0; i < grid_size; i++) { - __m128 _v = offset_ptr[0] >= 0 ? _mm_load_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); + __m128 _v = offset_ptr[0] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); offset_ptr++; - _mm_store_ps(dstptr, _v); + _mm_storeu_ps(dstptr, _v); dstptr += 4; } } diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h index fb203323100..a7a12066d21 100644 --- a/src/layer/x86/gridsample_nearest_compute_blob.h +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -31,8 +31,8 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of #if __AVX__ for (; x + 15 < grid_size; x += 16) { - __m256 gx = _mm256_load_ps(gridptr); - __m256 gy = _mm256_load_ps(gridptr + 8); + __m256 gx = _mm256_loadu_ps(gridptr); + __m256 gy = _mm256_loadu_ps(gridptr + 8); transpose2x8_ps(gx, gy); @@ -52,7 +52,7 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), _mm256_castsi256_ps(_mm256_cvtps_epi32(offset)), v_in_range); - _mm256_store_ps(offset_ptr, offset); + _mm256_storeu_ps(offset_ptr, offset); gridptr += 16; offset_ptr += 8; @@ -93,8 +93,8 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of #if __AVX__ for (; x + 7 < grid_size; x += 8) { - __m256 gx = _mm256_load_ps(gridptr_x); - __m256 gy = _mm256_load_ps(gridptr_y); + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); gx = unormalize(_mm256_set1_ps(src.w), gx); gx = get_coord(_mm256_set1_ps(src.w), gx); @@ -112,7 +112,7 @@ void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), _mm256_castsi256_ps(_mm256_cvtps_epi32(offset)), v_in_range); - _mm256_store_ps(offset_ptr, offset); + _mm256_storeu_ps(offset_ptr, offset); gridptr_x += 8; gridptr_y += 8; @@ -167,9 +167,9 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& 
of #if __AVX__ for (; x + 23 < grid_size; x += 24) { - __m256 gx = _mm256_load_ps(gridptr); - __m256 gy = _mm256_load_ps(gridptr + 8); - __m256 gz = _mm256_load_ps(gridptr + 16); + __m256 gx = _mm256_loadu_ps(gridptr); + __m256 gy = _mm256_loadu_ps(gridptr + 8); + __m256 gz = _mm256_loadu_ps(gridptr + 16); transpose3x8_ps(gx, gy, gz); @@ -196,7 +196,7 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), _mm256_castsi256_ps(_mm256_cvtps_epi32(offset)), v_in_range); - _mm256_store_ps(offset_ptr, offset); + _mm256_storeu_ps(offset_ptr, offset); gridptr += 24; offset_ptr += 8; @@ -243,9 +243,9 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of #if __AVX__ for (; x + 7 < grid_size; x += 8) { - __m256 gx = _mm256_load_ps(gridptr_x); - __m256 gy = _mm256_load_ps(gridptr_y); - __m256 gz = _mm256_load_ps(gridptr_z); + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); + __m256 gz = _mm256_loadu_ps(gridptr_z); gx = unormalize(_mm256_set1_ps(src.w), gx); gx = get_coord(_mm256_set1_ps(src.w), gx); @@ -270,7 +270,7 @@ void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& of offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), _mm256_castsi256_ps(_mm256_cvtps_epi32(offset)), v_in_range); - _mm256_store_ps(offset_ptr, offset); + _mm256_storeu_ps(offset_ptr, offset); gridptr_x += 8; gridptr_y += 8; From e97c97a6d4075867e695cd83856205e61f74a28b Mon Sep 17 00:00:00 2001 From: nihui Date: Fri, 20 Oct 2023 16:56:23 +0800 Subject: [PATCH 127/127] fix build --- src/layer/x86/x86_usability.h | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h index a091697814e..1571cdf4928 100644 --- a/src/layer/x86/x86_usability.h +++ b/src/layer/x86/x86_usability.h @@ -610,36 +610,6 @@ static void transpose8x18_ps(__m256& _r0, __m256& _r1, __m256& _r2, __m256& _r3, _rh = _mm256_permute2f128_ps(_tmpy, _tmpz, _MM_SHUFFLE(0, 3, 0, 1)); } -static NCNN_FORCEINLINE void transpose8x8_epi16(__m128i& _r0, __m128i& _r1, __m128i& _r2, __m128i& _r3, __m128i& _r4, __m128i& _r5, __m128i& _r6, __m128i& _r7) -{ - __m128i _tmp0 = _mm_unpacklo_epi16(_r0, _r1); - __m128i _tmp1 = _mm_unpackhi_epi16(_r0, _r1); - __m128i _tmp2 = _mm_unpacklo_epi16(_r2, _r3); - __m128i _tmp3 = _mm_unpackhi_epi16(_r2, _r3); - __m128i _tmp4 = _mm_unpacklo_epi16(_r4, _r5); - __m128i _tmp5 = _mm_unpackhi_epi16(_r4, _r5); - __m128i _tmp6 = _mm_unpacklo_epi16(_r6, _r7); - __m128i _tmp7 = _mm_unpackhi_epi16(_r6, _r7); - - __m128i _tmp8 = _mm_unpacklo_epi32(_tmp0, _tmp2); - __m128i _tmp9 = _mm_unpackhi_epi32(_tmp0, _tmp2); - __m128i _tmpa = _mm_unpacklo_epi32(_tmp1, _tmp3); - __m128i _tmpb = _mm_unpackhi_epi32(_tmp1, _tmp3); - __m128i _tmpc = _mm_unpacklo_epi32(_tmp4, _tmp6); - __m128i _tmpd = _mm_unpackhi_epi32(_tmp4, _tmp6); - __m128i _tmpe = _mm_unpacklo_epi32(_tmp5, _tmp7); - __m128i _tmpf = _mm_unpackhi_epi32(_tmp5, _tmp7); - - _r0 = _mm_unpacklo_epi64(_tmp8, _tmpc); - _r1 = _mm_unpackhi_epi64(_tmp8, _tmpc); - _r2 = _mm_unpacklo_epi64(_tmp9, _tmpd); - _r3 = _mm_unpackhi_epi64(_tmp9, _tmpd); - _r4 = _mm_unpacklo_epi64(_tmpa, _tmpe); - _r5 = _mm_unpackhi_epi64(_tmpa, _tmpe); - _r6 = _mm_unpacklo_epi64(_tmpb, _tmpf); - _r7 = _mm_unpackhi_epi64(_tmpb, _tmpf); -} - static NCNN_FORCEINLINE __m256 HorizontalSums(__m256& v0, __m256& v1, __m256& v2, __m256& v3, __m256& v4, __m256& v5, __m256& v6, __m256& v7) { const __m256 s01 
= _mm256_hadd_ps(v0, v1);
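
Note on the load/store changes above: the bulk of these hunks swap the aligned AVX intrinsics (_mm256_load_ps / _mm256_store_ps and their 128/512-bit counterparts) for the unaligned forms (_mm256_loadu_ps / _mm256_storeu_ps), presumably because the grid and offset/value buffers being walked are not guaranteed to be 32-byte aligned, and the aligned forms fault on a misaligned address. The sketch below is not taken from the patches; it is a minimal standalone illustration of the two patterns involved (unaligned load/store, and the negative-offset-means-outside fallback seen in the apply_interpolation loops), with file and function names invented for the example.

    // aligned_vs_unaligned.cpp -- build with e.g.: g++ -O2 -mavx aligned_vs_unaligned.cpp
    #include <immintrin.h>
    #include <cstdio>
    #include <vector>

    // Copies 8 floats starting at an arbitrary (possibly unaligned) address.
    // _mm256_load_ps/_mm256_store_ps require 32-byte alignment; the loadu/storeu
    // forms accept any address, which is what the hunks above switch to.
    static void copy8(const float* src, float* dst)
    {
        __m256 v = _mm256_loadu_ps(src); // safe for any alignment
        _mm256_storeu_ps(dst, v);        // safe for any alignment
    }

    // Mirrors the pattern in the apply_interpolation loops: a negative offset
    // marks an out-of-range sample and is replaced by zeros.
    static __m256 load_or_zero(const float* srcptr, int offset)
    {
        return offset >= 0 ? _mm256_loadu_ps(srcptr + offset) : _mm256_set1_ps(0.f);
    }

    int main()
    {
        std::vector<float> src(17), dst(17, 0.f);
        for (int i = 0; i < 17; i++)
            src[i] = (float)i;

        // &src[1] is almost certainly not 32-byte aligned; loadu/storeu still work.
        copy8(&src[1], &dst[1]);
        for (int i = 1; i < 9; i++)
            printf("%.0f ", dst[i]); // expected: 1 2 3 4 5 6 7 8
        printf("\n");

        // An out-of-range sample (negative offset) comes back as zeros.
        float zeros[8];
        _mm256_storeu_ps(zeros, load_or_zero(src.data(), -1));
        printf("%.0f\n", zeros[0]); // expected: 0
        return 0;
    }

On current x86 cores the unaligned load/store forms generally carry little or no penalty when the address happens to be aligned, so the swap trades essentially nothing for correctness on arbitrarily aligned buffers.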