Skip to content

Commit

Permalink
perf: arm64 performance optimizations (#4288)
Browse files Browse the repository at this point in the history
* Add arm64 optmizations flags

* Transport SSE intrinsics through sse2neon on ARM

* expose sse2neon to cmake

* Update VWFlags.cmake

* Update lda_core.cc

fix format

* Update lda_core.cc

* Update lda_core.cc

Co-authored-by: Jack Gerrits <[email protected]>
Co-authored-by: Jack Gerrits <[email protected]>
Co-authored-by: zwd-ms <[email protected]>
  • Loading branch information
4 people authored Dec 12, 2022
1 parent f651329 commit ca0c23c
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 2 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,6 @@
[submodule "ext_libs/vcpkg"]
path = ext_libs/vcpkg
url = ../../microsoft/vcpkg.git
[submodule "ext_libs/sse2neon"]
path = ext_libs/sse2neon/sse2neon
url = https://github.com/DLTcollab/sse2neon
5 changes: 5 additions & 0 deletions ext_libs/ext_libs.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,8 @@ else()
add_library(eigen INTERFACE)
target_include_directories(eigen SYSTEM INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/eigen>)
endif()

add_library(sse2neon INTERFACE)
# This submodule is placed into a nested subdirectory since it exposes its
# header at the root of the repo rather than its own nested sse2neon/ dir
target_include_directories(sse2neon SYSTEM INTERFACE "${CMAKE_CURRENT_LIST_DIR}/sse2neon")
1 change: 1 addition & 0 deletions ext_libs/sse2neon/sse2neon
Submodule sse2neon added at 270cf6
2 changes: 1 addition & 1 deletion vowpalwabbit/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ vw_add_library(
# Use BUILD_INTERFACE to prevent them from being exported, i.e. treat them as PRIVATE
# https://gitlab.kitware.com/cmake/cmake/issues/15415
${CMAKE_DL_LIBS} ${LINK_THREADS} vw_io $<BUILD_INTERFACE:${boost_math_target}> $<BUILD_INTERFACE:RapidJSON>
$<BUILD_INTERFACE:eigen>
$<BUILD_INTERFACE:eigen> $<BUILD_INTERFACE:sse2neon>
DESCRIPTION "This contains all remaining VW code, all reduction implementations, driver, option handling"
EXCEPTION_DESCRIPTION "Yes"
ENABLE_INSTALL
Expand Down
13 changes: 12 additions & 1 deletion vowpalwabbit/core/src/reductions/lda_core.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ VW_WARNING_STATE_POP
#include "vw/core/vw_versions.h"
#include "vw/io/logger.h"

#if defined(__ARM_NEON)
# include <sse2neon/sse2neon.h>
#endif

#include <algorithm>
#include <cassert>
#include <cmath>
Expand Down Expand Up @@ -164,7 +168,7 @@ inline float fastdigamma(float x)

#if !defined(VW_NO_INLINE_SIMD)

# if defined(__SSE2__) || defined(__SSE3__) || defined(__SSE4_1__)
# if defined(__SSE2__) || defined(__SSE3__) || defined(__SSE4_1__) || defined(__ARM_NEON)

namespace
{
Expand All @@ -186,6 +190,13 @@ inline bool is_aligned16(void* ptr)
# include <smmintrin.h>
# endif

// Transport SSE intrinsics through sse2neon on ARM:
# if defined(__ARM_NEON)
# define __SSE2__ 1
# define __SSE3__ 1
# define __SSE4_1__ 1
# endif

# define HAVE_SIMD_MATHMODE

typedef __m128 v4sf;
Expand Down

0 comments on commit ca0c23c

Please sign in to comment.