Skip to content

Commit

Permalink
Merge pull request #986 from Cyan4973/autovec_x86
Browse files Browse the repository at this point in the history
enable by default runtime detection of vector extension on x86/64 target
  • Loading branch information
Cyan4973 authored Dec 27, 2024
2 parents 6fe4f19 + 5c6dd83 commit cf9e2dc
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 73 deletions.
9 changes: 8 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# ################################################################
# xxHash Makefile
# Copyright (C) 2012-2021 Yann Collet
# Copyright (C) 2012-2024 Yann Collet
#
# GPL v2 License
#
Expand Down Expand Up @@ -55,6 +55,13 @@ else
EXT =
endif

# Automatically enable runtime vector dispatch on x86/x64 targets.
# detect_x86_arch asks the compiler for its target (`-dumpmachine`) and keeps
# the output only when it matches an x86 pattern (i386..i686 or x86_64);
# it expands to nothing for any other target.
detect_x86_arch = $(shell $(CC) -dumpmachine | grep -E 'i[3-6]86|x86_64')
ifneq ($(strip $(call detect_x86_arch)),)
# Note: `?=` only assigns when DISPATCH is not already set, so this default
# can be overridden at build time, for example by setting DISPATCH=0.
DISPATCH ?= 1
endif

ifeq ($(NODE_JS),1)
# Link in unrestricted filesystem support
LDFLAGS += -sNODERAWFS
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ The following macros can be set at compilation time to modify `libxxhash`'s beha

#### Makefile variables
When compiling the Command Line Interface `xxhsum` using `make`, the following environment variables can also be set :
- `DISPATCH=1` : use `xxh_x86dispatch.c`, to automatically select between `scalar`, `sse2`, `avx2` or `avx512` instruction set _at runtime_, depending on local host. This option is only valid for `x86`/`x64` systems.
- `DISPATCH=1` : use `xxh_x86dispatch.c` to select at runtime between the `scalar`, `sse2`, `avx2` or `avx512` instruction sets. This option is only valid for `x86`/`x64` systems. It is enabled by default when an `x86`/`x64` target is detected. It can be forcefully turned off using `DISPATCH=0`.
- `XXH_1ST_SPEED_TARGET` : select an initial speed target, expressed in MB/s, for the first speed test in benchmark mode. Benchmark will adjust the target at subsequent iterations, but the first test is made "blindly" by targeting this speed. Currently conservatively set to 10 MB/s, to support very slow (emulated) platforms.
- `NODE_JS=1` : When compiling `xxhsum` for Node.js with Emscripten, this links the `NODERAWFS` library for unrestricted filesystem access and patches `isatty` to make the command line utility correctly detect the terminal. This does make the binary specific to Node.js.

Expand Down
4 changes: 1 addition & 3 deletions cli/xsum_arch.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* xxhsum - Command line interface for xxhash algorithms
* Copyright (C) 2013-2021 Yann Collet
* Copyright (C) 2013-2024 Yann Collet
*
* GPL v2 License
*
Expand Down Expand Up @@ -29,8 +29,6 @@ int g_xsumarch_avoid_empty_unit = 0;
#if defined(XXHSUM_DISPATCH)

#include "../xxh_x86dispatch.h"
#define XXH_INLINE_ALL /* XXH_* vector types */
#include "../xxhash.h"

const char* XSUM_autox86(void)
{
Expand Down
34 changes: 3 additions & 31 deletions xxh_x86dispatch.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@
*
* Optional add-on.
*
* **Compile this file with the default flags for your target.** Do not compile
* with flags like `-mavx*`, `-march=native`, or `/arch:AVX*`, there will be
* an error. See @ref XXH_X86DISPATCH_ALLOW_AVX for details.
* **Compile this file with the default flags for your target.**
* Note that compiling with flags like `-mavx*`, `-march=native`, or `/arch:AVX*`
* will make the resulting binary incompatible with CPUs that do not support the requested instruction set.
*
* @defgroup dispatch x86 Dispatcher
* @{
Expand Down Expand Up @@ -70,34 +70,6 @@ extern "C" {
#endif
/*! @endcond */

/*!
* @def XXH_X86DISPATCH_ALLOW_AVX
* @brief Disables the AVX sanity check.
*
* xxh_x86dispatch.c is intended to be compiled for the minimum target, and
* it selectively enables SSE2, AVX2, and AVX512 when they are needed.
*
* Compiling with options like `-mavx*`, `-march=native`, or `/arch:AVX*`
* _globally_ will always enable this feature, and therefore makes it
* undefined behavior to execute on any CPU without said feature.
*
* Even if the source code isn't directly using AVX intrinsics in a function,
* the compiler can still generate AVX code from autovectorization and by
* "upgrading" SSE2 intrinsics to use the VEX prefixes (a.k.a. AVX128).
*
* Define XXH_X86DISPATCH_ALLOW_AVX to ignore this check,
* thus accepting that the produced binary will not work correctly
* on any CPU with fewer features than the ones stated at compilation time.
*/
/* Documentation-only definition: lets Doxygen see the macro. */
#ifdef XXH_DOXYGEN
# define XXH_X86DISPATCH_ALLOW_AVX
#endif

/* Sanity check: refuse to build when AVX is enabled for the whole translation
 * unit, unless the user explicitly opted in via XXH_X86DISPATCH_ALLOW_AVX. */
#if defined(__AVX__) && !defined(XXH_X86DISPATCH_ALLOW_AVX)
# error "Error: if xxh_x86dispatch.c is compiled with AVX enabled, the resulting binary will crash on sse2-only cpus !! " \
"If you nonetheless want to do that, please enable the XXH_X86DISPATCH_ALLOW_AVX build variable"
#endif

/*!
* @def XXH_DISPATCH_SCALAR
* @brief Enables/disables dispatching the scalar code path.
Expand Down
2 changes: 1 addition & 1 deletion xxh_x86dispatch.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* xxHash - XXH3 Dispatcher for x86-based targets
* Copyright (C) 2020-2021 Yann Collet
* Copyright (C) 2020-2024 Yann Collet
*
* BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
*
Expand Down
53 changes: 17 additions & 36 deletions xxhash.h
Original file line number Diff line number Diff line change
Expand Up @@ -1110,6 +1110,23 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const
*
* The API supports one-shot hashing, streaming mode, and custom secrets.
*/

/*!
* @ingroup tuning
* @brief Possible values for @ref XXH_VECTOR.
*
* Unless @ref XXH_VECTOR is set explicitly, its value is determined automatically.
*/
# define XXH_SCALAR 0 /*!< Portable scalar version */
# define XXH_SSE2 1 /*!< SSE2 for Pentium 4, Opteron, all x86_64. */
/* NOTE(review): AVX2 on AMD presumably arrived with Excavator rather than the
 * original Bulldozer core - confirm before relying on the wording below. */
# define XXH_AVX2 2 /*!< AVX2 for Haswell and Bulldozer */
# define XXH_AVX512 3 /*!< AVX512 for Skylake and Icelake */
# define XXH_NEON 4 /*!< NEON for most ARMv7-A, all AArch64, and WASM SIMD128 */
# define XXH_VSX 5 /*!< VSX and ZVector for POWER8/z13 (64-bit) */
# define XXH_SVE 6 /*!< SVE for some ARMv8-A and ARMv9-A */
# define XXH_LSX 7 /*!< LSX (128-bit SIMD) for LoongArch64 */


/*-**********************************************************************
* XXH3 64-bit variant
************************************************************************/
Expand Down Expand Up @@ -3853,34 +3870,6 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can
* implementation.
*/
# define XXH_VECTOR XXH_SCALAR
/*!
* @ingroup tuning
* @brief Possible values for @ref XXH_VECTOR.
*
* Note that these are actually implemented as macros;
* this enum only exists for documentation purposes.
*
* If @ref XXH_VECTOR is not defined, it is detected automatically.
* The internal macro XXH_X86DISPATCH overrides this.
*/
enum XXH_VECTOR_TYPE /* fake enum */ {
XXH_SCALAR = 0, /*!< Portable scalar version */
XXH_SSE2 = 1, /*!<
* SSE2 for Pentium 4, Opteron, all x86_64.
*
* @note SSE2 is also guaranteed on Windows 10, macOS, and
* Android x86.
*/
XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */
XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */
XXH_NEON = 4, /*!<
* NEON for most ARMv7-A, all AArch64, and WASM SIMD128
* via the SIMDeverywhere polyfill provided with the
* Emscripten SDK.
*/
XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */
XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */
XXH_LSX = 7, /*!< LSX (128-bit SIMD) for LoongArch64 */
};
/*!
* @ingroup tuning
* @brief Selects the minimum alignment for XXH3's accumulators.
Expand All @@ -3895,14 +3884,6 @@ enum XXH_VECTOR_TYPE /* fake enum */ {

/* Actual definitions: the possible XXH_VECTOR values are plain macros,
 * only defined when not generating documentation (XXH_DOXYGEN unset). */
#ifndef XXH_DOXYGEN
# define XXH_SCALAR 0 /* portable scalar */
# define XXH_SSE2 1 /* x86 SSE2 */
# define XXH_AVX2 2 /* x86 AVX2 */
# define XXH_AVX512 3 /* x86 AVX512 */
# define XXH_NEON 4 /* ARM NEON */
# define XXH_VSX 5 /* POWER VSX / ZVector */
# define XXH_SVE 6 /* ARM SVE */
# define XXH_LSX 7 /* LoongArch64 LSX */
#endif

#ifndef XXH_VECTOR /* can be defined on command line */
Expand Down

0 comments on commit cf9e2dc

Please sign in to comment.