From 6fed514171498cfb3499c13e70dda1b111394ac5 Mon Sep 17 00:00:00 2001 From: Colin Davidson Date: Fri, 19 Jul 2024 16:01:14 +0100 Subject: [PATCH] Add support for host target setting of support of features Allow features to be added using env variable or cmake using standard mechanism of "+v,+zfence" etc --- doc/developer-guide.md | 43 ++-- modules/compiler/targets/host/CMakeLists.txt | 25 +- .../compiler/targets/host/source/target.cpp | 231 ++++++++++++------ 3 files changed, 195 insertions(+), 104 deletions(-) diff --git a/doc/developer-guide.md b/doc/developer-guide.md index 27cd48097..590b2dfbf 100644 --- a/doc/developer-guide.md +++ b/doc/developer-guide.md @@ -318,24 +318,28 @@ The builtin CMake options used when invoking CMake on the command line. use the installed OpenCL or Vulkan library. Do disable this behaviour set `-DCMAKE_SKIP_RPATH=ON` when configuring CMake in build directory. -* `CA_HOST_TARGET_<arch>_CPU`: This option is used by the `host` target to - optimize for performance on a given CPU.`arch` should be a capitalized - version of the `host` target architecture e.g. `X86_64`, `RISCV64` or - `AARCH64`. If set to "native" host will optimize for the CPU being used to - compile it. Otherwise a CPU name can be provided, for example "skylake", but - be warned that this string will be passed directly to the llvm backend so make - sure it's a valid CPU name. Information about your host CPU can be found by - running`llc --version`, and a list of host CPUs supported by your installed - version of LLVM can be found by running`llc --march=[your-arch] --mcpu=help`. +* `CA_HOST_TARGET_<arch>_CPU`, `CA_HOST_TARGET_<arch>_FEATURES`: These options + are used by the `host` target to optimize for performance on a given CPU. + `arch` should be a capitalized version of the `host` target architecture e.g. + `X86_64`, `RISCV64` or `AARCH64`. + + `CPU` can be set to `native` to optimize for the CPU being used to compile. + Otherwise a CPU name can be provided, e.g. `skylake`. 
This string will be + passed directly to the LLVM backend; it has to be a valid CPU name. A list of + CPUs supported by LLVM can be found by running `clang -mcpu=help`. + + `FEATURES` should be a comma-separated list of features preceded by either `+` + or `-` to enable or disable them, e.g. `+v,-zifencei`. The features are the + same as those supported by the `-mattr` option in LLVM tools such as `llc` and + `opt` and add to the features supported by default. + + If no `CPU` or `FEATURES` are specified, kernels will be compiled to run on + any CPU that meets our minimal assumptions. - Be aware that if `host` is compiled with this option set, running it on a - different CPU from the one specified (or the one compiled with if "native" was - specified) isn't supported and bad things may happen. When the oneAPI - Construction Kit is built in debug mode, the environment variable - `CA_HOST_TARGET_CPU` is also respected across all `host` targets, which can - help track down codegen differences among different machine targets. The - caveats above apply, and this may result in an illegal instruction crash if - your CPU doesn't support the generated instructions. + Beware that if `host` is compiled with this option set, running kernels on a + CPU that is not compatible with the one specified (or the one compiled with if + `native` was specified) is not supported and may result in attempts to execute + instructions not supported by that CPU. * `CA_USE_SPLIT_DWARF`: When building with gcc, enable split dwarf debuginfo. This significantly reduces binary size (especially when static linking) and @@ -1116,6 +1120,11 @@ options without having to modify the source. be used. * `CA_HOST_NUM_THREADS`: Sets the maximum number of threads the `host` device will create. `host` may create fewer threads than this value. +* `CA_HOST_TARGET_CPU`, `CA_HOST_TARGET_FEATURES`: These environment variables + can be used in debug builds (or builds with `CA_ENABLE_DEBUG_SUPPORT` defined) to override the default CPU and features. 
They + behave the same way as the `CA_HOST_TARGET_<arch>_CPU` and + `CA_HOST_TARGET_<arch>_FEATURES` CMake options and the same caveats about + `"native"` apply here. ## Debugging the LLVM compiler diff --git a/modules/compiler/targets/host/CMakeLists.txt b/modules/compiler/targets/host/CMakeLists.txt index 73645f1a4..71b7539a6 100644 --- a/modules/compiler/targets/host/CMakeLists.txt +++ b/modules/compiler/targets/host/CMakeLists.txt @@ -188,7 +188,13 @@ if(CA_HOST_CROSS_COMPILERS) target_compile_definitions(compiler-host PRIVATE CA_HOST_TARGET_${HOST_ARCH_UPPER}_CPU="${CA_HOST_TARGET_${HOST_ARCH_UPPER}_CPU}") endif() - + ca_option(CA_HOST_TARGET_${HOST_ARCH_UPPER}_FEATURES STRING + "Feature list that host ${HOST_ARCH_UPPER} should enable or disable as a comma separated + or - list e.g. '+v,+zfh'" "") + if(CA_HOST_TARGET_${HOST_ARCH_UPPER}_FEATURES) + message(STATUS "Features ${HOST_ARCH_UPPER} name ${CA_HOST_TARGET_${HOST_ARCH_UPPER}_FEATURES}") + target_compile_definitions(compiler-host PRIVATE + CA_HOST_TARGET_${HOST_ARCH_UPPER}_FEATURES="${CA_HOST_TARGET_${HOST_ARCH_UPPER}_FEATURES}") + endif() if(hostCrossCompilers) # Validate the user specified cross compiler list. 
foreach(CrossCompiler ${hostCrossCompilers}) @@ -225,11 +231,18 @@ if(CA_HOST_CROSS_COMPILERS) HOST_CROSS_DEVICE_NAME_${CROSS_COMPILER}="${crossDeviceName}") ca_option(CA_HOST_TARGET_${CROSS_COMPILER}_CPU STRING "Name of the CPU that host ${CROSS_COMPILER} should optimize for, or 'native'" "") - if(CA_HOST_TARGET_${CROSS_COMPILER}_CPU) - message(STATUS "CPU ${CROSS_COMPILER} name ${CA_HOST_TARGET_${CROSS_COMPILER}_CPU}") - target_compile_definitions(compiler-host PRIVATE - CA_HOST_TARGET_${CROSS_COMPILER}_CPU="${CA_HOST_TARGET_${CROSS_COMPILER}_CPU}") - endif() + if(CA_HOST_TARGET_${CROSS_COMPILER}_CPU) + message(STATUS "CPU ${CROSS_COMPILER} name ${CA_HOST_TARGET_${CROSS_COMPILER}_CPU}") + target_compile_definitions(compiler-host PRIVATE + CA_HOST_TARGET_${CROSS_COMPILER}_CPU="${CA_HOST_TARGET_${CROSS_COMPILER}_CPU}") + endif() + ca_option(CA_HOST_TARGET_${CROSS_COMPILER}_FEATURES STRING + "Feature list that host ${HOST_ARCH_UPPER} should enable or disable as a comma separated + or - list e.g. 
'+v,+zfh'" "") + if(CA_HOST_TARGET_${CROSS_COMPILER}_FEATURES) + message(STATUS "Features ${CROSS_COMPILER} name ${CA_HOST_TARGET_${CROSS_COMPILER}_FEATURES}") + target_compile_definitions(compiler-host PRIVATE + CA_HOST_TARGET_${CROSS_COMPILER}_FEATURES="${CA_HOST_TARGET_${CROSS_COMPILER}_FEATURES}") + endif() endforeach() endif() endif() diff --git a/modules/compiler/targets/host/source/target.cpp b/modules/compiler/targets/host/source/target.cpp index 17e6f7d69..74e99a56c 100644 --- a/modules/compiler/targets/host/source/target.cpp +++ b/modules/compiler/targets/host/source/target.cpp @@ -33,6 +33,7 @@ #include #include #include +#include #include #if LLVM_VERSION_GREATER_EQUAL(18, 0) @@ -234,95 +235,176 @@ compiler::Result HostTarget::initWithBuiltins( break; } - llvm::StringRef CPUName = ""; - llvm::StringMap FeatureMap; - - if (llvm::Triple::arm == triple.getArch()) { - FeatureMap["strict-align"] = true; - // We do not support denormals for single precision floating points, but we - // do for double precision. To support that we use neon (which is FTZ) for - // single precision floating points, and use the VFP with denormal support - // enabled for doubles. The neonfp feature enables the use of neon for - // single precision floating points. - FeatureMap["neonfp"] = true; - FeatureMap["neon"] = true; - // Hardware division instructions might not exist on all ARMv7 CPUs, but - // they probably exist on all the ones we might care about. - FeatureMap["hwdiv"] = true; - FeatureMap["hwdiv-arm"] = true; - if (host_device_info.half_capabilities) { - FeatureMap["fp16"] = true; - } -#if defined(CA_HOST_TARGET_ARM_CPU) - CPUName = CA_HOST_TARGET_ARM_CPU; + std::string CPU; + llvm::SubtargetFeatures Features; + + switch (triple.getArch()) { +#ifdef HOST_LLVM_ARM + case llvm::Triple::arm: + // We do not support denormals for single precision floating points, but + // we do for double precision. 
To support that we use neon (which is FTZ) + // for single precision floating points, and use the VFP with denormal + // support enabled for doubles. The neonfp feature enables the use of neon + // for single precision floating points. + Features.AddFeature("strict-align", true); + Features.AddFeature("neonfp", true); + Features.AddFeature("neon", true); + // Hardware division instructions might not exist on all ARMv7 CPUs, but + // they probably exist on all the ones we might care about. + Features.AddFeature("hwdiv", true); + Features.AddFeature("hwdiv-arm", true); + if (host_device_info.half_capabilities) { + Features.AddFeature("fp16", true); + } + break; +#endif +#ifdef HOST_LLVM_RISCV + case llvm::Triple::riscv32: + case llvm::Triple::riscv64: + CPU = triple.getArch() == llvm::Triple::riscv32 ? "generic-rv32" + : "generic-rv64"; + // The following features are important for OpenCL, and generally + // constitute a minimum requirement for non-embedded profile. Without + // these features, we'd need compiler-rt support. Atomics are absolutely + // essential. + Features.AddFeature("m", true); // Integer multiplication and division + Features.AddFeature("f", true); // Floating point support + Features.AddFeature("a", true); // Atomics +#if defined(CA_HOST_ENABLE_FP64) + Features.AddFeature("d", true); // Double support #endif - } else if (llvm::Triple::aarch64 == triple.getArch()) { -#if defined(CA_HOST_TARGET_AARCH64_CPU) - CPUName = CA_HOST_TARGET_AARCH64_CPU; +#if defined(CA_HOST_ENABLE_FP16) + Features.AddFeature("zfh", true); // Half support #endif - } else if (triple.isX86()) { - CPUName = "x86-64-v3"; // Default only, may be overridden below. 
- if (triple.isArch32Bit()) { -#if defined(CA_HOST_TARGET_X86_CPU) - CPUName = CA_HOST_TARGET_X86_CPU; + break; #endif - } else { -#if defined(CA_HOST_TARGET_X86_64_CPU) - CPUName = CA_HOST_TARGET_X86_64_CPU; +#ifdef HOST_LLVM_X86 + case llvm::Triple::x86: + case llvm::Triple::x86_64: + CPU = "x86-64-v3"; + break; #endif + default: + break; + } + + auto SetCPUFeatures = [&](std::string NewCPU, std::string NewFeatures) { + if (!NewCPU.empty()) { + CPU = NewCPU; + Features = llvm::SubtargetFeatures(); + } + if (!NewFeatures.empty()) { + std::vector NewFeatureVector; + llvm::SubtargetFeatures::Split(NewFeatureVector, NewFeatures); + for (auto &NewFeature : NewFeatureVector) Features.AddFeature(NewFeature); } - } else if (triple.isRISCV()) { - // These are reasonable defaults, which has been used for various RISC-V - // target so far. We should allow overriding of the ABI in the future - if (triple.isArch32Bit()) { - CPUName = "generic-rv32"; -#if defined(CA_HOST_TARGET_RISCV32_CPU) - CPUName = CA_HOST_TARGET_RISCV32_CPU; + }; + + switch (triple.getArch()) { +#ifdef HOST_LLVM_ARM + case llvm::Triple::arm: +#ifndef CA_HOST_TARGET_ARM_CPU +#define CA_HOST_TARGET_ARM_CPU "" #endif - } else { - CPUName = "generic-rv64"; -#if defined(CA_HOST_TARGET_RISCV64_CPU) - CPUName = CA_HOST_TARGET_RISCV64_CPU; +#ifndef CA_HOST_TARGET_ARM_FEATURES +#define CA_HOST_TARGET_ARM_FEATURES "" #endif - } - // The following features are important for OpenCL, and generally constitute - // a minimum requirement for non-embedded profile. Without these features, - // we'd need compiler-rt support. Atomics are absolutely essential. - // TODO: Allow overriding of the input features. 
- FeatureMap["m"] = true; // Integer multiplication and division - FeatureMap["f"] = true; // Floating point support - FeatureMap["a"] = true; // Atomics -#if defined(CA_HOST_ENABLE_FP64) - FeatureMap["d"] = true; // Double support + SetCPUFeatures(CA_HOST_TARGET_ARM_CPU, CA_HOST_TARGET_ARM_FEATURES); + break; #endif -#if defined(CA_HOST_ENABLE_FP16) - FeatureMap["zfh"] = true; // Half support +#ifdef HOST_LLVM_AARCH64 + case llvm::Triple::aarch64: +#ifndef CA_HOST_TARGET_AARCH64_CPU +#define CA_HOST_TARGET_AARCH64_CPU "" +#endif +#ifndef CA_HOST_TARGET_AARCH64_FEATURES +#define CA_HOST_TARGET_AARCH64_FEATURES "" +#endif + SetCPUFeatures(CA_HOST_TARGET_AARCH64_CPU, + CA_HOST_TARGET_AARCH64_FEATURES); + break; #endif +#ifdef HOST_LLVM_RISCV + case llvm::Triple::riscv32: +#ifndef CA_HOST_TARGET_RISCV32_CPU +#define CA_HOST_TARGET_RISCV32_CPU "" +#endif +#ifndef CA_HOST_TARGET_RISCV32_FEATURES +#define CA_HOST_TARGET_RISCV32_FEATURES "" +#endif + SetCPUFeatures(CA_HOST_TARGET_RISCV32_CPU, + CA_HOST_TARGET_RISCV32_FEATURES); + break; + case llvm::Triple::riscv64: +#ifndef CA_HOST_TARGET_RISCV64_CPU +#define CA_HOST_TARGET_RISCV64_CPU "" +#endif +#ifndef CA_HOST_TARGET_RISCV64_FEATURES +#define CA_HOST_TARGET_RISCV64_FEATURES "" +#endif + SetCPUFeatures(CA_HOST_TARGET_RISCV64_CPU, + CA_HOST_TARGET_RISCV64_FEATURES); + break; +#endif +#ifdef HOST_LLVM_X86 + case llvm::Triple::x86: +#ifndef CA_HOST_TARGET_X86_CPU +#define CA_HOST_TARGET_X86_CPU "" +#endif +#ifndef CA_HOST_TARGET_X86_FEATURES +#define CA_HOST_TARGET_X86_FEATURES "" +#endif + SetCPUFeatures(CA_HOST_TARGET_X86_CPU, CA_HOST_TARGET_X86_FEATURES); + break; + case llvm::Triple::x86_64: +#ifndef CA_HOST_TARGET_X86_64_CPU +#define CA_HOST_TARGET_X86_64_CPU "" +#endif +#ifndef CA_HOST_TARGET_X86_64_FEATURES +#define CA_HOST_TARGET_X86_64_FEATURES "" +#endif + SetCPUFeatures(CA_HOST_TARGET_X86_64_CPU, CA_HOST_TARGET_X86_64_FEATURES); + break; +#endif + default: + break; } -#ifndef NDEBUG - if (const char *E = 
getenv("CA_HOST_TARGET_CPU")) { - CPUName = E; +#if !defined(NDEBUG) || defined(CA_ENABLE_DEBUG_SUPPORT) + { + auto GetEnv = [](const char *Name) -> std::string { + auto *Value = std::getenv(Name); + return Value ? Value : ""; + }; + SetCPUFeatures(GetEnv("CA_HOST_TARGET_CPU"), + GetEnv("CA_HOST_TARGET_FEATURES")); + ; } #endif - if (CPUName == "native") { - CPUName = llvm::sys::getHostCPUName(); + if (CPU == "native") { + CPU = llvm::sys::getHostCPUName(); + + llvm::SubtargetFeatures NativeFeatures; + #if LLVM_VERSION_GREATER_EQUAL(19, 0) - FeatureMap = llvm::sys::getHostCPUFeatures(); + auto FeatureMap = llvm::sys::getHostCPUFeatures(); #else - FeatureMap.clear(); + llvm::StringMap FeatureMap; llvm::sys::getHostCPUFeatures(FeatureMap); #endif + for (auto &[FeatureName, IsEnabled] : FeatureMap) + NativeFeatures.AddFeature(FeatureName, IsEnabled); + + NativeFeatures.addFeaturesVector(Features.getFeatures()); + Features = std::move(NativeFeatures); } if (compiler_info->supports_deferred_compilation) { llvm::orc::JITTargetMachineBuilder TMBuilder(triple); - TMBuilder.setCPU(CPUName.str()); + TMBuilder.setCPU(CPU); TMBuilder.setCodeGenOptLevel(multi_llvm::CodeGenOptLevel::Aggressive); - for (auto &Feature : FeatureMap) { - TMBuilder.getFeatures().AddFeature(Feature.first(), Feature.second); - } + TMBuilder.getFeatures().addFeaturesVector(Features.getFeatures()); auto Builder = llvm::orc::LLJITBuilder(); Builder.setJITTargetMachineBuilder(TMBuilder); @@ -373,21 +455,8 @@ compiler::Result HostTarget::initWithBuiltins( target_machine = std::move(*TM); } else { // No JIT support so create target machine directly. 
- std::string Features; - bool first = true; - for (auto &[FeatureName, IsEnabled] : FeatureMap) { - if (first) { - first = false; - } else { - Features += ","; - } - if (IsEnabled) { - Features += '+' + FeatureName.str(); - } else { - Features += '-' + FeatureName.str(); - } - } - target_machine.reset(createTargetMachine(triple, CPUName, Features)); + target_machine.reset( + createTargetMachine(triple, CPU, Features.getString())); } return compiler::Result::SUCCESS;