From 6fed514171498cfb3499c13e70dda1b111394ac5 Mon Sep 17 00:00:00 2001
From: Colin Davidson <colin.davidson@codeplay.com>
Date: Fri, 19 Jul 2024 16:01:14 +0100
Subject: [PATCH] Add support for setting host target CPU features

Allow features to be enabled or disabled via an environment variable or a
CMake option, using the standard LLVM syntax, e.g. "+v,+zifencei".
---
 doc/developer-guide.md                        |  43 ++--
 modules/compiler/targets/host/CMakeLists.txt  |  25 +-
 .../compiler/targets/host/source/target.cpp   | 231 ++++++++++++------
 3 files changed, 195 insertions(+), 104 deletions(-)
diff --git a/doc/developer-guide.md b/doc/developer-guide.md
index 27cd48097..590b2dfbf 100644
--- a/doc/developer-guide.md
+++ b/doc/developer-guide.md
@@ -318,24 +318,28 @@ The builtin CMake options used when invoking CMake on the command line.
   use the installed OpenCL or Vulkan library. Do disable this behaviour set
   `-DCMAKE_SKIP_RPATH=ON` when configuring CMake in build directory.
 
-* `CA_HOST_TARGET_<arch>_CPU`: This option is used by the `host` target to
-  optimize for performance on a given CPU.`arch` should be a capitalized
-  version of the `host` target architecture e.g. `X86_64`, `RISCV64` or
-  `AARCH64`. If set to "native" host will optimize for the CPU being used to
-  compile it. Otherwise a CPU name can be provided, for example "skylake", but
-  be warned that this string will be passed directly to the llvm backend so make
-  sure it's a valid CPU name. Information about your host CPU can be found by
-  running`llc --version`, and a list of host CPUs supported by your installed
-  version of LLVM can be found by running`llc --march=[your-arch] --mcpu=help`.
+* `CA_HOST_TARGET_<arch>_CPU`, `CA_HOST_TARGET_<arch>_FEATURES`: These options
+  are used by the `host` target to optimize for performance on a given CPU.
+  `arch` should be a capitalized version of the `host` target architecture e.g.
+  `X86_64`, `RISCV64` or `AARCH64`.
+
+  `CPU` can be set to `native` to optimize for the CPU being used to compile.
+  Otherwise a CPU name can be provided, e.g. `skylake`. This string will be
+  passed directly to the LLVM backend; it has to be a valid CPU name. A list of
+  CPUs supported by LLVM can be found by running `clang -mcpu=help`.
+
+  `FEATURES` should be a comma-separated list of features, each preceded by
+  either `+` or `-` to enable or disable it, e.g. `+v,-zifencei`. The features
+  are the same as those accepted by the `-mattr` option of LLVM tools such as
+  `llc` and `opt`, and are applied on top of the features enabled by default.
+
+  If no `CPU` or `FEATURES` are specified, kernels will be compiled to run on
+  any CPU that meets our minimal assumptions.
   
-  Be aware that if `host` is compiled with this option set, running it on a
-  different CPU from the one specified (or the one compiled with if "native" was
-  specified) isn't supported and bad things may happen. When the oneAPI
-  Construction Kit is built in debug mode, the environment variable
-  `CA_HOST_TARGET_CPU` is also respected across all `host` targets, which can
-  help track down codegen differences among different machine targets. The
-  caveats above apply, and this may result in an illegal instruction crash if
-  your CPU doesn't support the generated instructions.
+  Beware that if `host` is compiled with either of these options set, running
+  kernels on a CPU that is not compatible with the one specified (or the one
+  compiled with if `native` was specified) is not supported and may result in
+  attempts to execute instructions not supported by that CPU.
 
 * `CA_USE_SPLIT_DWARF`: When building with gcc, enable split dwarf debuginfo.
   This significantly reduces binary size (especially when static linking) and
@@ -1116,6 +1120,11 @@ options without having to modify the source.
   be used.
 * `CA_HOST_NUM_THREADS`: Sets the maximum number of threads the `host` device
   will create. `host` may create fewer threads than this value.
+* `CA_HOST_TARGET_CPU`, `CA_HOST_TARGET_FEATURES`: These environment variables
+  can be used in debug builds to override the default CPU and features. They
+  behave the same way as the `CA_HOST_TARGET_<arch>_CPU` and
+  `CA_HOST_TARGET_<arch>_FEATURES` CMake options, and the same caveats about
+  running kernels on an incompatible CPU apply here.
 
 ## Debugging the LLVM compiler
 
diff --git a/modules/compiler/targets/host/CMakeLists.txt b/modules/compiler/targets/host/CMakeLists.txt
index 73645f1a4..71b7539a6 100644
--- a/modules/compiler/targets/host/CMakeLists.txt
+++ b/modules/compiler/targets/host/CMakeLists.txt
@@ -188,7 +188,13 @@ if(CA_HOST_CROSS_COMPILERS)
     target_compile_definitions(compiler-host PRIVATE
     CA_HOST_TARGET_${HOST_ARCH_UPPER}_CPU="${CA_HOST_TARGET_${HOST_ARCH_UPPER}_CPU}")
   endif()
-
+  ca_option(CA_HOST_TARGET_${HOST_ARCH_UPPER}_FEATURES STRING
+    "Feature list that host ${HOST_ARCH_UPPER} should enable or disable as a comma separated + or - list e.g. '+v,+zfh'" "")
+  if(CA_HOST_TARGET_${HOST_ARCH_UPPER}_FEATURES)
+    message(STATUS "Features ${HOST_ARCH_UPPER} name ${CA_HOST_TARGET_${HOST_ARCH_UPPER}_FEATURES}")
+    target_compile_definitions(compiler-host PRIVATE
+    CA_HOST_TARGET_${HOST_ARCH_UPPER}_FEATURES="${CA_HOST_TARGET_${HOST_ARCH_UPPER}_FEATURES}")
+  endif()
   if(hostCrossCompilers)
     # Validate the user specified cross compiler list.
     foreach(CrossCompiler ${hostCrossCompilers})
@@ -225,11 +231,18 @@ if(CA_HOST_CROSS_COMPILERS)
         HOST_CROSS_DEVICE_NAME_${CROSS_COMPILER}="${crossDeviceName}")
         ca_option(CA_HOST_TARGET_${CROSS_COMPILER}_CPU STRING
         "Name of the CPU that host ${CROSS_COMPILER} should optimize for, or 'native'" "")  
-          if(CA_HOST_TARGET_${CROSS_COMPILER}_CPU)
-            message(STATUS "CPU ${CROSS_COMPILER} name ${CA_HOST_TARGET_${CROSS_COMPILER}_CPU}")
-            target_compile_definitions(compiler-host PRIVATE
-            CA_HOST_TARGET_${CROSS_COMPILER}_CPU="${CA_HOST_TARGET_${CROSS_COMPILER}_CPU}")
-          endif()
+        if(CA_HOST_TARGET_${CROSS_COMPILER}_CPU)
+          message(STATUS "CPU ${CROSS_COMPILER} name ${CA_HOST_TARGET_${CROSS_COMPILER}_CPU}")
+          target_compile_definitions(compiler-host PRIVATE
+          CA_HOST_TARGET_${CROSS_COMPILER}_CPU="${CA_HOST_TARGET_${CROSS_COMPILER}_CPU}")
+        endif()
+        ca_option(CA_HOST_TARGET_${CROSS_COMPILER}_FEATURES STRING
+          "Feature list that host ${CROSS_COMPILER} should enable or disable as a comma separated + or - list e.g. '+v,+zfh'" "")
+        if(CA_HOST_TARGET_${CROSS_COMPILER}_FEATURES)
+          message(STATUS "Features ${CROSS_COMPILER} name ${CA_HOST_TARGET_${CROSS_COMPILER}_FEATURES}")
+          target_compile_definitions(compiler-host PRIVATE
+          CA_HOST_TARGET_${CROSS_COMPILER}_FEATURES="${CA_HOST_TARGET_${CROSS_COMPILER}_FEATURES}")
+        endif()
     endforeach()
   endif()
 endif()
diff --git a/modules/compiler/targets/host/source/target.cpp b/modules/compiler/targets/host/source/target.cpp
index 17e6f7d69..74e99a56c 100644
--- a/modules/compiler/targets/host/source/target.cpp
+++ b/modules/compiler/targets/host/source/target.cpp
@@ -33,6 +33,7 @@
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Target/TargetMachine.h>
 #include <llvm/Target/TargetOptions.h>
+#include <llvm/TargetParser/SubtargetFeature.h>
 #include <multi_llvm/multi_llvm.h>
 
 #if LLVM_VERSION_GREATER_EQUAL(18, 0)
@@ -234,95 +235,176 @@ compiler::Result HostTarget::initWithBuiltins(
       break;
   }
 
-  llvm::StringRef CPUName = "";
-  llvm::StringMap<bool> FeatureMap;
-
-  if (llvm::Triple::arm == triple.getArch()) {
-    FeatureMap["strict-align"] = true;
-    // We do not support denormals for single precision floating points, but we
-    // do for double precision. To support that we use neon (which is FTZ) for
-    // single precision floating points, and use the VFP with denormal support
-    // enabled for doubles. The neonfp feature enables the use of neon for
-    // single precision floating points.
-    FeatureMap["neonfp"] = true;
-    FeatureMap["neon"] = true;
-    // Hardware division instructions might not exist on all ARMv7 CPUs, but
-    // they probably exist on all the ones we might care about.
-    FeatureMap["hwdiv"] = true;
-    FeatureMap["hwdiv-arm"] = true;
-    if (host_device_info.half_capabilities) {
-      FeatureMap["fp16"] = true;
-    }
-#if defined(CA_HOST_TARGET_ARM_CPU)
-    CPUName = CA_HOST_TARGET_ARM_CPU;
+  std::string CPU;
+  llvm::SubtargetFeatures Features;
+
+  switch (triple.getArch()) {
+#ifdef HOST_LLVM_ARM
+    case llvm::Triple::arm:
+      // We do not support denormals for single precision floating points, but
+      // we do for double precision. To support that we use neon (which is FTZ)
+      // for single precision floating points, and use the VFP with denormal
+      // support enabled for doubles. The neonfp feature enables the use of neon
+      // for single precision floating points.
+      Features.AddFeature("strict-align", true);
+      Features.AddFeature("neonfp", true);
+      Features.AddFeature("neon", true);
+      // Hardware division instructions might not exist on all ARMv7 CPUs, but
+      // they probably exist on all the ones we might care about.
+      Features.AddFeature("hwdiv", true);
+      Features.AddFeature("hwdiv-arm", true);
+      if (host_device_info.half_capabilities) {
+        Features.AddFeature("fp16", true);
+      }
+      break;
+#endif
+#ifdef HOST_LLVM_RISCV
+    case llvm::Triple::riscv32:
+    case llvm::Triple::riscv64:
+      CPU = triple.getArch() == llvm::Triple::riscv32 ? "generic-rv32"
+                                                      : "generic-rv64";
+      // The following features are important for OpenCL, and generally
+      // constitute a minimum requirement for non-embedded profile. Without
+      // these features, we'd need compiler-rt support. Atomics are absolutely
+      // essential.
+      Features.AddFeature("m", true);  // Integer multiplication and division
+      Features.AddFeature("f", true);  // Floating point support
+      Features.AddFeature("a", true);  // Atomics
+#if defined(CA_HOST_ENABLE_FP64)
+      Features.AddFeature("d", true);  // Double support
 #endif
-  } else if (llvm::Triple::aarch64 == triple.getArch()) {
-#if defined(CA_HOST_TARGET_AARCH64_CPU)
-    CPUName = CA_HOST_TARGET_AARCH64_CPU;
+#if defined(CA_HOST_ENABLE_FP16)
+      Features.AddFeature("zfh", true);  // Half support
 #endif
-  } else if (triple.isX86()) {
-    CPUName = "x86-64-v3";  // Default only, may be overridden below.
-    if (triple.isArch32Bit()) {
-#if defined(CA_HOST_TARGET_X86_CPU)
-      CPUName = CA_HOST_TARGET_X86_CPU;
+      break;
 #endif
-    } else {
-#if defined(CA_HOST_TARGET_X86_64_CPU)
-      CPUName = CA_HOST_TARGET_X86_64_CPU;
+#ifdef HOST_LLVM_X86
+    case llvm::Triple::x86:
+    case llvm::Triple::x86_64:
+      CPU = "x86-64-v3";
+      break;
 #endif
+    default:
+      break;
+  }
+
+  auto SetCPUFeatures = [&](std::string NewCPU, std::string NewFeatures) {
+    if (!NewCPU.empty()) {
+      CPU = NewCPU;
+      Features = llvm::SubtargetFeatures();
+    }
+    if (!NewFeatures.empty()) {
+      std::vector<std::string> NewFeatureVector;
+      llvm::SubtargetFeatures::Split(NewFeatureVector, NewFeatures);
+      for (auto &NewFeature : NewFeatureVector) Features.AddFeature(NewFeature);
     }
-  } else if (triple.isRISCV()) {
-    // These are reasonable defaults, which has been used for various RISC-V
-    // target so far. We should allow overriding of the ABI in the future
-    if (triple.isArch32Bit()) {
-      CPUName = "generic-rv32";
-#if defined(CA_HOST_TARGET_RISCV32_CPU)
-      CPUName = CA_HOST_TARGET_RISCV32_CPU;
+  };
+
+  switch (triple.getArch()) {
+#ifdef HOST_LLVM_ARM
+    case llvm::Triple::arm:
+#ifndef CA_HOST_TARGET_ARM_CPU
+#define CA_HOST_TARGET_ARM_CPU ""
 #endif
-    } else {
-      CPUName = "generic-rv64";
-#if defined(CA_HOST_TARGET_RISCV64_CPU)
-      CPUName = CA_HOST_TARGET_RISCV64_CPU;
+#ifndef CA_HOST_TARGET_ARM_FEATURES
+#define CA_HOST_TARGET_ARM_FEATURES ""
 #endif
-    }
-    // The following features are important for OpenCL, and generally constitute
-    // a minimum requirement for non-embedded profile. Without these features,
-    // we'd need compiler-rt support. Atomics are absolutely essential.
-    // TODO: Allow overriding of the input features.
-    FeatureMap["m"] = true;  // Integer multiplication and division
-    FeatureMap["f"] = true;  // Floating point support
-    FeatureMap["a"] = true;  // Atomics
-#if defined(CA_HOST_ENABLE_FP64)
-    FeatureMap["d"] = true;  // Double support
+      SetCPUFeatures(CA_HOST_TARGET_ARM_CPU, CA_HOST_TARGET_ARM_FEATURES);
+      break;
 #endif
-#if defined(CA_HOST_ENABLE_FP16)
-    FeatureMap["zfh"] = true;  // Half support
+#ifdef HOST_LLVM_AARCH64
+    case llvm::Triple::aarch64:
+#ifndef CA_HOST_TARGET_AARCH64_CPU
+#define CA_HOST_TARGET_AARCH64_CPU ""
+#endif
+#ifndef CA_HOST_TARGET_AARCH64_FEATURES
+#define CA_HOST_TARGET_AARCH64_FEATURES ""
+#endif
+      SetCPUFeatures(CA_HOST_TARGET_AARCH64_CPU,
+                     CA_HOST_TARGET_AARCH64_FEATURES);
+      break;
 #endif
+#ifdef HOST_LLVM_RISCV
+    case llvm::Triple::riscv32:
+#ifndef CA_HOST_TARGET_RISCV32_CPU
+#define CA_HOST_TARGET_RISCV32_CPU ""
+#endif
+#ifndef CA_HOST_TARGET_RISCV32_FEATURES
+#define CA_HOST_TARGET_RISCV32_FEATURES ""
+#endif
+      SetCPUFeatures(CA_HOST_TARGET_RISCV32_CPU,
+                     CA_HOST_TARGET_RISCV32_FEATURES);
+      break;
+    case llvm::Triple::riscv64:
+#ifndef CA_HOST_TARGET_RISCV64_CPU
+#define CA_HOST_TARGET_RISCV64_CPU ""
+#endif
+#ifndef CA_HOST_TARGET_RISCV64_FEATURES
+#define CA_HOST_TARGET_RISCV64_FEATURES ""
+#endif
+      SetCPUFeatures(CA_HOST_TARGET_RISCV64_CPU,
+                     CA_HOST_TARGET_RISCV64_FEATURES);
+      break;
+#endif
+#ifdef HOST_LLVM_X86
+    case llvm::Triple::x86:
+#ifndef CA_HOST_TARGET_X86_CPU
+#define CA_HOST_TARGET_X86_CPU ""
+#endif
+#ifndef CA_HOST_TARGET_X86_FEATURES
+#define CA_HOST_TARGET_X86_FEATURES ""
+#endif
+      SetCPUFeatures(CA_HOST_TARGET_X86_CPU, CA_HOST_TARGET_X86_FEATURES);
+      break;
+    case llvm::Triple::x86_64:
+#ifndef CA_HOST_TARGET_X86_64_CPU
+#define CA_HOST_TARGET_X86_64_CPU ""
+#endif
+#ifndef CA_HOST_TARGET_X86_64_FEATURES
+#define CA_HOST_TARGET_X86_64_FEATURES ""
+#endif
+      SetCPUFeatures(CA_HOST_TARGET_X86_64_CPU, CA_HOST_TARGET_X86_64_FEATURES);
+      break;
+#endif
+    default:
+      break;
   }
 
-#ifndef NDEBUG
-  if (const char *E = getenv("CA_HOST_TARGET_CPU")) {
-    CPUName = E;
+#if !defined(NDEBUG) || defined(CA_ENABLE_DEBUG_SUPPORT)
+  {
+    // Debug-only overrides: an unset variable reads as "" and is ignored.
+    auto GetEnv = [](const char *Name) -> std::string {
+      const char *Value = std::getenv(Name);
+      return Value ? Value : "";
+    };
+    SetCPUFeatures(GetEnv("CA_HOST_TARGET_CPU"),
+                   GetEnv("CA_HOST_TARGET_FEATURES"));
+  }
 #endif
 
-  if (CPUName == "native") {
-    CPUName = llvm::sys::getHostCPUName();
+  if (CPU == "native") {
+    CPU = llvm::sys::getHostCPUName();
+
+    llvm::SubtargetFeatures NativeFeatures;
+
 #if LLVM_VERSION_GREATER_EQUAL(19, 0)
-    FeatureMap = llvm::sys::getHostCPUFeatures();
+    auto FeatureMap = llvm::sys::getHostCPUFeatures();
 #else
-    FeatureMap.clear();
+    llvm::StringMap<bool> FeatureMap;
     llvm::sys::getHostCPUFeatures(FeatureMap);
 #endif
+    for (auto &[FeatureName, IsEnabled] : FeatureMap)
+      NativeFeatures.AddFeature(FeatureName, IsEnabled);
+
+    NativeFeatures.addFeaturesVector(Features.getFeatures());
+    Features = std::move(NativeFeatures);
   }
 
   if (compiler_info->supports_deferred_compilation) {
     llvm::orc::JITTargetMachineBuilder TMBuilder(triple);
-    TMBuilder.setCPU(CPUName.str());
+    TMBuilder.setCPU(CPU);
     TMBuilder.setCodeGenOptLevel(multi_llvm::CodeGenOptLevel::Aggressive);
-    for (auto &Feature : FeatureMap) {
-      TMBuilder.getFeatures().AddFeature(Feature.first(), Feature.second);
-    }
+    TMBuilder.getFeatures().addFeaturesVector(Features.getFeatures());
     auto Builder = llvm::orc::LLJITBuilder();
 
     Builder.setJITTargetMachineBuilder(TMBuilder);
@@ -373,21 +455,8 @@ compiler::Result HostTarget::initWithBuiltins(
     target_machine = std::move(*TM);
   } else {
     // No JIT support so create target machine directly.
-    std::string Features;
-    bool first = true;
-    for (auto &[FeatureName, IsEnabled] : FeatureMap) {
-      if (first) {
-        first = false;
-      } else {
-        Features += ",";
-      }
-      if (IsEnabled) {
-        Features += '+' + FeatureName.str();
-      } else {
-        Features += '-' + FeatureName.str();
-      }
-    }
-    target_machine.reset(createTargetMachine(triple, CPUName, Features));
+    target_machine.reset(
+        createTargetMachine(triple, CPU, Features.getString()));
   }
 
   return compiler::Result::SUCCESS;