Merge pull request #447 from bluescarni/pr/new_multi_segment
Tweak parjit splitting logic
bluescarni authored Sep 4, 2024
2 parents 57feb7d + 5d645c6 commit 12609d7
Showing 7 changed files with 82 additions and 73 deletions.
1 change: 1 addition & 0 deletions .github/workflows/gha_ci.yml
Original file line number Diff line number Diff line change
@@ -45,6 +45,7 @@ jobs:
cmake ../ -G "Visual Studio 17 2022" -A x64 -DHEYOKA_BUILD_TESTS=yes -DHEYOKA_WITH_MPPP=yes -DHEYOKA_BUILD_TUTORIALS=ON -DHEYOKA_ENABLE_IPO=yes -DHEYOKA_WITH_SLEEF=yes
cmake --build . --config Release -j2
copy Release\heyoka.dll test\Release\
ctest -j4 -V -C Release
conda_release_static:
runs-on: ubuntu-latest
steps:
14 changes: 9 additions & 5 deletions doc/known_issues.rst
@@ -18,14 +18,18 @@ Unsolved

The root cause is most likely a code-generation/optimisation problem in LLVM.
This issue is currently under investigation.
* The parallel compilation feature (added in heyoka 6.0.0) is currently disabled
by default on 64-bit ARM processors (this includes the Apple M1 and its successors).
* The parallel compilation feature (added in heyoka 6.0.0) is currently turned
off by default on all platforms and completely disabled on Windows.
The reason is a likely thread scheduling bug in LLVM's parallel compilation facilities
that very rarely results in a multiply-defined symbol, which ultimately leads to compilation
failure. The issue is currently under investigation by the LLVM developers. In the
meantime, you can explicitly turn on parallel compilation via the ``kw::parjit``
which, on Unix systems, rarely results in a multiply-defined symbol, ultimately leading to a compilation
failure. On Windows, parallel compilation under heavy loads results in segmentation faults.
The issue is currently under investigation by the LLVM developers. In the
meantime, you can explicitly turn on parallel compilation on Unix systems via the ``kw::parjit``
:ref:`keyword argument <kwargs>` when constructing an integrator or a compiled
function.
* The option for selecting the code model used for JIT compilation
(added in heyoka 6.0.0) is currently disabled on Windows due to what
looks like an LLVM bug. The issue is currently under investigation.

Solved
======
23 changes: 12 additions & 11 deletions include/heyoka/llvm_state.hpp
@@ -348,17 +348,18 @@ std::optional<llvm_mc_value> llvm_state_mem_cache_lookup(const std::vector<std::
void llvm_state_mem_cache_try_insert(std::vector<std::string>, unsigned, llvm_mc_value);

// The default setting for the parjit flag for llvm_multi_state.
// There is evidence of an LLVM thread scheduling bug when parallel compilation
// is active, that rarely results in multiply-defined symbols for external C
// functions, which leads to compilation failure. So far, we have been able to
// trigger this issue only on 64-bit arm.
inline constexpr bool default_parjit =
#if defined(HEYOKA_ARCH_ARM)
false
#else
true
#endif
;
//
// At this time, it seems like parallel compilation in lljit is buggy:
//
// - on Unix platforms, parallel compilation occasionally results in
// multiply-defined symbols for external C functions, which leads to
// compilation failures;
// - on Windows, it seems like parallel compilation outright results in
// segmentation faults under heavy load.
//
// The root of the problem seems to be a concurrency issue. Thus, for the time
// being, let us just disable parallel compilation by default.
inline constexpr bool default_parjit = false;

} // namespace detail

27 changes: 10 additions & 17 deletions src/expression_cfunc.cpp
@@ -1697,29 +1697,22 @@ void multi_cfunc_evaluate_segments(llvm::Type *main_fp_t, std::list<llvm_state>
cur_state->builder().SetInsertPoint(
llvm::BasicBlock::Create(cur_state->context(), "entry", make_driver_proto(*cur_state, cur_state_idx)));

// Variable to keep track of how many blocks have been codegenned
// in the current state.
boost::safe_numerics::safe<unsigned> n_cg_blocks = 0;
// Variable to keep track of how many evaluation functions have
// been invoked in the current state.
boost::safe_numerics::safe<std::size_t> n_evalf = 0;

// Limit of codegenned blocks per state.
// Limit of function evaluations per state.
// NOTE: this has not been really properly tuned,
// needs more investigation.
// NOTE: it would probably be better here to keep track of the
// total number of function calls per segment, rather than
// the number of blocks. The reason for this is that each
// function call in principle increases the size of the
// auxiliary global arrays used by the compact mode
// argument generators, which in turn increases the code
// generation time.
constexpr auto max_n_cg_blocks = 20u;
constexpr auto max_n_evalf = 100u;

// Variable to keep track of the u variable
// on whose definition we are operating.
auto cur_u_idx = nvars;

// Iterate over the segments in s_dc.
for (const auto &seg : s_dc) {
if (n_cg_blocks > max_n_cg_blocks) {
if (n_evalf > max_n_evalf) {
// We have invoked enough evaluation functions in this state. Create the return
// value for the current driver, and move to the next one.
cur_state->builder().CreateRetVoid();
@@ -1729,7 +1722,7 @@
cur_state = &states.back();

// Reset/update the counters.
n_cg_blocks = 0;
n_evalf = 0;
++cur_state_idx;

// Add the driver declaration to the main state, and invoke it.
@@ -1898,6 +1891,9 @@ void multi_cfunc_evaluate_segments(llvm::Type *main_fp_t, std::list<llvm_state>
assert(std::ranges::all_of(gens, [](const auto &f) { return static_cast<bool>(f); }));
// LCOV_EXCL_STOP

// Update the number of invoked evaluation functions.
n_evalf += ncalls;

// We will be manually unrolling loops if ncalls is small enough.
// This seems to help with compilation times.
constexpr auto max_unroll_n = 5u;
@@ -1942,9 +1938,6 @@ void multi_cfunc_evaluate_segments(llvm::Type *main_fp_t, std::list<llvm_state>
}
}

// Update the number of codegenned blocks.
n_cg_blocks += seg_map.size();

// LCOV_EXCL_START
// Update segment_bd if needed.
if (is_tracing) {
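The change above replaces the per-block counter with a counter of invoked evaluation functions: segments are appended to the current state until the count exceeds a threshold, at which point a new state (module) is started. A minimal standalone sketch of that splitting policy, with a hypothetical `split_segments` helper and a plain vector of per-segment call counts standing in for the decomposition:

```cpp
#include <cstddef>
#include <vector>

// Group segment indices into "states": start a new state once the number of
// evaluation function invocations in the current one exceeds max_n_evalf.
// (Illustrative sketch only; heyoka's real code works on decomposition
// segments and llvm_state objects.)
std::vector<std::vector<std::size_t>> split_segments(const std::vector<std::size_t> &ncalls_per_seg,
                                                     std::size_t max_n_evalf)
{
    std::vector<std::vector<std::size_t>> states(1);
    std::size_t n_evalf = 0;

    for (std::size_t i = 0; i < ncalls_per_seg.size(); ++i) {
        if (n_evalf > max_n_evalf) {
            // Enough evaluation functions in the current state: open a new one.
            states.emplace_back();
            n_evalf = 0;
        }

        // Codegen the current segment into the current state.
        states.back().push_back(i);
        n_evalf += ncalls_per_seg[i];
    }

    return states;
}
```

Note that, as in the patched code, the check happens *before* a segment is added, so a single segment can push a state past the limit without being split itself.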
14 changes: 10 additions & 4 deletions src/llvm_state.cpp
@@ -309,17 +309,17 @@ llvm::orc::JITTargetMachineBuilder create_jit_tmb(unsigned opt_level, code_model

// LCOV_EXCL_START

#if LLVM_VERSION_MAJOR >= 17

// NOTE: the code model setup is working only on LLVM>=19 (or at least
// LLVM 18 + patches, as in the conda-forge LLVM package), due to this bug:
//
// https://github.com/llvm/llvm-project/issues/88115
//
// Additionally, there are indications from our CI that attempting to set
// the code model before LLVM 17 might just be buggy, as we see widespread
// the code model before LLVM 17 or on Windows might just be buggy, as we see widespread
// ASAN failures all over the place. Thus, let us not do anything with the code
// model setting before LLVM 17.
// model setting before LLVM 17 or on Windows.

#if LLVM_VERSION_MAJOR >= 17 && !defined(_WIN32)

// Setup the code model.
switch (c_model) {
@@ -1667,11 +1667,17 @@ multi_jit::multi_jit(unsigned n_modules, unsigned opt_level, code_model c_model,

#else

// NOTE: never enable parallel compilation on Windows due to
// segfaults under heavy load.
#if !defined(_WIN32)

if (m_parjit) {
// Set the number of compilation threads.
lljit_builder.setNumCompileThreads(std::thread::hardware_concurrency());
}

#endif

#endif

// Create the jit.
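The preprocessor gate above keeps `setNumCompileThreads()` out of Windows builds entirely. The decision logic can be summarised by a small helper (`pick_compile_threads` is a hypothetical name for illustration, not part of heyoka):

```cpp
#include <thread>

// Decide how many JIT compilation threads to request.
// Sketch of the gating in multi_jit's constructor, under the assumption
// that 1 means "no parallel compilation".
unsigned pick_compile_threads(bool parjit)
{
#if defined(_WIN32)
    // Parallel compilation is disabled outright on Windows
    // (segfaults under heavy load).
    (void)parjit;
    return 1u;
#else
    if (!parjit) {
        return 1u;
    }

    // hardware_concurrency() may return 0 if the value is not computable.
    const auto hc = std::thread::hardware_concurrency();
    return hc == 0u ? 1u : hc;
#endif
}
```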
35 changes: 17 additions & 18 deletions src/taylor_02.cpp
@@ -17,6 +17,7 @@
#include <limits>
#include <list>
#include <map>
#include <numeric>
#include <ranges>
#include <stdexcept>
#include <type_traits>
@@ -1042,28 +1043,25 @@ std::vector<llvm_state> taylor_compute_jet_multi(llvm_state &main_state, llvm::T
cur_state->builder().SetInsertPoint(llvm::BasicBlock::Create(
cur_state->context(), "entry", taylor_cm_make_driver_proto(*cur_state, cur_state_idx)));

// Variable to keep track of how many blocks have been codegenned
// in the current state.
boost::safe_numerics::safe<unsigned> n_cg_blocks = 0;
// Variable to keep track of how many evaluation functions have
// been invoked in the current state.
boost::safe_numerics::safe<std::size_t> n_evalf = 0;

// Limit of codegenned blocks per state.
// Limit of function evaluations per state.
// NOTE: this has not been really properly tuned,
// needs more investigation.
// NOTE: it would probably be better here to keep track of the
// total number of function calls per segment, rather than
// the number of blocks. The reason for this is that each
// function call in principle increases the size of the
// auxiliary global arrays used by the compact mode
// argument generators, which in turn increases the code
// generation time.
constexpr auto max_n_cg_blocks = 20u;
// needs more investigation. In any case, this should
// be smaller than the corresponding limit in cfunc
// because here there is typically more work per function
// evaluation (as each function evaluation implements
// an AD formula).
constexpr auto max_n_evalf = 20u;

// Variable to keep track of the index of the first u variable
// in a segment.
auto start_u_idx = n_eq;

// Helper to finalise the current driver function and create a new one.
auto start_new_driver = [&cur_state, &states, &main_state, &n_cg_blocks, &cur_state_idx, &main_driver_decls]() {
auto start_new_driver = [&cur_state, &states, &main_state, &n_evalf, &cur_state_idx, &main_driver_decls]() {
// Finalise the current driver.
cur_state->builder().CreateRetVoid();

@@ -1072,7 +1070,7 @@
cur_state = &states.back();

// Reset/update the counters.
n_cg_blocks = 0;
n_evalf = 0;
++cur_state_idx;

// Add the driver declaration to the main state.
@@ -1100,7 +1098,7 @@ std::vector<llvm_state> taylor_compute_jet_multi(llvm_state &main_state, llvm::T
// of the sv funcs.
const auto is_svf_seg = need_svf_lo && max_svf_idx >= start_u_idx && max_svf_idx < (start_u_idx + seg_n_ex);

if (n_cg_blocks > max_n_cg_blocks || is_svf_seg) {
if (n_evalf > max_n_evalf || is_svf_seg) {
// Either we have invoked enough evaluation functions for this state, or we are
// in the max_svf_idx state. Finalise the current driver and start the new one.
start_new_driver();
@@ -1119,8 +1117,9 @@ std::vector<llvm_state> taylor_compute_jet_multi(llvm_state &main_state, llvm::T
const auto seg_map = taylor_cm_codegen_segment_diff(seg, start_u_idx, *cur_state, fp_t, batch_size, n_uvars,
high_accuracy, parallel_mode);

// Update the number of codegenned blocks.
n_cg_blocks += seg_map.size();
// Update the number of invoked evaluation functions.
n_evalf = std::accumulate(seg_map.begin(), seg_map.end(), n_evalf,
[](auto a, const auto &p) { return a + p.second.first; });

// Update start_u_idx.
start_u_idx += seg_n_ex;
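The `std::accumulate` call above folds the per-function invocation counts stored in `seg_map` into the running `n_evalf` counter. A hedged standalone illustration of the same fold (the key and value types of the map are placeholders chosen for the example, not heyoka's actual `seg_map` type):

```cpp
#include <cstddef>
#include <map>
#include <numeric>
#include <string>
#include <utility>

// Assumed shape for illustration: each entry maps a function name to a pair
// whose first member is the number of times the function is invoked
// in the segment.
using seg_map_t = std::map<std::string, std::pair<std::size_t, int>>;

// Add the invocation counts of all functions in seg_map to n_evalf.
std::size_t count_evalf(const seg_map_t &seg_map, std::size_t n_evalf)
{
    return std::accumulate(seg_map.begin(), seg_map.end(), n_evalf,
                           [](std::size_t a, const auto &p) { return a + p.second.first; });
}
```

Summing invocations (rather than counting blocks) better tracks compilation cost, since each call grows the auxiliary global arrays used by the compact mode argument generators.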
41 changes: 23 additions & 18 deletions test/llvm_helpers.cpp
@@ -20,7 +20,6 @@
#include <vector>

#include <boost/math/constants/constants.hpp>
#include <boost/multiprecision/cpp_bin_float.hpp>

#include <fmt/format.h>

@@ -1671,16 +1670,18 @@ TEST_CASE("eft_product scalar")

REQUIRE(x == a * b);

#if defined(HEYOKA_HAVE_REAL)
#if defined(HEYOKA_HAVE_REAL128)
if constexpr (!std::is_same_v<fp_t, mppp::real128>) {
#endif
namespace bmp = boost::multiprecision;
using mp_fp_t
= bmp::number<bmp::cpp_bin_float<std::numeric_limits<fp_t>::digits * 2, bmp::digit_base_2>>;

REQUIRE(mp_fp_t(x) + mp_fp_t(y) == mp_fp_t(a) * mp_fp_t(b));
using mp_fp_t = mppp::real;
const auto prec = std::numeric_limits<fp_t>::digits * 2;

REQUIRE(mp_fp_t(x, prec) + mp_fp_t(y, prec) == mp_fp_t(a, prec) * mp_fp_t(b, prec));
#if defined(HEYOKA_HAVE_REAL128)
}
#endif
#endif
}
}
@@ -1759,16 +1760,17 @@ TEST_CASE("eft_product batch")

REQUIRE(xv == a * b);

#if defined(HEYOKA_HAVE_REAL)
#if defined(HEYOKA_HAVE_REAL128)
if constexpr (!std::is_same_v<fp_t, mppp::real128>) {
#endif
namespace bmp = boost::multiprecision;
using mp_fp_t = bmp::number<
bmp::cpp_bin_float<std::numeric_limits<fp_t>::digits * 2, bmp::digit_base_2>>;
using mp_fp_t = mppp::real;
const auto prec = std::numeric_limits<fp_t>::digits * 2;

REQUIRE(mp_fp_t(xv) + mp_fp_t(yv) == mp_fp_t(a) * mp_fp_t(b));
REQUIRE(mp_fp_t(xv, prec) + mp_fp_t(yv, prec) == mp_fp_t(a, prec) * mp_fp_t(b, prec));
#if defined(HEYOKA_HAVE_REAL128)
}
#endif
#endif
}
}
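These tests verify an error-free transformation (EFT) of the product: the returned pair `(hi, lo)` satisfies `hi + lo == a * b` exactly, which is why the check is performed at twice the working precision. A minimal double-precision sketch of the classic fma-based EFT (a standard construction, not heyoka's internal implementation):

```cpp
#include <cmath>

struct eft_prod_res {
    double hi, lo;
};

// Error-free transformation of a product: hi is the rounded product,
// lo is the exact rounding error, so that hi + lo == a * b exactly
// (barring overflow/underflow).
eft_prod_res eft_product(double a, double b)
{
    const double hi = a * b;
    // fma evaluates a * b - hi with a single rounding, which recovers
    // the rounding error of the product exactly.
    const double lo = std::fma(a, b, -hi);
    return {hi, lo};
}
```

For example, squaring `1 + 2^-52` yields an exact product `1 + 2^-51 + 2^-104` whose low bit cannot fit in a double; the EFT captures it in `lo`.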
@@ -2526,12 +2528,12 @@ TEST_CASE("dl modulus scalar")
auto f_ptr
= reinterpret_cast<void (*)(fp_t *, fp_t *, fp_t, fp_t, fp_t, fp_t)>(s.jit_lookup("hey_dl_modulus"));

#if defined(HEYOKA_HAVE_REAL)
#if defined(HEYOKA_HAVE_REAL128)
if constexpr (!std::is_same_v<fp_t, mppp::real128>) {
#endif
namespace bmp = boost::multiprecision;
using mp_fp_t
= bmp::number<bmp::cpp_bin_float<std::numeric_limits<fp_t>::digits * 2, bmp::digit_base_2>>;
using mp_fp_t = mppp::real;
const auto prec = std::numeric_limits<fp_t>::digits * 2;

std::uniform_real_distribution<fp_t> op_dist(fp_t(-1e6), fp_t(1e6)), quo_dist(fp_t(.1), fp_t(10.));

@@ -2542,13 +2544,14 @@ TEST_CASE("dl modulus scalar")

f_ptr(&res_hi, &res_lo, x, 0, y, 0);

auto res_mp = mp_fp_t(x) - mp_fp_t(y) * floor(mp_fp_t(x) / mp_fp_t(y));
auto res_mp = mp_fp_t(x, prec) - mp_fp_t(y, prec) * floor(mp_fp_t(x, prec) / mp_fp_t(y, prec));

REQUIRE(res_hi == approximately(static_cast<fp_t>(res_mp), fp_t(10)));
}

#if defined(HEYOKA_HAVE_REAL128)
}
#endif
#endif
}
};
@@ -2608,12 +2611,12 @@ TEST_CASE("dl modulus batch")
auto f_ptr = reinterpret_cast<void (*)(fp_t *, fp_t *, fp_t *, fp_t *, fp_t *, fp_t *)>(
s.jit_lookup("hey_dl_modulus"));

#if defined(HEYOKA_HAVE_REAL)
#if defined(HEYOKA_HAVE_REAL128)
if constexpr (!std::is_same_v<fp_t, mppp::real128>) {
#endif
namespace bmp = boost::multiprecision;
using mp_fp_t
= bmp::number<bmp::cpp_bin_float<std::numeric_limits<fp_t>::digits * 2, bmp::digit_base_2>>;
using mp_fp_t = mppp::real;
const auto prec = std::numeric_limits<fp_t>::digits * 2;

std::uniform_real_distribution<fp_t> op_dist(fp_t(-1e6), fp_t(1e6)), quo_dist(fp_t(.1), fp_t(10.));

@@ -2634,15 +2637,17 @@ TEST_CASE("dl modulus batch")
b_lo_vec.data());

for (auto i = 0u; i < batch_size; ++i) {
auto res_mp = mp_fp_t(a_hi_vec[i])
- mp_fp_t(b_hi_vec[i]) * floor(mp_fp_t(a_hi_vec[i]) / mp_fp_t(b_hi_vec[i]));
auto res_mp = mp_fp_t(a_hi_vec[i], prec)
- mp_fp_t(b_hi_vec[i], prec)
* floor(mp_fp_t(a_hi_vec[i], prec) / mp_fp_t(b_hi_vec[i], prec));

REQUIRE(x_vec[i] == approximately(static_cast<fp_t>(res_mp), fp_t(10)));
}
}

#if defined(HEYOKA_HAVE_REAL128)
}
#endif
#endif
}
}
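The reference value computed in the modulus tests is the floored modulus `x - y * floor(x / y)`, which for positive `y` always lies in `[0, y)` (unlike `std::fmod`, whose result takes the sign of `x`). A plain double-precision version of that reference, for illustration only (the tests evaluate it in extended precision via `mppp::real`):

```cpp
#include <cmath>

// Floored modulus: x - y * floor(x / y).
// For y > 0 the result is in [0, y), even for negative x.
double floored_mod(double x, double y)
{
    return x - y * std::floor(x / y);
}
```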
