From 998d03b9d316e7f298afc1f19ed4bc72337ee539 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Mon, 2 Sep 2024 10:15:06 +0200
Subject: [PATCH 01/10] Tweak the way large cfuncs/taylor integrators are split
 in multiple LLVM modules.

---
 src/expression_cfunc.cpp | 27 ++++++++++-----------------
 src/taylor_02.cpp        | 35 +++++++++++++++++------------------
 2 files changed, 27 insertions(+), 35 deletions(-)
diff --git a/src/expression_cfunc.cpp b/src/expression_cfunc.cpp
index 20b128001..e1875233d 100644
--- a/src/expression_cfunc.cpp
+++ b/src/expression_cfunc.cpp
@@ -1697,21 +1697,14 @@ void multi_cfunc_evaluate_segments(llvm::Type *main_fp_t, std::list<llvm_state>
     cur_state->builder().SetInsertPoint(
         llvm::BasicBlock::Create(cur_state->context(), "entry", make_driver_proto(*cur_state, cur_state_idx)));
 
-    // Variable to keep track of how many blocks have been codegenned
-    // in the current state.
-    boost::safe_numerics::safe<unsigned> n_cg_blocks = 0;
+    // Variable to keep track of how many evaluation functions have
+    // been invoked in the current state.
+    boost::safe_numerics::safe<std::size_t> n_evalf = 0;
 
-    // Limit of codegenned blocks per state.
+    // Limit of function evaluations per state.
     // NOTE: this has not been really properly tuned,
     // needs more investigation.
-    // NOTE: it would probably be better here to keep track of the
-    // total number of function calls per segment, rather than
-    // the number of blocks. The reason for this is that each
-    // function call in principle increases the size of the
-    // auxiliary global arrays used by the compact mode
-    // argument generators, which in turn increases the code
-    // generation time.
-    constexpr auto max_n_cg_blocks = 20u;
+    constexpr auto max_n_evalf = 100u;
 
     // Variable to keep track of the u variable
     // on whose definition we are operating.
@@ -1719,7 +1712,7 @@ void multi_cfunc_evaluate_segments(llvm::Type *main_fp_t, std::list<llvm_state>
 
     // Iterate over the segments in s_dc.
     for (const auto &seg : s_dc) {
-        if (n_cg_blocks > max_n_cg_blocks) {
+        if (n_evalf > max_n_evalf) {
             // We have codegenned enough blocks for this state. Create the return
             // value for the current driver, and move to the next one.
             cur_state->builder().CreateRetVoid();
@@ -1729,7 +1722,7 @@ void multi_cfunc_evaluate_segments(llvm::Type *main_fp_t, std::list<llvm_state>
             cur_state = &states.back();
 
             // Reset/update the counters.
-            n_cg_blocks = 0;
+            n_evalf = 0;
             ++cur_state_idx;
 
             // Add the driver declaration to the main state, and invoke it.
@@ -1898,6 +1891,9 @@ void multi_cfunc_evaluate_segments(llvm::Type *main_fp_t, std::list<llvm_state>
             assert(std::ranges::all_of(gens, [](const auto &f) { return static_cast<bool>(f); }));
             // LCOV_EXCL_STOP
 
+            // Update the number of invoked evaluation functions.
+            n_evalf += ncalls;
+
             // We will be manually unrolling loops if ncalls is small enough.
             // This seems to help with compilation times.
             constexpr auto max_unroll_n = 5u;
@@ -1942,9 +1938,6 @@ void multi_cfunc_evaluate_segments(llvm::Type *main_fp_t, std::list<llvm_state>
             }
         }
 
-        // Update the number of codegenned blocks.
-        n_cg_blocks += seg_map.size();
-
         // LCOV_EXCL_START
         // Update segment_bd if needed.
         if (is_tracing) {
diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp
index a4ff3e61d..b5d4354d0 100644
--- a/src/taylor_02.cpp
+++ b/src/taylor_02.cpp
@@ -17,6 +17,7 @@
 #include <limits>
 #include <list>
 #include <map>
+#include <numeric>
 #include <ranges>
 #include <stdexcept>
 #include <type_traits>
@@ -1042,28 +1043,25 @@ std::vector<llvm_state> taylor_compute_jet_multi(llvm_state &main_state, llvm::T
     cur_state->builder().SetInsertPoint(llvm::BasicBlock::Create(
         cur_state->context(), "entry", taylor_cm_make_driver_proto(*cur_state, cur_state_idx)));
 
-    // Variable to keep track of how many blocks have been codegenned
-    // in the current state.
-    boost::safe_numerics::safe<unsigned> n_cg_blocks = 0;
+    // Variable to keep track of how many evaluation functions have
+    // been invoked in the current state.
+    boost::safe_numerics::safe<std::size_t> n_evalf = 0;
 
-    // Limit of codegenned blocks per state.
+    // Limit of function evaluations per state.
     // NOTE: this has not been really properly tuned,
-    // needs more investigation.
-    // NOTE: it would probably be better here to keep track of the
-    // total number of function calls per segment, rather than
-    // the number of blocks. The reason for this is that each
-    // function call in principle increases the size of the
-    // auxiliary global arrays used by the compact mode
-    // argument generators, which in turn increases the code
-    // generation time.
-    constexpr auto max_n_cg_blocks = 20u;
+    // needs more investigation. In any case, this should
+    // be smaller than the corresponding limit in cfunc
+    // because here we typically do more work per function
+    // evaluation (as each function evaluation implements
+    // an AD formula).
+    constexpr auto max_n_evalf = 20u;
 
     // Variable to keep track of the index of the first u variable
     // in a segment.
     auto start_u_idx = n_eq;
 
     // Helper to finalise the current driver function and create a new one.
-    auto start_new_driver = [&cur_state, &states, &main_state, &n_cg_blocks, &cur_state_idx, &main_driver_decls]() {
+    auto start_new_driver = [&cur_state, &states, &main_state, &n_evalf, &cur_state_idx, &main_driver_decls]() {
         // Finalise the current driver.
         cur_state->builder().CreateRetVoid();
 
@@ -1072,7 +1070,7 @@ std::vector<llvm_state> taylor_compute_jet_multi(llvm_state &main_state, llvm::T
         cur_state = &states.back();
 
         // Reset/update the counters.
-        n_cg_blocks = 0;
+        n_evalf = 0;
         ++cur_state_idx;
 
         // Add the driver declaration to the main state.
@@ -1100,7 +1098,7 @@ std::vector<llvm_state> taylor_compute_jet_multi(llvm_state &main_state, llvm::T
         // of the sv funcs.
         const auto is_svf_seg = need_svf_lo && max_svf_idx >= start_u_idx && max_svf_idx < (start_u_idx + seg_n_ex);
 
-        if (n_cg_blocks > max_n_cg_blocks || is_svf_seg) {
+        if (n_evalf > max_n_evalf || is_svf_seg) {
             // Either we have codegenned enough blocks for this state, or we are
             // in the max_svf_idx state. Finalise the current driver and start the new one.
             start_new_driver();
@@ -1119,8 +1117,9 @@ std::vector<llvm_state> taylor_compute_jet_multi(llvm_state &main_state, llvm::T
         const auto seg_map = taylor_cm_codegen_segment_diff(seg, start_u_idx, *cur_state, fp_t, batch_size, n_uvars,
                                                             high_accuracy, parallel_mode);
 
-        // Update the number of codegenned blocks.
-        n_cg_blocks += seg_map.size();
+        // Update the number of invoked evaluation functions.
+        n_evalf = std::accumulate(seg_map.begin(), seg_map.end(), n_evalf,
+                                  [](auto a, const auto &p) { return a + p.second.first; });
 
         // Update start_u_idx.
         start_u_idx += seg_n_ex;

From aff820fb8eb6f61ee3509efeb8329fc6d8f29299 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Mon, 2 Sep 2024 15:35:32 +0200
Subject: [PATCH 02/10] clang on windows attempt.

---
 .github/workflows/gha_ci.yml | 20 ++++++++++++++++++++
 CMakeLists.txt               |  2 +-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/gha_ci.yml b/.github/workflows/gha_ci.yml
index ee5e0f980..78063daa3 100644
--- a/.github/workflows/gha_ci.yml
+++ b/.github/workflows/gha_ci.yml
@@ -45,6 +45,26 @@ jobs:
           cmake ../ -G "Visual Studio 17 2022" -A x64 -DHEYOKA_BUILD_TESTS=yes -DHEYOKA_WITH_MPPP=yes -DHEYOKA_BUILD_TUTORIALS=ON -DHEYOKA_ENABLE_IPO=yes -DHEYOKA_WITH_SLEEF=yes
           cmake --build . --config Release -j2
           copy Release\heyoka.dll test\Release\
+  windows_2022_llvm_latest_clang:
+    runs-on: windows-2022
+    steps:
+      - uses: actions/checkout@v4
+      - uses: conda-incubator/setup-miniconda@v3
+        with:
+          auto-update-conda: true
+          python-version: "3.10"
+          channels: conda-forge
+          channel-priority: strict
+      - uses: ilammy/msvc-dev-cmd@v1
+      - name: Build
+        shell: pwsh
+        run: |
+          conda install -y cmake clang ninja llvmdev tbb-devel tbb libboost-devel xtensor xtensor-blas blas blas-devel fmt spdlog sleef zlib libzlib 'mppp=1.*'
+          mkdir build
+          cd build
+          cmake ../ -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DHEYOKA_BUILD_TESTS=yes -DHEYOKA_WITH_MPPP=yes -DHEYOKA_BUILD_TUTORIALS=ON -DHEYOKA_ENABLE_IPO=yes -DHEYOKA_WITH_SLEEF=yes
+          cmake --build . -j4 -- -v
+          copy Release\heyoka.dll test\Release\
   conda_release_static:
     runs-on: ubuntu-latest
     steps:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6cfdd07bf..aed2ec827 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -517,7 +517,7 @@ if(NOT ${Boost_FOUND})
     message(FATAL_ERROR "Could not locate Boost in either CONFIG or MODULE mode.")
 endif()
 message(STATUS "Found Boost version ${Boost_VERSION}.")
-target_link_libraries(heyoka PUBLIC Boost::boost Boost::serialization)
+target_link_libraries(heyoka PUBLIC Boost::boost Boost::serialization Boost::disable_autolinking Boost::dynamic_linking)
 # NOTE: quench warnings from Boost when building the library.
 target_compile_definitions(heyoka PRIVATE BOOST_ALLOW_DEPRECATED_HEADERS)
 

From 95f29e7067858d9bda31af07efc3451b827a611a Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Mon, 2 Sep 2024 15:47:47 +0200
Subject: [PATCH 03/10] Minor.

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index aed2ec827..838730306 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -517,7 +517,7 @@ if(NOT ${Boost_FOUND})
     message(FATAL_ERROR "Could not locate Boost in either CONFIG or MODULE mode.")
 endif()
 message(STATUS "Found Boost version ${Boost_VERSION}.")
-target_link_libraries(heyoka PUBLIC Boost::boost Boost::serialization Boost::disable_autolinking Boost::dynamic_linking)
+target_link_libraries(heyoka PUBLIC Boost::boost Boost::serialization Boost::disable_autolinking)
 # NOTE: quench warnings from Boost when building the library.
 target_compile_definitions(heyoka PRIVATE BOOST_ALLOW_DEPRECATED_HEADERS)
 

From a240214315764cb9053a8e93118a0592f1e8e20e Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Mon, 2 Sep 2024 15:48:58 +0200
Subject: [PATCH 04/10] Re-enable tests.

---
 .github/workflows/gha_ci.yml | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/.github/workflows/gha_ci.yml b/.github/workflows/gha_ci.yml
index 78063daa3..3ea2d68ca 100644
--- a/.github/workflows/gha_ci.yml
+++ b/.github/workflows/gha_ci.yml
@@ -45,26 +45,7 @@ jobs:
           cmake ../ -G "Visual Studio 17 2022" -A x64 -DHEYOKA_BUILD_TESTS=yes -DHEYOKA_WITH_MPPP=yes -DHEYOKA_BUILD_TUTORIALS=ON -DHEYOKA_ENABLE_IPO=yes -DHEYOKA_WITH_SLEEF=yes
           cmake --build . --config Release -j2
           copy Release\heyoka.dll test\Release\
-  windows_2022_llvm_latest_clang:
-    runs-on: windows-2022
-    steps:
-      - uses: actions/checkout@v4
-      - uses: conda-incubator/setup-miniconda@v3
-        with:
-          auto-update-conda: true
-          python-version: "3.10"
-          channels: conda-forge
-          channel-priority: strict
-      - uses: ilammy/msvc-dev-cmd@v1
-      - name: Build
-        shell: pwsh
-        run: |
-          conda install -y cmake clang ninja llvmdev tbb-devel tbb libboost-devel xtensor xtensor-blas blas blas-devel fmt spdlog sleef zlib libzlib 'mppp=1.*'
-          mkdir build
-          cd build
-          cmake ../ -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DHEYOKA_BUILD_TESTS=yes -DHEYOKA_WITH_MPPP=yes -DHEYOKA_BUILD_TUTORIALS=ON -DHEYOKA_ENABLE_IPO=yes -DHEYOKA_WITH_SLEEF=yes
-          cmake --build . -j4 -- -v
-          copy Release\heyoka.dll test\Release\
+          ctest -j4 -V -C Release
   conda_release_static:
     runs-on: ubuntu-latest
     steps:

From cfdce125792b7c8ed13d4cc4af46dabfca4b2f29 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Mon, 2 Sep 2024 21:50:42 +0200
Subject: [PATCH 05/10] Small tweak.

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 838730306..6cfdd07bf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -517,7 +517,7 @@ if(NOT ${Boost_FOUND})
     message(FATAL_ERROR "Could not locate Boost in either CONFIG or MODULE mode.")
 endif()
 message(STATUS "Found Boost version ${Boost_VERSION}.")
-target_link_libraries(heyoka PUBLIC Boost::boost Boost::serialization Boost::disable_autolinking)
+target_link_libraries(heyoka PUBLIC Boost::boost Boost::serialization)
 # NOTE: quench warnings from Boost when building the library.
 target_compile_definitions(heyoka PRIVATE BOOST_ALLOW_DEPRECATED_HEADERS)
 

From db35939ad123b7fe0f3419bf39607a5c8ee329ff Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Mon, 2 Sep 2024 21:53:22 +0200
Subject: [PATCH 06/10] Tentative Windows fix.

---
 src/llvm_state.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index da7908e14..52f71cc56 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -309,17 +309,17 @@ llvm::orc::JITTargetMachineBuilder create_jit_tmb(unsigned opt_level, code_model
 
     // LCOV_EXCL_START
 
-#if LLVM_VERSION_MAJOR >= 17
-
     // NOTE: the code model setup is working only on LLVM>=19 (or at least
     // LLVM 18 + patches, as in the conda-forge LLVM package), due to this bug:
     //
     // https://github.com/llvm/llvm-project/issues/88115
     //
     // Additionally, there are indications from our CI that attempting to set
-    // the code model before LLVM 17 might just be buggy, as we see widespread
+    // the code model before LLVM 17 or on Windows might just be buggy, as we see widespread
     // ASAN failures all over the place. Thus, let us not do anything with the code
-    // model setting before LLVM 17.
+    // model setting before LLVM 17 or on Windows.
+
+#if LLVM_VERSION_MAJOR >= 17 && !defined(_WIN32)
 
     // Setup the code model.
     switch (c_model) {

From b615886de072c8e451424058335a5bbc13e1ee8d Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Tue, 3 Sep 2024 10:21:09 +0200
Subject: [PATCH 07/10] Avoid using bmp in a test.

---
 test/llvm_helpers.cpp | 41 +++++++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/test/llvm_helpers.cpp b/test/llvm_helpers.cpp
index ec13bd327..b96833ac9 100644
--- a/test/llvm_helpers.cpp
+++ b/test/llvm_helpers.cpp
@@ -20,7 +20,6 @@
 #include <vector>
 
 #include <boost/math/constants/constants.hpp>
-#include <boost/multiprecision/cpp_bin_float.hpp>
 
 #include <fmt/format.h>
 
@@ -1671,16 +1670,18 @@ TEST_CASE("eft_product scalar")
 
                 REQUIRE(x == a * b);
 
+#if defined(HEYOKA_HAVE_REAL)
 #if defined(HEYOKA_HAVE_REAL128)
                 if constexpr (!std::is_same_v<fp_t, mppp::real128>) {
 #endif
-                    namespace bmp = boost::multiprecision;
-                    using mp_fp_t
-                        = bmp::number<bmp::cpp_bin_float<std::numeric_limits<fp_t>::digits * 2, bmp::digit_base_2>>;
 
-                    REQUIRE(mp_fp_t(x) + mp_fp_t(y) == mp_fp_t(a) * mp_fp_t(b));
+                    using mp_fp_t = mppp::real;
+                    const auto prec = std::numeric_limits<fp_t>::digits * 2;
+
+                    REQUIRE(mp_fp_t(x, prec) + mp_fp_t(y, prec) == mp_fp_t(a, prec) * mp_fp_t(b, prec));
 #if defined(HEYOKA_HAVE_REAL128)
                 }
+#endif
 #endif
             }
         }
@@ -1759,16 +1760,17 @@ TEST_CASE("eft_product batch")
 
                         REQUIRE(xv == a * b);
 
+#if defined(HEYOKA_HAVE_REAL)
 #if defined(HEYOKA_HAVE_REAL128)
                         if constexpr (!std::is_same_v<fp_t, mppp::real128>) {
 #endif
-                            namespace bmp = boost::multiprecision;
-                            using mp_fp_t = bmp::number<
-                                bmp::cpp_bin_float<std::numeric_limits<fp_t>::digits * 2, bmp::digit_base_2>>;
+                            using mp_fp_t = mppp::real;
+                            const auto prec = std::numeric_limits<fp_t>::digits * 2;
 
-                            REQUIRE(mp_fp_t(xv) + mp_fp_t(yv) == mp_fp_t(a) * mp_fp_t(b));
+                            REQUIRE(mp_fp_t(xv, prec) + mp_fp_t(yv, prec) == mp_fp_t(a, prec) * mp_fp_t(b, prec));
 #if defined(HEYOKA_HAVE_REAL128)
                         }
+#endif
 #endif
                     }
                 }
@@ -2526,12 +2528,12 @@ TEST_CASE("dl modulus scalar")
             auto f_ptr
                 = reinterpret_cast<void (*)(fp_t *, fp_t *, fp_t, fp_t, fp_t, fp_t)>(s.jit_lookup("hey_dl_modulus"));
 
+#if defined(HEYOKA_HAVE_REAL)
 #if defined(HEYOKA_HAVE_REAL128)
             if constexpr (!std::is_same_v<fp_t, mppp::real128>) {
 #endif
-                namespace bmp = boost::multiprecision;
-                using mp_fp_t
-                    = bmp::number<bmp::cpp_bin_float<std::numeric_limits<fp_t>::digits * 2, bmp::digit_base_2>>;
+                using mp_fp_t = mppp::real;
+                const auto prec = std::numeric_limits<fp_t>::digits * 2;
 
                 std::uniform_real_distribution<fp_t> op_dist(fp_t(-1e6), fp_t(1e6)), quo_dist(fp_t(.1), fp_t(10.));
 
@@ -2542,13 +2544,14 @@ TEST_CASE("dl modulus scalar")
 
                     f_ptr(&res_hi, &res_lo, x, 0, y, 0);
 
-                    auto res_mp = mp_fp_t(x) - mp_fp_t(y) * floor(mp_fp_t(x) / mp_fp_t(y));
+                    auto res_mp = mp_fp_t(x, prec) - mp_fp_t(y, prec) * floor(mp_fp_t(x, prec) / mp_fp_t(y, prec));
 
                     REQUIRE(res_hi == approximately(static_cast<fp_t>(res_mp), fp_t(10)));
                 }
 
 #if defined(HEYOKA_HAVE_REAL128)
             }
+#endif
 #endif
         }
     };
@@ -2608,12 +2611,12 @@ TEST_CASE("dl modulus batch")
                 auto f_ptr = reinterpret_cast<void (*)(fp_t *, fp_t *, fp_t *, fp_t *, fp_t *, fp_t *)>(
                     s.jit_lookup("hey_dl_modulus"));
 
+#if defined(HEYOKA_HAVE_REAL)
 #if defined(HEYOKA_HAVE_REAL128)
                 if constexpr (!std::is_same_v<fp_t, mppp::real128>) {
 #endif
-                    namespace bmp = boost::multiprecision;
-                    using mp_fp_t
-                        = bmp::number<bmp::cpp_bin_float<std::numeric_limits<fp_t>::digits * 2, bmp::digit_base_2>>;
+                    using mp_fp_t = mppp::real;
+                    const auto prec = std::numeric_limits<fp_t>::digits * 2;
 
                     std::uniform_real_distribution<fp_t> op_dist(fp_t(-1e6), fp_t(1e6)), quo_dist(fp_t(.1), fp_t(10.));
 
@@ -2634,8 +2637,9 @@ TEST_CASE("dl modulus batch")
                               b_lo_vec.data());
 
                         for (auto i = 0u; i < batch_size; ++i) {
-                            auto res_mp = mp_fp_t(a_hi_vec[i])
-                                          - mp_fp_t(b_hi_vec[i]) * floor(mp_fp_t(a_hi_vec[i]) / mp_fp_t(b_hi_vec[i]));
+                            auto res_mp = mp_fp_t(a_hi_vec[i], prec)
+                                          - mp_fp_t(b_hi_vec[i], prec)
+                                                * floor(mp_fp_t(a_hi_vec[i], prec) / mp_fp_t(b_hi_vec[i], prec));
 
                             REQUIRE(x_vec[i] == approximately(static_cast<fp_t>(res_mp), fp_t(10)));
                         }
@@ -2643,6 +2647,7 @@ TEST_CASE("dl modulus batch")
 
 #if defined(HEYOKA_HAVE_REAL128)
                 }
+#endif
 #endif
             }
         }

From 5250e91625c1883bd851a11531352b07a7666a27 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Tue, 3 Sep 2024 10:21:21 +0200
Subject: [PATCH 08/10] [skip ci]


From 30e8f0c69941de7c7161e2ccf9ce0211fc222a5b Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Tue, 3 Sep 2024 20:33:15 +0200
Subject: [PATCH 09/10] Set parallel compilation to disabled by default, and
 just forbid it in Windows.

---
 include/heyoka/llvm_state.hpp | 23 ++++++++++++-----------
 src/llvm_state.cpp            |  6 ++++++
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/include/heyoka/llvm_state.hpp b/include/heyoka/llvm_state.hpp
index 4ee929cbf..a6fb77d52 100644
--- a/include/heyoka/llvm_state.hpp
+++ b/include/heyoka/llvm_state.hpp
@@ -348,17 +348,18 @@ std::optional<llvm_mc_value> llvm_state_mem_cache_lookup(const std::vector<std::
 void llvm_state_mem_cache_try_insert(std::vector<std::string>, unsigned, llvm_mc_value);
 
 // The default setting for the parjit flag for llvm_multi_state.
-// There is evidence of an LLVM thread scheduling bug when parallel compilation
-// is active, that rarely results in multiply-defined symbols for external C
-// functions, which leads to compilation failure. So far, we have been able to
-// trigger this issue only on 64-bit arm.
-inline constexpr bool default_parjit =
-#if defined(HEYOKA_ARCH_ARM)
-    false
-#else
-    true
-#endif
-    ;
+//
+// At this time, it seems like parallel compilation in lljit is buggy:
+//
+// - on Unix platforms, parallel compilation occasionally results in
+//   multiply-defined symbols for external C functions, which leads to
+//   compilation failures;
+// - on Windows, it seems like parallel compilation outright results in
+//   segmentation faults under heavy load.
+//
+// The root of the problem seems to be a concurrency issue. Thus, for the time
+// being, let us just disable parallel compilation by default.
+inline constexpr bool default_parjit = false;
 
 } // namespace detail
 
diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 52f71cc56..af5fc6d6d 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -1667,11 +1667,17 @@ multi_jit::multi_jit(unsigned n_modules, unsigned opt_level, code_model c_model,
 
 #else
 
+    // NOTE: never enable parallel compilation on Windows due to
+    // segfaults under heavy load.
+#if !defined(_WIN32)
+
     if (m_parjit) {
         // Set the number of compilation threads.
         lljit_builder.setNumCompileThreads(std::thread::hardware_concurrency());
     }
 
+#endif
+
 #endif
 
     // Create the jit.

From 5d645c6de0756c10de0cc2daadd8fed7a2f4291e Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Tue, 3 Sep 2024 22:55:10 +0200
Subject: [PATCH 10/10] Update the known issues page.

---
 doc/known_issues.rst | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/doc/known_issues.rst b/doc/known_issues.rst
index 495739430..7972f0689 100644
--- a/doc/known_issues.rst
+++ b/doc/known_issues.rst
@@ -18,14 +18,18 @@ Unsolved
 
   The root cause is most likely a code-generation/optimisation problem in LLVM.
   This issue is currently under investigation.
-* The parallel compilation feature (added in heyoka 6.0.0) is currently disabled
-  by default on 64-bit ARM processors (this includes the Apple M1 and its successors).
+* The parallel compilation feature (added in heyoka 6.0.0) is currently turned
+  off by default on all platforms and completely disabled on Windows.
   The reason is a likely thread scheduling bug in LLVM's parallel compilation facilities
-  that very rarely results in a multiply-defined symbol, which ultimately leads to compilation
-  failure. The issue is currently under investigation by the LLVM developers. In the
-  meantime, you can explicitly turn on parallel compilation via the ``kw::parjit``
+  which, on Unix systems, rarely results in a multiply-defined symbol, ultimately leading to a compilation
+  failure. On Windows, parallel compilation under heavy loads results in segmentation faults.
+  The issue is currently under investigation by the LLVM developers. In the
+  meantime, you can explicitly turn on parallel compilation on Unix systems via the ``kw::parjit``
   :ref:`keyword argument <kwargs>` when constructing an integrator or a compiled
   function.
+* The option for selecting the code model used for JIT compilation
+  (added in heyoka 6.0.0) is currently disabled on Windows due to what
+  looks like an LLVM bug. The issue is currently under investigation.
 
 Solved
 ======