From 8a540732cfe4c1a8ccf8411b3c5c6b5cba4ec070 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 1 Jan 2025 00:17:55 +0100 Subject: [PATCH 1/5] No inlining, fix bugs in the hypotheses, improve the macros --- numerics/numerics.vcxproj | 15 ++++++ numerics/sin_cos.cpp | 109 +++++++++++++++++++++----------------- numerics/sin_cos.hpp | 17 ------ 3 files changed, 74 insertions(+), 67 deletions(-) diff --git a/numerics/numerics.vcxproj b/numerics/numerics.vcxproj index 6e3b3e9b8b..0de586e31a 100644 --- a/numerics/numerics.vcxproj +++ b/numerics/numerics.vcxproj @@ -10,6 +10,21 @@ + + + AssemblyCode + + + + + AssemblyCode + + + + + AssemblyCode + + diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index b79dda44df..d45469d9c1 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -14,6 +14,36 @@ #include "numerics/polynomial_evaluators.hpp" #include "quantities/elementary_functions.hpp" +#define OSACA_ANALYSED_FUNCTION Cos +#define UNDER_OSACA_HYPOTHESES(expression) \ + [] { \ + constexpr bool UseHardwareFMA = true; \ + constexpr double θ = 3; \ + /* From argument reduction. */ \ + constexpr std::int64_t n = \ + static_cast(θ * (2 / π) + 0.5); \ + constexpr double reduction_value = θ - n * π_over_2_high; \ + constexpr double reduction_error = n * π_over_2_low; \ + /* Used to determine whether a better argument reduction is needed. */ \ + constexpr DoublePrecision θ_reduced = \ + QuickTwoDifference(reduction_value, reduction_error); \ + /* Used in Sin to detect the near-0 case. */ \ + constexpr double abs_x = \ + θ_reduced.value > 0 ? θ_reduced.value : -θ_reduced.value; \ + /* Used throughout the top-level functions. */ \ + constexpr std::int64_t quadrant = n & 0b11; \ + /* Used in DetectDangerousRounding. */ \ + constexpr double normalized_error = 0; \ + /* Not NaN is the only part that matters; used at the end of the */ \ + /* top-level functions to determine whether to call the slow path. */ \ + constexpr double value = 1; \ + return expression; \ + }() + +#if defined(OSACA_ANALYSED_FUNCTION) +#define PRINCIPIA_USE_OSACA !PRINCIPIA_MACRO_IS_EMPTY(OSACA_ANALYSED_FUNCTION) +#endif + #if PRINCIPIA_USE_OSACA #include "intel/iacaMarks.h" @@ -97,12 +127,14 @@ // them, so they cannot be the end of a loop started unconditionally. Instead // we loop with goto. // — Some volatile reads and writes are used to clarify identity of the -// registers in the generated code (where the names of `OSACA_input` and -// 'OSACA_result' appear in movsd instructions) and to improve the structure -// of the generated graph. +// registers in the generated code (where the names of `OSACA_result` and, if +// `OSACA_CARRY_LOOP_THROUGH_REGISTER`, `OSACA_loop_carry` appear in movsd +// instructions) and to improve the structure of the generated graph. // -// Putting a load of the input from memory in the analysed section makes the -// OSACA dependency graph clearer. However: +// Putting a load of the input from memory in the analysed section prevents the +// compiler from reusing intermediate values in the next iteration, e.g., if the +// absolute value of the result is computed first, the compiler might reuse it +// instead of computing the absolute value of the input. However: // — it adds a spurious move to the latency; // — some tools (IACA, LLVM-MCA) cannot see the dependency through memory. // Set OSACA_CARRY_LOOP_THROUGH_REGISTER to 1 to carry the loop dependency @@ -114,38 +146,42 @@ static bool OSACA_loop_terminator = false; #define OSACA_FUNCTION_BEGIN(arg) \ - double OSACA_INPUT_QUALIFIER OSACA_input = arg; \ + double OSACA_LOOP_CARRY_QUALIFIER OSACA_loop_carry = arg; \ if constexpr (std::string_view(__func__) == \ STRINGIFY_EXPANSION(OSACA_ANALYSED_FUNCTION)) { \ IACA_VC64_START; \ } \ - double OSACA_loop_carry = OSACA_input; \ _Pragma("warning(push)"); \ _Pragma("warning(disable : 4102)"); \ OSACA_loop: \ _Pragma("warning(pop)"); \ arg = OSACA_loop_carry -#define OSACA_RETURN(result) \ - do { \ - if constexpr (std::string_view(__func__) == \ - STRINGIFY_EXPANSION(OSACA_ANALYSED_FUNCTION)) { \ - OSACA_loop_carry = (result); \ - if (!OSACA_loop_terminator) { \ - goto OSACA_loop; \ - } \ - double volatile OSACA_result = OSACA_loop_carry; \ - IACA_VC64_END; \ - return OSACA_result; \ - } else { \ - return (result); \ - } \ +#define OSACA_RETURN(result) \ + do { \ + if constexpr (std::string_view(__func__) == \ + STRINGIFY_EXPANSION(OSACA_ANALYSED_FUNCTION)) { \ + OSACA_loop_carry = (result); \ + if (!OSACA_loop_terminator) { \ + goto OSACA_loop; \ + } \ + double volatile OSACA_result = OSACA_loop_carry; \ + IACA_VC64_END; \ + /* The second goto prevents the the end marker from being interleaved */ \ + /* with register restoring moves. */ \ + if (!OSACA_loop_terminator) { \ + goto OSACA_loop; \ + } \ + return OSACA_result; \ + } else { \ + return (result); \ + } \ } while (false) #if OSACA_CARRY_LOOP_THROUGH_REGISTER -#define OSACA_INPUT_QUALIFIER +#define OSACA_LOOP_CARRY_QUALIFIER #else -#define OSACA_INPUT_QUALIFIER volatile +#define OSACA_LOOP_CARRY_QUALIFIER volatile #endif // The branch not taken, determined by evaluating the condition @@ -177,33 +213,6 @@ static bool OSACA_loop_terminator = false; #define OSACA_ELSE_IF else OSACA_IF // NOLINT -// Sin- and Cos-specific definitions: - -#define UNDER_OSACA_HYPOTHESES(expression) \ - [&] { \ - constexpr bool UseHardwareFMA = true; \ - constexpr double θ = 0.1; \ - /* From argument reduction. */ \ - constexpr double n_double = θ * (2 / π); \ - constexpr double reduction_value = θ - n_double * π_over_2_high; \ - constexpr double reduction_error = n_double * π_over_2_low; \ - /* Used to determine whether a better argument reduction is needed. */ \ - constexpr DoublePrecision θ_reduced = \ - QuickTwoDifference(reduction_value, reduction_error); \ - /* Used in Sin to detect the near-0 case. */ \ - constexpr double abs_x = \ - θ_reduced.value > 0 ? θ_reduced.value : -θ_reduced.value; \ - /* Used throughout the top-level functions. */ \ - constexpr std::int64_t quadrant = \ - static_cast(n_double) & 0b11; \ - /* Used in DetectDangerousRounding. */ \ - constexpr double normalized_error = 0; \ - /* Not NaN is the only part that matters; used at the end of the */ \ - /* top-level functions to determine whether to call the slow path. */ \ - constexpr double value = 1; \ - return expression; \ - }() - namespace principia { namespace numerics { diff --git a/numerics/sin_cos.hpp b/numerics/sin_cos.hpp index 7d3a7eeb01..b33a11a7c1 100644 --- a/numerics/sin_cos.hpp +++ b/numerics/sin_cos.hpp @@ -7,20 +7,7 @@ namespace numerics { namespace _sin_cos { namespace internal { -#define PRINCIPIA_INLINE_SIN_COS 0 -#define OSACA_ANALYSED_FUNCTION - -#if defined(OSACA_ANALYSED_FUNCTION) -#define PRINCIPIA_USE_OSACA !PRINCIPIA_MACRO_IS_EMPTY(OSACA_ANALYSED_FUNCTION) -#endif - -#if PRINCIPIA_INLINE_SIN_COS -FORCE_INLINE(inline) -#endif double __cdecl Sin(double x); -#if PRINCIPIA_INLINE_SIN_COS -FORCE_INLINE(inline) -#endif double __cdecl Cos(double x); } // namespace internal @@ -31,7 +18,3 @@ using internal::Sin; } // namespace _sin_cos } // namespace numerics } // namespace principia - -#if PRINCIPIA_INLINE_SIN_COS -#include "numerics/sin_cos.cpp" -#endif From d6a22b96eb2fa15fbf6041b67a9cfb9e536b291b Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 1 Jan 2025 04:36:01 +0100 Subject: [PATCH 2/5] More improvements to the macros --- numerics/sin_cos.cpp | 113 +++++++++++++++++++++++-------------------- 1 file changed, 61 insertions(+), 52 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index d45469d9c1..79fdc3665c 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -15,29 +15,28 @@ #include "quantities/elementary_functions.hpp" #define OSACA_ANALYSED_FUNCTION Cos -#define UNDER_OSACA_HYPOTHESES(expression) \ - [] { \ - constexpr bool UseHardwareFMA = true; \ - constexpr double θ = 3; \ - /* From argument reduction. */ \ - constexpr std::int64_t n = \ - static_cast(θ * (2 / π) + 0.5); \ - constexpr double reduction_value = θ - n * π_over_2_high; \ - constexpr double reduction_error = n * π_over_2_low; \ - /* Used to determine whether a better argument reduction is needed. */ \ - constexpr DoublePrecision θ_reduced = \ - QuickTwoDifference(reduction_value, reduction_error); \ - /* Used in Sin to detect the near-0 case. */ \ - constexpr double abs_x = \ - θ_reduced.value > 0 ? θ_reduced.value : -θ_reduced.value; \ - /* Used throughout the top-level functions. */ \ - constexpr std::int64_t quadrant = n & 0b11; \ - /* Used in DetectDangerousRounding. */ \ - constexpr double normalized_error = 0; \ - /* Not NaN is the only part that matters; used at the end of the */ \ - /* top-level functions to determine whether to call the slow path. */ \ - constexpr double value = 1; \ - return expression; \ +#define UNDER_OSACA_HYPOTHESES(expression) \ + [&] { \ + constexpr bool UseHardwareFMA = true; \ + constexpr double θ = 0.1; \ + /* From argument reduction. */ \ + constexpr std::int64_t n = static_cast(θ * (2 / π) + 0.5); \ + constexpr double reduction_value = θ - n * π_over_2_high; \ + constexpr double reduction_error = n * π_over_2_low; \ + /* Used to determine whether a better argument reduction is needed. */ \ + constexpr DoublePrecision θ_reduced = \ + QuickTwoDifference(reduction_value, reduction_error); \ + /* Used in Sin to detect the near-0 case. */ \ + constexpr double abs_x = \ + θ_reduced.value > 0 ? θ_reduced.value : -θ_reduced.value; \ + /* Used throughout the top-level functions. */ \ + constexpr std::int64_t quadrant = n & 0b11; \ + /* Used in DetectDangerousRounding. */ \ + constexpr double normalized_error = 0; \ + /* Not NaN is the only part that matters; used at the end of the */ \ + /* top-level functions to determine whether to call the slow path. */ \ + constexpr double value = 1; \ + return expression; \ }() #if defined(OSACA_ANALYSED_FUNCTION) @@ -126,27 +125,37 @@ // — Return statements may be in if statements, and there may be several of // them, so they cannot be the end of a loop started unconditionally. Instead // we loop with goto. +// — We need to prevent the compiler from moving the start and end markers into +// the middle of register saving and restoring code, which would mess up the +// dependency analysis. This is done with additional conditional gotos. // — Some volatile reads and writes are used to clarify identity of the // registers in the generated code (where the names of `OSACA_result` and, if -// `OSACA_CARRY_LOOP_THROUGH_REGISTER`, `OSACA_loop_carry` appear in movsd -// instructions) and to improve the structure of the generated graph. +// `OSACA_CARRY_LOOP_THROUGH_REGISTER` is set to 0, `OSACA_loop_carry` appear +// in movsd instructions). // -// Putting a load of the input from memory in the analysed section prevents the -// compiler from reusing intermediate values in the next iteration, e.g., if the -// absolute value of the result is computed first, the compiler might reuse it -// instead of computing the absolute value of the input. However: +// Carrying the loop dependency through a memory load and store can make the +// dependency graph easier to understand, as it forces any usage of the input to +// depend on the initial movsd, with the loop carried by a single backward edge +// to that initial movsd. +// If the loop is carried through a register, multiple usages of the input may +// result in multiple back edges from the final instruction that computed the +// result. Carrying the loop through the memory could also potentially prevent +// the compiler from reusing intermediate values in the next iteration, e.g., if +// the the computation of f(x) depends on -x and produces -f(x) before f(x), as +// in an even function defined in terms of its positive half, the compiler might +// reuse -f(x₀)=-x₁ instead of computing -x₁ from x₁=f(x₀). However: // — it adds a spurious move to the latency; -// — some tools (IACA, LLVM-MCA) cannot see the dependency through memory. -// Set OSACA_CARRY_LOOP_THROUGH_REGISTER to 1 to carry the loop dependency -// through a register instead. +// — some tools (IACA) cannot see the dependency through memory. +// Set OSACA_CARRY_LOOP_THROUGH_REGISTER to 0 to carry the loop through memory. #define OSACA_EVALUATE_CONDITIONS 1 -#define OSACA_CARRY_LOOP_THROUGH_REGISTER 0 +#define OSACA_CARRY_LOOP_THROUGH_REGISTER 1 static bool OSACA_loop_terminator = false; #define OSACA_FUNCTION_BEGIN(arg) \ double OSACA_LOOP_CARRY_QUALIFIER OSACA_loop_carry = arg; \ + OSACA_outer_loop: \ if constexpr (std::string_view(__func__) == \ STRINGIFY_EXPANSION(OSACA_ANALYSED_FUNCTION)) { \ IACA_VC64_START; \ @@ -157,25 +166,25 @@ static bool OSACA_loop_terminator = false; _Pragma("warning(pop)"); \ arg = OSACA_loop_carry -#define OSACA_RETURN(result) \ - do { \ - if constexpr (std::string_view(__func__) == \ - STRINGIFY_EXPANSION(OSACA_ANALYSED_FUNCTION)) { \ - OSACA_loop_carry = (result); \ - if (!OSACA_loop_terminator) { \ - goto OSACA_loop; \ - } \ - double volatile OSACA_result = OSACA_loop_carry; \ - IACA_VC64_END; \ - /* The second goto prevents the the end marker from being interleaved */ \ - /* with register restoring moves. */ \ - if (!OSACA_loop_terminator) { \ - goto OSACA_loop; \ - } \ - return OSACA_result; \ - } else { \ - return (result); \ - } \ +#define OSACA_RETURN(result) \ + do { \ + if constexpr (std::string_view(__func__) == \ + STRINGIFY_EXPANSION(OSACA_ANALYSED_FUNCTION)) { \ + OSACA_loop_carry = (result); \ + if (!OSACA_loop_terminator) { \ + goto OSACA_loop; \ + } \ + double volatile OSACA_result = OSACA_loop_carry; \ + IACA_VC64_END; \ + /* The outer loop prevents the the start and end marker from being */ \ + /* interleaved with register saving and restoring moves. */ \ + if (!OSACA_loop_terminator) { \ + goto OSACA_outer_loop; \ + } \ + return OSACA_result; \ + } else { \ + return (result); \ + } \ } while (false) #if OSACA_CARRY_LOOP_THROUGH_REGISTER From 500a7884b965fe2e49fdf87a0331a7e1281f67ba Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 1 Jan 2025 05:38:11 +0100 Subject: [PATCH 3/5] volatile terminator --- numerics/sin_cos.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index 79fdc3665c..a669d28191 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -151,7 +151,7 @@ #define OSACA_EVALUATE_CONDITIONS 1 #define OSACA_CARRY_LOOP_THROUGH_REGISTER 1 -static bool OSACA_loop_terminator = false; +static bool volatile OSACA_loop_terminator = false; #define OSACA_FUNCTION_BEGIN(arg) \ double OSACA_LOOP_CARRY_QUALIFIER OSACA_loop_carry = arg; \ From d78d05ed847cff97b27f446e64db520ce2259034 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 1 Jan 2025 05:39:02 +0100 Subject: [PATCH 4/5] turn it off --- numerics/sin_cos.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index a669d28191..d9b4beee8f 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -14,11 +14,11 @@ #include "numerics/polynomial_evaluators.hpp" #include "quantities/elementary_functions.hpp" -#define OSACA_ANALYSED_FUNCTION Cos +#define OSACA_ANALYSED_FUNCTION #define UNDER_OSACA_HYPOTHESES(expression) \ [&] { \ constexpr bool UseHardwareFMA = true; \ - constexpr double θ = 0.1; \ + constexpr double θ = 3; \ /* From argument reduction. */ \ constexpr std::int64_t n = static_cast(θ * (2 / π) + 0.5); \ constexpr double reduction_value = θ - n * π_over_2_high; \ From 21882585677787710f99cbcae9d734678aa8f9e8 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 1 Jan 2025 13:30:50 +0100 Subject: [PATCH 5/5] no force inline --- numerics/sin_cos.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index d9b4beee8f..9315b27e9e 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -456,9 +456,6 @@ Value CosImplementation(DoublePrecision const θ_reduced) { return DetectDangerousRounding(cos_x₀_minus_h_sin_x₀.value, polynomial_term); } -#if PRINCIPIA_INLINE_SIN_COS -FORCE_INLINE(inline) -#endif Value __cdecl Sin(Argument θ) { OSACA_FUNCTION_BEGIN(θ); DoublePrecision θ_reduced; @@ -487,9 +484,6 @@ Value __cdecl Sin(Argument θ) { } } -#if PRINCIPIA_INLINE_SIN_COS -FORCE_INLINE(inline) -#endif Value __cdecl Cos(Argument θ) { OSACA_FUNCTION_BEGIN(θ); DoublePrecision θ_reduced;