Merge pull request #4016 from pleroy/CoeffRounding

Round the polynomial coefficients using Mathematica and take advantage of powers of two
mockingbirdnest · Jun 3, 2024 · 47ea231 · 47ea231
2 parents 9fb3103 + 31a00be
commit 47ea231
Showing 4 changed files with 137 additions and 43 deletions.
diff --git a/benchmarks/elementary_functions_experiments_benchmark.cpp b/benchmarks/elementary_functions_experiments_benchmark.cpp
@@ -152,7 +152,9 @@ class SingleTableImplementation {
     Value cos_x;
   };
 
-  static constexpr Value cos_polynomial_0 = -0.499999999999999999999872434553;
+  // If this was ever changed to a value that is not a power of 2, extra care
+  // would be needed in the computations that use it.
+  static constexpr Value cos_polynomial_0 = -0.5;
 
   Value SinPolynomial(Argument x);
   Value CosPolynomial1(Argument x);
@@ -214,27 +216,22 @@ Value TableSpacingImplementation<table_spacing>::SinPolynomial(
     Argument const x) {
   if constexpr (table_spacing == 2.0 / 256.0) {
     // 71 bits.
-    return -0.166666666666666666666421797625 +
-           0.00833333057503280528178543245797 * x;
+    return -0x1.5555555555555p-3 + 0x1.11110B24ACC74p-7 * x;
   } else if constexpr (table_spacing == 2.0 / 1024.0) {
-    // 85 bits.
-    return -0.166666666666666666666666651721 +
-           0.00833333316093951937646271666739 * x;
+    // 84 bits.
+    return -0x1.5555555555555p-3 + 0x1.111110B24ACB5p-7 * x;
   }
 }
 
 template<Argument table_spacing>
 Value TableSpacingImplementation<table_spacing>::CosPolynomial(
     Argument const x) {
   if constexpr (table_spacing == 2.0 / 256.0) {
-    // 77 bits.
-    return -0.499999999999999999999999974543 +
-           x * (0.0416666666666633318024480868405 -
-                0.00138888829905860875255146938745 * x);
+    // 83 bits.
+    return -0.5 + x * (0x1.5555555555555p-5 - 0x1.6C16BB6B46CA6p-10 * x);
   } else if constexpr (table_spacing == 2.0 / 1024.0) {
     // 72 bits.
-    return -0.499999999999999999999872434553 +
-           0.0416666654823785864634569932662 * x;
+    return -0.5 + 0x1.555554B290E6Ap-5 * x;
   }
 }
 
@@ -324,9 +321,8 @@ void MultiTableImplementation::SelectCutoff(Argument const x,
 }
 
 Value MultiTableImplementation::SinPolynomial(Argument const x) {
-  // 85 bits.  Works for all binades.
-  return -0.166666666666666666666666651721 +
-         0.00833333316093951937646271666739 * x;
+  // 84 bits.  Works for all binades.
+  return -0x1.5555555555555p-3 + 0x1.111110B24ACB5p-7 * x;
 }
 
 Value MultiTableImplementation::CosPolynomial(std::int64_t const i,
@@ -335,17 +331,14 @@ Value MultiTableImplementation::CosPolynomial(std::int64_t const i,
   // we have to use 3 polynomials.
   // i == 1 goes first because it is the largest argument interval.
   if (i == 1) {
-    // 78 bits.
-    return -0.499999999999999999999998006790 +
-           0.0416666663705946726372008045758 * x;
+    // 76 bits.
+    return -0.5 + 0x1.5555552CA439Ep-5 * x;
   } else if (i == 0) {
     // 72 bits.
-    return -0.499999999999999999999872434553 +
-           0.0416666654823785864634569932662 * x;
+    return -0.5 + 0x1.555554B290E6Ap-5 * x;
   } else {
-    // 84 bits.
-    return -0.499999999999999999999999968856 +
-           0.0416666665926486697856340784849 * x;
+    // 78 bits.
+    return -0.5 + 0x1.5555554B290E8p-5 * x;
   }
 }
 
@@ -379,8 +372,7 @@ Value SingleTableImplementation::Sin(Argument const x) {
     // TODO(phl): Error analysis of this computation.
     auto const h² = TwoProduct(h, h);
     auto const h³ = h².value * h;
-    auto const h²_sin_x₀_cos_polynomial_0 =
-        h² * TwoProduct(sin_x₀, cos_polynomial_0);
+    auto const h²_sin_x₀_cos_polynomial_0 = h² * (sin_x₀ * cos_polynomial_0);
     auto const terms_up_to_h² = QuickTwoSum(sin_x₀_plus_h_cos_x₀.value,
                                             h²_sin_x₀_cos_polynomial_0.value);
     return terms_up_to_h².value +
@@ -409,8 +401,7 @@ Value SingleTableImplementation::Cos(Argument const x) {
     // TODO(phl): Error analysis of this computation.
     auto const h² = TwoProduct(h, h);
     auto const h³ = h².value * h;
-    auto const h²_cos_x₀_cos_polynomial_0 =
-        h² * TwoProduct(cos_x₀, cos_polynomial_0);
+    auto const h²_cos_x₀_cos_polynomial_0 = h² * (cos_x₀ * cos_polynomial_0);
     auto const terms_up_to_h² = QuickTwoSum(cos_x₀_minus_h_sin_x₀.value,
                                             h²_cos_x₀_cos_polynomial_0.value);
     return terms_up_to_h².value +
@@ -420,24 +411,19 @@ Value SingleTableImplementation::Cos(Argument const x) {
   }
 }
 
-Value SingleTableImplementation::SinPolynomial(
-    Argument const x) {
-  // 85 bits.
-  return -0.166666666666666666666666651721 +
-         0.00833333316093951937646271666739 * x;
+Value SingleTableImplementation::SinPolynomial(Argument const x) {
+  // 84 bits.  Works for all binades.
+  return -0x1.5555555555555p-3 + 0x1.111110B24ACB5p-7 * x;
 }
 
-Value SingleTableImplementation::CosPolynomial1(
-    Argument const x) {
+Value SingleTableImplementation::CosPolynomial1(Argument const x) {
   // 72 bits.
-  return cos_polynomial_0 + 0.0416666654823785864634569932662 * x;
+  return cos_polynomial_0 + 0x1.555554B290E6Ap-5 * x;
 }
 
-Value SingleTableImplementation::CosPolynomial2(
-    Argument const x) {
-  // 101 bits.
-  return x * (0.04166666666666665363986848039146102332933 -
-              0.001388888852024502693312293343727757316234 * x);
+Value SingleTableImplementation::CosPolynomial2(Argument const x) {
+  // 97 bits.
+  return x * (0x1.5555555555555p-5 - 0x1.6C16C10C09C11p-10 * x);
 }
 
 template<Argument table_spacing>
@@ -461,7 +447,7 @@ void BM_ExperimentSinTableSpacing(benchmark::State& state) {
       // The implementation is not accurate, but let's check that it's not
       // broken.
       auto const absolute_error = Abs(v[i] - std::sin(a[i]));
-      CHECK_LT(absolute_error, 5.6e-17);
+      CHECK_LT(absolute_error, 1.2e-16);
 #endif
     }
     benchmark::DoNotOptimize(v);

diff --git a/mathematica/floating_point.wl b/mathematica/floating_point.wl
@@ -36,9 +36,11 @@ Bits;
 
 
 HexLiteral;
+Attributes[HexLiteral]={Listable}
 
 
 CorrectlyRound;
+Attributes[CorrectlyRound]={Listable};
 
 
 Begin["`Private`"]
@@ -101,7 +103,7 @@ leastFullHexDigitValue:=2^(significandBits-1)/16^fullHexDigits
 leastHexDigitValue:=If[leastFullHexDigitValue>1,leastFullHexDigitValue/16,leastFullHexDigitValue]
 
 
-HexLiteral[n_]:=If[n<0,"-",""]<>
+HexLiteral[n_]:=If[n==0,"0.0",If[n<0,"-",""]<>
 "0x1."<>ToUpperCase[
 IntegerString[
 Mod[IntegerPart[Representation[Abs[n]]/leastFullHexDigitValue],16^fullHexDigits],16,fullHexDigits]<>
@@ -120,7 +122,7 @@ Switch[
 binary32,"f",
 binary64,"",
 x87extended,"l",
-_,"_"<>ToString[significandBits]<>"_sigbits"]
+_,"_"<>ToString[significandBits]<>"_sigbits"]]
 
 
 smol=-12;

diff --git a/numerics/double_precision.hpp b/numerics/double_precision.hpp
@@ -126,18 +126,50 @@ DoublePrecision<Difference<T>> operator+(DoublePrecision<T> const& left);
 template<typename T>
 DoublePrecision<Difference<T>> operator-(DoublePrecision<T> const& left);
 
+template<typename T, typename U>
+DoublePrecision<Sum<T, U>> operator+(T const& left,
+                                     DoublePrecision<U> const& right);
+
+template<typename T, typename U>
+DoublePrecision<Sum<T, U>> operator+(DoublePrecision<T> const& left,
+                                     U const& right);
+
 template<typename T, typename U>
 DoublePrecision<Sum<T, U>> operator+(DoublePrecision<T> const& left,
                                      DoublePrecision<U> const& right);
 
+template<typename T, typename U>
+DoublePrecision<Difference<T, U>> operator-(T const& left,
+                                            DoublePrecision<U> const& right);
+
+template<typename T, typename U>
+DoublePrecision<Difference<T, U>> operator-(DoublePrecision<T> const& left,
+                                            U const& right);
+
 template<typename T, typename U>
 DoublePrecision<Difference<T, U>> operator-(DoublePrecision<T> const& left,
                                             DoublePrecision<U> const& right);
 
+template<typename T, typename U>
+DoublePrecision<Product<T, U>> operator*(T const& left,
+                                         DoublePrecision<U> const& right);
+
+template<typename T, typename U>
+DoublePrecision<Product<T, U>> operator*(DoublePrecision<T> const& left,
+                                         U const& right);
+
 template<typename T, typename U>
 DoublePrecision<Product<T, U>> operator*(DoublePrecision<T> const& left,
                                          DoublePrecision<U> const& right);
 
+template<typename T, typename U>
+DoublePrecision<Quotient<T, U>> operator/(T const& left,
+                                          DoublePrecision<U> const& right);
+
+template<typename T, typename U>
+DoublePrecision<Quotient<T, U>> operator/(DoublePrecision<T> const& left,
+                                          U const& right);
+
 template<typename T, typename U>
 DoublePrecision<Quotient<T, U>> operator/(DoublePrecision<T> const& left,
                                           DoublePrecision<U> const& right);

diff --git a/numerics/double_precision_body.hpp b/numerics/double_precision_body.hpp
@@ -396,6 +396,22 @@ DoublePrecision<Difference<T>> operator-(DoublePrecision<T> const& left) {
   return result;
 }
 
+template<typename T, typename U>
+DoublePrecision<Sum<T, U>> operator+(T const& left,
+                                     DoublePrecision<U> const& right) {
+  // [Lin81], algorithm longadd.
+  auto const sum = TwoSum(left, right.value);
+  return QuickTwoSum(sum.value, sum.error + right.error);
+}
+
+template<typename T, typename U>
+DoublePrecision<Sum<T, U>> operator+(DoublePrecision<T> const& left,
+                                     U const& right) {
+  // [Lin81], algorithm longadd.
+  auto const sum = TwoSum(left.value, right);
+  return QuickTwoSum(sum.value, sum.error + left.error);
+}
+
 template<typename T, typename U>
 DoublePrecision<Sum<T, U>> operator+(DoublePrecision<T> const& left,
                                      DoublePrecision<U> const& right) {
@@ -404,6 +420,22 @@ DoublePrecision<Sum<T, U>> operator+(DoublePrecision<T> const& left,
   return QuickTwoSum(sum.value, (sum.error + left.error) + right.error);
 }
 
+template<typename T, typename U>
+DoublePrecision<Difference<T, U>> operator-(T const& left,
+                                            DoublePrecision<U> const& right) {
+  // [Lin81], algorithm longadd.
+  auto const sum = TwoDifference(left, right.value);
+  return QuickTwoSum(sum.value, sum.error - right.error);
+}
+
+template<typename T, typename U>
+DoublePrecision<Difference<T, U>> operator-(DoublePrecision<T> const& left,
+                                            U const& right) {
+  // [Lin81], algorithm longadd.
+  auto const sum = TwoDifference(left.value, right);
+  return QuickTwoSum(sum.value, sum.error + left.error);
+}
+
 template<typename T, typename U>
 DoublePrecision<Difference<T, U>> operator-(DoublePrecision<T> const& left,
                                             DoublePrecision<U> const& right) {
@@ -412,6 +444,26 @@ DoublePrecision<Difference<T, U>> operator-(DoublePrecision<T> const& left,
   return QuickTwoSum(sum.value, (sum.error + left.error) - right.error);
 }
 
+template<typename T, typename U>
+FORCE_INLINE(inline)
+DoublePrecision<Product<T, U>> operator*(T const& left,
+                                         DoublePrecision<U> const& right) {
+  // [Lin81], algorithm longmul.
+  auto product = TwoProduct(left, right.value);
+  product.error += left.value * right.error;
+  return QuickTwoSum(product.value, product.error);
+}
+
+template<typename T, typename U>
+FORCE_INLINE(inline)
+DoublePrecision<Product<T, U>> operator*(DoublePrecision<T> const& left,
+                                         U const& right) {
+  // [Lin81], algorithm longmul.
+  auto product = TwoProduct(left.value, right);
+  product.error += +left.error * right;
+  return QuickTwoSum(product.value, product.error);
+}
+
 template<typename T, typename U>
 FORCE_INLINE(inline)
 DoublePrecision<Product<T, U>> operator*(DoublePrecision<T> const& left,
@@ -423,6 +475,28 @@ DoublePrecision<Product<T, U>> operator*(DoublePrecision<T> const& left,
   return QuickTwoSum(product.value, product.error);
 }
 
+template<typename T, typename U>
+DoublePrecision<Quotient<T, U>> operator/(T const& left,
+                                          DoublePrecision<U> const& right) {
+  // [Lin81], algorithm longdiv.
+  auto const z = left / right.value;
+  auto const product = TwoProduct(right.value, z);
+  auto const zz = (((left - product.value) - product.error) - z * right.error) /
+                  (right.value + right.error);
+  return QuickTwoSum(z, zz);
+}
+
+template<typename T, typename U>
+DoublePrecision<Quotient<T, U>> operator/(DoublePrecision<T> const& left,
+                                          U const& right) {
+  // [Lin81], algorithm longdiv.
+  auto const z = left.value / right;
+  auto const product = TwoProduct(right, z);
+  auto const zz =
+      (((left.value - product.value) - product.error) + left.error) / right;
+  return QuickTwoSum(z, zz);
+}
+
 template<typename T, typename U>
 DoublePrecision<Quotient<T, U>> operator/(DoublePrecision<T> const& left,
                                           DoublePrecision<U> const& right) {