perf: Try fast pow for EigenStepper step size scaling (acts-project…

…#3153) Currently the step scaling calculation has a big impact on the `EigenStepper` performance. In this PR the `pow(x, 0.25)` is approximated with `fastPow` (similar to fast inverse square root) which relies on the bit representation of the floating point number. The assumption is that we do not really care about the precision of this value but rather that it gives a good approximation for the step size scaling. Edit: After measuring the performance it seems like there is no measurable improvement. I made this approximation compile time optional for now. See this acts-project#3153 (comment) for more details <img width="577" alt="image" src="https://github.com/acts-project/acts/assets/487211/d613caeb-991f-4a89-98fc-bc051be13b7b"> References - https://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp
EleniXoch · May 6, 2024 · 5b9ce20 · 5b9ce20
1 parent ad09ca4
commit 5b9ce20
Show file tree

Hide file tree

Showing 7 changed files with 285 additions and 16 deletions.
diff --git a/Core/include/Acts/Propagator/EigenStepper.hpp b/Core/include/Acts/Propagator/EigenStepper.hpp
@@ -431,7 +431,7 @@ class EigenStepper {
   std::shared_ptr<const MagneticFieldProvider> m_bField;
 
   /// Overstep limit
-  double m_overstepLimit;
+  double m_overstepLimit{};
 };
 
 template <typename navigator_t>

diff --git a/Core/include/Acts/Propagator/EigenStepper.ipp b/Core/include/Acts/Propagator/EigenStepper.ipp
@@ -9,6 +9,9 @@
 #include "Acts/EventData/TransformationHelpers.hpp"
 #include "Acts/Propagator/ConstrainedStep.hpp"
 #include "Acts/Propagator/detail/CovarianceEngine.hpp"
+#include "Acts/Utilities/QuickMath.hpp"
+
+#include <limits>
 
 template <typename E, typename A>
 Acts::EigenStepper<E, A>::EigenStepper(
@@ -153,7 +156,8 @@ Acts::Result<double> Acts::EigenStepper<E, A>::step(
     propagator_state_t& state, const navigator_t& navigator) const {
   // Runge-Kutta integrator state
   auto& sd = state.stepping.stepData;
-  double error_estimate = 0.;
+
+  double errorEstimate = 0.;
   double h2 = 0, half_h = 0;
 
   auto pos = position(state.stepping);
@@ -172,6 +176,31 @@ Acts::Result<double> Acts::EigenStepper<E, A>::step(
     return 0.;
   }
 
+  const auto calcStepSizeScaling = [&](const double errorEstimate_) -> double {
+    // For details about these values see ATL-SOFT-PUB-2009-001 for details
+    constexpr double lower = 0.25;
+    constexpr double upper = 4.0;
+    // This is given by the order of the Runge-Kutta method
+    constexpr double exponent = 0.25;
+
+    // Whether to use fast power function if available
+    constexpr bool tryUseFastPow{false};
+
+    double x = state.options.stepTolerance / errorEstimate_;
+
+    if constexpr (exponent == 0.25 && !tryUseFastPow) {
+      // This is 3x faster than std::pow
+      x = std::sqrt(std::sqrt(x));
+    } else if constexpr (std::numeric_limits<double>::is_iec559 &&
+                         tryUseFastPow) {
+      x = fastPow(x, exponent);
+    } else {
+      x = std::pow(x, exponent);
+    }
+
+    return std::clamp(x, lower, upper);
+  };
+
   // The following functor starts to perform a Runge-Kutta step of a certain
   // size, going up to the point where it can return an estimate of the local
   // integration error. The results are stated in the local variables above,
@@ -217,12 +246,13 @@ Acts::Result<double> Acts::EigenStepper<E, A>::step(
     }
 
     // Compute and check the local integration error estimate
-    error_estimate =
+    errorEstimate =
         h2 * ((sd.k1 - sd.k2 - sd.k3 + sd.k4).template lpNorm<1>() +
               std::abs(sd.kQoP[0] - sd.kQoP[1] - sd.kQoP[2] + sd.kQoP[3]));
-    error_estimate = std::max(error_estimate, 1e-20);
+    // Protect against division by zero
+    errorEstimate = std::max(1e-20, errorEstimate);
 
-    return success(error_estimate <= state.options.stepTolerance);
+    return success(errorEstimate <= state.options.stepTolerance);
   };
 
   const double initialH =
@@ -240,12 +270,7 @@ Acts::Result<double> Acts::EigenStepper<E, A>::step(
       break;
     }
 
-    // double std::sqrt is 3x faster than std::pow
-    const double stepSizeScaling =
-        std::clamp(std::sqrt(std::sqrt(state.options.stepTolerance /
-                                       std::abs(2. * error_estimate))),
-                   0.25, 4.0);
-    h *= stepSizeScaling;
+    h *= calcStepSizeScaling(2 * errorEstimate);
 
     // If step size becomes too small the particle remains at the initial
     // place
@@ -321,11 +346,7 @@ Acts::Result<double> Acts::EigenStepper<E, A>::step(
     state.stepping.derivative.template segment<3>(4) = sd.k4;
   }
   state.stepping.pathAccumulated += h;
-  // double std::sqrt is 3x faster than std::pow
-  const double stepSizeScaling =
-      std::clamp(std::sqrt(std::sqrt(state.options.stepTolerance /
-                                     std::abs(error_estimate))),
-                 0.25, 4.0);
+  const double stepSizeScaling = calcStepSizeScaling(errorEstimate);
   const double nextAccuracy = std::abs(h * stepSizeScaling);
   const double previousAccuracy = std::abs(state.stepping.stepSize.accuracy());
   const double initialStepLength = std::abs(initialH);

diff --git a/Core/include/Acts/Utilities/QuickMath.hpp b/Core/include/Acts/Utilities/QuickMath.hpp
@@ -0,0 +1,90 @@
+// This file is part of the Acts project.
+//
+// Copyright (C) 2024 CERN for the benefit of the Acts project
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+
+namespace Acts {
+
+/// @brief Fast inverse square root function
+/// Taken from https://en.wikipedia.org/wiki/Fast_inverse_square_root
+/// @param x the number to calculate the inverse square root of
+/// @param iterations the number of newton iterations to perform
+template <typename T>
+constexpr T fastInverseSqrt(T x, int iterations = 1) noexcept {
+  static_assert(std::numeric_limits<T>::is_iec559 &&
+                "Only IEEE 754 is supported");
+  static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
+                "Only float and double are supported");
+  using I = std::conditional_t<std::is_same_v<T, float>, std::uint32_t,
+                               std::uint64_t>;
+
+  constexpr I magic =
+      std::is_same_v<T, float> ? 0x5f3759df : 0x5fe6eb50c7b537a9;
+
+  union {
+    T f;
+    I i;
+  } u = {x};
+
+  u.i = magic - (u.i >> 1);
+
+  for (int i = 0; i < iterations; ++i) {
+    u.f *= 1.5f - 0.5f * x * u.f * u.f;
+  }
+
+  return u.f;
+}
+
+/// @brief Fast power function @see `std::pow`
+/// Taken from
+/// https://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp
+/// @param a the base
+/// @param b the exponent
+constexpr double fastPow(double a, double b) {
+  // enable only on IEEE 754
+  static_assert(std::numeric_limits<double>::is_iec559);
+
+  constexpr std::int64_t magic = 0x3FEF127F00000000;
+
+  union {
+    double f;
+    std::int64_t i;
+  } u = {a};
+
+  u.i = static_cast<std::int64_t>(b * (u.i - magic) + magic);
+
+  return u.f;
+}
+
+/// @brief Fast power function more precise than @see `fastPow`
+/// Taken from
+/// https://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp
+/// @param a the base
+/// @param b the exponent
+constexpr double fastPowMorePrecise(double a, double b) {
+  // binary exponentiation
+  double r = 1.0;
+  int exp = std::abs(static_cast<int>(b));
+  double base = b > 0 ? a : 1 / a;
+  while (exp != 0) {
+    if ((exp & 1) != 0) {
+      r *= base;
+    }
+    base *= base;
+    exp >>= 1;
+  }
+
+  return r * fastPow(a, b - static_cast<int>(b));
+}
+
+}  // namespace Acts
diff --git a/Tests/Benchmarks/CMakeLists.txt b/Tests/Benchmarks/CMakeLists.txt
@@ -29,3 +29,4 @@ add_benchmark(SurfaceIntersection SurfaceIntersectionBenchmark.cpp)
 add_benchmark(RayFrustum RayFrustumBenchmark.cpp)
 add_benchmark(AnnulusBounds AnnulusBoundsBenchmark.cpp)
 add_benchmark(StraightLineStepper StraightLineStepperBenchmark.cpp)
+add_benchmark(QuickMath QuickMathBenchmark.cpp)
diff --git a/Tests/Benchmarks/QuickMathBenchmark.cpp b/Tests/Benchmarks/QuickMathBenchmark.cpp
@@ -0,0 +1,92 @@
+// This file is part of the Acts project.
+//
+// Copyright (C) 2024 CERN for the benefit of the Acts project
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <boost/test/data/test_case.hpp>
+#include <boost/test/unit_test.hpp>
+
+#include "Acts/Tests/CommonHelpers/BenchmarkTools.hpp"
+#include "Acts/Utilities/QuickMath.hpp"
+
+#include <cmath>
+#include <random>
+
+namespace bdata = boost::unit_test::data;
+
+namespace Acts::Test {
+
+/// @brief Another fast power function @see `fastPow`
+// Taken from
+// https://martin.ankerl.com/2007/02/11/optimized-exponential-functions-for-java
+/// @param a the base
+/// @param b the exponent
+constexpr double fastPowAnother(double a, double b) {
+  // enable only on IEEE 754
+  static_assert(std::numeric_limits<double>::is_iec559);
+
+  union {
+    double f;
+    std::int64_t i;
+  } u = {};
+
+  u.i = static_cast<std::int64_t>(
+      9076650 * (a - 1) / (a + 1 + 4 * std::sqrt(a)) * b + 1072632447);
+  u.i <<= 32;
+
+  // result seems broken?
+  return u.f;
+}
+
+// Some randomness & number crunching
+const unsigned int nTests = 10;
+const unsigned int nReps = 10000;
+
+BOOST_DATA_TEST_CASE(
+    benchmark_pow_25,
+    bdata::random(
+        (bdata::engine = std::mt19937(), bdata::seed = 21,
+         bdata::distribution = std::uniform_real_distribution<double>(-4, 4))) ^
+        bdata::xrange(nTests),
+    baseExp, index) {
+  (void)index;
+
+  const double base = std::pow(10, baseExp);
+  const double exp = 0.25;
+
+  std::cout << std::endl
+            << "Benchmarking base=" << base << ", exp=" << exp << "..."
+            << std::endl;
+  std::cout << "- void: "
+            << Acts::Test::microBenchmark([&] { return 0; }, nReps)
+            << std::endl;
+  std::cout << "- std::pow: "
+            << Acts::Test::microBenchmark([&] { return std::pow(base, exp); },
+                                          nReps)
+            << std::endl;
+  std::cout << "- std::exp: "
+            << Acts::Test::microBenchmark(
+                   [&] { return std::exp(std::log(base) * exp); }, nReps)
+            << std::endl;
+  std::cout << "- std::sqrt: "
+            << Acts::Test::microBenchmark(
+                   [&] { return std::sqrt(std::sqrt(base)); }, nReps)
+            << std::endl;
+  std::cout << "- fastPow: "
+            << Acts::Test::microBenchmark([&] { return fastPow(base, exp); },
+                                          nReps)
+            << std::endl;
+  std::cout << "- fastPowMorePrecise: "
+            << Acts::Test::microBenchmark(
+                   [&] { return fastPowMorePrecise(base, exp); }, nReps)
+            << std::endl;
+  std::cout << "- fastPowAnother: "
+            << Acts::Test::microBenchmark(
+                   [&] { return fastPowAnother(base, exp); }, nReps)
+            << std::endl;
+}
+
+}  // namespace Acts::Test
diff --git a/Tests/UnitTests/Core/Utilities/CMakeLists.txt b/Tests/UnitTests/Core/Utilities/CMakeLists.txt
@@ -51,3 +51,4 @@ target_compile_definitions(ActsUnitTestAnyDebug PRIVATE _ACTS_ANY_ENABLE_VERBOSE
 add_unittest(ParticleData ParticleDataTests.cpp)
 add_unittest(Zip ZipTests.cpp)
 add_unittest(TransformRange TransformRangeTests.cpp)
+add_unittest(QuickMath QuickMathTests.cpp)
diff --git a/Tests/UnitTests/Core/Utilities/QuickMathTests.cpp b/Tests/UnitTests/Core/Utilities/QuickMathTests.cpp
@@ -0,0 +1,64 @@
+// This file is part of the Acts project.
+//
+// Copyright (C) 2024 CERN for the benefit of the Acts project
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <boost/test/data/test_case.hpp>
+#include <boost/test/unit_test.hpp>
+
+#include "Acts/Tests/CommonHelpers/FloatComparisons.hpp"
+#include "Acts/Utilities/QuickMath.hpp"
+
+namespace bdata = boost::unit_test::data;
+
+const auto expDist = bdata::random(
+    (bdata::engine = std::mt19937{}, bdata::seed = 0,
+     bdata::distribution = std::uniform_real_distribution<double>(-4, 4)));
+
+BOOST_AUTO_TEST_SUITE(Utilities)
+
+BOOST_DATA_TEST_CASE(fastInverseSqrt, expDist ^ bdata::xrange(100), exp, i) {
+  (void)i;
+
+  const double x = std::pow(10, exp);
+
+  const float fastFloat = Acts::fastInverseSqrt(static_cast<float>(x));
+  const double fastDouble = Acts::fastInverseSqrt(x);
+
+  const double stdFloat = 1.0 / std::sqrt(static_cast<float>(x));
+  const double stdDouble = 1.0 / std::sqrt(x);
+
+  CHECK_CLOSE_REL(stdFloat, fastFloat, 0.01);
+  CHECK_CLOSE_REL(stdDouble, fastDouble, 0.01);
+}
+
+BOOST_DATA_TEST_CASE(fastPow, expDist ^ expDist ^ bdata::xrange(100), baseExp,
+                     exp, i) {
+  (void)i;
+
+  const double base = std::pow(10, baseExp);
+
+  const double fast = Acts::fastPow(base, exp);
+  const double fastMorePrecise = Acts::fastPowMorePrecise(base, exp);
+
+  const double std = std::pow(base, exp);
+
+  CHECK_CLOSE_REL(fast, std, 0.15);
+  CHECK_CLOSE_REL(fastMorePrecise, std, 0.1);
+}
+
+// BOOST_AUTO_TEST_CASE(fastPowChart) {
+//   std::cout << "a ref obs" << std::endl;
+//   for (double aExp = -4; aExp <= 4; aExp += 0.01) {
+//     double a = std::pow(10, aExp);
+//     double ref = std::pow(a, 0.25);
+//     double obs = Acts::fastPow(a, 0.25);
+
+//     std::cout << a << " " << ref << " " << obs << std::endl;
+//   }
+// }
+
+BOOST_AUTO_TEST_SUITE_END()