diff --git a/CMakeLists.txt b/CMakeLists.txt
index a415102c42..96d69ab7e4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -96,16 +96,17 @@ option(SINGULARITY_BETTER_DEBUG_FLAGS "Better debug flags for singularity" ON)
 option(SINGULARITY_HIDE_MORE_WARNINGS "hide more warnings" OFF)
 
 # toggle code options
-option(SINGULARITY_USE_SINGLE_LOGS
-       "Use single precision logs. Can harm accuracy." OFF)
 option(SINGULARITY_USE_TRUE_LOG_GRIDDING
-       "Use grids that conform to log spacing." OFF)
-# TODO(JMM): Should this automatically be activated when true log gridding is
-# off?
-cmake_dependent_option(
-  SINGULARITY_USE_HIGH_RISK_MATH
-  "Use integer aliased logs, may not be portable" OFF
-  "NOT SINGULARITY_USE_TRUE_LOG_GRIDDING" OFF)
+       "Use grids that conform to log spacing." OFF)
+cmake_dependent_option(SINGULARITY_USE_SINGLE_LOGS
+  "Use single precision logs. Only available for true log gridding. Can harm accuracy."
+  OFF "SINGULARITY_USE_TRUE_LOG_GRIDDING" OFF)
+option(SINGULARITY_NQT_ORDER_1
+  "In NQT logs, use first order. Faster but less accurate."
+  OFF)
+option(SINGULARITY_NQT_PORTABLE
+  "In NQT logs, use the portable rather than the bithacked implementation. Slower, but more likely to function on exotic architectures."
+  OFF)
 
 # misc options
 option(SINGULARITY_FORCE_SUBMODULE_MODE "Submodule mode" OFF)
@@ -126,6 +127,26 @@ set(SINGULARITY_PLUGINS "" CACHE STRING "List of paths to plugin directories")
 set(SINGULARITY_VARIANT "singularity-eos/eos/default_variant.hpp" CACHE STRING
     "The include path for the file containing the definition of singularity::EOS.")
 
+# Detect ARM architecture
+set(SINGULARITY_ON_ARM OFF CACHE BOOL "We are running on an ARM system")
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|aarch64)")
+  if (NOT SINGULARITY_USE_CUDA)
+    message(STATUS
+      "ARM architecture detected: ${CMAKE_SYSTEM_PROCESSOR}")
+    set(SINGULARITY_ON_ARM ON CACHE BOOL
+        "We are running on an ARM system" FORCE)
+  endif()
+endif()
+
+if (SINGULARITY_ON_ARM)
+  if (NOT SINGULARITY_USE_TRUE_LOG_GRIDDING)
+    message(WARNING
+      "Fast logs are not necessarily better on ARM CPU systems. "
+      "You may wish to build with "
+      "-DSINGULARITY_USE_TRUE_LOG_GRIDDING=ON.")
+  endif()
+endif()
+
 # ------------------------------------------------------------------------------#
 # singularity-eos Library
 # ------------------------------------------------------------------------------#
@@ -163,6 +184,15 @@ if(SINGULARITY_USE_FORTRAN)
   include(CMakeDetermineFortranCompiler)
 endif()
 
+# Big endianness
+include(TestBigEndian)
+TEST_BIG_ENDIAN(IS_BIG_ENDIAN)
+if (IS_BIG_ENDIAN)
+  message(WARNING "Big endian detected! "
+                  "Integer aliasing as currently implemented will not function. "
+                  "Please set -DSINGULARITY_NQT_PORTABLE=ON.")
+endif()
+
 include(GNUInstallDirs)
 
 if(SINGULARITY_BUILD_PYTHON)
@@ -255,16 +285,21 @@ if(SINGULARITY_BUILD_SESAME2SPINER)
 endif()
 
 # defines
+if (SINGULARITY_USE_TRUE_LOG_GRIDDING)
+  target_compile_definitions(singularity-eos_Interface
+    INTERFACE SINGULARITY_USE_TRUE_LOG_GRIDDING)
+endif()
 if(SINGULARITY_USE_SINGLE_LOGS)
-  target_compile_definitions(singularity-eos_Interface INTERFACE SINGULARITY_USE_SINGLE_LOGS)
+  target_compile_definitions(singularity-eos_Interface
+    INTERFACE SINGULARITY_USE_SINGLE_LOGS)
 endif()
-if(SINGULARITY_USE_HIGH_RISK_MATH)
+if(SINGULARITY_NQT_ORDER_1)
   target_compile_definitions(singularity-eos_Interface
-    INTERFACE SINGULARITY_USE_HIGH_RISK_MATH)
+    INTERFACE SINGULARITY_NQT_ORDER_1)
 endif()
-if (SINGULARITY_USE_TRUE_LOG_GRIDDING)
+if(SINGULARITY_NQT_PORTABLE)
   target_compile_definitions(singularity-eos_Interface
-    INTERFACE SINGULARITY_USE_TRUE_LOG_GRIDDING)
+    INTERFACE SINGULARITY_NQT_PORTABLE)
 endif()
 if(SINGULARITY_TEST_SESAME)
   target_compile_definitions(singularity-eos_Interface INTERFACE SINGULARITY_TEST_SESAME)
diff --git a/doc/sphinx/src/building.rst b/doc/sphinx/src/building.rst
index d5b4839b4c..6dc37ce57c 100644
--- a/doc/sphinx/src/building.rst
+++ b/doc/sphinx/src/building.rst
@@ -114,8 +114,10 @@ The main CMake options to configure building are in the following table:
   ``SINGULARITY_BETTER_DEBUG_FLAGS``     ON      Enables nicer GPU debug flags. May interfere with in-tree builds as a submodule.
   ``SINGULARITY_HIDE_MORE_WARNINGS``     OFF     Makes warnings less verbose. May interfere with in-tree builds as a submodule.
   ``SINGULARITY_FORCE_SUBMODULE_MODE``   OFF     Force build in _submodule_ mode.
-  ``SINGULARITY_USE_SINGLE_LOGS``        OFF     Use single precision logarithms (may degrade accuracy).
   ``SINGULARITY_USE_TRUE_LOG_GRIDDING``  OFF     Use grids that conform to logarithmic spacing.
+  ``SINGULARITY_USE_SINGLE_LOGS``        OFF     Use single precision logarithms (may degrade accuracy).
+  ``SINGULARITY_NQT_ORDER_1``            OFF     For fast logs, use the less accurate but faster 1st-order version.
+  ``SINGULARITY_NQT_PORTABLE``           OFF     For fast logs, use the slower but endianness-independent implementation.
  ====================================== ======= ===========================================
 
 More options are available to modify only if certain other options or
" + "Please set -DSINGULARITY_NQT_PORTABLE=ON.") +endif() + include(GNUInstallDirs) if(SINGULARITY_BUILD_PYTHON) @@ -255,16 +285,21 @@ if(SINGULARITY_BUILD_SESAME2SPINER) endif() # defines +if (SINGULARITY_USE_TRUE_LOG_GRIDDING) + target_compile_definitions(singularity-eos_Interface + INTERFACE SINGULARITY_USE_TRUE_LOG_GRIDDING) +endif() if(SINGULARITY_USE_SINGLE_LOGS) - target_compile_definitions(singularity-eos_Interface INTERFACE SINGULARITY_USE_SINGLE_LOGS) + target_compile_definitions(singularity-eos_Interface + INTERFACE SINGULARITY_USE_SINGLE_LOGS) endif() -if(SINGULARITY_USE_HIGH_RISK_MATH) +if(SINGULARITY_NQT_ORDER_1) target_compile_definitions(singularity-eos_Interface - INTERFACE SINGULARITY_USE_HIGH_RISK_MATH) + INTERFACE SINGULARITY_NQT_ORDER_1) endif() -if (SINGULARITY_USE_TRUE_LOG_GRIDDING) +if(SINGULARITY_NQT_PORTABLE) target_compile_definitions(singularity-eos_Interface - INTERFACE SINGULARITY_USE_TRUE_LOG_GRIDDING) + INTERFACE SINGULARITY_NQT_PORTABLE) endif() if(SINGULARITY_TEST_SESAME) target_compile_definitions(singularity-eos_Interface INTERFACE SINGULARITY_TEST_SESAME) diff --git a/doc/sphinx/src/building.rst b/doc/sphinx/src/building.rst index d5b4839b4c..6dc37ce57c 100644 --- a/doc/sphinx/src/building.rst +++ b/doc/sphinx/src/building.rst @@ -114,8 +114,10 @@ The main CMake options to configure building are in the following table: ``SINGULARITY_BETTER_DEBUG_FLAGS`` ON Enables nicer GPU debug flags. May interfere with in-tree builds as a submodule. ``SINGULARITY_HIDE_MORE_WARNINGS`` OFF Makes warnings less verbose. May interfere with in-tree builds as a submodule. ``SINGULARITY_FORCE_SUBMODULE_MODE`` OFF Force build in _submodule_ mode. - ``SINGULARITY_USE_SINGLE_LOGS`` OFF Use single precision logarithms (may degrade accuracy). ``SINGULARITY_USE_TRUE_LOG_GRIDDING`` OFF Use grids that conform to logarithmic spacing. + ``SINGULARITY_USE_SINGLE_LOGS`` OFF Use single precision logarithms (may degrade accuracy). + ``SINGULARITY_NQT_ORDER_1`` OFF For fast logs, use the less accurate but faster 1st-order version. + ``SINGULARITY_NQT_PORTABLE`` OFF For fast logs, use the slower but endianness-independent implementation. ====================================== ======= =========================================== More options are available to modify only if certain other options or diff --git a/doc/sphinx/src/contributing.rst b/doc/sphinx/src/contributing.rst index d7526bee59..81b8484db4 100644 --- a/doc/sphinx/src/contributing.rst +++ b/doc/sphinx/src/contributing.rst @@ -667,34 +667,42 @@ of the mantissa plus the exponent: \lg(x) = \lg(m) + e Therefore, if we can find a fast, invertible approximation to -:math:`\lg(m)`, we will have achieved our goal. It turns out the -expression +:math:`\lg(m)`, we will have achieved our goal. The linear +interpolation of :math:`\lg(m)` on the given interval is .. math:: 2 (x - 1) -works pretty well, so we use that. (To convince yourself of this note -that for :math:`x=1/2` this expression returns -1 and for :math:`x=1`, -it returns 0, which are the correct values of :math:`\lg(x)` at the -bounds of the interval.) Thus our approximate, invertible expression -for :math:`\lg` is just +and the quadratic is .. math:: - 2 (m - 1) + e - -for the mantissa and exponent extracted via ``frexp``. This differs -from :math:`lg` by a maximum of about 0.1, which translates to at most -a 25 percent difference. As discussed above, however, the function -itself is an exact representation of itself and the difference from -:math:`lg` is acceptable. 
+
+While these functions are not exactly logarithms, they do work for
+building logarithmic grids. The smoothness of the transformation
+mapping from linear to "not-quite-log" space does matter for
+interpolation, however. With the linear version of the approximate
+log, linear interpolation in "not-quite-log" space converges at
+second order only in the :math:`L^1` norm. The quadratic version
+provides second-order convergence in all norms.
+
+Finally, while ``frexp`` and ``ldexp`` are portable and performant,
+they are less performant than hand-implemented, low-level methods
+that leverage the bitwise structure of floating point numbers. These
+"bithacked" or "integer aliased" implementations are what we use in
+practice in the code.
+
+This approach is described in more detail in our `short note`_ on the
+topic.
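+
+As a minimal illustration, the first-order pair can be sketched with
+``frexp``/``ldexp`` as follows (the names are for exposition only and
+are not the library API):
+
+.. code-block:: cpp
+
+  #include <cmath>
+
+  // Approximate lg(x) as 2 (m - 1) + e, where x = m * 2^e, m in [0.5, 1).
+  double nqt_lg(double x) {
+    int e;
+    const double m = std::frexp(x, &e);
+    return 2 * (m - 1) + e;
+  }
+
+  // Inverse: recover the mantissa and exponent, then rebuild x.
+  double nqt_pow2(double y) {
+    const double f = std::floor(y);
+    const double m = 0.5 * ((y - f) + 1);
+    return std::ldexp(m, static_cast<int>(f) + 1);
+  }
+
+  // nqt_pow2(nqt_lg(x)) recovers x up to floating point roundoff.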
 
 .. _Short note: https://arxiv.org/abs/2206.08957
diff --git a/doc/sphinx/src/models.rst b/doc/sphinx/src/models.rst
index c3517e7529..1087807b2b 100644
--- a/doc/sphinx/src/models.rst
+++ b/doc/sphinx/src/models.rst
@@ -1736,12 +1736,17 @@ return a ``Real`` number.
 .. warning::
     As with the SpinerEOS models, the stellar collapse models use
     fast logs. You can switch the logs to true logs with the
-    ``SINGULARITY_USE_TRUE_LOG_GRIDDING`` cmake option.
+    ``SINGULARITY_USE_TRUE_LOG_GRIDDING`` cmake option. This may be
+    desirable on ARM-based architectures (e.g., ``aarch64``), where
+    a hardware log intrinsic is available.
+
 .. note::
-  A more performant implementation of fast logs is available, but it
-  might not be portable. Enable it with the
-  ``SINGULARITY_USE_HIGH_RISK_MATH`` cmake option.
+  The default implementation of our fast logs assumes little-endian
+  numbers and will not work properly on big-endian machines. If you
+  encounter a big-endian machine, please report it to us in the
+  issues and (for now) enable the portable implementation of fast
+  logs with ``-DSINGULARITY_NQT_PORTABLE=ON``.
 
 .. _Stellar Collapse: https://stellarcollapse.org/equationofstate.html
@@ -1750,7 +1755,6 @@ return a ``Real`` number.
 
 .. _median filter: https://en.wikipedia.org/wiki/Median_filter
 
-
 Helmholtz EOS
 ``````````````
diff --git a/singularity-eos/base/fast-math/logs.hpp b/singularity-eos/base/fast-math/logs.hpp
index cfbe274bff..09dc2be059 100644
--- a/singularity-eos/base/fast-math/logs.hpp
+++ b/singularity-eos/base/fast-math/logs.hpp
@@ -16,9 +16,10 @@
 #define _SINGULARITY_EOS_UTILS_FAST_MATH_LOGS_
 #include <cassert>
 #include <cmath>
+#include <cstdint>
 #include <ports-of-call/portability.hpp>
-
+#include <ports-of-call/portable_errors.hpp>
 /*
  * These functions are for use when moving to and from the gridding space
  * for Spiner grids.
@@ -36,38 +37,21 @@
  *
  * 3. The function and its inverse must be fast.
  *
- * To meet these constraints, we approximate a logarithm by using
- * frexp and ldexp to split a real number into its mantissa + exponent
- * in base 2. The mantissa is a real number on the interval [0.5, 1)
- * and the exponent is an integer such that
+ * To meet these constraints, we split a real number into its mantissa
+ * + exponent in base 2. The mantissa is a real number on the interval
+ * [0.5, 1) and the exponent is an integer such that
  *
  * mantissa * 2^exponent = number
  *
- * To truly take the log, one needs to take the log of the mantissa.
- * However, the first term in the logarithm's Taylor series is linear.
- * Thus a linear approximation to log, for small numbers is valid.
- * We use that linear approximation, which turns out to be:
- *
- * 2*(mantissa - 1)
- *
- * So that mantissa = 0.5 returns log_2(0.5) = -1 and mantissa (1)
- * yields 0.
- *
- * Then the approximate log of the whole thing is the approximation of
- * the log of the mantissa, plus the exponent:
- *
- * 2*(mantissa - 1) + exponent
- *
- * Which is a continuous, invertible function that approximates lg
- * only relatively accurately. The absolute difference of this
- * function from lg is about 0.1 at its largest. This translates to a
- * relative difference of at most 25%.
- *
- * However, we don't mind this difference, as we're only interested in
- * generating a grid spacing "close enough" to log spacing. As long as
- * we can go into and out of "grid space" reliably and quickly, we're
- * happy, and this function is an EXACT map into and out of that
- * space.
+ * The log in base 2 is then
+ *
+ * lg(number) = lg(mantissa) + exponent
+ *
+ * so our goal is to approximate lg(mantissa) subject to the above
+ * criteria. We have two approaches. The first is a linear
+ * approximation of lg(mantissa). The second is a quadratic one. The
+ * latter allows for continuity of the derivative.
  *
  * See ArXiv:2206.08957
  */
@@ -78,76 +62,166 @@ namespace singularity {
 namespace FastMath {
 
-// Integer Aliased versions. Likely not as portable
-// TODO(JMM): Add single-precision versions
+template <typename T>
+PORTABLE_FORCEINLINE_FUNCTION auto sgn(const T &x) {
+  return (T(0) < x) - (x < T(0));
+}
+
 PORTABLE_FORCEINLINE_FUNCTION
-auto as_long_int(double f) { return *reinterpret_cast<long long int *>(&f); }
+auto as_int(double f) { return *reinterpret_cast<std::int64_t *>(&f); }
 PORTABLE_FORCEINLINE_FUNCTION
-auto as_double(long long int i) { return *reinterpret_cast<double *>(&i); }
+auto as_double(std::int64_t i) { return *reinterpret_cast<double *>(&i); }
 
+// First order interpolation based NQTs
+// ----------------------------------------------------------------------
+// Reference implementations; however, the integer cast implementation
+// below is probably faster.
 PORTABLE_FORCEINLINE_FUNCTION
-double fastlg_aliased(const double x) {
+double lg_o1_portable(const double x) {
+  int e;
+  PORTABLE_REQUIRE(x > 0, "log divergent for x <= 0");
+  const double m = frexp(x, &e);
+  return 2 * (m - 1) + e;
+}
+
+PORTABLE_FORCEINLINE_FUNCTION
+double pow2_o1_portable(const double x) {
+  const int flr = std::floor(x);
+  const double remainder = x - flr;
+  const double mantissa = 0.5 * (remainder + 1);
+  const double exponent = flr + 1;
+  return ldexp(mantissa, exponent);
+}
+
+// Integer aliased versions
+PORTABLE_FORCEINLINE_FUNCTION
+double lg_o1_aliased(const double x) {
+  PORTABLE_REQUIRE(x > 0, "log divergent for x <= 0");
   // Magic numbers constexpr because C++ doesn't constexpr reinterpret casts
   // these are floating point numbers as reinterpreted as integers.
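+  // Quick worked example (illustrative): for x = 48, frexp gives
+  // m = 0.75 and e = 6, so the first-order NQT log is
+  //   2 * (0.75 - 1) + 6 = 5.5,
+  // close to the true lg(48) ~ 5.585. Inverting, pow2_o1_portable(5.5)
+  // computes flr = 5, mantissa = 0.75, exponent = 6, and
+  // ldexp(0.75, 6) recovers exactly 48.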
-  // as_long_int(1.0)
-  constexpr long long int one_as_long_int = 4607182418800017408;
-  // 1./static_cast<double>(as_long_int(2.0) - as_long_int(1.0))
+  // as_int(1.0)
+  constexpr std::int64_t one_as_int = 4607182418800017408;
+  // 1./static_cast<double>(as_int(2.0) - as_int(1.0))
   constexpr double scale_down = 2.22044604925031e-16;
-  return static_cast<double>(as_long_int(x) - one_as_long_int) * scale_down;
+  return static_cast<double>(as_int(x) - one_as_int) * scale_down;
 }
+
 PORTABLE_FORCEINLINE_FUNCTION
-double fastpow2_aliased(const double x) {
+double pow2_o1_aliased(const double x) {
   // Magic numbers constexpr because C++ doesn't constexpr reinterpret casts
   // these are floating point numbers as reinterpreted as integers.
-  // as_long_int(1.0)
-  constexpr long long int one_as_long_int = 4607182418800017408;
-  // as_long_int(2.0) - as_long_int(1.0)
+  // as_int(1.0)
+  constexpr std::int64_t one_as_int = 4607182418800017408;
+  // as_int(2.0) - as_int(1.0)
   constexpr double scale_up = 4503599627370496;
-  return as_double(static_cast<long long int>(x * scale_up) + one_as_long_int);
+  return as_double(static_cast<std::int64_t>(x * scale_up) + one_as_int);
 }
+// ----------------------------------------------------------------------
+
+// Second-order interpolation based NQTs
+// These implementations are due to Peter Hammond
+// ----------------------------------------------------------------------
+// Portable versions that use frexp/ldexp rather than integer aliasing
+PORTABLE_FORCEINLINE_FUNCTION
+double lg_o2_portable(const double x) {
+  PORTABLE_REQUIRE(x > 0, "log divergent for x <= 0");
+  constexpr double four_thirds = 4. / 3.;
+  int e;
+  const double m = frexp(x, &e);
+  return e - four_thirds * (m - 2) * (m - 1);
+}
+
+// This version uses the exact inverse formula
+PORTABLE_FORCEINLINE_FUNCTION
+double pow2_o2_portable(const double x) {
+  // log2(mantissa); should lie between -1 and 0
+  const int flr = std::floor(x);
+  const double lm = x - flr - 1;
+  const double mantissa = 0.5 * (3 - std::sqrt(1 - 3 * lm));
+  const double exponent = flr + 1;
+  return ldexp(mantissa, exponent);
+}
+
+// Integer aliased/bithacked versions
+PORTABLE_FORCEINLINE_FUNCTION
+double lg_o2_aliased(const double x) {
+  PORTABLE_REQUIRE(x > 0, "log divergent for x <= 0");
+  // as_int(1.0) == 2^62 - 2^52
+  constexpr std::int64_t one_as_int = 4607182418800017408;
+  // 1/(as_int(2.0) - as_int(1.0)) == 2^-52
+  constexpr double scale_down = 2.220446049250313e-16;
+  // 2^52 - 1
+  constexpr std::int64_t mantissa_mask = 4503599627370495;
+  // 2^26 - 1
+  constexpr std::int64_t low_mask = 67108863;
+
+  const std::int64_t x_as_int = as_int(x) - one_as_int;
+  const std::int64_t frac_as_int = x_as_int & mantissa_mask;
+  const std::int64_t frac_high = frac_as_int >> 26;
+  const std::int64_t frac_low = frac_as_int & low_mask;
+  const std::int64_t frac_squared =
+      frac_high * frac_high + ((frac_high * frac_low) >> 25);
+
+  return static_cast<double>(x_as_int + ((frac_as_int - frac_squared) / 3)) *
+         scale_down;
+}
+
+PORTABLE_FORCEINLINE_FUNCTION
+double pow2_o2_aliased(const double x) {
+  // as_int(1.0) == 2^62 - 2^52
+  constexpr std::int64_t one_as_int = 4607182418800017408;
+  // as_int(2.0) - as_int(1.0) == 2^52
+  constexpr double scale_up = 4503599627370496;
+  constexpr std::int64_t mantissa_mask = 4503599627370495; // 2^52 - 1
+  constexpr std::int64_t a = 9007199254740992;             // 2 * 2^52
+  constexpr double b = 67108864;                           // 2^26
+  constexpr std::int64_t c = 18014398509481984;            // 4 * 2^52
+
+  const std::int64_t x_as_int = static_cast<std::int64_t>(x * scale_up);
+  const std::int64_t frac_as_int = x_as_int & mantissa_mask;
+  const std::int64_t frac_sqrt = static_cast<std::int64_t>(
+      b * std::sqrt(static_cast<double>(c - 3 * frac_as_int)));
+
+  return as_double(x_as_int + a - frac_sqrt - frac_as_int + one_as_int);
+}
+// ----------------------------------------------------------------------
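+
+// Note on the bithack (illustrative): for positive doubles,
+// (as_int(x) - one_as_int) * 2^-52 is exactly exponent + fractional
+// part of the mantissa, so the arithmetic above acts directly on the
+// not-quite-log representation. The 26-bit high/low split in
+// lg_o2_aliased evaluates frac^2 / 2^52 without overflowing signed
+// 64-bit integers, since squaring a full 52-bit value needs 104 bits.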
 
 PORTABLE_FORCEINLINE_FUNCTION
 double fastlg(const double x) {
-#ifdef SINGULARITY_USE_HIGH_RISK_MATH
-  return fastlg_aliased(x);
+#ifdef SINGULARITY_NQT_ORDER_1
+#ifdef SINGULARITY_NQT_PORTABLE
+  return lg_o1_portable(x);
 #else
-  int n;
-  assert(x > 0 && "log divergent for x <= 0");
-#ifndef SINGULARITY_USE_SINGLE_LOGS
-  // double precision is default
-  const double y = frexp(x, &n);
+  return lg_o1_aliased(x);
+#endif // PORTABLE
 #else
-  // faster but less accurate
-  const float y = frexpf((float)x, &n);
-#endif // SINGULARITY_USE_SINGLE_LOGS
-  return 2 * (y - 1) + n;
-#endif // SINGULARITY_USE_HIGH_RISK_MATH
+#ifdef SINGULARITY_NQT_PORTABLE
+  return lg_o2_portable(x);
+#else
+  return lg_o2_aliased(x);
+#endif // PORTABLE
+#endif // NQT_ORDER
 }
 
 PORTABLE_FORCEINLINE_FUNCTION
 double fastpow2(const double x) {
-#ifdef SINGULARITY_USE_HIGH_RISK_MATH
-  return fastpow2_aliased(x);
+#ifdef SINGULARITY_NQT_ORDER_1
+#ifdef SINGULARITY_NQT_PORTABLE
+  return pow2_o1_portable(x);
 #else
-  const int flr = std::floor(x);
-  const double remainder = x - flr;
-  const double mantissa = 0.5 * (remainder + 1);
-  const double exponent = flr + 1;
-#ifndef SINGULARITY_USE_SINGLE_LOGS
-  // double precision is default
-  return ldexp(mantissa, exponent);
+  return pow2_o1_aliased(x);
+#endif // PORTABLE
 #else
-  // Faster but less accurate
-  return ldexpf((float)mantissa, exponent);
-#endif // SINGULARITY_USE_SINGLE_LOGS
-#endif // SINGULARITY_USE_HIGH_RISK_MATH
+#ifdef SINGULARITY_NQT_PORTABLE
+  return pow2_o2_portable(x);
+#else
+  return pow2_o2_aliased(x);
+#endif // PORTABLE
+#endif // NQT_ORDER
 }
 
 PORTABLE_FORCEINLINE_FUNCTION
 double lg(const double x) {
-
-  assert(x > 0 && "log divergent for x <= 0");
-
+  PORTABLE_REQUIRE(x > 0, "log divergent for x <= 0");
 #ifndef SINGULARITY_USE_TRUE_LOG_GRIDDING
   // Default expression
   return fastlg(x);