Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

nqt o2 #433

Merged
merged 20 commits into from
Nov 21, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,4 @@ jobs:
#..
make
make install
make test
ctest --output-on-failure
63 changes: 49 additions & 14 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -96,16 +96,17 @@ option(SINGULARITY_BETTER_DEBUG_FLAGS "Better debug flags for singularity" ON)
option(SINGULARITY_HIDE_MORE_WARNINGS "hide more warnings" OFF)

# toggle code options
option(SINGULARITY_USE_SINGLE_LOGS
"Use single precision logs. Can harm accuracy." OFF)
option(SINGULARITY_USE_TRUE_LOG_GRIDDING
"Use grids that conform to log spacing." OFF)
# TODO(JMM): Should this automatically be activated when true log gridding is
# off?
cmake_dependent_option(
SINGULARITY_USE_HIGH_RISK_MATH
"Use integer aliased logs, may not be portable" OFF
"NOT SINGULARITY_USE_TRUE_LOG_GRIDDING" OFF)
"Use grids that conform to log spacing." OFF)
cmake_dependent_option(SINGULARITY_USE_SINGLE_LOGS
"Use single precision logs. Only available for true log gridding. Can harm accuracy."
OFF "SINGULARITY_USE_TRUE_LOG_GRIDDING" OFF)
option(SINGULARITY_NQT_ORDER_1
Yurlungur marked this conversation as resolved.
Show resolved Hide resolved
"In NQT logs, use first order. Faster but less accurate."
OFF)
option(SINGULARITY_NQT_PORTABLE
"In NQT logs, use portable, rather than bithacked implementation. Slower, but more likely to function on exotic architectures."
OFF)

# misc options
option(SINGULARITY_FORCE_SUBMODULE_MODE "Submodule mode" OFF)
Expand All @@ -126,6 +127,26 @@ set(SINGULARITY_PLUGINS "" CACHE STRING "List of paths to plugin directories")
set(SINGULARITY_VARIANT "singularity-eos/eos/default_variant.hpp" CACHE STRING
"The include path for the file containing the definition of singularity::EOS.")

# Detect ARM architecture
set(SINGULARITY_ON_ARM OFF CACHE BOOL "We are running on an ARM system")
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|aarch64)")
if (NOT SINGULARITY_USE_CUDA)
message(STATUS
"ARM architecture detected: ${CMAKE_SYSTEM_PROCESSOR}")
set(SINGULARITY_ON_ARM ON CACHE BOOL
"We are running on an ARM system")
endif()
endif()

if (SINGULAIRTY_ON_ARM)
if (NOT SINGULARITY_USE_TRUE_LOG_GRIDDING)
message(WARNING
"Fast logs not necessarily better on ARM CPU systems. "
"You may wish to build with "
"-DSINGULARITY_USE_TRUE_LOG_GRIDDING=ON.")
endif()
endif()

# ------------------------------------------------------------------------------#
# singularity-eos Library
# ------------------------------------------------------------------------------#
Expand Down Expand Up @@ -163,6 +184,15 @@ if(SINGULARITY_USE_FORTRAN)
include(CMakeDetermineFortranCompiler)
endif()

# Big endianness
include(TestBigEndian)
TEST_BIG_ENDIAN(IS_BIG_ENDIAN)
if (BIG_ENDIAN)
message(WARNING "Big endian detected! "
"Integer aliasing as currently implemented will not function. "
"Please set -DSINGULARITY_NQT_PORTABLE=ON.")
endif()

include(GNUInstallDirs)

if(SINGULARITY_BUILD_PYTHON)
Expand Down Expand Up @@ -255,16 +285,21 @@ if(SINGULARITY_BUILD_SESAME2SPINER)
endif()

# defines
if (SINGULARITY_USE_TRUE_LOG_GRIDDING)
target_compile_definitions(singularity-eos_Interface
INTERFACE SINGULARITY_USE_TRUE_LOG_GRIDDING)
endif()
if(SINGULARITY_USE_SINGLE_LOGS)
target_compile_definitions(singularity-eos_Interface INTERFACE SINGULARITY_USE_SINGLE_LOGS)
target_compile_definitions(singularity-eos_Interface
INTERFACE SINGULARITY_USE_SINGLE_LOGS)
endif()
if(SINGULARITY_USE_HIGH_RISK_MATH)
if(SINGULARITY_NQT_ORDER_1)
target_compile_definitions(singularity-eos_Interface
INTERFACE SINGULARITY_USE_HIGH_RISK_MATH)
INTERFACE SINGULARITY_NQT_ORDER_1)
endif()
if (SINGULARITY_USE_TRUE_LOG_GRIDDING)
if(SINGULARITY_NQT_PORTABLE)
target_compile_definitions(singularity-eos_Interface
INTERFACE SINGULARITY_USE_TRUE_LOG_GRIDDING)
INTERFACE SINGULARITY_NQT_PORTABLE)
endif()
if(SINGULARITY_TEST_SESAME)
target_compile_definitions(singularity-eos_Interface INTERFACE SINGULARITY_TEST_SESAME)
Expand Down
4 changes: 3 additions & 1 deletion doc/sphinx/src/building.rst
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,10 @@ The main CMake options to configure building are in the following table:
``SINGULARITY_BETTER_DEBUG_FLAGS`` ON Enables nicer GPU debug flags. May interfere with in-tree builds as a submodule.
``SINGULARITY_HIDE_MORE_WARNINGS`` OFF Makes warnings less verbose. May interfere with in-tree builds as a submodule.
``SINGULARITY_FORCE_SUBMODULE_MODE`` OFF Force build in _submodule_ mode.
``SINGULARITY_USE_SINGLE_LOGS`` OFF Use single precision logarithms (may degrade accuracy).
``SINGULARITY_USE_TRUE_LOG_GRIDDING`` OFF Use grids that conform to logarithmic spacing.
``SINGULARITY_USE_SINGLE_LOGS`` OFF Use single precision logarithms (may degrade accuracy).
``SINGULARITY_NQT_ORDER_1`` OFF For fast logs, use the less accurate but faster 1st-order version.
``SINGULARITY_NQT_PORTABLE`` OFF For fast logs, use the slower but endianness-independent implementation.
====================================== ======= ===========================================

More options are available to modify only if certain other options or
Expand Down
48 changes: 28 additions & 20 deletions doc/sphinx/src/contributing.rst
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The change to this documentation is where the core of the method is. Essentially the quadratic interpolation of $\lg(mantissa)$ on the interval $[0.5, 1)$ has the advantage that the NQT function is C1 everywhere, which improves convergence of linear interpolation. The function itself is still a 100% accurate version of itself, but improving the continuity class of the method improves convergence of stencil ops.

Original file line number Diff line number Diff line change
Expand Up @@ -667,34 +667,42 @@ of the mantissa plus the exponent:
\lg(x) = \lg(m) + e

Therefore, if we can find a fast, invertible approximation to
:math:`\lg(m)`, we will have achieved our goal. It turns out the
expression
:math:`\lg(m)`, we will have achieved our goal. The linear
interpolation of :math:`\lg(m)` on the given interval is

.. math::

2 (x - 1)

works pretty well, so we use that. (To convince yourself of this note
that for :math:`x=1/2` this expression returns -1 and for :math:`x=1`,
it returns 0, which are the correct values of :math:`\lg(x)` at the
bounds of the interval.) Thus our approximate, invertible expression
for :math:`\lg` is just
and the quadratic is

.. math::

2 (m - 1) + e

for the mantissa and exponent extracted via ``frexp``. This differs
from :math:`lg` by a maximum of about 0.1, which translates to at most
a 25 percent difference. As discussed above, however, the function
itself is an exact representation of itself and the difference from
:math:`lg` is acceptable.

To invert, we use the built in function that inverts ``frexp``,
``ldexp``, which combines the mantissa and exponent into the original
floating point representation.

This approach is described in more detail in our `short note`_ on the topic.
-\frac{4}{3} (m -2) (m - 1)

where the former produces a function that is piecewise :math:`C^1` and
everywhere continuous. The latter produces a function that is
everywhere :math:`C^1` and piecewise :math:`C^2`. Both functions are
exactly exactly invertible. To invert, we use the built in function
that inverts ``frexp``, ``ldexp``, which combines the mantissa and
exponent into the original floating point representation.

While these functions are not exactly logarithms, they do work for
building logarithmic grids. The smoothness of the transformation
mapping from linear to "not-quite-log" space does matter for
interpolation, however. Linear interpolation in "not-quite-log" space
converges at second order only in the :math:`L^1` norm for the linear
version of the approximate log. The quadratic version of the fast log
provides second-order convergence in all norms, however.

Finally, while ``frexp`` and ``ldexp`` are portable and performant,
they are less performant than hand-implemented, low-level methods that
leverage the bitwise structure of floating point numbers. These
"bithacked" or "integer aliased" implementations are what are used in
practice in the code.

This approach is described in more detail in our `short note`_ on the
topic.

Yurlungur marked this conversation as resolved.
Show resolved Hide resolved
.. _Short note: https://arxiv.org/abs/2206.08957

Expand Down
14 changes: 9 additions & 5 deletions doc/sphinx/src/models.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1736,12 +1736,17 @@ return a ``Real`` number.
.. warning::
As with the SpinerEOS models, the stellar collapse models use fast
logs. You can switch the logs to true logs with the
``SINGULARITY_USE_TRUE_LOG_GRIDDING`` cmake option.
``SINGULARITY_USE_TRUE_LOG_GRIDDING`` cmake option. This may be
desirable on ARM-based architectures (e.g., ``aarch64``), where
a hardware log intrinsic is available.


.. note::
A more performant implementation of fast logs is available, but it
might not be portable. Enable it with the
``SINGULARITY_USE_HIGH_RISK_MATH`` cmake option.
The default implementation of our fast logs assumes little endian
numbers. If you are on a big-endian machine, they will not work
properly. If you encounter a big-endian machine, please report it
to us in the issues and (for now) enable the portable
implementation of fast logs with ``-DSINGULARITY_NQT_PORTABLE=ON``.

.. _Stellar Collapse: https://stellarcollapse.org/equationofstate.html

Expand All @@ -1750,7 +1755,6 @@ return a ``Real`` number.
.. _median filter: https://en.wikipedia.org/wiki/Median_filter



Helmholtz EOS
``````````````

Expand Down
6 changes: 6 additions & 0 deletions example/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ add_executable(get_sound_speed_press
target_link_libraries(get_sound_speed_press PRIVATE
singularity-eos::singularity-eos)

if (SINGULARITY_USE_SPINER_WITH_HDF5)
add_executable(eos_grid eos_grid.cpp)
target_link_libraries(eos_grid PRIVATE
singularity-eos::singularity-eos)
endif()

if(SINGULARITY_USE_EOSPAC AND SINGULARITY_USE_SPINER_WITH_HDF5)
add_executable(get_sesame_state
get_sesame_state.cpp)
Expand Down
165 changes: 165 additions & 0 deletions example/eos_grid.cpp
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New example showing how to profile. Can be set to profile any EOS, even analytic.

Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
//------------------------------------------------------------------------------
// © 2024. Triad National Security, LLC. All rights reserved. This
// program was produced under U.S. Government contract 89233218CNA000001
// for Los Alamos National Laboratory (LANL), which is operated by Triad
// National Security, LLC for the U.S. Department of Energy/National
// Nuclear Security Administration. All rights in the program are
// reserved by Triad National Security, LLC, and the U.S. Department of
// Energy/National Nuclear Security Administration. The Government is
// granted for itself and others acting on its behalf a nonexclusive,
// paid-up, irrevocable worldwide license in this material to reproduce,
// prepare derivative works, distribute copies to the public, perform
// publicly and display publicly, and to permit others to do so.
//------------------------------------------------------------------------------

// C headers
#include <cmath>

// C++ headers
#include <chrono>
#include <iostream>
#include <string>

// HDF5 we'll need this for I/O
#include <hdf5.h>
#include <hdf5_hl.h>

// This library contains portable utilities
#include <ports-of-call/portability.hpp>

// This contains useful tools for preventing things like divide by zero
#include <singularity-eos/base/robust_utils.hpp>
// Needed to import the eos models
#include <singularity-eos/eos/eos.hpp>

// This library contains the spiner table object, which we will use to
// store our output
#include <spiner/databox.hpp>
#include <spiner/interpolation.hpp>

// for timing
using duration = std::chrono::nanoseconds;

// These are the specializations of spiner we will use
using DataBox = Spiner::DataBox<Real>;
using RegularGrid1D = Spiner::RegularGrid1D<Real>;

// Set the EOS you want to use here.
using EOS = singularity::StellarCollapse;

int main(int argc, char *argv[]) {
// This is needed for Kokkos
#ifdef PORTABILITY_STRATEGY_KOKKOS
Kokkos::initialize();
#endif
// note the scoping here. This means Kokkos objects are cleaned up
// before finalization
{
if (argc < 8) {
std::cerr << "Usage: " << argv[0]
<< " savefilename log10rhomin log10rhomax numrho log10Tmin log10Tmax "
"numT eosargs..."
<< std::endl;
std::exit(1);
}
const std::string savefilename = argv[1];
const double lrho_min = atof(argv[2]);
const double lrho_max = atof(argv[3]);
const int nrho = atoi(argv[4]);
RegularGrid1D lrhoGrid(lrho_min, lrho_max, nrho);

const double lT_min = atof(argv[5]);
const double lT_max = atof(argv[6]);
const int nT = atoi(argv[7]);
RegularGrid1D lTGrid(lT_min, lT_max, nT);

// This is the databox we will evaluate onto
DataBox press_d(Spiner::AllocationTarget::Device, nrho, nT);
press_d.setRange(0, lTGrid); // note 0 is rightmost index
press_d.setRange(1, lrhoGrid);
// This is the host mirror, we will copy into it
DataBox press_h(Spiner::AllocationTarget::Host, nrho, nT);
press_h.setRange(0, lTGrid);
press_h.setRange(1, lrhoGrid);

// These are the pre-evaluated density and temperature points
std::cout << "Filling grid points..." << std::endl;
DataBox rhos(Spiner::AllocationTarget::Device, nrho);
portableFor(
"Set rho", 0, nrho,
PORTABLE_LAMBDA(const int i) { rhos(i) = std::pow(10., lrhoGrid.x(i)); });
DataBox Ts(Spiner::AllocationTarget::Device, nT);
portableFor(
"Set T", 0, nT,
PORTABLE_LAMBDA(const int i) { Ts(i) = std::pow(10., lTGrid.x(i)); });

// if you have arguments you want to pass into your EOS load them here
const std::string loadname = argv[8];
const double Ye = atof(argv[9]);

std::cout << "Initializing EOS..." << std::endl;
EOS eos_h(loadname);
EOS eos_d = eos_h.GetOnDevice();

std::cout << "Evaluating..." << std::endl;
// start timers
#ifdef PORTABILITY_STRATEGY_KOKKOS
Kokkos::fence();
#endif
auto start = std::chrono::high_resolution_clock::now();
portableFor(
"P(rho, T)", 0, nrho, 0, nT, PORTABLE_LAMBDA(const int j, const int i) {
// Some EOS objects take lambdas. If you want to set
// them locally, do it like this. Otherwise, you may
// want to create a device-side array for them to
// avoid race conditions.
Real lambda[2];
lambda[0] = Ye;
press_d(j, i) = eos_d.PressureFromDensityTemperature(rhos(j), Ts(i), lambda);
});
// stop timers
#ifdef PORTABILITY_STRATEGY_KOKKOS
Kokkos::fence();
#endif
auto stop = std::chrono::high_resolution_clock::now();
auto duration_device = std::chrono::duration_cast<duration>(stop - start);

// copy data from device to host
std::cout << "Copying to host..." << std::endl;
portableCopyToHost(press_h.data(), press_d.data(), press_h.sizeBytes());

// For fun lets also evalaute all on host
std::cout << "Generate arrays on host..." << std::endl;
DataBox rhos_h(Spiner::AllocationTarget::Host, nrho);
for (int i = 0; i < nrho; ++i) {
rhos_h(i) = std::pow(10., lrhoGrid.x(i));
}
DataBox Ts_h(Spiner::AllocationTarget::Host, nT);
for (int i = 0; i < nT; ++i) {
Ts_h(i) = std::pow(10., lTGrid.x(i));
}
std::cout << "Looping on host" << std::endl;
start = std::chrono::high_resolution_clock::now();
for (int j = 0; j < nT; ++j) {
for (int i = 0; i < nrho; ++i) {
Real lambda[2];
lambda[0] = Ye;
press_h(j, i) = eos_h.PressureFromDensityTemperature(rhos_h(j), Ts_h(i), lambda);
}
}
stop = std::chrono::high_resolution_clock::now();
auto duration_host = std::chrono::duration_cast<duration>(stop - start);

std::cout << "Saving file..." << std::endl;
press_h.saveHDF(savefilename); // if you have HDF5 enabled, thats all it takes
std::cout << "Time per point, (ns):\n"
<< "host = " << duration_host.count() / (static_cast<Real>(nrho) * nT)
<< "\n"
<< "device = " << duration_device.count() / (static_cast<Real>(nrho) * nT)
<< std::endl;
}
#ifdef PORTABILITY_STRATEGY_KOKKOS
Kokkos::finalize();
#endif
return 0;
}
2 changes: 1 addition & 1 deletion example/get_sound_speed_press.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ inline void PressureSoundSpeedFromDensityEnergyDensity(double *rho, // inputs

// Loop through cells and use the FillEos function call
for (int i = 0; i < Ncells; ++i) {
double eps, temp, cv;
double temp, cv;
// FillEos is very general and is capable of modifying any of the inputs,
// so const vars cannot be passed into it. However, it is often more performant
// than making individual function calls.
Expand Down
Loading
Loading