Skip to content

Commit

Permalink
Merge branch 'master' into cpython
Browse files Browse the repository at this point in the history
  • Loading branch information
DiamonDinoia authored Jun 24, 2024
2 parents 0186612 + 4b5d452 commit 4867f4f
Show file tree
Hide file tree
Showing 20 changed files with 1,765 additions and 782 deletions.
1 change: 0 additions & 1 deletion .github/workflows/C++.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ jobs:

steps:
- uses: actions/checkout@v4

- name: 'Setup MSYS2'
uses: msys2/setup-msys2@v2
with:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/python_wheel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ jobs:
- uses: actions/checkout@v4

- name: Install GCC and make
run: C:\msys64\usr\bin\bash.exe -lc "pacman -Sy --noconfirm make mingw-w64-x86_64-toolchain mingw-w64-x86_64-fftw"
run: C:\msys64\usr\bin\bash.exe -lc "pacman -Sy --noconfirm make mingw-w64-x86_64-toolchain mingw-w64-x86_64-fftw git"

- name: Build and Test Python 3.8
uses: actions/setup-python@v5
Expand Down
7 changes: 7 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
List of features / changes made / release notes, in reverse chronological order.
If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).

V 2.3.0beta (6/21/24)

* Major acceleration of spread/interp kernels using XSIMD header-only lib,
kernel evaluation, templating by ns with AVX-width-dependent decisions.
Up to 80% faster, depending on compiler. (Marco Barbone with help from Libin Lu).
NOTE: introduces new dependency (XSIMD), added to CMake and makefile.
* new perftest/compare_spreads.jl compares two spreadinterp libs (A Barnett).
* new benchmarker perftest/spreadtestndall sweeps all kernel widths (M Barbone).
* cufinufft now supports modeord (types 1 and 2 only): 0 = CMCL-style increasing
mode order, 1 = FFT-style mode order.
Expand Down
54 changes: 31 additions & 23 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@ project(FINUFFT VERSION 2.2.0 LANGUAGES C CXX)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

set(GNU_LIKE_FRONTENDS AppleClang Clang GNU)
if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS)
# Set custom compiler flags for gcc-compatible compilers
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG -funroll-loops")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG -funroll-loops")
set(FINUFFT_CXX_FLAGS_RELEASE -O3 -funroll-loops -ffp-contract=fast)
set(FINUFFT_CXX_FLAGS_RELWITHDEBINFO -g ${FINUFFT_CXX_FLAGS_RELEASE})
endif ()

include(CTest)
Expand All @@ -19,13 +21,16 @@ if(NOT CMAKE_BUILD_TYPE)
endif()

if (NOT DEFINED FINUFFT_ARCH_FLAGS)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc|ppc64|powerpc|powerpc64" OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc|ppc64"))
# PowerPC arch does not have -march flag.
set(FINUFFT_ARCH_FLAGS "-mtune=native" CACHE STRING "Compiler flags for specifying target architecture.")
else ()
set(FINUFFT_ARCH_FLAGS "-march=native" CACHE STRING "Compiler flags for specifying target architecture.")
endif ()
endif()
if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS)
if (CMAKE_SYSTEM_PROCESSOR MATCHES "ppc|ppc64|powerpc|powerpc64" OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc|ppc64"))
# PowerPC arch does not have -march flag.
set(FINUFFT_ARCH_FLAGS "-mtune=native" CACHE STRING "Compiler flags for specifying target architecture.")
else ()
set(FINUFFT_ARCH_FLAGS "-march=native" CACHE STRING "Compiler flags for specifying target architecture.")
endif ()
endif()
endif ()

set(FINUFFT_FFTW_SUFFIX "OpenMP" CACHE STRING "Suffix for FFTW libraries (e.g. OpenMP, Threads etc.)")
set(FINUFFT_FFTW_LIBRARIES "DEFAULT" CACHE STRING "Specify a custom FFTW library")

Expand All @@ -41,7 +46,7 @@ option(FINUFFT_USE_OPENMP "Whether to use OpenMP for parallelization. If disable
option(FINUFFT_USE_CUDA "Whether to build CUDA accelerated FINUFFT library (libcufinufft). This is completely independent of the main FINUFFT library" OFF)
option(FINUFFT_USE_CPU "Whether to build the ordinary FINUFFT library (libfinufft)." ON)
option(FINUFFT_STATIC_LINKING "Whether to link the static FINUFFT library (libfinufft_static)." ON)
option(FINUFFT_BUILD_DEVEL "Whether to build developement executables" OFF)
option(FINUFFT_BUILD_DEVEL "Whether to build development executables" OFF)
option(FINUFFT_ENABLE_INSTALL "Whether to enable installation of FINUFFT library" ON)
# sphinx tag (don't remove): @cmake_opts_end

Expand All @@ -58,10 +63,11 @@ if (FINUFFT_USE_CPU)

set(CPM_DOWNLOAD_VERSION 0.38.0)
set(FFTW_VERSION 3.3.10)

set(XTL_VERSION 0.7.7)
set(XSIMD_VERSION 13.0.0)
include(cmake/setupCPM.cmake)
include(cmake/setupFFTW.cmake)

include(cmake/setupXSIMD.cmake)
endif ()

if (FINUFFT_BUILD_MATLAB)
Expand Down Expand Up @@ -103,8 +109,8 @@ function(enable_asan target)

if (CMAKE_CXX_COMPILER_ID IN_LIST FINUFFT_GNU_LIKE_COMPILERS)
# Enable only on clang / gcc compilers.
target_compile_options(${target} PRIVATE $<$<CONFIG:DEBUG>:-fsanitize=address>)
target_link_options(${target} PRIVATE $<$<CONFIG:DEBUG>:-fsanitize=address>)
target_compile_options(${target} PRIVATE $<$<CONFIG:DEBUG>:-fsanitize=address -fsanitize=undefined -fsanitize=bounds-strict>)
target_link_options(${target} PRIVATE $<$<CONFIG:DEBUG>:-fsanitize=address -fsanitize=undefined -fsanitize=bounds-strict>)
endif ()
endfunction()

Expand All @@ -130,17 +136,18 @@ endfunction()
# Utility function to set finufft compilation options.
function(set_finufft_options target)
set_property(TARGET ${target} PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET ${target} PROPERTY CMAKE_CXX_STANDARD 14)
target_compile_features(${target} PRIVATE cxx_std_17)

enable_asan(${target})

target_compile_options(${target} PRIVATE SHELL:$<$<CONFIG:Release,RelWithDebInfo>:${FINUFFT_ARCH_FLAGS}>)
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
# Enable cx-limited-range on supported compilers
target_compile_options(${target} PRIVATE $<$<CONFIG:Release,RelWithDebInfo>:-fcx-limited-range>)
endif ()

target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
target_include_directories(${target} SYSTEM INTERFACE $<INSTALL_INTERFACE:${CMAKE_INSTALL_PREFIX}/include>)
target_compile_options(${target} PRIVATE $<$<CONFIG:Release>:${FINUFFT_CXX_FLAGS_RELEASE}>)
target_compile_options(${target} PRIVATE $<$<CONFIG:RelWithDebInfo>:${FINUFFT_CXX_FLAGS_RELWITHDEBINFO}>)
target_include_directories(${target} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")
if (FINUFFT_USE_OPENMP)
target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX)
# there are issues on windows with OpenMP and CMake, so we need to manually add the flags
Expand All @@ -166,6 +173,8 @@ function(set_finufft_options target)
target_include_directories(${target} PUBLIC ${FFTW_INCLUDE_DIR})
endif ()

# XSIMD is a header only library, so we just need to include the headers
target_include_directories(${target} PUBLIC ${XSIMD_INCLUDE_DIR})
endfunction()

if (FINUFFT_USE_CPU)
Expand All @@ -174,12 +183,13 @@ if (FINUFFT_USE_CPU)
target_compile_definitions(finufft_f32 PRIVATE SINGLE)
set_finufft_options(finufft_f32)
target_link_libraries(finufft_f32 PUBLIC ${FINUFFT_FFTW_LIBRARIES})
target_link_libraries(finufft_f32 PRIVATE xsimd)

add_library(finufft_f64 OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES})
target_compile_definitions(finufft_f64 PRIVATE)
set_finufft_options(finufft_f64)
target_link_libraries(finufft_f64 PUBLIC ${FINUFFT_FFTW_LIBRARIES})

target_link_libraries(finufft_f64 PRIVATE xsimd)
if (WIN32)
add_library(finufft_f32_dll OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES})
target_compile_definitions(finufft_f32_dll PRIVATE SINGLE dll_EXPORTS FINUFFT_DLL)
Expand All @@ -204,8 +214,7 @@ if (FINUFFT_USE_CPU)
if (NOT WIN32)
target_link_libraries(finufft PUBLIC m)
endif ()
target_include_directories(finufft PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
target_include_directories(finufft SYSTEM INTERFACE $<INSTALL_INTERFACE:${CMAKE_INSTALL_PREFIX}/include>)
target_include_directories(finufft PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")

add_library(finufft_static STATIC src/utils_precindep.cpp contrib/legendre_rule_fast.cpp)
set_finufft_options(finufft_static)
Expand All @@ -214,8 +223,7 @@ if (FINUFFT_USE_CPU)
if (NOT WIN32)
target_link_libraries(finufft_static PUBLIC m)
endif ()
target_include_directories(finufft_static PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
target_include_directories(finufft_static SYSTEM INTERFACE $<INSTALL_INTERFACE:${CMAKE_INSTALL_PREFIX}/include>)
target_include_directories(finufft_static PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")

if (FINUFFT_ENABLE_INSTALL)
file(GLOB FINUFFT_PUBLIC_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/include/finufft*.h")
Expand Down
22 changes: 22 additions & 0 deletions cmake/setupXSIMD.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Fetch the xtl and xsimd sources through CPM and publish XSIMD_INCLUDE_DIR.
#
# Inputs (must be set by the including scope before this file is included):
#   XTL_VERSION    - git tag of xtl to check out
#   XSIMD_VERSION  - git tag of xsimd to check out
# Output:
#   XSIMD_INCLUDE_DIR - "<xsimd source dir>/include"

# xtl is requested before xsimd; presumably it is needed because xsimd is
# configured with XSIMD_ENABLE_XTL_COMPLEX below -- TODO confirm.
CPMAddPackage(
    NAME             xtl
    GIT_REPOSITORY   "https://github.com/xtensor-stack/xtl.git"
    GIT_TAG          ${XTL_VERSION}
    EXCLUDE_FROM_ALL YES
    GIT_SHALLOW      YES
    OPTIONS
        "XTL_DISABLE_EXCEPTIONS YES"
)

# xsimd itself, with its install step skipped and xtl complex support enabled.
CPMAddPackage(
    NAME             xsimd
    GIT_REPOSITORY   "https://github.com/xtensor-stack/xsimd.git"
    GIT_TAG          ${XSIMD_VERSION}
    EXCLUDE_FROM_ALL YES
    GIT_SHALLOW      YES
    OPTIONS
        "XSIMD_SKIP_INSTALL YES"
        "XSIMD_ENABLE_XTL_COMPLEX YES"
)

# Read the directory in which the xsimd target was defined and derive the
# header location from it.
get_property(
    XSIMD_SOURCE_DIR
    TARGET xsimd
    PROPERTY SOURCE_DIR
)
set(XSIMD_INCLUDE_DIR "${XSIMD_SOURCE_DIR}/include")
6 changes: 4 additions & 2 deletions devel/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,7 @@ if (benchmark_ADDED)
endif()

add_executable(foldrescale foldrescale.cpp)
target_link_libraries(foldrescale finufft benchmark)
target_compile_options(foldrescale PRIVATE -mavx2)
target_link_libraries(foldrescale finufft benchmark xsimd)
add_executable(padding padding.cpp)
target_link_libraries(padding finufft xsimd)
target_compile_options(padding PRIVATE -march=native)
60 changes: 0 additions & 60 deletions devel/foldrescale.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,18 +73,6 @@ inline __attribute__((always_inline)) FLT foldRescale03(FLT x, BIGINT N) {
return result * fN;
}

#ifdef __AVX2__

inline __attribute__((always_inline)) __m256d foldRescaleVec(__m256d x, BIGINT N) {
__m256d result;
__m256d fN = _mm256_set1_pd(FLT(N));
static const __m256d x2pi = _mm256_set1_pd(FLT(M_1_2PI));
static const __m256d half = _mm256_set1_pd(FLT(0.5));
result = _mm256_fmadd_pd(x, x2pi, half);
result = _mm256_sub_pd(result, _mm256_floor_pd(result));
return _mm256_mul_pd(result, fN);
}
#endif

static std::mt19937_64 gen;
static std::uniform_real_distribution<> dis(-10, 10);
Expand Down Expand Up @@ -197,21 +185,6 @@ static void BM_FoldRescale05N(benchmark::State &state) {
}
}

#ifdef __AVX2__
static void BM_FoldRescaleVec(benchmark::State &state) {
for (auto _ : state) {
// Generate 4 floating point numbers
double x1 = dis(gen);
double x2 = dis(gen);
double x3 = dis(gen);
double x4 = dis(gen);
// Pack them into an AVX vector
__m256d x = _mm256_set_pd(x1, x2, x3, x4);
// Call the foldRescaleVec function
benchmark::DoNotOptimize(foldRescaleVec(x, N));
}
}
#endif

BENCHMARK(BM_BASELINE)->Iterations(10000000);
BENCHMARK(BM_FoldRescaleMacro)->Iterations(1000000);
Expand All @@ -221,9 +194,6 @@ BENCHMARK(BM_FoldRescale02)->Iterations(1000000);
BENCHMARK(BM_FoldRescale03)->Iterations(10000000);
BENCHMARK(BM_FoldRescale04)->Iterations(1000000);
BENCHMARK(BM_FoldRescale05)->Iterations(1000000);
#ifdef __AVX2__
BENCHMARK(BM_FoldRescaleVec)->Iterations(1000000 / 4);
#endif
BENCHMARK(BM_FoldRescaleMacroN)->Iterations(1000000);
BENCHMARK(BM_FoldRescale00N)->Iterations(1000000);
BENCHMARK(BM_FoldRescale01N)->Iterations(1000000);
Expand All @@ -232,33 +202,6 @@ BENCHMARK(BM_FoldRescale03N)->Iterations(1000000);
BENCHMARK(BM_FoldRescale04N)->Iterations(1000000);
BENCHMARK(BM_FoldRescale05N)->Iterations(1000000);

#ifdef __AVX2__
void testFoldRescaleVec_avx256_vs_foldRescale00() {
// Generate 4 floating point numbers
double x1 = dis(gen);
double x2 = dis(gen);
double x3 = dis(gen);
double x4 = dis(gen);

// Pack them into an AVX vector
__m256d xVec = _mm256_set_pd(x1, x2, x3, x4);

// Call the foldRescaleVec function
__m256d resultVec = foldRescaleVec(xVec, N);

// Extract the results from the AVX vector

for (int i = 0; i < 4; ++i) {
double result00 = foldRescale03<true>(xVec[i], N);
if (std::abs(1 - result00 / resultVec[i]) > 1e-14) {
std::cout << "input: " << xVec[i] << " result00: " << result00
<< " result256: " << resultVec[i] << std::endl;
throw std::runtime_error("foldRescaleVec is not equivalent to foldRescale00");
}
}
}
#endif

void testFoldRescaleFunctions() {
for (bool p : {true}) {
for (int i = 0; i < 1024; ++i) { // Run the test 1000 times
Expand Down Expand Up @@ -341,9 +284,6 @@ int main(int argc, char **argv) {
std::cout << "Seed: " << seed << "\n";
gen.seed(seed);
testFoldRescaleFunctions();
#ifdef __AVX2__
testFoldRescaleVec_avx256_vs_foldRescale00();
#endif
::benchmark::Initialize(&argc, argv);
BaselineSubtractingReporter reporter;
::benchmark::RunSpecifiedBenchmarks(&reporter);
Expand Down
Loading

0 comments on commit 4867f4f

Please sign in to comment.