Skip to content

Commit

Permalink
cleanup special ops
Browse files Browse the repository at this point in the history
add SYCL as a CMake compiler
  • Loading branch information
jcosborn committed Nov 2, 2023
1 parent 86fc1ce commit b680199
Show file tree
Hide file tree
Showing 13 changed files with 221 additions and 468 deletions.
36 changes: 36 additions & 0 deletions cmake/CMakeDetermineSYCLCompiler.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# CMakeDetermineSYCLCompiler.cmake
# Determine the compiler to use for the SYCL "language" and run CMake's
# compiler-identification machinery for it.  The result is persisted via
# CMakeSYCLCompiler.cmake.in so later CMake runs can skip detection.

# Default the SYCL compiler to the already-determined C++ compiler when the
# user did not provide one explicitly.
if(NOT CMAKE_SYCL_COMPILER)
  set(CMAKE_SYCL_COMPILER "${CMAKE_CXX_COMPILER}")
endif()
mark_as_advanced(CMAKE_SYCL_COMPILER)
message(STATUS "The SYCL compiler is ${CMAKE_SYCL_COMPILER}")

if(NOT CMAKE_SYCL_COMPILER_ID_RUN)
  set(CMAKE_SYCL_COMPILER_ID_RUN 1)

  # Try to identify the compiler.
  set(CMAKE_SYCL_COMPILER_ID)
  set(CMAKE_SYCL_PLATFORM_ID)
  file(READ "${CMAKE_ROOT}/Modules/CMakePlatformId.h.in" CMAKE_SYCL_COMPILER_ID_PLATFORM_CONTENT)

  set(CMAKE_SYCL_COMPILER_ID_TEST_FLAGS_FIRST)
  set(CMAKE_SYCL_COMPILER_ID_TEST_FLAGS)

  # Restrict identification to the compilers expected to provide SYCL
  # support (Intel's oneAPI compiler identifies as both __INTEL_LLVM_COMPILER
  # and __clang__, so it must be checked first).
  set(CMAKE_CXX_COMPILER_ID_CONTENT "#if defined(__INTEL_LLVM_COMPILER)\n# define COMPILER_ID \"IntelLLVM\"\n")
  string(APPEND CMAKE_CXX_COMPILER_ID_CONTENT "#elif defined(__clang__)\n# define COMPILER_ID \"Clang\"\n")
  string(APPEND CMAKE_CXX_COMPILER_ID_CONTENT "#endif\n")
  include(${CMAKE_ROOT}/Modules/CMakeDetermineCompilerId.cmake)
  CMAKE_DETERMINE_COMPILER_ID(SYCL SYCLFLAGS CMakeCXXCompilerId.cpp)

  _cmake_find_compiler_sysroot(SYCL)
endif()

# Persist the detected compiler so later configure runs can reload it.
# @ONLY keeps any literal ${...} in the template from being expanded.
configure_file("${CMAKE_CURRENT_LIST_DIR}/CMakeSYCLCompiler.cmake.in"
  "${CMAKE_PLATFORM_INFO_DIR}/CMakeSYCLCompiler.cmake" @ONLY)

set(CMAKE_SYCL_COMPILER_ENV_VAR "SYCL")
3 changes: 3 additions & 0 deletions cmake/CMakeSYCLCompiler.cmake.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Template configured at detection time; the generated file is loaded on
# subsequent CMake runs to restore the SYCL compiler without re-detection.
set(CMAKE_SYCL_COMPILER "@CMAKE_SYCL_COMPILER@")
set(CMAKE_SYCL_COMPILER_LOADED 1)
set(CMAKE_SYCL_COMPILER_ENV_VAR "SYCL")
31 changes: 31 additions & 0 deletions cmake/CMakeSYCLInformation.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# CMakeSYCLInformation.cmake
# Rule and option settings for the SYCL "language".  Any setting not already
# provided by the user or platform files falls back to its C++ counterpart.

# These settings mirror a C++ variable whose name differs only by SYCL->CXX.
foreach(_sycl_var
    CMAKE_SYCL_COMPILE_OPTIONS_PIC
    CMAKE_SYCL_COMPILE_OPTIONS_PIE
    CMAKE_SYCL_LINK_OPTIONS_PIE
    CMAKE_SYCL_LINK_OPTIONS_NO_PIE
    CMAKE_SYCL_OUTPUT_EXTENSION
    CMAKE_INCLUDE_FLAG_SYCL)
  if(NOT ${_sycl_var})
    string(REPLACE "SYCL" "CXX" _cxx_var "${_sycl_var}")
    set(${_sycl_var} ${${_cxx_var}})
  endif()
endforeach()
unset(_cxx_var)
unset(_sycl_var)

# Default rule for compiling one source file to an object file.
if(NOT CMAKE_SYCL_COMPILE_OBJECT)
  set(CMAKE_SYCL_COMPILE_OBJECT "<CMAKE_SYCL_COMPILER> <DEFINES> <INCLUDES> <FLAGS> -o <OBJECT> -c <SOURCE>")
endif()

# Default rule for linking an executable.
if(NOT CMAKE_SYCL_LINK_EXECUTABLE)
  set(CMAKE_SYCL_LINK_EXECUTABLE "<CMAKE_SYCL_COMPILER> <FLAGS> <CMAKE_SYCL_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
endif()

set(CMAKE_SYCL_INFORMATION_LOADED 1)
1 change: 1 addition & 0 deletions cmake/CMakeTestSYCLCompiler.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# NOTE(review): marks the SYCL compiler as working without running a
# try-compile check — presumably because it defaults to the already-tested
# C++ compiler; confirm this holds when a different compiler is supplied.
set(CMAKE_SYCL_COMPILER_WORKS 1 CACHE INTERNAL "")
37 changes: 19 additions & 18 deletions include/targets/generic/block_reduce_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,19 +54,6 @@ namespace quda
}
};

/**
@brief Dummy generic implementation of block_reduce
*/
//template <bool is_device> struct block_reduce {
//template <typename T, typename reducer_t, typename param_t, typename... BR>
//T operator()(const T &value, bool, int, bool, reducer_t, param_t, BR&...)
//{
//return value;
//}
//};
//template <typename T, typename P, typename O>
//struct block_reduce {

/**
@brief WarpReduce provides a generic interface for performing
perform reductions at the warp or sub-warp level
Expand Down Expand Up @@ -141,6 +128,21 @@ namespace quda
}
};

/**
@brief Dummy generic implementation of block_reduce
*/
//template <bool is_device> struct block_reduce {
//template <typename T, typename reducer_t, typename param_t, typename... BR>
//T operator()(const T &value, bool, int, bool, reducer_t, param_t, BR&...)
//{
//return value;
//}
//};
//template <typename T, typename P, typename O>
//struct block_reduce {
//template <typename T, int block_dim, int batch_size, bool = true>
//struct block_reduce {}

/**
@brief BlockReduce provides a generic interface for performing
reductions at the block level
Expand All @@ -149,19 +151,18 @@ namespace quda
@tparam batch_size Batch size of the reduction. Threads will be
ordered such that batch size is the slowest running index.
*/
template <typename T, int block_dim, int batch_size = 1> class BlockReduce :
public block_reduce<T, block_dim, batch_size>
template <typename T, int block_dim, int batch_size = 1>
class BlockReduce : public block_reduce<T, block_dim, batch_size>
{
static_assert(batch_size == 1 || block_dim <= 2, "Batching not possible with 3-d block reduction");
using BlockReduce_t = BlockReduce<T, block_dim, batch_size>;
using block_reduce_t = block_reduce<T, block_dim, batch_size>;
const int batch;

public:
//using block_reduce_t::shared_mem_size;

template <typename ...U>
HOSTDEVICE constexpr BlockReduce(SpecialOps<U...> &ops, int batch = 0) : block_reduce_t(ops), batch(batch) {
static_assert(hasSpecialOpType<BlockReduce_t, SpecialOps<U...>>);
checkSpecialOp<BlockReduce_t, U...>();
}

constexpr BlockReduce(const BlockReduce<T,block_dim,batch_size> &) = delete;
Expand Down
57 changes: 19 additions & 38 deletions include/targets/generic/special_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,18 +46,12 @@ namespace quda {
using SpecialOpsT = SpecialOps<T...>;
using KernelOpsT = SpecialOps<T...>;
};
//template <typename ...T> struct SpecialOps : SpecialOpsTarget<T...> {
// using SpecialOpsT = SpecialOps<T...>;
//};

// getSpecialOps
template <typename T, typename U = void> struct getSpecialOpsS { using type = NoSpecialOps; };
template <typename T> struct getSpecialOpsS<T,std::conditional_t<true,void,typename T::SpecialOpsT>> {
using type = typename T::SpecialOpsT;
};
//template <typename ...T> struct getSpecialOpsS<SpecialOps<T...>,void> {
// using type = SpecialOps<T...>;
//};
template <typename T> using getSpecialOps = typename getSpecialOpsS<T>::type;

// hasSpecialOp: checks if first type matches any of the op
Expand All @@ -74,38 +68,13 @@ namespace quda {
static_assert((hasSpecialOp<T,typename Ops::SpecialOpsT> || ...));
}


// OLD

template <typename ...T> struct op_Concurrent {}; // set of op types used concurrently (needs separate resources)
template <typename ...T> struct op_Sequential {}; // set of op types used sequentially (can share resources)
struct op_Base {}; // base type for other op types
template <typename T, int N = 0> struct op_BaseT : op_Base {
//using op_ElementT = T;
using ElemT = T;
static constexpr int n = N;
};



// forward declarations of op types
struct op_blockSync;
template <typename T> struct op_warp_combine;
//template <typename T, int N> struct op_thread_array;
//template <typename T> struct op_BlockReduce;
//template <typename T, typename D = opDimsBlock> struct op_SharedMemoryCache;
//template <typename T, typename S = opSizeBlock> struct op_SharedMemory;
//template <typename T, int S> using op_SharedMemStatic = op_SharedMemory<T,opSizeStatic<S>>;

// only types for convenience
using only_blockSync = SpecialOps<op_blockSync>;
template <typename T> using only_warp_combine = SpecialOps<op_warp_combine<T>>;
//template <typename T, int N> using only_thread_array = SpecialOps<op_thread_array<T,N>>;
//template <typename T> using only_BlockReduce = SpecialOps<op_BlockReduce<T>>;
//template <typename T, typename D = opDimsBlock> using only_SharedMemoryCache = SpecialOps<op_SharedMemoryCache<T,D>>;
//template <typename T, typename S = opSizeBlock> using only_SharedMemory = SpecialOps<op_SharedMemory<T,S>>;
//template <typename T, unsigned int S> using only_SharedMemStatic = only_SharedMemory<T,opSizeStatic<S>>;
//template <typename ...T> using only_Concurrent = SpecialOps<op_Concurrent<T...>>;

// explicitSpecialOps
template <typename T, typename U = void> struct explicitSpecialOpsS : std::false_type {};
Expand All @@ -122,6 +91,11 @@ namespace quda {
template <typename T> inline constexpr bool hasSpecialOps = hasSpecialOpsImpl<T>::value;
#endif

// checkSpecialOp
template <typename T, typename... U> static constexpr void checkSpecialOp() {
static_assert((std::is_same_v<T,U> || ...) == true);
}

// combineOps
template <typename ...T> struct combineOpsS {};
template <typename ...T> struct combineOpsS<NoSpecialOps,SpecialOps<T...>> {
Expand All @@ -133,6 +107,17 @@ namespace quda {
template <typename T, typename U> using combineOps = typename combineOpsS<T, U>::type;


// OLD

#if 0
template <typename ...T> struct op_Concurrent {}; // set of op types used concurrently (needs separate resources)
template <typename ...T> struct op_Sequential {}; // set of op types used sequentially (can share resources)
struct op_Base {}; // base type for other op types
template <typename T, int N = 0> struct op_BaseT : op_Base {
//using op_ElementT = T;
using ElemT = T;
static constexpr int n = N;
};

// unwrapSpecialOps
template <typename T> struct unwrapSpecialOpsS { using type = T; };
Expand All @@ -148,12 +133,6 @@ namespace quda {
template <typename T, typename U, typename ...V> static constexpr bool hasSpecialOpType<T,U,V...> =
hasSpecialOpType2<unwrapSpecialOps<T>,unwrapSpecialOps<U>> || hasSpecialOpType<T,V...>;

// checkSpecialOp
template <typename T, typename... U> static constexpr void checkSpecialOp() {
static_assert((std::is_same_v<T,U> || ...) == true);
}

#if 0
// hasBlockSync
template <typename ...T> static constexpr bool hasBlockSync = hasSpecialOpType<op_blockSync,T...>;
template <typename ...T> static constexpr bool hasBlockSync<op_Concurrent<T...>> = hasSpecialOpType<op_blockSync,T...>;
Expand Down Expand Up @@ -184,7 +163,6 @@ namespace quda {
using type = std::tuple_element_t<n,std::tuple<T...>>;
};
template <typename T, int n> using SpecialOpsType = SpecialOps<typename SpecialOpsTypeS<unwrapSpecialOps<T>,n>::type>;
#endif

// SpecialOpsElemType: element type from corresponding op types
//template <typename ...T> struct SpecialOpsElemTypeS { using type = void; };
Expand All @@ -206,6 +184,7 @@ namespace quda {
//template <typename T> struct SpecialOpDependS<T,std::enable_if_t<is_instance<T,op_Base>,void>> {
using deps = SpecialOps<typename T::dependencies>;
};
#endif

#if 0
// SpecialOpDependencies: returns SpecialOps<all dependencies>, all Concurrent and Sequential lists are flattened
Expand Down Expand Up @@ -277,6 +256,7 @@ namespace quda {
}
#endif

#if 0
// sharedMemOffset
template <typename T, int n> struct sharedMemOffset {
template <typename ...Arg>
Expand All @@ -297,5 +277,6 @@ namespace quda {
+ sharedMemSize<std::tuple_element_t<n-1,std::tuple<T...>>>(block, arg...);
}
};
#endif

}
31 changes: 3 additions & 28 deletions include/targets/sycl/block_reduce_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ namespace quda
*/
template <typename T, int block_dim, int batch_size>
struct block_reduceG {
using dependencies = op_Sequential<op_blockSync>;
using dependentOps = SpecialOps<op_blockSync>;
//using dependencies = op_Sequential<op_blockSync>;
//using dependentOps = SpecialOps<op_blockSync>;
using BlockReduce_t = BlockReduce<T, block_dim, batch_size>;
template <typename S> inline block_reduceG(S &ops) {};
/**
Expand Down Expand Up @@ -131,28 +131,11 @@ namespace quda
/**
@brief SYCL specialization of block_reduce, building on the warp_reduce
*/
#define DYNAMIC_SLM
template <typename T, int block_dim, int batch_size>
//struct block_reduceW {
struct block_reduceW : SharedMemory<T,SizeBlockDivWarp> {
using Smem = SharedMemory<T,SizeBlockDivWarp>;
//using Smem::shared_mem_size;
#ifdef DYNAMIC_SLM
//using opSmem = op_SharedMemory<T,opSizeBlockDivWarp>;
//using opSmem = SharedMemory<T,opSizeBlockDivWarp>;
//using dependencies = op_Sequential<op_blockSync,opSmem>;
//using dependentOps = SpecialOps<op_blockSync,opSmem>;
//template <typename ...Arg>
//static constexpr size_t shared_mem_size(dim3 block, Arg &...arg) {
//return opSizeBlockDivWarp::size<T>(block, arg...);
//}
#else
#endif
using BlockReduce_t = BlockReduce<T, block_dim, batch_size>;
//dependentOps ops;
template <typename S>
//inline block_reduceW(S &ops) : ops(getDependentOps<BlockReduce_t>(ops)) {};
inline block_reduceW(S &ops) : Smem(ops) {};
template <typename S> inline block_reduceW(S &ops) : Smem(ops) {};

template <int width_> struct warp_reduce_param {
static constexpr int width = width_;
Expand Down Expand Up @@ -187,15 +170,7 @@ namespace quda
// now do reduction between warps
if (!async) __syncthreads(); // only synchronize if we are not pipelining

//__shared__ T storage[max_items];
#ifdef DYNAMIC_SLM
//auto storage = getSharedMemPtr<opSmem>(ops);
auto storage = Smem::sharedMem();
#else
static_assert(sizeof(T[max_items])<=device::shared_memory_size(), "Block reduce shared mem size too large");
auto mem = sycl::ext::oneapi::group_local_memory_for_overwrite<T[max_items]>(getGroup());
auto storage = &((*mem)[0]);
#endif

// if first thread in warp, write result to shared memory
if (thread_idx % device::warp_size() == 0) storage[batch * warp_items + warp_idx] = value;
Expand Down
Loading

0 comments on commit b680199

Please sign in to comment.