Skip to content

Commit

Permalink
cleanup special ops
Browse files Browse the repository at this point in the history
add SYCL as a CMake compiler
  • Loading branch information
jcosborn committed Nov 2, 2023
1 parent 86fc1ce commit b680199
Show file tree
Hide file tree
Showing 13 changed files with 221 additions and 468 deletions.
36 changes: 36 additions & 0 deletions cmake/CMakeDetermineSYCLCompiler.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# CMakeDetermineSYCLCompiler.cmake
# Determine the compiler to use for the SYCL "language" and run CMake's
# compiler-identification machinery for it.  The result is persisted via
# CMakeSYCLCompiler.cmake.in so later CMake runs can skip detection.

# Default the SYCL compiler to the already-determined C++ compiler when the
# user did not provide one explicitly.
if(NOT CMAKE_SYCL_COMPILER)
  set(CMAKE_SYCL_COMPILER "${CMAKE_CXX_COMPILER}")
endif()
mark_as_advanced(CMAKE_SYCL_COMPILER)
message(STATUS "The SYCL compiler is ${CMAKE_SYCL_COMPILER}")

if(NOT CMAKE_SYCL_COMPILER_ID_RUN)
  set(CMAKE_SYCL_COMPILER_ID_RUN 1)

  # Try to identify the compiler.
  set(CMAKE_SYCL_COMPILER_ID)
  set(CMAKE_SYCL_PLATFORM_ID)
  file(READ "${CMAKE_ROOT}/Modules/CMakePlatformId.h.in" CMAKE_SYCL_COMPILER_ID_PLATFORM_CONTENT)

  set(CMAKE_SYCL_COMPILER_ID_TEST_FLAGS_FIRST)
  set(CMAKE_SYCL_COMPILER_ID_TEST_FLAGS)

  # Restrict identification to the compilers expected to provide SYCL
  # support (Intel's oneAPI compiler identifies as both __INTEL_LLVM_COMPILER
  # and __clang__, so it must be checked first).
  set(CMAKE_CXX_COMPILER_ID_CONTENT "#if defined(__INTEL_LLVM_COMPILER)\n# define COMPILER_ID \"IntelLLVM\"\n")
  string(APPEND CMAKE_CXX_COMPILER_ID_CONTENT "#elif defined(__clang__)\n# define COMPILER_ID \"Clang\"\n")
  string(APPEND CMAKE_CXX_COMPILER_ID_CONTENT "#endif\n")
  include(${CMAKE_ROOT}/Modules/CMakeDetermineCompilerId.cmake)
  CMAKE_DETERMINE_COMPILER_ID(SYCL SYCLFLAGS CMakeCXXCompilerId.cpp)

  _cmake_find_compiler_sysroot(SYCL)
endif()

# Persist the detected compiler so later configure runs can reload it.
# @ONLY keeps any literal ${...} in the template from being expanded.
configure_file("${CMAKE_CURRENT_LIST_DIR}/CMakeSYCLCompiler.cmake.in"
  "${CMAKE_PLATFORM_INFO_DIR}/CMakeSYCLCompiler.cmake" @ONLY)

set(CMAKE_SYCL_COMPILER_ENV_VAR "SYCL")
3 changes: 3 additions & 0 deletions cmake/CMakeSYCLCompiler.cmake.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Template configured at detection time; the generated file is loaded on
# subsequent CMake runs to restore the SYCL compiler without re-detection.
set(CMAKE_SYCL_COMPILER "@CMAKE_SYCL_COMPILER@")
set(CMAKE_SYCL_COMPILER_LOADED 1)
set(CMAKE_SYCL_COMPILER_ENV_VAR "SYCL")
31 changes: 31 additions & 0 deletions cmake/CMakeSYCLInformation.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# CMakeSYCLInformation.cmake
# Rule and option settings for the SYCL "language".  Any setting not already
# provided by the user or platform files falls back to its C++ counterpart.

# These settings mirror a C++ variable whose name differs only by SYCL->CXX.
foreach(_sycl_var
    CMAKE_SYCL_COMPILE_OPTIONS_PIC
    CMAKE_SYCL_COMPILE_OPTIONS_PIE
    CMAKE_SYCL_LINK_OPTIONS_PIE
    CMAKE_SYCL_LINK_OPTIONS_NO_PIE
    CMAKE_SYCL_OUTPUT_EXTENSION
    CMAKE_INCLUDE_FLAG_SYCL)
  if(NOT ${_sycl_var})
    string(REPLACE "SYCL" "CXX" _cxx_var "${_sycl_var}")
    set(${_sycl_var} ${${_cxx_var}})
  endif()
endforeach()
unset(_cxx_var)
unset(_sycl_var)

# Default rule for compiling one source file to an object file.
if(NOT CMAKE_SYCL_COMPILE_OBJECT)
  set(CMAKE_SYCL_COMPILE_OBJECT "<CMAKE_SYCL_COMPILER> <DEFINES> <INCLUDES> <FLAGS> -o <OBJECT> -c <SOURCE>")
endif()

# Default rule for linking an executable.
if(NOT CMAKE_SYCL_LINK_EXECUTABLE)
  set(CMAKE_SYCL_LINK_EXECUTABLE "<CMAKE_SYCL_COMPILER> <FLAGS> <CMAKE_SYCL_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
endif()

set(CMAKE_SYCL_INFORMATION_LOADED 1)
1 change: 1 addition & 0 deletions cmake/CMakeTestSYCLCompiler.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# NOTE(review): marks the SYCL compiler as working without running a
# try-compile check — presumably because it defaults to the already-tested
# C++ compiler; confirm this holds when a different compiler is supplied.
set(CMAKE_SYCL_COMPILER_WORKS 1 CACHE INTERNAL "")
37 changes: 19 additions & 18 deletions include/targets/generic/block_reduce_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,19 +54,6 @@ namespace quda
}
};

/**
@brief Dummy generic implementation of block_reduce
*/
//template <bool is_device> struct block_reduce {
//template <typename T, typename reducer_t, typename param_t, typename... BR>
//T operator()(const T &value, bool, int, bool, reducer_t, param_t, BR&...)
//{
//return value;
//}
//};
//template <typename T, typename P, typename O>
//struct block_reduce {

/**
@brief WarpReduce provides a generic interface for performing
perform reductions at the warp or sub-warp level
Expand Down Expand Up @@ -141,6 +128,21 @@ namespace quda
}
};

/**
@brief Dummy generic implementation of block_reduce
*/
//template <bool is_device> struct block_reduce {
//template <typename T, typename reducer_t, typename param_t, typename... BR>
//T operator()(const T &value, bool, int, bool, reducer_t, param_t, BR&...)
//{
//return value;
//}
//};
//template <typename T, typename P, typename O>
//struct block_reduce {
//template <typename T, int block_dim, int batch_size, bool = true>
//struct block_reduce {}

/**
@brief BlockReduce provides a generic interface for performing
reductions at the block level
Expand All @@ -149,19 +151,18 @@ namespace quda
@tparam batch_size Batch size of the reduction. Threads will be
ordered such that batch size is the slowest running index.
*/
template <typename T, int block_dim, int batch_size = 1> class BlockReduce :
public block_reduce<T, block_dim, batch_size>
template <typename T, int block_dim, int batch_size = 1>
class BlockReduce : public block_reduce<T, block_dim, batch_size>
{
static_assert(batch_size == 1 || block_dim <= 2, "Batching not possible with 3-d block reduction");
using BlockReduce_t = BlockReduce<T, block_dim, batch_size>;
using block_reduce_t = block_reduce<T, block_dim, batch_size>;
const int batch;

public:
//using block_reduce_t::shared_mem_size;

template <typename ...U>
HOSTDEVICE constexpr BlockReduce(SpecialOps<U...> &ops, int batch = 0) : block_reduce_t(ops), batch(batch) {
static_assert(hasSpecialOpType<BlockReduce_t, SpecialOps<U...>>);
checkSpecialOp<BlockReduce_t, U...>();
}

constexpr BlockReduce(const BlockReduce<T,block_dim,batch_size> &) = delete;
Expand Down
57 changes: 19 additions & 38 deletions include/targets/generic/special_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,18 +46,12 @@ namespace quda {
using SpecialOpsT = SpecialOps<T...>;
using KernelOpsT = SpecialOps<T...>;
};
//template <typename ...T> struct SpecialOps : SpecialOpsTarget<T...> {
// using SpecialOpsT = SpecialOps<T...>;
//};

// getSpecialOps
template <typename T, typename U = void> struct getSpecialOpsS { using type = NoSpecialOps; };
template <typename T> struct getSpecialOpsS<T,std::conditional_t<true,void,typename T::SpecialOpsT>> {
using type = typename T::SpecialOpsT;
};
//template <typename ...T> struct getSpecialOpsS<SpecialOps<T...>,void> {
// using type = SpecialOps<T...>;
//};
template <typename T> using getSpecialOps = typename getSpecialOpsS<T>::type;

// hasSpecialOp: checks if first type matches any of the op
Expand All @@ -74,38 +68,13 @@ namespace quda {
static_assert((hasSpecialOp<T,typename Ops::SpecialOpsT> || ...));
}


// OLD

template <typename ...T> struct op_Concurrent {}; // set of op types used concurrently (needs separate resources)
template <typename ...T> struct op_Sequential {}; // set of op types used sequentially (can share resources)
struct op_Base {}; // base type for other op types
template <typename T, int N = 0> struct op_BaseT : op_Base {
//using op_ElementT = T;
using ElemT = T;
static constexpr int n = N;
};



// forward declarations of op types
struct op_blockSync;
template <typename T> struct op_warp_combine;
//template <typename T, int N> struct op_thread_array;
//template <typename T> struct op_BlockReduce;
//template <typename T, typename D = opDimsBlock> struct op_SharedMemoryCache;
//template <typename T, typename S = opSizeBlock> struct op_SharedMemory;
//template <typename T, int S> using op_SharedMemStatic = op_SharedMemory<T,opSizeStatic<S>>;

// only types for convenience
using only_blockSync = SpecialOps<op_blockSync>;
template <typename T> using only_warp_combine = SpecialOps<op_warp_combine<T>>;
//template <typename T, int N> using only_thread_array = SpecialOps<op_thread_array<T,N>>;
//template <typename T> using only_BlockReduce = SpecialOps<op_BlockReduce<T>>;
//template <typename T, typename D = opDimsBlock> using only_SharedMemoryCache = SpecialOps<op_SharedMemoryCache<T,D>>;
//template <typename T, typename S = opSizeBlock> using only_SharedMemory = SpecialOps<op_SharedMemory<T,S>>;
//template <typename T, unsigned int S> using only_SharedMemStatic = only_SharedMemory<T,opSizeStatic<S>>;
//template <typename ...T> using only_Concurrent = SpecialOps<op_Concurrent<T...>>;

// explicitSpecialOps
template <typename T, typename U = void> struct explicitSpecialOpsS : std::false_type {};
Expand All @@ -122,6 +91,11 @@ namespace quda {
template <typename T> inline constexpr bool hasSpecialOps = hasSpecialOpsImpl<T>::value;
#endif

// checkSpecialOp
template <typename T, typename... U> static constexpr void checkSpecialOp() {
static_assert((std::is_same_v<T,U> || ...) == true);
}

// combineOps
template <typename ...T> struct combineOpsS {};
template <typename ...T> struct combineOpsS<NoSpecialOps,SpecialOps<T...>> {
Expand All @@ -133,6 +107,17 @@ namespace quda {
template <typename T, typename U> using combineOps = typename combineOpsS<T, U>::type;


// OLD

#if 0
template <typename ...T> struct op_Concurrent {}; // set of op types used concurrently (needs separate resources)
template <typename ...T> struct op_Sequential {}; // set of op types used sequentially (can share resources)
struct op_Base {}; // base type for other op types
template <typename T, int N = 0> struct op_BaseT : op_Base {
//using op_ElementT = T;
using ElemT = T;
static constexpr int n = N;
};

// unwrapSpecialOps
template <typename T> struct unwrapSpecialOpsS { using type = T; };
Expand All @@ -148,12 +133,6 @@ namespace quda {
template <typename T, typename U, typename ...V> static constexpr bool hasSpecialOpType<T,U,V...> =
hasSpecialOpType2<unwrapSpecialOps<T>,unwrapSpecialOps<U>> || hasSpecialOpType<T,V...>;

// checkSpecialOp
template <typename T, typename... U> static constexpr void checkSpecialOp() {
static_assert((std::is_same_v<T,U> || ...) == true);
}

#if 0
// hasBlockSync
template <typename ...T> static constexpr bool hasBlockSync = hasSpecialOpType<op_blockSync,T...>;
template <typename ...T> static constexpr bool hasBlockSync<op_Concurrent<T...>> = hasSpecialOpType<op_blockSync,T...>;
Expand Down Expand Up @@ -184,7 +163,6 @@ namespace quda {
using type = std::tuple_element_t<n,std::tuple<T...>>;
};
template <typename T, int n> using SpecialOpsType = SpecialOps<typename SpecialOpsTypeS<unwrapSpecialOps<T>,n>::type>;
#endif

// SpecialOpsElemType: element type from corresponding op types
//template <typename ...T> struct SpecialOpsElemTypeS { using type = void; };
Expand All @@ -206,6 +184,7 @@ namespace quda {
//template <typename T> struct SpecialOpDependS<T,std::enable_if_t<is_instance<T,op_Base>,void>> {
using deps = SpecialOps<typename T::dependencies>;
};
#endif

#if 0
// SpecialOpDependencies: returns SpecialOps<all dependencies>, all Concurrent and Sequential lists are flattened
Expand Down Expand Up @@ -277,6 +256,7 @@ namespace quda {
}
#endif

#if 0
// sharedMemOffset
template <typename T, int n> struct sharedMemOffset {
template <typename ...Arg>
Expand All @@ -297,5 +277,6 @@ namespace quda {
+ sharedMemSize<std::tuple_element_t<n-1,std::tuple<T...>>>(block, arg...);
}
};
#endif

}
31 changes: 3 additions & 28 deletions include/targets/sycl/block_reduce_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ namespace quda
*/
template <typename T, int block_dim, int batch_size>
struct block_reduceG {
using dependencies = op_Sequential<op_blockSync>;
using dependentOps = SpecialOps<op_blockSync>;
//using dependencies = op_Sequential<op_blockSync>;
//using dependentOps = SpecialOps<op_blockSync>;
using BlockReduce_t = BlockReduce<T, block_dim, batch_size>;
template <typename S> inline block_reduceG(S &ops) {};
/**
Expand Down Expand Up @@ -131,28 +131,11 @@ namespace quda
/**
@brief SYCL specialization of block_reduce, building on the warp_reduce
*/
#define DYNAMIC_SLM
template <typename T, int block_dim, int batch_size>
//struct block_reduceW {
struct block_reduceW : SharedMemory<T,SizeBlockDivWarp> {
using Smem = SharedMemory<T,SizeBlockDivWarp>;
//using Smem::shared_mem_size;
#ifdef DYNAMIC_SLM
//using opSmem = op_SharedMemory<T,opSizeBlockDivWarp>;
//using opSmem = SharedMemory<T,opSizeBlockDivWarp>;
//using dependencies = op_Sequential<op_blockSync,opSmem>;
//using dependentOps = SpecialOps<op_blockSync,opSmem>;
//template <typename ...Arg>
//static constexpr size_t shared_mem_size(dim3 block, Arg &...arg) {
//return opSizeBlockDivWarp::size<T>(block, arg...);
//}
#else
#endif
using BlockReduce_t = BlockReduce<T, block_dim, batch_size>;
//dependentOps ops;
template <typename S>
//inline block_reduceW(S &ops) : ops(getDependentOps<BlockReduce_t>(ops)) {};
inline block_reduceW(S &ops) : Smem(ops) {};
template <typename S> inline block_reduceW(S &ops) : Smem(ops) {};

template <int width_> struct warp_reduce_param {
static constexpr int width = width_;
Expand Down Expand Up @@ -187,15 +170,7 @@ namespace quda
// now do reduction between warps
if (!async) __syncthreads(); // only synchronize if we are not pipelining

//__shared__ T storage[max_items];
#ifdef DYNAMIC_SLM
//auto storage = getSharedMemPtr<opSmem>(ops);
auto storage = Smem::sharedMem();
#else
static_assert(sizeof(T[max_items])<=device::shared_memory_size(), "Block reduce shared mem size too large");
auto mem = sycl::ext::oneapi::group_local_memory_for_overwrite<T[max_items]>(getGroup());
auto storage = &((*mem)[0]);
#endif

// if first thread in warp, write result to shared memory
if (thread_idx % device::warp_size() == 0) storage[batch * warp_items + warp_idx] = value;
Expand Down
Loading

0 comments on commit b680199

Please sign in to comment.