Split Quantize Ops, pt. 2 (#1864)

Summary:
Pull Request resolved: #1864

- Break up `quantize_ops.cu`, pt. 2

Reviewed By: sryap

Differential Revision: D47271497

fbshipit-source-id: 9ed80e62120c53ca5d1a83c9c5b96d017b932a75
pytorch · Jul 7, 2023 · b92a351 · b92a351
1 parent f94db2c
commit b92a351
Show file tree

Hide file tree

Showing 7 changed files with 1,271 additions and 1,194 deletions.
diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt
@@ -632,9 +632,11 @@ if(NOT FBGEMM_CPU_ONLY)
       src/permute_pooled_embedding_ops_split.cu
       src/permute_pooled_embedding_ops.cu
       src/quantize_ops/quantize_bfloat16.cu
+      src/quantize_ops/quantize_fp8_rowwise.cu
+      src/quantize_ops/quantize_fused_8bit_rowwise.cu
+      src/quantize_ops/quantize_fused_nbit_rowwise.cu
       src/quantize_ops/quantize_hfp8.cu
       src/quantize_ops/quantize_msfp.cu
-      src/quantize_ops/quantize_ops.cu
       src/quantize_ops/quantize_padded_fp8_rowwise.cu
       src/sparse_ops/sparse_async_cumsum.cu
       src/sparse_ops/sparse_block_bucketize_features.cu

diff --git a/fbgemm_gpu/src/quantize_ops/common.cuh b/fbgemm_gpu/src/quantize_ops/common.cuh
@@ -30,3 +30,21 @@
 #define QUANTIZE_OPS_MIN(a, b) ((a) < (b) ? (a) : (b))
 
 using Tensor = at::Tensor;
+
+namespace fbgemm_gpu {
+
+namespace { // anonymous namespace: the helper below gets internal linkage in every file that includes this header
+
+template <typename T> // T: type of the value handed to the shuffle intrinsic
+__device__ inline __attribute__((always_inline)) T // device-only helper, marked always_inline
+quantize_ops_shfl_xor(const T val, int laneMask, int width) { // forwards val/laneMask/width to whichever XOR-shuffle intrinsic the toolchain provides
+#if defined(__HIP_PLATFORM_HCC__) || CUDA_VERSION < 9000 // HIP (HCC) build or CUDA_VERSION below 9000; NOTE(review): an undefined CUDA_VERSION evaluates to 0 here and also selects this branch
+  return __shfl_xor(val, laneMask, width); // legacy intrinsic: takes no participation mask
+#else // neither condition above holds, i.e. CUDA_VERSION >= 9000 without __HIP_PLATFORM_HCC__
+  return __shfl_xor_sync(0xffffffff, val, laneMask, width); // first argument 0xffffffff has all 32 mask bits set
+#endif
+}
+
+} // namespace
+
+} // namespace fbgemm_gpu