Use less thread blocks for find_uncached kernel
Summary:
find_uncached takes quite a long time, and in prefetch mode it often overlaps with the main stream. We want it to overlap with other kernels instead, so that it has minimal effect on the main stream.

This will likely make cache prefetch slightly slower (not significantly, since prefetch spends most of its time in data transfer), but it will make the overlapping main-stream kernels faster.

Differential Revision: D62401630
levythu authored and facebook-github-bot committed Sep 9, 2024
1 parent 88cbd86 commit 2d24e5e
Showing 1 changed file with 11 additions and 3 deletions.
14 changes: 11 additions & 3 deletions fbgemm_gpu/src/split_embeddings_cache/lru_cache_find.cu
@@ -217,11 +217,19 @@ lru_cache_find_uncached_cuda(
 #ifdef FBGEMM_GPU_MEMCHECK
   const char* func_name = "lru_cache_find_uncached_kernel";
 #endif
+  // During concurrent prefetch, cache lines are locked and we use less
+  // SMs for some of the prefetch kernels to leave SMs for main stream to
+  // overlap
+  constexpr int PREFETCH_KERNEL_MAX_BLOCKS = 8;
+  auto grid_size = std::min(
+      div_round_up(N, kMaxThreads / kWarpSize),
+      lock_cache_line ? PREFETCH_KERNEL_MAX_BLOCKS
+                      : get_max_thread_blocks_for_cache_kernels_());
   // Find uncached indices
   lru_cache_find_uncached_kernel<<<
-      std::min(
-          div_round_up(N, kMaxThreads / kWarpSize),
-          get_max_thread_blocks_for_cache_kernels_()),
+      grid_size,
       dim3(kWarpSize, kMaxThreads / kWarpSize),
       0,
       at::cuda::getCurrentCUDAStream()>>>(
