Add cache conflict miss support (#2596)

Summary: Pull Request resolved: #2596. Prior to this diff, SSD TBE lacked support for the conflict cache miss scenario. It operated under the assumption that the cache, located in GPU memory, was sufficiently large to hold all prefetched data from SSD. In the event of a conflict cache miss, the behavior of SSD TBE would be unpredictable (it could either fail or potentially access illegal memory). Note that a conflict cache miss happens when an embedding row is absent from the cache and, after being fetched from SSD, cannot be inserted into the cache due to capacity constraints or associativity limitations. This diff introduces support for conflict cache misses by storing rows that cannot be inserted into the cache due to conflicts in a scratch pad, which is a temporary GPU tensor. When rows are missing from the cache, TBE kernels can access the scratch pad instead. Prior to this diff, during the SSD prefetch stage, any row that missed the cache and required fetching from SSD would first be fetched into a CPU scratch pad and then transferred to GPU. Rows that could be inserted into the cache would subsequently be copied from the GPU scratch pad into the cache. If conflict misses occurred, the prefetch behavior would be unpredictable. With this diff, conflict-missed rows are now retained in the scratch pad, which is kept alive until the current iteration completes. Throughout the forward and backward + optimizer stages of TBE, both the cache and scratch pad are equivalent in terms of usage. However, following the completion of the backward + optimizer step, rows in the scratch pad are flushed back to SSD, unlike rows residing in the cache, which are not evicted and remain available for future use (see the diagram below for more details). {F1645878181} Reviewed By: spcyppt Differential Revision: D55998215 fbshipit-source-id: 79846dd7ae0ec95752fa1cb427790f685900bf6e
pytorch · May 24, 2024 · db4d379 · db4d379
1 parent ab05ca9
commit db4d379
Show file tree

Hide file tree

Showing 23 changed files with 1,157 additions and 490 deletions.
diff --git a/fbgemm_gpu/FbgemmGpu.cmake b/fbgemm_gpu/FbgemmGpu.cmake
@@ -100,6 +100,10 @@ set(GWD_OPTIMIZERS
 set(DEFUSED_OPTIMIZERS
     rowwise_adagrad)
 
+# Optimizers with the SSD support
+set(SSD_OPTIMIZERS
+    rowwise_adagrad)
+
 set(WEIGHT_OPTIONS
     weighted
     unweighted_nobag
@@ -146,6 +150,7 @@ set(gen_gpu_kernel_source_files
     "gen_embedding_forward_split_unweighted_codegen_cuda.cu"
     "gen_embedding_backward_dense_indice_weights_codegen_cuda.cu"
     "gen_embedding_backward_split_indice_weights_codegen_cuda.cu"
+    "gen_embedding_backward_ssd_indice_weights_codegen_cuda.cu"
     "gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu"
     "gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu"
     "gen_batch_index_select_dim0_forward_codegen_cuda.cu"
@@ -156,10 +161,13 @@ set(gen_gpu_kernel_source_files
     "gen_batch_index_select_dim0_backward_kernel_warp.cu"
     "gen_embedding_backward_split_grad_embedding_ops.cu"
     "gen_embedding_backward_split_grad_index_select.cu"
-    "gen_embedding_backward_common_split_device_kernel.cuh"
-    "gen_embedding_backward_batch_index_select_split_device_kernel.cuh"
+    "gen_embedding_backward_split_common_device_kernel.cuh"
+    "gen_embedding_backward_split_batch_index_select_device_kernel.cuh"
     "gen_embedding_forward_split_weighted_gwd_codegen_cuda.cu"
     "gen_embedding_forward_split_unweighted_gwd_codegen_cuda.cu"
+    "gen_embedding_forward_ssd_weighted_codegen_cuda.cu"
+    "gen_embedding_forward_ssd_unweighted_codegen_cuda.cu"
+    "gen_embedding_forward_ssd_unweighted_nobag_kernel_small.cu"
 )
 
 if(NOT USE_ROCM)
@@ -182,7 +190,8 @@ foreach(wdesc ${WEIGHT_OPTIONS})
       "gen_embedding_backward_dense_split_${wdesc}_kernel_cta.cu"
       "gen_embedding_backward_dense_split_${wdesc}_kernel_warp.cu"
       "gen_embedding_forward_split_${wdesc}_kernel.cu"
-      "gen_embedding_backward_${wdesc}_split_device_kernel.cuh")
+      "gen_embedding_forward_ssd_${wdesc}_kernel.cu"
+      "gen_embedding_backward_split_${wdesc}_device_kernel.cuh")
 
   foreach(etype fp32 fp16 fp8 int8 int4 int2)
     list(APPEND gen_gpu_kernel_source_files
@@ -194,7 +203,7 @@ endforeach()
 foreach(wdesc weighted unweighted)
   list(APPEND gen_gpu_kernel_source_files
       "gen_embedding_forward_split_${wdesc}_vbe_kernel.cu"
-      "gen_embedding_backward_${wdesc}_vbe_split_device_kernel.cuh")
+      "gen_embedding_backward_split_${wdesc}_vbe_device_kernel.cuh")
 endforeach()
 
 # Generate GWD files
@@ -210,22 +219,31 @@ set(gen_cpu_source_files
 
 set(gen_python_source_files
   ${CMAKE_BINARY_DIR}/__init__.py
-  ${CMAKE_BINARY_DIR}/lookup_args.py)
+  ${CMAKE_BINARY_DIR}/lookup_args.py
+  ${CMAKE_BINARY_DIR}/lookup_args_ssd.py
+)
 
 # For each of the optimizers, generate the backward split variant by adding
 # the Python, CPU-only, GPU host, and GPU kernel source files
 
-# Generate the Python functions only if there is the backend support
+# Generate the Python functions only if there is the backend support (for all
+# optimizers)
 foreach(optimizer
     ${COMMON_OPTIMIZERS}
     ${CPU_ONLY_OPTIMIZERS}
     ${GPU_ONLY_OPTIMIZERS})
   list(APPEND gen_python_source_files
-    "${CMAKE_BINARY_DIR}/lookup_${optimizer}.py")
-  list(APPEND gen_python_source_files
+    "${CMAKE_BINARY_DIR}/lookup_${optimizer}.py"
     "${CMAKE_BINARY_DIR}/lookup_${optimizer}_pt2.py")
 endforeach()
 
+# Generate the Python functions only if there is the backend support (for SSD
+# optimizers)
+foreach(optimizer ${SSD_OPTIMIZERS})
+  list(APPEND gen_python_source_files
+    "${CMAKE_BINARY_DIR}/lookup_${optimizer}_ssd.py")
+endforeach()
+
 # Generate the backend API for all optimizers to preserve the backward
 # compatibility
 list(APPEND gen_cpu_source_files
@@ -288,6 +306,24 @@ foreach(optimizer ${DEFUSED_OPTIMIZERS})
     "${CMAKE_BINARY_DIR}/split_embedding_optimizer_${optimizer}.py")
 endforeach()
 
+foreach(optimizer ${SSD_OPTIMIZERS})
+  list(APPEND gen_gpu_kernel_source_files
+    "gen_embedding_optimizer_${optimizer}_ssd_device_kernel.cuh"
+  )
+
+  list(APPEND gen_gpu_host_source_files
+    "gen_embedding_backward_ssd_${optimizer}.cpp"
+  )
+
+  foreach(wdesc weighted unweighted unweighted_nobag)
+    list(APPEND gen_gpu_kernel_source_files
+      "gen_embedding_backward_${optimizer}_ssd_${wdesc}_cuda.cu"
+      "gen_embedding_backward_${optimizer}_ssd_${wdesc}_kernel_cta.cu"
+      "gen_embedding_backward_${optimizer}_ssd_${wdesc}_kernel_warp.cu")
+  endforeach()
+
+endforeach()
+
 list(APPEND gen_defused_optim_py_files
     ${CMAKE_BINARY_DIR}/optimizer_args.py)