ROCm changes #1102

Closed · wants to merge 53 commits

Commits (changes shown from 45 of 53 commits)
59267a9
Hipification of fbgemm for AMD GPUs/CPUs (#4)
jithunnair-amd Jan 25, 2022
a223936
Use SHEFL_SYNC_MACRO to replace __shefl() and __shefl_sync()
liligwu Jan 26, 2022
4610075
Merge pull request #6 from ROCmSoftwarePlatform/rocm4.3/develop
liligwu Jan 26, 2022
a506c52
Change the hipify dependency to hipify_torch (#7)
liligwu Jan 31, 2022
f596bde
IFU, merge from upstream commit c6df576 to main. (#8)
liligwu Feb 14, 2022
0cfb792
Enable `split_table_batched_embeddings_test.py` (#10)
liligwu Mar 2, 2022
f13af44
*Enable use_cache. *Enable split_embedding_inference_converter_test.p…
liligwu Mar 7, 2022
25e5b71
Skip use_cpu.
liligwu Mar 7, 2022
dcbe19f
Enable test_nbit_cache_pipeline and test_cache_miss_counter.
liligwu Mar 7, 2022
fda048e
Enable quantize_ops_test.py
liligwu Mar 7, 2022
00abba1
Merge branch 'main' into use_cache_enabled
liligwu Mar 7, 2022
cf307b6
Remove @skipIfRocm for test_nbit_cache_pipeline and test_cache_miss_c…
liligwu Mar 7, 2022
2d66ea8
*Uncondition use_cache in split_table_batched_embeddings_test.py *Rem…
liligwu Mar 7, 2022
958679b
Merge pull request #11 from ROCmSoftwarePlatform/use_cache_enabled
amathews-amd Mar 8, 2022
e642a48
Fix backward tests and test_cache_pipeline in split_table_batched_emb…
liligwu Mar 8, 2022
d0d294a
A minor change of removing a commented line.
liligwu Mar 8, 2022
146f2df
Remove skipIfRocm import in split_table_batched_embeddings_test.py.
liligwu Mar 8, 2022
eb0cf36
Merge pull request #12 from ROCmSoftwarePlatform/fix_backward
amathews-amd Mar 8, 2022
0c86f2b
*Removed post_hipify logic in setup.py. *Removed two headerfiles that…
liligwu Mar 11, 2022
6e7f13e
Merge pull request #16 from ROCmSoftwarePlatform/remove_post_hipify
amathews-amd Mar 11, 2022
edd3306
Pointing hipify_torch to the newer commit.
liligwu Mar 14, 2022
9a45f4a
Merge pull request #17 from ROCmSoftwarePlatform/pointing_hipify_torc…
amathews-amd Mar 14, 2022
309a3a1
Fixing #include <ATen/CUDAGeneratorImpl.h> by defining NEW_GENERATOR_…
liligwu Mar 16, 2022
358eaf5
Disabling all use_cpu in the tests. (#20)
liligwu Mar 16, 2022
3a915a8
Change py3.8 syntax to py3.7 syntax (#18)
pruthvistony Mar 16, 2022
40928ba
Match upstream setup (#21)
liligwu Mar 31, 2022
69abf78
Enable merge_pooled_embeddings op. in ROCm (#15)
reza-amd Apr 1, 2022
5c0096e
Merge remote-tracking branch 'upstream/main' into IFU-main-2022-04-07
liligwu Apr 14, 2022
bfac874
Fixing test_lxu_cache_lookup in AMD devices where warp_size=64
liligwu Apr 14, 2022
1cf7e84
* Enabling the specification of hip architecture by using PYTORCH_RO…
liligwu Apr 15, 2022
5b33287
*Fixing the unit tests in sparse_ops_test.py. *Fixing the path of Ato…
liligwu Apr 19, 2022
2c514c5
Merge pull request #23 from ROCmSoftwarePlatform/IFU-main-2022-04-07
pruthvistony Apr 19, 2022
0d5a012
Enable use_cpu in the tests.
liligwu Apr 20, 2022
ae14a47
Merge remote-tracking branch 'upstream/main' into IFU-main-2022-04-20
liligwu Apr 20, 2022
1718605
*Taking @skipIfRocm back in the test_utils.py. *Fixing cublasGemmStri…
liligwu Apr 20, 2022
bc902a3
Cleaning up the code.
liligwu Apr 20, 2022
0d95948
Merge pull request #24 from ROCmSoftwarePlatform/IFU-main-2022-04-20
pruthvistony Apr 21, 2022
9a5a33b
Enabling cuda (#25)
liligwu Apr 21, 2022
6490dbc
Enabling cuda (#25)
liligwu Apr 21, 2022
77627ae
Merge branch 'main' of https://github.com/ROCmSoftwarePlatform/FBGEMM…
liligwu Apr 22, 2022
18b48e9
Merge remote-tracking branch 'upstream/main' into IFU-main-2022-05-02
liligwu May 2, 2022
99a70e1
Merge pull request #2 from ROCmSoftwarePlatform/IFU-main-2022-05-02
liligwu May 4, 2022
fed56ff
Merge branch 'main' into rocm_changes
liligwu May 4, 2022
4b39a70
Merge branch 'upstream_main' into rocm_changes
liligwu May 5, 2022
785afb8
Removing building and testing bash scripts.
liligwu May 5, 2022
bbd0ad1
* Addressing the comments in PR review ROCm changes #1102. * Reorganiz…
liligwu May 9, 2022
9db83d8
Minor changes that minimize the difference to upstream.
liligwu May 9, 2022
eabd0a8
A minor change on a blank line.
liligwu May 9, 2022
2038008
Fixing indentation and commented code in CMakeList.txt
liligwu May 10, 2022
0202078
Removing build script.
liligwu May 10, 2022
9cf8856
Addressing the second batch of comments of https://github.com/pytorch…
liligwu May 11, 2022
b885322
* Removing the condition on c++ standard * An indentation correction
liligwu May 12, 2022
0e3dfdb
* Changing the logic of detecting GPU vendor, making CUDA the default.…
liligwu May 13, 2022
3 changes: 3 additions & 0 deletions .gitmodules
@@ -7,3 +7,6 @@
[submodule "third_party/googletest"]
path = third_party/googletest
url = https://github.com/google/googletest
[submodule "third_party/hipify_torch"]
path = third_party/hipify_torch
url = https://github.com/ROCmSoftwarePlatform/hipify_torch.git
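
With the new submodule in place, a fresh checkout would be initialized with the standard git commands (a usage sketch, not part of the diff):

    # Clone FBGEMM and fetch all submodules, including the new hipify_torch
    git clone --recursive https://github.com/pytorch/FBGEMM.git
    cd FBGEMM
    git submodule sync
    git submodule update --init --recursive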
191 changes: 149 additions & 42 deletions fbgemm_gpu/CMakeLists.txt
@@ -9,13 +9,27 @@ message("${message_line}")
if(SKBUILD)
message("The project is built using scikit-build")
endif()

if(EXISTS "/usr/bin/nvidia-smi")
message("NVIDIA GPU detected.")
option(USE_CUDA "Use CUDA" ON)
option(USE_ROCM "Use ROCm" OFF)
elseif(EXISTS "/opt/rocm/bin/rocm-smi")
message("AMD GPU detected.")
option(USE_CUDA "Use CUDA" OFF)
option(USE_ROCM "Use ROCm" ON)
else()
message("Unable to detect GPU vendor")
message(FATAL_ERROR "")
endif()

if(FBGEMM_CPU_ONLY)
message("Building for CPU-only")
endif()

message("${message_line}")

if(FBGEMM_CPU_ONLY)
if(FBGEMM_CPU_ONLY OR USE_ROCM)
project(
fbgemm_gpu
VERSION 0.0.1
@@ -27,11 +41,18 @@ else()
LANGUAGES CXX C CUDA)
endif()

find_package(Torch REQUIRED)
find_package(PythonExtensions REQUIRED)
if(USE_CUDA)
set(default_cuda_architectures 60 61 70 75 80)
set(cuda_architectures_doc
"CUDA architectures to build for. Default is ${default_cuda_architectures}")
set(cuda_architectures
"${default_cuda_architectures}"
CACHE STRING "${cuda_architectures_doc}")

set(FBGEMM ${CMAKE_CURRENT_SOURCE_DIR}/..)
set(THIRDPARTY ${FBGEMM}/third_party)
message("${message_line}")
message("fbgemm_gpu:")
message("Building for cuda_architectures = \"${cuda_architectures}\"")
message("${message_line}")

if(DEFINED GLIBCXX_USE_CXX11_ABI)
if(${GLIBCXX_USE_CXX11_ABI} EQUAL 1)
@@ -47,13 +68,48 @@ endif()
# constructor exists to convert from "int" to "__half" errors in
# gen_embedding_forward_quantized_split_[un]weighted_codegen_cuda.cu
#
set(TORCH_CUDA_OPTIONS
--expt-relaxed-constexpr
-D__CUDA_NO_HALF_OPERATORS__
# -D__CUDA_NO_HALF_CONVERSIONS__
-D__CUDA_NO_BFLOAT16_CONVERSIONS__
-D__CUDA_NO_HALF2_OPERATORS__)
endif()

find_package(Torch REQUIRED)
find_package(PythonExtensions REQUIRED)

set(FBGEMM ${CMAKE_CURRENT_SOURCE_DIR}/..)
set(THIRDPARTY ${FBGEMM}/third_party)

if(USE_ROCM)
if(NOT DEFINED ENV{PYTORCH_ROCM_ARCH})
SET(FBGEMM_ROCM_ARCH gfx900;gfx906;gfx908;gfx90a)
else()
SET(FBGEMM_ROCM_ARCH $ENV{PYTORCH_ROCM_ARCH})
endif()

list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" "${THIRDPARTY}/hipify_torch/cmake")
include(Hip)
if(NOT FBGEMM_HAVE_HIP)
message(FATAL_ERROR "Not able to find HIP installation.")
endif()
include(Hipify)
list (APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH})

find_package(rocBLAS REQUIRED)
find_package(hipFFT REQUIRED)
find_package(hipRAND REQUIRED)
find_package(rocRAND REQUIRED)
find_package(hipSPARSE REQUIRED)
find_package(OpenMP REQUIRED)
find_package(rocPRIM REQUIRED)

message("${message_line}")
message(STATUS "hip found ${ROCM_FOUND}")
endif()
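
The block above falls back to a default architecture list (gfx900;gfx906;gfx908;gfx90a) when PYTORCH_ROCM_ARCH is unset. A hypothetical invocation that narrows the target list (example architecture values; assumes the usual setup.py entry point of this repository):

    # Build only for the GPU architectures actually needed (example values)
    export PYTORCH_ROCM_ARCH="gfx908;gfx90a"
    python setup.py install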

set(TORCH_CUDA_OPTIONS
--expt-relaxed-constexpr
-D__CUDA_NO_HALF_OPERATORS__
# -D__CUDA_NO_HALF_CONVERSIONS__
-D__CUDA_NO_BFLOAT16_CONVERSIONS__
-D__CUDA_NO_HALF2_OPERATORS__)

#
# GENERATED CUDA, CPP and Python code
@@ -147,18 +203,38 @@ set(codegen_dependencies
${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/sparse_ops_utils.h
)

add_custom_command(
OUTPUT ${gen_cpu_source_files} ${gen_gpu_source_files}
${gen_gpu_host_source_files} ${gen_python_files}
COMMAND
"${PYTHON_EXECUTABLE}"
"${CMAKE_CODEGEN_DIR}/embedding_backward_code_generator.py"
"--opensource"
DEPENDS "${codegen_dependencies}")
if(USE_CUDA)
add_custom_command(
OUTPUT ${gen_cpu_source_files} ${gen_gpu_source_files}
${gen_gpu_host_source_files} ${gen_python_files}
COMMAND
"${PYTHON_EXECUTABLE}"
"${CMAKE_CODEGEN_DIR}/embedding_backward_code_generator.py"
"--opensource"
DEPENDS "${codegen_dependencies}")

set_source_files_properties(
${gen_cpu_source_files} PROPERTIES COMPILE_OPTIONS
"-mavx2;-mf16c;-mfma;-fopenmp")
elseif(USE_ROCM)
execute_process(
COMMAND
"${PYTHON_EXECUTABLE}"
"${CMAKE_CODEGEN_DIR}/embedding_backward_code_generator.py"
"--opensource")

set(header_include_dir
${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/src
${CMAKE_CURRENT_SOURCE_DIR}
)
hipify(CUDA_SOURCE_DIR ${PROJECT_SOURCE_DIR} HEADER_INCLUDE_DIR ${header_include_dir})

set_source_files_properties(
${gen_cpu_source_files} PROPERTIES COMPILE_OPTIONS
"-mavx2;-mf16c;-mfma")
endif()

set_source_files_properties(
${gen_cpu_source_files} PROPERTIES COMPILE_OPTIONS
"-mavx2;-mf16c;-mfma;-fopenmp")
set_source_files_properties(
${gen_cpu_source_files}
PROPERTIES
@@ -209,15 +285,15 @@ set(cpp_fbgemm_files_avx2 "../src/EmbeddingSpMDMAvx2.cc"
set_source_files_properties(${cpp_fbgemm_files_avx2}
PROPERTIES COMPILE_OPTIONS "-mavx2;-mf16c;-mfma")

set(cpp_fbgemm_files ${cpp_fbgemm_files_normal} ${cpp_fbgemm_files_avx2})
set(cpp_fbgemm_files_avx512 "../src/EmbeddingSpMDMAvx512.cc")

set_source_files_properties(
${cpp_fbgemm_files_avx512}
PROPERTIES COMPILE_OPTIONS
"-mavx2;-mf16c;-mfma;-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl")

set(cpp_fbgemm_files ${cpp_fbgemm_files_normal} ${cpp_fbgemm_files_avx2}
${cpp_fbgemm_files_avx512})
if(USE_CUDA)
Review thread on the AVX-512 block below:

Contributor: Is AVX512 related to CUDA?

Contributor (author): AMD CPUs have no support for AVX-512, and the compiler will complain if the compilation flag is added.

Contributor: In my understanding, HIP is for AMD GPUs, so it is irrelevant here. For example, I believe we can use AMD GPUs with an Intel Skylake CPU, which has AVX-512. At least, it is not related to CUDA. Can I ask you to fix this properly (e.g., by detecting the availability of AVX-512 instructions)? (@jianyuh Please teach me how FBGEMM should handle AVX-512.)

Member: This is a good point. The current FBGEMM CPU build also requires AVX512 compiler support (https://github.com/pytorch/FBGEMM/blob/main/CMakeLists.txt#L52). There is a recent issue reported on an AMD CPU build in #1094. Ideally we should fix this and give users the flexibility to choose AVX2 vs. AVX512 independently of CUDA/ROCm.

set_source_files_properties(
${cpp_fbgemm_files_avx512}
PROPERTIES COMPILE_OPTIONS
"-mavx2;-mf16c;-mfma;-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl")
list(APPEND cpp_fbgemm_files ${cpp_fbgemm_files_avx512})
endif()
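
A compiler-probe alternative along the lines the reviewers suggest could decouple AVX-512 from the GPU vendor; a minimal sketch (untested; COMPILER_SUPPORTS_AVX512F is a variable name introduced here for illustration):

    include(CheckCXXCompilerFlag)
    # Probe the host compiler instead of keying AVX-512 off CUDA vs. ROCm.
    check_cxx_compiler_flag("-mavx512f" COMPILER_SUPPORTS_AVX512F)
    if(COMPILER_SUPPORTS_AVX512F)
      set_source_files_properties(
        ${cpp_fbgemm_files_avx512}
        PROPERTIES COMPILE_OPTIONS
        "-mavx2;-mf16c;-mfma;-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl")
      list(APPEND cpp_fbgemm_files ${cpp_fbgemm_files_avx512})
    endif()

Note that this only checks whether the compiler accepts the flags; whether the build host's CPU can execute AVX-512 is a separate, runtime-dispatch concern.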

set(cpp_fbgemm_files_include_directories
${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/include
@@ -280,9 +356,12 @@ if(NOT FBGEMM_CPU_ONLY)
endif()
endif()

set_source_files_properties(
${fbgemm_gpu_sources_cpu} PROPERTIES COMPILE_OPTIONS
"-mavx;-mf16c;-mfma;-mavx2;-fopenmp")
set(fbgemm_gpu_sources_cpu_option "-mavx;-mf16c;-mfma;-mavx2")
if(USE_CUDA)
set_source_files_properties(
${fbgemm_gpu_sources_cpu} PROPERTIES COMPILE_OPTIONS
"${fbgemm_gpu_sources_cpu_option};-fopenmp")
endif()

if(NOT FBGEMM_CPU_ONLY)
set(fbgemm_gpu_sources_gpu
@@ -312,15 +391,41 @@ else()
set(fbgemm_gpu_sources ${fbgemm_gpu_sources_cpu})
endif()

#
# MODULE
#

add_library(fbgemm_gpu_py MODULE ${fbgemm_gpu_sources} ${gen_source_files}
${cpp_asmjit_files} ${cpp_fbgemm_files})
if(USE_ROCM)
set(abspath_gen_source_files)
foreach(filename_gen_source_file ${gen_source_files})
list(APPEND abspath_gen_source_files "${CMAKE_BINARY_DIR}/${filename_gen_source_file}")
endforeach()
endif()

if(NOT FBGEMM_CPU_ONLY)
target_compile_definitions(fbgemm_gpu_py PRIVATE FBGEMM_CUB_USE_NAMESPACE)
if(USE_CUDA)
add_library(fbgemm_gpu_py MODULE ${fbgemm_gpu_sources} ${gen_source_files}
${cpp_asmjit_files} ${cpp_fbgemm_files})
set_property(TARGET fbgemm_gpu_py PROPERTY CUDA_ARCHITECTURES
"${cuda_architectures}")
if(NOT FBGEMM_CPU_ONLY)
target_compile_definitions(fbgemm_gpu_py PRIVATE FBGEMM_CUB_USE_NAMESPACE)
endif()
set_property(TARGET fbgemm_gpu_py PROPERTY CXX_STANDARD 17)
elseif(USE_ROCM)
get_hipified_list("${fbgemm_gpu_sources}" fbgemm_gpu_sources)
get_hipified_list("${abspath_gen_source_files}" abspath_gen_source_files)
get_hipified_list("${cpp_fbgemm_files}" cpp_fbgemm_files)

set(FBGEMM_ALL_HIP_FILES ${fbgemm_gpu_sources} ${abspath_gen_source_files} ${cpp_fbgemm_files})
set_source_files_properties(${FBGEMM_ALL_HIP_FILES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
hip_include_directories("${cpp_fbgemm_files_include_directories}")

hip_add_library(fbgemm_gpu_py SHARED ${cpp_asmjit_files} ${FBGEMM_ALL_HIP_FILES} ${FBGEMM_HIP_HCC_LIBRARIES}
HIPCC_OPTIONS ${HIP_HCC_FLAGS})
target_include_directories(fbgemm_gpu_py PUBLIC ${FBGEMM_HIP_INCLUDE} ${ROCRAND_INCLUDE} ${ROCM_SMI_INCLUDE})
endif()
list (GET TORCH_INCLUDE_DIRS 0 TORCH_PATH)
if(EXISTS "${TORCH_PATH}/ATen/cuda/CUDAGeneratorImpl.h")
target_compile_definitions(fbgemm_gpu_py PRIVATE NEW_GENERATOR_PATH)
endif()
if(EXISTS "${TORCH_PATH}/ATen/cuda/Atomic.cuh")
target_compile_definitions(fbgemm_gpu_py PRIVATE NEW_ATOMIC_PATH)
endif()
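
These existence checks let the same sources compile against both old and new PyTorch header layouts; a minimal sketch of how such a compile definition is typically consumed (the actual guard sites live in the FBGEMM sources):

    // Pick the ATen header location based on the definition set by CMake above.
    #ifdef NEW_GENERATOR_PATH
    #include <ATen/cuda/CUDAGeneratorImpl.h>  // newer PyTorch layout
    #else
    #include <ATen/CUDAGeneratorImpl.h>       // older PyTorch layout
    #endif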

set_target_properties(fbgemm_gpu_py PROPERTIES PREFIX "")
@@ -330,7 +435,9 @@ if(NVML_LIB_PATH)
target_link_libraries(fbgemm_gpu_py ${NVML_LIB_PATH})
endif()
target_include_directories(fbgemm_gpu_py PRIVATE ${TORCH_INCLUDE_DIRS})
set_property(TARGET fbgemm_gpu_py PROPERTY CXX_STANDARD 17)
if(USE_CUDA)
set_property(TARGET fbgemm_gpu_py PROPERTY CXX_STANDARD 17)
endif()

install(TARGETS fbgemm_gpu_py DESTINATION fbgemm_gpu)

12 changes: 8 additions & 4 deletions fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py
@@ -797,8 +797,10 @@ def uvm(
offsets = torch.tensor(([0] + np.cumsum(lengths).tolist())).int().cuda()
per_sample_weights = None
if weighted:
assert (this_rs_uvm_weights := rs_uvm[2]) is not None
assert (this_rs_gpu_weights := rs_gpu[2]) is not None
this_rs_uvm_weights = rs_uvm[2]
assert this_rs_uvm_weights is not None
this_rs_gpu_weights = rs_gpu[2]
assert this_rs_gpu_weights is not None
Review thread on this change:

Contributor: Could you explain why this change is needed for ROCm?

Contributor (author): ROCm is on Python 3.7.

Contributor: Maybe I just don't understand Python, but:
  • Does ROCm depend on a specific Python version? In my understanding, it is irrelevant.
  • Does the original grammar depend on a specific Python version (e.g., only Python 2, Python 3.8+, or Python <3.5)?

    assert (this_rs_uvm_weights := rs_uvm[2]) is not None

Contributor (author, May 10, 2022): ":=" is a Python 3.8 feature; see https://stackoverflow.com/a/26000366. The PyTorch upstream CI jobs for ROCm are executed with Python 3.7, so all our release dockers are on 3.7. However, ROCm does not depend on a specific Python version.

Contributor: Yes, maybe we should change this if FBGEMM_GPU should work with Python < 3.8, but I am not sure about the minimum Python version for FBGEMM_GPU. @jianyuh, do you know the version requirement?

Member: Thanks! If the current change works for 3.7+, it looks good to me. Ideally we should use the simple syntax and avoid Python 3.8+-specific features.

per_sample_weights = torch.cat(
[this_rs_uvm_weights, this_rs_gpu_weights]
)
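
For reference, a standalone illustration of the rewrite discussed in the thread above (hypothetical stand-in values; the walrus form needs Python 3.8+, the expanded form runs on 3.7):

    rs_uvm = (None, None, [1.0, 2.0])  # hypothetical stand-in for the real tuple

    # Python 3.8+ only: assignment expression inside the assert
    # assert (this_rs_uvm_weights := rs_uvm[2]) is not None

    # Python 3.7-compatible equivalent, as used in this PR
    this_rs_uvm_weights = rs_uvm[2]
    assert this_rs_uvm_weights is not None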
@@ -1634,8 +1636,10 @@ def nbit_uvm(
offsets = torch.tensor(([0] + np.cumsum(lengths).tolist())).int().cuda()
per_sample_weights = None
if weighted:
assert (this_rs_uvm_weights := rs_uvm[2]) is not None
assert (this_rs_gpu_weights := rs_gpu[2]) is not None
this_rs_uvm_weights = rs_uvm[2]
assert this_rs_uvm_weights is not None
this_rs_gpu_weights = rs_gpu[2]
assert this_rs_gpu_weights is not None
per_sample_weights = torch.cat(
[this_rs_uvm_weights, this_rs_gpu_weights]
)