Using a different mechanism for host-mapped pinned memory (#1638)
Summary:
Pull Request resolved: #1638

This diff adds another mechanism for allocating host-mapped pinned memory, reducing the adverse effect on other processes running on the same host when one process performs large allocations.

Reviewed By: jianyuh

Differential Revision: D43950253

fbshipit-source-id: e359496feef883ea570b22a1241db139006a7357
banitag1 authored and facebook-github-bot committed Mar 10, 2023
1 parent 8616ed7 commit a3d5c81
Showing 1 changed file with 29 additions and 4 deletions.
33 changes: 29 additions & 4 deletions fbgemm_gpu/src/cumem_utils.cu
@@ -16,6 +16,11 @@
#include "fbgemm_gpu/enum_utils.h"
#include "fbgemm_gpu/fbgemm_cuda_utils.cuh"

DEFINE_bool(
use_large_pages_for_cuda_host_mapped_memory,
true,
"Use 2M large pages for host mapped memory");

using Tensor = at::Tensor;

namespace fbgemm_gpu {
@@ -41,7 +46,8 @@ struct CUDAHostMappedContext {
   ~CUDAHostMappedContext() {
     at::cuda::OptionalCUDAGuard device_guard;
     device_guard.set_index(cuda_device_);
-    AT_CUDA_CHECK(cudaFreeHost(ptr_));
+    AT_CUDA_CHECK(cudaHostUnregister(ptr_));
+    free(ptr_);
   }
 
   static void release(void* ptr) {
@@ -206,9 +212,28 @@ Tensor new_host_mapped_tensor(
   auto strides = defaultStrides(sizes);
   size_t size_bytes =
       at::detail::computeStorageNbytes(sizes, strides, self.dtype().itemsize());
-  void* ptr;
-  AT_CUDA_CHECK(cudaHostAlloc(
-      &ptr, size_bytes, cudaHostAllocWriteCombined | cudaHostAllocMapped));
+
+  // When using cudaHostAlloc for large allocations, we found that it can
+  // take a global lock and lock out CUDA API calls from other processes.
+  // The main cost in cudaHostAlloc is faulting/mapping the pages. So, instead
+  // of using this CUDA API, we can do a regular malloc, pre-fault the pages,
+  // and then do cudaHostRegister with GPU mapping flags to lock the pages,
+  // minimizing the cost incurred while the global lock is held.
+  void* const ptr = malloc(size_bytes);
+  size_t pageSize = 4 * 1024;
+  if (FLAGS_use_large_pages_for_cuda_host_mapped_memory) {
+    // Advise the kernel to back this range with large 2M pages.
+    madvise(ptr, size_bytes, MADV_HUGEPAGE);
+    pageSize = (1 << 21);
+  }
+  uintptr_t alignedPtr = (((uintptr_t)ptr + pageSize - 1) & ~(pageSize - 1));
+  for (uintptr_t p = alignedPtr; p < ((uintptr_t)ptr + size_bytes);
+       p += pageSize) {
+    memset((void*)p, 0, 1);
+  }
+
+  AT_CUDA_CHECK(cudaHostRegister(
+      ptr, size_bytes, cudaHostRegisterMapped | cudaHostRegisterPortable));
   void* dev_ptr;
   AT_CUDA_CHECK(cudaHostGetDevicePointer(&dev_ptr, ptr, 0));
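
For reference, below is a minimal standalone sketch of the new allocation path, stitched together from the two hunks above. The function names (host_mapped_alloc, host_mapped_free) and the explicit error handling are illustrative assumptions, not part of this diff; the malloc, huge-page advice, pre-fault loop, and registration flags mirror the code above.

#include <sys/mman.h>     // madvise, MADV_HUGEPAGE (Linux-specific)
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>

// Allocate, pre-fault, and pin size_bytes of host memory for GPU access
// without paying the cost of a large cudaHostAlloc under CUDA's global lock.
void* host_mapped_alloc(size_t size_bytes, bool use_2m_pages, void** dev_ptr) {
  // Plain malloc: the expensive page faults happen outside the CUDA driver.
  void* const ptr = malloc(size_bytes);
  if (ptr == nullptr) {
    return nullptr;
  }

  size_t page_size = 4 * 1024;
  if (use_2m_pages) {
    // Best-effort request for 2M transparent huge pages; the return value
    // is ignored, as in the diff.
    madvise(ptr, size_bytes, MADV_HUGEPAGE);
    page_size = 1 << 21;
  }

  // Pre-fault by touching one byte per page, starting at the first page
  // boundary inside the allocation, so cudaHostRegister only locks pages.
  const uintptr_t aligned =
      ((uintptr_t)ptr + page_size - 1) & ~(page_size - 1);
  for (uintptr_t p = aligned; p < (uintptr_t)ptr + size_bytes;
       p += page_size) {
    memset((void*)p, 0, 1);
  }

  // Pin the pre-faulted pages and map them into the device address space.
  if (cudaHostRegister(ptr, size_bytes,
                       cudaHostRegisterMapped | cudaHostRegisterPortable) !=
      cudaSuccess) {
    free(ptr);
    return nullptr;
  }
  if (cudaHostGetDevicePointer(dev_ptr, ptr, /*flags=*/0) != cudaSuccess) {
    cudaHostUnregister(ptr);
    free(ptr);
    return nullptr;
  }
  return ptr;
}

// Teardown mirrors the updated ~CUDAHostMappedContext: unregister, then free.
void host_mapped_free(void* ptr) {
  cudaHostUnregister(ptr);
  free(ptr);
}

One caveat worth noting: madvise expects a page-aligned address. glibc routes allocations of this size through mmap, so the pointer is page-aligned in practice, and the advice is best-effort either way, which is why the return value can be ignored.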
