Using a different mechanism for host-mapped pinned memory (#1638)
Summary:
Pull Request resolved: #1638

This diff adds another mechanism for allocating host-mapped pinned memory, reducing the adverse effect on other processes running on the same host when one process performs large allocations.

Reviewed By: jianyuh

Differential Revision: D43950253

fbshipit-source-id: e359496feef883ea570b22a1241db139006a7357
banitag1 authored and facebook-github-bot committed Mar 10, 2023
1 parent 8616ed7 commit a3d5c81
Showing 1 changed file with 29 additions and 4 deletions.
33 changes: 29 additions & 4 deletions fbgemm_gpu/src/cumem_utils.cu
@@ -16,6 +16,11 @@
#include "fbgemm_gpu/enum_utils.h"
#include "fbgemm_gpu/fbgemm_cuda_utils.cuh"

DEFINE_bool(
use_large_pages_for_cuda_host_mapped_memory,
true,
"Use 2M large pages for host mapped memory");

using Tensor = at::Tensor;

namespace fbgemm_gpu {
@@ -41,7 +46,8 @@ struct CUDAHostMappedContext {
   ~CUDAHostMappedContext() {
     at::cuda::OptionalCUDAGuard device_guard;
     device_guard.set_index(cuda_device_);
-    AT_CUDA_CHECK(cudaFreeHost(ptr_));
+    AT_CUDA_CHECK(cudaHostUnregister(ptr_));
+    free(ptr_);
   }
 
   static void release(void* ptr) {
@@ -206,9 +212,28 @@ Tensor new_host_mapped_tensor(
   auto strides = defaultStrides(sizes);
   size_t size_bytes =
       at::detail::computeStorageNbytes(sizes, strides, self.dtype().itemsize());
-  void* ptr;
-  AT_CUDA_CHECK(cudaHostAlloc(
-      &ptr, size_bytes, cudaHostAllocWriteCombined | cudaHostAllocMapped));
+
+  // When using cudaHostAlloc for large allocations, we found that it can
+  // take a global lock and lock out CUDA API calls from other processes.
+  // The main cost in cudaHostAlloc is faulting/mapping the pages. So, instead
+  // of using this CUDA API, we can do a regular malloc, pre-fault the pages,
+  // and then do cudaHostRegister with GPU mapping flags to lock the pages,
+  // minimizing the cost incurred while the global lock is held.
+  void* const ptr = malloc(size_bytes);
+  size_t pageSize = 4 * 1024;
+  if (FLAGS_use_large_pages_for_cuda_host_mapped_memory) {
+    // Advise the kernel to back this range with large 2M pages.
+    madvise(ptr, size_bytes, MADV_HUGEPAGE);
+    pageSize = (1 << 21);
+  }
+  uintptr_t alignedPtr = (((uintptr_t)ptr + pageSize - 1) & ~(pageSize - 1));
+  for (uintptr_t p = alignedPtr; p < ((uintptr_t)ptr + size_bytes);
+       p += pageSize) {
+    memset((void*)p, 0, 1);
+  }
+
+  AT_CUDA_CHECK(cudaHostRegister(
+      ptr, size_bytes, cudaHostRegisterMapped | cudaHostRegisterPortable));
   void* dev_ptr;
   AT_CUDA_CHECK(cudaHostGetDevicePointer(&dev_ptr, ptr, 0));
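
For reference, below is a minimal standalone sketch of the new allocation path, stitched together from the two hunks above. The function names (host_mapped_alloc, host_mapped_free) and the explicit error handling are illustrative assumptions, not part of this diff; the malloc, huge-page advice, pre-fault loop, and registration flags mirror the code above.

#include <sys/mman.h>     // madvise, MADV_HUGEPAGE (Linux-specific)
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>

// Allocate, pre-fault, and pin size_bytes of host memory for GPU access
// without paying the cost of a large cudaHostAlloc under CUDA's global lock.
void* host_mapped_alloc(size_t size_bytes, bool use_2m_pages, void** dev_ptr) {
  // Plain malloc: the expensive page faults happen outside the CUDA driver.
  void* const ptr = malloc(size_bytes);
  if (ptr == nullptr) {
    return nullptr;
  }

  size_t page_size = 4 * 1024;
  if (use_2m_pages) {
    // Best-effort request for 2M transparent huge pages; the return value
    // is ignored, as in the diff.
    madvise(ptr, size_bytes, MADV_HUGEPAGE);
    page_size = 1 << 21;
  }

  // Pre-fault by touching one byte per page, starting at the first page
  // boundary inside the allocation, so cudaHostRegister only locks pages.
  const uintptr_t aligned =
      ((uintptr_t)ptr + page_size - 1) & ~(page_size - 1);
  for (uintptr_t p = aligned; p < (uintptr_t)ptr + size_bytes;
       p += page_size) {
    memset((void*)p, 0, 1);
  }

  // Pin the pre-faulted pages and map them into the device address space.
  if (cudaHostRegister(ptr, size_bytes,
                       cudaHostRegisterMapped | cudaHostRegisterPortable) !=
      cudaSuccess) {
    free(ptr);
    return nullptr;
  }
  if (cudaHostGetDevicePointer(dev_ptr, ptr, /*flags=*/0) != cudaSuccess) {
    cudaHostUnregister(ptr);
    free(ptr);
    return nullptr;
  }
  return ptr;
}

// Teardown mirrors the updated ~CUDAHostMappedContext: unregister, then free.
void host_mapped_free(void* ptr) {
  cudaHostUnregister(ptr);
  free(ptr);
}

One caveat worth noting: madvise expects a page-aligned address. glibc routes allocations of this size through mmap, so the pointer is page-aligned in practice, and the advice is best-effort either way, which is why the return value can be ignored.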
