Skip to content

Commit

Permalink
check for vmm support, disable for hip
Browse files Browse the repository at this point in the history
ggml-ci
  • Loading branch information
slaren committed Dec 23, 2023
1 parent bd78dc9 commit 872408c
Showing 1 changed file with 47 additions and 26 deletions.
73 changes: 47 additions & 26 deletions ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6564,18 +6564,16 @@ struct scoped_spin_lock {

static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;

#if 0
#define DEBUG_CUDA_MALLOC
// #define DEBUG_CUDA_MALLOC
struct cuda_buffer {
void * ptr = nullptr;
size_t size = 0;
};

static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];

static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};

static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
static void * ggml_cuda_pool_malloc_leg(size_t size, size_t * actual_size) {
scoped_spin_lock lock(g_cuda_pool_lock);
int id;
CUDA_CHECK(cudaGetDevice(&id));
Expand Down Expand Up @@ -6629,7 +6627,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
return ptr;
}

static void ggml_cuda_pool_free(void * ptr, size_t size) {
static void ggml_cuda_pool_free_leg(void * ptr, size_t size) {
scoped_spin_lock lock(g_cuda_pool_lock);
int id;
CUDA_CHECK(cudaGetDevice(&id));
Expand All @@ -6646,19 +6644,15 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
CUDA_CHECK(cudaFree(ptr));
g_cuda_pool_size[id] -= size;
}
#else

#if !defined(GGML_USE_HIPBLAS)
// pool with virtual memory
static std::vector<CUmemGenericAllocationHandle> g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB

static const size_t CUDA_POOL_MAX_SIZE = 1ull << 36; // 64 GB

//#define DEBUG_CUDA_MALLOC

#define ggml_cuda_pool_malloc(size, actual_size) ggml_cuda_pool_malloc_(size, actual_size, #size " " #actual_size)
static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const char * call) {
static void * ggml_cuda_pool_malloc_vmm(size_t size, size_t * actual_size) {
scoped_spin_lock lock(g_cuda_pool_lock);
int id;
CUDA_CHECK(cudaGetDevice(&id));
Expand All @@ -6681,14 +6675,14 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const ch
// round up to the nearest granularity
reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);

GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_MAX_SIZE);
GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);

CUmemGenericAllocationHandle handle;
CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));

// reserve virtual address space (if not already reserved)
if (g_cuda_pool_addr[id] == 0) {
CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], CUDA_POOL_MAX_SIZE, 0, 0, 0));
CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
}

// map at the end of the pool
Expand All @@ -6705,9 +6699,9 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const ch
g_cuda_pool_handles[id].push_back(handle);
g_cuda_pool_size[id] += reserve_size;

printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB) [%s]\n",
id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
(unsigned long long) (reserve_size/1024/1024), call);
//printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
// id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
// (unsigned long long) (reserve_size/1024/1024));
}

GGML_ASSERT(g_cuda_pool_addr[id] != 0);
Expand All @@ -6717,32 +6711,51 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const ch
g_cuda_pool_used[id] += size;

#ifdef DEBUG_CUDA_MALLOC
printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr, call);
printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr);
#endif

return ptr;

GGML_UNUSED(call);
}

#define ggml_cuda_pool_free(ptr, size) ggml_cuda_pool_free_(ptr, size, #ptr " " #size)
static void ggml_cuda_pool_free_(void * ptr, size_t size, const char * call) {
static void ggml_cuda_pool_free_vmm(void * ptr, size_t size) {
scoped_spin_lock lock(g_cuda_pool_lock);
int id;
CUDA_CHECK(cudaGetDevice(&id));

#ifdef DEBUG_CUDA_MALLOC
printf("cuda pool[%d]: free %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr, call);
printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
#endif

g_cuda_pool_used[id] -= size;

// all deallocations must be in reverse order of the allocations
GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]));
}

GGML_UNUSED(call);
static bool g_device_vmm[GGML_CUDA_MAX_DEVICES] = {false};

static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
int id;
CUDA_CHECK(cudaGetDevice(&id));
if (g_device_vmm[id]) {
return ggml_cuda_pool_malloc_vmm(size, actual_size);
} else {
return ggml_cuda_pool_malloc_leg(size, actual_size);
}
}

static void ggml_cuda_pool_free(void * ptr, size_t size) {
int id;
CUDA_CHECK(cudaGetDevice(&id));
if (g_device_vmm[id]) {
ggml_cuda_pool_free_vmm(ptr, size);
} else {
ggml_cuda_pool_free_leg(ptr, size);
}
}
#else
#define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg
#define ggml_cuda_pool_free ggml_cuda_pool_free_leg
#endif

static bool g_cublas_loaded = false;
Expand Down Expand Up @@ -6783,9 +6796,17 @@ void ggml_init_cublas() {
#endif
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
for (int id = 0; id < g_device_count; ++id) {
int deviceSupportsVmm = 0;
#if !defined(GGML_USE_HIPBLAS)
CUdevice device;
CU_CHECK(cuDeviceGet(&device, id));
CU_CHECK(cuDeviceGetAttribute(&deviceSupportsVmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
#endif
g_device_vmm[id] = !!deviceSupportsVmm;

cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, g_device_vmm[id] ? "yes" : "no");

g_tensor_split[id] = total_vram;
total_vram += prop.totalGlobalMem;
Expand Down

0 comments on commit 872408c

Please sign in to comment.