From b108c71bdddba04ab40c4cc6cda6086bd41d9136 Mon Sep 17 00:00:00 2001
From: Alex Xiong
Date: Wed, 6 Mar 2024 20:30:09 +0800
Subject: [PATCH 1/3] feat: add rust api for cudaFreeAsync

---
 .../rust/icicle-cuda-runtime/src/memory.rs | 50 +++++++++++++------
 1 file changed, 36 insertions(+), 14 deletions(-)

diff --git a/wrappers/rust/icicle-cuda-runtime/src/memory.rs b/wrappers/rust/icicle-cuda-runtime/src/memory.rs
index 0e12fd822..7f92b06e8 100644
--- a/wrappers/rust/icicle-cuda-runtime/src/memory.rs
+++ b/wrappers/rust/icicle-cuda-runtime/src/memory.rs
@@ -1,12 +1,15 @@
 use crate::bindings::{
-    cudaFree, cudaMalloc, cudaMallocAsync, cudaMemPool_t, cudaMemcpy, cudaMemcpyAsync, cudaMemcpyKind,
+    cudaFree, cudaFreeAsync, cudaMalloc, cudaMallocAsync, cudaMemPool_t, cudaMemcpy,
+    cudaMemcpyAsync, cudaMemcpyKind,
 };
 use crate::device::get_device;
 use crate::device_context::check_device;
 use crate::error::{CudaError, CudaResult, CudaResultWrap};
 use crate::stream::CudaStream;
 use std::mem::{size_of, MaybeUninit};
-use std::ops::{Index, IndexMut, Range, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive};
+use std::ops::{
+    Index, IndexMut, Range, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive,
+};
 use std::os::raw::c_void;
 use std::slice::from_raw_parts_mut;
 
@@ -47,14 +50,18 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {
 
     pub fn as_mut_slice(&mut self) -> &mut [T] {
         match self {
-            Self::Device(_, _) => panic!("Use copy_to_host and copy_to_host_async to move device data to a slice"),
+            Self::Device(_, _) => {
+                panic!("Use copy_to_host and copy_to_host_async to move device data to a slice")
+            }
             Self::Host(v) => v.as_mut_slice(),
         }
     }
 
     pub fn as_slice(&self) -> &[T] {
         match self {
-            Self::Device(_, _) => panic!("Use copy_to_host and copy_to_host_async to move device data to a slice"),
+            Self::Device(_, _) => {
+                panic!("Use copy_to_host and copy_to_host_async to move device data to a slice")
+            }
             Self::Host(v) => v.as_slice(),
         }
     }
@@ -79,9 +86,7 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {
     }
 
     pub fn cuda_malloc(count: usize) -> CudaResult<Self> {
-        let size = count
-            .checked_mul(size_of::<T>())
-            .unwrap_or(0);
+        let size = count.checked_mul(size_of::<T>()).unwrap_or(0);
         if size == 0 {
             return Err(CudaError::cudaErrorMemoryAllocation); //TODO: only CUDA backend should return CudaError
         }
@@ -97,16 +102,19 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {
     }
 
     pub fn cuda_malloc_async(count: usize, stream: &CudaStream) -> CudaResult<Self> {
-        let size = count
-            .checked_mul(size_of::<T>())
-            .unwrap_or(0);
+        let size = count.checked_mul(size_of::<T>()).unwrap_or(0);
         if size == 0 {
             return Err(CudaError::cudaErrorMemoryAllocation);
         }
 
         let mut device_ptr = MaybeUninit::<*mut c_void>::uninit();
         unsafe {
-            cudaMallocAsync(device_ptr.as_mut_ptr(), size, stream.handle as *mut _ as *mut _).wrap()?;
+            cudaMallocAsync(
+                device_ptr.as_mut_ptr(),
+                size,
+                stream.handle as *mut _ as *mut _,
+            )
+            .wrap()?;
             Ok(Self::Device(
                 from_raw_parts_mut(device_ptr.assume_init() as *mut T, count),
                 get_device().unwrap() as i32,
@@ -114,6 +122,22 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {
         }
     }
 
+    pub fn cuda_free_async(&mut self, stream: &CudaStream) -> CudaResult<()> {
+        if let Self::Device(s, device_id) = self {
+            check_device(*device_id);
+            if !s.is_empty() {
+                unsafe {
+                    cudaFreeAsync(
+                        s.as_mut_ptr() as *mut c_void,
+                        stream.handle as *mut _ as *mut _,
+                    )
+                    .wrap()?;
+                }
+            }
+        }
+        Ok(())
+    }
+
     pub fn copy_from_host(&mut self, val: &[T]) -> CudaResult<()> {
         match self {
             Self::Device(_, device_id) => check_device(*device_id),
@@ -259,9 +283,7 @@ impl<'a, T> Drop for HostOrDeviceSlice<'a, T> {
                 }
 
                 unsafe {
-                    cudaFree(s.as_mut_ptr() as *mut c_void)
-                        .wrap()
-                        .unwrap();
+                    cudaFree(s.as_mut_ptr() as *mut c_void).wrap().unwrap();
                 }
             }
             Self::Host(_) => {}
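[Editor's note, not part of the patch series] A hedged sketch of how the new call might be used together with cuda_malloc_async. The element type, buffer size, and the std::mem::forget call are this sketch's own choices: as the patch stands, cuda_free_async leaves the slice's device pointer in place, so without forgetting the value, Drop would issue a second, synchronous cudaFree on memory that was already released.

    use icicle_cuda_runtime::memory::HostOrDeviceSlice;
    use icicle_cuda_runtime::stream::CudaStream;

    fn free_on_stream() {
        let stream = CudaStream::create().unwrap();
        // Stream-ordered allocation of 1 << 20 u32 elements (does not block the host).
        let mut buf: HostOrDeviceSlice<'_, u32> = HostOrDeviceSlice::cuda_malloc_async(1 << 20, &stream).unwrap();
        // ... enqueue kernels / async copies that use `buf` on `stream` ...
        // Queue the free on the same stream instead of relying on Drop's blocking cudaFree.
        buf.cuda_free_async(&stream).unwrap();
        std::mem::forget(buf); // sketch-only workaround: keep Drop from calling cudaFree on the freed pointer
        stream.synchronize().unwrap();
        stream.destroy().unwrap();
    }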
From b22aa02e9107c575549b2497da55698d1cff1dfb Mon Sep 17 00:00:00 2001
From: Alex Xiong
Date: Wed, 6 Mar 2024 12:59:53 +0000
Subject: [PATCH 2/3] fix: cargo fmt

---
 .github/workflows/test-deploy-docs.yml     |  2 +-
 .../rust/icicle-cuda-runtime/src/memory.rs | 32 ++++++++-----------
 2 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/test-deploy-docs.yml b/.github/workflows/test-deploy-docs.yml
index 17702cd4e..6bc3b64b3 100644
--- a/.github/workflows/test-deploy-docs.yml
+++ b/.github/workflows/test-deploy-docs.yml
@@ -9,7 +9,7 @@ on:
 
 jobs:
   test-deploy:
-    name: Test deployment of docs webiste
+    name: Test deployment of docs website
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
diff --git a/wrappers/rust/icicle-cuda-runtime/src/memory.rs b/wrappers/rust/icicle-cuda-runtime/src/memory.rs
index 7f92b06e8..669349946 100644
--- a/wrappers/rust/icicle-cuda-runtime/src/memory.rs
+++ b/wrappers/rust/icicle-cuda-runtime/src/memory.rs
@@ -1,15 +1,12 @@
 use crate::bindings::{
-    cudaFree, cudaFreeAsync, cudaMalloc, cudaMallocAsync, cudaMemPool_t, cudaMemcpy,
-    cudaMemcpyAsync, cudaMemcpyKind,
+    cudaFree, cudaFreeAsync, cudaMalloc, cudaMallocAsync, cudaMemPool_t, cudaMemcpy, cudaMemcpyAsync, cudaMemcpyKind,
 };
 use crate::device::get_device;
 use crate::device_context::check_device;
 use crate::error::{CudaError, CudaResult, CudaResultWrap};
 use crate::stream::CudaStream;
 use std::mem::{size_of, MaybeUninit};
-use std::ops::{
-    Index, IndexMut, Range, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive,
-};
+use std::ops::{Index, IndexMut, Range, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive};
 use std::os::raw::c_void;
 use std::slice::from_raw_parts_mut;
 
@@ -86,7 +83,9 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {
     }
 
     pub fn cuda_malloc(count: usize) -> CudaResult<Self> {
-        let size = count.checked_mul(size_of::<T>()).unwrap_or(0);
+        let size = count
+            .checked_mul(size_of::<T>())
+            .unwrap_or(0);
         if size == 0 {
             return Err(CudaError::cudaErrorMemoryAllocation); //TODO: only CUDA backend should return CudaError
         }
@@ -102,19 +101,16 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {
     }
 
     pub fn cuda_malloc_async(count: usize, stream: &CudaStream) -> CudaResult<Self> {
-        let size = count.checked_mul(size_of::<T>()).unwrap_or(0);
+        let size = count
+            .checked_mul(size_of::<T>())
+            .unwrap_or(0);
         if size == 0 {
             return Err(CudaError::cudaErrorMemoryAllocation);
         }
 
         let mut device_ptr = MaybeUninit::<*mut c_void>::uninit();
         unsafe {
-            cudaMallocAsync(
-                device_ptr.as_mut_ptr(),
-                size,
-                stream.handle as *mut _ as *mut _,
-            )
-            .wrap()?;
+            cudaMallocAsync(device_ptr.as_mut_ptr(), size, stream.handle as *mut _ as *mut _).wrap()?;
             Ok(Self::Device(
                 from_raw_parts_mut(device_ptr.assume_init() as *mut T, count),
                 get_device().unwrap() as i32,
@@ -127,11 +123,7 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {
             check_device(*device_id);
             if !s.is_empty() {
                 unsafe {
-                    cudaFreeAsync(
-                        s.as_mut_ptr() as *mut c_void,
-                        stream.handle as *mut _ as *mut _,
-                    )
-                    .wrap()?;
+                    cudaFreeAsync(s.as_mut_ptr() as *mut c_void, stream.handle as *mut _ as *mut _).wrap()?;
                 }
             }
         }
@@ -283,7 +275,9 @@ impl<'a, T> Drop for HostOrDeviceSlice<'a, T> {
                 }
 
                 unsafe {
-                    cudaFree(s.as_mut_ptr() as *mut c_void).wrap().unwrap();
+                    cudaFree(s.as_mut_ptr() as *mut c_void)
+                        .wrap()
+                        .unwrap();
                 }
             }
             Self::Host(_) => {}
From 7185657ff7db8db239517e7d4a89da5912dfae47 Mon Sep 17 00:00:00 2001
From: DmytroTym
Date: Wed, 6 Mar 2024 18:13:23 +0200
Subject: [PATCH 3/3] Warmup function

---
 wrappers/rust/icicle-core/src/msm/tests.rs      | 11 ++++++-----
 wrappers/rust/icicle-cuda-runtime/build.rs      |  1 +
 wrappers/rust/icicle-cuda-runtime/src/device.rs | 17 ++++++++++++++++-
 wrappers/rust/icicle-cuda-runtime/src/memory.rs | 14 +-------------
 4 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/wrappers/rust/icicle-core/src/msm/tests.rs b/wrappers/rust/icicle-core/src/msm/tests.rs
index aa6afb3e0..fa103e3bb 100644
--- a/wrappers/rust/icicle-core/src/msm/tests.rs
+++ b/wrappers/rust/icicle-core/src/msm/tests.rs
@@ -1,7 +1,7 @@
 use crate::curve::{Affine, Curve, Projective};
 use crate::msm::{msm, MSMConfig, MSM};
 use crate::traits::{FieldImpl, GenerateRandom};
-use icicle_cuda_runtime::device::{get_device_count, set_device};
+use icicle_cuda_runtime::device::{get_device_count, set_device, warmup};
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;
 use icicle_cuda_runtime::stream::CudaStream;
 use rayon::iter::IntoParallelIterator;
@@ -108,6 +108,8 @@ where
 {
     let test_sizes = [1000, 1 << 16];
     let batch_sizes = [1, 3, 1 << 4];
+    let stream = CudaStream::create().unwrap();
+    warmup(&stream).unwrap();
     for test_size in test_sizes {
         for batch_size in batch_sizes {
             let points = generate_random_affine_points_with_zeroes(test_size, 10);
@@ -123,7 +125,6 @@
             let mut msm_results_1 = HostOrDeviceSlice::cuda_malloc(batch_size).unwrap();
             let mut msm_results_2 = HostOrDeviceSlice::cuda_malloc(batch_size).unwrap();
             let mut points_d = HostOrDeviceSlice::cuda_malloc(test_size * batch_size).unwrap();
-            let stream = CudaStream::create().unwrap();
             points_d
                 .copy_from_host_async(&points_cloned, &stream)
                 .unwrap();
@@ -147,9 +148,6 @@
             stream
                 .synchronize()
                 .unwrap();
-            stream
-                .destroy()
-                .unwrap();
 
             let points_ark: Vec<_> = points_h
                 .as_slice()
@@ -172,6 +170,9 @@
             }
         }
     }
+    stream
+        .destroy()
+        .unwrap();
 }
 
 pub fn check_msm_skewed_distributions<C: Curve + MSM<C>>()
diff --git a/wrappers/rust/icicle-cuda-runtime/build.rs b/wrappers/rust/icicle-cuda-runtime/build.rs
index b203e3dc5..7858bf5c8 100644
--- a/wrappers/rust/icicle-cuda-runtime/build.rs
+++ b/wrappers/rust/icicle-cuda-runtime/build.rs
@@ -77,6 +77,7 @@ fn main() {
         .allowlist_function("cudaMemset")
         .allowlist_function("cudaMemsetAsync")
         .allowlist_function("cudaDeviceGetDefaultMemPool")
+        .allowlist_function("cudaMemGetInfo")
         .rustified_enum("cudaMemcpyKind")
         // Stream Ordered Memory Allocator
         // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html
diff --git a/wrappers/rust/icicle-cuda-runtime/src/device.rs b/wrappers/rust/icicle-cuda-runtime/src/device.rs
index 80750b2f6..bc6cedb03 100644
--- a/wrappers/rust/icicle-cuda-runtime/src/device.rs
+++ b/wrappers/rust/icicle-cuda-runtime/src/device.rs
@@ -1,7 +1,9 @@
 use crate::{
-    bindings::{cudaGetDevice, cudaGetDeviceCount, cudaSetDevice},
+    bindings::{cudaFreeAsync, cudaGetDevice, cudaGetDeviceCount, cudaMallocAsync, cudaMemGetInfo, cudaSetDevice},
     error::{CudaResult, CudaResultWrap},
+    stream::CudaStream,
 };
+use std::mem::MaybeUninit;
 
 pub fn set_device(device_id: usize) -> CudaResult<()> {
     unsafe { cudaSetDevice(device_id as i32) }.wrap()
@@ -16,3 +18,16 @@ pub fn get_device() -> CudaResult<usize> {
     let mut device_id = 0;
     unsafe { cudaGetDevice(&mut device_id) }.wrap_value(device_id as usize)
 }
+
+// This function pre-allocates default memory pool and warms the GPU up
+// so that subsequent memory allocations and other calls are not slowed down
+pub fn warmup(stream: &CudaStream) -> CudaResult<()> {
+    let mut device_ptr = MaybeUninit::<*mut std::ffi::c_void>::uninit();
+    let mut free_memory: usize = 0;
+    let mut _total_memory: usize = 0;
+    unsafe {
+        cudaMemGetInfo(&mut free_memory as *mut usize, &mut _total_memory as *mut usize).wrap()?;
+        cudaMallocAsync(device_ptr.as_mut_ptr(), free_memory >> 1, stream.handle).wrap()?;
+        cudaFreeAsync(device_ptr.assume_init(), stream.handle).wrap()
+    }
+}
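[Editor's note, not part of the patch series] The warmup helper above does exactly what its comment says: it queries free device memory with cudaMemGetInfo, requests roughly half of it from cudaMallocAsync on the given stream, and immediately returns that reservation with cudaFreeAsync. A minimal usage sketch, mirroring what the test change in this patch does; the element type and sizes are made up:

    use icicle_cuda_runtime::device::warmup;
    use icicle_cuda_runtime::memory::HostOrDeviceSlice;
    use icicle_cuda_runtime::stream::CudaStream;

    fn setup() {
        let stream = CudaStream::create().unwrap();
        // One-time warmup on the stream that will carry the real work; per the
        // comment in the patch, this pre-allocates the default memory pool and
        // warms the GPU up so later allocations and calls are not slowed down.
        warmup(&stream).unwrap();

        // Subsequent stream-ordered allocations now hit a warmed-up allocator.
        let scalars: HostOrDeviceSlice<'_, u64> = HostOrDeviceSlice::cuda_malloc_async(1 << 20, &stream).unwrap();
        // ... launch work that reads/writes `scalars` on `stream` ...

        stream.synchronize().unwrap();
        drop(scalars); // Drop releases the device buffer with a blocking cudaFree
        stream.destroy().unwrap();
    }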
diff --git a/wrappers/rust/icicle-cuda-runtime/src/memory.rs b/wrappers/rust/icicle-cuda-runtime/src/memory.rs
index 669349946..4596585b6 100644
--- a/wrappers/rust/icicle-cuda-runtime/src/memory.rs
+++ b/wrappers/rust/icicle-cuda-runtime/src/memory.rs
@@ -1,5 +1,5 @@
 use crate::bindings::{
-    cudaFree, cudaFreeAsync, cudaMalloc, cudaMallocAsync, cudaMemPool_t, cudaMemcpy, cudaMemcpyAsync, cudaMemcpyKind,
+    cudaFree, cudaMalloc, cudaMallocAsync, cudaMemPool_t, cudaMemcpy, cudaMemcpyAsync, cudaMemcpyKind,
 };
 use crate::device::get_device;
 use crate::device_context::check_device;
@@ -118,18 +118,6 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {
         }
     }
 
-    pub fn cuda_free_async(&mut self, stream: &CudaStream) -> CudaResult<()> {
-        if let Self::Device(s, device_id) = self {
-            check_device(*device_id);
-            if !s.is_empty() {
-                unsafe {
-                    cudaFreeAsync(s.as_mut_ptr() as *mut c_void, stream.handle as *mut _ as *mut _).wrap()?;
-                }
-            }
-        }
-        Ok(())
-    }
-
     pub fn copy_from_host(&mut self, val: &[T]) -> CudaResult<()> {
         match self {
             Self::Device(_, device_id) => check_device(*device_id),
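[Editor's note, not part of the patch series] With this final hunk the series' net effect is that warmup() is the new public API and the docs workflow typo is fixed, while device buffers continue to be released by Drop's synchronous cudaFree, since the cuda_free_async method introduced in the first patch is removed again here. A rough way to see what warmup buys, assuming wall-clock timing of the first stream-ordered allocation is an acceptable proxy (the allocation size is arbitrary):

    use std::time::Instant;

    use icicle_cuda_runtime::device::warmup;
    use icicle_cuda_runtime::memory::HostOrDeviceSlice;
    use icicle_cuda_runtime::stream::CudaStream;

    fn time_first_allocation() {
        let stream = CudaStream::create().unwrap();
        warmup(&stream).unwrap(); // comment this line out to measure the cold-start cost

        let start = Instant::now();
        let buf: HostOrDeviceSlice<'_, u32> = HostOrDeviceSlice::cuda_malloc_async(1 << 22, &stream).unwrap();
        stream.synchronize().unwrap();
        println!("first allocation took {:?}", start.elapsed());

        drop(buf); // blocking cudaFree in Drop
        stream.destroy().unwrap();
    }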