From b108c71bdddba04ab40c4cc6cda6086bd41d9136 Mon Sep 17 00:00:00 2001
From: Alex Xiong
Date: Wed, 6 Mar 2024 20:30:09 +0800
Subject: [PATCH 1/3] feat: add rust api for cudaFreeAsync

---
 .../rust/icicle-cuda-runtime/src/memory.rs | 50 +++++++++++++------
 1 file changed, 36 insertions(+), 14 deletions(-)

diff --git a/wrappers/rust/icicle-cuda-runtime/src/memory.rs b/wrappers/rust/icicle-cuda-runtime/src/memory.rs
index 0e12fd822..7f92b06e8 100644
--- a/wrappers/rust/icicle-cuda-runtime/src/memory.rs
+++ b/wrappers/rust/icicle-cuda-runtime/src/memory.rs
@@ -1,12 +1,15 @@
 use crate::bindings::{
-    cudaFree, cudaMalloc, cudaMallocAsync, cudaMemPool_t, cudaMemcpy, cudaMemcpyAsync, cudaMemcpyKind,
+    cudaFree, cudaFreeAsync, cudaMalloc, cudaMallocAsync, cudaMemPool_t, cudaMemcpy,
+    cudaMemcpyAsync, cudaMemcpyKind,
 };
 use crate::device::get_device;
 use crate::device_context::check_device;
 use crate::error::{CudaError, CudaResult, CudaResultWrap};
 use crate::stream::CudaStream;
 use std::mem::{size_of, MaybeUninit};
-use std::ops::{Index, IndexMut, Range, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive};
+use std::ops::{
+    Index, IndexMut, Range, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive,
+};
 use std::os::raw::c_void;
 use std::slice::from_raw_parts_mut;
 
@@ -47,14 +50,18 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {
 
     pub fn as_mut_slice(&mut self) -> &mut [T] {
         match self {
-            Self::Device(_, _) => panic!("Use copy_to_host and copy_to_host_async to move device data to a slice"),
+            Self::Device(_, _) => {
+                panic!("Use copy_to_host and copy_to_host_async to move device data to a slice")
+            }
             Self::Host(v) => v.as_mut_slice(),
         }
     }
 
     pub fn as_slice(&self) -> &[T] {
         match self {
-            Self::Device(_, _) => panic!("Use copy_to_host and copy_to_host_async to move device data to a slice"),
+            Self::Device(_, _) => {
+                panic!("Use copy_to_host and copy_to_host_async to move device data to a slice")
+            }
             Self::Host(v) => v.as_slice(),
         }
     }
@@ -79,9 +86,7 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {
     }
 
     pub fn cuda_malloc(count: usize) -> CudaResult<Self> {
-        let size = count
-            .checked_mul(size_of::<T>())
-            .unwrap_or(0);
+        let size = count.checked_mul(size_of::<T>()).unwrap_or(0);
         if size == 0 {
             return Err(CudaError::cudaErrorMemoryAllocation); //TODO: only CUDA backend should return CudaError
         }
@@ -97,16 +102,19 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {
     }
 
     pub fn cuda_malloc_async(count: usize, stream: &CudaStream) -> CudaResult<Self> {
-        let size = count
-            .checked_mul(size_of::<T>())
-            .unwrap_or(0);
+        let size = count.checked_mul(size_of::<T>()).unwrap_or(0);
         if size == 0 {
             return Err(CudaError::cudaErrorMemoryAllocation);
         }
 
         let mut device_ptr = MaybeUninit::<*mut c_void>::uninit();
         unsafe {
-            cudaMallocAsync(device_ptr.as_mut_ptr(), size, stream.handle as *mut _ as *mut _).wrap()?;
+            cudaMallocAsync(
+                device_ptr.as_mut_ptr(),
+                size,
+                stream.handle as *mut _ as *mut _,
+            )
+            .wrap()?;
             Ok(Self::Device(
                 from_raw_parts_mut(device_ptr.assume_init() as *mut T, count),
                 get_device().unwrap() as i32,
@@ -114,6 +122,22 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {
         }
     }
 
+    pub fn cuda_free_async(&mut self, stream: &CudaStream) -> CudaResult<()> {
+        if let Self::Device(s, device_id) = self {
+            check_device(*device_id);
+            if !s.is_empty() {
+                unsafe {
+                    cudaFreeAsync(
+                        s.as_mut_ptr() as *mut c_void,
+                        stream.handle as *mut _ as *mut _,
+                    )
+                    .wrap()?;
+                }
+            }
+        }
+        Ok(())
+    }
+
     pub fn copy_from_host(&mut self, val: &[T]) -> CudaResult<()> {
         match self {
             Self::Device(_, device_id) => check_device(*device_id),
@@ -259,9 +283,7 @@ impl<'a, T> Drop for HostOrDeviceSlice<'a, T> {
                 }
 
                 unsafe {
-                    cudaFree(s.as_mut_ptr() as *mut c_void)
-                        .wrap()
-                        .unwrap();
+                    cudaFree(s.as_mut_ptr() as *mut c_void).wrap().unwrap();
                 }
             }
             Self::Host(_) => {}
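[Editor's note, not part of the patch series] A hedged sketch of how the new call might be used together with cuda_malloc_async. The element type, buffer size, and the std::mem::forget call are this sketch's own choices: as the patch stands, cuda_free_async leaves the slice's device pointer in place, so without forgetting the value, Drop would issue a second, synchronous cudaFree on memory that was already released.

    use icicle_cuda_runtime::memory::HostOrDeviceSlice;
    use icicle_cuda_runtime::stream::CudaStream;

    fn free_on_stream() {
        let stream = CudaStream::create().unwrap();
        // Stream-ordered allocation of 1 << 20 u32 elements (does not block the host).
        let mut buf: HostOrDeviceSlice<'_, u32> = HostOrDeviceSlice::cuda_malloc_async(1 << 20, &stream).unwrap();
        // ... enqueue kernels / async copies that use `buf` on `stream` ...
        // Queue the free on the same stream instead of relying on Drop's blocking cudaFree.
        buf.cuda_free_async(&stream).unwrap();
        std::mem::forget(buf); // sketch-only workaround: keep Drop from calling cudaFree on the freed pointer
        stream.synchronize().unwrap();
        stream.destroy().unwrap();
    }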
From b22aa02e9107c575549b2497da55698d1cff1dfb Mon Sep 17 00:00:00 2001
From: Alex Xiong
Date: Wed, 6 Mar 2024 12:59:53 +0000
Subject: [PATCH 2/3] fix: cargo fmt

---
 .github/workflows/test-deploy-docs.yml     |  2 +-
 .../rust/icicle-cuda-runtime/src/memory.rs | 32 ++++++++-----------
 2 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/test-deploy-docs.yml b/.github/workflows/test-deploy-docs.yml
index 17702cd4e..6bc3b64b3 100644
--- a/.github/workflows/test-deploy-docs.yml
+++ b/.github/workflows/test-deploy-docs.yml
@@ -9,7 +9,7 @@ on:
 
 jobs:
   test-deploy:
-    name: Test deployment of docs webiste
+    name: Test deployment of docs website
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
diff --git a/wrappers/rust/icicle-cuda-runtime/src/memory.rs b/wrappers/rust/icicle-cuda-runtime/src/memory.rs
index 7f92b06e8..669349946 100644
--- a/wrappers/rust/icicle-cuda-runtime/src/memory.rs
+++ b/wrappers/rust/icicle-cuda-runtime/src/memory.rs
@@ -1,15 +1,12 @@
 use crate::bindings::{
-    cudaFree, cudaFreeAsync, cudaMalloc, cudaMallocAsync, cudaMemPool_t, cudaMemcpy,
-    cudaMemcpyAsync, cudaMemcpyKind,
+    cudaFree, cudaFreeAsync, cudaMalloc, cudaMallocAsync, cudaMemPool_t, cudaMemcpy, cudaMemcpyAsync, cudaMemcpyKind,
 };
 use crate::device::get_device;
 use crate::device_context::check_device;
 use crate::error::{CudaError, CudaResult, CudaResultWrap};
 use crate::stream::CudaStream;
 use std::mem::{size_of, MaybeUninit};
-use std::ops::{
-    Index, IndexMut, Range, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive,
-};
+use std::ops::{Index, IndexMut, Range, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive};
 use std::os::raw::c_void;
 use std::slice::from_raw_parts_mut;
 
@@ -86,7 +83,9 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {
     }
 
     pub fn cuda_malloc(count: usize) -> CudaResult<Self> {
-        let size = count.checked_mul(size_of::<T>()).unwrap_or(0);
+        let size = count
+            .checked_mul(size_of::<T>())
+            .unwrap_or(0);
         if size == 0 {
             return Err(CudaError::cudaErrorMemoryAllocation); //TODO: only CUDA backend should return CudaError
         }
@@ -102,19 +101,16 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {
     }
 
     pub fn cuda_malloc_async(count: usize, stream: &CudaStream) -> CudaResult<Self> {
-        let size = count.checked_mul(size_of::<T>()).unwrap_or(0);
+        let size = count
+            .checked_mul(size_of::<T>())
+            .unwrap_or(0);
         if size == 0 {
             return Err(CudaError::cudaErrorMemoryAllocation);
         }
 
         let mut device_ptr = MaybeUninit::<*mut c_void>::uninit();
         unsafe {
-            cudaMallocAsync(
-                device_ptr.as_mut_ptr(),
-                size,
-                stream.handle as *mut _ as *mut _,
-            )
-            .wrap()?;
+            cudaMallocAsync(device_ptr.as_mut_ptr(), size, stream.handle as *mut _ as *mut _).wrap()?;
             Ok(Self::Device(
                 from_raw_parts_mut(device_ptr.assume_init() as *mut T, count),
                 get_device().unwrap() as i32,
@@ -127,11 +123,7 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {
             check_device(*device_id);
             if !s.is_empty() {
                 unsafe {
-                    cudaFreeAsync(
-                        s.as_mut_ptr() as *mut c_void,
-                        stream.handle as *mut _ as *mut _,
-                    )
-                    .wrap()?;
+                    cudaFreeAsync(s.as_mut_ptr() as *mut c_void, stream.handle as *mut _ as *mut _).wrap()?;
                 }
             }
         }
@@ -283,7 +275,9 @@ impl<'a, T> Drop for HostOrDeviceSlice<'a, T> {
                 }
 
                 unsafe {
-                    cudaFree(s.as_mut_ptr() as *mut c_void).wrap().unwrap();
+                    cudaFree(s.as_mut_ptr() as *mut c_void)
+                        .wrap()
+                        .unwrap();
                 }
             }
             Self::Host(_) => {}
From 7185657ff7db8db239517e7d4a89da5912dfae47 Mon Sep 17 00:00:00 2001
From: DmytroTym
Date: Wed, 6 Mar 2024 18:13:23 +0200
Subject: [PATCH 3/3] Warmup function

---
 wrappers/rust/icicle-core/src/msm/tests.rs      | 11 ++++++-----
 wrappers/rust/icicle-cuda-runtime/build.rs      |  1 +
 wrappers/rust/icicle-cuda-runtime/src/device.rs | 17 ++++++++++++++++-
 wrappers/rust/icicle-cuda-runtime/src/memory.rs | 14 +-------------
 4 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/wrappers/rust/icicle-core/src/msm/tests.rs b/wrappers/rust/icicle-core/src/msm/tests.rs
index aa6afb3e0..fa103e3bb 100644
--- a/wrappers/rust/icicle-core/src/msm/tests.rs
+++ b/wrappers/rust/icicle-core/src/msm/tests.rs
@@ -1,7 +1,7 @@
 use crate::curve::{Affine, Curve, Projective};
 use crate::msm::{msm, MSMConfig, MSM};
 use crate::traits::{FieldImpl, GenerateRandom};
-use icicle_cuda_runtime::device::{get_device_count, set_device};
+use icicle_cuda_runtime::device::{get_device_count, set_device, warmup};
 use icicle_cuda_runtime::memory::HostOrDeviceSlice;
 use icicle_cuda_runtime::stream::CudaStream;
 use rayon::iter::IntoParallelIterator;
@@ -108,6 +108,8 @@ where
 {
     let test_sizes = [1000, 1 << 16];
     let batch_sizes = [1, 3, 1 << 4];
+    let stream = CudaStream::create().unwrap();
+    warmup(&stream).unwrap();
     for test_size in test_sizes {
         for batch_size in batch_sizes {
             let points = generate_random_affine_points_with_zeroes(test_size, 10);
@@ -123,7 +125,6 @@
             let mut msm_results_1 = HostOrDeviceSlice::cuda_malloc(batch_size).unwrap();
             let mut msm_results_2 = HostOrDeviceSlice::cuda_malloc(batch_size).unwrap();
             let mut points_d = HostOrDeviceSlice::cuda_malloc(test_size * batch_size).unwrap();
-            let stream = CudaStream::create().unwrap();
             points_d
                 .copy_from_host_async(&points_cloned, &stream)
                 .unwrap();
@@ -147,9 +148,6 @@
             stream
                 .synchronize()
                 .unwrap();
-            stream
-                .destroy()
-                .unwrap();
 
             let points_ark: Vec<_> = points_h
                 .as_slice()
@@ -172,6 +170,9 @@
             }
         }
     }
+    stream
+        .destroy()
+        .unwrap();
 }
 
 pub fn check_msm_skewed_distributions<C: Curve + MSM<C>>()
diff --git a/wrappers/rust/icicle-cuda-runtime/build.rs b/wrappers/rust/icicle-cuda-runtime/build.rs
index b203e3dc5..7858bf5c8 100644
--- a/wrappers/rust/icicle-cuda-runtime/build.rs
+++ b/wrappers/rust/icicle-cuda-runtime/build.rs
@@ -77,6 +77,7 @@ fn main() {
         .allowlist_function("cudaMemset")
         .allowlist_function("cudaMemsetAsync")
         .allowlist_function("cudaDeviceGetDefaultMemPool")
+        .allowlist_function("cudaMemGetInfo")
         .rustified_enum("cudaMemcpyKind")
         // Stream Ordered Memory Allocator
         // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html
diff --git a/wrappers/rust/icicle-cuda-runtime/src/device.rs b/wrappers/rust/icicle-cuda-runtime/src/device.rs
index 80750b2f6..bc6cedb03 100644
--- a/wrappers/rust/icicle-cuda-runtime/src/device.rs
+++ b/wrappers/rust/icicle-cuda-runtime/src/device.rs
@@ -1,7 +1,9 @@
 use crate::{
-    bindings::{cudaGetDevice, cudaGetDeviceCount, cudaSetDevice},
+    bindings::{cudaFreeAsync, cudaGetDevice, cudaGetDeviceCount, cudaMallocAsync, cudaMemGetInfo, cudaSetDevice},
     error::{CudaResult, CudaResultWrap},
+    stream::CudaStream,
 };
+use std::mem::MaybeUninit;
 
 pub fn set_device(device_id: usize) -> CudaResult<()> {
     unsafe { cudaSetDevice(device_id as i32) }.wrap()
@@ -16,3 +18,16 @@ pub fn get_device() -> CudaResult<usize> {
     let mut device_id = 0;
     unsafe { cudaGetDevice(&mut device_id) }.wrap_value(device_id as usize)
 }
+
+// This function pre-allocates default memory pool and warms the GPU up
+// so that subsequent memory allocations and other calls are not slowed down
+pub fn warmup(stream: &CudaStream) -> CudaResult<()> {
+    let mut device_ptr = MaybeUninit::<*mut std::ffi::c_void>::uninit();
+    let mut free_memory: usize = 0;
+    let mut _total_memory: usize = 0;
+    unsafe {
+        cudaMemGetInfo(&mut free_memory as *mut usize, &mut _total_memory as *mut usize).wrap()?;
+        cudaMallocAsync(device_ptr.as_mut_ptr(), free_memory >> 1, stream.handle).wrap()?;
+        cudaFreeAsync(device_ptr.assume_init(), stream.handle).wrap()
+    }
+}
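[Editor's note, not part of the patch series] The warmup helper above does exactly what its comment says: it queries free device memory with cudaMemGetInfo, requests roughly half of it from cudaMallocAsync on the given stream, and immediately returns that reservation with cudaFreeAsync. A minimal usage sketch, mirroring what the test change in this patch does; the element type and sizes are made up:

    use icicle_cuda_runtime::device::warmup;
    use icicle_cuda_runtime::memory::HostOrDeviceSlice;
    use icicle_cuda_runtime::stream::CudaStream;

    fn setup() {
        let stream = CudaStream::create().unwrap();
        // One-time warmup on the stream that will carry the real work; per the
        // comment in the patch, this pre-allocates the default memory pool and
        // warms the GPU up so later allocations and calls are not slowed down.
        warmup(&stream).unwrap();

        // Subsequent stream-ordered allocations now hit a warmed-up allocator.
        let scalars: HostOrDeviceSlice<'_, u64> = HostOrDeviceSlice::cuda_malloc_async(1 << 20, &stream).unwrap();
        // ... launch work that reads/writes `scalars` on `stream` ...

        stream.synchronize().unwrap();
        drop(scalars); // Drop releases the device buffer with a blocking cudaFree
        stream.destroy().unwrap();
    }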
diff --git a/wrappers/rust/icicle-cuda-runtime/src/memory.rs b/wrappers/rust/icicle-cuda-runtime/src/memory.rs
index 669349946..4596585b6 100644
--- a/wrappers/rust/icicle-cuda-runtime/src/memory.rs
+++ b/wrappers/rust/icicle-cuda-runtime/src/memory.rs
@@ -1,5 +1,5 @@
 use crate::bindings::{
-    cudaFree, cudaFreeAsync, cudaMalloc, cudaMallocAsync, cudaMemPool_t, cudaMemcpy, cudaMemcpyAsync, cudaMemcpyKind,
+    cudaFree, cudaMalloc, cudaMallocAsync, cudaMemPool_t, cudaMemcpy, cudaMemcpyAsync, cudaMemcpyKind,
 };
 use crate::device::get_device;
 use crate::device_context::check_device;
@@ -118,18 +118,6 @@ impl<'a, T> HostOrDeviceSlice<'a, T> {
         }
     }
 
-    pub fn cuda_free_async(&mut self, stream: &CudaStream) -> CudaResult<()> {
-        if let Self::Device(s, device_id) = self {
-            check_device(*device_id);
-            if !s.is_empty() {
-                unsafe {
-                    cudaFreeAsync(s.as_mut_ptr() as *mut c_void, stream.handle as *mut _ as *mut _).wrap()?;
-                }
-            }
-        }
-        Ok(())
-    }
-
     pub fn copy_from_host(&mut self, val: &[T]) -> CudaResult<()> {
         match self {
             Self::Device(_, device_id) => check_device(*device_id),
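[Editor's note, not part of the patch series] With this final hunk the series' net effect is that warmup() is the new public API and the docs workflow typo is fixed, while device buffers continue to be released by Drop's synchronous cudaFree, since the cuda_free_async method introduced in the first patch is removed again here. A rough way to see what warmup buys, assuming wall-clock timing of the first stream-ordered allocation is an acceptable proxy (the allocation size is arbitrary):

    use std::time::Instant;

    use icicle_cuda_runtime::device::warmup;
    use icicle_cuda_runtime::memory::HostOrDeviceSlice;
    use icicle_cuda_runtime::stream::CudaStream;

    fn time_first_allocation() {
        let stream = CudaStream::create().unwrap();
        warmup(&stream).unwrap(); // comment this line out to measure the cold-start cost

        let start = Instant::now();
        let buf: HostOrDeviceSlice<'_, u32> = HostOrDeviceSlice::cuda_malloc_async(1 << 22, &stream).unwrap();
        stream.synchronize().unwrap();
        println!("first allocation took {:?}", start.elapsed());

        drop(buf); // blocking cudaFree in Drop
        stream.destroy().unwrap();
    }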