[atomicCAS] Exclude the host-to-device (H2D) copy from the timing measurement

CHIP-SPV · Apr 19, 2023 · bd2f002 · bd2f002
1 parent d4876a9
commit bd2f002
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 6 deletions.
diff --git a/atomicCAS-cuda/main.cu b/atomicCAS-cuda/main.cu
@@ -19,10 +19,11 @@
 
 template <typename T>
 void testMin (T *h_ptr, T *d_ptr, const int repeat, const char* name) {
+  cudaMemcpy(d_ptr, h_ptr, sizeof(T), cudaMemcpyHostToDevice);
+
   auto start = std::chrono::steady_clock::now();
 
   for (int n = 0; n < repeat; n++) {
-    cudaMemcpy(d_ptr, h_ptr, sizeof(T), cudaMemcpyHostToDevice);
     atomicMinDerived<T><<<NUM_BLOCKS, BLOCK_SIZE>>> (d_ptr);
   }
 
@@ -37,10 +38,11 @@ void testMin (T *h_ptr, T *d_ptr, const int repeat, const char* name) {
 
 template <typename T>
 void testMax (T *h_ptr, T *d_ptr, const int repeat, const char* name) {
+  cudaMemcpy(d_ptr, h_ptr, sizeof(T), cudaMemcpyHostToDevice);
+
   auto start = std::chrono::steady_clock::now();
 
   for (int n = 0; n < repeat; n++) {
-    cudaMemcpy(d_ptr, h_ptr, sizeof(T), cudaMemcpyHostToDevice);
     atomicMaxDerived<T><<<NUM_BLOCKS, BLOCK_SIZE>>> (d_ptr);
   }
 
@@ -55,10 +57,11 @@ void testMax (T *h_ptr, T *d_ptr, const int repeat, const char* name) {
 
 template <typename T>
 void testAdd (T *h_ptr, T *d_ptr, const int repeat, const char* name) {
+  cudaMemcpy(d_ptr, h_ptr, sizeof(T), cudaMemcpyHostToDevice);
+
   auto start = std::chrono::steady_clock::now();
 
   for (int n = 0; n < repeat; n++) {
-    cudaMemcpy(d_ptr, h_ptr, sizeof(T), cudaMemcpyHostToDevice);
     atomicAddDerived<T><<<NUM_BLOCKS, BLOCK_SIZE>>> (d_ptr);
   }
 

diff --git a/atomicCAS-hip/main.cu b/atomicCAS-hip/main.cu
@@ -19,10 +19,11 @@
 
 template <typename T>
 void testMin (T *h_ptr, T *d_ptr, const int repeat, const char* name) {
+  hipMemcpy(d_ptr, h_ptr, sizeof(T), hipMemcpyHostToDevice);
+
   auto start = std::chrono::steady_clock::now();
 
   for (int n = 0; n < repeat; n++) {
-    hipMemcpy(d_ptr, h_ptr, sizeof(T), hipMemcpyHostToDevice);
     hipLaunchKernelGGL(HIP_KERNEL_NAME(atomicMinDerived<T>), NUM_BLOCKS, BLOCK_SIZE, 0, 0, d_ptr);
   }
 
@@ -37,10 +38,11 @@ void testMin (T *h_ptr, T *d_ptr, const int repeat, const char* name) {
 
 template <typename T>
 void testMax (T *h_ptr, T *d_ptr, const int repeat, const char* name) {
+  hipMemcpy(d_ptr, h_ptr, sizeof(T), hipMemcpyHostToDevice);
+
   auto start = std::chrono::steady_clock::now();
 
   for (int n = 0; n < repeat; n++) {
-    hipMemcpy(d_ptr, h_ptr, sizeof(T), hipMemcpyHostToDevice);
     hipLaunchKernelGGL(HIP_KERNEL_NAME(atomicMaxDerived<T>), NUM_BLOCKS, BLOCK_SIZE, 0, 0, d_ptr);
   }
 
@@ -55,10 +57,11 @@ void testMax (T *h_ptr, T *d_ptr, const int repeat, const char* name) {
 
 template <typename T>
 void testAdd (T *h_ptr, T *d_ptr, const int repeat, const char* name) {
+  hipMemcpy(d_ptr, h_ptr, sizeof(T), hipMemcpyHostToDevice);
+
   auto start = std::chrono::steady_clock::now();
 
   for (int n = 0; n < repeat; n++) {
-    hipMemcpy(d_ptr, h_ptr, sizeof(T), hipMemcpyHostToDevice);
     hipLaunchKernelGGL(HIP_KERNEL_NAME(atomicAddDerived<T>), NUM_BLOCKS, BLOCK_SIZE, 0, 0, d_ptr);
   }