From 9b1618f02f76c70d1f1f4df5c9cb00790142dcc6 Mon Sep 17 00:00:00 2001
From: Hui Zhou
Date: Thu, 24 Mar 2022 10:08:24 -0500
Subject: [PATCH] test: add test/mpi/impls/mpich/cuda/stream.cu

---
 test/mpi/impls/mpich/cuda/Makefile.am |  3 +-
 test/mpi/impls/mpich/cuda/stream.cu   | 91 +++++++++++++++++++++++++++
 test/mpi/impls/mpich/cuda/testlist    |  1 +
 3 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 test/mpi/impls/mpich/cuda/stream.cu

diff --git a/test/mpi/impls/mpich/cuda/Makefile.am b/test/mpi/impls/mpich/cuda/Makefile.am
index 59b18b8778e..8f29cb4189d 100644
--- a/test/mpi/impls/mpich/cuda/Makefile.am
+++ b/test/mpi/impls/mpich/cuda/Makefile.am
@@ -8,4 +8,5 @@ include $(top_srcdir)/Makefile_cuda.mtest
 LDADD += -lm
 
 noinst_PROGRAMS = \
-    saxpy
+    saxpy \
+    stream
diff --git a/test/mpi/impls/mpich/cuda/stream.cu b/test/mpi/impls/mpich/cuda/stream.cu
new file mode 100644
index 00000000000..001580d004a
--- /dev/null
+++ b/test/mpi/impls/mpich/cuda/stream.cu
@@ -0,0 +1,91 @@
+/* CC: nvcc -g */
+/* lib_list: -lmpi */
+/* run: mpirun -l -n 2 */
+
+#include <stdio.h>
+#include <assert.h>
+#include <mpi.h>
+
+__global__
+void saxpy(int n, float a, float *x, float *y)
+{
+    int i = blockIdx.x*blockDim.x + threadIdx.x;
+    if (i < n) y[i] = a*x[i] + y[i];
+}
+
+int main(void)
+{
+    int mpi_errno;
+    int rank, size;
+    MPI_Init(NULL, NULL);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    cudaStream_t stream;
+    cudaStreamCreate(&stream);
+
+    int N = 1000000;
+    float *x, *y, *d_x, *d_y;
+    x = (float*)malloc(N*sizeof(float));
+    y = (float*)malloc(N*sizeof(float));
+
+    cudaMalloc(&d_x, N*sizeof(float));
+    cudaMalloc(&d_y, N*sizeof(float));
+
+    if (rank == 0) {
+        for (int i = 0; i < N; i++) {
+            x[i] = 1.0f;
+        }
+    } else if (rank == 1) {
+        for (int i = 0; i < N; i++) {
+            y[i] = 2.0f;
+        }
+    }
+
+    if (rank == 0) {
+        #if 0
+        cudaMemcpyAsync(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice, stream);
+
+        mpi_errno = MPIX_Send_enqueue(d_x, N, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &stream);
+        #else
+        mpi_errno = MPIX_Send_enqueue(x, N, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &stream);
+        #endif
+        assert(mpi_errno == MPI_SUCCESS);
+
+        cudaStreamSynchronize(stream);
+    } else if (rank == 1) {
+        cudaMemcpyAsync(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice, stream);
+
+        mpi_errno = MPIX_Recv_enqueue(d_x, N, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, &stream);
+        assert(mpi_errno == MPI_SUCCESS);
+
+        // Perform SAXPY on 1M elements
+        saxpy<<<(N+255)/256, 256, 0, stream>>>(N, 2.0f, d_x, d_y);
+
+        cudaMemcpyAsync(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost, stream);
+
+        cudaStreamSynchronize(stream);
+    }
+
+    if (rank == 1) {
+        float maxError = 0.0f;
+        int errs = 0;
+        for (int i = 0; i < N; i++) {
+            if (abs(y[i] - 4.0f) > 0.01) {
+                errs++;
+                maxError = max(maxError, abs(y[i]-4.0f));
+            }
+        }
+        if (errs > 0) {
+            printf("%d errors, Max error: %f\n", errs, maxError);
+        }
+    }
+
+    cudaFree(d_x);
+    cudaFree(d_y);
+    free(x);
+    free(y);
+
+    cudaStreamDestroy(stream);
+    MPI_Finalize();
+}
diff --git a/test/mpi/impls/mpich/cuda/testlist b/test/mpi/impls/mpich/cuda/testlist
index 68bfd0db9cc..85ef4b462ff 100644
--- a/test/mpi/impls/mpich/cuda/testlist
+++ b/test/mpi/impls/mpich/cuda/testlist
@@ -1 +1,2 @@
 saxpy 2
+stream 2
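
Note (not part of the patch): per the header comments, the test is built with
"nvcc -g" against "-lmpi" and run under "mpirun -l -n 2". The MPIX_*_enqueue
return codes are checked with assert(), but the CUDA runtime calls (cudaMalloc,
cudaMemcpyAsync, cudaStreamSynchronize) are not checked. A minimal sketch of an
error-checking wrapper follows; CUDA_CHECK is a hypothetical helper name and is
not part of this patch.

/* Hypothetical CUDA_CHECK helper (assumption, not in the patch): wraps a
 * CUDA runtime call and aborts with a readable message on failure. */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define CUDA_CHECK(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "%s:%d: CUDA error: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

/* Example usage inside the test:
 *     CUDA_CHECK(cudaMalloc(&d_x, N * sizeof(float)));
 *     CUDA_CHECK(cudaStreamSynchronize(stream));
 */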