Skip to content

Commit

Permalink
temporary: A testing code
Browse files Browse the repository at this point in the history
  • Loading branch information
hzhou committed Mar 24, 2022
1 parent 2830361 commit f84418c
Showing 1 changed file with 108 additions and 0 deletions.
108 changes: 108 additions & 0 deletions test/mpi/impls/mpich/gpu/stream.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/* CC: nvcc -g */
/* lib_list: -lmpi */
/* run: mpirun -l -n 2 */

#include <mpi.h>
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * SAXPY kernel: for each index below n, y[index] = a * x[index] + y[index].
 *
 * The element index is derived from the x-dimension of the block and thread
 * indices only, so a 1D launch covering at least n threads is expected.
 * Threads whose index is >= n return without touching memory.
 */
__global__
void saxpy(int n, float a, float *x, float *y)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    /* Tail guard: the grid may be larger than n. */
    if (idx >= n) {
        return;
    }

    y[idx] = a * x[idx] + y[idx];
}

/*
 * Evaluate a CUDA runtime call and abort the whole MPI job with a diagnostic
 * if it does not return cudaSuccess. Kernel launches are checked by wrapping
 * cudaGetLastError() right after the launch.
 */
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cuda_err_ = (call);                                     \
        if (cuda_err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(cuda_err_));               \
            MPI_Abort(MPI_COMM_WORLD, 1);                                   \
        }                                                                   \
    } while (0)

/*
 * Two-rank test of the MPIX_Send_stream / MPIX_Recv_stream calls.
 *
 * Rank 0 fills x with 1.0 and sends it to rank 1. Rank 1 fills y with 2.0,
 * copies it to the device, receives x into a device buffer, runs
 * saxpy(2.0, x, y) and copies y back, so every element is expected to be 4.0.
 * Rank 1 prints the number of mismatching elements and the largest deviation.
 *
 * Returns 0 on success and 1 if rank 1 found any mismatching element.
 * CUDA failures, allocation failures and running with fewer than two
 * processes abort the job through MPI_Abort.
 */
int main(void)
{
    int mpi_errno;
    int rank, size;
    int ret = 0;

    MPI_Init(NULL, NULL);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    printf("Process %d / %d\n", rank, size);

    /* Rank 0 sends to rank 1, so a single-process run cannot work. */
    if (size < 2) {
        fprintf(stderr, "This test requires at least 2 processes\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    const int N = 1000000;
    const size_t bytes = (size_t) N * sizeof(float);
    float *x, *y, *d_x, *d_y;

    x = (float *) malloc(bytes);
    y = (float *) malloc(bytes);
    if (x == NULL || y == NULL) {
        fprintf(stderr, "Failed to allocate host buffers\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    CUDA_CHECK(cudaMalloc(&d_x, bytes));
    CUDA_CHECK(cudaMalloc(&d_y, bytes));

    /* Only the buffer each rank actually uses is initialized. */
    if (rank == 0) {
        for (int i = 0; i < N; i++) {
            x[i] = 1.0f;
        }
    } else if (rank == 1) {
        for (int i = 0; i < N; i++) {
            y[i] = 2.0f;
        }
    }

#if 1
    /* Stream variant: communication is issued through the MPIX_*_stream calls. */
    if (rank == 0) {
#if 0
        CUDA_CHECK(cudaMemcpyAsync(d_x, x, bytes, cudaMemcpyHostToDevice, stream));

        mpi_errno = MPIX_Send_stream(d_x, N, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &stream);
#else
        /* Send directly from the host buffer. */
        mpi_errno = MPIX_Send_stream(x, N, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &stream);
#endif
        assert(mpi_errno == MPI_SUCCESS);

        /* NOTE(review): presumably the send is ordered on `stream`, hence the
         * synchronize before the buffers are released - confirm against the
         * MPIX_Send_stream implementation. */
        CUDA_CHECK(cudaStreamSynchronize(stream));
    } else if (rank == 1) {
        CUDA_CHECK(cudaMemcpyAsync(d_y, y, bytes, cudaMemcpyHostToDevice, stream));

        /* Receive straight into the device buffer d_x. */
        mpi_errno = MPIX_Recv_stream(d_x, N, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, &stream);
        assert(mpi_errno == MPI_SUCCESS);

        // Perform SAXPY on 1M elements
        saxpy<<<(N+255)/256, 256, 0, stream>>>(N, 2.0f, d_x, d_y);
        /* A launch does not return a status; pick up launch errors here. */
        CUDA_CHECK(cudaGetLastError());

        CUDA_CHECK(cudaMemcpyAsync(y, d_y, bytes, cudaMemcpyDeviceToHost, stream));

        /* y is read on the host below, so wait for the stream to drain. */
        CUDA_CHECK(cudaStreamSynchronize(stream));
    }

#else
    /* Reference variant: blocking copies and plain MPI_Send / MPI_Recv. */
    if (rank == 0) {
        CUDA_CHECK(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice));
        mpi_errno = MPI_Send(d_x, N, MPI_FLOAT, 1, 0, MPI_COMM_WORLD);
        assert(mpi_errno == MPI_SUCCESS);
    } else if (rank == 1) {
        CUDA_CHECK(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice));
        mpi_errno = MPI_Recv(d_x, N, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        assert(mpi_errno == MPI_SUCCESS);
        // Perform SAXPY on 1M elements
        saxpy<<<(N+255)/256, 256, 0>>>(N, 2.0f, d_x, d_y);
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost));
    }
#endif

    /* Verify on the receiving rank: 2.0 * 1.0 + 2.0 == 4.0 for every element. */
    if (rank == 1) {
        float maxError = 0.0f;
        int errs = 0;
        for (int i = 0; i < N; i++) {
            const float diff = fabsf(y[i] - 4.0f);
            /* Written as !(diff <= tol) so that a NaN result counts as an error. */
            if (!(diff <= 0.01f)) {
                errs++;
                maxError = fmaxf(maxError, diff);
            }
        }
        printf("%d errors, Max error: %f\n", errs, maxError);
        if (errs > 0) {
            ret = 1;
        }
    }

    CUDA_CHECK(cudaFree(d_x));
    CUDA_CHECK(cudaFree(d_y));
    free(x);
    free(y);

    CUDA_CHECK(cudaStreamDestroy(stream));
    MPI_Finalize();
    return ret;
}

0 comments on commit f84418c

Please sign in to comment.