From 027c4c36b512a219daf18cecd6f970aa4f49eaef Mon Sep 17 00:00:00 2001 From: "Daniel J. Magee" Date: Wed, 15 May 2024 10:07:22 -0600 Subject: [PATCH 1/3] Update dgemm source. Now works with libsci, libsci_acc, cublas, cublasxt. --- microbenchmarks/dgemm/src/CMakeLists.txt | 15 +- microbenchmarks/dgemm/src/mt-dgemm.c | 172 ++++++++++++++++++----- 2 files changed, 148 insertions(+), 39 deletions(-) diff --git a/microbenchmarks/dgemm/src/CMakeLists.txt b/microbenchmarks/dgemm/src/CMakeLists.txt index 275f0126..34c97bb3 100644 --- a/microbenchmarks/dgemm/src/CMakeLists.txt +++ b/microbenchmarks/dgemm/src/CMakeLists.txt @@ -18,7 +18,7 @@ LANGUAGES C) site_name( SITENAME ) if ( NOT DEFINED BLAS_NAME ) - message( SEND_ERROR "BLAS NAME MUST BE SPECIFIED: cblas, mkl, essl or raw") + message( SEND_ERROR "BLAS NAME MUST BE SPECIFIED: cblas, mkl, essl, cublas, libsci or raw") endif() string( TOUPPER ${BLAS_NAME} BLAS_NAME_UPPER ) @@ -63,7 +63,11 @@ message("\n") add_executable(dgemm mt-dgemm.c) if ( DEFINED BLAS_ROOT ) - target_link_directories( dgemm PRIVATE "${BLAS_ROOT}/lib") + if ( ${BLAS_NAME} STREQUAL "cublas" ) + target_link_directories( dgemm PRIVATE "${BLAS_ROOT}/lib64") + else() + target_link_directories( dgemm PRIVATE "${BLAS_ROOT}/lib") + endif() target_include_directories( dgemm PRIVATE "${BLAS_ROOT}/include") endif() @@ -79,6 +83,13 @@ if ( ${BLAS_NAME} STREQUAL "mkl" ) set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -qmkl=parallel") elseif ( ${BLAS_NAME} STREQUAL "cblas" ) target_link_libraries( dgemm LINK_PUBLIC "openblas") +# elseif ( ${BLAS_NAME} STREQUAL "libsci" ) +# string( TOLOWER $ENV{PE_ENV} BLAS_COMPILER ) +# set( BLAS_COMPILER $ENV{PE_ENV} ) +# target_link_libraries( dgemm LINK_PUBLIC "sci_${BLAS_COMPILER}" ) +# elseif ( ${BLAS_NAME} STREQUAL "cublas" OR ${BLAS_NAME} STREQUAL "cublasxt" ) +elseif ( ${BLAS_NAME} MATCHES "cublas" ) + target_link_libraries( dgemm LINK_PUBLIC "-lcuda;-lcublas;-lcudart") elseif ( ${BLAS_NAME} STREQUAL "essl" ) target_link_libraries( dgemm LINK_PUBLIC "essl") endif() diff --git a/microbenchmarks/dgemm/src/mt-dgemm.c b/microbenchmarks/dgemm/src/mt-dgemm.c index 375757e2..fe1750d5 100644 --- a/microbenchmarks/dgemm/src/mt-dgemm.c +++ b/microbenchmarks/dgemm/src/mt-dgemm.c @@ -10,6 +10,29 @@ #define BLAS_LIB "mkl" #endif +#ifdef USE_CUBLAS +#include +#include +#define BLAS_LIB "cublas" +#endif + +#ifdef USE_CUBLASXT +#include +#include +#define BLAS_LIB "cublasXt" +#endif + +#ifdef USE_LIBSCI +#include +#define BLAS_LIB "libsci" +#endif + +#ifdef USE_LIBSCI_ACC +#include +#define BLAS_LIB "libsci_acc" +#endif + + #ifdef USE_CBLAS #include "cblas.h" #define BLAS_LIB "cblas" @@ -50,15 +73,16 @@ int main(int argc, char* argv[]) { // DO NOT CHANGE CODE BELOW // ------------------------------------------------------- // - int N = 256; + size_t N = 256; int repeats = 8; + size_t block_size = 0; - double alpha = 1.0; - double beta = 1.0; + double alpha = 1.0; + double beta = 1.0; if(argc > 1) { N = atoi(argv[1]); - printf("Matrix size input by command line: %d\n", N); + printf("Matrix size input by command line: %zu\n", N); if(argc > 2) { repeats = atoi(argv[2]); @@ -72,50 +96,94 @@ int main(int argc, char* argv[]) { if(argc > 3) { alpha = (double) atof(argv[3]); - if(argc > 4) { beta = (double) atof(argv[4]); + if(argc > 5) block_size = atoi(argv[5]); } } } else { printf("Repeat multiply defaulted to %d\n", repeats); } } else { - printf("Matrix size defaulted to %d\n", N); + printf("Matrix size defaulted to %zu\n", N); } - printf("Alpha = %f\n", alpha); - printf("Beta = %f\n", beta); - if(N < 128) { - printf("Error: N (%d) is less than 128, the matrix is too small.\n", N); + printf("Error: N (%zu) is less than 128, the matrix is too small.\n", N); exit(-1); } + + const size_t matrixsize = sizeof(double) * N * N; + if (block_size == 0) block_size = N/2; + printf("Alpha = %.2f\n", alpha); + printf("Beta = %.2f\n", beta); + printf("BlockSize = %zu\n", block_size); printf("Allocating Matrices...\n"); - double* DGEMM_RESTRICT matrixA = (double*) malloc(sizeof(double) * N * N); - double* DGEMM_RESTRICT matrixB = (double*) malloc(sizeof(double) * N * N); - double* DGEMM_RESTRICT matrixC = (double*) malloc(sizeof(double) * N * N); + double* DGEMM_RESTRICT matrixA = (double*) malloc(matrixsize); + double* DGEMM_RESTRICT matrixB = (double*) malloc(matrixsize); + double* DGEMM_RESTRICT matrixC = (double*) malloc(matrixsize); - printf("Allocation complete, populating with values...\n"); + printf("Allocation complete, populating with values..."); - int i, j, k, r; + size_t i, j, k, r; + double start, end, time_taken, time_section; - #pragma omp parallel for + start = get_seconds(); + #pragma omp parallel for private(i,j,k) for(i = 0; i < N; i++) { for(j = 0; j < N; j++) { - matrixA[i*N + j] = 2.0; - matrixB[i*N + j] = 0.5; - matrixC[i*N + j] = 1.0; + k=i*N + j; + matrixA[k] = 2.0; + matrixB[k] = 0.5; + matrixC[k] = 1.0; } } +#if defined(USE_CUBLAS) + // Create Cublas Handle + cublasHandle_t handle; + cublasCreate(&handle); + printf("-- CUDA!!\nAllocating and transferring values..."); + double *dMatrixA, *dMatrixB, *dMatrixC; + cudaMalloc((void **)&dMatrixA, matrixsize); + cudaMalloc((void **)&dMatrixB, matrixsize); + cudaMalloc((void **)&dMatrixC, matrixsize); + + cudaMemcpy(dMatrixA, matrixA, matrixsize, cudaMemcpyHostToDevice); + cudaMemcpy(dMatrixB, matrixB, matrixsize, cudaMemcpyHostToDevice); + cudaMemcpy(dMatrixC, matrixC, matrixsize, cudaMemcpyHostToDevice); +#endif + +#ifdef USE_CUBLASXT +// Create CublasXt Handle and select all available devices. +// You don't want to use explicit device memory here because it needs +// to be distributed across all devices and cudaMalloc only assigns +// to the current device. + int *devices = NULL; + cublasXtHandle_t handle; + int device_count, blockdim; + cudaGetDeviceCount(&device_count); + devices = (int *)malloc(sizeof(int) * device_count); + cublasXtCreate(&handle); + for (int i=0; i Date: Wed, 15 May 2024 10:14:59 -0600 Subject: [PATCH 2/3] Update dgemm doc --- doc/sphinx/09_Microbenchmarks/M5_DGEMM/DGEMM.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/sphinx/09_Microbenchmarks/M5_DGEMM/DGEMM.rst b/doc/sphinx/09_Microbenchmarks/M5_DGEMM/DGEMM.rst index ffdb4858..dcfbb6fe 100644 --- a/doc/sphinx/09_Microbenchmarks/M5_DGEMM/DGEMM.rst +++ b/doc/sphinx/09_Microbenchmarks/M5_DGEMM/DGEMM.rst @@ -44,14 +44,14 @@ Load the compiler; make and enter a build directory. .. code-block:: bash - cmake -DBLAS_NAME= .. + cmake -DBLAS_NAME= -DBLAS_ROOT= .. make .. -Current `BLAS_NAME` options are mkl, cblas (openblas), essl, or the raw coded (OpenMP threaded) dgemm. +Current `BLAS_NAME` options are mkl, cblas (openblas), essl, libsci, libsci_acc, cublas, cublasxt or the raw coded (OpenMP threaded) dgemm. The `BLAS_NAME` argument is required. -If the headers or libraries aren't found provide `BLAS_LIB_DIR`, `BLAS_INCLUDE_DIR`, or `BLAS_ROOT` to cmake. +If the headers or libraries aren't found provide `BLAS_LIB_DIR` or `BLAS_INCLUDE_DIR` to cmake. If using a different blas library, modify the C source file to use the correct header and dgemm command. Running From d4f6935c51dbcc26a4c1c2474a787fbde7d5d07c Mon Sep 17 00:00:00 2001 From: "Daniel J. Magee" Date: Wed, 15 May 2024 10:49:09 -0600 Subject: [PATCH 3/3] Updated OSUMB docs to specify latency min and max values and GPU requirements. --- doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSUMB.rst | 2 ++ doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSU_req.csv | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSUMB.rst b/doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSUMB.rst index 37f309b9..313aad9b 100644 --- a/doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSUMB.rst +++ b/doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSUMB.rst @@ -60,6 +60,8 @@ For example, on HPE-Cray systems: Running ======= +For any GPU enabled system, please also include the GPU variants of the following benchmarks. + .. csv-table:: OSU Microbenchmark Tests :file: OSU_req.csv :align: center diff --git a/doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSU_req.csv b/doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSU_req.csv index fc36a843..7042ab88 100644 --- a/doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSU_req.csv +++ b/doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSU_req.csv @@ -1,5 +1,7 @@ "Program","Description","Msg Size","Num Nodes","Rank Config" -"osu_latency","P2p Latency","8 B","2","1 per node" +"osu_latency","P2p Latency","8 B","2","2 tests per node: + - Longest Path (worst case) + - Shortest Path (best case)" "osu_bibw","P2p Bi-directional BW","16 kB","2","1 per node" "osu_mbw_mr","P2p Multi-BW & Msg Rate","16 KB","2","Host-to-Host (two tests): - 1 per NIC