Updated dgemm and OSU benchmarks #100

Merged · 3 commits · May 22, 2024
2 changes: 2 additions & 0 deletions doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSUMB.rst
@@ -60,6 +60,8 @@ For example, on HPE-Cray systems:
Running
=======

For any GPU-enabled system, please also include the GPU variants of the following benchmarks.

.. csv-table:: OSU Microbenchmark Tests
:file: OSU_req.csv
:align: center
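
A minimal run sketch, assuming the suite was configured with CUDA support
(`--enable-cuda`); the trailing `H H`/`D D` arguments select host or device
buffers for sender and receiver, and node-placement flags vary by MPI launcher:

.. code-block:: bash

   # host-to-host baseline, one rank per node
   mpirun -np 2 ./osu_latency H H
   # GPU variant of the same test, both buffers in device memory
   mpirun -np 2 ./osu_latency D D

..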
4 changes: 3 additions & 1 deletion doc/sphinx/09_Microbenchmarks/M3_OSUMB/OSU_req.csv
@@ -1,5 +1,7 @@
"Program","Description","Msg Size","Num Nodes","Rank Config"
"osu_latency","P2p Latency","8 B","2","1 per node"
"osu_latency","P2p Latency","8 B","2","2 tests per node:
- Longest Path (worst case)
- Shortest Path (best case)"
"osu_bibw","P2p Bi-directional BW","16 kB","2","1 per node"
"osu_mbw_mr","P2p Multi-BW & Msg Rate","16 KB","2","Host-to-Host (two tests):
- 1 per NIC
6 changes: 3 additions & 3 deletions doc/sphinx/09_Microbenchmarks/M5_DGEMM/DGEMM.rst
@@ -44,14 +44,14 @@ Load the compiler; make and enter a build directory.

.. code-block:: bash

cmake -DBLAS_NAME=<blas library name> ..
cmake -DBLAS_NAME=<blas library name> -DBLAS_ROOT=<root path to blas library> ..
make

..

Current `BLAS_NAME` options are mkl, cblas (openblas), essl, or the raw coded (OpenMP threaded) dgemm.
Current `BLAS_NAME` options are mkl, cblas (openblas), essl, libsci, libsci_acc, cublas, cublasxt, or the raw-coded (OpenMP threaded) dgemm.
The `BLAS_NAME` argument is required.
If the headers or libraries aren't found provide `BLAS_LIB_DIR`, `BLAS_INCLUDE_DIR`, or `BLAS_ROOT` to cmake.
If the headers or libraries aren't found, provide `BLAS_LIB_DIR` or `BLAS_INCLUDE_DIR` to cmake.
If using a different BLAS library, modify the C source file to use the correct header and dgemm call.
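
For instance, a hypothetical cublas configure line (the toolkit path is
illustrative); note that for cublas the build links from `BLAS_ROOT`/lib64
rather than `BLAS_ROOT`/lib:

.. code-block:: bash

   cmake -DBLAS_NAME=cublas -DBLAS_ROOT=/usr/local/cuda ..
   make

..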

Running
15 changes: 13 additions & 2 deletions microbenchmarks/dgemm/src/CMakeLists.txt
@@ -18,7 +18,7 @@ LANGUAGES C)
site_name( SITENAME )

if ( NOT DEFINED BLAS_NAME )
message( SEND_ERROR "BLAS NAME MUST BE SPECIFIED: cblas, mkl, essl or raw")
message( SEND_ERROR "BLAS NAME MUST BE SPECIFIED: cblas, mkl, essl, cublas, libsci or raw")
endif()

string( TOUPPER ${BLAS_NAME} BLAS_NAME_UPPER )
@@ -63,7 +63,11 @@ message("\n")
add_executable(dgemm mt-dgemm.c)

if ( DEFINED BLAS_ROOT )
target_link_directories( dgemm PRIVATE "${BLAS_ROOT}/lib")
if ( ${BLAS_NAME} STREQUAL "cublas" )
target_link_directories( dgemm PRIVATE "${BLAS_ROOT}/lib64")
else()
target_link_directories( dgemm PRIVATE "${BLAS_ROOT}/lib")
endif()
target_include_directories( dgemm PRIVATE "${BLAS_ROOT}/include")
endif()

@@ -79,6 +83,13 @@ if ( ${BLAS_NAME} STREQUAL "mkl" )
set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -qmkl=parallel")
elseif ( ${BLAS_NAME} STREQUAL "cblas" )
target_link_libraries( dgemm LINK_PUBLIC "openblas")
# elseif ( ${BLAS_NAME} STREQUAL "libsci" )
# string( TOLOWER $ENV{PE_ENV} BLAS_COMPILER )
# set( BLAS_COMPILER $ENV{PE_ENV} )
# target_link_libraries( dgemm LINK_PUBLIC "sci_${BLAS_COMPILER}" )
# elseif ( ${BLAS_NAME} STREQUAL "cublas" OR ${BLAS_NAME} STREQUAL "cublasxt" )
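# MATCHES does a regex match, so "cublas" below also catches "cublasxt";
# one branch links the CUDA libraries for both variants.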
elseif ( ${BLAS_NAME} MATCHES "cublas" )
target_link_libraries( dgemm LINK_PUBLIC "-lcuda;-lcublas;-lcudart")
elseif ( ${BLAS_NAME} STREQUAL "essl" )
target_link_libraries( dgemm LINK_PUBLIC "essl")
endif()
172 changes: 135 additions & 37 deletions microbenchmarks/dgemm/src/mt-dgemm.c
@@ -10,6 +10,29 @@
#define BLAS_LIB "mkl"
#endif

#ifdef USE_CUBLAS
#include <cublas_v2.h>
#include <cuda_runtime.h>
#define BLAS_LIB "cublas"
#endif

#ifdef USE_CUBLASXT
#include <cublasXt.h>
#include <cuda_runtime.h>
#define BLAS_LIB "cublasXt"
#endif

#ifdef USE_LIBSCI
#include <cblas.h>
#define BLAS_LIB "libsci"
#endif

#ifdef USE_LIBSCI_ACC
#include <libsci_acc.h>
#define BLAS_LIB "libsci_acc"
#endif


#ifdef USE_CBLAS
#include "cblas.h"
#define BLAS_LIB "cblas"
@@ -50,15 +73,16 @@ int main(int argc, char* argv[]) {
// DO NOT CHANGE CODE BELOW
// ------------------------------------------------------- //

int N = 256;
size_t N = 256;
int repeats = 8;
size_t block_size = 0;

double alpha = 1.0;
double beta = 1.0;
double alpha = 1.0;
double beta = 1.0;

if(argc > 1) {
N = atoi(argv[1]);
printf("Matrix size input by command line: %d\n", N);
printf("Matrix size input by command line: %zu\n", N);

if(argc > 2) {
repeats = atoi(argv[2]);
@@ -72,50 +96,94 @@

if(argc > 3) {
alpha = (double) atof(argv[3]);

if(argc > 4) {
beta = (double) atof(argv[4]);
if(argc > 5) block_size = atoi(argv[5]);
}
}
} else {
printf("Repeat multiply defaulted to %d\n", repeats);
}
} else {
printf("Matrix size defaulted to %d\n", N);
printf("Matrix size defaulted to %zu\n", N);
}
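/*
 * Usage sketch (binary name per the CMake target; arguments are
 * positional and optional, parsed in the order shown above):
 *   ./dgemm [N] [repeats] [alpha] [beta] [block_size]
 * e.g.
 *   ./dgemm 4096 20 1.0 1.0 2048
 */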

printf("Alpha = %f\n", alpha);
printf("Beta = %f\n", beta);

if(N < 128) {
printf("Error: N (%d) is less than 128, the matrix is too small.\n", N);
printf("Error: N (%zu) is less than 128, the matrix is too small.\n", N);
exit(-1);
}

const size_t matrixsize = sizeof(double) * N * N;
if (block_size == 0) block_size = N/2;

printf("Alpha = %.2f\n", alpha);
printf("Beta = %.2f\n", beta);
printf("BlockSize = %zu\n", block_size);
printf("Allocating Matrices...\n");

double* DGEMM_RESTRICT matrixA = (double*) malloc(sizeof(double) * N * N);
double* DGEMM_RESTRICT matrixB = (double*) malloc(sizeof(double) * N * N);
double* DGEMM_RESTRICT matrixC = (double*) malloc(sizeof(double) * N * N);
double* DGEMM_RESTRICT matrixA = (double*) malloc(matrixsize);
double* DGEMM_RESTRICT matrixB = (double*) malloc(matrixsize);
double* DGEMM_RESTRICT matrixC = (double*) malloc(matrixsize);

printf("Allocation complete, populating with values...\n");
printf("Allocation complete, populating with values...");

int i, j, k, r;
size_t i, j, k, r;
double start, end, time_taken, time_section;

#pragma omp parallel for
start = get_seconds();
#pragma omp parallel for private(i,j,k)
for(i = 0; i < N; i++) {
for(j = 0; j < N; j++) {
matrixA[i*N + j] = 2.0;
matrixB[i*N + j] = 0.5;
matrixC[i*N + j] = 1.0;
k=i*N + j;
matrixA[k] = 2.0;
matrixB[k] = 0.5;
matrixC[k] = 1.0;
}
}

#if defined(USE_CUBLAS)
// Create Cublas Handle
cublasHandle_t handle;
cublasCreate(&handle);
printf("-- CUDA!!\nAllocating and transferring values...");
double *dMatrixA, *dMatrixB, *dMatrixC;
cudaMalloc((void **)&dMatrixA, matrixsize);
cudaMalloc((void **)&dMatrixB, matrixsize);
cudaMalloc((void **)&dMatrixC, matrixsize);

cudaMemcpy(dMatrixA, matrixA, matrixsize, cudaMemcpyHostToDevice);
cudaMemcpy(dMatrixB, matrixB, matrixsize, cudaMemcpyHostToDevice);
cudaMemcpy(dMatrixC, matrixC, matrixsize, cudaMemcpyHostToDevice);
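// matrixC is copied to the device as well: beta defaults to 1.0, so the
// GEMM accumulates into C rather than overwriting it.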
#endif

#ifdef USE_CUBLASXT
// Create CublasXt Handle and select all available devices.
// You don't want to use explicit device memory here because it needs
// to be distributed across all devices and cudaMalloc only assigns
// to the current device.
int *devices = NULL;
cublasXtHandle_t handle;
int device_count, blockdim;
cudaGetDeviceCount(&device_count);
devices = (int *)malloc(sizeof(int) * device_count);
cublasXtCreate(&handle);
for (int i=0; i<device_count; i++) devices[i] = i;
cublasXtDeviceSelect(handle, device_count, devices);
cublasXtSetPinningMemMode(handle, CUBLASXT_PINNING_ENABLED);
cublasXtSetBlockDim(handle, block_size);
cublasXtGetBlockDim(handle, &blockdim);
printf("CUBLASXT has block dim: %d\n", blockdim);
#endif

end = get_seconds();
time_section = (end - start);
printf(" %g seconds\n", time_section);

printf("Performing multiplication...\n");
printf("Using Blas Type: %s\n", BLAS_LIB);
printf("Iteration #:\n");

const double start = get_seconds();
start = get_seconds();

// ------------------------------------------------------- //
// VENDOR NOTIFICATION: START MODIFIABLE REGION
@@ -129,14 +197,22 @@ int main(int argc, char* argv[]) {

// Repeat multiple times
for(r = 0; r < repeats; r++) {
#if defined(USE_MKL) || defined(USE_CBLAS)
#if defined(USE_MKL) || defined(USE_CBLAS) || defined(USE_LIBSCI)
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
N, N, N, alpha, matrixA, N, matrixB, N, beta, matrixC, N);
#elif USE_ESSL
dgemm("N", "N",
#elif defined(USE_CUBLAS)
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, dMatrixA, N, dMatrixB, N,
&beta, dMatrixC, N);
cudaDeviceSynchronize();
#elif defined(USE_CUBLASXT)
cublasXtDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, matrixA, N, matrixB, N,
&beta, matrixC, N);
cudaDeviceSynchronize();
#elif defined(USE_ESSL) || defined(USE_LIBSCI_ACC)
dgemm('N', 'N',
N, N, N, alpha, matrixA, N, matrixB, N, beta, matrixC, N);
#else
#pragma omp parallel for private(sum)
#pragma omp parallel for private(sum, j, k)
for(i = 0; i < N; i++) {
for(j = 0; j < N; j++) {
sum = 0;
Expand All @@ -150,12 +226,16 @@ int main(int argc, char* argv[]) {
}
#endif
if ( r%10 == 0 ) {
printf("%d, ", r);
printf("%zu, ", r);
fflush(stdout);
}
}
printf("\n");

#if defined(USE_CUBLAS)
cudaMemcpy(matrixC, dMatrixC, matrixsize, cudaMemcpyDeviceToHost);
#endif

// ------------------------------------------------------- //
// VENDOR NOTIFICATION: END MODIFIABLE REGION
// ------------------------------------------------------- //
@@ -164,42 +244,60 @@
// DO NOT CHANGE CODE BELOW
// ------------------------------------------------------- //

const double end = get_seconds();
end = get_seconds();
time_taken = (end - start);

printf("Calculating matrix check...\n");
#ifdef USE_CUBLAS
cublasDestroy(handle);
cudaFree(dMatrixA);
cudaFree(dMatrixB);
cudaFree(dMatrixC);
cudaDeviceSynchronize();
#endif

#ifdef USE_CUBLASXT
cublasXtDestroy(handle);
free(devices);
#endif

printf("Calculating matrix check...");

double final_sum = 0;
double count = 0;
start = get_seconds();

#pragma omp parallel for reduction(+:final_sum, count)
#pragma omp parallel for reduction(+:final_sum, count) private(i,j)
for(i = 0; i < N; i++) {
for(j = 0; j < N; j++) {
final_sum += matrixC[i*N + j];
count += 1.0;
}
}

double N_dbl = (double) N;
double matrix_memory = (3 * N_dbl * N_dbl) * ((double) sizeof(double));
end = get_seconds();
time_section = (end - start);
printf(" %g seconds\n", time_section);

double matrix_memory = (3 * matrixsize);

printf("\n");
printf("===============================================================\n");

printf("Final Sum is: %f\n", (final_sum / (count * repeats)));
printf("Memory for Matrices: %f MB\n",
printf("Memory for Matrices: %.0f MB\n",
(matrix_memory / (1024 * 1024)));

const double time_taken = (end - start);
double N_dbl = (double) N;

printf("Multiply time: %f seconds\n", time_taken);
printf("Multiply time: %.6g seconds\n", time_taken);

// O(N**3) elements each with one add and three multiplies
// (alpha, beta and A_i*B_i).
const double flops_computed = (N_dbl * N_dbl * N_dbl * 2.0 * (double)(repeats)) +
(N_dbl * N_dbl * 2 * (double)(repeats));
double flops_computed = (N_dbl * N_dbl * 2.0 * (double)repeats)*(N_dbl+1.0);
double total_time = ( flops_computed / time_taken) / 1.0e9;

printf("FLOPs computed: %f\n", flops_computed);
printf("GFLOP/s rate: %f GF/s\n", (flops_computed / time_taken) / 1000000000.0);
printf("FLOPs computed: %.0g\n", flops_computed);
printf("GFLOP/s rate: %.8g GF/s\n", (total_time));

printf("===============================================================\n");
printf("\n");